# BERT Spam Classification using Trainer HuggingFace

Try to classify spam messages using BERT and Trainer. This gonna use multiple GPUs if available.   
Make sure to set the `batch_size` accordingly to your GPU memory.

In [5]:
import pandas as pd

df = pd.read_csv("data/SMSSpamCollection", sep="\t", names=["label", 'message'])
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
df['spam'] = df['label'].apply(lambda x: 1 if x == "spam" else 0)
df.head()

Unnamed: 0,label,message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [7]:
from sklearn.model_selection import train_test_split
import torch

X_train, X_test, y_train, y_test = train_test_split(df.message, df.spam, stratify=df.spam)

# print to make sure no data leakage
# what is data leakage? https://machinelearningmastery.com/data-leakage-machine-learning/
# in summary, data leakage is when information from outside the training dataset is used to create the model
# which means the model will not be able to generalize well to new data
# how the information from outside the training dataset is used to create the model?
# in this case, the information is the test dataset
# if we use the test dataset to create the model, then the model will not be able to generalize well to new data
print(len(X_train), len(y_train))
print(len(X_test), len(y_test))

4179 4179
1393 1393


In [8]:
class SMSSpamDataset(torch.utils.data.Dataset):
    """
    Custom Dataset class to select the right columns from the dataframe
    """
    def __init__(self, tokenizer, messages, labels, max_length=128):
        self.tokenizer = tokenizer
        self.messages = messages
        self.labels = labels
        self.max_length = max_length

    def __len__(self):
        return len(self.messages)

    def __getitem__(self, idx):
        # pick the message and label
        text = self.messages.iloc[idx]
        label = self.labels.iloc[idx]

        # tokenize the message
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors="pt")

        # convert into a new item format containing the input ids, attention mask, and label
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item["labels"] = torch.tensor(label).float()

        return item

In [9]:
from transformers import BertTokenizer

# choose the checkpoint
checkpoint = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(checkpoint)

# create the dataset, wrap dataframe into a Dataset class and tokenize the messages on the fly
# another approach can be found in pytorch example
train_dataset = SMSSpamDataset(tokenizer, X_train, y_train)
test_dataset = SMSSpamDataset(tokenizer, X_test, y_test)

In [10]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding, BertForSequenceClassification, BertConfig
from torch.utils.data import DataLoader
from transformers import AdamW
import torch.nn as nn
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=UserWarning)

# create the data collator to encode the labels and tokenize the messages
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# setup config for dropout layer
config = BertConfig.from_pretrained(checkpoint, num_labels=1)
config.attention_probs_dropout_prob = 0.1
config.hidden_dropout_prob = 0.1

# create the model using BERT classification model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)

# Set the device to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# create the training arguments for trainers
training_args = TrainingArguments('output', 
                                  evaluation_strategy='epoch',
                                  per_device_train_batch_size=64)

# create the metrics accuracy score function
# this will be used to compute the accuracy score during training, evaluation, and testing
# accuracy will be in percentage
def compute_metrics(p):
    logits = p.predictions
    labels = p.label_ids

    preds = (logits >= 0.5).astype(int)

    print("Size of preds:", len(preds))
    print("Size of labels:", len(labels))

    return {"accuracy": accuracy_score(labels, preds)}

# create the trainer
class CustomTrainer(Trainer):

    def compute_loss(self, model, inputs, return_outputs=False):
        # this function is used to compute the loss
        # the loss is not in percentage, it is the loss value
        # the loss value is used to update the model

        # what is logits? https://stackoverflow.com/questions/41455101/what-is-the-meaning-of-the-word-logits-in-tensorflow
        logits = model(**inputs).logits

        # what is BCEWithLogitsLoss? https://pytorch.org/docs/stable/generated/torch.nn.BCEWithLogitsLoss.html
        # BCEWithLogitsLoss is used for binary classification
        loss_fct = nn.BCEWithLogitsLoss()

        # extract labels from inputs
        labels = inputs.pop("labels")

        # compute the loss
        loss = loss_fct(logits.view(-1), labels.float().view(-1))
        
        # return the loss and logits if return_outputs is True
        # BEWARE: this format will help to solve bug in compute_metrics function
        # where usually its return mistmatch error between input and labels
        return (loss, (loss, logits)) if return_outputs else loss

# create the trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# start training
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/99 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

Size of preds: 1393
Size of labels: 1393
{'eval_loss': 0.0495595782995224, 'eval_accuracy': 0.9885139985642498, 'eval_runtime': 3.843, 'eval_samples_per_second': 362.48, 'eval_steps_per_second': 22.899, 'epoch': 1.0}


  0%|          | 0/88 [00:00<?, ?it/s]

Size of preds: 1393
Size of labels: 1393
{'eval_loss': 0.036654483526945114, 'eval_accuracy': 0.990667623833453, 'eval_runtime': 3.813, 'eval_samples_per_second': 365.328, 'eval_steps_per_second': 23.079, 'epoch': 2.0}


  0%|          | 0/88 [00:00<?, ?it/s]

Size of preds: 1393
Size of labels: 1393
{'eval_loss': 0.03664017468690872, 'eval_accuracy': 0.990667623833453, 'eval_runtime': 3.5616, 'eval_samples_per_second': 391.114, 'eval_steps_per_second': 24.708, 'epoch': 3.0}
{'train_runtime': 40.9376, 'train_samples_per_second': 306.246, 'train_steps_per_second': 2.418, 'train_loss': 0.062381301263366086, 'epoch': 3.0}


TrainOutput(global_step=99, training_loss=0.062381301263366086, metrics={'train_runtime': 40.9376, 'train_samples_per_second': 306.246, 'train_steps_per_second': 2.418, 'train_loss': 0.062381301263366086, 'epoch': 3.0})

In [11]:
# evaluate the model
results = trainer.evaluate()
print(f"Accuracy: {results['eval_accuracy']}")

for k, v in results.items():
    print(f'{k}: {v:.4f}')

  0%|          | 0/88 [00:00<?, ?it/s]

Size of preds: 1393
Size of labels: 1393
Accuracy: 0.990667623833453
eval_loss: 0.0366
eval_accuracy: 0.9907
eval_runtime: 3.8402
eval_samples_per_second: 362.7420
eval_steps_per_second: 22.9150
epoch: 3.0000


In [12]:
# show prediction
predict = trainer.predict(test_dataset)
predict.label_ids[:100]

  0%|          | 0/88 [00:00<?, ?it/s]

Size of preds: 1393
Size of labels: 1393


array([0., 1., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1.,
       1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)

In [13]:
# helper function to clean-up memory
import torch, gc

def clean_up():
    gc.collect()
    torch.cuda.empty_cache()

clean_up()