Fine-tuning BERT on each dataset

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m36.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m63.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m65.2 MB/s[0m eta [36m0:00:00[0m
Col

In [None]:
# import libraries
import numpy as np
import torch
from tqdm import tqdm
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, f1_score, balanced_accuracy_score, confusion_matrix, classification_report
from transformers import BertTokenizer, BertForSequenceClassification, AutoModelForSequenceClassification, AutoConfig, AutoTokenizer, BertConfig, get_linear_schedule_with_warmup
from torch.optim import AdamW
from google.colab import drive
# Mount Google Drive; the dataset CSV is read from /content/drive/MyDrive in a later cell.
drive.mount('/content/drive')

# Seed the random number generators from a single constant so runs are repeatable.
# torch.manual_seed is deliberately not the last statement: it returns a Generator
# object that would otherwise be echoed as the cell's output.
SEED = 56
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

Mounted at /content/drive


<torch._C.Generator at 0x78c3bc2a1070>

In [None]:
# Pick the compute device: use the GPU when one is available (to speed up
# training), otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')

print('Using device:', device)

Using device: cuda


In [None]:
# Load the dataset to fine-tune on.
# Only the YouTube corpus is read here; the commented lines below are the
# alternative corpora and can be re-enabled to work with a different dataset.

# sms = pd.read_csv('/content/drive/MyDrive/spam_detection/sms_translate.csv') # sms spam dataset

# enron = pd.read_csv('/content/drive/MyDrive/spam_detection/enron_full.csv')

youtube_csv_path = '/content/drive/MyDrive/spam_detection/youtube_translate.csv'
youtube = pd.read_csv(youtube_csv_path)


In [None]:
# Select the text column for the chosen language and build stratified
# train / validation / test splits.

language = 'en' # language of the messages ('en' selects the Message column)
data = youtube # dataset to work on

# 'Message' holds the text used for English; 'gtrans_el' is used for any other
# language setting (presumably the Greek translation of the message - TODO confirm).
text_column = 'Message' if language == 'en' else 'gtrans_el'
X = data[text_column]
y = data.Category.values


# 60:20:20 split: hold out 20% for testing, then take 25% of the remaining
# 80% (= 20% of the whole dataset) for validation. Both splits are stratified on the label.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=56)
x_train, x_valid, y_train, y_valid = train_test_split(
    Xtrain, ytrain, test_size=0.25, stratify=ytrain, random_state=56)

In [None]:
# Load the tokenizer and the 2-label sequence-classification model that match
# `language`, then move the model to the selected device.
if language != 'en':

  # Greek BERT for classification
  checkpoint = "nlpaueb/bert-base-greek-uncased-v1"
  tokenizer = AutoTokenizer.from_pretrained(checkpoint)
  model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

else:

  # English: the 12-layer BERT base model with an uncased vocabulary
  checkpoint = 'bert-base-uncased'
  tokenizer = BertTokenizer.from_pretrained(checkpoint)
  model = BertForSequenceClassification.from_pretrained(
      checkpoint,
      num_labels = 2, # two output labels: binary classification
      output_attentions = False, # do not return attention weights
      output_hidden_states = False, # do not return all hidden states
  )

model = model.to(device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# encode the input texts so that they are compatible with the BERT model
# encode the train and test data and store their representations in dataloaders

def train_test(Xtrain,Xtest,ytrain,ytest,batch_size,max_length=50):
  """Tokenize two text splits and wrap them in DataLoaders.

  Uses the notebook-level `tokenizer`. Every text is padded or truncated to
  exactly `max_length` tokens (special tokens included).

  Args:
    Xtrain: texts of the first (training) split; a pandas Series or any
      iterable of strings.
    Xtest: texts of the second (evaluation) split, same type as Xtrain.
    ytrain: labels for Xtrain, convertible with torch.tensor.
    ytest: labels for Xtest, convertible with torch.tensor.
    batch_size: batch size used by both DataLoaders.
    max_length: token length every sequence is padded/truncated to
      (default 50, the value previously hard-coded).

  Returns:
    (dataloader_train, dataloader_test). Each batch is a tuple
    (input_ids, attention_mask, labels). The training loader is shuffled,
    the evaluation loader keeps the original order.
  """

  def _to_dataset(texts, labels):
    # Encode one split and bundle ids, attention mask and labels together.
    texts = texts.tolist() if hasattr(texts, 'tolist') else list(texts)
    encoded = tokenizer.batch_encode_plus(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    return TensorDataset(encoded['input_ids'], encoded['attention_mask'], torch.tensor(labels))

  data_train = _to_dataset(Xtrain, ytrain)
  data_test = _to_dataset(Xtest, ytest)

  # Reshuffle the training samples every epoch so the model does not see the
  # same batches in the same order each time.
  dataloader_train = DataLoader(data_train, batch_size=batch_size, shuffle=True)

  # Keep evaluation order fixed so predictions line up with the label array.
  dataloader_test = DataLoader(data_test, batch_size=batch_size, shuffle=False)

  return dataloader_train, dataloader_test


In [None]:
# take the logits (tensors) and pass them through a softmax layer, then turn them into predictions that are stored in a numpy array

def get_predictions(logits):
  """Turn a batch of classification logits into predicted class indices.

  Args:
    logits: torch tensor with one row per sample and one column per class
      (softmax and argmax are both taken along dimension 1).

  Returns:
    1-D numpy integer array holding the index of the most probable class
    for each sample.
  """
  prob_softmax = F.softmax(logits, dim=1)
  # Take the argmax in torch and convert explicitly, rather than handing a
  # tensor to np.argmax: this also works for tensors that live on the GPU or
  # that still require grad.
  return torch.argmax(prob_softmax, dim=1).detach().cpu().numpy()


In [None]:
# compute the class weights

# w_j = n_samples / (n_classes * n_samples_j), for classes j = 0, 1


def compute_class_weights(ytrain):
  """Return balanced class weights (w0, w1) for the binary labels 0 and 1.

  Each weight is n_samples / (2 * n_samples_in_class), so the less frequent
  class receives the larger weight.

  Args:
    ytrain: sized iterable of labels; entries equal to 0 and 1 are counted.

  Returns:
    Tuple (w0, w1) of floats.

  Raises:
    ZeroDivisionError: if class 0 or class 1 does not occur in ytrain.
  """
  n_samples = len(ytrain)
  n_class0 = sum(1 for label in ytrain if label == 0)
  n_class1 = sum(1 for label in ytrain if label == 1)

  return n_samples / (2 * n_class0), n_samples / (2 * n_class1)

In [None]:
# # fine tuning hyperparameters: num_epochs and learning_rate by evaluating on validation set

# batch_size = 32 # for training
# epochs = 4 # num of epochs to train
# warm_up = 0.02

# dataloader_train, dataloader_valid = train_test(x_train,x_valid,y_train,y_valid,batch_size)
# w0,w1 = compute_class_weights(y_train)
# weights = torch.tensor([w0, w1]).to(device)

# # define optimizer and scheduler

# # applying weight decay to all trainable parameters except bias and normalization layer weights
# no_decay = ['bias', 'LayerNorm.weight']
# optimizer_grouped_parameters = [
#     {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
#     {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
# ]
# optimizer = AdamW(optimizer_grouped_parameters, lr=4e-5)

# scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps=warm_up*epochs*len(dataloader_train), num_training_steps = epochs*len(dataloader_train))


# # metrics per epoch

# Loss_train = []


# f1_valid = []
# accuracy_valid = []
# Loss_valid = []


# for epoch in range(epochs):

#   model.train() # set model to training mode
#   train_loss = 0 # accumulate loss for every batch per epoch

#   # training loop
#   for step,batch in enumerate(tqdm(dataloader_train)):

#     input_ids = batch[0].to(device)
#     attention_mask = batch[1].to(device)
#     labels = batch[2].to(device)
#     model.zero_grad() # clear gradients
#     outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
#     # # weighted loss for class imbalance
#     criterion = torch.nn.CrossEntropyLoss(weight=weights,reduction='mean')
#     batch_loss = criterion(outputs.logits, labels)
#     # batch_loss = outputs.loss # loss for the batch if we dont have weighted loss function
#     train_loss += batch_loss
#     batch_loss.backward()
#     # update parameters
#     optimizer.step()
#     # Update the learning rate.
#     scheduler.step()


#   Loss_train.append(train_loss/len(dataloader_train))# compute the average loss for all the batches in epoch

#   model.eval() # set model to evaluation mode
#   valid_loss = 0 # accumulate loss for every batch
#   all_logits = [] # store logits of every batch to pass them all into function get predictions and take the predictions overall

#   # evaluation
#   for step,batch in enumerate(dataloader_valid):

#     input_ids = batch[0].to(device)
#     attention_mask = batch[1].to(device)
#     labels = batch[2].to(device)

#     with torch.no_grad():
#       outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
#     batch_loss = outputs.loss # loss for the batch
#     all_logits.append(outputs.logits.cpu())
#     valid_loss += batch_loss
#     # del  input_ids, attention_mask, labels, outputs

#   all_logits = torch.cat(all_logits, dim=0)
#   pred = get_predictions(all_logits)
#   valid_loss = (valid_loss/len(dataloader_valid)) # compute the average loss for all the batches in epoch

#    # validation metrics
#   Loss_valid.append(valid_loss)
#   accuracy_valid.append(accuracy_score(y_valid,pred))
#   f1_valid.append(f1_score(y_valid, pred, average='macro'))



# # learning curves

# epoch = [c for c in range(1,epochs+1)]

# validation_loss = [loss.cpu() for loss in Loss_valid]
# training_loss = [tensor.detach().cpu() for tensor in Loss_train]

# # plot learning curve
# plt.figure()
# plt.title('Loss')
# plt.plot(epoch,validation_loss,color='orange',label='validation')
# plt.plot(epoch,training_loss,color='blue',label='train')
# plt.xlabel('# of epochs')
# plt.xticks(epoch)
# plt.legend(['val_loss', 'loss'])
# plt.show()

# optimal_epochs = np.argmin(validation_loss) + 1
# print("optimal number of epochs found = " +str(optimal_epochs)+" with training Loss = "+str(training_loss[optimal_epochs-1])+" and validation Loss = "+str(validation_loss[optimal_epochs-1]))


In [None]:
# Final run: train with the hyperparameters selected on the validation split,
# using all available training data (train + validation = Xtrain/ytrain), then
# evaluate once on the held-out test set.
#
# NOTE: this cell fine-tunes whatever `model` currently holds; it does not
# reload the pretrained checkpoint itself. If `model` has already been trained
# in this kernel (e.g. by the hyperparameter-search cell), re-run the
# model-loading cell first so training starts from the pretrained weights.

optimal_epochs = 3 # number of epochs selected on the validation set
batch_size = 32
learning_rate = 4e-5
warm_up = 0.0 # fraction of the total training steps used for linear warm-up
use_weighted_loss = False # True: class-weighted cross entropy, False: the model's own loss

dataloader_train, dataloader_test = train_test(Xtrain,Xtest,ytrain,ytest,batch_size)

# class weights, only used by the loss when use_weighted_loss is True
w0,w1 = compute_class_weights(ytrain)
weights = torch.tensor([w0, w1]).to(device)
criterion = torch.nn.CrossEntropyLoss(weight=weights,reduction='mean') if use_weighted_loss else None

# define optimizer and scheduler

# apply weight decay to all trainable parameters except biases and normalization layer weights
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)

# new scheduler sized for training on all available training data
total_steps = optimal_epochs*len(dataloader_train)
scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps=int(warm_up*total_steps), num_training_steps = total_steps)


model.train() # set model to training mode

for epoch in range(optimal_epochs):

  # training loop
  for batch in tqdm(dataloader_train):

    input_ids = batch[0].to(device)
    attention_mask = batch[1].to(device)
    labels = batch[2].to(device)
    optimizer.zero_grad() # clear gradients left over from the previous step
    outputs = model(input_ids, attention_mask=attention_mask, labels=labels) # forward pass
    if criterion is not None:
      batch_loss = criterion(outputs.logits, labels) # weighted loss for class imbalance
    else:
      batch_loss = outputs.loss # unweighted loss computed by the model
    batch_loss.backward() # compute gradients of the loss with respect to all parameters
    optimizer.step() # update parameters
    scheduler.step() # update the learning rate



# evaluation of model

model.eval() # set model to evaluation mode
test_loss = 0.0 # sum of per-sample losses, kept as a plain Python float
n_test_samples = 0
all_logits = [] # logits of every batch, concatenated below to get the predictions for the whole test set

with torch.no_grad():
  for batch in dataloader_test:

    input_ids = batch[0].to(device)
    attention_mask = batch[1].to(device)
    labels = batch[2].to(device)

    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    # outputs.loss is the mean over the batch; weight it by the batch size so a
    # smaller last batch does not skew the average. .item() moves the value off
    # the GPU so the printed result is a number rather than a tensor repr.
    test_loss += outputs.loss.item()*labels.size(0)
    n_test_samples += labels.size(0)
    all_logits.append(outputs.logits.cpu())


all_logits = torch.cat(all_logits, dim=0)
pred = get_predictions(all_logits)
test_loss = test_loss/n_test_samples # average loss per test sample
print("results:")
print("Loss is "+str(round(test_loss,4)))
print("Classification report:\n\n"+str(classification_report(ytest,pred,target_names=['ham','spam'])))
print("accuracy is "+str(round(accuracy_score(ytest,pred),4)))
print("balanced accuracy is "+str(round(balanced_accuracy_score(ytest,pred),4)))
print("f1 macro is "+str(round(f1_score(ytest, pred, average='macro'),4))+"\n")
print("confusion matrix"+str(confusion_matrix(ytest, pred))+"\n\n") # [[TN FP],[FN TP]]

100%|██████████| 42/42 [00:13<00:00,  3.03it/s]
100%|██████████| 42/42 [00:10<00:00,  4.10it/s]
100%|██████████| 42/42 [00:10<00:00,  4.09it/s]


results:
Loss is tensor(0.1971, device='cuda:0')
Classification report:

              precision    recall  f1-score   support

         ham       0.93      0.97      0.95       176
        spam       0.96      0.92      0.94       153

    accuracy                           0.95       329
   macro avg       0.95      0.94      0.94       329
weighted avg       0.95      0.95      0.95       329

accuracy is 0.9453
balanced accuracy is 0.9437
f1 macro is 0.9449

confusion matrix[[170   6]
 [ 12 141]]




In [None]:
# torch.save(model.state_dict(),'/content/drive/My Drive/spam_detection/fine_tuned_models/Greek_BERT_youtube.pth')