<a href="https://colab.research.google.com/github/prateekjoshi565/Fine-Tuning-BERT/blob/master/Fine_Tuning_BERT_for_Spam_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [64]:
'''
credit: https://github.com/prateekjoshi565/Fine-Tuning-BERT
'''
import numpy as np
import torch
import torch.nn as nn
from sklearn.metrics import r2_score
from transformers import AutoModel, AutoTokenizer, BertTokenizerFast
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
# specify GPU
device = torch.device("cuda")
aggregation_method = 'min'
dim = 4096

# Load Dataset

In [65]:
from typing import List, Union
from datasets import load_dataset

def load_raw_dataset(train_files: Union[List[str], str]):
    """ load raw dataset """
    if isinstance(train_files, str):
        train_files = [train_files]
    processed_datasets = load_dataset(
        "json",
        data_files=train_files,
    )
    return processed_datasets

In [66]:
all_data = load_raw_dataset(f"../data/filter/dolly/{aggregation_method}_dim{dim}_all.jsonl")['train']

In [67]:
def split_dataset(dataset, test_size=0.2, val_size=0.1, seed=0):
    # Split the dataset into train and test
    train_test_splits = dataset.train_test_split(test_size=test_size, seed=seed)
    # Split the train set into train and validation
    train_val_splits = train_test_splits['train'].train_test_split(test_size=val_size, seed=seed)
    
    return train_val_splits['train'], train_val_splits['test'], train_test_splits['test']

train_data, val_data, test_data = split_dataset(all_data)

train_data.shape, val_data.shape, test_data.shape

((720, 3), (80, 3), (200, 3))

# Import BERT Model and BERT Tokenizer

In [68]:
# import BERT-base pretrained model
bert = AutoModel.from_pretrained('bert-base-uncased', return_dict=False)

# Load the BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# gte = AutoModel.from_pretrained('thenlper/gte-small', return_dict=False)
# tokenizer = AutoTokenizer.from_pretrained('thenlper/gte-small')



# Tokenization

In [69]:
def unfold_QA(data):
    # Create a new dataset with the unfolded QA format
    new_dataset = []
    for text_entry in data:
        unfolded_text_entry = ''
        for text_pair in text_entry['messages']:
            unfolded_text_entry += text_pair['role'] + ': ' + text_pair['content'] + ' '
        new_dataset.append(unfolded_text_entry)
    return new_dataset

In [70]:
max_seq_len = 512
# tokenize and encode sequences in the training set
tokens_train = tokenizer.batch_encode_plus(
    unfold_QA(train_data['text']),
    max_length = max_seq_len,
    pad_to_max_length=True,
    truncation=True,
    return_token_type_ids=False
)

# tokenize and encode sequences in the validation set
tokens_val = tokenizer.batch_encode_plus(
    unfold_QA(val_data['text']),
    max_length = max_seq_len,
    pad_to_max_length=True,
    truncation=True,
    return_token_type_ids=False
)

# tokenize and encode sequences in the test set
tokens_test = tokenizer.batch_encode_plus(
    unfold_QA(test_data['text']),
    max_length = max_seq_len,
    pad_to_max_length=True,
    truncation=True,
    return_token_type_ids=False
)



# Convert Integer Sequences to Tensors

In [71]:
def get_labels(data):
    return [text_entry['cos'] for text_entry in data]

In [72]:
# for train set
train_seq = torch.tensor(tokens_train['input_ids'])
train_mask = torch.tensor(tokens_train['attention_mask'])
train_y = torch.tensor(get_labels(train_data)).reshape(-1, 1)

# for validation set
val_seq = torch.tensor(tokens_val['input_ids'])
val_mask = torch.tensor(tokens_val['attention_mask'])
val_y = torch.tensor(get_labels(val_data)).reshape(-1, 1)

# for test set
test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])
test_y = torch.tensor(get_labels(test_data)).reshape(-1, 1)

# Create DataLoaders

In [73]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

#define a batch size
batch_size = 32

# wrap tensors
train_data = TensorDataset(train_seq, train_mask, train_y)

# sampler for sampling the data during training
train_sampler = RandomSampler(train_data)

# dataLoader for train set
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# wrap tensors
val_data = TensorDataset(val_seq, val_mask, val_y)

# sampler for sampling the data during training
val_sampler = SequentialSampler(val_data)

# dataLoader for validation set
val_dataloader = DataLoader(val_data, sampler = val_sampler, batch_size=batch_size)

# Freeze BERT Parameters

In [74]:
text_embedder = bert
# freeze all the parameters
for param in text_embedder.parameters():
    param.requires_grad = False

# Define Model Architecture

In [75]:
class Model_Arch(nn.Module):

    def __init__(self, text_embedder):
      
      super(Model_Arch, self).__init__()

      self.text_embedder = text_embedder 
      
      # dropout layer
      self.dropout = nn.Dropout(0.1)
      
      # relu activation function
      self.relu =  nn.ReLU()

      # dense layer 1
      self.fc1 = nn.Linear(768,512)
      
      # dense layer 2 (Output layer)
      self.fc2 = nn.Linear(512,1)

      # self.fc1 = nn.Linear(384, 3) # for GTE-small

      # #softmax activation function
      # self.softmax = nn.LogSoftmax(dim=1)

    #define the forward pass
    def forward(self, sent_id, mask):

      #pass the inputs to the model  
      _, cls_hs = self.text_embedder(sent_id, attention_mask=mask, return_dict=False)
      # print(cls_hs.shape)
      
      x = self.fc1(cls_hs) # GTE-small

      x = self.relu(x)

      x = self.dropout(x)

      # output layer
      x = self.fc2(x)
      
      # # apply softmax activation
      # x = self.softmax(x)

      return x

In [76]:

# pass the pre-trained BERT to our define architecture
model = Model_Arch(text_embedder)

# push the model to GPU
model = model.to(device)

In [77]:
# optimizer from hugging face transformers
from transformers import AdamW

# define the optimizer
optimizer = AdamW(model.parameters(), lr = 1e-3)



# Define Tartget Function

In [78]:

# loss function
mse  = nn.MSELoss() 

# number of training epochs
epochs = 10

# Fine-Tune BERT

## TRAIN

In [79]:
# function to train the model
def train():
  
  model.train()

  total_loss, total_accuracy = 0, 0
  
  # empty list to save model predictions
  total_preds=[]
  
  # iterate over batches
  for step,batch in enumerate(train_dataloader):
    
    # progress update after every 50 batches.
    if step % 50 == 0 and not step == 0:
      print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))

    # push the batch to gpu
    batch = [r.to(device) for r in batch]
 
    sent_id, mask, labels = batch

    # clear previously calculated gradients 
    model.zero_grad()        

    # get model predictions for the current batch
    preds = model(sent_id, mask)

    # compute the loss between actual and predicted values
    loss = mse(preds, labels)

    # add on to the total loss
    total_loss = total_loss + loss.item()

    # backward pass to calculate the gradients
    loss.backward()

    # clip the the gradients to 1.0. It helps in preventing the exploding gradient problem
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    # update parameters
    optimizer.step()

    # model predictions are stored on GPU. So, push it to CPU
    preds=preds.detach().cpu().numpy()

    # append the model predictions
    total_preds.append(preds)

  # compute the training loss of the epoch
  avg_loss = total_loss / len(train_dataloader)
  
  # predictions are in the form of (no. of batches, size of batch, no. of classes).
  # reshape the predictions in form of (number of samples, no. of classes)
  total_preds  = np.concatenate(total_preds, axis=0)

  #returns the loss and predictions
  return avg_loss, total_preds

## TEST

In [80]:
# function for evaluating the model
def evaluate():
  
  print("\nEvaluating...")
  
  # deactivate dropout layers
  model.eval()

  total_loss, total_accuracy = 0, 0
  
  # empty list to save the model predictions
  total_preds = []

  # iterate over batches
  for step,batch in enumerate(val_dataloader):
    
    # Progress update every 50 batches.
    if step % 50 == 0 and not step == 0:
      
      # # Calculate elapsed time in minutes.
      # elapsed = format_time(time.time() - t0)
            
      # Report progress.
      print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(val_dataloader)))

    # push the batch to gpu
    batch = [t.to(device) for t in batch]

    sent_id, mask, labels = batch

    # deactivate autograd
    with torch.no_grad():
      
      # model predictions
      preds = model(sent_id, mask)

      # compute the validation loss between actual and predicted values
      loss = mse(preds,labels)

      total_loss = total_loss + loss.item()

      preds = preds.detach().cpu().numpy()

      total_preds.append(preds)

  # compute the validation loss of the epoch
  avg_loss = total_loss / len(val_dataloader) 

  # reshape the predictions in form of (number of samples, no. of classes)
  total_preds  = np.concatenate(total_preds, axis=0)

  return avg_loss, total_preds

# Start Model Training

In [81]:
# set initial loss to infinite
best_valid_loss = float('inf')

# empty lists to store training and validation loss of each epoch
train_losses=[]
valid_losses=[]

#for each epoch
for epoch in range(epochs):
     
    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))
    
    #train model
    train_loss, _ = train()
    
    #evaluate model
    valid_loss, _ = evaluate()
    
    #save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_temp_regression_weights.pt')
    
    # append training and validation loss
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)
    
    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')


 Epoch 1 / 10



Evaluating...

Training Loss: 0.347
Validation Loss: 0.013

 Epoch 2 / 10

Evaluating...

Training Loss: 0.018
Validation Loss: 0.002

 Epoch 3 / 10

Evaluating...

Training Loss: 0.014
Validation Loss: 0.002

 Epoch 4 / 10

Evaluating...

Training Loss: 0.009
Validation Loss: 0.000

 Epoch 5 / 10

Evaluating...

Training Loss: 0.001
Validation Loss: 0.001

 Epoch 6 / 10

Evaluating...

Training Loss: 0.002
Validation Loss: 0.016

 Epoch 7 / 10

Evaluating...

Training Loss: 0.000
Validation Loss: 0.022

 Epoch 8 / 10

Evaluating...

Training Loss: 0.001
Validation Loss: 0.009

 Epoch 9 / 10

Evaluating...

Training Loss: 0.001
Validation Loss: 0.000

 Epoch 10 / 10

Evaluating...

Training Loss: 0.002
Validation Loss: 0.000


# Load Saved Model

In [82]:
#load weights of best model
path = 'saved_temp_regression_weights.pt'
model.load_state_dict(torch.load(path))

<All keys matched successfully>

# Get Predictions for Test Data

In [83]:
# get predictions for test data
with torch.no_grad():
  preds = model(test_seq.to(device), test_mask.to(device))
  preds = preds.detach().cpu().numpy()

In [84]:
# model's performance
preds = np.argmax(preds, axis = 1)
print(r2_score(test_y, preds))

-0.001482513210184777
