# Fine-tuning a pre-trained BERT model for classification using native PyTorch
Competition: [Covid-19 tweet classification](https://zindi.africa/competitions/covid-19-tweet-classification)

## Installing libraries

In [None]:
# %pip install -r requirements.txt

## Importing Libraries

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification
from transformers import AutoTokenizer
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

## Reading the data
Substitute the file paths with the paths for your own data.

In [None]:
# Load the training frame (it carries the 'target' column used below) and the
# competition test frame from their CSV files.
train_csv_path = './data/Train.csv'
test_csv_path = './data/Test.csv'
data = pd.read_csv(train_csv_path)
test_data = pd.read_csv(test_csv_path)

In [None]:
# Preview the first rows of the training frame.
data.head()

In [None]:
# Count how many rows fall under each value of the 'target' column.
data['target'].value_counts()

## Splitting the data
The labelled data is split 70/30:
- Train size = 70% of the total size
- Test (held-out validation) size = 30% of the total size

In [None]:
# Shuffle the rows and keep 70% for training; the fixed random_state makes
# the split repeatable across runs.
train_X, test_X, train_Y, test_Y = train_test_split(
    data['text'],
    data['target'],
    train_size=0.7,
    shuffle=True,
    random_state=42,
)

## Data Preparation

Initialize the tokenizer and pass the text data to get tokens that can be passed to the BERT model.

In [None]:
# Load the tokenizer for the "bert-base-cased" checkpoint (the same checkpoint
# name is used for the model further down).
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [None]:
# Both splits are tokenized with the same padding/truncation settings.
tokenizer_kwargs = {"padding": True, "truncation": True}
train_tokens = tokenizer(list(train_X), **tokenizer_kwargs)
test_tokens = tokenizer(list(test_X), **tokenizer_kwargs)

In [None]:
# List the fields the tokenizer returned for the training split.
train_tokens.keys()

In [None]:
# Show the token ids of the first training example, then the text those ids
# decode back to.
print(train_tokens['input_ids'][0])
print(tokenizer.decode(train_tokens['input_ids'][0]))

In [None]:
# Show the attention mask of the first training example.
print(train_tokens['attention_mask'][0])

Create a custom Dataset class

In [None]:
class TokenData(Dataset):
    """Dataset over the pre-tokenized training or held-out split.

    The constructor reads the notebook-level ``train_X`` / ``train_tokens`` /
    ``train_Y`` variables (or their ``test_*`` counterparts), so those must
    exist before an instance is created.

    Args:
        train: when truthy, wrap the training split; otherwise wrap the
            held-out split.
    """

    def __init__(self, train = False):
        # Only the selected tuple is evaluated, so the other split's
        # variables are never touched.
        texts, encodings, targets = (
            (train_X, train_tokens, train_Y) if train
            else (test_X, test_tokens, test_Y)
        )
        self.text_data = texts
        self.tokens = encodings
        self.labels = list(targets)

    def __len__(self):
        """Return the number of texts in the wrapped split."""
        return len(self.text_data)

    def __getitem__(self, idx):
        """Return every tokenizer field plus ``'labels'`` for row ``idx`` as tensors."""
        item = {
            field: torch.tensor(values[idx])
            for field, values in self.tokens.items()
        }
        item['labels'] = torch.tensor(self.labels[idx])
        return item

Set the batch size, create `Dataset` objects for the training and testing data, and wrap each of them in a `DataLoader`.

In [None]:
# Number of examples per batch for both loaders.
batch_size = 40

# Training batches are drawn in a fresh random order every epoch.
train_dataset = TokenData(train = True)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)

# Accuracy and loss do not depend on the order in which samples are visited,
# so the held-out loader iterates in a fixed order instead of shuffling.
test_dataset = TokenData(train = False)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

Iterating through the train data loader

In [None]:
# train_iter = iter(train_loader)
# sample = next(train_iter)
# print(sample.items())

## BERT Model, Optimizer Function, and Loss Function

We will declare the model, the optimizer function used to optimize the model, and the loss function that is to be minimized as part of the training phase.

In [None]:
# configuration = BertConfig(hidden_dropout_prob=0.3, num_hidden_layers = 12, attention_probs_dropout_prob = 0.4)
# Load the 'bert-base-cased' checkpoint wrapped for sequence classification.
# NOTE(review): no num_labels is passed here, so the size of the
# classification head comes from the library/checkpoint default — confirm it
# matches the number of distinct 'target' values.
bert_model = BertForSequenceClassification.from_pretrained('bert-base-cased')

In [None]:
# AdamW updates every parameter of the model with a single learning rate.
learning_rate = 1e-5
optimizer = AdamW(bert_model.parameters(), lr=learning_rate)

In [None]:
# Cross-entropy between the model's logits and the integer class labels,
# created with the library's default settings.
loss_fn = torch.nn.CrossEntropyLoss()

## Training and testing blocks

In [None]:
# Number of full passes over the training loader.
num_epochs = 10

# Pick the compute device: CUDA first, then Apple's MPS backend, else the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

In [None]:
# Move the model's parameters onto the selected device (GPU if one was found,
# otherwise the CPU).
bert_model.to(device) # Transfer model to GPU if available

In [None]:
# Fine-tune for num_epochs epochs; after each epoch, measure accuracy on the
# held-out loader. `correct` is reset every epoch, so after the loop it holds
# the number of correct held-out predictions from the final epoch.
for epoch in range(num_epochs):
    print("Epoch: ",(epoch + 1))

    # TRAINING BLOCK STARTS
    bert_model.train()
    train_loss_total = 0.0  # sum of per-sample losses seen this epoch
    train_seen = 0          # number of training samples seen this epoch
    for i, batch in enumerate(train_loader):
        batch = {k: v.to(device) for k, v in batch.items()}

        # Clear the gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass; labels are not passed, the loss is computed below
        outputs = bert_model(input_ids = batch['input_ids'], attention_mask = batch['attention_mask'])

        # The logits are used for measuring the loss
        pred = outputs.logits
        loss = loss_fn(pred, batch['labels'])

        # Backpropagate and update the model parameters
        loss.backward()
        optimizer.step()

        # loss_fn is CrossEntropyLoss with its default 'mean' reduction, so
        # loss.item() is already the per-sample mean of this batch; dividing
        # it by batch_size again would under-report the loss.
        train_batch_loss = loss.item()
        train_last_loss = train_batch_loss

        # Weight by the real batch size so a smaller final batch does not
        # skew the epoch average.
        samples_in_batch = batch['labels'].size(0)
        train_loss_total += train_batch_loss * samples_in_batch
        train_seen += samples_in_batch

        print('Training batch {}/{} last loss: {}'.format(i + 1, len(train_loader), train_last_loss), end="\r")

    # Logging the epoch-wise training loss (mean over every sample seen)
    print("")
    train_epoch_loss = train_loss_total / train_seen if train_seen else float("nan")
    print(f"\nTraining epoch {epoch + 1} loss: ", train_epoch_loss)
    # TRAINING BLOCK ENDS

    # TESTING BLOCK STARTS
    bert_model.eval()
    correct = 0
    test_seen = 0  # number of held-out samples evaluated so far
    # Not populated by this loop; kept in case other cells reference it.
    test_pred = []
    for i, batch in enumerate(test_loader):
        batch = {k: v.to(device) for k, v in batch.items()}

        # We don't need gradients for testing
        with torch.no_grad():
            outputs = bert_model(input_ids = batch['input_ids'], attention_mask = batch['attention_mask'])

            # Logits act as predictions
            logits = outputs.logits

            # Mean per-sample loss of this batch (see the note in the training block)
            loss = loss_fn(logits, batch['labels'])

        test_batch_loss = loss.item()
        test_last_loss = test_batch_loss
        # print('Testing batch {} loss: {}'.format(i + 1, test_last_loss))

        # Comparing the predicted class with the labels in the batch
        correct += (logits.argmax(1) == batch['labels']).sum().item()

        # Divide by the samples actually seen: (i + 1) * batch_size would
        # over-count when the last batch is smaller than batch_size.
        test_seen += batch['labels'].size(0)
        print("Testing accuracy: ",correct/test_seen, end="\r")

    print("")
    # TESTING BLOCK ENDS

In [None]:
# `correct` is reset at the start of every epoch's testing block, so this is
# the accuracy of the final epoch over the whole held-out split.
validation_accuracy = correct / len(test_X)
print('Validation accuracy: ', validation_accuracy)

Saving the model

In [None]:
import os

# Make sure the target directory exists before writing the checkpoint;
# saving into a missing ./model directory would raise instead of creating it.
os.makedirs("./model", exist_ok=True)

# Only the parameters (state dict) are stored, not the full model object.
torch.save(bert_model.state_dict(), "./model/model.pt")

## Testing the model (Optional)

In [None]:
# Display the competition test frame loaded at the top of the notebook.
test_data

Tokenizing test data

In [None]:
# Tokenize the competition test texts with the same padding/truncation
# switches used for the training data.
submission_texts = list(test_data['text'])
test_data_tokens = tokenizer(submission_texts, padding=True, truncation=True)

# Number of tokenized examples.
len(test_data_tokens['input_ids'])

Preparing Dataset class for test data

In [None]:
class TestData(Dataset):
    """Label-free dataset over the tokenized competition test texts.

    The constructor reads the notebook-level ``test_data`` frame and
    ``test_data_tokens`` encoding, so both must exist before instantiation.
    """

    def __init__(self):
        self.tokens = test_data_tokens
        self.text_data = test_data['text']

    def __len__(self):
        """Return the number of test texts."""
        return len(self.text_data)

    def __getitem__(self, idx):
        """Return every tokenizer field for row ``idx`` as a tensor."""
        return {
            field: torch.tensor(values[idx])
            for field, values in self.tokens.items()
        }

Declaring an object for test dataset and test data loader

In [None]:
# Wrap the competition test set in a loader that yields one example per batch;
# no shuffling is requested.
test_data_dataset = TestData()
test_data_loader = DataLoader(dataset=test_data_dataset, batch_size=1)

Loading the saved (trained) model 

In [None]:
# map_location remaps the stored tensors onto the device selected for this
# session, so a checkpoint written on a GPU still loads on a machine that
# lacks that GPU.
weights = torch.load("./model/model.pt", map_location=device)
bert_model.load_state_dict(weights)
bert_model.to(device)

Preparing the submission file

In [None]:
# Score every competition test example and keep the softmax probability of
# class index 1 as a plain Python float.
bert_model.eval()
result = []
with torch.no_grad():  # inference only, no gradients needed
    for batch in test_data_loader:
        inputs = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = bert_model(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
        )
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=1).type(torch.float)
        # Column 1 holds the probability of class index 1 for each row.
        result.extend(probabilities[:, 1].tolist())

In [None]:
# Pair each test ID with its predicted probability; 'ID' comes first so the
# column order is ID, target.
result_df = pd.DataFrame({
    'ID': test_data['ID'],
    'target': result,
})

In [None]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
result_df.to_csv("./submission1.csv", index = False)