# Steps for Training the Batch Classification Model

### 1. Pull the data

>* Need to get the training and testing data produced by the preprocessing step

### 2. Build the Dataloader class

### 3. Build the Model

### 4. Build the Trainer Algorithm

### 5. Train the Model

### 6. Validate the Model

In [None]:
# Mount Google Drive under /content/gdrive; the data and model paths used
# later in this notebook all live below that mount point.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
def save_as_pickle(filename, data):
    """Serialize ``data`` to ``filename`` using pickle.

    The target is opened in binary write mode, so any existing file at that
    path is overwritten.
    """
    with open(filename, mode='wb') as out_file:
        pickle.dump(data, out_file)

In [None]:
!pip install transformers

In [None]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
 
import pandas as pd 
import transformers
from torch.utils.data import Dataset, DataLoader 


from transformers import AutoTokenizer, AutoModel
# Hugging Face checkpoint name; the same name is passed to the classifier
# when the model is instantiated further down.
sentenc_model_name = "bert-base-uncased"
# Tokenizer loaded from that checkpoint; handed to the dataset class.
tokenizer = AutoTokenizer.from_pretrained(sentenc_model_name) 

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np 
from tqdm.notebook import tqdm
 
import os
import pickle
import time

In [None]:
# Hyperparameters shared by the data-loading and optimizer cells.
MAX_LEN = 512  # forwarded to the tokenizer as max_length (pad/truncate target)
TRAIN_BATCH_SIZE = 4  # batch size placed in train_params
VALID_BATCH_SIZE = 4  # batch size placed in test_params
EPOCHS = 1  # NOTE(review): reassigned to 3 in the cell that defines train(); the later value is the one the training loop sees
LEARNING_RATE = 1e-05  # learning rate handed to the Adam optimizer

This is a manual way of pulling the training / testing files for this model. You can also use the list of files generated at the end of the "Batch_Model_Preprocessor" file. 

In [None]:
# Drive folder that holds the per-batch training/testing pickles.
folder_location = '/content/gdrive/MyDrive/Thesis/Data/BATCHClassTrainTest/'

# Filename prefixes of the batches to use; each prefix is expanded into
# '<prefix>_batch_training_bdf.pickle' and '<prefix>_batch_testing_bdf.pickle'.
file_numbers = ['2000', '4000', '6000', '8000', '16000', '18000', '19999']
# Alternative subset that leaves out the '2000' and '4000' batches:
#file_numbers = ['6000', '8000', '16000', '18000', '19999']

In [None]:
# Expand every batch prefix into its training and testing pickle path.
# Both lists follow the order of file_numbers, so index i in one list
# refers to the same batch as index i in the other.
list_of_training_files = [
    folder_location + number + '_batch_training_bdf.pickle'
    for number in file_numbers
]
list_of_testing_files = [
    folder_location + number + '_batch_testing_bdf.pickle'
    for number in file_numbers
]

Building the Data Loader Class

In [None]:
class BatchedCasesData(Dataset):
    """Dataset that tokenizes one document per dataframe row on access.

    Every row must expose ``input_sentences`` (the text) and
    ``contains_summ`` (the label) as attributes.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        """Keep the dataframe, the tokenizer and the padded sequence length."""
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Row count is captured once, at construction time.
        self.len = len(dataframe)

    def __len__(self):
        """Return the number of rows recorded when the dataset was built."""
        return self.len

    def __getitem__(self, index):
        """Tokenize the row at position ``index``.

        Returns:
            dict with ``doc_id`` (token ids), ``doc_mask`` (attention mask)
            and ``targets`` (the label wrapped in a one-element tensor),
            all of dtype ``torch.long``.
        """
        row = self.data.iloc[index]

        # Collapse every run of whitespace (newlines, tabs, ...) to one space.
        text = " ".join(str(row.input_sentences).split())

        encoded = self.tokenizer.batch_encode_plus(
            [text],
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True,
        )

        # The batch holds exactly one document, so take entry 0 of each field.
        token_ids = encoded['input_ids'][0]
        attention = encoded['attention_mask'][0]
        label = row.contains_summ

        return {
            'doc_id': torch.tensor(token_ids, dtype=torch.long),
            'doc_mask': torch.tensor(attention, dtype=torch.long),
            'targets': torch.tensor([label], dtype=torch.long),
        }

# Keyword arguments unpacked into the DataLoader for training batches.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)

# Keyword arguments unpacked into the DataLoader for validation batches.
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)

Building the Model

In [None]:
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Args:
        model_output: Indexable whose element 0 holds the per-token embeddings.
        attention_mask: Mask with one entry per token; positions set to 0
            contribute nothing to the average.

    Returns:
        One pooled vector per sequence (the token axis, dim 1, is reduced).
    """
    hidden_states = model_output[0]
    # Broadcast the mask over the embedding axis so it can weight each token.
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_total = (hidden_states * weights).sum(dim=1)
    # Clamp so a fully masked sequence divides by 1e-9 instead of zero.
    token_counts = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_total / token_counts


class BatchBertClass(torch.nn.Module):
    """Pretrained encoder plus a small head emitting one probability per document."""

    def __init__(self, model_name="bert-base-uncased", in_features=768):
        """Load the encoder named ``model_name`` and build the classifier head.

        Args:
            model_name: Checkpoint name passed to ``AutoModel.from_pretrained``.
            in_features: Input width of the first linear layer; it has to
                match the width of the pooled encoder embeddings.
        """
        super().__init__()
        # Attribute names double as state_dict keys, so they stay as they are.
        self.l1 = AutoModel.from_pretrained(model_name)
        self.pre_classifier = torch.nn.Linear(in_features, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 1)
        self.classifierSigmoid = torch.nn.Sigmoid()

    def forward(self, doc_ids, doc_mask):
        """Encode the documents and return sigmoid outputs of the final linear layer.

        Args:
            doc_ids: Token ids, forwarded to the encoder as ``input_ids``.
            doc_mask: Attention mask, used by the encoder and by the pooling.
        """
        encoder_out = self.l1(input_ids=doc_ids, attention_mask=doc_mask)
        pooled = mean_pooling(encoder_out, doc_mask)

        # Head: linear -> ReLU -> dropout -> linear -> sigmoid.
        hidden = self.dropout(torch.relu(self.pre_classifier(pooled)))
        return self.classifierSigmoid(self.classifier(hidden))

In [None]:
from torch import cuda
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if cuda.is_available() else 'cpu'

# Build the classifier on the same checkpoint the tokenizer was loaded from.
model = BatchBertClass(model_name=sentenc_model_name)
model.to(device);  # trailing ';' presumably suppresses the notebook echo of the module repr

# Binary cross-entropy on probabilities: the model's forward() already ends in a sigmoid.
loss_function = torch.nn.BCELoss()
# Adam over every parameter returned by model.parameters().
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

Build the training algorithm

In [None]:
# Training-loop configuration and the training function used to fine-tune the BERT batch classifier
print_n_steps = 1000  # report running accuracy/loss every this many steps
EPOCHS = 3  # NOTE: overrides the EPOCHS = 1 assigned in the hyperparameter cell
acc_step_holder, loss_step_holder = [], []  # running accuracy / loss snapshots appended by train()


def train(epoch):
    """Run one optimisation pass over the current ``training_loader``.

    Relies on notebook globals: ``model``, ``training_loader``,
    ``loss_function``, ``optimizer``, ``device``, ``print_n_steps``,
    ``acc_step_holder`` and ``loss_step_holder``.

    Every ``print_n_steps`` steps the running accuracy and loss are printed
    and appended to ``acc_step_holder`` / ``loss_step_holder``.

    Args:
        epoch: Index of the current epoch; only used in the summary message.

    Returns:
        None. Progress and the final metrics are printed.
    """
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0

    # Take the size from the loader that is actually iterated rather than
    # from an unrelated global dataframe.
    n_total = len(training_loader.dataset)

    model.train()
    # tqdm wraps the loader itself (not enumerate) so it can read the
    # loader's length and display a total / ETA.
    for step, data in enumerate(tqdm(training_loader)):
        doc_ids = data['doc_id'].to(device, dtype=torch.long)
        doc_mask = data['doc_mask'].to(device, dtype=torch.long)
        # BCELoss needs float targets.
        targets = data['targets'].to(device, dtype=torch.float)

        outputs = model(doc_ids, doc_mask)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        # A prediction is correct when thresholding at 0.5 reproduces the label.
        n_correct += torch.count_nonzero(targets == (outputs > 0.5)).item()

        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)

        if step % print_n_steps == 0:
            loss_step = tr_loss / nb_tr_steps
            accu_step = (n_correct * 100) / nb_tr_examples
            print(str(step * training_loader.batch_size) + "/" + str(n_total) + " - Steps. Acc ->", accu_step, "Loss ->", loss_step)
            acc_step_holder.append(accu_step)
            loss_step_holder.append(loss_step)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if nb_tr_steps == 0:
        # Nothing was iterated; skip the averages instead of dividing by zero.
        print(f"Epoch {epoch}: the training loader yielded no batches, nothing to report.")
        return

    epoch_loss = tr_loss / nb_tr_steps
    epoch_accu = (n_correct * 100) / nb_tr_examples
    print(f'The Total Accuracy for Epoch {epoch}: {epoch_accu}')
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

Train the model

In [None]:
start_time = time.time()

for epoch in range(EPOCHS):
    # Work through the training files one at a time.
    for i, training_path in enumerate(list_of_training_files):
        print('Batch', i, ':')

        # Load this file's dataframe and wrap it for batched iteration.
        # training_loader is assigned at notebook scope because train()
        # picks it up from there.
        train_df = pd.read_pickle(training_path)
        training_set = BatchedCasesData(train_df, tokenizer, MAX_LEN)
        training_loader = DataLoader(training_set, **train_params)

        train(epoch)

        # Checkpoint after every file so an interrupted run keeps its progress.
        torch.save(model.state_dict(), "/content/gdrive/MyDrive/Thesis/Models/batch_label_state_dict.pt")

    # Save the model once more at the end of each epoch, just in case.
    torch.save(model.state_dict(), "/content/gdrive/MyDrive/Thesis/Models/batch_label_state_dict.pt")

end_time = time.time()
total_time = end_time - start_time
print('Training Took:', total_time)

In [None]:
# One row, two panels: running accuracy on the left, running loss on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 5))
for panel, history, name in (
    (ax1, acc_step_holder, "Accuracy"),
    (ax2, loss_step_holder, "Loss"),
):
    panel.plot(history, label=name)
    panel.title.set_text(name)
fig.tight_layout()
plt.show()

In [None]:
# Persist the running training metrics next to the data. This goes through
# the notebook's save_as_pickle helper instead of repeating the open/dump
# boilerplate for every file.
output_file = '/content/gdrive/MyDrive/Thesis/Data/batch_training_accuracy.pickle'
save_as_pickle(output_file, acc_step_holder)

output_file = '/content/gdrive/MyDrive/Thesis/Data/batch_training_loss.pickle'
save_as_pickle(output_file, loss_step_holder)

Validate the Model

In [None]:
def validate_model(model, testing_loader):
    """Evaluate ``model`` on ``testing_loader`` without updating its weights.

    Relies on notebook globals: ``device``, ``loss_function`` and
    ``print_n_steps``.

    Args:
        model: Module called as ``model(doc_ids, doc_mask)``.
        testing_loader: Iterable of batches holding ``doc_id``, ``doc_mask``
            and ``targets`` tensors; its ``dataset`` and ``batch_size``
            attributes are used for the progress message.

    Returns:
        Accuracy over everything the loader yielded, as a percentage, or
        ``nan`` when the loader yields no batches.
    """
    model.eval()

    n_correct = 0
    tr_loss = 0
    nb_tr_steps = 0
    nb_tr_examples = 0

    # Report progress against the set being evaluated, not the training set.
    n_total = len(testing_loader.dataset)

    with torch.no_grad():
        for step, data in enumerate(testing_loader):
            doc_ids = data['doc_id'].to(device, dtype=torch.long)
            doc_mask = data['doc_mask'].to(device, dtype=torch.long)
            # BCELoss needs float targets.
            targets = data['targets'].to(device, dtype=torch.float)

            outputs = model(doc_ids, doc_mask)
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()

            # A prediction is correct when thresholding at 0.5 reproduces the label.
            n_correct += torch.count_nonzero(targets == (outputs > 0.5)).item()

            nb_tr_steps += 1
            nb_tr_examples += targets.size(0)

            if step % print_n_steps == 0:
                loss_step = tr_loss / nb_tr_steps
                accu_step = (n_correct * 100) / nb_tr_examples
                print(str(step * testing_loader.batch_size) + "/" + str(n_total) + " - Steps. Acc ->", accu_step, "Loss ->", loss_step)

    if nb_tr_steps == 0:
        # Nothing was evaluated; report that instead of dividing by zero.
        print("Validation loader yielded no batches, accuracy is undefined.")
        return float("nan")

    epoch_loss = tr_loss / nb_tr_steps
    epoch_accu = (n_correct * 100) / nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")

    return epoch_accu

In [None]:
start_time = time.time()
val_acc = []

for i, testing_path in enumerate(list_of_testing_files):
    print('Batch', i, ':')

    # Load this batch's testing dataframe and wrap it for batched iteration.
    test_df = pd.read_pickle(testing_path)
    testing_set = BatchedCasesData(test_df, tokenizer, MAX_LEN)
    testing_loader = DataLoader(testing_set, **test_params)

    # Collect one accuracy figure per testing file.
    val_acc.append(validate_model(model, testing_loader))

# Mean accuracy over all batches, if needed: sum(val_acc) / len(val_acc)
print("List of Batch Accuracies:", val_acc)
end_time = time.time()
total_time = end_time - start_time
print('Validation took:', total_time)

In [None]:
# Persist the per-batch validation accuracies through the notebook's
# save_as_pickle helper rather than repeating the open/dump boilerplate.
output_file = '/content/gdrive/MyDrive/Thesis/Data/validation_accuracy.pickle'
save_as_pickle(output_file, val_acc)