# Steps for Model Training

### 1. Build Data Loader Class

### 2. Build Model
* Make mean pooling helper function
* Make Sentence Classifying Class that includes a dropout and dense layer on top of pretrained model
* Define training function
* Train the Model


### 3. Run Validation on Test Set

### 4. Save the Model

In [None]:
# Mount Google Drive in the Colab runtime; the data and checkpoint paths used
# later in this notebook all live under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
!pip install transformers

In [None]:
import os
# Set before torch is imported below.
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is presumably here to surface CUDA
# errors synchronously while debugging — confirm it is still wanted for full
# training runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
 
import pandas as pd 
import transformers
from torch.utils.data import Dataset, DataLoader 


from transformers import AutoTokenizer, AutoModel
# Checkpoint name used for the tokenizer here and passed to SentenceBertClass
# when the model is built, so both come from the same pretrained checkpoint.
sentenc_model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(sentenc_model_name) 

# NOTE: pandas, numpy and os are imported a second time below; this is
# redundant but harmless.
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np 
from tqdm.notebook import tqdm
 
import os
import pickle

Now that we have multiple batches of data, I am going to concatenate the batches for training. If the concatenated data is too large to fit in memory, I may need to train on one batch of data at a time instead.

Concatenation first:

In [None]:
# Token length every sentence/document is padded or truncated to when the
# dataset class is instantiated with this value.
MAX_LEN = 512
# Batch sizes placed into the DataLoader keyword dictionaries
# (train_params / test_params).
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 4
# NOTE: EPOCHS is reassigned to 2 in the "Define training function" cell, so
# when the cells run in order this value is not the one the training loop uses.
EPOCHS = 1
# Learning rate handed to the Adam optimizer.
LEARNING_RATE = 1e-05

This is a manual way of creating the files we are going to look at for training and testing the model. You could also use the list of training examples and the list of testing examples created at the end of the "SentenceModelPreprocessor" Colab File.

In [None]:
# Directory holding the per-batch pickles; file names are formed by appending
# '<number>_training_bdf.pickle' or '<number>_testing_bdf.pickle' to this path.
folder_location = '/content/gdrive/MyDrive/Thesis/Data/TrainTestBinClass/'

In [None]:
# Filename prefixes of the batch files to train and test on.
# NOTE(review): '5999' breaks the otherwise regular 1000-step pattern —
# presumably it matches the file actually on disk; confirm.
file_numbers = ['1000', '2000', '3000', '4000', '5000', '5999', '7000', '8000', '9000', '10000', '11000', '12000']

In [None]:
# Full paths of every training / testing pickle, in the order of file_numbers.
list_of_training_files = [
    folder_location + number + '_training_bdf.pickle'
    for number in file_numbers
]
list_of_testing_files = [
    folder_location + number + '_testing_bdf.pickle'
    for number in file_numbers
]

### 1. Build Data Loader Class

In [None]:
class BatchedCasesData(Dataset):
    """Dataset that tokenizes one sentence/document pair per dataframe row.

    Rows are read from ``dataframe`` by position; each row is accessed through
    its ``input_sentences``, ``input_texts`` and ``labels`` attributes.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.data = dataframe
        self.len = len(dataframe)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return self.len

    @staticmethod
    def _normalize(text):
        """Stringify ``text`` and collapse each whitespace run to one space."""
        return " ".join(str(text).split())

    def __getitem__(self, index):
        """Return the encoded sentence, document and label stored at ``index``.

        The sentence and the document are tokenized together in one call, each
        padded/truncated to ``max_len``. The result is a dict of long tensors
        with keys ``sent_id``, ``doc_id``, ``sent_mask``, ``doc_mask`` and
        ``targets`` (the label wrapped in a one-element tensor).
        """
        row = self.data.iloc[index]
        texts = [
            self._normalize(row.input_sentences),
            self._normalize(row.input_texts),
        ]

        encoded = self.tokenizer.batch_encode_plus(
            texts,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True,
        )
        token_ids = encoded['input_ids']
        attention = encoded['attention_mask']

        item = {}
        item['sent_id'] = torch.tensor(token_ids[0], dtype=torch.long)
        item['doc_id'] = torch.tensor(token_ids[1], dtype=torch.long)
        item['sent_mask'] = torch.tensor(attention[0], dtype=torch.long)
        item['doc_mask'] = torch.tensor(attention[1], dtype=torch.long)
        item['targets'] = torch.tensor([row.labels], dtype=torch.long)
        return item

# Keyword arguments unpacked into the training DataLoader.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)

# Keyword arguments unpacked into the testing DataLoader (also shuffled).
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)

### Build Model

Make mean pooling helper function

In [None]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the positions enabled in the mask.

    Args:
        model_output: Indexable whose first element holds the token
            embeddings; the mask is expanded to that tensor's size.
        attention_mask: Mask with one entry per token; it gains a trailing
            axis and is cast to float before weighting the embeddings.

    Returns:
        The mask-weighted sum over dimension 1 divided by the mask total,
        with the divisor clamped to at least 1e-9 so an all-zero mask does
        not divide by zero.
    """
    hidden_states = model_output[0]
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_total = (hidden_states * weights).sum(1)
    token_counts = torch.clamp(weights.sum(1), min=1e-9)
    return masked_total / token_counts

Make Sentence Classifying Class that includes a dropout and dense layer on top of pretrained model

In [None]:
class SentenceBertClass(torch.nn.Module):
    """Sentence/document pair classifier built on one shared pretrained encoder.

    The same encoder embeds the sentence and the document. Their mean-pooled
    embeddings and the elementwise product of the two are concatenated, passed
    through a dense + ReLU + dropout layer, and mapped to a single sigmoid
    output.
    """

    def __init__(self, model_name="bert-base-uncased", in_features=768):
        super().__init__()
        self.l1 = AutoModel.from_pretrained(model_name)
        # Input is three concatenated vectors of width in_features.
        self.pre_classifier = torch.nn.Linear(in_features*3, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 1)
        self.classifierSigmoid = torch.nn.Sigmoid()

    def _embed(self, input_ids, attention_mask):
        """Encode one input with the shared encoder and mean-pool the tokens."""
        encoder_output = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        return mean_pooling(encoder_output, attention_mask)

    def forward(self, sent_ids, doc_ids, sent_mask, doc_mask):
        """Return the sigmoid output for each sentence/document pair."""
        # Sentence first, then document, through the same encoder.
        sent_vec = self._embed(sent_ids, sent_mask)
        doc_vec = self._embed(doc_ids, doc_mask)

        # Both embeddings plus their elementwise product, joined on dim 1.
        features = torch.cat((sent_vec, doc_vec, sent_vec * doc_vec), dim=1)

        activation = torch.nn.ReLU()
        hidden = activation(self.pre_classifier(features))
        hidden = self.dropout(hidden)
        return self.classifierSigmoid(self.classifier(hidden))

In [None]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

# Resume from the previously saved checkpoint. map_location remaps the stored
# tensors onto the device chosen above, so a checkpoint written on a GPU
# runtime still loads when only the CPU is available.
# NOTE: this cell requires the checkpoint file to exist already.
state_dict = torch.load(
    '/content/gdrive/MyDrive/Thesis/Models/state_dict.pt',
    map_location=device,
)

model = SentenceBertClass(model_name=sentenc_model_name)
model.to(device)
model.load_state_dict(state_dict)

# The model ends in a sigmoid, so plain binary cross-entropy is applied to its
# output. The optimizer is created after the model has been moved to `device`.
loss_function = torch.nn.BCELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

Define training function

In [None]:
# Running accuracy/loss is reported every `print_n_steps` steps by both the
# training and the validation functions.
print_n_steps = 500
# NOTE: this overrides the EPOCHS = 1 assigned with the other hyperparameters
# earlier in the notebook.
EPOCHS = 2 
# Running accuracy and loss recorded at each training report; train() appends
# to these lists, which are pickled and plotted after training.
acc_step_holder, loss_step_holder = [], []


def train(epoch):
    """Run one pass over the global ``training_loader`` and update ``model``.

    Relies on the notebook globals ``model``, ``training_loader``, ``device``,
    ``loss_function``, ``optimizer``, ``print_n_steps``, ``train_params``,
    ``acc_step_holder`` and ``loss_step_holder``.

    Args:
        epoch: Index of the current epoch; only used in the summary message.

    Returns:
        None. Every ``print_n_steps`` steps the running accuracy (percent) and
        mean loss are printed and appended to ``acc_step_holder`` and
        ``loss_step_holder``; a summary is printed once the loader is
        exhausted.
    """
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    # Size of the dataset behind the loader, used as the progress denominator
    # instead of reaching for a separate global dataframe.
    n_total = len(training_loader.dataset)
    model.train()
    # tqdm wraps the loader itself rather than enumerate(...) so it can read
    # the loader's length and show a total.
    for step, data in enumerate(tqdm(training_loader)):
        sent_ids = data['sent_id'].to(device, dtype=torch.long)
        doc_ids = data['doc_id'].to(device, dtype=torch.long)
        sent_mask = data['sent_mask'].to(device, dtype=torch.long)
        doc_mask = data['doc_mask'].to(device, dtype=torch.long)
        # BCELoss needs float targets.
        targets = data['targets'].to(device, dtype=torch.float)

        outputs = model(sent_ids, doc_ids, sent_mask, doc_mask)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        # A prediction counts as correct when thresholding the sigmoid output
        # at 0.5 reproduces the target.
        n_correct += torch.count_nonzero(targets == (outputs > 0.5)).item()

        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)

        if step % print_n_steps == 0 and step != 0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples
            print(str(step * train_params["batch_size"]) + "/" + str(n_total) + " - Steps. Acc ->", accu_step, "Loss ->", loss_step)
            acc_step_holder.append(accu_step)
            loss_step_holder.append(loss_step)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if nb_tr_examples == 0:
        # An empty loader would otherwise divide by zero in the summary below.
        print("No training examples in this loader; skipping the summary.")
        return

    print(f'The Total Accuracy for Epoch for Batch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss for Epoch for Batch: {epoch_loss}")
    print(f"Training Accuracy for Epoch for Batch: {epoch_accu}")

    return

Train the model

In [None]:
import time

In [None]:
start_time = time.time()

for epoch in range(EPOCHS):

    # One pass over every training pickle per epoch.
    for i, training_path in enumerate(list_of_training_files):

        print('Batch', i, ':')

        # train() picks the loader up from the module-level name
        # `training_loader`, so these names are rebound for each file.
        train_df = pd.read_pickle(training_path)
        training_set = BatchedCasesData(train_df, tokenizer, MAX_LEN)
        training_loader = DataLoader(training_set, **train_params)

        train(epoch)

    # Checkpoint after every epoch so a later crash does not lose the run.
    torch.save(model.state_dict(), "/content/gdrive/MyDrive/Thesis/Models/state_dict.pt")

end_time = time.time()
total_time = end_time - start_time
print('Training Took:', total_time)

In [None]:
# Persist the running accuracy and loss curves as pickles for later analysis.
for output_file, history in (
    ('/content/gdrive/MyDrive/Thesis/Data/training_accuracy.pickle', acc_step_holder),
    ('/content/gdrive/MyDrive/Thesis/Data/training_loss.pickle', loss_step_holder),
):
    with open(output_file, 'wb') as handle:
        pickle.dump(history, handle)

In [None]:
# Side-by-side panels: running training accuracy (left) and loss (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 5))
for axis, series, caption in (
    (ax1, acc_step_holder, "Accuracy"),
    (ax2, loss_step_holder, "Loss"),
):
    axis.plot(series, label=caption)
    axis.title.set_text(caption)
fig.tight_layout()
plt.show()

### Run Validation on Test Set

In [None]:
def validate_model(model, testing_loader):
    """Evaluate ``model`` on ``testing_loader`` and return its accuracy.

    Relies on the notebook globals ``device``, ``loss_function``,
    ``print_n_steps`` and ``test_params``.

    Args:
        model: Module called as ``model(sent_ids, doc_ids, sent_mask,
            doc_mask)``; it is switched to eval mode and left there.
        testing_loader: Loader yielding dicts with the keys ``sent_id``,
            ``doc_id``, ``sent_mask``, ``doc_mask`` and ``targets``.

    Returns:
        Accuracy over the whole loader, in percent.
    """
    model.eval()

    n_correct = 0
    tr_loss = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    # Progress denominator comes from the loader being evaluated. The training
    # dataframe is the wrong size here and may not exist if validation is run
    # without training first.
    n_total = len(testing_loader.dataset)
    with torch.no_grad():
        for step, data in enumerate(testing_loader):

            sent_ids = data['sent_id'].to(device, dtype=torch.long)
            doc_ids = data['doc_id'].to(device, dtype=torch.long)
            sent_mask = data['sent_mask'].to(device, dtype=torch.long)
            doc_mask = data['doc_mask'].to(device, dtype=torch.long)
            # BCELoss needs float targets.
            targets = data['targets'].to(device, dtype=torch.float)

            outputs = model(sent_ids, doc_ids, sent_mask, doc_mask)
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()

            # Correct when thresholding the output at 0.5 matches the target.
            n_correct += torch.count_nonzero(targets == (outputs > 0.5)).item()

            nb_tr_steps += 1
            nb_tr_examples += targets.size(0)

            if step % print_n_steps == 0 and step != 0:
                loss_step = tr_loss/nb_tr_steps
                accu_step = (n_correct*100)/nb_tr_examples
                print(str(step * test_params["batch_size"]) + "/" + str(n_total) + " - Steps. Acc ->", accu_step, "Loss ->", loss_step)

    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")

    return epoch_accu

In [None]:
start_time = time.time()

# Accuracy (percent) of each testing file, in list_of_testing_files order.
val_acc = []

for i, testing_path in enumerate(list_of_testing_files):

    print('Batch', i, ':')

    # Pull testing dataframe
    test_df = pd.read_pickle(testing_path)
    print('Number of Examples:', len(test_df))

    # Wrap it in the dataset class and a loader
    testing_set = BatchedCasesData(test_df, tokenizer, MAX_LEN)
    testing_loader = DataLoader(testing_set, **test_params)

    acc = validate_model(model, testing_loader)
    val_acc.append(acc)

# The mean must be parenthesised: `%` and `/` share precedence and group left
# to right, so without the parentheses the formatted string is divided by an
# int and a TypeError is raised.
# NOTE: this is the unweighted mean of the per-file accuracies; files with
# different example counts contribute equally.
print("Accuracy on test data = %0.2f%%" % (sum(val_acc) / len(val_acc)))
print("List of Batch Accuracies:", val_acc)
end_time = time.time()
total_time = end_time - start_time
print('Validation took:', total_time)

In [None]:
# Persist the per-file validation accuracies collected in val_acc.
output_file = '/content/gdrive/MyDrive/Thesis/Data/validation_accuracy.pickle'
with open(output_file, 'wb') as handle:   #Saving as a pickle file
  pickle.dump(val_acc, handle)

### Save the Model

In [None]:
# Final save of the weights. This is the same path the checkpoint was loaded
# from and that the training loop writes after each epoch, so the existing
# file is overwritten.
torch.save(model.state_dict(), "/content/gdrive/MyDrive/Thesis/Models/state_dict.pt")