In [None]:
# March 2024
# Model training - pretrained language model
# Violeta Berdejo-Espinola

In [2]:
import torch
import torch.nn as nn

# Pick the best available backend: Apple's Metal (MPS) first, then CUDA,
# and finally the CPU, so the notebook still runs on machines without a
# supported accelerator instead of failing on the first `.to(device)`.
if torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

mps


In [3]:
import mpu

# NOTE: the corpus is read from a pickle file; only load pickles that come
# from a trusted source, since unpickling can execute arbitrary code.
x = mpu.io.read('../data/corpus_raw.pickle')

# The labelling below assumes the corpus is ordered: the first N_POS documents
# are the positive class (label 1) and every remaining document is negative
# (label 0). Both the labels and the class sizes are derived from the same
# slices so they cannot drift apart if the corpus length changes.
N_POS = 62

pos = x[:N_POS]
neg = x[N_POS:]
y = [1] * len(pos) + [0] * len(neg)

# split data

from sklearn.model_selection import train_test_split

# The classes are heavily imbalanced, so stratify on the labels to keep the
# positive/negative ratio the same in the train and test splits.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=42, stratify=y
)

# calculate weights for classes

# Index 0 is the weight of the negative class, index 1 the weight of the
# positive class: each class is weighted by the share of the *other* class,
# so the rarer class receives the larger weight.
class_weights = [(1 - (len(neg) / len(x))), (1 - (len(pos) / len(x)))]
class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)

In [None]:
from transformers import AutoTokenizer
import torch

# Pretrained XLM-RoBERTa tokenizer used by `tokenize`.
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')


def tokenize(batch):
    """Tokenize `batch` with the module-level `tokenizer`.

    Padding and truncation are enabled, sequences are limited to 512 tokens,
    and the result is returned as PyTorch tensors. The returned object is
    dictionary-like with two entries, `input_ids` and `attention_mask`, each
    of which holds a tensor.
    """
    encoding_options = dict(padding=True, truncation=True, return_tensors='pt', max_length=512)
    return tokenizer(batch, **encoding_options)
    
# Tokenize the raw text of each split.
X_train = tokenize(x_train)
X_test = tokenize(x_test)

# Convert the labels to tensors. `as_tensor` keeps this cell safe to re-run:
# when the labels are already tensors they are passed through unchanged
# instead of being copied with a warning.
y_train = torch.as_tensor(y_train)
y_test = torch.as_tensor(y_test)

# Only the last expression of a cell is displayed, so show both shapes
# together rather than on two separate lines.
X_train['input_ids'].size(), X_test['input_ids'].size()

In [None]:
import random
from torch.utils.data import TensorDataset, DataLoader

# Data-loading configuration.
batch_size = 128  # number of samples in one forward and backward pass
workers = 0       # number of subprocesses used for data loading; each worker builds batches from its own subset of indices
seed = 42

# Dedicated, explicitly seeded generator that is handed to the DataLoaders.
G = torch.Generator().manual_seed(seed)

# function to seed workers in multiprocessing data loading

def seed_worker(worker_id):
    """Seed NumPy and Python's `random` from PyTorch's current initial seed.

    Intended to be passed to a DataLoader as `worker_init_fn`, so that the
    NumPy and `random` generators follow the seed PyTorch reports for the
    calling process.

    Args:
        worker_id: index of the worker, as supplied by the DataLoader (unused).
    """
    # Imported locally: this function is defined before the notebook's
    # `import numpy as np`, so relying on the global name would raise a
    # NameError as soon as a worker process calls this function.
    import numpy as np

    # Reduce the seed to 32 bits before handing it to NumPy.
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

# compile tensorDataset and DataLoaders

# Keyword arguments shared by both loaders. Shuffling is switched off so the
# data is always split into the same batches, in the same order.
_loader_options = dict(
    batch_size=batch_size,
    shuffle=False,
    num_workers=workers,
    generator=G,                  # used to generate random indexes and the base seed for workers
    worker_init_fn=seed_worker,
)

# Training split: token ids, attention masks and labels, batched.
train_xlm = TensorDataset(X_train['input_ids'], X_train['attention_mask'], y_train)
train_dataloader = DataLoader(train_xlm, **_loader_options)

# Test split, built exactly the same way.
test_xlm = TensorDataset(X_test['input_ids'], X_test['attention_mask'], y_test)
test_dataloader = DataLoader(test_xlm, **_loader_options)

In [None]:
from transformers import (XLMRobertaForSequenceClassification)

# `id2label` maps a class index to its name and `label2id` maps a name back
# to its index (the two dictionaries must not be swapped). Index 1 is the
# positive class, matching the labels built for the corpus.
id2label = {0: 'negative', 1: 'positive'}
label2id = {'negative': 0, 'positive': 1}

# Load pretrained XLM-RoBERTa with a two-label sequence-classification head,
# then move it to the selected device.
model = XLMRobertaForSequenceClassification.from_pretrained(
    'xlm-roberta-base',
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)
model = model.to(device)

# Freeze the transformer body: every parameter of `base_model` stops
# receiving gradients, so only parameters outside it remain trainable.
for frozen_param in model.base_model.parameters():
    frozen_param.requires_grad_(False)

In [86]:
# function to train model

# Training hyper-parameters.
num_epochs = 2
learning_rate = 0.1

# Cross-entropy loss weighted per class by `class_weights`.
criteria = nn.CrossEntropyLoss(weight=class_weights)

# Plain SGD over the model's parameters.
optim = torch.optim.SGD(params=model.parameters(), lr=learning_rate)


def Trainer(dataloader, epochs):
    """Train the module-level `model` and return the average loss per epoch.

    Relies on the notebook globals `model`, `optim`, `criteria` and `device`.

    Args:
        dataloader: iterable of batches; each batch is indexed as
            `batch[0]` (input ids), `batch[1]` (attention mask) and
            `batch[2]` (target labels).
        epochs: number of full passes to make over `dataloader`.

    Returns:
        A list with one float per epoch: the mean of the batch losses seen
        during that epoch.

    Raises:
        ValueError: if `dataloader` yields no batches, since no average loss
            can be computed.
    """
    loss_values = []

    for epoch in range(epochs):
        model.train()  # set the model to training mode
        running_loss = 0.0
        n_batches = 0

        # Report the epoch one-based, consistent with the summary line below.
        print('training on epoch: ', epoch + 1)

        for batch in dataloader:
            input_ids = batch[0].to(device)
            input_mask = batch[1].to(device)
            target = batch[2].to(device)

            # clear gradients left over from the previous step
            optim.zero_grad()

            # forward pass
            outputs = model(input_ids,
                            token_type_ids=None,
                            attention_mask=input_mask)

            # loss on the logits, backward pass, then parameter update
            loss = criteria(outputs.logits, target)
            loss.backward()
            optim.step()

            running_loss += loss.item()
            n_batches += 1

        if n_batches == 0:
            raise ValueError("dataloader yielded no batches; cannot compute an average loss")

        # Average over the batches of the loader that was actually passed in
        # (not over a global loader), and report against the `epochs` argument.
        avg_train_loss = running_loss / n_batches
        loss_values.append(avg_train_loss)

        print(f"Epoch {epoch+1}/{epochs}, Loss {avg_train_loss}")
        print("average training loss: {0:.4f}".format(avg_train_loss))

    return loss_values

In [None]:
import random
import time
import numpy as np

# Seed PyTorch, NumPy and Python's `random` with the same value before training.
# NOTE(review): `model` is created in an earlier cell, so this seeding does not
# cover whatever random initialisation happened there — confirm whether the
# seeds should be set before the model is loaded.
for set_seed in (torch.manual_seed, np.random.seed, random.seed):
    set_seed(42)

# `Trainer` returns the list of average losses, one per epoch.
model_0 = Trainer(train_dataloader, num_epochs)

In [94]:
# evaluate

from sklearn.metrics import confusion_matrix, classification_report

def evaluate_model(dataloader):
    """Run the module-level `model` over `dataloader` and collect predictions.

    Relies on the notebook globals `model` and `device`. Each batch is indexed
    as `batch[0]` (input ids), `batch[1]` (attention mask) and `batch[2]`
    (labels).

    Returns:
        A pair `(all_predictions, all_labels)`: two flat lists holding, in
        batch order, the predicted class (index of the highest logit) and the
        label of every sample.
    """
    model.eval()  # switch the model to evaluation mode

    all_predictions, all_labels = [], []

    # no gradients are needed for inference
    with torch.no_grad():
        for batch in dataloader:
            # move the three tensors of the batch to the device
            token_ids, mask, gold = (batch[i].to(device) for i in range(3))

            # forward pass, then take the class with the highest score
            scores = model(token_ids, attention_mask=mask).logits
            predicted = scores.argmax(dim=-1)

            # accumulate on the CPU for further processing
            all_predictions.extend(predicted.cpu().numpy())
            all_labels.extend(gold.cpu().numpy())

    return all_predictions, all_labels

In [None]:
# Predict on the test loader, then print the per-class classification report.
all_predictions, all_labels = evaluate_model(test_dataloader)

from sklearn.metrics import confusion_matrix, classification_report
cr = classification_report(y_true=all_labels, y_pred=all_predictions, output_dict=False)
print(cr)