# NLI Finetuning Scripts

This notebook fine-tunes an NLI model on Google Colab.
- Base model: "MoritzLaurer/mDeBERTa-v3-base-mnli-xnli"
- Dataset: public_train_v4.json

In [None]:
# Mount Google Drive at /content/drive; a later cell changes the working
# directory to a folder under it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%%capture
# Install the third-party packages imported in the next cell
# (%%capture suppresses pip's output for this cell).
!pip install transformers
!pip install sentencepiece
!pip install datasets
!pip install tqdm
!pip install imbalanced-learn

In [None]:
import torch
from torch import nn as nn
from torch.utils.data import DataLoader, Dataset
import pandas as pd

from transformers import AutoModel, AutoTokenizer
from datasets import Dataset, DatasetDict

from sklearn.metrics import classification_report
from sklearn.metrics._classification import _check_targets
from sklearn.model_selection import train_test_split

from tqdm import tqdm

import matplotlib.pyplot as plt

import random
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

import re

import os
# Work from the experiment folder so the relative paths used below
# (dataset file, log files, saved models, plot) resolve inside it.
# NOTE(review): the path is relative - presumably assumes the notebook starts
# in /content, where Drive was mounted; confirm.
new_directory = "drive/MyDrive/FNU/mDeBERTa (ft) V6"
os.chdir(new_directory)
print("Current Working Directory:", os.getcwd())

In [None]:
# Device used for every model and tensor below: CUDA when available, otherwise CPU.
envir = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Hugging Face checkpoint that is loaded and fine-tuned.
MODEL_NAME = "MoritzLaurer/mDeBERTa-v3-base-mnli-xnli"
# Batch size of the train / validation / test DataLoaders.
BATCH_SIZE = 8
# Names of the pooling variants trained in the final cell (first-token vector vs. mean of token vectors).
MODEL_TYPE = ['cls', 'mean']

In [None]:
# Tokenizer matching MODEL_NAME; used by mDeBERTa_tokenize below.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
def read_dataset(fileName, label2int):
    """Load the JSON dataset and group (evidence, claim) pairs by integer label.

    The file is read with ``pd.read_json`` and iterated column by column, so
    every column is one record exposing the fields 'evidence', 'claim',
    'verdict' and 'context'.

    Args:
        fileName: path of the JSON file.
        label2int: mapping from the 'verdict' string to a label in {0, 1, 2}.

    Returns:
        dict ``{0: [...], 1: [...], 2: [...]}`` of ``(evidence, claim)`` tuples.
        When a record's evidence is not a string, the text of its context
        before the first '.' is used instead and a notice is printed.
    """
    frame = pd.read_json(fileName)
    grouped = {0: [], 1: [], 2: []}
    for record_id, record in frame.items():
        bucket = grouped[label2int[record['verdict']]]
        evidence = record['evidence']
        if isinstance(evidence, str):
            bucket.append((evidence, record['claim']))
            continue
        # Missing evidence: fall back to the first sentence of the context.
        bucket.append((record['context'].split('.')[0], record['claim']))
        print(f"Evidence {record_id} no value !!!")
    return grouped

def sampling_training_dataset(dataDict:dict):
    """Oversample to balance the labels, then interleave them as 0,1,2,0,1,2,...

    Used when every label should be equally represented and each batch should
    see the labels in the fixed order 0, 1, 2.

    Args:
        dataDict: ``{'sample': [...], 'labels': [...]}``.

    Returns:
        dict with the same two keys; samples are emitted in triples, one per
        label. ``zip`` stops at the shortest per-label list, so any surplus
        samples of a longer list are dropped.
    """
    oversampler = RandomOverSampler(sampling_strategy='auto', random_state=42)
    x_sample, y_sample = oversampler.fit_resample(dataDict['sample'], dataDict['labels'])
    print(f"Oversampling dataset: {Counter(y_sample)}")

    # Bucket the resampled rows by their label.
    label_sample = {0: [], 1: [], 2: []}
    for row, row_label in zip(x_sample, y_sample):
        label_sample[row_label].append(row)
    print(f"Sorted_dataset: {len(label_sample[0])} -- {len(label_sample[1])} -- {len(label_sample[2])}")

    # Emit one sample of each label at a time so the label order is 0,1,2 repeating.
    sorted_dataset = {'sample': [], 'labels': []}
    for triple in zip(label_sample[0], label_sample[1], label_sample[2]):
        sorted_dataset['sample'].extend(triple)
        sorted_dataset['labels'].extend((0, 1, 2))
    print(f"Length sample: {len(sorted_dataset['sample'])} -- Length labels: {len(sorted_dataset['labels'])}")
    return sorted_dataset

def sampling_training_dataset_noEqual(dataDict:dict):
    """Oversample label 2 to 5000 samples and shuffle, leaving the labels unbalanced.

    Args:
        dataDict: ``{'sample': [...], 'labels': [...]}``.

    Returns:
        dict with the same two keys, holding the resampled pairs in an order
        shuffled with the module-level ``random`` generator.
    """
    oversampler = RandomOverSampler(sampling_strategy={2:5000}, random_state=42)
    x_sample, y_sample = oversampler.fit_resample(dataDict['sample'], dataDict['labels'])
    print(f"Oversampling dataset: {Counter(y_sample)}")

    # Shuffle samples and labels together so each pair stays aligned.
    pairs = list(zip(x_sample, y_sample))
    random.shuffle(pairs)

    dataset = {
        'sample': [row for row, _ in pairs],
        'labels': [row_label for _, row_label in pairs],
    }
    print(f"Length sample: {len(dataset['sample'])} -- Length labels: {len(dataset['labels'])}")
    return dataset

def shuffling_dataset(dataDict:dict):
    """Return a shuffled copy of the dataset; nothing is resampled.

    Args:
        dataDict: ``{'sample': [...], 'labels': [...]}``.

    Returns:
        A new dict with the same keys whose (sample, label) pairs were shuffled
        together with the module-level ``random`` generator. The input is not
        modified.
    """
    pairs = list(zip(dataDict['sample'], dataDict['labels']))
    random.shuffle(pairs)

    dataset = {
        'sample': [row for row, _ in pairs],
        'labels': [row_label for _, row_label in pairs],
    }
    print(f"Length sample: {len(dataset['sample'])} -- Length labels: {len(dataset['labels'])}")
    return dataset

def separate_dataset(dataList:dict, train_percent:float, validation_percent:float, test_percent:float):
    """Split each label's samples into train / validation / test, then balance the train set.

    Args:
        dataList: ``{label: [(evidence, claim), ...]}``.
        train_percent: not read by this function; the train share is whatever
            remains after the validation and test shares are removed.
        validation_percent: fraction of each label's samples used for validation.
        test_percent: fraction of each label's samples used for testing.

    Returns:
        ``(train_set, validation_set, test_set)``, each a dict
        ``{'sample': [...], 'labels': [...]}``. Only the train set is passed
        through ``sampling_training_dataset``.
    """
    train_set = {'sample': [], 'labels': []}
    validation_set = {'sample': [], 'labels': []}
    test_set = {'sample': [], 'labels': []}

    for label, sample in dataList.items():
        # First cut: train vs. everything held out (validation + test).
        train_x, holdout_x, train_y, holdout_y = train_test_split(
            sample, [label] * len(sample),
            test_size=(validation_percent + test_percent),
            random_state=42, shuffle=True)

        # Second cut: divide the held-out part into validation and test.
        valid_x, test_x, valid_y, test_y = train_test_split(
            holdout_x, [label] * len(holdout_x),
            test_size=test_percent/(test_percent + validation_percent),
            random_state=42, shuffle=True)

        for target, xs, ys in ((train_set, train_x, train_y),
                               (validation_set, valid_x, valid_y),
                               (test_set, test_x, test_y)):
            target['sample'] += xs
            target['labels'] += ys

        print(f"{label}: Number of sample in train_set: {len(train_set['sample'])}, valid_set: {len(validation_set['sample'])} and test_set: {len(test_set['sample'])}")

    train_set = sampling_training_dataset(train_set)

    return train_set, validation_set, test_set

In [None]:
# Map each verdict string in the dataset to its integer class id.
label2int = {'SUPPORTED':0, 'NEI':1, 'REFUTED':2}
# Load the dataset grouped by class id: {0: [...], 1: [...], 2: [...]}.
raw_dataset = read_dataset("public_train_v4.json", label2int)

# Report how many samples each class has.
print(f"Total dataset: 0:{len(raw_dataset[0])} -- 1:{len(raw_dataset[1])} -- 2:{len(raw_dataset[2])}")

In [None]:
# Seed Python's `random` module (the split and the oversampler use their own random_state=42).
random.seed(42)
# 80/10/10 split per label; the train part is oversampled and interleaved by separate_dataset.
train_dict, valid_dict, test_dict = separate_dataset(raw_dataset, train_percent = 0.8, validation_percent = 0.1, test_percent = 0.1)

In [None]:
# Wrap the three splits in a Hugging Face DatasetDict with the keys 'train', 'valid' and 'test'.
dataset = DatasetDict({
    'train':Dataset.from_dict(train_dict),
    'valid':Dataset.from_dict(valid_dict),
    'test':Dataset.from_dict(test_dict)
})

In [None]:
def data_cleaning(text):
    """Normalise one sentence before tokenisation.

    Steps, in order: delete newline characters (without inserting a space),
    trim surrounding whitespace, drop at most one leading and one trailing
    '.', and collapse every run of whitespace into a single space.

    Args:
        text: the raw sentence.

    Returns:
        The cleaned sentence. Empty or whitespace-only input, and input that
        is empty once a dot is removed (e.g. "."), yields "" instead of
        raising IndexError.
    """
    text = text.replace("\n", "").strip()
    # startswith/endswith are safe on the empty string, unlike text[0] / text[-1].
    if text.startswith('.'):
        text = text[1:].strip()
    if text.endswith('.'):
        text = text[:-1].strip()
    return " ".join(text.split())

def mDeBERTa_tokenize(data):
    """Clean and tokenize a batch of (premise, hypothesis) pairs.

    Args:
        data: mapping whose 'sample' entry is iterated as two-element
            (premise, hypothesis) items.

    Returns:
        The result of calling the module-level ``tokenizer`` on the cleaned
        premises and hypotheses as sentence pairs, with PyTorch tensors
        requested and padding enabled.
    """
    premises, hypotheses = [], []
    for premise, hypothesis in data['sample']:
        premises.append(data_cleaning(premise))
        hypotheses.append(data_cleaning(hypothesis))

    with torch.no_grad():
        return tokenizer(premises, hypotheses, return_tensors="pt", padding = True)

In [None]:
class NLI_model(nn.Module):
    """Classification head that maps a pooled sentence-pair embedding to 3 NLI logits.

    Besides ``forward`` the class bundles the loss (``training_step``), the
    prediction helper (``predict_step``) and the sklearn-based metric reports
    (``validation_step`` / ``test_step``).

    Args:
        input_dims: size of the input embedding.
        class_weights: optional tensor of 3 per-class weights for the
            cross-entropy loss. When omitted, a fresh tensor of ones is used.
            Uniform weights give the same loss as an unweighted criterion
            while still registering the ``criterion.weight`` buffer, so
            checkpoints saved with explicit weights remain loadable.
    """

    def __init__(self, input_dims, class_weights=None):
        super().__init__()

        # Head variants used by the experiment versions (kept for reference):
        #   V1:       Linear(input_dims, 256) -> Dropout(0.1) -> Tanh -> Linear(256, 3) -> Dropout(0.1)
        #   V2/V4/V5: Linear(input_dims, 256) -> Dropout(0.1) -> Tanh -> Linear(256, 3)
        #   V0/V3/V6: Linear(input_dims, 3)   (active)
        self.classification = nn.Sequential(
            nn.Linear(input_dims, 3)
        )

        if class_weights is None:
            # A new tensor per instance: a tensor used as the default argument
            # would be shared by all instances (and overwritten in place by
            # load_state_dict), and all-zero weights make the mean loss NaN.
            class_weights = torch.ones(3)
        self.criterion = nn.CrossEntropyLoss(class_weights)

    def forward(self, input):
        """Return the raw class logits for ``input``."""
        return self.classification(input)

    def training_step(self, train_batch, batch_idx=0):
        """Return the cross-entropy loss for a ``(inputs, targets)`` batch."""
        input_data, targets = train_batch
        outputs = self.forward(input_data)
        return self.criterion(outputs, targets)

    def predict_step(self, batch, batch_idx=0):
        """Return ``(predicted_class, probability)`` for a ``(inputs, _)`` batch.

        ``probability`` is the softmax probability of the predicted class
        (a value in [0, 1]); the previous implementation returned its negation.
        """
        input_data, _ = batch
        prob = self.forward(input_data).softmax(dim=-1)
        top_prob, top_index = prob.max(dim=1)
        return top_index, top_prob

    def validation_step(self, val_batch, batch_idx=0):
        """Return sklearn's classification report for the batch as a dict."""
        return self.test_step(val_batch, True, batch_idx)

    def test_step(self, batch, dict_form, batch_idx=0):
        """Return sklearn's classification report for a ``(inputs, targets)`` batch.

        Args:
            batch: ``(inputs, targets)`` tensors.
            dict_form: passed to ``classification_report`` as ``output_dict``
                (True -> dict, False -> formatted string).
        """
        _, targets = batch
        predictions, _ = self.predict_step(batch, batch_idx)
        return classification_report(targets.to('cpu').numpy(), predictions.to('cpu').numpy(),
                                     output_dict=dict_form, zero_division = 1)

    def configure_criterion(self, class_weights):
        """Replace the loss with a cross-entropy criterion using ``class_weights``."""
        self.criterion = nn.CrossEntropyLoss(class_weights)

    def configure_optimizers(self):
        """Return an Adam optimizer (lr = 1e-5) over this module's parameters."""
        return torch.optim.Adam(self.parameters(), lr = 1e-5)

# Training

In [None]:
def validation_phase(LLModel, classifier:NLI_model, valid_loader:DataLoader, process_outputLLModel_fn):
    """Score the encoder + classifier on the whole validation loader.

    Every batch is encoded without gradients, pooled with
    ``process_outputLLModel_fn``, and the pooled embeddings and labels of all
    batches are concatenated so that a single report covers the full set.

    Both modules are switched to eval mode for the duration of the call, so
    dropout layers (if the encoder or head has any) do not perturb the
    metrics. Their previous train/eval mode is restored afterwards.

    Args:
        LLModel: encoder called with input_ids / attention_mask /
            token_type_ids; its ``last_hidden_state`` is used.
        classifier: head whose ``validation_step`` builds the report.
        valid_loader: loader yielding dict batches with the keys 'input_ids',
            'attention_mask', 'token_type_ids' and 'labels'.
        process_outputLLModel_fn: callable applied to ``last_hidden_state``;
            its outputs are concatenated along dim 0.

    Returns:
        The value returned by ``classifier.validation_step``.
    """
    tensor_list = {'embedding':[], 'label':[]}
    llm_was_training, classifier_was_training = LLModel.training, classifier.training
    LLModel.eval()
    classifier.eval()
    try:
        with torch.no_grad():
            for databatch in valid_loader:
                embedding = LLModel(input_ids=databatch['input_ids'].to(envir),
                                    attention_mask=databatch['attention_mask'].to(envir),
                                    token_type_ids=databatch['token_type_ids'].to(envir)).last_hidden_state
                tensor_list['embedding'].append(process_outputLLModel_fn(embedding))
                tensor_list['label'].append(databatch['labels'])
            result = classifier.validation_step((torch.concat(tensor_list['embedding'], dim=0).to(envir),
                                                 torch.concat(tensor_list['label'], dim=0).to(envir)))
    finally:
        # Hand the modules back in the mode the caller left them in.
        LLModel.train(llm_was_training)
        classifier.train(classifier_was_training)
    return result

def testing_phase(LLModel, classifier:NLI_model, test_loader:DataLoader, process_outputLLModel_fn, dict_form):
    """Score the encoder + classifier on the whole test loader.

    Every batch is encoded without gradients, pooled with
    ``process_outputLLModel_fn``, and the pooled embeddings and labels of all
    batches are concatenated so that a single report covers the full set.

    Both modules are switched to eval mode for the duration of the call, so
    dropout layers (if the encoder or head has any) do not perturb the
    metrics. Their previous train/eval mode is restored afterwards.

    Args:
        LLModel: encoder called with input_ids / attention_mask /
            token_type_ids; its ``last_hidden_state`` is used.
        classifier: head whose ``test_step`` builds the report.
        test_loader: loader yielding dict batches with the keys 'input_ids',
            'attention_mask', 'token_type_ids' and 'labels'.
        process_outputLLModel_fn: callable applied to ``last_hidden_state``;
            its outputs are concatenated along dim 0.
        dict_form: forwarded to ``classifier.test_step``.

    Returns:
        The value returned by ``classifier.test_step``.
    """
    tensor_list = {'embedding':[], 'label':[]}
    llm_was_training, classifier_was_training = LLModel.training, classifier.training
    LLModel.eval()
    classifier.eval()
    try:
        with torch.no_grad():
            for databatch in test_loader:
                embedding = LLModel(input_ids=databatch['input_ids'].to(envir),
                                    attention_mask=databatch['attention_mask'].to(envir),
                                    token_type_ids=databatch['token_type_ids'].to(envir)).last_hidden_state
                tensor_list['embedding'].append(process_outputLLModel_fn(embedding))
                tensor_list['label'].append(databatch['labels'])
            result = classifier.test_step((torch.concat(tensor_list['embedding'], dim=0).to(envir),
                                           torch.concat(tensor_list['label'], dim=0).to(envir)), dict_form=dict_form)
    finally:
        # Hand the modules back in the mode the caller left them in.
        LLModel.train(llm_was_training)
        classifier.train(classifier_was_training)
    return result

In [None]:
def training_model(LLModel, classifier:NLI_model, dataset:DatasetDict, process_outputLLModel_fn, checkpoint:int, checkpoint_action, epochs=1000, best_model_func=None, run_name=None, checkpoint_dir='/content/checkpoint'):
    """Fine-tune the encoder and the classification head jointly.

    Every ``checkpoint`` training batches (including batch 0 of each epoch)
    the validation set is scored. When ``best_model_func`` is given and
    accepts the new result, the encoder is saved to ``checkpoint_dir`` and a
    copy of the classifier weights is kept. After training, that best state
    is restored and the validation and test sets are scored.

    Args:
        LLModel: encoder whose ``last_hidden_state`` is pooled.
        classifier: head providing ``training_step`` (the loss).
        dataset: mapping with the splits 'train', 'valid' and 'test'; batches
            must expose 'input_ids', 'attention_mask', 'token_type_ids' and
            'labels'.
        process_outputLLModel_fn: pooling applied to ``last_hidden_state``.
        checkpoint: number of batches between two validation runs.
        checkpoint_action: callable receiving the log line of each validation run.
        epochs: number of passes over the train split.
        best_model_func: optional ``f(result, best_result) -> bool`` deciding
            whether ``result`` becomes the new best.
        run_name: label printed in the final summary line. Defaults to the
            notebook-level variable ``input_type`` when it exists (the value
            the summary line used before), otherwise 'model'.
        checkpoint_dir: directory where the best encoder is saved and reloaded from.

    Returns:
        ``(LLModel, record_dict, test_acc, test_f1score)``; ``record_dict``
        holds the lists 'trainLoss', 'valAcc' and 'marco avg f1-score'.
    """
    if run_name is None:
        # Backward compatibility: the summary line used to read this global directly,
        # which raised NameError whenever it was not defined.
        run_name = globals().get('input_type', 'model')

    optimizer = torch.optim.Adam(list(classifier.parameters()) + list(LLModel.parameters()), lr=2e-05, weight_decay=0.01)
    record_dict = {'trainLoss':[], 'valAcc':[], 'marco avg f1-score':[]}        # Log of the training process

    track_best = best_model_func is not None
    best_classifier = None
    bestResult = None
    best_index = 0
    if track_best:
        print("Has best model criteria")

    train_loader = DataLoader(dataset['train'], batch_size=BATCH_SIZE, shuffle=False)
    valid_loader = DataLoader(dataset['valid'], batch_size=BATCH_SIZE, shuffle=False)
    test_loader = DataLoader(dataset['test'], batch_size=BATCH_SIZE, shuffle=False)

    for epoch in range(epochs):
        # Training step
        total_loss = 0
        for batch_idx, databatch in enumerate(tqdm(train_loader)):
            optimizer.zero_grad()
            embedding = LLModel(input_ids=databatch['input_ids'].to(envir),
                                attention_mask=databatch['attention_mask'].to(envir),
                                token_type_ids=databatch['token_type_ids'].to(envir)).last_hidden_state
            # Pool the encoder output (mean of the token vectors or the CLS vector).
            processed_embedding = process_outputLLModel_fn(embedding)

            # Cross-entropy loss computed by the classifier head.
            loss = classifier.training_step((processed_embedding, databatch['labels'].to(envir)))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

            if batch_idx % checkpoint == 0:
                # Run the current encoder + head through the validation phase.
                result = validation_phase(LLModel, classifier, valid_loader, process_outputLLModel_fn)
                if track_best and best_model_func(result, bestResult):
                    bestResult = result
                    # state_dict() returns references to the live tensors, so they are
                    # cloned; otherwise the snapshot would follow later optimizer steps.
                    best_classifier = {name: tensor.detach().clone()
                                       for name, tensor in classifier.state_dict().items()}
                    LLModel.save_pretrained(checkpoint_dir)
                    best_index = batch_idx
                    print(f"\nCheckpoint {batch_idx} -- Accuracy: {result['accuracy']} -- macro_f1: {result['macro avg']['f1-score']} -- loss: {loss.item()} -- best state: {best_index}")
                record_dict['valAcc'].append(result['accuracy'])
                record_dict['marco avg f1-score'].append(result['macro avg']['f1-score'])
                # Forward the evaluation line to the caller (e.g. to write it to a log file).
                checkpoint_action(f"\nStep {batch_idx + epoch*len(train_loader)} -- Accuracy: {result['accuracy']} -- macro_f1: {result['macro avg']['f1-score']} -- loss: {loss.item()}")
            # Running mean of the training loss within the current epoch.
            record_dict['trainLoss'].append(total_loss / (batch_idx + 1))

    # Load the best model and score the validation and test sets with it
    if track_best and best_classifier is not None:
        print(f"Load Best Model State Dict: {best_index}")
        classifier.load_state_dict(best_classifier)
        LLModel = AutoModel.from_pretrained(checkpoint_dir).to(envir)

    result = validation_phase(LLModel, classifier, valid_loader, process_outputLLModel_fn)
    print(f"- Finish --- {run_name} --- Validation Test Accuracy: {result['accuracy']} --- Validation Test F1 Score: {result['macro avg']['f1-score']}")

    result = testing_phase(LLModel, classifier, test_loader, process_outputLLModel_fn, True)
    test_acc = result['accuracy']
    test_f1score = result['macro avg']['f1-score']

    return LLModel, record_dict, test_acc, test_f1score

In [None]:
# Per-class weights for the cross-entropy loss (all 1.0 here), created on the training device.
class_weights = torch.tensor([1., 1., 1.], dtype=torch.float32).to(envir)
print(class_weights)

In [None]:
def createLLModel(model_name):
    """Load the pretrained encoder ``model_name`` onto ``envir`` with all parameters trainable."""
    encoder = AutoModel.from_pretrained(model_name)
    encoder = encoder.to(envir)
    # Enable gradients on every parameter so the whole encoder is fine-tuned.
    encoder.requires_grad_(True)
    return encoder

def best_model_func(result, bestResult):
    """Decide whether ``result`` should replace the best result so far.

    Args:
        result: classification report dict of the current evaluation.
        bestResult: report dict of the best evaluation so far, or None when
            nothing has been recorded yet.

    Returns:
        True when there is no previous best, or when the macro-average
        F1 score of ``result`` is strictly higher than that of ``bestResult``.
    """
    if bestResult is None:
        return True
    return result['macro avg']['f1-score'] > bestResult['macro avg']['f1-score']

In [None]:
# One training log per pooling variant.
log_dict = {i:{'trainLoss':[], 'valAcc':[], 'marco avg f1-score':[]} for i in MODEL_TYPE}
# 'cls'  -> vector at position 0 of the encoder output
# 'mean' -> mean over every position after the first (padding positions are not masked out)
# NOTE(review): `mapped_dataset` is not defined in the cells shown here - presumably the
# tokenized version of `dataset` (e.g. produced with mDeBERTa_tokenize); confirm it is
# created before this cell runs.
for input_type, fn in zip(MODEL_TYPE, [lambda x: x[:, 0, :], lambda x: torch.mean(x[:, 1:, :], dim=1)]):
    # A fresh head and a fresh encoder for each variant.
    classifier_model = NLI_model(input_dims=768, class_weights=class_weights).to(envir)
    llmodel = createLLModel(MODEL_NAME)
    # Each evaluation line is written to "<variant>_log.txt" through checkpoint_action.
    with open(f"{input_type}_log.txt", 'w', encoding='utf-8') as log:
        llmodel, log_dict[input_type], test_acc, test_f1score = training_model(LLModel=llmodel,
                                                                               classifier=classifier_model,
                                                                               dataset=mapped_dataset,
                                                                               process_outputLLModel_fn=fn,
                                                                               checkpoint=100,
                                                                               checkpoint_action=lambda x: log.write(x+"\n"),
                                                                               epochs=1,
                                                                               best_model_func=best_model_func)
    print(f'- Finish --- {input_type} --- Test Accuracy: {test_acc} --- Test F1 Score: {test_f1score}')
    torch.save({'model_state_dict' : classifier_model.state_dict()}, f"{input_type}.pt")    # Save the classifier head
    llmodel.save_pretrained(f'mDeBERTa-v3-base-mnli-xnli-{input_type}')         # Save the encoder returned by training_model

# Plotting the record
fig, axs = plt.subplots(3)  # 3 stacked subplots, one per recorded metric
for method, metrics in log_dict.items():
    for idx, (metric_name, record) in enumerate(metrics.items()):
        axs[idx].plot(record, label = method)
        axs[idx].set_title(metric_name)

plt.tight_layout()
# plt.legend() acts on the current axes only, i.e. the last subplot created.
plt.legend()
plt.savefig(f"plot.png", dpi=300)
plt.show()

print(f"Finish pretrained model: {MODEL_NAME}")