# 0. Imports, libraries and reusable functions

In [3]:
# Standard Library Imports
import ast
import copy
import csv
import json
import math
import os
import re
import time
import warnings
import logging
import random
import collections
from collections import Counter, defaultdict
from typing import List, Tuple, Optional
from IPython.display import HTML, display
import math
import time
from unidecode import unidecode
import string
import multiprocessing as mp



# Data Handling Libraries
import numpy as np
import pandas as pd
import csv
from torch.utils.data import random_split
import datasets
from datasets import ClassLabel, Sequence, Dataset, DatasetDict, load_dataset, load_metric, concatenate_datasets, load_from_disk


# Data Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns
# import scikitplot as skplt  # Uncomment if scikit-plot is installed and needed

# Machine Learning: Model Preparation
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support, f1_score
from sklearn.model_selection import cross_val_score, cross_validate, KFold, train_test_split
from sklearn.preprocessing import MinMaxScaler

# Machine Learning: Models and Frameworks
import tensorflow as tf
import torch
from torch.utils.data import DataLoader
import evaluate
import xgboost
import wandb
from xgboost import plot_importance  # Uncomment if xgboost importance plot is required


# NLP and Transformers
import spacy
import transformers
from transformers import (AdamW, AutoModelForSequenceClassification, AutoModelForQuestionAnswering, AutoModelForMultipleChoice,
                          AutoTokenizer, CamembertForSequenceClassification, DistilBertConfig,
                          DistilBertForSequenceClassification, DistilBertModel, EarlyStoppingCallback,
                          get_linear_schedule_with_warmup, RobertaForSequenceClassification, EvalPrediction,
                          Trainer, TrainerCallback, TrainingArguments, XLMRobertaForSequenceClassification,
                         DefaultDataCollator, BertForQuestionAnswering, DataCollatorWithPadding, PreTrainedTokenizerFast,
                         default_data_collator, is_torch_xla_available, pipeline)
from transformers.trainer_utils import PredictionOutput, speed_metrics

# Experiment Tracking and Optimization Utilities
import optuna
from optuna.trial import TrialState
# import wandb  # Uncomment if using Weights & Biases for experiment tracking

# Progress Bar Utilities
from tqdm.auto import tqdm


In [4]:
class LoggingCallback(TrainerCallback):
    """Trainer callback that appends every log record to a JSON-lines file.

    Parameters:
    - log_path (str): File that receives one JSON object per logging event.
      It is opened in append mode, so records accumulate across runs.
    """

    def __init__(self, log_path):
        self.log_path = log_path

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Write `logs` (without the "total_flos" entry) as one JSON line.

        Nothing is written when `logs` is None or when this is not the local
        main process.
        """
        # `logs` defaults to None; the previous `logs.pop(...)` raised
        # AttributeError in that case.
        if logs is None or not state.is_local_process_zero:
            return
        # Filter into a copy instead of popping: the same dict object is handed
        # to the other registered callbacks, so it must not be mutated here.
        record = {key: value for key, value in logs.items() if key != "total_flos"}
        with open(self.log_path, "a", encoding="utf-8") as f:
            f.write(json.dumps(record) + "\n")

# Pick the compute device once, then report what was found.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

if device.type == "cuda":
    print(f"GPU: {torch.cuda.get_device_name(0)} is available.")
else:
    print("No GPU available. Training will run on CPU.")

print(device)

GPU: NVIDIA GeForce RTX 4070 Ti SUPER is available.
cuda


# 1. Global Variables

In [8]:
## Arguments and global variables
# dataset_name and normalized_model_name are interpolated into the Trainer output_dir
# by create_training_args().
dataset_name="AR-LSAT"
pretrained_model_name = "microsoft/deberta-v3-base"
# Same model id with "/" replaced by "-" so it can be used as a single path component.
normalized_model_name = pretrained_model_name.replace("/", "-")
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
# Fail early if the loaded tokenizer is not a "fast" tokenizer.
assert isinstance( tokenizer, PreTrainedTokenizerFast )
data_collator = DefaultDataCollator()
max_length = 512 # The maximum length of a feature (question and context)
doc_stride = 128 # The authorized overlap between two parts of the context when splitting it is needed.
# NOTE(review): doc_stride, pad_on_right/right_padding, global_counter and
# traing_answer_mismatches are not referenced by any cell visible in this notebook;
# presumably carried over from an extractive-QA notebook — confirm before removing.
pad_on_right = right_padding = tokenizer.padding_side == 'right'
global_counter = 0
traing_answer_mismatches = []
logger = logging.getLogger(__name__)



# 2. Prepare the AR-LSAT Dataset 

In [11]:
# Load the combined dataset from the local 'cleaned_dataset' directory
# (path is relative to the notebook's working directory).
combined_dataset = load_from_disk('cleaned_dataset')

# Bare expression: shows the split / feature summary as the cell output.
combined_dataset

DatasetDict({
    train: Dataset({
        features: ['Context', 'Question', 'Options', 'Label_Text', 'Label', 'Type', 'Source Dataset'],
        num_rows: 1072514
    })
    validation: Dataset({
        features: ['Context', 'Question', 'Options', 'Label_Text', 'Label', 'Type', 'Source Dataset'],
        num_rows: 118521
    })
    test: Dataset({
        features: ['Context', 'Question', 'Options', 'Label_Text', 'Label', 'Type', 'Source Dataset'],
        num_rows: 200566
    })
})

In [43]:
# Restrict every split of the combined dataset to the rows that come from AR-LSAT.
def _is_ar_lsat(example):
    """Row predicate: True when the example's 'Source Dataset' is AR-LSAT."""
    return example['Source Dataset'] == 'AR-LSAT'


ar_lsat_train = combined_dataset['train'].filter(_is_ar_lsat)
ar_lsat_val = combined_dataset['validation'].filter(_is_ar_lsat)
ar_lsat_test = combined_dataset['test'].filter(_is_ar_lsat)


In [15]:
# Preprocessing function for multiple-choice tasks
def mcqa_preprocess_function(examples):
    """
    Tokenize a batch of multiple-choice examples as (context, "question option") pairs.

    Uses the notebook-level `tokenizer` and `max_length` globals.

    Parameters:
    - examples (dict of lists): Batch with the columns 'Context', 'Question',
      'Options' (one list of answer options per example) and 'Label'.

    Returns:
    - dict: Every field returned by the tokenizer, regrouped so that it is indexed
      as [example][choice][token], plus 'labels' copied from examples['Label'].

    Raises:
    - ValueError: If the examples in the batch do not all have the same, non-zero
      number of options. The regrouping step slices the flat tokenizer output in
      fixed-size chunks, so a ragged batch would silently pair options with the
      wrong example.
    """
    contexts = examples['Context']
    questions = examples['Question']
    options_list = examples['Options']

    # The first example defines the expected number of choices for the whole batch.
    num_choices = len(options_list[0])
    if num_choices == 0 or any(len(options) != num_choices for options in options_list):
        raise ValueError(
            "All examples in a batch must have the same non-zero number of options; "
            f"got option counts {sorted({len(options) for options in options_list})}."
        )

    # Flat, aligned lists: the context is repeated once per option and paired with
    # "<question> <option>". Built directly instead of sum(list_of_lists, []),
    # which is quadratic in the batch size.
    first_sentences = [context for context in contexts for _ in range(num_choices)]
    second_sentences = [
        f"{question} {option}"
        for question, options in zip(questions, options_list)
        for option in options
    ]

    # Tokenize the inputs; the length limit follows the notebook-wide max_length
    # setting instead of a second hard-coded 512.
    tokenized_examples = tokenizer(
        first_sentences,
        second_sentences,
        truncation=True,
        max_length=max_length,
        padding='max_length',
    )

    # Un-flatten the tokenized inputs to have shape (num_examples, num_choices, seq_length)
    tokenized_inputs = {
        key: [values[start:start + num_choices] for start in range(0, len(values), num_choices)]
        for key, values in tokenized_examples.items()
    }

    # Labels
    tokenized_inputs["labels"] = examples["Label"]

    return tokenized_inputs

# Tokenize the three AR-LSAT splits (train, validation, test) in batched mode.
encoded_ar_lsat_train, encoded_ar_lsat_val, encoded_ar_lsat_test = [
    split.map(mcqa_preprocess_function, batched=True)
    for split in (ar_lsat_train, ar_lsat_val, ar_lsat_test)
]

In [45]:
# Make every encoded split yield PyTorch tensors for the model inputs and labels only.
for encoded_split in (encoded_ar_lsat_train, encoded_ar_lsat_val, encoded_ar_lsat_test):
    encoded_split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


def get_train_encoded():
    """Return the notebook-global tokenized training split (encoded_ar_lsat_train)."""
    return encoded_ar_lsat_train

def get_val_encoded():
    """Return the notebook-global tokenized validation split (encoded_ar_lsat_val)."""
    return encoded_ar_lsat_val

def get_test_encoded():
    """Return the notebook-global tokenized test split (encoded_ar_lsat_test)."""
    return encoded_ar_lsat_test


# 3. Reusable Functions

In [33]:
# Load the accuracy metric through the `evaluate` library; compute_metrics() below
# calls accuracy.compute(...) on it.
accuracy = evaluate.load('accuracy')

# Define the compute_metrics function
def compute_metrics(eval_pred):
    """
    Score a set of predictions with accuracy and weighted F1.

    Parameters:
    - eval_pred: Two-item object that unpacks into (logits, labels). The predicted
      class of each row is the argmax of the logits along axis 1.

    Returns:
    - dict: {'eval_accuracy': ..., 'eval_f1': ...}
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)
    accuracy_result = accuracy.compute(predictions=predicted, references=gold)
    return {
        'eval_accuracy': accuracy_result['accuracy'],
        'eval_f1': f1_score(gold, predicted, average='weighted'),
    }

In [34]:
def create_training_args(run_name="Default-Run", num_train_epochs=3, learning_rate=4.92e-05, batch_size=4, **overrides):
    """
    Build the TrainingArguments shared by the runs in this notebook.

    Checkpoints are written to ./<dataset_name>/<run_name>/<normalized_model_name>,
    where dataset_name and normalized_model_name are notebook-level globals, not
    parameters of this function.

    Parameters:
    - run_name (str): The name of the run; becomes part of the output directory.
    - num_train_epochs (int): The number of epochs to train for.
    - learning_rate (float): The learning rate for training.
    - batch_size (int): The per-device training batch size. The per-device
      evaluation batch size stays at 1 unless overridden.
    - **overrides: Any other TrainingArguments keyword (e.g. warmup_steps, fp16,
      per_device_eval_batch_size). Overrides take precedence over the defaults
      set below.

    Returns:
    - TrainingArguments: A configured TrainingArguments instance.
    """
    output_dir = f"./{dataset_name}/{run_name}/{normalized_model_name}"

    settings = dict(
        output_dir=output_dir,
        overwrite_output_dir=True,
        # Keep the checkpoint with the highest validation accuracy.
        metric_for_best_model='eval_accuracy',
        greater_is_better=True,
        load_best_model_at_end=True,
        save_total_limit=3,
        # load_best_model_at_end needs evaluation and saving on the same schedule.
        eval_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=num_train_epochs,
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=1,
        warmup_steps=398,
        weight_decay=0.194,
        adam_beta1=0.837,
        adam_beta2=0.997,
        adam_epsilon=5.87e-07,
        lr_scheduler_type='cosine',
        # Mixed precision only when a CUDA device exists: with fp16=True on a
        # CPU-only machine TrainingArguments refuses to initialise, although this
        # notebook explicitly supports falling back to CPU.
        fp16=torch.cuda.is_available(),
    )
    settings.update(overrides)

    return TrainingArguments(**settings)


In [52]:
def create_trainer(run_name="Default-Run", num_train_epochs=3, learning_rate=4.92e-05, batch_size=4):
    """
    Build a Trainer for the notebook-global `model`.

    The training arguments come from create_training_args(); the train and
    validation data come from get_train_encoded() / get_val_encoded(); the global
    `tokenizer` and `compute_metrics` are attached as well.

    Parameters:
    - run_name (str): Forwarded to create_training_args().
    - num_train_epochs (int): Forwarded to create_training_args().
    - learning_rate (float): Forwarded to create_training_args().
    - batch_size (int): Forwarded to create_training_args().

    Returns:
    - Trainer: The configured trainer (not yet trained).
    """
    run_args = create_training_args(
        run_name=run_name,
        num_train_epochs=num_train_epochs,
        learning_rate=learning_rate,
        batch_size=batch_size,
    )
    return Trainer(
        model=model,
        args=run_args,
        train_dataset=get_train_encoded(),
        eval_dataset=get_val_encoded(),
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )


# 4. Fine-tuning DeBERTa on MCQA task (AR-LSAT Dataset)

## 4.1 Evaluate Vanilla DeBERTa (Acc = 15.22%)

In [16]:
# Load the model
# The output of this cell reports the classifier and pooler weights as newly
# initialized, so the multiple-choice head is untrained in this evaluation.
model = AutoModelForMultipleChoice.from_pretrained(pretrained_model_name)

# Create the Trainer
# create_trainer() picks up the notebook-global `model` bound just above.
trainer = create_trainer()

# Evaluate the model on the test set
test_results = trainer.evaluate(eval_dataset=get_test_encoded())
print(f"Test Results: {test_results}")

Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


wandb: Currently logged in as: mzak071 (COMPSCI714). Use `wandb login --relogin` to force relogin


Test Results: {'eval_accuracy': 0.15217391304347827, 'eval_f1': 0.1483351657844587, 'eval_loss': 1.609476923942566, 'eval_model_preparation_time': 0.001, 'eval_runtime': 8.7947, 'eval_samples_per_second': 26.152, 'eval_steps_per_second': 26.152}


## 4.2 Fine-Tune and Evaluate Vanilla DeBERTa (Acc=22.61%)

In [18]:
# Load the model
# Re-loaded from the pretrained checkpoint so this run starts from fresh weights.
model = AutoModelForMultipleChoice.from_pretrained(pretrained_model_name)
# Create the Trainer
# NOTE(review): the recorded output of this cell lists 6 epochs, while
# create_trainer() defaults to num_train_epochs=3 — presumably the cell was run
# with a different default; confirm before relying on the reported accuracy.
trainer = create_trainer()
# Train the model
trainer.train()
# Evaluate the model on the test set
test_results = trainer.evaluate(eval_dataset=get_test_encoded())
print(f"Test Results: {test_results}")

Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,1.609375,0.238095,0.217696
2,1.610500,1.609375,0.199134,0.162603
3,1.610900,1.609383,0.177489,0.126622
4,1.614800,1.609375,0.229437,0.188633
5,1.614800,1.609375,0.212121,0.145762
6,1.610600,1.609375,0.212121,0.142153


Test Results: {'eval_accuracy': 0.22608695652173913, 'eval_f1': 0.20898684836140372, 'eval_loss': 1.609375, 'eval_runtime': 9.3401, 'eval_samples_per_second': 24.625, 'eval_steps_per_second': 24.625, 'epoch': 6.0}


## 4.3 Evaluate SQUAD DeBERTa (Acc=22.61%)

In [29]:
# Local checkpoint directory (relative to the notebook's working directory).
path = "./squad-trained-model"
# The output of this cell reports the classifier and pooler weights as newly
# initialized, so only the encoder weights come from this checkpoint.
model =  AutoModelForMultipleChoice.from_pretrained(path)
# Create the Trainer
trainer = create_trainer(run_name="Squad-Run")
# Evaluate the model on the test set
test_results = trainer.evaluate(eval_dataset=get_test_encoded())
print(f"Test Results: {test_results}")

Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at ./squad-trained-model and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Test Results: {'eval_accuracy': 0.22608695652173913, 'eval_f1': 0.22492208750128037, 'eval_loss': 1.6092263460159302, 'eval_model_preparation_time': 0.002, 'eval_runtime': 9.2167, 'eval_samples_per_second': 24.955, 'eval_steps_per_second': 24.955}


## 4.4 Fine-Tune and Evaluate SQUAD DeBERTa (Acc=23.91%)


In [22]:
# Local checkpoint directory (relative to the notebook's working directory).
path = "./squad-trained-model"
# Re-loaded from disk so fine-tuning starts from the checkpoint, not from the
# `model` object left over from an earlier cell.
model =  AutoModelForMultipleChoice.from_pretrained(path)
# Create the Trainer
trainer = create_trainer(run_name="Squad-Run")

# Train the model
trainer.train()

# Evaluate the model on the test set
test_results = trainer.evaluate(eval_dataset=get_test_encoded())
print(f"Test Results: {test_results}")

Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at ./squad-trained-model and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
wandb: Currently logged in as: mzak071 (COMPSCI714). Use `wandb login --relogin` to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,1.609358,0.246753,0.245915
2,1.615800,1.609392,0.220779,0.222109
3,1.618300,1.6094,0.199134,0.199943


Test Results: {'eval_accuracy': 0.2391304347826087, 'eval_f1': 0.23980029660214316, 'eval_loss': 1.6094005107879639, 'eval_runtime': 9.0823, 'eval_samples_per_second': 25.324, 'eval_steps_per_second': 25.324, 'epoch': 3.0}


## 4.5 Evaluate Trained DeBERTa on Other Datasets

In [54]:
# Checkpoint directory under ./LogiQA; it is not created by any cell visible in
# this notebook, so it must already exist on disk.
path = "./LogiQA/Squad-Run/microsoft-deberta-v3-base/checkpoint-12567"
model =  AutoModelForMultipleChoice.from_pretrained(path)
# Create the Trainer
# run_name only affects the output directory built by create_training_args().
trainer = create_trainer(run_name="LogiQA-Run")

# Evaluate the model on the test set
test_results = trainer.evaluate(eval_dataset=get_test_encoded())
print(f"Test Results: {test_results}")

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Test Results: {'eval_accuracy': 0.1956521739130435, 'eval_f1': 0.19823219158947752, 'eval_loss': 1.6125339269638062, 'eval_model_preparation_time': 0.0, 'eval_runtime': 10.2579, 'eval_samples_per_second': 22.422, 'eval_steps_per_second': 22.422}


In [60]:
# Checkpoint directory under ./ReClor; it is not created by any cell visible in
# this notebook, so it must already exist on disk.
path = "./ReClor/Squad-Run/microsoft-deberta-v3-base-best_model/checkpoint-3105"
model =  AutoModelForMultipleChoice.from_pretrained(path)
# Create the Trainer
# NOTE(review): run_name is spelled "ReColr-Run" while the checkpoint path says
# "ReClor". The run name becomes part of the output directory, so correct the
# spelling deliberately rather than in passing.
trainer = create_trainer(run_name="ReColr-Run")

# Evaluate the model on the test set
test_results = trainer.evaluate(eval_dataset=get_test_encoded())
print(f"Test Results: {test_results}")

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Test Results: {'eval_accuracy': 0.20869565217391303, 'eval_f1': 0.2074393665896999, 'eval_loss': 2.0346715450286865, 'eval_model_preparation_time': 0.0, 'eval_runtime': 9.2238, 'eval_samples_per_second': 24.935, 'eval_steps_per_second': 24.935}


In [70]:
# Train the model
# NOTE(review): this cell does not create `trainer`; it trains whichever Trainer
# a previously executed cell left in the kernel (presumably the ReClor checkpoint
# loaded in the cell above — confirm). On a fresh kernel it must run after that cell.
trainer.train()

# Evaluate the model on the test set
test_results = trainer.evaluate(eval_dataset=get_test_encoded())
print(f"Test Results: {test_results}")

Epoch,Training Loss,Validation Loss,Accuracy,F1,Model Preparation Time
1,No log,1.609472,0.229437,0.228899,0.0
2,1.666500,1.607451,0.194805,0.189353,0.0
3,1.581200,1.662142,0.17316,0.172048,0.0


Test Results: {'eval_accuracy': 0.20434782608695654, 'eval_f1': 0.20337999588676806, 'eval_loss': 1.609222173690796, 'eval_model_preparation_time': 0.0, 'eval_runtime': 9.2973, 'eval_samples_per_second': 24.738, 'eval_steps_per_second': 24.738, 'epoch': 3.0}


# End of Notebook