# Multiclass Models Training for BABE Dataset 

## Imports, libraries and reusable functions

In [40]:
# General Utilities
import os
import re
import time
import ast
import warnings
import math
import copy
import json
from collections import Counter
from tqdm.notebook import tqdm

# Data Manipulation
import pandas as pd
import numpy as np
import csv

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import plot_importance

# Machine Learning Utilities
import xgboost
import tensorflow as tf
import torch
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, KFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
# import scikitplot as skplt  # Uncomment if scikit-plot is installed and needed

# Transformers and Hugging Face Utilities
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification, XLMRobertaForSequenceClassification,
    DistilBertConfig, DistilBertModel, DistilBertForSequenceClassification, CamembertForSequenceClassification, RobertaForSequenceClassification,
    AdamW, get_linear_schedule_with_warmup, TrainerCallback, EarlyStoppingCallback, Trainer, TrainingArguments
)
from datasets import load_dataset, DatasetDict, Dataset

# Experiment Tracking
#import wandb  # Uncomment if using Weights & Biases for experiment tracking


In [2]:
# Resolve the compute device once, then report which accelerator (if any) will be used.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print(f"GPU: {torch.cuda.get_device_name(0)} is available.")
else:
    print("No GPU available. Training will run on CPU.")

print(device)

GPU: NVIDIA GeForce RTX 4070 Ti SUPER is available.
cuda


In [312]:
# Reusable Function Definitions

def custom_label(row):
    """
    Derive the textual class label for one dataset row.

    A row is 'Non-biased' when its outlet ``type`` is 'center' or its
    ``label_bias`` is 'Non-biased'; otherwise the label is the outlet type
    followed by '-Biased' (for example 'left-Biased').

    Args:
        row (pd.Series): A row exposing the 'type' and 'label_bias' fields.

    Returns:
        str: The derived label string.
    """
    is_neutral = row['type'] == 'center' or row['label_bias'] == 'Non-biased'
    return 'Non-biased' if is_neutral else f"{row['type']}-Biased"

def load_and_prepare_data(file_paths):
    """
    Load the annotated Excel files, derive integer class labels and build stratified splits.

    Rows without a ``type`` value and rows whose ``label_bias`` is 'No agreement' are
    discarded. The remaining rows are labelled with ``custom_label`` and encoded as
    0 (Non-biased), 1 (left-Biased) or 2 (right-Biased). The data is then split
    80/10/10 into train/validation/test, stratified by label with a fixed seed.

    Args:
        file_paths (list): List of Excel file paths to load and concatenate. Each file
            must provide the 'text', 'type' and 'label_bias' columns.

    Returns:
        DatasetDict: 'train', 'val' and 'test' datasets with 'text' and 'labels' columns.

    Raises:
        ValueError: If no rows remain after filtering, or if a row produces a label
            that has no integer mapping.
    """
    label_mapping = {'Non-biased': 0, 'left-Biased': 1, 'right-Biased': 2}

    data = pd.concat([pd.read_excel(path) for path in file_paths], ignore_index=True)

    # Filter before labelling so custom_label never sees rows that are dropped anyway.
    # The explicit copy makes the column assignment below operate on an independent
    # frame instead of a filtered view (avoids chained-assignment warnings).
    data = data.dropna(subset=['type'])
    data = data[data['label_bias'] != 'No agreement'].copy()
    if data.empty:
        raise ValueError("No usable rows remain after filtering the input files")

    text_labels = data.apply(custom_label, axis=1)

    # Fail loudly on labels outside the mapping rather than letting strings and
    # integers mix in the 'labels' column.
    unmapped = sorted(set(text_labels) - set(label_mapping))
    if unmapped:
        raise ValueError(f"Unexpected labels without an integer mapping: {unmapped}")

    data['labels'] = text_labels.map(label_mapping).astype('int64')
    data = data[['text', 'labels']]

    # Split data: hold out 10% for test, then 1/9 of the remaining 90% (= 10% of the
    # total) for validation.
    train_temp, test = train_test_split(data, test_size=0.10, random_state=42, stratify=data['labels'])
    train, val = train_test_split(train_temp, test_size=1/9, random_state=42, stratify=train_temp['labels'])

    # Convert to Hugging Face datasets
    return DatasetDict({
        'train': Dataset.from_pandas(train, preserve_index=False),
        'val': Dataset.from_pandas(val, preserve_index=False),
        'test': Dataset.from_pandas(test, preserve_index=False)
    })
    
def ensemble_score(outputs, true_labels):
    """
    Compute the ensemble score by majority voting from a list of model outputs.

    Each model votes for the class with the highest score per sample. Ties are
    resolved deterministically in favour of the lowest class index.

    Args:
        outputs (list): Model outputs exposing a ``predictions`` array; expected to be
            per-sample class scores of shape (n_samples, n_classes) for every model.
        true_labels (array): Actual labels for the evaluation dataset.

    Returns:
        tuple: Tuple containing the final predicted labels (list) and the accuracy.

    Raises:
        ValueError: If ``outputs`` is empty, or if the models' vote arrays do not
            share the same shape (raised by ``np.stack``).
    """
    if not outputs:
        raise ValueError("At least one model output is required")

    # One row of class votes per model: shape (n_models, n_samples)
    votes = np.stack([np.argmax(np.asarray(output.predictions), axis=-1) for output in outputs])

    # bincount counts the votes per class; argmax returns the first (lowest) class with
    # the highest count, so tie-breaking no longer depends on set iteration order.
    final_labels = [np.bincount(sample_votes).argmax() for sample_votes in votes.T]

    acc = accuracy_score(true_labels, final_labels)
    return final_labels, acc

def ensemble_score_total_sum(outputs, true_labels, weights=None):
    """
    Score an ensemble that adds up the (optionally weighted) per-class predictions of
    all models and picks the class with the largest total for every sample.

    Args:
        outputs (list): Model outputs exposing a ``predictions`` array.
        true_labels (array): Actual labels for the evaluation dataset.
        weights (list, optional): One weight per entry of ``outputs``. Defaults to
            None, which gives every model a weight of 1.

    Returns:
        tuple: Tuple containing the final predicted labels and the accuracy.

    Raises:
        ValueError: If the number of weights differs from the number of outputs.
    """
    n_models = len(outputs)
    model_weights = [1] * n_models if weights is None else weights

    if len(model_weights) != n_models:
        raise ValueError("The number of weights must match the number of outputs")

    def _weighted_argmax(sample_preds):
        # Scale each model's prediction for this sample, add them up, pick the top class
        scaled = [p * w for p, w in zip(sample_preds, model_weights)]
        return np.argmax(np.sum(scaled, axis=0))

    per_model_predictions = [output.predictions for output in outputs]
    final_labels = [_weighted_argmax(sample) for sample in zip(*per_model_predictions)]

    acc = accuracy_score(true_labels, final_labels)
    return final_labels, acc

class LoggingCallback(TrainerCallback):
    """
    Trainer callback that appends every log record to a JSON-lines file.

    Args:
        log_path (str): Path of the file the log records are appended to.
    """
    def __init__(self, log_path):
        self.log_path = log_path

    def on_log(self, args, state, control, logs=None, **kwargs):
        """
        Append ``logs`` (without the 'total_flos' entry) as one JSON line.

        Nothing is written when ``logs`` is None or when this is not the local
        main process.
        """
        if logs is None or not state.is_local_process_zero:
            return

        # Build a filtered copy instead of popping from `logs`: the dict is supplied by
        # the caller, so removing keys in place would change what the caller still holds.
        record = {key: value for key, value in logs.items() if key != "total_flos"}

        with open(self.log_path, "a", encoding="utf-8") as f:
            f.write(json.dumps(record) + "\n")

def compute_metrics(eval_pred):
    """
    Turn model logits and reference labels into the metric dictionary used for evaluation.

    Args:
        eval_pred (tuple): Pair of (logits, labels).

    Returns:
        dict: Accuracy, macro-averaged precision/recall/F1, and the custom score
        ``0.5 * accuracy + 0.5 * f1``, all under 'eval_'-prefixed keys.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)

    # Macro averaging weighs every class equally; undefined ratios count as 0
    macro_precision, macro_recall, macro_f1, _support = precision_recall_fscore_support(
        labels, predicted_classes, average='macro', zero_division=0
    )
    acc = accuracy_score(labels, predicted_classes)

    metrics = {
        "eval_accuracy": acc,
        "eval_precision": macro_precision,
        "eval_recall": macro_recall,
        "eval_f1": macro_f1,
    }
    # Custom model-selection score: equal blend of accuracy and macro F1
    metrics["eval_cust_performance"] = 0.5 * acc + 0.5 * macro_f1
    return metrics

In [216]:
def get_training_arguments(model_name, classification_type):
    """
    Build the TrainingArguments for one model / task combination.

    Args:
        model_name (str): Name of the model ('RoBERTa', 'DistilBERT', 'XLM-RoBERTa').
        classification_type (str): Type of classification ('binary', 'multiclass').

    Returns:
        TrainingArguments: Configured training arguments.

    Raises:
        KeyError: If the model name or classification type is not in the table below.
    """
    def _hp(learning_rate, batch_size, warmup_steps, weight_decay):
        # Small constructor so the table below stays one line per configuration
        return {
            'learning_rate': learning_rate,
            'batch_size': batch_size,
            'warmup_steps': warmup_steps,
            'weight_decay': weight_decay,
        }

    # Hyperparameters per model and classification type
    tuned = {
        'RoBERTa': {
            'binary': _hp(3.41877e-05, 16, 387, 0.06326),
            'multiclass': _hp(5.57274e-05, 32, 475, 0.15220),
        },
        'DistilBERT': {
            'binary': _hp(7.23011e-05, 32, 194, 0.29289),
            'multiclass': _hp(0.00011143, 16, 324, 0.04303),
        },
        'XLM-RoBERTa': {
            'binary': _hp(7.43270e-05, 32, 187, 0.11168),
            'multiclass': _hp(4.09464e-05, 16, 481, 0.22781),
        },
    }

    chosen = tuned[model_name][classification_type]

    # Settings shared by every model: up to 20 epochs, evaluate and checkpoint once per
    # epoch, and reload the checkpoint with the best custom score when training ends.
    # Optional knobs currently left disabled: seed=244, save_total_limit=3.
    settings = {
        'output_dir': f"{model_name.lower()}-{classification_type}-model",
        'per_device_train_batch_size': chosen['batch_size'],
        'per_device_eval_batch_size': chosen['batch_size'],
        'num_train_epochs': 20,
        'evaluation_strategy': "epoch",
        'save_strategy': "epoch",
        'learning_rate': chosen['learning_rate'],
        'load_best_model_at_end': True,
        'metric_for_best_model': 'eval_cust_performance',
        'warmup_steps': chosen['warmup_steps'],
        'weight_decay': chosen['weight_decay'],
        'lr_scheduler_type': 'cosine_with_restarts',
    }
    return TrainingArguments(**settings)

def add_early_stopping(trainer, patience=3, threshold=0.01):
    """
    Register an EarlyStoppingCallback on the given Trainer and hand the Trainer back.

    Args:
        trainer (Trainer): The Trainer instance that receives the callback.
        patience (int): Value passed as ``early_stopping_patience``.
        threshold (float): Value passed as ``early_stopping_threshold``.

    Returns:
        Trainer: The same Trainer instance, now carrying the early stopping callback.
    """
    trainer.add_callback(
        EarlyStoppingCallback(
            early_stopping_patience=patience,
            early_stopping_threshold=threshold,
        )
    )
    return trainer

## 1. RoBERTa Model Training for Multiclass Classification

In [96]:
# Load and process data (shared by all three models in this notebook)
file_paths = ['data/final_labels_SG1.xlsx', 'data/final_labels_SG2.xlsx']
dataset = load_and_prepare_data(file_paths)

# Prepare tokenizer and model
Roberta_tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
# Tokenize with Roberta_tokenizer: a bare `tokenizer` name is not defined by this notebook,
# so referencing it either fails or silently picks up a stale, unrelated tokenizer.
Roberta_tokenized_datasets = dataset.map(lambda x: Roberta_tokenizer(x['text'], padding=True, truncation=True), batched=True)
# The raw text is no longer needed once it has been tokenized
Roberta_tokenized_datasets = Roberta_tokenized_datasets.remove_columns(["text"])
Roberta_tokenized_datasets.set_format("torch")

# Initialize the model and trainer (3 labels: Non-biased, left-Biased, right-Biased)
RobertaModel = RobertaForSequenceClassification.from_pretrained("FacebookAI/roberta-base", num_labels=3)

RobertaTrainer = Trainer(
    model=RobertaModel,
    args=get_training_arguments('RoBERTa', 'multiclass'),
    train_dataset=Roberta_tokenized_datasets['train'],
    eval_dataset=Roberta_tokenized_datasets['val'],
    tokenizer=Roberta_tokenizer,
    compute_metrics=compute_metrics
)

# Adding early stopping callback
RobertaTrainer = add_early_stopping(RobertaTrainer, patience=3, threshold=0.01)

Map:   0%|          | 0/3375 [00:00<?, ? examples/s]

Map:   0%|          | 0/422 [00:00<?, ? examples/s]

Map:   0%|          | 0/422 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [62]:
# Start fine-tuning RoBERTa. The training arguments request up to 20 epochs with
# evaluation and checkpointing once per epoch; the early stopping callback added
# above (patience=3) may end the run earlier.
RobertaTrainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Cust Performance
1,No log,0.798678,0.545024,0.181675,0.333333,0.235174,0.390099
2,No log,0.732716,0.694313,0.641549,0.565079,0.57017,0.632242
3,No log,0.558202,0.779621,0.739259,0.738869,0.730587,0.755104
4,No log,0.564862,0.819905,0.80121,0.785438,0.787528,0.803717
5,0.641000,0.632724,0.812796,0.785497,0.786473,0.785951,0.799374
6,0.641000,0.656058,0.829384,0.816538,0.802652,0.809182,0.819283
7,0.641000,0.674551,0.817536,0.7933,0.804294,0.792749,0.805142
8,0.641000,0.748298,0.827014,0.802001,0.819593,0.809953,0.818484
9,0.641000,0.871611,0.831754,0.814087,0.806233,0.809995,0.820874


TrainOutput(global_step=954, training_loss=0.4380696374665266, metrics={'train_runtime': 146.2844, 'train_samples_per_second': 461.43, 'train_steps_per_second': 14.492, 'total_flos': 1919969956658250.0, 'train_loss': 0.4380696374665266, 'epoch': 9.0})

In [98]:
# Evaluate the model currently held by RobertaTrainer on the held-out test split
# and print the raw metric dictionary.
eval_results = RobertaTrainer.evaluate(Roberta_tokenized_datasets['test'])
print(eval_results)

{'eval_accuracy': 0.23459715639810427, 'eval_precision': 0.17193254506687342, 'eval_recall': 0.23821510297482837, 'eval_f1': 0.17649506427915518, 'eval_cust_performance': 0.20554611033862974, 'eval_loss': 1.0967382192611694, 'eval_runtime': 0.8332, 'eval_samples_per_second': 506.477, 'eval_steps_per_second': 16.803}


In [75]:
# Run inference on the test split; `metrics` holds every test metric (loss, accuracy,
# precision, recall, F1, custom score, runtime), so the label says "metrics" rather
# than "accuracy".
RobertaOutput = RobertaTrainer.predict(Roberta_tokenized_datasets['test'])
print(f"RoBERTa test metrics: {RobertaOutput.metrics}")

ROBERTA test accuracy: {'test_loss': 2.3106367588043213, 'test_eval_accuracy': 0.3246445497630332, 'test_eval_precision': 0.25392937334149823, 'test_eval_recall': 0.3217540714178993, 'test_eval_f1': 0.24435600441249195, 'test_eval_cust_performance': 0.2845002770877626, 'test_runtime': 0.572, 'test_samples_per_second': 737.762, 'test_steps_per_second': 24.476}


In [15]:
# Save the Best Model
# NOTE(review): 'xxx' looks like a placeholder for the run identifier — the recorded
# output and the load cell below use '865'. Confirm the directory name before running.
RobertaModel.save_pretrained('./Saved-Models/RobertaMulticlass/xxx')  # Save model weights and config for later use
Roberta_tokenizer.save_pretrained('./Saved-Models/RobertaMulticlass/xxx')  # Save the matching tokenizer files alongside

('./Saved-Models/RobertaMulticlass/865\\tokenizer_config.json',
 './Saved-Models/RobertaMulticlass/865\\special_tokens_map.json',
 './Saved-Models/RobertaMulticlass/865\\vocab.json',
 './Saved-Models/RobertaMulticlass/865\\merges.txt',
 './Saved-Models/RobertaMulticlass/865\\added_tokens.json',
 './Saved-Models/RobertaMulticlass/865\\tokenizer.json')

In [228]:
# Restore the saved RoBERTa checkpoint (tokenizer and fine-tuned weights) from disk
Roberta_tokenizer = AutoTokenizer.from_pretrained('./Saved-Models/RobertaMulticlass/865')
RobertaModel = RobertaForSequenceClassification.from_pretrained('./Saved-Models/RobertaMulticlass/865')

# Wrap the restored model in a fresh Trainer; no early stopping callback is attached here
RobertaTrainer = Trainer(
    model=RobertaModel,
    tokenizer=Roberta_tokenizer,
    args=get_training_arguments('RoBERTa', 'multiclass'),
    train_dataset=Roberta_tokenized_datasets['train'],
    eval_dataset=Roberta_tokenized_datasets['val'],
    compute_metrics=compute_metrics,
)



In [230]:
# Predict on the test split with the reloaded RoBERTa model and report accuracy and
# macro F1. Trainer.predict prefixes metric names with 'test_', hence 'test_eval_*'.
RobertaOutput = RobertaTrainer.predict(Roberta_tokenized_datasets['test'])
print(f"RoBERTa test accuracy: {RobertaOutput.metrics['test_eval_accuracy']} , test f1: {RobertaOutput.metrics['test_eval_f1']}")


RoBERTa test accuracy: 0.8649289099526066 , test f1: 0.847069373114861


## 2. DistilBert Model Training for Multiclass Classification

In [236]:
# Tokenizer for DistilBERT
Distilbert_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize every split in batches, then drop the raw text and expose torch tensors
Distilbert_tokenized_datasets = dataset.map(
    lambda x: Distilbert_tokenizer(x['text'], padding=True, truncation=True),
    batched=True,
).remove_columns(["text"])
Distilbert_tokenized_datasets.set_format("torch")

# Fresh DistilBERT classifier with a 3-way classification head
DistilbertModel = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

# Build the trainer and attach early stopping (patience of 5 evaluations) in one step
DistilbertTrainer = add_early_stopping(
    Trainer(
        model=DistilbertModel,
        args=get_training_arguments('DistilBERT', 'multiclass'),
        train_dataset=Distilbert_tokenized_datasets['train'],
        eval_dataset=Distilbert_tokenized_datasets['val'],
        tokenizer=Distilbert_tokenizer,
        compute_metrics=compute_metrics,
    ),
    patience=5,
    threshold=0.01,
)

Map:   0%|          | 0/3375 [00:00<?, ? examples/s]

Map:   0%|          | 0/422 [00:00<?, ? examples/s]

Map:   0%|          | 0/422 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [238]:
# Start fine-tuning DistilBERT. The training arguments request up to 20 epochs with
# evaluation and checkpointing once per epoch; the early stopping callback added
# above (patience=5) may end the run earlier.
DistilbertTrainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Cust Performance
1,No log,0.786674,0.658768,0.64741,0.538893,0.513398,0.586083
2,No log,0.586564,0.7891,0.758598,0.752543,0.745942,0.767521
3,0.706700,0.609648,0.781991,0.756204,0.772402,0.754002,0.767996
4,0.706700,0.811806,0.798578,0.776604,0.766361,0.763939,0.781258
5,0.290400,0.801942,0.831754,0.806819,0.81653,0.811431,0.821592
6,0.290400,0.716155,0.845972,0.827361,0.826996,0.826888,0.83643
7,0.290400,0.736559,0.824645,0.806815,0.800477,0.797172,0.810908
8,0.171900,0.876795,0.822275,0.803616,0.794474,0.798147,0.810211
9,0.171900,0.907476,0.829384,0.80445,0.824872,0.813367,0.821375
10,0.095200,0.994924,0.831754,0.813348,0.816747,0.812291,0.822022


TrainOutput(global_step=2321, training_loss=0.28060221805597163, metrics={'train_runtime': 108.3237, 'train_samples_per_second': 623.132, 'train_steps_per_second': 38.957, 'total_flos': 1143016109185590.0, 'train_loss': 0.28060221805597163, 'epoch': 11.0})

In [240]:
# Predict on the test split with the freshly trained DistilBERT model and report
# accuracy and macro F1 ('test_' is the prefix Trainer.predict adds to metric names).
DistilbertOutput = DistilbertTrainer.predict(Distilbert_tokenized_datasets['test'])
print(f"DistilBERT test accuracy: {DistilbertOutput.metrics['test_eval_accuracy']} , test f1: {DistilbertOutput.metrics['test_eval_f1']}")


DistilBERT test accuracy: 0.8270142180094787 , test f1: 0.8034786863969533


In [214]:
# Save the Best Model
# NOTE(review): 'xxx' looks like a placeholder for the run identifier — the recorded
# output and the load cell below use '846'. Confirm the directory name before running.
DistilbertModel.save_pretrained('./Saved-Models/DistilbertMulticlass/xxx')  # Save model weights and config for later use
Distilbert_tokenizer.save_pretrained('./Saved-Models/DistilbertMulticlass/xxx')  # Save the matching tokenizer files alongside


('./Saved-Models/DistilbertMulticlass/846\\tokenizer_config.json',
 './Saved-Models/DistilbertMulticlass/846\\special_tokens_map.json',
 './Saved-Models/DistilbertMulticlass/846\\vocab.txt',
 './Saved-Models/DistilbertMulticlass/846\\added_tokens.json',
 './Saved-Models/DistilbertMulticlass/846\\tokenizer.json')

In [242]:
# Restore the saved DistilBERT checkpoint (tokenizer and fine-tuned weights) from disk
Distilbert_tokenizer = AutoTokenizer.from_pretrained('./Saved-Models/DistilbertMulticlass/846')
DistilbertModel = DistilBertForSequenceClassification.from_pretrained('./Saved-Models/DistilbertMulticlass/846')

# Wrap the restored model in a fresh Trainer; no early stopping callback is attached here
DistilbertTrainer = Trainer(
    model=DistilbertModel,
    tokenizer=Distilbert_tokenizer,
    args=get_training_arguments('DistilBERT', 'multiclass'),
    train_dataset=Distilbert_tokenized_datasets['train'],
    eval_dataset=Distilbert_tokenized_datasets['val'],
    compute_metrics=compute_metrics,
)




In [244]:
# Predict on the test split with the reloaded DistilBERT checkpoint and report
# accuracy and macro F1 ('test_' is the prefix Trainer.predict adds to metric names).
DistilbertOutput = DistilbertTrainer.predict(Distilbert_tokenized_datasets['test'])
print(f"DistilBERT test accuracy: {DistilbertOutput.metrics['test_eval_accuracy']} , test f1: {DistilbertOutput.metrics['test_eval_f1']}")


DistilBERT test accuracy: 0.8459715639810427 , test f1: 0.8230809090236137


## 3. XLM-RoBERTa Model Training for Multiclass Classification

In [249]:
# Tokenizer for XLM-RoBERTa
XLM_RoBERTa_tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")

# Tokenize every split in batches, then drop the raw text and expose torch tensors
XLM_RoBERTa_tokenized_datasets = dataset.map(
    lambda x: XLM_RoBERTa_tokenizer(x['text'], padding=True, truncation=True),
    batched=True,
).remove_columns(["text"])
XLM_RoBERTa_tokenized_datasets.set_format("torch")

# Fresh XLM-RoBERTa classifier with a 3-way classification head
XLMRoBERTaModel = XLMRobertaForSequenceClassification.from_pretrained("FacebookAI/xlm-roberta-base", num_labels=3)

# Build the trainer and attach early stopping (patience of 5 evaluations) in one step
XLMRoBERTaTrainer = add_early_stopping(
    Trainer(
        model=XLMRoBERTaModel,
        args=get_training_arguments('XLM-RoBERTa', 'multiclass'),
        train_dataset=XLM_RoBERTa_tokenized_datasets['train'],
        eval_dataset=XLM_RoBERTa_tokenized_datasets['val'],
        tokenizer=XLM_RoBERTa_tokenizer,
        compute_metrics=compute_metrics,
    ),
    patience=5,
    threshold=0.01,
)

Map:   0%|          | 0/3375 [00:00<?, ? examples/s]

Map:   0%|          | 0/422 [00:00<?, ? examples/s]

Map:   0%|          | 0/422 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [251]:
# Start fine-tuning XLM-RoBERTa. The training arguments request up to 20 epochs with
# evaluation and checkpointing once per epoch; the early stopping callback added
# above (patience=5) may end the run earlier.
XLMRoBERTaTrainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Cust Performance
1,No log,0.820231,0.63981,0.55202,0.50367,0.492,0.565905
2,No log,0.713222,0.699052,0.626746,0.619933,0.622905,0.660979
3,0.898100,0.695157,0.718009,0.644848,0.65122,0.631218,0.674614
4,0.898100,0.739515,0.71564,0.680439,0.648218,0.613825,0.664732
5,0.557000,0.68217,0.772512,0.743801,0.707893,0.718118,0.745315
6,0.557000,0.788012,0.812796,0.79857,0.766674,0.777744,0.79527
7,0.557000,0.735734,0.817536,0.807996,0.783989,0.790287,0.803911
8,0.303200,0.981611,0.815166,0.796314,0.790343,0.792617,0.803892
9,0.303200,0.849349,0.824645,0.801649,0.802174,0.801025,0.812835
10,0.211100,0.835051,0.843602,0.833215,0.81767,0.824548,0.834075


TrainOutput(global_step=3165, training_loss=0.35600686849199387, metrics={'train_runtime': 443.1616, 'train_samples_per_second': 152.315, 'train_steps_per_second': 9.522, 'total_flos': 3589365494402100.0, 'train_loss': 0.35600686849199387, 'epoch': 15.0})

In [253]:
# Predict on the test split with the freshly trained XLM-RoBERTa model and report
# accuracy and macro F1. The label names XLM-RoBERTa (it previously said "DistilBERT").
XLMRoBERTaOutput = XLMRoBERTaTrainer.predict(XLM_RoBERTa_tokenized_datasets['test'])
print(f"XLM-RoBERTa test accuracy: {XLMRoBERTaOutput.metrics['test_eval_accuracy']} , test f1: {XLMRoBERTaOutput.metrics['test_eval_f1']}")


DistilBERT test accuracy: 0.8507109004739336 , test f1: 0.8318914177600322


In [255]:
# Save the Best Model
# The directory name '851' matches the path used by the load cell below.
XLMRoBERTaModel.save_pretrained('./Saved-Models/XLMRoBERTaMulticlass/851')  # Save model weights and config for later use
XLM_RoBERTa_tokenizer.save_pretrained('./Saved-Models/XLMRoBERTaMulticlass/851')  # Save the matching tokenizer files alongside


('./Saved-Models/XLMRoBERTaMulticlass/851\\tokenizer_config.json',
 './Saved-Models/XLMRoBERTaMulticlass/851\\special_tokens_map.json',
 './Saved-Models/XLMRoBERTaMulticlass/851\\tokenizer.json')

In [257]:
# Restore the saved XLM-RoBERTa checkpoint (tokenizer and fine-tuned weights) from disk
XLM_RoBERTa_tokenizer = AutoTokenizer.from_pretrained('./Saved-Models/XLMRoBERTaMulticlass/851')
XLMRoBERTaModel = XLMRobertaForSequenceClassification.from_pretrained('./Saved-Models/XLMRoBERTaMulticlass/851')

# Wrap the restored model in a fresh Trainer; no early stopping callback is attached here
XLMRoBERTaTrainer = Trainer(
    model=XLMRoBERTaModel,
    tokenizer=XLM_RoBERTa_tokenizer,
    args=get_training_arguments('XLM-RoBERTa', 'multiclass'),
    train_dataset=XLM_RoBERTa_tokenized_datasets['train'],
    eval_dataset=XLM_RoBERTa_tokenized_datasets['val'],
    compute_metrics=compute_metrics,
)



In [259]:
# Predict on the test split with the reloaded XLM-RoBERTa checkpoint and report
# accuracy and macro F1. The label names XLM-RoBERTa (it previously said "DistilBERT").
XLMRoBERTaOutput = XLMRoBERTaTrainer.predict(XLM_RoBERTa_tokenized_datasets['test'])
print(f"XLM-RoBERTa test accuracy: {XLMRoBERTaOutput.metrics['test_eval_accuracy']} , test f1: {XLMRoBERTaOutput.metrics['test_eval_f1']}")


DistilBERT test accuracy: 0.8507109004739336 , test f1: 0.8318914177600322


## 4. Ensemble Evaluation for Multiclass Classification

In [300]:
# Per-model test accuracy, for reference against the ensembles below
print(f"RoBERTa test accuracy: {RobertaOutput.metrics['test_eval_accuracy']}")
print(f"DistilBERT test accuracy: {DistilbertOutput.metrics['test_eval_accuracy']}")
print(f"XLM-RoBERTa test accuracy: {XLMRoBERTaOutput.metrics['test_eval_accuracy']}")

outputs = [RobertaOutput, DistilbertOutput, XLMRoBERTaOutput]

# Every tokenized test split is derived from the same `dataset` split, so one label
# vector serves all ensemble variants.
test_labels = Roberta_tokenized_datasets['test']['labels']

# Majority vote over the three models' predicted classes
final_, acc = ensemble_score(outputs, test_labels)
print(f"Voting Ensemble Accuracy: {acc}")

# Equal-weight sum of the three models' prediction scores
final_, acc = ensemble_score_total_sum(outputs, test_labels)
print(f"Prediction Confidence Ensemble Accuracy: {acc}")

RoBERTa test accuracy: 0.8649289099526066
DistilBERT test accuracy: 0.8459715639810427
XLM-RoBERTa test accuracy: 0.8507109004739336
Voting Ensemble Accuracy: 0.8625592417061612
Predection Confidence Ensemble Accuracy: 0.8625592417061612


In [316]:
# Per-model test accuracy, for reference against the ensembles below
print(f"RoBERTa test accuracy: {RobertaOutput.metrics['test_eval_accuracy']}")
print(f"DistilBERT test accuracy: {DistilbertOutput.metrics['test_eval_accuracy']}")
print(f"XLM-RoBERTa test accuracy: {XLMRoBERTaOutput.metrics['test_eval_accuracy']}")

outputs = [RobertaOutput, DistilbertOutput, XLMRoBERTaOutput]

# Every tokenized test split is derived from the same `dataset` split, so one label
# vector serves all ensemble variants.
test_labels = Roberta_tokenized_datasets['test']['labels']

# Majority vote over the three models' predicted classes
final_, acc = ensemble_score(outputs, test_labels)
print(f"Voting Ensemble Accuracy: {acc}")

# Equal-weight sum of the three models' prediction scores
final_, acc = ensemble_score_total_sum(outputs, test_labels)
print(f"Prediction Confidence Ensemble Accuracy: {acc}")

# Weighted sum favouring RoBERTa (weights follow the order of `outputs`).
# NOTE(review): if these weights were chosen by looking at test accuracy, the reported
# figure is optimistic — consider selecting them on the validation split instead.
final_, acc = ensemble_score_total_sum(outputs, test_labels, [0.6, 0.2, 0.2])
print(f"Weighted Prediction Confidence Ensemble Accuracy: {acc}")

RoBERTa test accuracy: 0.8649289099526066
DistilBERT test accuracy: 0.8459715639810427
XLM-RoBERTa test accuracy: 0.8507109004739336
Voting Ensemble Accuracy: 0.8625592417061612
Predection Confidence Ensemble Accuracy: 0.8625592417061612
Weighted Predection Confidence Ensemble Accuracy: 0.8649289099526066


# Conclusion

Throughout this project, we evaluated the performance of three different transformer-based models: RoBERTa, DistilBERT, and XLM-RoBERTa, on a multiclass classification task. Below are the observed accuracies for each model when tested on our dataset:

- **RoBERTa Model Test Accuracy**: 0.86493
- **DistilBERT Model Test Accuracy**: 0.84597
- **XLM-RoBERTa Model Test Accuracy**: 0.85071

To enhance the performance and leverage the strengths of each model, we implemented ensemble techniques. The accuracies achieved by these ensemble methods are:

- **Voting Ensemble Accuracy: 0.86256**     
- **Prediction Confidence Ensemble Accuracy: 0.86256**

The ensemble effectively boosts the performance over the weakest model (DistilBERT), indicating that combining model predictions can help mitigate individual model weaknesses. However, it did not surpass the strongest model (RoBERTa). This can occur if the models make similar types of errors or if the strongest model already performs near an upper limit for the given data and model configurations. Overall, the ensemble's accuracy is lower than that of the best model because the weaker performance of the other two models pulled the combined result down.

- **Weighted Prediction Confidence Ensemble Accuracy: 0.86493**

Among the ensemble techniques, the Weighted Prediction Confidence Ensemble matched the performance of the best individual model (RoBERTa) because we assigned RoBERTa a higher weight (0.6, versus 0.2 for each of the other two models). This indicates that while ensembles can stabilize prediction outcomes, the improvements might be marginal depending on the variance and accuracy of the individual models involved.

This analysis suggests that for tasks where model interpretability is not critical, employing an ensemble of models could be beneficial, especially in scenarios where different models capture various aspects of the data differently. However, the overhead of maintaining multiple models versus the incremental gain in performance should also be considered.

Overall, the project demonstrates the effectiveness of transformer models in handling complex text classification tasks and the potential of ensembles to enhance predictive performance.
