In [None]:
# Import the Libraries
import ast
import os

import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, DistilBertForMultipleChoice

In [None]:
# Run-wide configuration: RNG seed, maximum token length, and batch size
RANDOM_SEED = 42
MAX_LEN = 128
BATCH_SIZE = 8

# Seed PyTorch and NumPy with the same value
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Use the first CUDA device when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


## Open the MCQA dataset from Hugging Face

In [None]:
# Authenticate with the Hugging Face Hub.
# The token is read from the HF_TOKEN environment variable so that no credential
# is ever stored in the notebook itself.
# NOTE(review): when HF_TOKEN is unset, `login` receives token=None, which
# huggingface_hub documents as falling back to an interactive prompt — confirm
# for the installed version.
import huggingface_hub
from datasets import load_dataset

huggingface_hub.login(token=os.environ.get("HF_TOKEN"))

In [None]:
# Fetch the MCQA corpus from the Hub and turn its "train" split into a DataFrame
MCQA_dataset = load_dataset("ulinnuha/mcqa_ladin_italian")
train_split = MCQA_dataset["train"]
MCQA_df = pd.DataFrame(train_split)

# Preview the first rows
MCQA_df.head()

### Set the number of choices

In [None]:
# Number of answer options per question handled in this run
num_choices = 3

In [None]:
# Keep only the questions whose `max_choices` equals `num_choices`, then renumber the rows
has_expected_choices = MCQA_df['max_choices'].eq(num_choices)
MCQA_df = MCQA_df.loc[has_expected_choices].reset_index(drop=True)

### Set the training and testing data

In [None]:
# Re-seed NumPy so the split is reproducible even when this cell is re-run on its own
np.random.seed(RANDOM_SEED)

# Fraction of each class that goes to the training set; the remainder becomes the test set
TRAIN_FRACTION = 0.80

# Fail early with a clear message when there are no rows to split
if MCQA_df.empty:
    raise ValueError("MCQA_df is empty: there are no rows to split into train and test sets.")

# Relative frequency of each `max_choices` value; its index lists the classes to split
class_proportions = MCQA_df['max_choices'].value_counts(normalize=True)

# Per-class pieces are collected in lists and concatenated once after the loop
train_parts = []
test_parts = []

for label in class_proportions.index:
    # All rows of the current class, shuffled with a fixed seed
    label_df = MCQA_df[MCQA_df['max_choices'] == label]
    label_df_shuffled = label_df.sample(frac=1, random_state=RANDOM_SEED)

    # The first TRAIN_FRACTION of the rows (rounded down) go to train, the rest to test
    train_size = int(TRAIN_FRACTION * len(label_df_shuffled))
    train_parts.append(label_df_shuffled.iloc[:train_size])
    test_parts.append(label_df_shuffled.iloc[train_size:])

# Build the final frames in one step and reset indices for easier handling
df_train = pd.concat(train_parts, axis=0).reset_index(drop=True)
testing_data = pd.concat(test_parts, axis=0).reset_index(drop=True)

# Display the class distribution in both the train and test sets
print("Class Distribution in Training Set:")
print(df_train['max_choices'].value_counts(normalize=True))

print("\nClass Distribution in Testing Set:")
print(testing_data['max_choices'].value_counts(normalize=True))


Class Distribution in Training Set:
max_choices
5    1.0
Name: proportion, dtype: float64

Class Distribution in Testing Set:
max_choices
5    1.0
Name: proportion, dtype: float64


### Set the training and validation data

In [None]:
# Fraction of each class kept for training; the remaining rows form the validation set
TRAIN_VS_VAL_FRACTION = 0.90

# Fail early with a clear message when there are no rows to split
if df_train.empty:
    raise ValueError("df_train is empty: there are no rows to split into train and validation sets.")

# Relative frequency of each `max_choices` value; its index lists the classes to split
class_proportions = df_train['max_choices'].value_counts(normalize=True)

# Per-class pieces are collected in lists and concatenated once after the loop
train_parts = []
val_parts = []

for label in class_proportions.index:
    # All rows of the current class, shuffled with a fixed seed
    label_df = df_train[df_train['max_choices'] == label]
    label_df_shuffled = label_df.sample(frac=1, random_state=RANDOM_SEED)

    # The first 90% of the rows (rounded down) go to train, the rest to validation
    train_size = int(TRAIN_VS_VAL_FRACTION * len(label_df_shuffled))
    train_parts.append(label_df_shuffled.iloc[:train_size])
    val_parts.append(label_df_shuffled.iloc[train_size:])

# Build the final frames in one step and reset indices for easier handling
training_data = pd.concat(train_parts, axis=0).reset_index(drop=True)
val_data = pd.concat(val_parts, axis=0).reset_index(drop=True)

# Display the class distribution in both the train and validation sets
print("Class Distribution in Training Set:")
print(training_data['max_choices'].value_counts(normalize=True))

# This is the validation split, so it is labelled as such
print("\nClass Distribution in Validation Set:")
print(val_data['max_choices'].value_counts(normalize=True))


Class Distribution in Training Set:
max_choices
5    1.0
Name: proportion, dtype: float64

Class Distribution in Testing Set:
max_choices
5    1.0
Name: proportion, dtype: float64


## Load the model

In [None]:
# Load the tokenizer and the multiple-choice model from the same pretrained checkpoint
checkpoint_name = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
# NOTE(review): a multiple-choice head normally emits one score per choice, so
# `num_labels` probably has no effect on the architecture here — confirm against
# the transformers documentation before relying on it.
model = DistilBertForMultipleChoice.from_pretrained(checkpoint_name, num_labels=num_choices)

In [None]:
# Language suffix that selects the `question_<language>` / `choices_all_<language>` columns
language = "ladin"

## Tokenization

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler


# Data Loader function
def CreateDataloader(df, language, tokenizer=None, max_len=None, batch_size=None):
    """Tokenize every question/choice pair in `df` and wrap the result in a shuffling DataLoader.

    Args:
        df: DataFrame with the columns `question_<language>`,
            `choices_all_<language>` (a list of answer options, or the string
            representation of such a list) and `answer` (index of the correct option).
        language: Suffix that selects the question and choices columns.
        tokenizer: Callable used to encode a (question, choice) pair; it must
            return a mapping with "input_ids" and "attention_mask". Defaults to
            the notebook-level `tokenizer`.
        max_len: Length every encoded pair is padded/truncated to. Defaults to `MAX_LEN`.
        batch_size: Number of examples per batch. Defaults to `BATCH_SIZE`.

    Returns:
        A DataLoader over a TensorDataset of `(input_ids, attention_mask, label)`.
        For each example the first two entries hold one row per choice, each of
        length `max_len`; the label is the index of the correct choice.

    Raises:
        ValueError: If no row of `df` could be encoded (empty input or every row skipped).
    """
    if tokenizer is None:
        # The parameter shadows the notebook-level name, so look it up explicitly
        tokenizer = globals()["tokenizer"]
    if max_len is None:
        max_len = MAX_LEN
    if batch_size is None:
        batch_size = BATCH_SIZE

    question_col = f'question_{language}'
    choices_col = f'choices_all_{language}'

    all_input_ids = []
    all_attention_masks = []
    labels = []

    for idx, row in df.iterrows():
        question = row[question_col]
        raw_choices = row[choices_col]

        # The choices may already be a list, or a string holding a list literal
        if isinstance(raw_choices, list):
            choices = raw_choices
        elif isinstance(raw_choices, str):
            # SECURITY: the previous implementation used eval() here, which would
            # execute arbitrary code embedded in the dataset. literal_eval only
            # accepts Python literals.
            try:
                choices = ast.literal_eval(raw_choices)
            except (ValueError, SyntaxError, TypeError):
                choices = None
            if not isinstance(choices, list):
                print(f"Skipping row {idx} due to invalid choices format: {raw_choices}")
                continue
        else:
            print(f"Skipping row {idx} due to unknown format: {type(raw_choices)}")
            continue

        # The answer must be an integer index into `choices`; negative values are
        # rejected as well because they are not valid class labels.
        try:
            correct_answer_idx = int(row['answer'])
        except (TypeError, ValueError):
            print(f"Skipping row {idx} due to index error in answer column.")
            continue
        if not 0 <= correct_answer_idx < len(choices):
            print(f"Skipping row {idx} due to index error in answer column.")
            continue

        # Encode each (question, choice) pair. Passing the two texts separately
        # lets the tokenizer insert its own special tokens exactly once; writing
        # "[CLS]"/"[SEP]" into the text while add_special_tokens=True duplicated them.
        example_ids = []
        example_masks = []
        for choice in choices:
            encoded = tokenizer(
                str(question),
                str(choice),
                add_special_tokens=True,
                max_length=max_len,
                padding="max_length",
                truncation=True,
            )
            example_ids.append(encoded["input_ids"])
            # Use the tokenizer's own mask instead of assuming the pad id is 0
            example_masks.append(encoded["attention_mask"])

        all_input_ids.append(example_ids)
        all_attention_masks.append(example_masks)
        labels.append(correct_answer_idx)

    if not labels:
        raise ValueError("No valid rows to encode: `df` is empty or every row was skipped.")

    # One tensor per field; the first dimension indexes the examples
    input_ids = torch.tensor(all_input_ids, dtype=torch.long)
    input_mask_array = torch.tensor(all_attention_masks, dtype=torch.long)
    label_id_array = torch.tensor(labels, dtype=torch.long)

    # Building the TensorDataset
    dataset = TensorDataset(input_ids, input_mask_array, label_id_array)

    return DataLoader(
        dataset,
        sampler=RandomSampler(dataset),
        # Pinned memory only helps host-to-GPU copies, so skip it on CPU-only machines
        pin_memory=torch.cuda.is_available(),
        batch_size=batch_size,
    )


In [None]:
# Build one DataLoader per split, in the order train -> validation -> test
train_dataloader, val_dataloader, test_dataloader = [
    CreateDataloader(split_df, language)
    for split_df in (training_data, val_data, testing_data)
]

# Training Process

In [None]:
from sklearn.metrics import balanced_accuracy_score, f1_score
import torch

# Place the model on the target device once, before the optimizer is built
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

# Number of passes over the training data
num_epochs = 5

# Training and validation loop
for epoch in range(num_epochs):
    # ---- Training phase ----
    model.train()  # Set the model to training mode
    total_loss_train = 0  # Sum of the per-batch losses for this epoch
    train_predictions = []
    train_true_labels = []

    for step, batch in enumerate(train_dataloader):
        # Progress update, printed for every batch
        print('  Batch {:>5,}  of  {:>5,}. '.format(step, len(train_dataloader)))

        # Unpack this training batch from the dataloader
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear the gradients of the previous step. PyTorch accumulates gradients
        # across backward() calls, so without this every update would be computed
        # from the sum of all gradients seen so far.
        optimizer.zero_grad()

        # Forward pass; passing `labels` makes the model return the loss as well
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)

        loss = outputs.loss
        logits = outputs.logits

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        total_loss_train += loss.item()

        # Predicted choice per example, moved to the CPU for the sklearn metrics
        predicted_idx = torch.argmax(logits, dim=1).cpu().numpy()
        train_predictions.extend(predicted_idx)

        # Gold labels, moved to the CPU and converted to NumPy
        b_labels_cpu = b_labels.cpu().numpy()
        train_true_labels.extend(b_labels_cpu)

    # Calculate training metrics after processing all batches in an epoch
    train_balanced_accuracy = balanced_accuracy_score(train_true_labels, train_predictions)
    train_f1_score = f1_score(train_true_labels, train_predictions, average='weighted')

    # Print training statistics for the epoch (the loss is the sum over all batches)
    print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {total_loss_train:.4f}")
    print(f"Train Balanced Accuracy: {train_balanced_accuracy:.4f}")
    print(f"Train F1 Score: {train_f1_score:.4f}")

    # ---- Validation phase ----
    model.eval()  # Set the model to evaluation mode
    total_loss_val = 0  # Sum of the per-batch losses for this epoch
    val_predictions = []
    val_true_labels = []

    with torch.no_grad():  # Disable gradient calculation for validation
        for step, batch in enumerate(val_dataloader):
            # Unpack this validation batch from the dataloader
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            # Forward pass only; no parameter update happens here
            outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)

            loss = outputs.loss
            logits = outputs.logits

            total_loss_val += loss.item()

            # Predicted choice per example, moved to the CPU for the sklearn metrics
            predicted_idx = torch.argmax(logits, dim=1).cpu().numpy()
            val_predictions.extend(predicted_idx)

            # Gold labels, moved to the CPU and converted to NumPy
            b_labels_cpu = b_labels.cpu().numpy()
            val_true_labels.extend(b_labels_cpu)

        # Calculate validation metrics after processing all batches in an epoch
        val_balanced_accuracy = balanced_accuracy_score(val_true_labels, val_predictions)
        val_f1_score = f1_score(val_true_labels, val_predictions, average='weighted')

    # Print validation statistics for the epoch (the loss is the sum over all batches)
    print(f"Epoch {epoch+1}/{num_epochs} - Validation Loss: {total_loss_val:.4f}")
    print(f"Validation Balanced Accuracy: {val_balanced_accuracy:.4f}")
    print(f"Validation F1 Score: {val_f1_score:.4f}")


  Batch     0  of     24. 
  Batch     1  of     24. 
  Batch     2  of     24. 
  Batch     3  of     24. 
  Batch     4  of     24. 
  Batch     5  of     24. 
  Batch     6  of     24. 
  Batch     7  of     24. 
  Batch     8  of     24. 
  Batch     9  of     24. 
  Batch    10  of     24. 
  Batch    11  of     24. 
  Batch    12  of     24. 
  Batch    13  of     24. 
  Batch    14  of     24. 
  Batch    15  of     24. 
  Batch    16  of     24. 
  Batch    17  of     24. 
  Batch    18  of     24. 
  Batch    19  of     24. 
  Batch    20  of     24. 
  Batch    21  of     24. 
  Batch    22  of     24. 
  Batch    23  of     24. 
Epoch 1/5 - Train Loss: 38.4895
Train Balanced Accuracy: 0.1562
Train F1 Score: 0.1600
Epoch 1/5 - Validation Loss: 4.8366
Validation Balanced Accuracy: 0.1638
Validation F1 Score: 0.1887
  Batch     0  of     24. 
  Batch     1  of     24. 
  Batch     2  of     24. 
  Batch     3  of     24. 
  Batch     4  of     24. 
  Batch     5  of     24. 
  

## Testing stage

In [None]:
from sklearn.metrics import balanced_accuracy_score, f1_score
import torch

# Accumulators for the test-set predictions and gold labels
test_predictions, test_true_labels = [], []

# Switch to evaluation mode and make sure the model is on the target device
model.eval().to(device)

# No gradients are needed while evaluating
with torch.no_grad():
    total_loss_test = 0  # Sum of the per-batch losses
    for step, batch in enumerate(test_dataloader):
        print(f"  Test Batch {step+1} of {len(test_dataloader)}.")

        # Move the three tensors of the batch to the target device
        b_input_ids, b_input_mask, b_labels = (tensor.to(device) for tensor in batch)

        # Forward pass; passing `labels` makes the model return the loss as well
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss, logits = outputs.loss, outputs.logits

        total_loss_test += loss.item()

        # Predicted choice per example, as a NumPy array on the CPU
        predicted_idx = logits.argmax(dim=1).cpu().numpy()
        test_predictions.extend(predicted_idx)

        # Gold labels, as a NumPy array on the CPU
        b_labels_cpu = b_labels.cpu().numpy()
        test_true_labels.extend(b_labels_cpu)

  Test Batch 1 of 7.
  Test Batch 2 of 7.
  Test Batch 3 of 7.
  Test Batch 4 of 7.
  Test Batch 5 of 7.
  Test Batch 6 of 7.
  Test Batch 7 of 7.


### Calculate the evaluation metrics in the testing stage

In [None]:
# Score the collected test predictions against the gold labels
test_balanced_accuracy = balanced_accuracy_score(y_true=test_true_labels, y_pred=test_predictions)
test_f1_score = f1_score(y_true=test_true_labels, y_pred=test_predictions, average='weighted')

# Report the summed test loss together with both metrics
print("\n".join([
    f"Test Loss: {total_loss_test:.4f}",
    f"Test Balanced Accuracy: {test_balanced_accuracy:.4f}",
    f"Test F1 Score: {test_f1_score:.4f}",
]))

Test Loss: 11.0647
Test Balanced Accuracy: 0.2206
Test F1 Score: 0.2265
