# Install dependencies

In [None]:
pip install --retries 0 transformers scikit-learn torch

In [None]:
!pip install --upgrade datasets

# Suppress unnecessary warnings

The first line sets `TF_CPP_MIN_LOG_LEVEL`, which controls the verbosity of TensorFlow’s C++ backend logging. The value ("3") hides everything except FATAL errors.

The second line sets `XLA_FLAGS`, which configures the behavior of XLA, a TensorFlow compiler that optimizes computations. This setting may help silence certain XLA-related messages or ensure consistent execution, especially in environments like Jupyter or debugging sessions.

In [None]:
import os

# Both settings are plain environment variables, so apply them in one update.
os.environ.update(
    {
        # For TensorFlow noise: "3" leaves only FATAL messages from the C++ backend.
        "TF_CPP_MIN_LOG_LEVEL": "3",
        # May suppress some XLA messages.
        "XLA_FLAGS": "--xla_cpu_multi_thread_eigen=false",
    }
)

# Import libraries

In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from transformers import XLMRobertaTokenizer, XLMRobertaModel
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from collections import Counter
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import re
import unicodedata
from huggingface_hub import login

# GPU or CPU

This block of code loads a pre-trained XLM-RoBERTa model and moves it to the appropriate device (GPU if available, otherwise CPU).

In [None]:
# Pick the compute device first: CUDA when PyTorch can see a GPU, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the pretrained encoder and move its weights onto that device.
model = XLMRobertaModel.from_pretrained("xlm-roberta-base").to(device)

print("Using device:", device)

# Config

This block sets up configuration parameters for training or fine-tuning a deep-learning model, here XLM-RoBERTa for a text classification task.
* `SEED`: sets the random seed for reproducibility.
* `MODEL_NAME`: specifies the pretrained model to use (from Hugging Face Transformers).
* `MAX_LENGTH`: sets the maximum sequence length for tokenized input texts.
* `BATCH_SIZE`: sets the number of samples processed at once during training/inference.
* `EPOCHS`: number of complete passes through the training dataset.
* `LR`: sets the learning rate for the optimizer.
* `NUM_CLASSES`: the number of output classes in the classification task.
* `FREEZE_LAYERS`: the number of initial transformer layers to freeze (i.e., not update during training).
* `DEVICE`: sets the device (GPU if available, otherwise CPU) for all model computations.

In [None]:
# Config

# Reproducibility
SEED = 42  # seed shared by every random number generator

# Model and tokenization
MODEL_NAME = "xlm-roberta-base"  # Hugging Face checkpoint name
MAX_LENGTH = 512  # maximum number of tokens per example
NUM_CLASSES = 3  # number of output classes
FREEZE_LAYERS = 10  # number of initial encoder layers to freeze

# Optimization
BATCH_SIZE = 64  # examples per batch
EPOCHS = 10  # passes over the training data
LR = 1e-4  # optimizer learning rate

# Hardware: GPU when available, otherwise CPU
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Set Seed

This block sets random seeds and configures PyTorch's backend to ensure deterministic and reproducible results across multiple runs.

In [None]:
# Set seeds
# Ask cuDNN for deterministic kernels and turn off its benchmark mode, so
# repeated runs use the same algorithms.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Seed PyTorch (the default generator, then every CUDA device) ...
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
# ... and the NumPy and Python standard-library generators.
np.random.seed(SEED)
random.seed(SEED)

# Clean and pre-process text function

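The `clean_text` function preprocesses and cleans raw text data, typically as one step of a natural language processing (NLP) pipeline. It takes a dictionary `example` (the row format used by Hugging Face datasets), cleans its "text" field (Unicode normalization, URL removal, removal of characters other than word characters, whitespace and basic punctuation, whitespace collapsing), and returns the modified dictionary.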

In [None]:
# Clean text
# Patterns are compiled once at definition time instead of on every call.
# URLs need an explicit "http(s)://" scheme or a "www." prefix, so ordinary
# words that merely contain "www" or start with "http" (e.g. "awwww") are kept.
_URL_PATTERN = re.compile(r"https?://\S+|www\.\S+")
# Anything that is not a word character, whitespace or basic punctuation.
_DISALLOWED_CHARS_PATTERN = re.compile(r"[^\w\s.,!?¿¡]+", flags=re.UNICODE)
_WHITESPACE_PATTERN = re.compile(r"\s+")


def clean_text(example):
    """Normalize and clean the "text" field of a single example.

    Steps, in order: NFKC Unicode normalization, URL removal, removal of every
    character that is not a word character, whitespace or one of ``.,!?¿¡``,
    then collapsing whitespace runs to one space and trimming the ends.

    Args:
        example: mapping with a "text" entry. A missing value (``None``) is
            treated as an empty string instead of raising.

    Returns:
        The same ``example`` object, with "text" replaced by the cleaned
        string (the input is updated in place).
    """
    text = example["text"] or ""
    text = unicodedata.normalize("NFKC", text)
    text = _URL_PATTERN.sub("", text)
    text = _DISALLOWED_CHARS_PATTERN.sub("", text)
    text = _WHITESPACE_PATTERN.sub(" ", text).strip()
    example["text"] = text
    return example

# Load train and validation datasets

This code block loads the multilingual sentiment dataset, filters it by language, and prepares fixed-size English, French, and Chinese subsets for training and validation.

In [None]:
# Load English training dataset
dataset = load_dataset("clapAI/MultiLingualSentiment")

# Keep only the English rows of the train split.
english_data = dataset["train"].filter(lambda row: row["language"] == "en")
# Shuffle with the notebook-wide SEED (not a repeated literal) so the sample
# follows the configured seed.
english_data_shuffled = english_data.shuffle(seed=SEED)
# Fixed-size subset: the first 10,000 rows of the shuffled data.
english_sample = english_data_shuffled.select(range(10000))
# Plain list of row dicts, plus the label of each row.
english_list = english_sample.to_list()
english_labels = [row["label"] for row in english_list]
print(dataset)

In [None]:
# Load English validation dataset
# Keep only the English rows of the validation split.
english_data_val = dataset["validation"].filter(lambda row: row["language"] == "en")
# Shuffle with the notebook-wide SEED (not a repeated literal) so the sample
# follows the configured seed.
english_data_val_shuffled = english_data_val.shuffle(seed=SEED)
# Fixed-size subset: the first 1,250 rows of the shuffled data.
english_sample_val = english_data_val_shuffled.select(range(1250))
# Plain list of row dicts, plus the label of each row.
english_list_val = english_sample_val.to_list()
english_labels_val = [row["label"] for row in english_list_val]

In [None]:
# Load French validation dataset
# Keep only the French rows of the validation split.
french_data = dataset["validation"].filter(lambda row: row["language"] == "fr")
# Shuffle with the notebook-wide SEED (not a repeated literal) so the sample
# follows the configured seed.
french_data_shuffled = french_data.shuffle(seed=SEED)
# Fixed-size subset: the first 1,250 rows of the shuffled data.
french_sample = french_data_shuffled.select(range(1250))
# Plain list of row dicts, plus the label of each row.
french_list = french_sample.to_list()
french_labels = [row["label"] for row in french_list]

In [None]:
# Load Chinese validation dataset
# Keep only the Chinese rows of the validation split.
chinese_data = dataset["validation"].filter(lambda row: row["language"] == "zh")
# Shuffle with the notebook-wide SEED (not a repeated literal) so the sample
# follows the configured seed.
chinese_data_shuffled = chinese_data.shuffle(seed=SEED)
# Fixed-size subset: the first 1,250 rows of the shuffled data.
chinese_sample = chinese_data_shuffled.select(range(1250))
# Plain list of row dicts, plus the label of each row.
chinese_list = chinese_sample.to_list()
chinese_labels = [row["label"] for row in chinese_list]

# Tokenization

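This code tokenizes the text data using XLM-RoBERTa, formats it for PyTorch, and wraps it into dataloaders for training and validation. Note that `clean_text` (defined above) is not applied in this cell, so the dataset text is tokenized as loaded.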
* Loads the XLM-RoBERTa tokenizer (from Hugging Face).
* Tokenizes a batch of text
* Applies the tokenization to Hugging Face Dataset objects.
* Converts the tokenized dataset into PyTorch tensors.
* Wraps datasets in PyTorch DataLoader objects.

In [None]:
# Tokenization function using a pretrained tokenizer
# Load the tokenizer from MODEL_NAME, the same checkpoint name the encoder is
# loaded from, so the two cannot drift apart if the config is changed.
tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME)


def tokenize_batch(batch):
    """Tokenize one batch of examples.

    Args:
        batch: mapping whose "text" entry holds the texts of the batch.

    Returns:
        The tokenizer output, with every sequence truncated and padded to
        MAX_LENGTH tokens.
    """
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=MAX_LENGTH)


# Columns exposed when the datasets are read in torch format
columns = ["input_ids", "attention_mask", "label"]


def _build_torch_dataset(records):
    """Convert a list of example dicts into a tokenized Dataset in torch format."""
    tokenized = Dataset.from_list(records).map(tokenize_batch, batched=True)
    tokenized.set_format(type="torch", columns=columns)
    return tokenized


# Convert lists to HuggingFace Dataset objects, tokenize them and set the
# PyTorch format.
# NOTE(review): clean_text is not applied to any of these datasets, so the
# text is tokenized exactly as loaded - confirm this is intended.
train_dataset = _build_torch_dataset(english_list)
val_dataset_en = _build_torch_dataset(english_list_val)
val_dataset_fr = _build_torch_dataset(french_list)
val_dataset_zh = _build_torch_dataset(chinese_list)

# Create DataLoaders for training and validation (only training is shuffled)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader_en = DataLoader(val_dataset_en, batch_size=BATCH_SIZE)
val_loader_fr = DataLoader(val_dataset_fr, batch_size=BATCH_SIZE)
val_loader_zh = DataLoader(val_dataset_zh, batch_size=BATCH_SIZE)

# Load pre-trained model XLM RoBERTa

We freeze the lower layers (the first N) and allow the upper layers to fine-tune, since the bottom ones capture more general linguistic features.

In [None]:
# Load pretrained model
base_model = XLMRobertaModel.from_pretrained(MODEL_NAME)

# Freeze the FIRST `FREEZE_LAYERS` encoder layers (the lower ones) by turning
# off gradient tracking for their parameters; the remaining, upper layers are
# left untouched. The previous condition (`i >= FREEZE_LAYERS`) did the
# opposite and froze the upper layers.
# NOTE(review): parameters outside `base_model.encoder.layer` (presumably the
# embeddings, among others) are not touched here and keep their
# `requires_grad` setting - confirm whether they should be frozen as well.
for layer in base_model.encoder.layer[:FREEZE_LAYERS]:
    for param in layer.parameters():
        param.requires_grad = False

# Classification model

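This code defines and initializes a custom classification model for sentiment analysis using a pre-trained XLM-RoBERTa encoder and a linear classifier head. The encoder is run under `torch.no_grad()` and only the classifier head's parameters are passed to the optimizer, so only the head is trained.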

In [None]:
# Classification model
class SentimentClassifier(nn.Module):
    """Encoder followed by a single linear classification layer.

    The classifier is applied to the encoder's hidden state at sequence
    position 0 (``last_hidden_state[:, 0, :]``).

    Args:
        encoder: callable taking ``input_ids`` and ``attention_mask`` keyword
            arguments and returning an object with ``last_hidden_state``;
            must expose ``config.hidden_size``.
        num_classes: number of output logits. Defaults to the notebook-level
            ``NUM_CLASSES`` when omitted.
        fine_tune_encoder: when False (the default, matching the original
            behavior) the encoder always runs without gradient tracking, so
            only the linear head can be trained and any ``requires_grad``
            flags set on the encoder have no effect. When True, gradients
            flow into the encoder parameters that still require them; such
            parameters must then also be handed to the optimizer.
    """

    def __init__(self, encoder, num_classes=None, fine_tune_encoder=False):
        super().__init__()
        if num_classes is None:
            num_classes = NUM_CLASSES
        self.encoder = encoder
        self.fine_tune_encoder = fine_tune_encoder
        self.classifier = nn.Linear(encoder.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return class logits for a batch.

        Args:
            input_ids: token ids passed to the encoder.
            attention_mask: attention mask passed to the encoder.

        Returns:
            The output of the linear head applied to the position-0 hidden
            state of each sequence.
        """
        # Track encoder gradients only when fine-tuning was requested AND the
        # caller has not disabled gradients (e.g. evaluation under no_grad).
        track_encoder_grad = self.fine_tune_encoder and torch.is_grad_enabled()
        with torch.set_grad_enabled(track_encoder_grad):
            outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
            cls_token = outputs.last_hidden_state[:, 0, :]
        logits = self.classifier(cls_token)
        return logits

# Build the classifier around the pretrained encoder and place it on DEVICE.
model = SentimentClassifier(base_model)
model = model.to(DEVICE)

# Cross-entropy over the class logits; Adam updates the linear head only.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.classifier.parameters(), lr=LR)

# Empty per-epoch history lists.
train_losses, train_accs = [], []
val_losses, val_accs = [], []

# Training and Validation

This block implements a training and validation loop for the SentimentClassifier model over multiple epochs. It performs model optimization, accuracy tracking, and periodic logging.

In [None]:
# String label -> integer class index expected by the loss.
label_map = {"negative": 0, "neutral": 1, "positive": 2}

# Per-epoch history (reset here so the cell can be re-run from scratch).
train_losses, train_accs = [], []

val_losses_en, val_accs_en = [], []
val_losses_fr, val_accs_fr = [], []
val_losses_zh, val_accs_zh = [], []


# Validation function
# Defined once, before the epoch loop, instead of being re-created every epoch.
def evaluate(loader, label_map):
    """Compute the mean loss and accuracy of the global `model` on one loader.

    Switches `model` to eval mode and runs without gradient tracking.

    Args:
        loader: iterable of batches with "input_ids", "attention_mask" and
            "label" entries.
        label_map: maps each entry of ``batch["label"]`` to its class index.

    Returns:
        Tuple ``(loss, accuracy)``: the summed batch losses divided by the
        number of batches, and the fraction of correctly predicted examples.
    """
    model.eval()
    val_loss, val_correct, val_total = 0, 0, 0
    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            labels = torch.tensor([label_map[label] for label in batch["label"]]).to(DEVICE)

            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)
            val_loss += loss.item()

            preds = torch.argmax(logits, dim=1)
            val_correct += (preds == labels).sum().item()
            val_total += labels.size(0)

    return val_loss / len(loader), val_correct / val_total


for epoch in range(EPOCHS):
    print(f"\n--- Epoch {epoch + 1}/{EPOCHS} ---")

    # evaluate() leaves the model in eval mode, so switch back every epoch.
    model.train()
    total_loss, correct, total = 0, 0, 0

    for i, batch in enumerate(train_loader, 1):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = torch.tensor([label_map[label] for label in batch["label"]]).to(DEVICE)

        # One optimization step.
        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        # Running totals for the epoch-level metrics.
        preds = torch.argmax(logits, dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)
        total_loss += loss.item()

        # Log every 1000th batch and always the last one.
        if i % 1000 == 0 or i == len(train_loader):
            print(f"  Batch {i}/{len(train_loader)}: Loss = {loss.item():.4f}")

    epoch_train_loss = total_loss / len(train_loader)
    epoch_train_acc = correct / total
    train_losses.append(epoch_train_loss)
    train_accs.append(epoch_train_acc)

    print(f"Training: Loss = {epoch_train_loss:.4f}, Accuracy = {epoch_train_acc:.4f}")

    # Validate on each language after every epoch.
    val_loss_en, val_acc_en = evaluate(val_loader_en, label_map)
    val_losses_en.append(val_loss_en)
    val_accs_en.append(val_acc_en)

    val_loss_fr, val_acc_fr = evaluate(val_loader_fr, label_map)
    val_losses_fr.append(val_loss_fr)
    val_accs_fr.append(val_acc_fr)

    val_loss_zh, val_acc_zh = evaluate(val_loader_zh, label_map)
    val_losses_zh.append(val_loss_zh)
    val_accs_zh.append(val_acc_zh)

    print(f"Validation EN: Loss = {val_loss_en:.4f}, Accuracy = {val_acc_en:.4f}")
    print(f"Validation FR: Loss = {val_loss_fr:.4f}, Accuracy = {val_acc_fr:.4f}")
    print(f"Validation ZH: Loss = {val_loss_zh:.4f}, Accuracy = {val_acc_zh:.4f}")

# Classification report

In [None]:
# Generate classification report
def generate_report(val_loader, label_map, lang_code="English"):
    """Print a per-class classification report for one validation loader.

    Runs the global `model` in eval mode, without gradient tracking, over
    every batch and compares the argmax predictions with the mapped labels.

    Args:
        val_loader: iterable of batches with "input_ids", "attention_mask"
            and "label" entries.
        label_map: maps each entry of ``batch["label"]`` to its class index.
        lang_code: name shown in the report heading.
    """
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            labels = torch.tensor([label_map[label] for label in batch["label"]]).to(DEVICE)
            logits = model(input_ids, attention_mask)
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

    # Pass the class ids explicitly, ordered by index. With `target_names`
    # alone, the report raises ValueError when a class is absent from both
    # the labels and the predictions.
    class_names = sorted(label_map, key=label_map.get)
    class_ids = [label_map[name] for name in class_names]
    target_names = [name.capitalize() for name in class_names]
    report = classification_report(
        all_labels, all_preds, labels=class_ids, target_names=target_names, digits=4
    )
    print(f"\nClassification Report ({lang_code}):\n")
    print(report)

# Generate reports for all three languages
generate_report(val_loader_en, label_map, "English")
generate_report(val_loader_fr, label_map, "French")
generate_report(val_loader_zh, label_map, "Chinese")

# Plot loss and accuracy

In [None]:
# Plot Loss & Accuracy
epochs_range = range(1, EPOCHS + 1)


def _plot_history(panel_index, curves, ylabel, title):
    """Draw one panel of the 1x2 figure, one line per (values, label) pair."""
    plt.subplot(1, 2, panel_index)
    for values, label in curves:
        # Cut the x-axis to the recorded length, so a history with fewer
        # than EPOCHS entries (e.g. an interrupted run) still plots.
        plt.plot(epochs_range[:len(values)], values, label=label)
    plt.xlabel("Epoch")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()


plt.figure(figsize=(14, 6))

# Left panel: losses
_plot_history(
    1,
    [
        (train_losses, "Train Loss"),
        (val_losses_en, "Val Loss (EN)"),
        (val_losses_fr, "Val Loss (FR)"),
        (val_losses_zh, "Val Loss (ZH)"),
    ],
    ylabel="Loss",
    title="Loss Curve",
)

# Right panel: accuracies
_plot_history(
    2,
    [
        (train_accs, "Train Accuracy"),
        (val_accs_en, "Val Acc (EN)"),
        (val_accs_fr, "Val Acc (FR)"),
        (val_accs_zh, "Val Acc (ZH)"),
    ],
    ylabel="Accuracy",
    title="Accuracy Curve",
)

plt.tight_layout()
plt.savefig("loss_accuracy_curve_multilingual.png")
plt.show()

# Confusion matrix

This code block evaluates the trained sentiment classifier on each validation set (English, French, Chinese) by computing predictions and metrics (accuracy and macro F1), and by plotting a confusion matrix to visualize how well the model performed per class.

In [None]:
def evaluate_model(val_loader, label_map, lang_code="English"):
    """Print accuracy and macro F1 for one loader, then plot its confusion matrix.

    Runs the global `model` in eval mode, without gradient tracking. The
    heatmap is saved as ``confusion_matrix_<lang_code lowercased>.png`` and
    then shown.

    Args:
        val_loader: iterable of batches with "input_ids", "attention_mask"
            and "label" entries.
        label_map: maps each entry of ``batch["label"]`` to its class index.
        lang_code: name used in the printed line, the plot title and the
            output file name.
    """
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            labels = torch.tensor([label_map[label] for label in batch["label"]]).to(DEVICE)
            logits = model(input_ids, attention_mask)
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average="macro")
    print(f"\nXLM-R Val ({lang_code}) → Accuracy: {acc:.4f}, F1: {f1:.4f}")

    # Fix the row/column order explicitly. Without `labels`, the matrix only
    # covers classes that actually occur, and would no longer line up with
    # the tick labels when a class is absent.
    class_names = sorted(label_map, key=label_map.get)
    class_ids = [label_map[name] for name in class_names]
    tick_labels = [name[:3].capitalize() for name in class_names]  # "Neg", "Neu", "Pos"

    cm = confusion_matrix(all_labels, all_preds, labels=class_ids)
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap="Blues", xticklabels=tick_labels, yticklabels=tick_labels)
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title(f"Confusion Matrix ({lang_code})")
    plt.tight_layout()
    plt.savefig(f"confusion_matrix_{lang_code.lower()}.png")
    plt.show()

# Call the evaluation for each language
evaluate_model(val_loader_en, label_map, "English")
evaluate_model(val_loader_fr, label_map, "French")
evaluate_model(val_loader_zh, label_map, "Chinese")

# Further analysis

This code block visualizes the training and validation data distributions to examine their potential impact on classification accuracy.

In [None]:
def get_distribution(data, labels=("negative", "neutral", "positive")):
    """Return the share of each label among the items of `data`.

    Args:
        data: iterable of mappings, each with a "label" entry.
        labels: label values to report, in output order (an immutable
            default replaces the former mutable list default).

    Returns:
        A list with one proportion per entry of `labels`: that label's count
        divided by the total number of items. Labels that never occur get 0,
        and empty `data` yields all zeros instead of a ZeroDivisionError.
    """
    counter = Counter(item["label"] for item in data)
    total = sum(counter.values())
    if total == 0:
        return [0.0 for _ in labels]
    return [counter.get(label, 0) / total for label in labels]

# One group of bars per dataset; the two lists below are index-aligned.
groups = ["English Train", "English Val", "French Val", "Chinese Val"]
datasets = [english_list, english_list_val, french_list, chinese_list]
label_names = ["negative", "neutral", "positive"]

# One list of label proportions per dataset.
distributions = [get_distribution(data) for data in datasets]

bar_width = 0.2
x = np.arange(len(groups))

plt.figure(figsize=(10, 6))

# Transpose the distributions so each iteration handles one sentiment across
# all datasets; the three bars of a group sit left of, on and right of its tick.
for offset, (name, shares) in enumerate(zip(label_names, zip(*distributions)), start=-1):
    centers = x + offset * bar_width
    plt.bar(centers, list(shares), width=bar_width, label=name)
    # Percentage label just above every bar.
    for center, val in zip(centers, shares):
        plt.text(center, val + 0.01, f"{val*100:.1f}%", ha="center", va="bottom", fontsize=9)

plt.xticks(x, groups)
plt.ylim(0, 1)
plt.ylabel("Proportion")
plt.title("Sentiment Label Distribution within Each Dataset")
plt.legend(title="Sentiment")
plt.tight_layout()
plt.show()