In [1]:
# !pip install transformers datasets accelerate scikit-learn -q
# !pip install --upgrade datasets transformers -q

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os
import numpy as np
from collections import Counter
from google.colab import drive
import itertools
from sklearn.model_selection import train_test_split
os.environ["WANDB_DISABLED"] = "true"

In [2]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
BASE_PATH = "/content/drive/My Drive/PhD/Courses/year_2/text_analytics/semeval-task9"

In [5]:
folders = ['data/raw', 'data/processed', 'models', 'results/metrics', 'features']
for folder in folders:
    os.makedirs(os.path.join(BASE_PATH, folder), exist_ok=True)

In [6]:
train_en = pd.read_csv(f'{BASE_PATH}/data/subtask1/train/eng.csv')
dev_en = pd.read_csv(f'{BASE_PATH}/data/subtask1/dev/eng.csv')

In [7]:
train_en.shape

(2676, 3)

In [8]:
train_en['polarization'].value_counts()
dev_en['polarization'].value_counts()

Unnamed: 0_level_0,count
polarization,Unnamed: 1_level_1


In [9]:
print(f"Total train data: {len(train_en)}")
print(f"Label distribution:\n{train_en['polarization'].value_counts()}")

# Split into train/val (85/15 split)
train_df, val_df = train_test_split(
    train_en,
    test_size=0.15,
    random_state=42,
    stratify=train_en['polarization']  # Maintain class balance
)

print(f"\nTrain size: {len(train_df)}")
print(f"Val size: {len(val_df)}")
print(f"\nTrain distribution:\n{train_df['polarization'].value_counts()}")
print(f"\nVal distribution:\n{val_df['polarization'].value_counts()}")

Total train data: 2676
Label distribution:
polarization
0    1674
1    1002
Name: count, dtype: int64

Train size: 2274
Val size: 402

Train distribution:
polarization
0    1423
1     851
Name: count, dtype: int64

Val distribution:
polarization
0    251
1    151
Name: count, dtype: int64


### Load Model and Tokenizer

In [10]:
MODEL_NAME = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [11]:
#Tokenize Data
def tokenize_function(examples):
    return tokenizer(
        examples['text'],
        truncation=True,
        padding='max_length',
        max_length=128
    )

# Convert to HuggingFace Dataset
train_dataset = Dataset.from_pandas(train_df[['text', 'polarization']])
val_dataset = Dataset.from_pandas(val_df[['text', 'polarization']])

# Tokenize
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Set format
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'polarization'])
val_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'polarization'])

# Rename to 'labels'
train_dataset = train_dataset.rename_column('polarization', 'labels')
val_dataset = val_dataset.rename_column('polarization', 'labels')

Map:   0%|          | 0/2274 [00:00<?, ? examples/s]

Map:   0%|          | 0/402 [00:00<?, ? examples/s]

In [12]:
# Load Model
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    problem_type="single_label_classification"
)

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Metrics Definition
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    return {
        'accuracy': accuracy_score(labels, predictions),
        'f1_macro': f1_score(labels, predictions, average='macro'),
        'f1_weighted': f1_score(labels, predictions, average='weighted'),
        'f1_class_0': f1_score(labels, predictions, pos_label=0),
        'f1_class_1': f1_score(labels, predictions, pos_label=1),
        'precision': precision_score(labels, predictions, average='macro'),
        'recall': recall_score(labels, predictions, average='macro')
    }

In [14]:
# Training Arguments
training_args = TrainingArguments(
    output_dir=f'{BASE_PATH}/models/xlm-roberta-english-v1',
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    warmup_ratio=0.1,
    load_best_model_at_end=True,
    metric_for_best_model='f1_macro',
    greater_is_better=True,
    logging_dir=f'{BASE_PATH}/results/logs/english-v1',
    logging_steps=50,
    save_total_limit=2,
    seed=42,
    fp16=True
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [15]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    processing_class=tokenizer
)

In [16]:
# !pip install numpy==1.26.4

In [17]:
# Load TensorBoard
%load_ext tensorboard

In [18]:
# Train Model
train_result = trainer.train()

ValueError: Unable to avoid copy while creating an array as requested.
If using `np.array(obj, copy=False)` replace it with `np.asarray(obj)` to allow a copy when needed (no behavior change in NumPy 1.x).
For more details, see https://numpy.org/devdocs/numpy_2_0_migration_guide.html#adapting-to-changes-in-the-copy-keyword.

In [None]:
eval_results = trainer.evaluate()
eval_results

In [None]:
# See detailed breakdown
predictions = trainer.predict(val_dataset)
pred_labels = np.argmax(predictions.predictions, axis=1)
true_labels = predictions.label_ids

print(classification_report(true_labels, pred_labels,
                          target_names=['Not Polarized', 'Polarized']))

In [None]:
# Confusion matrix
cm = confusion_matrix(true_labels, pred_labels)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Not Polarized', 'Polarized'],
            yticklabels=['Not Polarized', 'Polarized'])
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

In [None]:
# Save Key Metrics Only
from datetime import datetime

# Simple results dictionary
results = {
    'experiment': 'xlm-roberta-english-v1',
    'date': datetime.now().strftime('%Y-%m-%d'),
    'model': 'xlm-roberta-base',
    'language': 'en',
    'train_size': len(train_df),
    'val_size': len(val_df),

    # Key metrics
    'accuracy': 0.818,
    'f1_macro': 0.810,
    'f1_class_0': 0.850,
    'f1_class_1': 0.770,
    'precision': 0.806,
    'recall': 0.816,

    # Hyperparameters
    'lr': 2e-5,
    'epochs': 3,
    'batch_size': 16
}

# Save to master log
log_path = f'{BASE_PATH}/results/metrics/experiments_log.csv'

if os.path.exists(log_path):
    df = pd.read_csv(log_path)
    df = pd.concat([df, pd.DataFrame([results])], ignore_index=True)
else:
    df = pd.DataFrame([results])

df.to_csv(log_path, index=False)
df

## Loading All Language Files

In [None]:
train_folder = f'{BASE_PATH}/data/subtask1/train'

# Get all CSV files in the train folder
train_files = [f for f in os.listdir(train_folder) if f.endswith('.csv')]

print(f"Found {len(train_files)} language files:")
print(train_files)

In [None]:
all_train_data = []

for file in train_files:
    # Extract language code from filename (e.g., eng.csv -> eng)
    lang_code = file.replace('.csv', '')

    # Read file
    df = pd.read_csv(os.path.join(train_folder, file))
    df['language'] = lang_code  # Add language column
    all_train_data.append(df)

    print(f"{lang_code}: {len(df)} samples, polarization distribution: {df['polarization'].value_counts().to_dict()}")

# Combine all languages
train_all = pd.concat(all_train_data, ignore_index=True)

In [None]:
train_all.head()

In [None]:
print(f"\n{'='*50}")
print(f"COMBINED DATASET")
print(f"{'='*50}")
print(f"Total samples: {len(train_all)}")
print(f"\nLanguages: {train_all['language'].unique()}")
print(f"\nSamples per language:")
print(train_all['language'].value_counts().sort_index())

In [None]:
print(f"\nOverall polarization distribution:")
print(train_all['polarization'].value_counts())
print(f"\nPolarization rate: {train_all['polarization'].mean():.2%}")

In [None]:
# Polarization by language
print(f"\nPolarization distribution by language:")
polarization_by_lang = pd.crosstab(train_all['language'], train_all['polarization'], normalize='index') * 100
print(polarization_by_lang.round(1))

In [None]:
# Data Cleaning/checks
print(f"Original size: {len(train_all)}")
train_all.isnull().sum()

In [None]:
# Create stratification column (language + polarization)
train_all['strat_column'] = train_all['language'] + '_' + train_all['polarization'].astype(str)

# Split
train_df, val_df = train_test_split(
    train_all,
    test_size=0.15,
    random_state=42,
    stratify=train_all['strat_column']
)

# Drop temporary column
train_df = train_df.drop('strat_column', axis=1).reset_index(drop=True)
val_df = val_df.drop('strat_column', axis=1).reset_index(drop=True)

In [None]:
print(f"Train size: {len(train_df)}")
print(f"Val size: {len(val_df)}")

print(f"\nTrain language distribution:")
print(train_df['language'].value_counts().sort_index())

print(f"\nVal language distribution:")
print(val_df['language'].value_counts().sort_index())

print(f"\nTrain polarization distribution:")
print(train_df['polarization'].value_counts())

print(f"\nVal polarization distribution:")
print(val_df['polarization'].value_counts())

In [None]:
# Tokenize Data

MODEL_NAME = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_function(examples):
    return tokenizer(
        examples['text'],
        truncation=True,
        padding='max_length',
        max_length=128
    )

# Convert to HuggingFace Dataset
train_dataset = Dataset.from_pandas(train_df[['text', 'polarization']])
val_dataset = Dataset.from_pandas(val_df[['text', 'polarization']])

# Tokenize
print("Tokenizing training data...")
train_dataset = train_dataset.map(tokenize_function, batched=True)

print("Tokenizing validation data...")
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Set format
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'polarization'])
val_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'polarization'])

# Rename to labels
train_dataset = train_dataset.rename_column('polarization', 'labels')
val_dataset = val_dataset.rename_column('polarization', 'labels')

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    problem_type="single_label_classification"
)

In [None]:
# Training Arguments
training_args = TrainingArguments(
    output_dir=f'{BASE_PATH}/models/xlm-roberta-multilingual-v1',
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=4,
    weight_decay=0.01,
    warmup_ratio=0.1,
    load_best_model_at_end=True,
    metric_for_best_model='f1_macro',
    greater_is_better=True,
    logging_dir=f'{BASE_PATH}/results/logs/multilingual-v1',
    logging_steps=100,
    save_total_limit=2,
    seed=42,
    fp16=True,
    report_to="none"
)

In [None]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    processing_class=tokenizer
)

In [None]:
# Train Multilingual Model
print(f"Training on {len(train_df)} samples across {train_df['language'].nunique()} languages")
print("-"*50)

train_result = trainer.train()