In [None]:
#!pip install transformers datasets scikit-learn
#!conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia -y
#!pip install -U "transformers>=4.41.0" "peft>=0.15.0"
#!pip install hf_xet
!pip install fairlearn
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import Dataset
import torch
from tqdm import tqdm
from torch.utils.data import DataLoader
from torch.optim import AdamW 
from torch.optim.lr_scheduler import ReduceLROnPlateau
from IPython.display import FileLink
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from fairlearn.metrics import (
    MetricFrame,
    selection_rate,
    equalized_odds_difference,
    demographic_parity_difference,
    false_positive_rate_difference,
    false_negative_rate_difference,
    true_positive_rate_difference,
    true_negative_rate_difference
)

import random

# Single seed shared by every random number generator used in this notebook.
RANDOM_SEED = 42

# cuDNN configuration: deterministic mode on, benchmark mode off.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Seed Python's `random`, NumPy and PyTorch (CPU generator and all CUDA devices).
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed_all(RANDOM_SEED)

In [None]:
# Folder of the Kaggle dataset that holds the three CSV splits.
DATA_DIR = "/kaggle/input/ethics-amazon-review/dataset"

# Read each split into its own DataFrame.
train_df = pd.read_csv(f"{DATA_DIR}/train.csv")
val_df = pd.read_csv(f"{DATA_DIR}/validation.csv")
test_df = pd.read_csv(f"{DATA_DIR}/test.csv")

In [None]:
# DataFrame.info() writes its report straight to stdout and returns None, so it
# must not be interpolated into an f-string (that prints "...: None" after the
# report). Print the label first, then let info() emit the report below it.
print("General dataset info:")
for split_name, split_df in (
    ("Training set", train_df),
    ("Validation set", val_df),
    ("Test set", test_df),
):
    print(f"\n{split_name}:")
    split_df.info()

In [None]:
def clean_and_check_nulls(df, name="Dataset"):
    """
    Remove rows whose 'review_title' is null and report any nulls that remain.

    Args:
        df: DataFrame containing a 'review_title' column.
        name: Label used as the prefix of the printed messages.

    Returns:
        A new DataFrame without the rows that had a null 'review_title'.
        The input frame is left untouched.
    """
    cleaned = df.dropna(subset=['review_title'])
    n_removed = len(df) - len(cleaned)
    print(f"{name}: Dropped {n_removed} rows with null 'review_title'.")

    # Per-column null counts, narrowed down to the columns that still have some.
    null_counts = cleaned.isnull().sum()
    leftover = null_counts[null_counts > 0]

    if leftover.empty:
        print(f"{name}: No remaining nulls.")
        return cleaned

    print(f"{name}: Remaining null values:\n{leftover}")
    return cleaned

In [None]:
# Drop rows without a review title from every split (train, validation, test in
# that order) and report the nulls that remain in each.
train_df, val_df, test_df = (
    clean_and_check_nulls(split_df, split_name)
    for split_df, split_name in (
        (train_df, "Training Set"),
        (val_df, "Validation Set"),
        (test_df, "Test Set"),
    )
)

In [None]:
def process_dataframe(dataframe):
    """
    Prepare a raw review split for binary sentiment classification.

    Steps:
      1. Drop the identifier columns ("Unnamed: 0", "review_id", "product_id",
         "reviewer_id").
      2. Keep only German, Spanish and English reviews.
      3. Discard 3-star reviews and derive a 'sentiment' column
         (0 for 1-2 stars, 1 otherwise).
      4. Plot the sentiment distribution per language so class balance can be
         checked visually.
      5. Drop the 'stars' column.

    Args:
        dataframe: DataFrame with the identifier columns listed above plus
            'language' and 'stars'.

    Returns:
        A new, processed DataFrame. Unlike an in-place implementation, the
        frame passed in by the caller is not modified.
    """
    id_columns = ["Unnamed: 0", "review_id", "product_id", "reviewer_id"]

    # Work on new frames only: dropping in place would silently mutate the
    # caller's object, and assigning a column on a filtered slice triggers
    # pandas' SettingWithCopy behaviour.
    processed = dataframe.drop(columns=id_columns)
    processed = processed[processed['language'].isin(['de', 'es', 'en'])]
    processed = processed[processed['stars'] != 3].copy()

    # 1-2 stars -> negative (0); every other remaining rating -> positive (1).
    processed['sentiment'] = (~processed['stars'].isin([1, 2])).astype(int)

    sentiment_counts = (
        processed.groupby(['language', 'sentiment']).size().unstack(fill_value=0)
    )

    fig, ax = plt.subplots()
    sentiment_counts.plot(kind='bar', stacked=False, ax=ax)
    ax.set_title('Sentiment Distribution per Language')
    ax.set_xlabel('Language')
    ax.set_ylabel('Number of Reviews')
    ax.legend(title='Sentiment (0=Negative, 1=Positive)')
    fig.tight_layout()
    plt.show()

    return processed.drop(columns=["stars"])
    

In [None]:
# Process every split; each call also shows the per-language sentiment plot.
print("Processing TRAIN dataset")
train_df = process_dataframe(train_df)

# "\n" (not "/n") so the header is preceded by a blank line instead of
# printing a literal "/n".
print("\nProcessing VALIDATION dataset")
val_df = process_dataframe(val_df)

print("\nProcessing TEST dataset")
test_df = process_dataframe(test_df)

In [None]:
# HYPERPARAMETERS
# Checkpoint name passed to AutoTokenizer / AutoModelForSequenceClassification.from_pretrained.
MODEL_NAME = "distilbert-base-multilingual-cased"
# Number of training epochs handed to train_model.
EPOCHS = 3
# Learning rate handed to AdamW.
LR = 5e-6 # 10 times lower than in the paper.
# Batch size used by the train, validation and test DataLoaders.
batch_size = 16

In [None]:
def keep_only_language(df, language):
    """
    Return the subset of `df` whose 'language' value equals `language`.

    The result is a new DataFrame with a fresh 0..n-1 index; `df` itself is
    not modified.
    """
    is_requested_language = df['language'] == language
    subset = df.loc[is_requested_language]
    return subset.reset_index(drop=True)


In [None]:
def format_convert_and_show(df, name="Dataset", num_samples=0):
    """
    Build the model input text for every review and optionally preview samples.

    A 'text' column is created from 'product_category', 'review_title' and
    'review_body' using a fixed English prompt layout, after which those three
    source columns are dropped.

    Args:
        df: DataFrame with 'product_category', 'review_title' and
            'review_body' columns. It is copied, never modified.
        name: Label shown in the preview header.
        num_samples: Number of leading rows to print as a preview.

    Returns:
        A new DataFrame with the 'text' column added and the three source
        columns removed.
    """
    df = df.copy()

    # Built column-wise rather than with a row-wise DataFrame.apply so that an
    # empty frame yields an empty 'text' column.
    df['text'] = [
        f"CATEGORY: {category}\nTITLE: {title}\nREVIEW TEXT: {body}"
        for category, title, body in zip(
            df['product_category'], df['review_title'], df['review_body']
        )
    ]

    print(f"\n--- {name} Sample Entries ---\n")
    # Number the samples by position: after filtering, the index labels are
    # arbitrary row ids, so "index + 1" is not a meaningful sample counter.
    for position, (_, row) in enumerate(df.head(num_samples).iterrows(), start=1):
        print(f"Sample {position}")
        print(f"Category   : {row['product_category']}")
        print(f"Title      : {row['review_title']}")
        print(f"Review     : {row['review_body']}")
        print(f"Full text  :\n{row['text']}\n")
        print("-" * 50)

    return df.drop(columns=['review_title', 'review_body', 'product_category'])

In [None]:
train_dataset = format_convert_and_show(train_df, "Training Set")
val_dataset = format_convert_and_show(val_df, "Validation Set")
test_dataset = format_convert_and_show(test_df, "Test Set")

In [None]:
# Tokenizer loaded from the same checkpoint name (MODEL_NAME) as the classifier.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Collator handed to the train/validation DataLoaders as `collate_fn`; it is
# configured to return PyTorch tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

def tokenize_function(example):
    """
    Tokenize the 'text' field for the training and validation sets.

    Truncation is enabled and no padding is requested here; the train and
    validation DataLoaders are built with `data_collator` as their collate_fn.
    """
    texts = example['text']
    return tokenizer(texts, truncation=True)

def tokenize_function_test(example):
    """
    Tokenize the 'text' field for the test set.

    Unlike the train/validation tokenization, every example is truncated and
    padded to a fixed length of 512 tokens.
    """
    texts = example['text']
    return tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=512,
    )

In [None]:
# Convert the formatted pandas frames (output of format_convert_and_show) into
# Hugging Face Datasets. The index is reset first so the filtered frames have a
# clean 0..n-1 index before conversion.
train_dataset = Dataset.from_pandas(train_dataset.reset_index(drop=True))
val_dataset = Dataset.from_pandas(val_dataset.reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_dataset.reset_index(drop=True))

# Quick sanity check of the converted test split.
print(test_dataset.shape)
print(test_dataset.column_names)

# Train/validation use the tokenizer without padding; the test split uses the
# variant that pads every example to a fixed max_length.
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function_test, batched=True)

# The training/evaluation loops read the target from the 'labels' key.
train_dataset = train_dataset.rename_column("sentiment", "labels")
val_dataset = val_dataset.rename_column("sentiment", "labels")
test_dataset = test_dataset.rename_column("sentiment", "labels")

# Expose only the columns the loops consume. The test split additionally keeps
# 'language', which test_model uses as the grouping feature for its metrics.
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'labels', 'language'],
)

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Train/validation batches go through data_collator; the test loader has no
# collate_fn because its examples were already padded to a fixed length.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=data_collator)
val_loader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=data_collator)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [None]:
def train_model(model, train_loader, val_loader, optimizer, scheduler, device, save_path, epochs=5):
    """
    Training loop that checkpoints the weights with the best validation F1.

    For every epoch the model is trained on `train_loader`, evaluated on
    `val_loader` via `evaluate_validation_model`, and the scheduler is stepped
    with the validation F1. Whenever the F1 improves, the model's state_dict
    is written to `save_path`.

    Args:
        model: Model called with `input_ids`, `attention_mask` and `labels`
            whose output exposes a `.loss` attribute.
        train_loader: Iterable of batches with 'input_ids', 'attention_mask'
            and 'labels' entries.
        val_loader: Validation batches, forwarded to evaluate_validation_model.
        optimizer: Optimizer updating the model parameters.
        scheduler: Scheduler whose `step` accepts the validation F1.
        device: Device the batch tensors are moved to.
        save_path: File path the best state_dict is saved to.
        epochs: Number of passes over `train_loader`.

    Returns:
        None. The model is left holding the weights of the last epoch.
    """
    # Start below any attainable F1 so the first epoch is always checkpointed.
    # With 0.0 and a strict ">" comparison, a run whose F1 stays at 0 would
    # never write `save_path`.
    best_f1 = float("-inf")

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0

        loop = tqdm(train_loader, desc=f"Epoch {epoch+1} Training", leave=True)

        for batch in loop:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            loop.set_postfix(loss=loss.item())

        print(f"\nEpoch {epoch + 1} - Training loss: {total_loss / len(train_loader):.4f}")

        # Only the F1 drives scheduling and checkpointing; accuracy is unused.
        _, f1 = evaluate_validation_model(model, val_loader, device, return_scores=True)
        scheduler.step(f1)

        if f1 > best_f1:
            best_f1 = f1
            torch.save(model.state_dict(), save_path)
            print(f"Best model saved with F1: {f1:.4f}\n")

In [None]:
def evaluate_validation_model(model, val_loader, device, return_scores=False):
    """
    Run the model over the validation loader and score its predictions.

    Args:
        model: Model called with `input_ids` and `attention_mask` whose output
            exposes `.logits`.
        val_loader: Iterable of batches with 'input_ids', 'attention_mask'
            and 'labels' entries.
        device: Device the batch tensors are moved to.
        return_scores: Selects what is returned (see below).

    Returns:
        (accuracy, f1) when `return_scores` is True, otherwise
        (all_labels, all_preds) as flat lists.
    """
    predictions = []
    references = []

    # Switch to evaluation mode and disable gradient tracking for inference.
    model.eval()
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating", leave=False):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['labels'].to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            batch_preds = torch.argmax(logits, dim=-1)

            predictions.extend(batch_preds.cpu().numpy())
            references.extend(targets.cpu().numpy())

    # Both scores are computed regardless of which pair is returned.
    accuracy = accuracy_score(references, predictions)
    f1 = f1_score(references, predictions)

    if not return_scores:
        return references, predictions
    return accuracy, f1

In [None]:
def test_model(model, test_loader, device=None):
    """
    Evaluate the model on the test loader and report per-language metrics.

    Prints overall accuracy, F1 and the confusion matrix, followed by a
    fairlearn MetricFrame (accuracy, F1, selection rate) grouped by the
    'language' value of each example.

    Args:
        model: Model called with `input_ids` and `attention_mask` whose output
            exposes `.logits`.
        test_loader: Iterable of batches with 'input_ids', 'attention_mask',
            'labels' and 'language' entries.
        device: Device the batch tensors are moved to. Defaults to the device
            the model's parameters live on, so the function no longer depends
            on a notebook-level `device` global.

    Returns:
        (all_labels, all_preds) as flat lists.
    """
    if device is None:
        device = next(model.parameters()).device

    all_preds, all_labels, group_labels = [], [], []

    model.eval()
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating", leave=False):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            languages = batch['language']

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=-1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            group_labels.extend(languages)

    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    cm = confusion_matrix(all_labels, all_preds)

    print("Test Accuracy:", acc)
    print("Test F1 Score:", f1)
    print("Confusion Matrix:\n", cm)

    # Same metrics broken down by language, which acts as the sensitive feature.
    frame = MetricFrame(
        metrics={
            "accuracy": accuracy_score,
            "f1": f1_score,
            "selection_rate": selection_rate,
        },
        y_true=all_labels,
        y_pred=all_preds,
        sensitive_features=group_labels
    )

    print("\n=== Metrics by Language Group ===")
    print(frame.by_group)

    return all_labels, all_preds


In [None]:
def download_weights(file_path):
    """
    Display a clickable link to `file_path` so the weights can be downloaded.

    If the file is missing, a message is printed and no link is shown.
    """
    if not os.path.exists(file_path):
        print(f"File {file_path} does not exist.")
        return

    display(FileLink(file_path))

In [None]:
# Fine-tuning the general DistilBERT model on all 3 languages
print("Defining model...")

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
).to(device)

print("Freezing model...")
for param in model.distilbert.parameters():
    param.requires_grad = False

# Build the optimizer after freezing so it only tracks the parameters that are
# actually trained (everything outside the frozen `distilbert` backbone).
optimizer = AdamW([p for p in model.parameters() if p.requires_grad], lr=LR)
# `verbose` is intentionally not passed: it is deprecated since PyTorch 2.2 and
# newer releases may reject it.
scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2)

print("\nTraining for language: all")
save_path = "./best_model_all.pt"

# Remove a checkpoint left over from a previous run so that, after training,
# the file can only contain weights produced by this run.
if os.path.exists(save_path):
    os.remove(save_path)

train_model(model, train_loader, val_loader, optimizer, scheduler, device, save_path, epochs=EPOCHS)

# Evaluate the checkpoint with the best validation F1 (the one offered for
# download below) instead of whatever weights the last epoch ended with.
if os.path.exists(save_path):
    model.load_state_dict(torch.load(save_path, map_location=device))

test_model(model, test_loader)
download_weights(save_path)

In [None]:
# Per-language prompt layouts; each has {category}, {title} and {body} slots.
prompt_templates = {
    'en': (
        "CATEGORY: {category}\n"
        "TITLE: {title}\n"
        "REVIEW TEXT: {body}"
    ),
    'de': (
        "KATEGORIE: {category}\n"
        "TITEL: {title}\n"
        "BEWERTUNGSTEXT: {body}"
    ),
    'es': (
        "CATEGORÍA: {category}\n"
        "TÍTULO: {title}\n"
        "RESEÑA: {body}"
    ),
}
def format_row(row, lang_code = None):
    """
    Render one review row with the prompt template of its language.

    Args:
        row: Mapping-like row with 'product_category', 'review_title' and
            'review_body' entries, plus 'language' when `lang_code` is not given.
        lang_code: Optional language override; when falsy, the row's own
            'language' value is used.

    Returns:
        The formatted prompt string. Languages without a dedicated template
        fall back to the English one.
    """
    lang = lang_code or row['language']
    english_template = prompt_templates['en']
    template = prompt_templates.get(lang, english_template)

    fields = {
        'category': row['product_category'],
        'title': row['review_title'],
        'body': row['review_body'],
    }
    return template.format(**fields)

def format_convert_and_show(df, name="Dataset", num_samples=2, lang_code=None):
    """
    Build the language-aware model input text and optionally preview samples.

    A 'text' column is created by applying `format_row` to every row, after
    which 'review_title', 'review_body' and 'product_category' are dropped.

    Args:
        df: DataFrame with 'product_category', 'review_title' and
            'review_body' columns, plus 'language' when `lang_code` is not
            given. It is copied, never modified.
        name: Label shown in the preview header.
        num_samples: Number of leading rows to preview; 0 prints nothing.
        lang_code: Optional language code forced onto every row instead of
            the row's own 'language' value.

    Returns:
        A new DataFrame with the 'text' column added and the three source
        columns removed.
    """
    df = df.copy()

    # Forward lang_code to format_row (it used to be accepted but ignored).
    # A list comprehension keeps this well-defined for an empty frame.
    df['text'] = [format_row(row, lang_code) for _, row in df.iterrows()]

    # Honour the preview arguments; nothing is printed when num_samples is 0.
    if num_samples > 0:
        print(f"\n--- {name} Sample Entries ---\n")
        for position, (_, row) in enumerate(df.head(num_samples).iterrows(), start=1):
            print(f"Sample {position}")
            print(f"Full text  :\n{row['text']}\n")
            print("-" * 50)

    return df.drop(columns=['review_title', 'review_body', 'product_category'])

In [None]:
# Fine-tuning DistilBERT on each language separately, and evaluating it
for lang in ['de', 'en', 'es']:

    # Restrict every split to the current language.
    train_lang_df = keep_only_language(train_df, lang)
    val_lang_df = keep_only_language(val_df, lang)
    test_lang_df = keep_only_language(test_df, lang)

    # Build the 'text' column with the language-specific prompt.
    train_lang_df = format_convert_and_show(train_lang_df, name=f"Train {lang}", num_samples=0)
    val_lang_df = format_convert_and_show(val_lang_df, name=f"Val {lang}", num_samples=0)
    test_lang_df = format_convert_and_show(test_lang_df, name=f"Test {lang}", num_samples=0)

    train_dataset = Dataset.from_pandas(train_lang_df.reset_index(drop=True))
    val_dataset = Dataset.from_pandas(val_lang_df.reset_index(drop=True))
    test_dataset = Dataset.from_pandas(test_lang_df.reset_index(drop=True))

    train_dataset = train_dataset.map(tokenize_function, batched=True)
    val_dataset = val_dataset.map(tokenize_function, batched=True)
    test_dataset = test_dataset.map(tokenize_function_test, batched=True)

    train_dataset = train_dataset.rename_column("sentiment", "labels")
    val_dataset = val_dataset.rename_column("sentiment", "labels")
    test_dataset = test_dataset.rename_column("sentiment", "labels")

    train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    # 'language' stays available on the test split for the grouped metrics.
    test_dataset.set_format(
        type='torch',
        columns=['input_ids', 'attention_mask', 'labels', 'language'],
    )

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=data_collator)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=data_collator)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    print("Defining model...")
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=2,
    ).to(device)

    print("Freezing model...")
    for param in model.distilbert.parameters():
        param.requires_grad = False

    # Build the optimizer after freezing so it only tracks the parameters that
    # are actually trained (everything outside the frozen `distilbert` backbone).
    optimizer = AdamW([p for p in model.parameters() if p.requires_grad], lr=LR)
    # `verbose` is intentionally not passed: it is deprecated since PyTorch 2.2
    # and newer releases may reject it.
    scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2)

    print(f"\nTraining for language: {lang}")
    save_path = f"./best_model_{lang}.pt"

    # Remove a checkpoint left over from a previous run so that, after
    # training, the file can only contain weights produced by this run.
    if os.path.exists(save_path):
        os.remove(save_path)

    train_model(model, train_loader, val_loader, optimizer, scheduler, device, save_path, epochs=EPOCHS)

    # Evaluate the checkpoint with the best validation F1 (the one offered for
    # download below) instead of whatever weights the last epoch ended with.
    if os.path.exists(save_path):
        model.load_state_dict(torch.load(save_path, map_location=device))

    test_model(model, test_loader)
    download_weights(save_path)