In [None]:
#!pip install transformers datasets scikit-learn
#!conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia -y
#!pip install -U "transformers>=4.41.0" "peft>=0.15.0"
#!pip install hf_xet
!pip install fairlearn
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
import torch
from tqdm import tqdm
from torch.utils.data import DataLoader
from torch.optim import AdamW 
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from scipy.stats import mode
import random
from fairlearn.metrics import selection_rate

In [None]:
# Fix the seeds of every random number generator used here (Python, NumPy, PyTorch CPU and CUDA) for reproducibility
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed_all(RANDOM_SEED)
# cuDNN switches commonly paired with seeding: request deterministic kernels and turn off the auto-tuner.
# NOTE(review): the exact guarantees depend on the PyTorch/cuDNN version — confirm against the PyTorch reproducibility notes.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
# Load the test CSV. The path is an absolute Kaggle input location, so this cell only runs where that dataset is mounted.
test_df = pd.read_csv("/kaggle/input/ethics-amazon-review/dataset/test.csv")

In [None]:
def keep_only_language(df, language):
    """
    Return the rows of `df` whose 'language' value equals `language`.

    The result is a new frame with a fresh 0..n-1 index; `df` itself is left untouched.
    """
    language_mask = df['language'] == language
    selected_rows = df[language_mask]
    return selected_rows.reset_index(drop=True)


In [None]:
def clean_and_check_nulls(df, name="Dataset"):
    """
    Remove rows with a missing 'review_title' and report what is left.

    Prints how many rows were dropped and, if any column still contains nulls,
    the per-column null counts. `name` only labels the printed messages.
    Returns the filtered frame; the original index labels are kept.
    """
    rows_before = len(df)
    cleaned = df.dropna(subset=['review_title'])
    print(f"{name}: Dropped {rows_before - len(cleaned)} rows with null 'review_title'.")

    # Per-column count of the nulls that survived the title filter
    null_counts = cleaned.isnull().sum()
    if null_counts.sum() != 0:
        print(f"{name}: Remaining null values:\n{null_counts[null_counts > 0]}")
    else:
        print(f"{name}: No remaining nulls.")

    return cleaned

In [None]:
# Drop test rows that have no review title and report any nulls left in the other columns
test_df = clean_and_check_nulls(test_df, "Test Set")

In [None]:
def process_dataframe(dataframe):
    """
    Prepare a raw review frame for binary sentiment evaluation.

    Steps:
      * drop the identifier columns ("Unnamed: 0", "review_id", "product_id", "reviewer_id");
      * keep only German, Spanish and English reviews;
      * discard 3-star reviews;
      * add a 'sentiment' column: 0 for 1-2 stars, 1 otherwise;
      * plot the sentiment distribution per language to check the class balance;
      * drop the 'stars' column.

    The input frame is NOT modified; a new frame is returned, so the calling
    cell can be re-run safely.

    Raises:
        KeyError: if one of the columns to drop, 'language' or 'stars' is missing.
    """
    # Non in-place drop: mutating the caller's frame made a second run of the
    # calling cell fail because the columns were already gone.
    dataframe = dataframe.drop(columns=["Unnamed: 0", "review_id", "product_id", "reviewer_id"])

    keep_rows = dataframe['language'].isin(['de', 'es', 'en']) & (dataframe['stars'] != 3)
    # Explicit copy so the column assignment below writes to an independent
    # frame rather than a slice of the input (avoids SettingWithCopyWarning).
    dataframe = dataframe[keep_rows].copy()

    # Same labelling rule as before (1 or 2 stars -> 0, anything else -> 1), vectorised
    dataframe['sentiment'] = (~dataframe['stars'].isin([1, 2])).astype(int)

    sentiment_counts = dataframe.groupby(['language', 'sentiment']).size().unstack(fill_value=0)

    sentiment_counts.plot(kind='bar', stacked=False)
    plt.title('Sentiment Distribution per Language')
    plt.xlabel('Language')
    plt.ylabel('Number of Reviews')
    plt.legend(title='Sentiment (0=Negative, 1=Positive)')
    plt.tight_layout()
    plt.show()

    return dataframe.drop(columns=["stars"])
    

In [None]:
# "\n" instead of "/n": the banner should start with a blank line, not print a literal "/n"
print("\nProcessing TEST dataset")
test_df = process_dataframe(test_df)

In [None]:
def tokenize_function_test(example):
    """
    Tokenize the 'text' field of `example` for evaluation on the test set.

    Relies on the module-level `tokenizer` being defined before the call.
    Inputs are truncated and padded to a fixed length of 512 tokens.
    """
    return tokenizer(
        example['text'],
        truncation=True,
        padding="max_length",
        max_length=512,
    )

In [None]:
# Prompt template per language code; languages without an entry fall back to the Spanish template.
prompt_templates = {
        'es': "CATEGORÍA: {category}\nTÍTULO: {title}\nRESEÑA: {body}",
    }

def format_row(row, lang_code = None):
    """
    Build the model input text for one review row.

    Args:
        row: mapping/Series with 'product_category', 'review_title' and
            'review_body' (and 'language' when `lang_code` is not given).
        lang_code: language whose template should be used; when falsy the
            row's own 'language' value decides.

    Returns:
        The filled-in template string.
    """
    lang = lang_code if lang_code else row['language']
    template = prompt_templates.get(lang, prompt_templates['es'])
    return template.format(
        category=row['product_category'],
        title=row['review_title'],
        body=row['review_body']
    )

def format_convert_and_show(df, name="Dataset", num_samples=1, lang_code=None):
    """
    Add the formatted 'text' input column and drop the raw review columns.

    Args:
        df: frame with 'product_category', 'review_title', 'review_body'
            (and 'language' when `lang_code` is not given). Not modified.
        name, num_samples: accepted for signature compatibility with the other
            variants of this helper; unused here.
        lang_code: optional language code forced for every row.

    Returns:
        A copy of `df` with a 'text' column and without the three raw columns.
    """
    df = df.copy()

    # Forward lang_code explicitly: it used to be accepted but never reached
    # format_row, so a forced language was silently ignored.
    df['text'] = df.apply(lambda row: format_row(row, lang_code), axis=1)

    return df.drop(columns=['review_title', 'review_body', 'product_category'])

In [None]:
# Language handled by this preparation cell
lang = "es"
# Use the GPU when one is available; `device` is read by the evaluation cells further down
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
# Spanish-only slice of the processed test set
test_lang_df = keep_only_language(test_df, lang)

# Build the 'text' column for the Spanish slice.
# NOTE(review): test_lang_df is reassigned in the DE/EN evaluation loop below before being read again — confirm this result is still needed.
test_lang_df = format_convert_and_show(test_lang_df, name=f"Test {lang}", num_samples=1)

In [None]:
def compute_fairness_metrics(y_true, y_pred, group_name=""):
    """
    Compute and print the SAPMOC fairness metrics for binary predictions.

    Args:
        y_true: array-like of ground-truth labels (0 = negative, 1 = positive).
        y_pred: array-like of predicted labels, same length as `y_true`.
        group_name: label used only in the printed header.

    Returns:
        dict with the keys statistical_parity, equality_of_opportunity,
        calibration_pos, calibration_neg, conditional_use_error_pos,
        conditional_use_error_neg, treatment_equality_fp_fn and
        treatment_equality_fn_fp.
    """
    truth = np.asarray(y_true)
    predicted = np.asarray(y_pred)

    # Count the four outcomes directly. Unpacking confusion_matrix(...).ravel()
    # breaks when only one class occurs in the data, because the matrix is then
    # 1x1 instead of 2x2.
    tp = int(np.sum((truth == 1) & (predicted == 1)))
    tn = int(np.sum((truth == 0) & (predicted == 0)))
    fp = int(np.sum((truth == 0) & (predicted == 1)))
    fn = int(np.sum((truth == 1) & (predicted == 0)))

    # Small constant added to every denominator to avoid division by zero
    epsilon = 1e-10

    statistical_parity = (tp + fp) / (tp + fp + tn + fn + epsilon)
    equality_of_opportunity = tp / (tp + fn + epsilon)
    calibration_pos = tp / (tp + fp + epsilon)
    calibration_neg = tn / (tn + fn + epsilon)
    conditional_use_error_pos = fp / (tp + fp + epsilon)
    conditional_use_error_neg = fn / (tn + fn + epsilon)
    # The ratios are reported as 0 when their denominator count is zero
    treatment_equality_fp_fn = fp / (fn + epsilon) if fn != 0 else 0
    # Was `else f0`: an undefined name that raised NameError whenever fp == 0
    treatment_equality_fn_fp = fn / (fp + epsilon) if fp != 0 else 0

    print(f"\nFairness Metrics ({group_name})")
    print(f"Statistical Parity             : {statistical_parity:.4f}")
    print(f"Equality of Opportunity        : {equality_of_opportunity:.4f}")
    print(f"Calibration (Pos)              : {calibration_pos:.4f}")
    print(f"Calibration (Neg)              : {calibration_neg:.4f}")
    print(f"Conditional Use Error (P)      : {conditional_use_error_pos:.4f}")
    print(f"Conditional Use Error (N)      : {conditional_use_error_neg:.4f}")
    print(f"Treatment Equality (FP/FN)     : {treatment_equality_fp_fn:.4f}")
    print(f"Treatment Equality (FN/FP)     : {treatment_equality_fn_fp:.4f}")

    return {
        "statistical_parity": statistical_parity,
        "equality_of_opportunity": equality_of_opportunity,
        "calibration_pos": calibration_pos,
        "calibration_neg": calibration_neg,
        "conditional_use_error_pos": conditional_use_error_pos,
        "conditional_use_error_neg": conditional_use_error_neg,
        "treatment_equality_fp_fn": treatment_equality_fp_fn,
        "treatment_equality_fn_fp": treatment_equality_fn_fp
    }


In [None]:
results_table = []

def evaluate_and_record(model_id, model, tokenizer, test_df, device, lang_code=None):
    """
    Evaluate `model` on `test_df`, show the results and record them.

    Args:
        model_id: label used in progress bars, plot titles and the results row.
        model: sequence-classification model whose output exposes `.logits`.
        tokenizer: tokenizer used to encode the 'text' column.
        test_df: frame with a 'text' column and a 'sentiment' label column.
        device: torch device the model and batches are moved to.
        lang_code: language label for the results row; 'all' when omitted.

    Side effects:
        Plots the confusion matrix, prints accuracy, F1, selection rate and the
        fairness metrics, and appends one row to the module-level `results_table`.

    Returns:
        NumPy array of predicted labels, in the row order of `test_df`.
    """
    scope = lang_code or 'all'

    def _tokenize(batch):
        # Use the tokenizer passed to this function. The previous code went
        # through a helper that read the global `tokenizer`, so this argument
        # was silently ignored. Settings are unchanged.
        return tokenizer(batch['text'], truncation=True, padding="max_length", max_length=512)

    test_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))
    test_dataset = test_dataset.map(_tokenize, batched=True)
    test_dataset = test_dataset.rename_column("sentiment", "labels")
    test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

    # No shuffling, so predictions stay aligned with the rows of test_df
    test_loader = DataLoader(test_dataset, batch_size=32)
    model.to(device)
    model.eval()

    preds, labels = [], []
    with torch.no_grad():
        for batch in tqdm(test_loader, desc=f"Evaluating {model_id} ({scope})", leave=False):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            pred = torch.argmax(outputs.logits, dim=-1)
            preds.extend(pred.cpu().numpy())
            labels.extend(batch['labels'].cpu().numpy())

    y_true = np.array(labels)
    y_pred = np.array(preds)

    cm = confusion_matrix(y_true, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Negative", "Positive"])
    disp.plot(cmap=plt.cm.Blues, values_format='d')
    plt.title(f"Confusion Matrix: {model_id} ({scope})")
    plt.show()

    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    sel_rate = selection_rate(y_true, y_pred)

    print(f"\nAccuracy: {acc:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Selection Rate: {sel_rate:.4f}")

    fairness_metrics = compute_fairness_metrics(y_true, y_pred, group_name=f"{model_id} ({scope})")

    entry = {
        "model": model_id,
        "lang": scope,
        "accuracy": acc,
        "f1_score": f1,
        "selection_rate": sel_rate,
        **fairness_metrics
    }
    results_table.append(entry)

    return y_pred

In [None]:
# for the general model.
def format_convert_and_show(df, name="Dataset", num_samples=0):
    """
    Build the English-template 'text' input column used by the general model.

    Works on a copy of `df`, keeps all original columns, prints a header and
    the first `num_samples` rows for inspection, and returns the copy.
    """
    formatted = df.copy()

    def _build_text(record):
        # One prompt per review: category, title and body on separate lines
        return f"CATEGORY: {record['product_category']}\nTITLE: {record['review_title']}\nREVIEW TEXT: {record['review_body']}"

    formatted['text'] = formatted.apply(_build_text, axis=1)

    print(f"\n--- {name} Sample Entries ---\n")
    preview = formatted.head(num_samples)
    for idx, sample in preview.iterrows():
        print(f"Sample {idx+1}")
        print(f"Category: {sample['product_category']}")
        print(f"Title: {sample['review_title']}")
        print(f"Review: {sample['review_body']}")
        print(f"Full text:\n{sample['text']}\n")
        print("-" * 50)

    return formatted

In [None]:
# Evaluate the general multilingual DistilBERT checkpoint on the whole test set
# NOTE(review): the checkpoint name suggests fine-tuning on all languages — confirm.
model_id, model_name, model_path = ("distilbert_all", "distilbert-base-multilingual-cased", "/kaggle/input/distilbert-all-lang/best_model_all.pt")
# `tokenizer` has to stay a module-level name: tokenize_function_test reads it as a global
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
# Load the fine-tuned weights from the checkpoint file into the 2-label model created above
model.load_state_dict(torch.load(model_path, map_location=device))

print("General DistilBERT - All Languages")
# Rebinds test_df to the formatted frame returned by format_convert_and_show; later cells read this version.
# Assuming cells run in order, this is the English-template definition directly above, which keeps the raw columns.
test_df = format_convert_and_show(test_df)
evaluate_and_record(model_id, model, tokenizer, test_df.copy(), device)

# Per-language evaluation of the same general model
for lang_code in ["es", "de", "en"]:
    df_lang = keep_only_language(test_df, lang_code)
    df_lang = format_convert_and_show(df_lang)
    print(f"\nGeneral DistilBERT - Language: {lang_code}")
    evaluate_and_record(model_id, model, tokenizer, df_lang, device, lang_code=lang_code)

In [None]:
def format_convert_and_show(df, name="Dataset", num_samples=2, lang_code=None):
    """
    Build the language-specific 'text' input column and drop the raw review columns.

    `lang_code`, when given, selects the template for every row; otherwise each
    row's own 'language' value is used. Languages without a template fall back
    to the Spanish one. `name` and `num_samples` are accepted but unused.
    Returns a new frame; `df` is not modified.
    """
    templates_by_language = {
        'es': "CATEGORÍA: {category}\nTÍTULO: {title}\nRESEÑA: {body}",
    }

    def render(record):
        # An explicit language code wins over the row's own language
        language = lang_code or record['language']
        chosen = templates_by_language.get(language, templates_by_language['es'])
        return chosen.format(
            category=record['product_category'],
            title=record['review_title'],
            body=record['review_body'],
        )

    formatted = df.copy()
    formatted['text'] = formatted.apply(render, axis=1)

    raw_columns = ['review_title', 'review_body', 'product_category']
    return formatted.drop(columns=raw_columns)

In [None]:
# (model_id, Hugging Face base model name, checkpoint path) for each Spanish-only model
spanish_models = [
    ("distilbert", "distilbert-base-multilingual-cased", "/kaggle/input/spanish-models/best_model_es_distilbert.pt"),
    ("xlm_roberta", "xlm-roberta-base", "/kaggle/input/spanish-models/best_model_es_xlm_roberta.pt"),
    ("bert", "bert-base-multilingual-cased", "/kaggle/input/spanish-models/best_model_es_bert.pt")
]

# Spanish slice of the test set, formatted by whichever format_convert_and_show is currently defined
# (the call passes lang_code, so it needs a definition that accepts it)
spanish_test_df = keep_only_language(test_df, "es")
spanish_test_df = format_convert_and_show(spanish_test_df, lang_code="es")

# One prediction array per model, collected for the majority vote below
spanish_preds = []

for model_id, model_name, model_path in spanish_models:
    # `tokenizer` has to stay a module-level name: tokenize_function_test reads it as a global
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    model.load_state_dict(torch.load(model_path, map_location=device))

    print(f"\n {model_id} - Spanish only")
    y_pred = evaluate_and_record(model_id, model, tokenizer, spanish_test_df, device, lang_code="es")
    spanish_preds.append(y_pred)

# Ensemble model from Spanish-only
print("\nEnsemble Model - Spanish only")
# Stack to shape (n_models, n_samples) and take the most frequent label per sample
stacked_preds = np.stack(spanish_preds, axis=0)
majority_preds, _ = mode(stacked_preds, axis=0)
# squeeze() removes a leading length-1 axis if mode() kept one
# NOTE(review): whether it does depends on the SciPy version — confirm
majority_preds = majority_preds.squeeze()

# Labels in frame order; evaluate_and_record does not shuffle, so predictions line up with them
y_true = spanish_test_df["sentiment"].values

cm = confusion_matrix(y_true, majority_preds)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Negative", "Positive"])
disp.plot(cmap=plt.cm.Blues, values_format='d')
plt.title("Confusion Matrix: Ensemble (Spanish)")
plt.show()

fairness_metrics = compute_fairness_metrics(y_true, majority_preds, group_name="ensemble (es)")

# Record the ensemble with the same keys evaluate_and_record uses for single models
results_table.append({
    "model": "ensemble",
    "lang": "es",
    "accuracy": accuracy_score(y_true, majority_preds),
    "f1_score": f1_score(y_true, majority_preds),
    "selection_rate": selection_rate(y_true, majority_preds),
    **fairness_metrics
})

# Show everything recorded so far
results_df = pd.DataFrame(results_table)
display(results_df)

In [None]:
def format_convert_and_show(df, name="Dataset", num_samples=0):
    """
    Build the English-template 'text' input column for the per-language models.

    Works on a copy of `df`, keeps all original columns, prints a header and
    the first `num_samples` rows for inspection, and returns the copy.
    """
    formatted = df.copy()

    def _build_text(record):
        # One prompt per review: category, title and body on separate lines
        return f"CATEGORY: {record['product_category']}\nTITLE: {record['review_title']}\nREVIEW TEXT: {record['review_body']}"

    formatted['text'] = formatted.apply(_build_text, axis=1)

    print(f"\n--- {name} Sample Entries ---\n")
    preview = formatted.head(num_samples)
    for idx, sample in preview.iterrows():
        print(f"Sample {idx+1}")
        print(f"Category   : {sample['product_category']}")
        print(f"Title      : {sample['review_title']}")
        print(f"Review     : {sample['review_body']}")
        print(f"Full text  :\n{sample['text']}\n")
        print("-" * 50)

    return formatted

In [None]:
# Evaluate the DistilBERT models fine-tuned on German and on English, each on its own language
# (model_id, Hugging Face base model name, language code, checkpoint path)
eval_models = [
    ("distilbert", "distilbert-base-multilingual-cased", "de", "/kaggle/input/models-en-de-best/best_model_de.pt"),
    ("distilbert", "distilbert-base-multilingual-cased", "en", "/kaggle/input/models-en-de-best/best_model_en.pt")
]

for model_id, model_name, lang_code, model_path in eval_models:
    print(f"\n {model_id} - Fine-tuned on {lang_code.upper()} only")

    test_lang_df = keep_only_language(test_df, lang_code)
    test_lang_df = format_convert_and_show(test_lang_df)

    # `tokenizer` has to stay a module-level name: tokenize_function_test reads it as a global
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    model.load_state_dict(torch.load(model_path, map_location=device))

    # evaluate_and_record already plots the confusion matrix, prints the
    # fairness metrics and appends the row to results_table. Repeating those
    # steps here produced duplicate plots and duplicate rows in the results.
    evaluate_and_record(model_id, model, tokenizer, test_lang_df, device, lang_code=lang_code)

# Show everything recorded so far
results_df = pd.DataFrame(results_table)
display(results_df)