# AfriSenti Multilingual Sentiment Analysis

This notebook guides a reproducible workflow for sentiment analysis on the AfriSenti Twitter dataset (Swahili, Amharic, English).

Sections:
- Environment & imports
- Load dataset & splits
- Initial EDA
- Preprocessing and tokenization
- Model training (Transformer & BiLSTM baseline)
- Evaluation, attention visualization, ablations, and cross-lingual tests

Note: Place the AfriSenti CSV files in `./data/` (each file needs at least the columns `text`, `label`, and `language`), or edit the data-loading cell to point to a Hugging Face dataset ID.

In [None]:
# Setup: install (optional) and imports

# If you run in a fresh Colab/Jupyter environment, you can uncomment the install block.
# !pip install -r ../requirements.txt

import os
import sys
from pathlib import Path

# add project root so `src` can be imported
def find_project_root(start=None, marker='src'):
    """Return the closest directory at or above `start` that contains `marker`.

    Args:
        start: Directory to begin searching from; defaults to the current
            working directory.
        marker: Name of the sub-directory that identifies the project root.

    Returns:
        The matching directory as an absolute `Path`, or `None` when neither
        `start` nor any of its parents contains a `marker` directory.
    """
    origin = Path(start if start is not None else Path.cwd()).resolve()
    for candidate in (origin, *origin.parents):
        if (candidate / marker).is_dir():
            return candidate
    return None


# Resolution order (so the notebook is not tied to one machine):
#   1. the AFRISENTI_ROOT environment variable, when set;
#   2. the nearest directory at/above the working directory that holds `src/`;
#   3. the original author's checkout, kept only for backward compatibility.
if os.environ.get('AFRISENTI_ROOT'):
    ROOT = Path(os.environ['AFRISENTI_ROOT']).expanduser().resolve()
else:
    ROOT = find_project_root() or Path('c:/Users/BMC/Desktop/NLP').resolve()
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))

# standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# project modules
from src.data_exploration import load_afrisenti, simple_eda
from src.preprocess import simple_clean
from src.tokenize_utils import get_tokenizer, batch_tokenize
from src.dataset import SentimentDataset, collate_fn
from src.models import TransformerClassifier, BiLSTMClassifier
from src.train import Trainer
from src.eval_utils import compute_metrics, plot_confusion, compute_roc_auc

# Device selection and global seeding
import random

import torch

# Single seed shared by every random number generator used below.
SEED = 42

# Use the GPU whenever CUDA reports itself as available, otherwise the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Device:', DEVICE)

# Seed Python, NumPy and PyTorch (in that order) so repeated runs start from
# the same RNG state.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if DEVICE.type != 'cpu':
    # Only reached on CUDA: seed the generators of all visible GPUs as well.
    torch.cuda.manual_seed_all(SEED)

# `sns.set` is only an alias of `set_theme` and is documented as possibly being
# removed in a future seaborn release; call the preferred name directly.
sns.set_theme(style='whitegrid')


In [None]:
# Load AfriSenti dataset

# If you have a Hugging Face dataset id, set HF_DATASET_ID = '...' and it is
# forwarded to load_afrisenti below; it is None here, so no id is passed.
HF_DATASET_ID = None

DATA_DIR = str(ROOT / 'data')
print('Looking for data in', DATA_DIR)

try:
    df = load_afrisenti(data_dir=DATA_DIR, hf_id=HF_DATASET_ID)
    print(f'Loaded dataframe with {len(df)} rows')
    display(df.head())
except Exception as load_error:
    # Deliberately best-effort: the failure is reported instead of raised, and
    # the later cells of this notebook test `'df' in globals()` before using it.
    print(f'Failed to load local dataset: {load_error}')
    print(f'Please place AfriSenti CSV(s) in {DATA_DIR}')


In [None]:
# Initial Data Exploration

# Guard first: nothing to explore unless the loading cell produced `df`.
if 'df' not in globals():
    print('Dataframe `df` not loaded. Run the data-loading cell.')
else:
    # Column names fall back to `lang` / `sentiment` when the frame does not
    # have `language` / `label`. Plots are requested and `ROOT / 'eda'` is
    # handed to simple_eda as the save prefix.
    eda_stats = simple_eda(
        df,
        text_col='text',
        lang_col='lang' if 'language' not in df.columns else 'language',
        label_col='sentiment' if 'label' not in df.columns else 'label',
        show_plots=True,
        save_prefix=str(ROOT / 'eda'),
    )
    print('EDA summary:', eda_stats)


In [None]:
# Preprocessing examples

from src.preprocess import simple_clean

if 'df' in globals():
    # Fill missing tweets with '' *before* the string cast: `astype(str)` on its
    # own turns NaN/None into the literal text 'nan'/'None', which would then be
    # cleaned and carried forward as if it were a real tweet.
    df['clean_text'] = df['text'].fillna('').astype(str).apply(simple_clean)
    # Show raw vs. cleaned text side by side for a quick sanity check.
    display(df[['text','clean_text']].head(8))
else:
    print('Dataframe `df` not loaded; run the load cell to continue.')


In [None]:
# Tokenization comparisons: mBERT, XLM-R, AfriBERTa

# Short names passed to get_tokenizer, one per tokenizer being compared.
models = ['mbert', 'xlm-roberta', 'afriberta']

if 'df' in globals() and 'clean_text' in df.columns:
    # Drop missing values *before* casting to str; after the cast every value
    # is already a string, so a later dropna() could never remove anything.
    samples = df['clean_text'].dropna().astype(str).unique()[:6].tolist()
else:
    # No cleaned data available (loading or preprocessing cell not run):
    # fall back to a few hand-written sentences.
    samples = [
        'I love this! ❤️',
        'Hii ni habari nzuri',
        'እውነት ነው ይህ',
        'This is terrible... so bad',
    ]

for m in models:
    tok = get_tokenizer(m)
    enc = tok(samples, truncation=True)
    # Number of token ids produced for each sample by this tokenizer.
    lens = [len(t) for t in enc['input_ids']]
    print(f"\nModel: {m} (id={tok.name_or_path}) - token lengths: {lens}")
    for s, ids in zip(samples, enc['input_ids']):
        print('  text:', s)
        # Only the first 30 tokens are printed to keep the output short.
        print('  tokens:', tok.convert_ids_to_tokens(ids[:30]))


In [None]:
# Create PyTorch Dataset and DataLoaders

from torch.utils.data import DataLoader

if 'df' not in globals():
    print('Dataframe `df` not loaded; run the data-loading cell first.')
else:
    # Encode each distinct label as its position in sorted order.
    labels_unique = sorted(df['label'].unique())
    label_map = dict(zip(labels_unique, range(len(labels_unique))))
    df['label_id'] = df['label'].map(label_map)

    # Hold out 20% of the rows, then halve that hold-out into validation and
    # test; both splits are stratified on the label id and use SEED.
    from sklearn.model_selection import train_test_split
    train_df, temp_df = train_test_split(
        df, test_size=0.2, stratify=df['label_id'], random_state=SEED
    )
    val_df, test_df = train_test_split(
        temp_df, test_size=0.5, stratify=temp_df['label_id'], random_state=SEED
    )

    # One tokenizer shared by the three datasets, built in train/val/test order.
    tokenizer = get_tokenizer('xlm-roberta')
    train_ds, val_ds, test_ds = (
        SentimentDataset(
            part['clean_text'].tolist(),
            part['label_id'].tolist(),
            tokenizer=tokenizer,
            max_length=128,
        )
        for part in (train_df, val_df, test_df)
    )

    # Only the training loader shuffles; validation and test keep their order.
    train_loader, val_loader, test_loader = (
        DataLoader(ds, batch_size=32, shuffle=reshuffle, collate_fn=collate_fn)
        for ds, reshuffle in ((train_ds, True), (val_ds, False), (test_ds, False))
    )

    print('Train/val/test sizes:', len(train_ds), len(val_ds), len(test_ds))


In [None]:
# Modeling: Transformer fine-tuning and BiLSTM baseline

# Transformer fine-tuning example (XLM-R)
if 'train_loader' not in globals():
    print('train_loader not available. Run the dataloader cell.')
else:
    # One output class per entry of the label mapping built with the loaders.
    num_labels = len(label_map)
    model = TransformerClassifier(model_name='xlm-roberta-base', num_labels=num_labels)
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    # NOTE(review): `device=None` is passed although DEVICE is computed in the
    # setup cell; presumably Trainer picks a device itself - confirm in src.train.
    trainer = Trainer(model, optimizer, device=None, grad_clip=1.0, amp=False)

    # Up to 3 epochs with patience=2; `loss_fn=None` leaves the loss choice to
    # the trainer (TODO confirm its default). The checkpoint path is the one the
    # evaluation cell loads from.
    history = trainer.fit(
        train_loader,
        val_loader,
        loss_fn=None,
        epochs=3,
        patience=2,
        save_path=str(ROOT / 'best_xlmroberta.pt'),
    )
    print('Training history:', history)

# BiLSTM baseline - placeholder
# To run the BiLSTM baseline you need a tokenization -> integer vocab pipeline or provide pretrained embeddings.
# A simple route: use tokenizer to obtain token ids from a BPE/wordpiece tokenizer and feed into the BiLSTM (as demonstration),
# but note token ids reflect model vocab; you may instead build a new vocab using fastText or torchtext for a proper LSTM baseline.


In [None]:
# Evaluation: metrics, confusion matrix, ROC-AUC

if 'trainer' in globals() and 'test_loader' in globals():
    # load best model
    best_path = str(ROOT / 'best_xlmroberta.pt')
    if os.path.exists(best_path):
        # map_location='cpu' lets a checkpoint written on a GPU machine be read
        # on a CPU-only one; load_state_dict then copies each tensor onto the
        # device the model's parameters already live on.
        # NOTE: torch.load unpickles the file - only load checkpoints you trust.
        model.load_state_dict(torch.load(best_path, map_location='cpu'))
    else:
        # Make the fallback visible instead of silently scoring other weights.
        print('No checkpoint found at', best_path, '- evaluating the in-memory model instead.')
    # The third value returned by eval_epoch is not used here.
    preds, trues, _ = trainer.eval_epoch(test_loader)
    # NOTE(review): `labels` receives the original label values (the keys of
    # label_map) - confirm compute_metrics / plot_confusion expect names
    # rather than integer ids.
    metrics = compute_metrics(trues, preds, labels=list(label_map.keys()))
    print('Evaluation metrics (summary):')
    print('Accuracy:', metrics['accuracy'])
    print('Macro F1:', metrics['macro_f1'])
    display(metrics['report'])
    plot_confusion(trues, preds, labels=list(label_map.keys()), save_path=str(ROOT / 'confusion.png'))
else:
    print('trainer or test_loader not defined; run previous cells to train and evaluate.')


In [None]:
# Attention visualization (sample)

# This demonstrates how to request attention weights from the HF model and visualize them.

from matplotlib import pyplot as plt
import numpy as np

if 'tokenizer' in globals() and 'model' in globals():
    # Use the first cleaned tweet when one exists; otherwise a fixed sentence.
    if 'df' in globals() and 'clean_text' in df.columns and len(df) > 0:
        sample_text = df['clean_text'].iloc[0]
    else:
        sample_text = 'I love this product!'
    # Truncate to the same 128-token budget used for the datasets so a long
    # text cannot exceed the encoder's maximum input length.
    enc = tokenizer(sample_text, return_tensors='pt', truncation=True, max_length=128)

    # Inference only: switch dropout off and skip autograd bookkeeping, then
    # restore whichever train/eval mode the model was in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # Request attentions for this call only instead of flipping
            # `config.output_attentions`, which would make every later forward
            # pass return (and hold in memory) attention tensors too.
            out = model.model(
                **{k: v.to(model.model.device) for k, v in enc.items()},
                output_attentions=True,
            )
    finally:
        model.train(was_training)

    # out.attentions is a tuple: (layer1, layer2, ...), each shape (batch, head, seq_len, seq_len)
    attentions = out.attentions
    # pick the last layer, drop the batch axis and average over heads
    last = attentions[-1].squeeze(0).mean(dim=0).cpu().detach().numpy()
    tokens = tokenizer.convert_ids_to_tokens(enc['input_ids'].squeeze(0).tolist())
    # plot
    plt.figure(figsize=(10,8))
    plt.imshow(last, cmap='viridis')
    plt.xticks(range(len(tokens)), tokens, rotation=90)
    plt.yticks(range(len(tokens)), tokens)
    plt.colorbar()
    plt.title('Average attention (last layer)')
    plt.tight_layout()
    plt.show()
else:
    print('tokenizer or model not available. Run tokenization and model cells first.')


In [None]:
# Ablation study scaffold and cross-lingual test example

from src.ablation import run_grid
from src.cross_lingual import split_by_language, prepare_train_test_for_lang

# Ablation: define a small grid (example). Replace `train_fn` with a function wrapping your training procedure.

grid = {
    'batch_size': [16, 32],
    'lr': [2e-5, 5e-5],
    'max_length': [128, 192]
}

print('Ablation scaffold created. To run sweeps, implement a train_fn(params) that trains and returns results dict, then call run_grid(train_fn, grid)')

# Cross-lingual example
if 'df' in globals():
    # Same column fallback as the EDA cell: use `lang` when `language` is absent.
    groups = split_by_language(df, lang_col='language' if 'language' in df.columns else 'lang')
    print('Languages found:', list(groups.keys()))
    # example: train on sw (Swahili) and test on en and am
    if 'sw' in groups:
        # Stored under `xl_*` names so the train/val/test frames that back the
        # existing DataLoaders are not overwritten by this experiment.
        xl_train_df, xl_val_df, xl_test_df = prepare_train_test_for_lang(groups, train_langs=['sw'], test_langs=['en','am'])
        print('Train size (sw):', len(xl_train_df), 'Test combined (en+am):', len(xl_test_df))
    else:
        print('Swahili subset not found in data; adapt language codes to your dataset')
else:
    print('Dataframe `df` not loaded; run the load cell first.')


In [None]:
# Save artifacts and short summary

# Printed reminder only: nothing is written to disk by this cell.
print('\n'.join([
    'To save artifacts:',
    '- model: torch.save(model.state_dict(), "path")',
    '- tokenizer: tokenizer.save_pretrained("path")',
    '- training logs: save history dict or use TensorBoard',
]))

# Closing pointer to the written report that accompanies the notebook.
print('\nWhen finished, populate REPORT.md with key tables and plots from the notebook (confusion matrix, ablation results, cross-lingual transfer table).')
