# HIGH-PERFORMANCE ANLI FINETUNING (GPU ENABLED)
**Goals**
- End-to-end ML pipeline: EDA → Preprocessing → Training → Evaluation → Save & Deploy.
- Save metrics, plots, confusion matrix, model, and tokenizer.
- Provide GitHub & Docker deployment instructions.


In [None]:
import os
import random
import json
from pathlib import Path
from datetime import datetime

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, f1_score,
    precision_recall_fscore_support,
    confusion_matrix, classification_report

import torch
from datasets import load_dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer,
    DataCollatorWithPadding, EarlyStoppingCallback,
    set_seed

# Output Directories
OUT_DIR = Path("./anli_best_results")
OUT_DIR.mkdir(parents=True, exist_ok=True)

def save_json(obj, path):
    with open(path, "w") as f:
        json.dump(obj, f, indent=4, default=str)


# 0. CONFIGURATION

In [2]:
USE_ALL_ROUNDS = True 
MODEL_NAME = "roberta-large"
MAX_LENGTH = 256
BATCH_SIZE = 8              
GRAD_ACC = 4                
LR = 1e-5
NUM_EPOCHS = 3
SEED = 42
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {DEVICE}")


NameError: name 'torch' is not defined

# 1. REPRODUCIBILITY

In [None]:
set_seed(SEED)
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
if DEVICE == "cuda":
    torch.cuda.manual_seed_all(SEED)


# 2. LOAD DATA & SIMPLE EDA

In [3]:
dataset = load_dataset("facebook/anli")

if USE_ALL_ROUNDS:
    train = concatenate_datasets([dataset["train_r1"], dataset["train_r2"], dataset["train_r3"]])
    dev   = concatenate_datasets([dataset["dev_r1"],   dataset["dev_r2"],   dataset["dev_r3"]])
    test  = concatenate_datasets([dataset["test_r1"],  dataset["test_r2"],  dataset["test_r3"]])
else:
    train = dataset["train_r2"]
    dev   = dataset["dev_r2"]
    test  = dataset["test_r2"]

label_map = {0: "entailment", 1: "neutral", 2: "contradiction"}

NameError: name 'load_dataset' is not defined

In [None]:
def dataset_stats(ds, name):
    labels = np.array(ds["label"])
    unique, counts = np.unique(labels, return_counts=True)
    print(f"{name} size: {len(ds)}")
    for u, c in zip(unique, counts):
        print(f"  {label_map[u]:12s}: {c} ({c/len(ds):.2%})")

dataset_stats(train, "Train")
dataset_stats(dev, "Dev")
dataset_stats(test, "Test")

eda_summary = {
    "train_size": len(train),
    "dev_size": len(dev),
    "test_size": len(test),
    "train_class_counts": dict(zip(*np.unique(np.array(train["label"]), return_counts=True))),
}
save_json(eda_summary, OUT_DIR / "eda_summary.json")


# 3. TOKENIZER & PREPROCESSING

In [None]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def preprocess(batch):
    return tokenizer(batch["premise"], batch["hypothesis"],
                     truncation=True, max_length=MAX_LENGTH, padding=False)

train_tok = train.map(preprocess, batched=True, remove_columns=train.column_names)
dev_tok   = dev.map(preprocess,   batched=True, remove_columns=dev.column_names)
test_tok  = test.map(preprocess,  batched=True, remove_columns=test.column_names)

train_tok = train_tok.add_column("labels", train["label"])
dev_tok   = dev_tok.add_column("labels", dev["label"])
test_tok  = test_tok.add_column("labels", test["label"])

train_tok.set_format(type="torch")
dev_tok.set_format(type="torch")
test_tok.set_format(type="torch")

collator = DataCollatorWithPadding(tokenizer)


# 4. MODEL

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)
model.to(DEVICE)

# 5. METRICS & EVAL FUNCTIONS

In [None]:
def compute_metrics(pred):
    logits, labels = pred
    preds = np.argmax(logits, axis=1)

    acc = accuracy_score(labels, preds)
    p, r, f1, _ = precision_recall_fscore_support(labels, preds, labels=[0,1,2], average=None, zero_division=0)
    macro = f1_score(labels, preds, average="macro", zero_division=0)

    return {
        "accuracy": float(acc),
        "macro_f1": float(macro),
        "f1_entailment": float(f1[0]),
        "f1_neutral": float(f1[1]),
        "f1_contradiction": float(f1[2]),
    }

def get_history(trainer):
    history = trainer.state.log_history
    return history


# 6. TRAINING ARGUMENTS & CALLBACKS

In [None]:
training_args = TrainingArguments(
    output_dir=str(OUT_DIR),
    num_train_epochs=NUM_EPOCHS,
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=GRAD_ACC,
    warmup_ratio=0.1,
    weight_decay=0.1,
    logging_steps=50,
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    greater_is_better=True,
    fp16=True if DEVICE == "cuda" else False,
    report_to="none",
)

earlystop = EarlyStoppingCallback(early_stopping_patience=3)


# 7. TRAINER SETUP

In [None]:
# Cell: Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=dev_tok,
    tokenizer=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
    callbacks=[earlystop],
)


# 8. TRAIN

In [None]:
# Cell: Train and capture history
train_start = datetime.utcnow().isoformat()
trainer.train()
train_end = datetime.utcnow().isoformat()

# Save trainer log history
history = get_history(trainer)
save_json({"train_start": train_start, "train_end": train_end, "history": history}, OUT_DIR / "trainer_history.json")

# 9. EVALUATION & METRICS (DEV & TEST)

In [None]:
# Cell: Evaluate on dev & test
dev_metrics = trainer.evaluate(eval_dataset=dev_tok)
test_metrics = trainer.evaluate(eval_dataset=test_tok)

print("DEV:", dev_metrics)
print("TEST:", test_metrics)

save_json({"dev": dev_metrics, "test": test_metrics}, OUT_DIR / "metrics.json")

# Save best model + tokenizer
trainer.save_model(str(OUT_DIR / "model"))
tokenizer.save_pretrained(str(OUT_DIR / "model"))

# 10. DETAILED TEST REPORT & CONFUSION MATRIX

In [None]:
# Cell: Predictions for test set and confusion matrix
pred_output = trainer.predict(test_tok)
logits = pred_output.predictions
preds = np.argmax(logits, axis=1)
labels = pred_output.label_ids

# Classification report
clf_report = classification_report(labels, preds, target_names=[label_map[i] for i in [0,1,2]], zero_division=0, output_dict=True)
save_json(clf_report, OUT_DIR / "classification_report.json")
print("Classification report (summary saved).")

# Confusion matrix & plot
cm = confusion_matrix(labels, preds, labels=[0,1,2])
cm_display = {
    "matrix": cm.tolist(),
    "labels": [label_map[i] for i in [0,1,2]],
}
save_json(cm_display, OUT_DIR / "confusion_matrix.json")

# Plot confusion matrix
plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt="d", xticklabels=[label_map[i] for i in [0,1,2]], yticklabels=[label_map[i] for i in [0,1,2]])
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix - ANLI Test")
plt.tight_layout()
plt.savefig(OUT_DIR / "confusion_matrix.png", dpi=200)
plt.show()


# 11. TRAINING CURVES (LOSS / METRICS) PLOT

In [None]:
# Cell: Plot training & eval metrics extracted from trainer history
history = trainer.state.log_history

# Convert to arrays
steps = [h.get("step") for h in history if "step" in h]
train_losses = [h["loss"] for h in history if "loss" in h]
eval_steps = [h["step"] for h in history if any(k.startswith("eval_") for k in h)]
eval_f1 = [h.get("eval_macro_f1") for h in history if "eval_macro_f1" in h]
eval_acc = [h.get("eval_accuracy") for h in history if "eval_accuracy" in h]
eval_loss = [h.get("eval_loss") for h in history if "eval_loss" in h]

# Loss plot
plt.figure()
plt.plot(steps[:len(train_losses)], train_losses, label="train_loss")
if eval_steps:
    plt.scatter(eval_steps, eval_loss, label="eval_loss")
plt.xlabel("Steps")
plt.ylabel("Loss")
plt.title("Loss during training")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig(OUT_DIR / "training_loss.png", dpi=200)
plt.show()

# Macro F1 plot
if eval_f1:
    plt.figure()
    plt.plot(eval_steps[:len(eval_f1)], eval_f1, marker="o")
    plt.xlabel("Steps")
    plt.ylabel("Macro F1")
    plt.title("Eval Macro F1 during training")
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(OUT_DIR / "eval_macro_f1.png", dpi=200)
    plt.show()

# 12. SAVE ARTIFACTS & REPRODUCIBILITY NOTES

In [None]:
# Cell: Save environment & reproducibility notes
repro = {
    "model_name": MODEL_NAME,
    "max_length": MAX_LENGTH,
    "batch_size": BATCH_SIZE,
    "grad_acc": GRAD_ACC,
    "learning_rate": LR,
    "epochs": NUM_EPOCHS,
    "seed": SEED,
    "use_all_rounds": USE_ALL_ROUNDS,
    "device": DEVICE,
    "notes": "Check trainer_history.json for stepwise logs."
}
save_json(repro, OUT_DIR / "reproducibility.json")
print(f"Saved artifacts to {OUT_DIR.resolve()}")
