# 🧠 BERT Model Comparison for Text Classification
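Compare BERT, DistilBERT, and RoBERTa on sentiment classification using the IMDb dataset. To keep the runtime short, each model is fine-tuned for one epoch on a 1,000-review training subset and evaluated on a 500-review test subset.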

## 📦 Install dependencies

In [None]:
!pip install transformers datasets scikit-learn torch seaborn

## 📚 Load IMDb dataset

In [None]:
from datasets import load_dataset
# Load IMDb, then shuffle with a fixed seed so the subsets below are the same
# on every run.
dataset = load_dataset("imdb")
dataset = dataset.shuffle(seed=42)

# Keep only the first rows of each shuffled split to keep training short:
# 1000 training examples and 500 test examples.
small_train = dataset["train"].select(range(1000))
small_test = dataset["test"].select(range(500))

## 🧹 Tokenize

In [None]:
from transformers import AutoTokenizer

def tokenize_dataset(tokenizer_name):
    """Tokenize ``small_train`` and ``small_test`` with the named tokenizer.

    Args:
        tokenizer_name: Identifier passed to ``AutoTokenizer.from_pretrained``.

    Returns:
        A ``(train, test)`` tuple of the tokenized splits, each formatted as
        torch tensors exposing ``input_ids``, ``attention_mask`` and ``label``.
    """
    tok = AutoTokenizer.from_pretrained(tokenizer_name)
    tensor_columns = ('input_ids', 'attention_mask', 'label')

    def encode_batch(batch):
        # Pad every review to a fixed length and truncate longer ones so all
        # rows end up with the same number of tokens.
        return tok(batch['text'], padding="max_length", truncation=True)

    encoded_splits = []
    for split in (small_train, small_test):
        encoded = split.map(encode_batch, batched=True)
        encoded.set_format('torch', columns=list(tensor_columns))
        encoded_splits.append(encoded)

    train_encoded, test_encoded = encoded_splits
    return train_encoded, test_encoded

## 🏋️ Train and evaluate a model

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import torch
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

def compute_metrics(p):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-class scores, reduced with
            ``argmax`` over axis 1) and ``label_ids`` (the reference labels).

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    true_classes = p.label_ids
    # The highest-scoring column of each row is the predicted class.
    predicted_classes = np.argmax(p.predictions, axis=1)

    scores = {}
    scores["accuracy"] = accuracy_score(true_classes, predicted_classes)
    scores["f1"] = f1_score(true_classes, predicted_classes)
    return scores

def _evaluation_schedule_kwarg():
    """Return the ``TrainingArguments`` keyword that sets the evaluation schedule.

    Newer ``transformers`` releases name this argument ``eval_strategy``, while
    older ones only accept ``evaluation_strategy``. Inspecting the constructor
    keeps this code working with whichever version is installed.
    """
    import inspect  # local import: only needed for this compatibility check

    accepted = inspect.signature(TrainingArguments.__init__).parameters
    if "eval_strategy" in accepted:
        return "eval_strategy"
    return "evaluation_strategy"

def train_and_evaluate(model_name):
    """Fine-tune one checkpoint, then report its metrics on the test split.

    Tokenizes the data with ``tokenize_dataset(model_name)``, trains a
    two-label sequence classifier for one epoch, prints the evaluation
    metrics and a classification report, and shows a confusion-matrix heatmap.

    Args:
        model_name: Checkpoint identifier used for both the tokenizer and
            ``AutoModelForSequenceClassification.from_pretrained``.

    Returns:
        The metrics dict produced by ``trainer.evaluate()``, so callers can
        collect results across models.
    """
    print(f"\n🚀 Training {model_name}")
    train_data, test_data = tokenize_dataset(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    # The name of the "evaluate every epoch" argument depends on the installed
    # transformers version, so it is passed through a computed keyword.
    schedule_kwargs = {_evaluation_schedule_kwarg(): "epoch"}

    args = TrainingArguments(
        output_dir=f"{model_name}-output",
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        num_train_epochs=1,
        logging_steps=10,
        save_total_limit=1,
        load_best_model_at_end=False,
        **schedule_kwargs,
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_data,
        eval_dataset=test_data,
        compute_metrics=compute_metrics
    )

    trainer.train()
    metrics = trainer.evaluate()
    print(f"📊 Eval metrics for {model_name}: {metrics}")

    # Confusion matrix: predict on the test split and compare the argmax class
    # of each row against the reference labels.
    preds = trainer.predict(test_data)
    pred_labels = np.argmax(preds.predictions, axis=1)
    cm = confusion_matrix(preds.label_ids, pred_labels)

    print("\nClassification Report:")
    print(classification_report(preds.label_ids, pred_labels))

    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
    plt.title(f"Confusion Matrix — {model_name}")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.show()

    return metrics

## 🔍 Run All Models

In [None]:
# Run the full train/evaluate cycle once per checkpoint, in this order.
for model in (
    'bert-base-uncased',
    'distilbert-base-uncased',
    'roberta-base',
):
    train_and_evaluate(model)