# REFERENCE: Transformer Demo (DistilBERT)

> This notebook is a **minimal demo** of using a transformer model
> on the hate speech dataset.
> It is provided so you can see:
> - How tokenization works
> - How a pretrained model is fine-tuned
>
> In your CS3 notebook, you do *not* need to recreate everything here.

In [None]:
!pip install -q transformers datasets accelerate

In [None]:
import inspect

import numpy as np
import pandas as pd
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer

In [None]:
# Load data
df = pd.read_csv('processed/labeled_data_clean.csv')

# Integer class ids mapped to their human-readable names.
label_map = {
    0: 'Hate speech',
    1: 'Offensive language',
    2: 'Neutral'
}
# Keep only rows whose label is one of the ids listed in label_map.
df = df[df['label'].isin(label_map.keys())].copy()

# Stratified 80/20 split with a fixed random_state so the split is repeatable.
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label']
)

# preserve_index=False: after the filter and the shuffled split the frames no
# longer have a default RangeIndex, and from_pandas would otherwise store that
# index as an extra `__index_level_0__` column in each dataset.
train_ds = Dataset.from_pandas(df_train[['tweet', 'label']], preserve_index=False)
test_ds = Dataset.from_pandas(df_test[['tweet', 'label']], preserve_index=False)

In [None]:
# Tokenizer & model
# The same checkpoint name is used for the tokenizer here and for the
# sequence-classification model loaded further down in this cell.
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_batch(batch):
    """Tokenize the ``'tweet'`` entries of a batch.

    The module-level ``tokenizer`` is called with truncation enabled and
    padding disabled.

    Args:
        batch: Mapping with a ``'tweet'`` key holding the texts to encode.

    Returns:
        Whatever ``tokenizer`` returns for those texts.
    """
    tweets = batch['tweet']
    encoded = tokenizer(tweets, truncation=True, padding=False)
    return encoded

# Tokenize both splits in batched mode (train first, then test).
train_tok, test_tok = (
    split.map(tokenize_batch, batched=True) for split in (train_ds, test_ds)
)

# Collator built from the same tokenizer; tokenize_batch itself does not pad.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# One output class per entry in label_map.
num_labels = len(label_map)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

In [None]:
# Metrics
def compute_metrics(eval_pred):
    """Score one evaluation pass with accuracy and macro-averaged F1.

    Args:
        eval_pred: A pair ``(logits, labels)``. The predicted class of each
            row is the arg-max of ``logits`` over the last axis.

    Returns:
        dict with the keys ``'accuracy'`` and ``'macro_f1'``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return {
        'accuracy': accuracy_score(gold, predicted),
        'macro_f1': f1_score(gold, predicted, average='macro'),
    }

In [None]:
# The install cell does not pin transformers, so choose keyword names that the
# installed release actually accepts. Newer releases renamed
# `evaluation_strategy` to `eval_strategy` and Trainer's `tokenizer` to
# `processing_class`; passing the old spellings there raises TypeError.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_kwarg = (
    'eval_strategy' if 'eval_strategy' in _training_arg_names else 'evaluation_strategy'
)

# Evaluate and checkpoint once per epoch; the remaining values are the
# fine-tuning hyperparameters for this demo.
training_args = TrainingArguments(
    output_dir='transformer_outputs',
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_steps=50,
    **{eval_strategy_kwarg: 'epoch'},
)

_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
tokenizer_kwarg = (
    'processing_class' if 'processing_class' in _trainer_arg_names else 'tokenizer'
)

# Note: the test split doubles as the evaluation set in this demo.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=test_tok,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{tokenizer_kwarg: tokenizer},
)

In [None]:
# Fine-tune `model` with the Trainer configured in the previous cell.
trainer.train()

In [None]:
# Evaluate on the Trainer's eval_dataset (test_tok); the returned metrics are
# shown as this cell's output.
trainer.evaluate()

### Notes

- This is a **reference demo**, not a graded deliverable.
- In your CS3 notebook, you will:
  - Run a small transformer model (it can look very similar to this).
  - Report accuracy + macro-F1.
  - Compare the transformer's performance to your Logistic Regression model.