# DistilBERT v4 - Single-Stage Training

**Reality Check:** v2's 61% accuracy came from two-stage training (GoEmotions pre-training, then fine-tuning on FinGPT)

**This is single-stage training** - DistilBERT directly on financial data

**Dataset:** fingpt_annotated_balanced.csv
- 928 samples
- 0 duplicates
- Balanced minority classes

**Config:**
- Up to 30 epochs, with early stopping (patience 5)
- LoRA rank r=16 (alpha=32)
- Learning rate 5e-5
- 100 warmup steps, weight decay 0.01

**Realistic target:** 50-55% accuracy (single-stage baseline)

In [None]:
# Install packages
!pip install -q transformers peft datasets accelerate evaluate scikit-learn matplotlib seaborn

In [None]:
# Mount Drive
# The dataset and model directory below live on Google Drive, so it must be
# mounted into the Colab filesystem before anything is read or written.
from google.colab import drive
drive.mount('/content/drive')

# Use the clean balanced dataset (928 samples, no duplicates)
DATA_PATH = '/content/drive/MyDrive/FinEmo-LoRA/data/annotated/fingpt_annotated_balanced.csv'
# Used both as the Trainer's output_dir (checkpoints) and as the final save
# location for the model and tokenizer.
MODEL_SAVE_PATH = '/content/drive/MyDrive/FinEmo-LoRA/models/distilbert-v4'

In [None]:
# Imports
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
from peft import LoraConfig, get_peft_model, TaskType
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import evaluate

# Emotion classes; a class's integer label id is its position in this list.
EMOTIONS = ['anxiety', 'excitement', 'fear', 'hope', 'optimism', 'uncertainty']
# Lookup tables in both directions: id -> name and name -> id.
id2emotion = dict(enumerate(EMOTIONS))
emotion2id = {name: idx for idx, name in id2emotion.items()}
# Random seed shared by the data splits and the training arguments.
SEED = 42

In [None]:
# Load and split data
# Produces train_df (80%), val_df and test_df (the remaining 20%, halved),
# all stratified on the integer label.
df = pd.read_csv(DATA_PATH)
df['label'] = df['emotion'].map(emotion2id)

# Series.map() yields NaN for any emotion string that is not a key of
# emotion2id (typo, different casing, stray whitespace). Fail loudly here
# rather than letting NaN labels flow into the split and the model.
unmapped = df.loc[df['label'].isna(), 'emotion']
if not unmapped.empty:
    raise ValueError(
        f"Unrecognised emotion labels in {DATA_PATH}: "
        f"{sorted(unmapped.astype(str).unique())}"
    )
df['label'] = df['label'].astype(int)

print(f"Total samples: {len(df)}")
print(f"Class distribution:\n{df['emotion'].value_counts()}\n")

train_df, temp_df = train_test_split(df, test_size=0.2, random_state=SEED, stratify=df['label'])

# A class with a single row in the held-out 20% cannot be stratified across
# a further 50/50 split, so those rows are set aside and added to the
# validation set afterwards.
class_counts = temp_df['label'].value_counts()
problematic = class_counts[class_counts == 1].index.tolist()

# With no problematic classes the mask is all-False: prob_df is empty and
# temp_df is left unchanged.
is_problematic = temp_df['label'].isin(problematic)
prob_df = temp_df[is_problematic]
temp_df = temp_df[~is_problematic]

if not temp_df.empty:
    val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=SEED, stratify=temp_df['label'])
else:
    # Nothing left to split: fall back to empty frames with the right columns.
    val_df = pd.DataFrame(columns=df.columns)
    test_df = pd.DataFrame(columns=df.columns)

if not prob_df.empty:
    val_df = pd.concat([val_df, prob_df], ignore_index=True)

print(f"Train: {len(train_df)} | Val: {len(val_df)} | Test: {len(test_df)}")

In [None]:
# Load DistilBERT with improved LoRA config
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    # Derived from the label list so the classifier head cannot drift out of
    # sync with EMOTIONS / id2emotion / emotion2id.
    num_labels=len(EMOTIONS),
    id2label=id2emotion,
    label2id=emotion2id
)

# Improved LoRA: r=16, adapters on the q_lin / k_lin / v_lin attention
# projections. NOTE(review): the attention output projection (out_lin) is NOT
# in target_modules, so this is not "all attention" -- confirm that is intended.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=16,
    lora_alpha=32,
    target_modules=["q_lin", "v_lin", "k_lin"],
    lora_dropout=0.1,
    bias="none"
)
# Wrap the base model with the adapters and report how many parameters
# remain trainable.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
# Tokenize
def tokenize(examples):
    """Tokenize the 'text' field of a batch with the module-level tokenizer.

    Every sequence is truncated and padded to a fixed length of 512 tokens.
    Returns whatever the tokenizer call returns for that batch.
    """
    encoded = tokenizer(
        examples['text'],
        max_length=512,
        padding='max_length',
        truncation=True,
    )
    return encoded

from datasets import Dataset


def _to_dataset(frame):
    """Turn one split DataFrame into a tokenized `datasets.Dataset`.

    Only the 'text' and 'label' columns are kept; 'text' is replaced by the
    tokenizer outputs produced by `tokenize`.
    """
    # preserve_index=False: the split frames can carry a shuffled pandas index
    # from train_test_split, which from_pandas would otherwise store as an
    # extra '__index_level_0__' column alongside the model inputs.
    dataset = Dataset.from_pandas(frame[['text', 'label']], preserve_index=False)
    return dataset.map(tokenize, batched=True, remove_columns=['text'])


train_dataset = _to_dataset(train_df)
val_dataset = _to_dataset(val_df)
test_dataset = _to_dataset(test_df)

In [None]:
# Training config
# Load the metric objects once. compute_metrics runs at every evaluation
# (once per epoch with the arguments below), and calling evaluate.load()
# inside it would re-resolve both metrics each time.
_ACCURACY_METRIC = evaluate.load("accuracy")
_F1_METRIC = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: A pair ``(predictions, labels)``; ``predictions`` holds
            per-class scores and is reduced with argmax over axis 1.

    Returns:
        dict with keys ``'accuracy'`` and ``'f1'`` (macro average).
    """
    scores, labels = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    acc = _ACCURACY_METRIC.compute(predictions=predicted_ids, references=labels)
    f1 = _F1_METRIC.compute(predictions=predicted_ids, references=labels, average='macro')
    return {'accuracy': acc['accuracy'], 'f1': f1['f1']}

training_args = TrainingArguments(
    # Checkpointing: one checkpoint per epoch, at most three kept on disk.
    output_dir=MODEL_SAVE_PATH,
    save_strategy="epoch",
    save_total_limit=3,
    # Optimisation schedule.
    num_train_epochs=30,
    learning_rate=5e-5,
    warmup_steps=100,
    weight_decay=0.01,
    # Batching: 16 per device x 2 accumulation steps for training.
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    per_device_eval_batch_size=32,
    # Evaluate every epoch and restore the checkpoint with the highest F1.
    eval_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    # Runtime / logging.
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    logging_steps=10,
    report_to="none",
    seed=SEED,
)

# Early stopping with a patience of 5; evaluation is configured per epoch in
# training_args, with F1 as the tracked metric.
early_stopping = EarlyStoppingCallback(early_stopping_patience=5)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[early_stopping],
)

# Human-readable summary of the run configuration, printed as one block.
config_summary = [
    "\nTraining Config:",
    "- Model: DistilBERT single-stage",
    "- Epochs: 30 (early stop after 5 epochs no improvement)",
    "- LoRA rank: r=16, alpha=32",
    "- Learning rate: 5e-5",
    "- Batch size: 16 x 2 accumulation = 32 effective",
    f"- Train samples: {len(train_dataset)}",
]
print("\n".join(config_summary))

In [None]:
# Train
# Runs the fine-tuning loop using the training arguments, datasets and
# early-stopping callback that were passed to the Trainer.
trainer.train()

In [None]:
# Evaluate on test set
# A single predict() call returns both the raw logits and the metrics, so the
# test set is only run through the model once. metric_key_prefix="eval" keeps
# the 'eval_accuracy' / 'eval_f1' keys that the rest of the notebook reads
# from `results`.
predictions = trainer.predict(test_dataset, metric_key_prefix="eval")
results = predictions.metrics
print(f"\nTest Accuracy: {results['eval_accuracy']:.4f}")
print(f"Test F1: {results['eval_f1']:.4f}")

y_pred = np.argmax(predictions.predictions, axis=1)
y_true = test_df['label'].values

# Restrict the report to classes that actually occur in the test split, so
# target_names lines up with the labels being reported.
present_labels = np.unique(y_true)
filtered_names = [id2emotion[label] for label in present_labels]

print("\nClassification Report:")
print(classification_report(y_true, y_pred, labels=present_labels, target_names=filtered_names, digits=4))

In [None]:
# Confusion matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Pin the label set to every class id. Without `labels=`, confusion_matrix
# only includes classes that appear in y_true or y_pred, so a class missing
# from the test split would shrink the matrix and shift the EMOTIONS tick
# labels onto the wrong rows/columns.
all_label_ids = list(range(len(EMOTIONS)))
cm = confusion_matrix(y_true, y_pred, labels=all_label_ids)

plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=EMOTIONS, yticklabels=EMOTIONS)
plt.title('DistilBERT v4 Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

In [None]:
# Save model
# Both the PEFT-wrapped model and the tokenizer are written to the same
# directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained(MODEL_SAVE_PATH)
print(f"Model saved to {MODEL_SAVE_PATH}")

In [None]:
# Compare all versions
# v2 and v3 figures are fixed results from earlier runs. The v4 row is read
# from the objects used in this notebook so it cannot drift from the real
# configuration (it previously claimed 1000 samples and 20 epochs).
V2_ACCURACY = 0.61  # v2 baseline that v4 is measured against

v4_accuracy = results['eval_accuracy']
v4_f1 = results['eval_f1']

print("\n" + "="*70)
print("MODEL COMPARISON: v2 vs v3 (FinBERT) vs v4 (DistilBERT)")
print("="*70)

comparison = pd.DataFrame({
    'Version': ['v2', 'v3', 'v4'],
    'Base Model': ['DistilBERT', 'FinBERT', 'DistilBERT'],
    'Data Size': ['200', '1000', str(len(df))],
    'LoRA Rank': ['r=8', 'r=8', f"r={lora_config.r}"],
    'Epochs': ['10', '10', f"{training_args.num_train_epochs:g}"],
    'LR': ['2e-4', '1e-4', f"{training_args.learning_rate:g}"],
    'Accuracy': ['61.0%', '42.9%', f"{v4_accuracy:.1%}"],
    'F1': ['0.610', '0.246', f"{v4_f1:.3f}"]
})

print(comparison.to_string(index=False))
print("="*70)

# Gain over v2 in percentage points.
gain_pp = (v4_accuracy - V2_ACCURACY) * 100

if v4_accuracy > 0.68:
    print(f"\nSUCCESS: v4 beats v2 by {gain_pp:.1f}pp")
    print("DistilBERT with more data and better config wins.")
elif v4_accuracy > V2_ACCURACY:
    print(f"\nIMPROVED: v4 beats v2 by {gain_pp:.1f}pp")
    print("5x more data made the difference.")
else:
    print(f"\nNo improvement over v2. Accuracy: {v4_accuracy:.1%}")
    print("Next steps: Check data quality or try RoBERTa-base")