<a href="https://colab.research.google.com/github/usp787/DS_5110_Final_Project_LoRA/blob/Code/DS_project_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
import numpy as np
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

In [None]:
def load_goemotions_data():
    """
    Fetch the 'simplified' GoEmotions configuration and return its splits.

    Prints the size of each predefined split, then returns them as a
    (train, validation, test) tuple.
    """
    print("\nLoading GoEmotions dataset...")
    dataset = load_dataset('google-research-datasets/go_emotions', 'simplified')

    # Report split sizes before handing the splits back
    print(f"Train samples: {len(dataset['train']):,}")
    print(f"Validation samples: {len(dataset['validation']):,}")
    print(f"Test samples: {len(dataset['test']):,}")

    return tuple(dataset[split] for split in ('train', 'validation', 'test'))

In [None]:
def prepare_batch(batch_data, tokenizer, max_length=128, num_labels=28):
    """
    Convert a batch of raw examples into model inputs.

    Args:
        batch_data: Sequence of examples; each is indexed with 'text' (the
            string to tokenize) and 'labels' (an iterable of integer class ids).
        tokenizer: Callable invoked as
            tokenizer(texts, max_length=..., padding='max_length',
            truncation=True, return_tensors='pt'); its result is indexed with
            'input_ids' and 'attention_mask'.
        max_length: Length every sequence is padded/truncated to.
        num_labels: Width of the multi-hot label vectors. Defaults to 28,
            the value this function previously hard-coded.

    Returns:
        Tuple (input_ids, attention_mask, labels) where labels is a float
        tensor of shape (len(batch_data), num_labels) holding 1.0 at every
        class id listed for that example and 0.0 elsewhere.
    """
    texts = [item['text'] for item in batch_data]

    # Tokenize
    encoding = tokenizer(
        texts,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    # Multi-label: one multi-hot row per example
    labels = torch.zeros(len(batch_data), num_labels)
    for row, item in enumerate(batch_data):
        label_ids = list(item['labels'])
        if label_ids:  # skip examples that carry no labels
            labels[row, label_ids] = 1.0

    return encoding['input_ids'], encoding['attention_mask'], labels

In [None]:
def create_dataloaders(train_data, val_data, test_data, tokenizer, batch_size=32, max_length=128):
    """
    Build train/validation/test DataLoaders over raw examples.

    Each split is wrapped in a minimal index-based Dataset; tokenization and
    label encoding happen per batch inside the collate function, which
    delegates to prepare_batch. Only the training loader shuffles.

    Returns:
        Tuple (train_loader, val_loader, test_loader).
    """
    class SimpleDataset(Dataset):
        """Thin adapter exposing any indexable collection as a Dataset."""

        def __init__(self, data):
            self.data = data

        def __getitem__(self, idx):
            return self.data[idx]

        def __len__(self):
            return len(self.data)

    def collate_fn(batch):
        # Tokenize the raw examples of this batch and pack them into a dict
        ids, mask, targets = prepare_batch(batch, tokenizer, max_length)
        return {'input_ids': ids, 'attention_mask': mask, 'labels': targets}

    def make_loader(split, shuffle):
        return DataLoader(
            SimpleDataset(split),
            batch_size=batch_size,
            shuffle=shuffle,
            collate_fn=collate_fn,
        )

    train_loader = make_loader(train_data, True)
    val_loader = make_loader(val_data, False)
    test_loader = make_loader(test_data, False)

    return train_loader, val_loader, test_loader

In [None]:
def build_model(num_labels=28, dropout=0.1, freeze_backbone=True, model_name='distilroberta-base'):
    """
    Build a multi-label classifier: pre-trained encoder + MLP head.

    Args:
        num_labels: Number of output logits (one per class).
        dropout: Dropout probability used inside the classifier head.
        freeze_backbone: If True, all backbone parameters get
            requires_grad=False, the backbone is kept permanently in eval
            mode, and it is run without gradient tracking.
        model_name: Checkpoint id passed to AutoModel.from_pretrained.
            Defaults to 'distilroberta-base', the previously hard-coded value.

    Returns:
        An nn.Module whose forward(input_ids, attention_mask) returns raw
        logits of shape (batch, num_labels). Its state_dict keys are
        'backbone.*' and 'classifier.*'.
    """
    # Load pre-trained encoder
    backbone = AutoModel.from_pretrained(model_name)
    hidden_size = backbone.config.hidden_size  # 768 for distilroberta-base

    # Optionally freeze backbone
    if freeze_backbone:
        for param in backbone.parameters():
            param.requires_grad = False
        # A frozen feature extractor should be deterministic, so leave its
        # train-only layers (e.g. dropout) switched off from the start.
        backbone.eval()
        print("✓ Backbone FROZEN (no fine-tuning)")
    else:
        print("✓ Backbone UNFROZEN (will fine-tune)")

    # Build classifier head
    classifier = nn.Sequential(
        nn.Dropout(dropout),
        nn.Linear(hidden_size, hidden_size // 2),
        nn.ReLU(),
        nn.Dropout(dropout),
        nn.Linear(hidden_size // 2, num_labels)
    )

    # Wrap in simple module
    class EmotionClassifier(nn.Module):
        """Encoder followed by an MLP head applied to the first token's embedding."""

        def __init__(self, backbone, classifier, freeze_backbone):
            super().__init__()
            self.backbone = backbone
            self.classifier = classifier
            # Plain Python attribute (not a buffer), so checkpoints keep the
            # same keys as before.
            self.freeze_backbone = freeze_backbone

        def train(self, mode=True):
            # model.train() would otherwise flip the frozen backbone back
            # into training mode and re-enable its dropout.
            super().train(mode)
            if self.freeze_backbone:
                self.backbone.eval()
            return self

        def forward(self, input_ids, attention_mask):
            # Track gradients through the backbone only when it is being
            # fine-tuned and the caller has not disabled autograd.
            track_grad = torch.is_grad_enabled() and not self.freeze_backbone
            with torch.set_grad_enabled(track_grad):
                outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask)

            # Use [CLS] token (position 0 of the sequence)
            pooled = outputs.last_hidden_state[:, 0, :]
            logits = self.classifier(pooled)
            return logits

    model = EmotionClassifier(backbone, classifier, freeze_backbone)

    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Total params: {total_params:,}")
    print(f"Trainable params: {trainable_params:,} ({100*trainable_params/total_params:.2f}%)")

    return model

In [None]:
def compute_metrics(predictions, labels, threshold=0.5):
    """
    Score multi-label predictions against ground-truth labels.

    predictions are thresholded (>= threshold counts as positive) and both
    arrays are cast to int before scoring. Returns a dict with micro- and
    macro-averaged F1/precision/recall, exact-match ('subset') accuracy and
    the Hamming loss (fraction of individual label slots that disagree).
    """
    y_pred = (predictions >= threshold).astype(int)
    y_true = labels.astype(int)

    def averaged(score_fn, average):
        # Shared call shape for the sklearn precision/recall/F1 scorers
        return score_fn(y_true, y_pred, average=average, zero_division=0)

    return {
        # Micro: aggregate over all label decisions
        'micro_f1': averaged(f1_score, 'micro'),
        'micro_precision': averaged(precision_score, 'micro'),
        'micro_recall': averaged(recall_score, 'micro'),
        # Macro: unweighted mean over classes
        'macro_f1': averaged(f1_score, 'macro'),
        'macro_precision': averaged(precision_score, 'macro'),
        'macro_recall': averaged(recall_score, 'macro'),
        # Whole-row exact match and per-slot error rate
        'subset_accuracy': accuracy_score(y_true, y_pred),
        'hamming_loss': np.mean(y_true != y_pred)
    }


def print_metrics(metrics, prefix=""):
    """Print a metrics dict as an indented report, one metric per line."""
    report_lines = [
        f"\n{prefix}Metrics:",
        f"  Micro-F1: {metrics['micro_f1']:.4f} (weighted by frequency)",
        f"  Macro-F1: {metrics['macro_f1']:.4f} (equal weight per class)",
        f"  Micro-Precision: {metrics['micro_precision']:.4f}",
        f"  Micro-Recall: {metrics['micro_recall']:.4f}",
        f"  Macro-Precision: {metrics['macro_precision']:.4f}",
        f"  Macro-Recall: {metrics['macro_recall']:.4f}",
        f"  Subset Accuracy: {metrics['subset_accuracy']:.4f}",
        f"  Hamming Loss: {metrics['hamming_loss']:.4f}",
    ]
    for line in report_lines:
        print(line)

In [None]:
def train_one_epoch(model, dataloader, optimizer, criterion, device):
    """
    Run one optimisation pass over dataloader.

    Puts the model in training mode, performs a forward/backward/step cycle
    per batch, and returns the mean of the per-batch losses.
    """
    model.train()
    running_loss = 0

    batches = tqdm(dataloader, desc="Training")
    for batch in batches:
        # Transfer the three batch tensors to the target device
        ids, mask, targets = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        )

        # Forward pass and loss
        loss = criterion(model(ids, mask), targets)

        # Clear stale gradients, backpropagate, update weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate and show the current batch loss on the progress bar
        running_loss += loss.item()
        batches.set_postfix({'loss': f"{loss.item():.4f}"})

    return running_loss / len(dataloader)

In [None]:
def evaluate_model(model, dataloader, criterion, device):
    """
    Score the model on every batch of dataloader without gradient tracking.

    Returns the dict produced by compute_metrics on the sigmoid
    probabilities, extended with 'loss' (mean per-batch loss).
    """
    model.eval()
    running_loss = 0
    prob_chunks, label_chunks = [], []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['labels'].to(device)

            logits = model(ids, mask)
            running_loss += criterion(logits, targets).item()

            # Independent per-class probabilities (multi-label setting)
            prob_chunks.append(torch.sigmoid(logits).cpu().numpy())
            label_chunks.append(targets.cpu().numpy())

    # Stack the per-batch arrays into one array per quantity and score them
    scores = compute_metrics(np.vstack(prob_chunks), np.vstack(label_chunks))
    return {**scores, 'loss': running_loss / len(dataloader)}

In [None]:
def train_model(model, train_loader, val_loader, device, epochs=5, lr=1e-3,
                checkpoint_path='best_baseline_model.pt'):
    """
    Main training loop with per-epoch validation and best-model checkpointing.

    Args:
        model: Module to train; only parameters with requires_grad=True are
            handed to the optimizer.
        train_loader: Batches used for optimisation.
        val_loader: Batches used for model selection.
        device: Device the batches are moved to.
        epochs: Number of passes over train_loader.
        lr: Adam learning rate.
        checkpoint_path: Where the best state_dict is written. Defaults to
            'best_baseline_model.pt', the previously hard-coded path.

    Returns:
        The best validation macro-F1 observed (0 if epochs is 0).
    """
    # Setup
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(
        filter(lambda p: p.requires_grad, model.parameters()),
        lr=lr
    )

    best_val_macro_f1 = 0
    # Tracks whether this run has written a checkpoint yet. Without it, a run
    # whose macro-F1 never rises above 0 would save nothing, and a later load
    # of checkpoint_path would fail or pick up a stale file from another run.
    checkpoint_saved = False

    print(f"\n{'='*60}")
    print("Training Loop")
    print(f"{'='*60}")

    for epoch in range(epochs):
        print(f"\nEpoch {epoch + 1}/{epochs}")
        print("-" * 40)

        # Train
        train_loss = train_one_epoch(model, train_loader, optimizer, criterion, device)
        print(f"Train Loss: {train_loss:.4f}")

        # Validate
        val_metrics = evaluate_model(model, val_loader, criterion, device)
        print(f"Val Loss: {val_metrics['loss']:.4f}")
        print_metrics(val_metrics, prefix="Validation ")

        # Save best model (the first evaluated epoch is always saved)
        if not checkpoint_saved or val_metrics['macro_f1'] > best_val_macro_f1:
            best_val_macro_f1 = val_metrics['macro_f1']
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_saved = True
            print(f"  ✓ New best model saved! (Macro-F1: {best_val_macro_f1:.4f})")

    return best_val_macro_f1

In [None]:
def main():
    """
    Run the frozen-backbone baseline end to end.

    Pipeline: load data -> tokenizer -> dataloaders -> model -> train with
    validation-based checkpointing -> reload best checkpoint -> test report.
    """
    # Hyperparameters and device selection
    BATCH_SIZE, MAX_LENGTH = 32, 128
    EPOCHS, LR = 5, 1e-3
    if torch.cuda.is_available():
        DEVICE = torch.device('cuda')
    else:
        DEVICE = torch.device('cpu')

    print(f"\n{'='*60}")
    print("DistilRoBERTa Baseline (NO Fine-tuning)")
    print(f"{'='*60}")
    print(f"Device: {DEVICE}\n")

    # 1) Dataset splits, in (train, validation, test) order
    splits = load_goemotions_data()

    # 2) Tokenizer matching the backbone checkpoint
    print("\nLoading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained('distilroberta-base')

    # 3) Batched loaders for each split
    print("\nCreating dataloaders...")
    train_loader, val_loader, test_loader = create_dataloaders(
        *splits, tokenizer, batch_size=BATCH_SIZE, max_length=MAX_LENGTH
    )

    # 4) Frozen encoder + trainable head, moved to the chosen device
    print("\nBuilding model...")
    model = build_model(num_labels=28, dropout=0.1, freeze_backbone=True).to(DEVICE)

    # 5) Fit the head; the best checkpoint is written to disk during training
    best_macro_f1 = train_model(
        model, train_loader, val_loader, DEVICE, epochs=EPOCHS, lr=LR
    )

    # 6) Restore the best checkpoint and score it on the held-out test split
    print(f"\n{'='*60}")
    print("Final Test Evaluation (Unseen Data)")
    print(f"{'='*60}")

    best_state = torch.load('best_baseline_model.pt')
    model.load_state_dict(best_state)
    test_metrics = evaluate_model(model, test_loader, nn.BCEWithLogitsLoss(), DEVICE)

    print_metrics(test_metrics, prefix="Test ")

    print(f"\n{'='*60}")
    print("Baseline Complete!")
    print(f"{'='*60}")
    print(f"\nKey Results:")
    print(f"  Best Val Macro-F1: {best_macro_f1:.4f}")
    print(f"  Test Macro-F1: {test_metrics['macro_f1']:.4f}")
    print(f"  Test Micro-F1: {test_metrics['micro_f1']:.4f}")
    print(f"\nReady for LoRA comparison!")


# Script entry point
if __name__ == "__main__":
    main()


DistilRoBERTa Baseline (NO Fine-tuning)
Device: cuda


Loading GoEmotions dataset...
Train samples: 43,410
Validation samples: 5,426
Test samples: 5,427

Loading tokenizer...

Creating dataloaders...

Building model...
✓ Backbone FROZEN (no fine-tuning)
Total params: 82,424,476
Trainable params: 306,076 (0.37%)

Training Loop

Epoch 1/5
----------------------------------------


Training: 100%|██████████| 1357/1357 [02:11<00:00, 10.30it/s, loss=0.1268]


Train Loss: 0.1408


Evaluating: 100%|██████████| 170/170 [00:15<00:00, 11.06it/s]


Val Loss: 0.1238

Validation Metrics:
  Micro-F1: 0.2029 (weighted by frequency)
  Macro-F1: 0.0389 (equal weight per class)
  Micro-Precision: 0.6632
  Micro-Recall: 0.1197
  Macro-Precision: 0.1381
  Macro-Recall: 0.0261
  Subset Accuracy: 0.1231
  Hamming Loss: 0.0395
  ✓ New best model saved! (Macro-F1: 0.0389)

Epoch 2/5
----------------------------------------


Training: 100%|██████████| 1357/1357 [02:20<00:00,  9.64it/s, loss=0.1132]


Train Loss: 0.1224


Evaluating: 100%|██████████| 170/170 [00:16<00:00, 10.57it/s]


Val Loss: 0.1163

Validation Metrics:
  Micro-F1: 0.2670 (weighted by frequency)
  Macro-F1: 0.0940 (equal weight per class)
  Micro-Precision: 0.6619
  Micro-Recall: 0.1672
  Macro-Precision: 0.2452
  Macro-Recall: 0.0736
  Subset Accuracy: 0.1587
  Hamming Loss: 0.0386
  ✓ New best model saved! (Macro-F1: 0.0940)

Epoch 3/5
----------------------------------------


Training: 100%|██████████| 1357/1357 [02:24<00:00,  9.39it/s, loss=0.0720]


Train Loss: 0.1179


Evaluating: 100%|██████████| 170/170 [00:16<00:00, 10.51it/s]


Val Loss: 0.1121

Validation Metrics:
  Micro-F1: 0.2648 (weighted by frequency)
  Macro-F1: 0.0979 (equal weight per class)
  Micro-Precision: 0.7020
  Micro-Recall: 0.1632
  Macro-Precision: 0.2710
  Macro-Recall: 0.0709
  Subset Accuracy: 0.1600
  Hamming Loss: 0.0381
  ✓ New best model saved! (Macro-F1: 0.0979)

Epoch 4/5
----------------------------------------


Training: 100%|██████████| 1357/1357 [02:25<00:00,  9.33it/s, loss=0.0838]


Train Loss: 0.1159


Evaluating: 100%|██████████| 170/170 [00:16<00:00, 10.51it/s]


Val Loss: 0.1094

Validation Metrics:
  Micro-F1: 0.3158 (weighted by frequency)
  Macro-F1: 0.1011 (equal weight per class)
  Micro-Precision: 0.6857
  Micro-Recall: 0.2052
  Macro-Precision: 0.2584
  Macro-Recall: 0.0738
  Subset Accuracy: 0.2035
  Hamming Loss: 0.0373
  ✓ New best model saved! (Macro-F1: 0.1011)

Epoch 5/5
----------------------------------------


Training: 100%|██████████| 1357/1357 [02:26<00:00,  9.29it/s, loss=0.1102]


Train Loss: 0.1147


Evaluating: 100%|██████████| 170/170 [00:16<00:00, 10.47it/s]


Val Loss: 0.1082

Validation Metrics:
  Micro-F1: 0.3317 (weighted by frequency)
  Macro-F1: 0.1200 (equal weight per class)
  Micro-Precision: 0.6778
  Micro-Recall: 0.2196
  Macro-Precision: 0.3495
  Macro-Recall: 0.0900
  Subset Accuracy: 0.2147
  Hamming Loss: 0.0372
  ✓ New best model saved! (Macro-F1: 0.1200)

Final Test Evaluation (Unseen Data)


Evaluating: 100%|██████████| 170/170 [00:16<00:00, 10.50it/s]


Test Metrics:
  Micro-F1: 0.3394 (weighted by frequency)
  Macro-F1: 0.1245 (equal weight per class)
  Micro-Precision: 0.6980
  Micro-Recall: 0.2242
  Macro-Precision: 0.3186
  Macro-Recall: 0.0943
  Subset Accuracy: 0.2171
  Hamming Loss: 0.0364

Baseline Complete!

Key Results:
  Best Val Macro-F1: 0.1200
  Test Macro-F1: 0.1245
  Test Micro-F1: 0.3394

Ready for LoRA comparison!



