<a href="https://colab.research.google.com/github/usp787/DS_5110_Final_Project_LoRA/blob/Code/LoRA_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
import numpy as np
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

In [2]:
def load_goemotions_data():
    """Download/load the 'simplified' GoEmotions configuration.

    Prints the size of each split and returns the splits as a
    ``(train, validation, test)`` tuple, in that order.
    """
    print("\nLoading GoEmotions dataset...")
    splits = load_dataset('google-research-datasets/go_emotions', 'simplified')

    # Pull the three pre-defined splits out once, then report and return them.
    train_split = splits['train']
    val_split = splits['validation']
    test_split = splits['test']

    print(f"Train samples: {len(train_split):,}")
    print(f"Validation samples: {len(val_split):,}")
    print(f"Test samples: {len(test_split):,}")

    return train_split, val_split, test_split

In [3]:
def prepare_batch(batch_data, tokenizer, max_length=128, num_labels=28):
    """
    Convert a raw text batch into model inputs.

    Tokenization happens here (per batch) rather than in a Dataset class, so
    the tokenization settings are easy to change in one place.

    Args:
        batch_data: Sequence of examples. Each example is indexed with
            'text' (the string to tokenize) and 'labels' (an iterable of
            integer class ids).
        tokenizer: Callable invoked as ``tokenizer(texts, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')``;
            its result is indexed with 'input_ids' and 'attention_mask'.
        max_length: Length every sequence is padded / truncated to.
        num_labels: Width of the multi-hot label vector. Defaults to 28,
            the value that used to be hard-coded here.

    Returns:
        Tuple ``(input_ids, attention_mask, labels)``. ``labels`` is a float
        tensor of shape ``(len(batch_data), num_labels)`` holding 1 at every
        label id of the example and 0 elsewhere.

    Raises:
        IndexError: If a label id is outside ``[0, num_labels)``. Negative
            ids are rejected explicitly because tensor indexing would
            otherwise wrap around and silently mark the wrong class.
    """
    texts = [item['text'] for item in batch_data]

    # Tokenize the whole batch at once; every row is padded to max_length.
    encoding = tokenizer(
        texts,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    # Multi-label: convert each list of class ids to a multi-hot vector.
    labels = torch.zeros(len(batch_data), num_labels)
    for i, item in enumerate(batch_data):
        for label_id in item['labels']:
            if not 0 <= label_id < num_labels:
                raise IndexError(
                    f"label id {label_id} is out of range for num_labels={num_labels}"
                )
            labels[i, label_id] = 1

    return encoding['input_ids'], encoding['attention_mask'], labels

In [4]:
def create_dataloaders(train_data, val_data, test_data, tokenizer, batch_size=32, max_length=128):
    """
    Build the train / validation / test DataLoaders over raw examples.

    Each split is wrapped in a thin pass-through Dataset. Tokenization and
    label encoding are deferred to the collate function, which delegates to
    prepare_batch for every batch. Only the training loader shuffles.

    Returns:
        Tuple ``(train_loader, val_loader, test_loader)``.
    """
    class SimpleDataset(Dataset):
        """Pass-through wrapper exposing len() and integer indexing of `data`."""

        def __init__(self, data):
            self.data = data

        def __getitem__(self, idx):
            return self.data[idx]

        def __len__(self):
            return len(self.data)

    def collate_fn(batch):
        # Turn a list of raw examples into tensors, one batch at a time.
        ids, mask, targets = prepare_batch(batch, tokenizer, max_length)
        return {
            'input_ids': ids,
            'attention_mask': mask,
            'labels': targets
        }

    def make_loader(split, shuffle=False):
        # All three loaders share batch size and collate function.
        return DataLoader(
            SimpleDataset(split),
            batch_size=batch_size,
            shuffle=shuffle,
            collate_fn=collate_fn,
        )

    train_loader = make_loader(train_data, shuffle=True)
    val_loader = make_loader(val_data)
    test_loader = make_loader(test_data)

    return train_loader, val_loader, test_loader

In [5]:
# Import the PEFT library (Hugging Face) for LoRA
from peft import LoraConfig, get_peft_model, TaskType

In [6]:
def build_model(num_labels=28, dropout=0.1, rank=16, lora_alpha=32, lora_dropout=0.1):
    """
    Build DistilRoBERTa with LoRA adapters and an MLP classification head.

    Args:
        num_labels: Number of output logits produced by the head.
        dropout: Dropout probability used inside the classification head.
        rank: LoRA rank ``r`` (dimension of the low-rank update matrices).
        lora_alpha: LoRA scaling factor. Defaults to 32, the value that was
            previously hard-coded (2x the default rank). When changing
            ``rank``, consider passing ``lora_alpha=2 * rank`` to keep that
            ratio.
        lora_dropout: Dropout probability applied inside the LoRA adapters.

    Returns:
        An ``nn.Module`` whose ``forward(input_ids, attention_mask)`` returns
        the raw (pre-sigmoid) logits from the classification head, one row per
        input sequence and ``num_labels`` columns.
    """
    # 1. Load the pre-trained backbone (standard AutoModel).
    # Freezing of the base weights happens in get_peft_model below, not here.
    backbone = AutoModel.from_pretrained('distilroberta-base')

    # 2. Configure LoRA.
    # We target the attention projections, which RoBERTa-style models name
    # 'key', 'query' and 'value'.
    peft_config = LoraConfig(
        task_type=TaskType.FEATURE_EXTRACTION,  # backbone feeds our own head
        r=rank,                                 # low-rank dimension (e.g. 8, 16, 64)
        lora_alpha=lora_alpha,                  # scaling factor (commonly 2x rank)
        lora_dropout=lora_dropout,              # regularization inside the adapters
        target_modules=['key', 'query', 'value']
    )

    # 3. Inject adapters.
    # After this call only the adapter weights of the backbone are trainable;
    # the original backbone weights are frozen.
    backbone = get_peft_model(backbone, peft_config)
    print("\n✓ LoRA Adapters injected into Backbone")
    backbone.print_trainable_parameters()  # counts backbone params only, not the head

    # 4. Wrapper that joins the adapted backbone and the classification head.
    class EmotionClassifier(nn.Module):
        """Backbone + head; forward returns logits for multi-label classification."""

        def __init__(self, backbone, classifier):
            super().__init__()
            self.backbone = backbone
            self.classifier = classifier

        def forward(self, input_ids, attention_mask):
            # The PEFT-wrapped backbone applies the LoRA updates internally.
            outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask)

            # Use the first token's hidden state (<s>, RoBERTa's [CLS] equivalent)
            # as the sequence representation.
            pooled = outputs.last_hidden_state[:, 0, :]
            logits = self.classifier(pooled)
            return logits

    # Classification head: hidden -> hidden/2 -> num_labels, with dropout.
    # It is a plain nn.Sequential outside the PEFT wrapper, so it stays trainable.
    hidden_size = backbone.config.hidden_size
    classifier_head = nn.Sequential(
        nn.Dropout(dropout),
        nn.Linear(hidden_size, hidden_size // 2),
        nn.ReLU(),
        nn.Dropout(dropout),
        nn.Linear(hidden_size // 2, num_labels)
    )

    # Combine them
    model = EmotionClassifier(backbone, classifier_head)

    return model

In [7]:
def compute_metrics(predictions, labels, threshold=0.5):
    """
    Compute multi-label classification metrics.

    Args:
        predictions: NumPy array of per-class scores; an entry counts as a
            positive prediction when it is ``>= threshold``.
        labels: NumPy array of ground-truth indicators, same shape as
            ``predictions``; cast to int before scoring.
        threshold: Decision threshold applied to ``predictions``.

    Returns:
        Dict with micro/macro F1, precision and recall, the exact-match
        ('subset_accuracy') score and the fraction of mismatching label
        entries ('hamming_loss').
    """
    y_pred = (predictions >= threshold).astype(int)
    y_true = labels.astype(int)

    def averaged(scorer, average):
        # Undefined scores (no positives) are reported as 0.
        return scorer(y_true, y_pred, average=average, zero_division=0)

    results = {
        # MICRO: aggregate over all (sample, label) decisions globally.
        'micro_f1': averaged(f1_score, 'micro'),
        'micro_precision': averaged(precision_score, 'micro'),
        'micro_recall': averaged(recall_score, 'micro'),
        # MACRO: unweighted mean of the per-class scores.
        'macro_f1': averaged(f1_score, 'macro'),
        'macro_precision': averaged(precision_score, 'macro'),
        'macro_recall': averaged(recall_score, 'macro'),
    }

    # Exact match: every label of a sample must be right.
    results['subset_accuracy'] = accuracy_score(y_true, y_pred)
    # Label-wise error rate.
    results['hamming_loss'] = np.mean(y_true != y_pred)

    return results


def print_metrics(metrics, prefix=""):
    """Print the metrics dict as an indented report, four decimals per value.

    ``prefix`` is placed directly in front of the word "Metrics:" in the
    heading (e.g. "Validation " or "Test ").
    """
    report = [
        f"\n{prefix}Metrics:",
        f"  Micro-F1: {metrics['micro_f1']:.4f} (weighted by frequency)",
        f"  Macro-F1: {metrics['macro_f1']:.4f} (equal weight per class)",
        f"  Micro-Precision: {metrics['micro_precision']:.4f}",
        f"  Micro-Recall: {metrics['micro_recall']:.4f}",
        f"  Macro-Precision: {metrics['macro_precision']:.4f}",
        f"  Macro-Recall: {metrics['macro_recall']:.4f}",
        f"  Subset Accuracy: {metrics['subset_accuracy']:.4f}",
        f"  Hamming Loss: {metrics['hamming_loss']:.4f}",
    ]
    # One write; the emitted text is the same as printing line by line.
    print("\n".join(report))

In [8]:
def train_one_epoch(model, dataloader, optimizer, criterion, device):
    """
    Run one optimization pass over `dataloader`.

    The model is switched to train mode. For every batch the tensors under
    'input_ids', 'attention_mask' and 'labels' are moved to `device`, the
    loss is computed from the model's logits, and one optimizer step is
    taken. The current batch loss is shown on the progress bar.

    Returns:
        The sum of per-batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0

    batches = tqdm(dataloader, desc="Training")
    for batch in batches:
        # Transfer the batch tensors to the target device.
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        targets = batch['labels'].to(device)

        # Forward pass and loss.
        loss = criterion(model(ids, mask), targets)

        # Clear stale gradients, back-propagate, update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Book-keeping: read the scalar once and reuse it.
        batch_loss = loss.item()
        running_loss += batch_loss
        batches.set_postfix({'loss': f"{batch_loss:.4f}"})

    return running_loss / len(dataloader)

In [9]:
def evaluate_model(model, dataloader, criterion, device, threshold=0.5):
    """
    Evaluate the model on a dataset.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns logits.
        dataloader: Iterable of batches indexed with 'input_ids',
            'attention_mask' and 'labels'.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device the batch tensors are moved to.
        threshold: Probability cut-off forwarded to compute_metrics for
            binarizing the sigmoid outputs. Defaults to 0.5, which is what
            compute_metrics used implicitly before this parameter existed.

    Returns:
        The dict produced by compute_metrics, plus a 'loss' entry holding the
        sum of per-batch losses divided by the number of batches.

    Raises:
        ValueError: If the dataloader yields no batches (there is nothing to
            score and the mean loss is undefined).
    """
    model.eval()
    total_loss = 0
    all_predictions = []
    all_labels = []

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            # Move to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Forward
            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)

            # Independent per-class probabilities (multi-label => sigmoid).
            probs = torch.sigmoid(logits)

            # Collect on CPU as NumPy so the batches can be stacked later.
            all_predictions.append(probs.cpu().numpy())
            all_labels.append(labels.cpu().numpy())
            total_loss += loss.item()

    if not all_predictions:
        # Fail with a clear message instead of an opaque np.vstack error.
        raise ValueError("evaluate_model received an empty dataloader")

    # Aggregate all batches row-wise.
    predictions = np.vstack(all_predictions)
    labels = np.vstack(all_labels)

    # Compute metrics
    metrics = compute_metrics(predictions, labels, threshold=threshold)
    metrics['loss'] = total_loss / len(all_predictions)

    return metrics

In [10]:
def train_model(model, train_loader, val_loader, device, epochs=5, lr=1e-3,
                save_path='best_LoRA_model.pt'):
    """
    Main training loop with per-epoch validation and best-model checkpointing.

    Args:
        model: Module to train; only parameters with ``requires_grad=True``
            are handed to the optimizer.
        train_loader: Batches used for optimization.
        val_loader: Batches used for validation after every epoch.
        device: Device the batches are moved to.
        epochs: Number of passes over ``train_loader``.
        lr: Adam learning rate.
        save_path: Where the best ``state_dict`` is written. Defaults to
            'best_LoRA_model.pt', the path that was previously hard-coded.

    Returns:
        The best validation macro-F1 observed (0 when ``epochs`` is 0).
    """
    # Setup
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(
        (p for p in model.parameters() if p.requires_grad),
        lr=lr
    )

    best_val_macro_f1 = 0
    # Tracks whether this run has written a checkpoint yet. Without it, a run
    # whose macro-F1 never rises above 0 would save nothing, and a later load
    # of save_path would fail or pick up a stale file from an earlier run.
    checkpoint_saved = False

    print(f"\n{'='*60}")
    print("Training Loop")
    print(f"{'='*60}")

    for epoch in range(epochs):
        print(f"\nEpoch {epoch + 1}/{epochs}")
        print("-" * 40)

        # Train
        train_loss = train_one_epoch(model, train_loader, optimizer, criterion, device)
        print(f"Train Loss: {train_loss:.4f}")

        # Validate
        val_metrics = evaluate_model(model, val_loader, criterion, device)
        print(f"Val Loss: {val_metrics['loss']:.4f}")
        print_metrics(val_metrics, prefix="Validation ")

        # Save best model (the first evaluated epoch always counts as best).
        if not checkpoint_saved or val_metrics['macro_f1'] > best_val_macro_f1:
            best_val_macro_f1 = val_metrics['macro_f1']
            torch.save(model.state_dict(), save_path)
            checkpoint_saved = True
            print(f"  ✓ New best model saved! (Macro-F1: {best_val_macro_f1:.4f})")

    return best_val_macro_f1

In [12]:
def main():
    """
    Run the full pipeline: load data, build the LoRA model, train, and test.

    Steps: seed the RNGs, load GoEmotions, load the tokenizer, build the
    dataloaders, build the model, train with per-epoch validation, then reload
    the best checkpoint and report metrics on the test split.
    """
    # Config
    BATCH_SIZE = 32
    MAX_LENGTH = 128
    EPOCHS = 5
    LR = 1e-3
    SEED = 42
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Seed the RNGs so head initialization, dropout and shuffle order repeat
    # from run to run (some GPU kernels may still be nondeterministic).
    torch.manual_seed(SEED)
    np.random.seed(SEED)

    print(f"\n{'='*60}")
    print("DistilRoBERTa LoRA")
    print(f"{'='*60}")
    print(f"Device: {DEVICE}\n")

    # Step 1: Load data
    train_data, val_data, test_data = load_goemotions_data()

    # Step 2: Load tokenizer
    print("\nLoading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained('distilroberta-base')

    # Step 3: Create dataloaders
    print("\nCreating dataloaders...")
    train_loader, val_loader, test_loader = create_dataloaders(
        train_data, val_data, test_data, tokenizer,
        batch_size=BATCH_SIZE, max_length=MAX_LENGTH
    )

    # Step 4: Build model
    print("\nBuilding model...")
    model = build_model(num_labels=28, dropout=0.1)
    model = model.to(DEVICE)

    # Step 5: Train
    best_macro_f1 = train_model(
        model, train_loader, val_loader, DEVICE,
        epochs=EPOCHS, lr=LR
    )

    # Step 6: Final test evaluation
    print(f"\n{'='*60}")
    print("Final Test Evaluation (Unseen Data)")
    print(f"{'='*60}")

    # map_location lets a checkpoint written on GPU load on a CPU-only runtime.
    model.load_state_dict(torch.load('best_LoRA_model.pt', map_location=DEVICE))
    criterion = nn.BCEWithLogitsLoss()
    test_metrics = evaluate_model(model, test_loader, criterion, DEVICE)

    print_metrics(test_metrics, prefix="Test ")

    print(f"\n{'='*60}")
    print("LoRA Complete!")
    print(f"{'='*60}")
    print(f"\nKey Results:")
    print(f"  Best Val Macro-F1: {best_macro_f1:.4f}")
    print(f"  Test Macro-F1: {test_metrics['macro_f1']:.4f}")
    print(f"  Test Micro-F1: {test_metrics['micro_f1']:.4f}")
    print(f"\nReady for LoRA comparison!")


# Run the whole pipeline when this cell / script is executed directly.
if __name__ == "__main__":
    main()


DistilRoBERTa LoRA
Device: cuda


Loading GoEmotions dataset...
Train samples: 43,410
Validation samples: 5,426
Test samples: 5,427

Loading tokenizer...

Creating dataloaders...

Building model...


model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]


✓ LoRA Adapters injected into Backbone
trainable params: 442,368 || all params: 82,560,768 || trainable%: 0.5358

Training Loop

Epoch 1/5
----------------------------------------


Training: 100%|██████████| 1357/1357 [05:04<00:00,  4.46it/s, loss=0.1041]


Train Loss: 0.1209


Evaluating: 100%|██████████| 170/170 [00:17<00:00,  9.72it/s]


Val Loss: 0.0970

Validation Metrics:
  Micro-F1: 0.4865 (weighted by frequency)
  Macro-F1: 0.2817 (equal weight per class)
  Micro-Precision: 0.7112
  Micro-Recall: 0.3697
  Macro-Precision: 0.5197
  Macro-Recall: 0.2465
  Subset Accuracy: 0.3546
  Hamming Loss: 0.0328
  ✓ New best model saved! (Macro-F1: 0.2817)

Epoch 2/5
----------------------------------------


Training: 100%|██████████| 1357/1357 [05:03<00:00,  4.46it/s, loss=0.0835]


Train Loss: 0.0996


Evaluating: 100%|██████████| 170/170 [00:17<00:00,  9.70it/s]


Val Loss: 0.0942

Validation Metrics:
  Micro-F1: 0.5169 (weighted by frequency)
  Macro-F1: 0.3414 (equal weight per class)
  Micro-Precision: 0.7085
  Micro-Recall: 0.4069
  Macro-Precision: 0.5020
  Macro-Recall: 0.2916
  Subset Accuracy: 0.3892
  Hamming Loss: 0.0319
  ✓ New best model saved! (Macro-F1: 0.3414)

Epoch 3/5
----------------------------------------


Training: 100%|██████████| 1357/1357 [05:04<00:00,  4.46it/s, loss=0.0877]


Train Loss: 0.0962


Evaluating: 100%|██████████| 170/170 [00:17<00:00,  9.67it/s]


Val Loss: 0.0924

Validation Metrics:
  Micro-F1: 0.5273 (weighted by frequency)
  Macro-F1: 0.3766 (equal weight per class)
  Micro-Precision: 0.6981
  Micro-Recall: 0.4237
  Macro-Precision: 0.6039
  Macro-Recall: 0.3131
  Subset Accuracy: 0.4029
  Hamming Loss: 0.0319
  ✓ New best model saved! (Macro-F1: 0.3766)

Epoch 4/5
----------------------------------------


Training: 100%|██████████| 1357/1357 [05:04<00:00,  4.46it/s, loss=0.0811]


Train Loss: 0.0961


Evaluating: 100%|██████████| 170/170 [00:17<00:00,  9.71it/s]


Val Loss: 0.0931

Validation Metrics:
  Micro-F1: 0.5179 (weighted by frequency)
  Macro-F1: 0.3646 (equal weight per class)
  Micro-Precision: 0.7051
  Micro-Recall: 0.4092
  Macro-Precision: 0.6266
  Macro-Recall: 0.3031
  Subset Accuracy: 0.3957
  Hamming Loss: 0.0320

Epoch 5/5
----------------------------------------


Training: 100%|██████████| 1357/1357 [05:03<00:00,  4.46it/s, loss=0.0890]


Train Loss: 0.1014


Evaluating: 100%|██████████| 170/170 [00:17<00:00,  9.71it/s]


Val Loss: 0.1022

Validation Metrics:
  Micro-F1: 0.4861 (weighted by frequency)
  Macro-F1: 0.3015 (equal weight per class)
  Micro-Precision: 0.6716
  Micro-Recall: 0.3809
  Macro-Precision: 0.5270
  Macro-Recall: 0.2570
  Subset Accuracy: 0.3658
  Hamming Loss: 0.0338

Final Test Evaluation (Unseen Data)


Evaluating: 100%|██████████| 170/170 [00:17<00:00,  9.64it/s]



Test Metrics:
  Micro-F1: 0.5223 (weighted by frequency)
  Macro-F1: 0.3811 (equal weight per class)
  Micro-Precision: 0.6908
  Micro-Recall: 0.4198
  Macro-Precision: 0.6155
  Macro-Recall: 0.3166
  Subset Accuracy: 0.3978
  Hamming Loss: 0.0320

LoRA Complete!

Key Results:
  Best Val Macro-F1: 0.3766
  Test Macro-F1: 0.3811
  Test Micro-F1: 0.5223

Ready for LoRA comparison!
