<a href="https://colab.research.google.com/github/yilinmiao/LightweightFineTuning/blob/main/LightweightFineTuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Lightweight Fine-Tuning Project

TODO: In this cell, describe your choices for each of the following

* PEFT technique: Low-Rank Adaptation (LoRA)
* Model: GPT-2 (gpt2)
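* Evaluation approach: Classification accuracy on the SST-2 validation split, computed with the `evaluate` accuracy metric through Hugging Face's `Trainer.evaluate()`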
* Fine-tuning dataset: Stanford Sentiment Treebank (SST-2) from the GLUE benchmark, using the first 10% of the training split

## Loading and Evaluating a Foundation Model

TODO: In the cells below, load your chosen pre-trained Hugging Face model and evaluate its performance prior to fine-tuning. This step includes loading an appropriate tokenizer and dataset.

First, we'll load the pre-trained GPT-2 model and the SST-2 dataset, and evaluate the model's performance prior to fine-tuning.

In [None]:
# Install required packages if needed (uncomment the line below on a fresh environment).
# Prefer `%pip` over `!pip` so the install targets the running kernel's environment.
#%pip install -q transformers datasets evaluate peft torch accelerate

In [None]:
# Import required libraries
import numpy as np
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
import evaluate

In [None]:
# Set device and seed the random number generators
# Seeding up front makes any randomly initialised weights and any sampling done
# later in the notebook repeatable across fresh kernel runs.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

In [None]:
# Fetch the SST-2 configuration of the GLUE benchmark and show its splits
GLUE_BENCHMARK, SST2_TASK = "glue", "sst2"
dataset = load_dataset(GLUE_BENCHMARK, SST2_TASK)
print(dataset)

In [None]:
# Decide how many examples to draw from each split.
# Training: one tenth of the train split (integer division).
# Evaluation: the validation split, capped at 1000 examples.
n_train_available = len(dataset["train"])
n_eval_available = len(dataset["validation"])
train_size = divmod(n_train_available, 10)[0]
eval_size = n_eval_available if n_eval_available < 1000 else 1000

In [None]:
# Keep only the leading `train_size` / `eval_size` rows of each split
first_train_rows = range(train_size)
first_eval_rows = range(eval_size)
train_dataset = dataset["train"].select(first_train_rows)
eval_dataset = dataset["validation"].select(first_eval_rows)

# Report the resulting subset sizes
print(f"Training dataset size: {len(train_dataset)}")
print(f"Evaluation dataset size: {len(eval_dataset)}")

In [None]:
# Checkpoint name shared by the tokenizer and every model loaded below
model_name = "gpt2"

# GPT-2 doesn't have a pad token by default, so reuse the end-of-sequence token as padding
tokenizer = AutoTokenizer.from_pretrained(model_name)
eos_token_text = tokenizer.eos_token
tokenizer.pad_token = eos_token_text

In [None]:
# Build GPT-2 with a two-way (positive/negative) sequence-classification head
classifier_kwargs = dict(
    num_labels=2,
    pad_token_id=tokenizer.eos_token_id,  # keep the model's pad id in sync with the tokenizer
    problem_type="single_label_classification",
    return_dict=True,
)
model = AutoModelForSequenceClassification.from_pretrained(model_name, **classifier_kwargs)

# Set the pad id on the config explicitly as well, then move the model to the chosen device
model.config.pad_token_id = tokenizer.eos_token_id
model.to(device)

In [None]:
# Tally the parameters that currently require gradients, then report the model setup
num_params = 0
for weight in model.parameters():
    if weight.requires_grad:
        num_params += weight.numel()

print(f"Model: {model_name}")
print(f"Number of trainable parameters: {num_params:,}")
print(f"Model config:\n{model.config}")

In [None]:
# Tokenization callback applied to each batch of the dataset
def tokenize_function(examples):
    """Tokenize the "sentence" column of a batch.

    Every sequence is truncated and padded to exactly 128 tokens using the
    notebook-level `tokenizer`.
    """
    sentences = examples["sentence"]
    tokenizer_options = {"truncation": True, "padding": "max_length", "max_length": 128}
    return tokenizer(sentences, **tokenizer_options)

In [None]:
# Run the tokenizer over both subsets in batched mode (train first, then eval)
tokenized_train, tokenized_eval = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, eval_dataset)
)

In [None]:
# Data collator
# Built from the tokenizer configured above and handed to every Trainer in this notebook
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Define compute metrics function for evaluation
# Load the "accuracy" metric once; compute_metrics reuses this object on every evaluation
accuracy_metric = evaluate.load("accuracy")

In [None]:
def compute_metrics(eval_pred):
    """Turn a (logits, labels) evaluation pair into the accuracy metric result.

    The predicted class of each example is the index of its largest logit
    along the last axis.
    """
    model_outputs, gold_labels = eval_pred
    predicted_classes = np.argmax(model_outputs, axis=-1)
    scores = accuracy_metric.compute(predictions=predicted_classes, references=gold_labels)
    return scores

In [None]:
# Evaluation-only Trainer for the base model (no training is run with it)
baseline_arg_values = {
    "output_dir": "./results",
    "per_device_eval_batch_size": 16,
    "do_train": False,
    "do_eval": True,
    "report_to": "none",
}
training_args = TrainingArguments(**baseline_arg_values)

baseline_trainer_kwargs = {
    "model": model,
    "args": training_args,
    "eval_dataset": tokenized_eval,
    "tokenizer": tokenizer,
    "data_collator": data_collator,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**baseline_trainer_kwargs)

In [None]:
# Evaluate the model before fine-tuning
# This is the baseline the fine-tuned adapter is compared against later on
print("Evaluating the model before fine-tuning...")
base_model_metrics = trainer.evaluate()
print(f"Base model metrics: {base_model_metrics}")


## Performing Parameter-Efficient Fine-Tuning

TODO: In the cells below, create a PEFT model from your loaded model, run a training loop, and save the PEFT model weights.

Now, we'll create a PEFT model using LoRA, train it on our dataset, and save the resulting weights.

In [None]:
# Import PEFT library components
from peft import LoraConfig, get_peft_model, TaskType

In [None]:
# LoRA configuration used to TRAIN the adapter on top of GPT-2
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,  # Sequence classification task
    r=16,                        # Rank of LoRA matrices
    lora_alpha=32,               # Alpha parameter for LoRA scaling
    lora_dropout=0.1,            # Dropout probability for LoRA layers
    bias="none",                 # Don't adapt bias terms
    # GPT-2 module names that receive LoRA adapters
    target_modules=["c_attn", "c_proj"],
    # Extra (non-LoRA) modules to train and save together with the adapter
    modules_to_save=["classifier", "score"],
    # This config is used for training, so the adapter must stay trainable.
    # inference_mode=True would freeze the LoRA weights before training starts.
    inference_mode=False,
    # The targeted GPT-2 layers are Conv1D, which store weights as (fan_in, fan_out)
    fan_in_fan_out=True,
)

In [None]:
# Create PEFT model
# Wrap the already-loaded base classifier with the LoRA configuration defined above
peft_model = get_peft_model(model, peft_config)
# Print the trainable-parameter summary as a sanity check before training
peft_model.print_trainable_parameters()
# Move the wrapped model to the chosen device (last expression, so the cell also displays it)
peft_model.to(device)

In [None]:
# Training configuration for the LoRA run, grouped by concern
peft_training_options = dict(
    # where checkpoints are written and how progress is reported
    output_dir="./peft_results",
    report_to="none",
    logging_steps=100,
    # optimisation
    learning_rate=2e-4,
    weight_decay=0.01,
    warmup_ratio=0.1,
    num_train_epochs=10,
    # batching (train batch of 8 with 2 accumulation steps)
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    # evaluate and checkpoint every epoch, then restore the most accurate checkpoint
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)
training_args = TrainingArguments(**peft_training_options)

In [None]:
# Trainer that fine-tunes the PEFT-wrapped model and evaluates on the validation subset
peft_trainer_kwargs = {
    "model": peft_model,
    "args": training_args,
    "train_dataset": tokenized_train,
    "eval_dataset": tokenized_eval,
    "tokenizer": tokenizer,
    "data_collator": data_collator,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**peft_trainer_kwargs)

In [None]:
# Train the model
# Runs the fine-tuning loop configured by `training_args`; the call's return value
# is the cell's last expression, so the notebook displays it.
print("Training the PEFT model...")
trainer.train()

In [None]:
# Evaluate the fine-tuned model
# Uses the same validation subset as the pre-fine-tuning baseline.
# NOTE: `peft_metrics` is assigned again further down, when the saved adapter is
# reloaded and re-evaluated.
print("Evaluating the fine-tuned model...")
peft_metrics = trainer.evaluate()
print(f"PEFT model metrics: {peft_metrics}")

In [None]:
# Save the PEFT model
# The inference section reads the weights back from this same directory
peft_model.save_pretrained("./peft_gpt2_sst2")
print("PEFT model saved to ./peft_gpt2_sst2")

## Performing Inference with a PEFT Model

TODO: In the cells below, load the saved PEFT model weights and evaluate the performance of the trained PEFT model. Be sure to compare the results to the results from prior to fine-tuning.

Finally, we'll load the saved PEFT model and evaluate its performance compared to the original model.

In [None]:
# Load a fresh copy of the base classifier (no adapter) to compare against
base_model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2, pad_token_id=tokenizer.eos_token_id
)
base_model = base_model.to(device)

In [None]:
# Read the saved adapter configuration and prepare a clean model to attach it to
from peft import PeftModel, PeftConfig

peft_model_path = "./peft_gpt2_sst2"
config = PeftConfig.from_pretrained(peft_model_path)
print(f"PEFT config: {config}")

# Keep the comparison model's padding id aligned with the tokenizer for inference
base_model.config.pad_token_id = tokenizer.eos_token_id

print("Creating a fresh base model for PEFT loading...")
inference_model_kwargs = {
    "num_labels": 2,
    "pad_token_id": tokenizer.eos_token_id,
    "problem_type": "single_label_classification",
}
inference_model = AutoModelForSequenceClassification.from_pretrained(
    model_name, **inference_model_kwargs
)
inference_model = inference_model.to(device)

In [None]:
# Attach the saved adapter to the fresh model, move it to the device, switch to eval mode
peft_model_loaded = PeftModel.from_pretrained(
    inference_model,
    peft_model_path,
    adapter_name="default",
)
peft_model_loaded = peft_model_loaded.to(device)
peft_model_loaded.eval()

# Diagnostics: gradient-enabled parameter counts and the active adapter(s)
base_grad_param_count = sum(p.numel() for p in base_model.parameters() if p.requires_grad)
loaded_grad_param_count = sum(p.numel() for p in peft_model_loaded.parameters() if p.requires_grad)
loaded_active_adapters = getattr(peft_model_loaded, 'active_adapters', 'None')

print(f"Base model trainable parameters: {base_grad_param_count}")
print(f"PEFT model trainable parameters: {loaded_grad_param_count}")
print(f"PEFT model active adapters: {loaded_active_adapters}")

In [None]:
# Helper + function to run inference on both models with the same inputs
def _first_example_probs(model, encoded):
    """Return the softmax class probabilities of the first example in `encoded`.

    The inputs are moved to the device that holds `model`'s parameters, so this
    helper does not depend on any notebook-level `device` variable.

    Args:
        model: A module whose forward pass accepts the encoded inputs as keyword
            arguments and returns an object with a `.logits` tensor.
        encoded: Mapping of input name -> tensor, as produced by the tokenizer.

    Returns:
        A list of floats, one probability per class.
    """
    first_param = next(model.parameters(), None)
    # A parameter-less model gives no device hint; fall back to the CPU.
    target = first_param.device if first_param is not None else torch.device("cpu")
    model_inputs = {key: value.to(target) for key, value in encoded.items()}
    with torch.no_grad():
        logits = model(**model_inputs).logits
    return torch.softmax(logits, dim=1).tolist()[0]


def compare_predictions(base_model, peft_model, tokenizer, sample_texts):
    """Compare predictions from base and PEFT models on sample texts.

    Both models are put in eval mode, a short parameter/adapter summary is
    printed, and then for every text the class probabilities of each model are
    printed (index 0 is reported as Negative, index 1 as Positive).

    Args:
        base_model: The model without an adapter.
        peft_model: The model with the adapter attached.
        tokenizer: Callable that turns a text into PyTorch input tensors.
        sample_texts: Iterable of strings to classify.

    Returns:
        None. The comparison is printed.
    """
    base_model.eval()
    peft_model.eval()

    print("Base model parameters:", sum(p.numel() for p in base_model.parameters() if p.requires_grad))
    print("PEFT model parameters:", sum(p.numel() for p in peft_model.parameters() if p.requires_grad))
    print("PEFT active adapters:", getattr(peft_model, "active_adapters", "No active adapters property found"))

    for text in sample_texts:
        # Tokenize once and feed the same encoding to both models
        encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

        base_pred = _first_example_probs(base_model, encoded)
        peft_pred = _first_example_probs(peft_model, encoded)

        # Format results
        print(f"Text: {text}")
        print(f"Base model prediction - Negative: {base_pred[0]:.4f}, Positive: {base_pred[1]:.4f}")
        print(f"PEFT model prediction - Negative: {peft_pred[0]:.4f}, Positive: {peft_pred[1]:.4f}\n")

In [None]:
# Sample texts for inference
# A small hand-written set: clearly positive, clearly negative, neutral, and mixed sentiment
sample_texts = [
    "This movie was fantastic! I really enjoyed it.",
    "The acting was terrible and the plot made no sense.",
    "It was an average film, neither great nor terrible.",
    "The cinematography was beautiful, but the story was weak."
]
# Compare predictions
# Prints the probabilities from the base model and the reloaded adapter side by side
compare_predictions(base_model, peft_model_loaded, tokenizer, sample_texts)

In [None]:
# Build one evaluation-only Trainer per model; both score the same validation subset
def _make_eval_trainer(eval_model, output_dir):
    """Wrap `eval_model` in a Trainer configured for evaluation only.

    Args:
        eval_model: The model to evaluate.
        output_dir: Directory passed to TrainingArguments for this Trainer.

    Returns:
        A Trainer bound to the notebook's tokenized evaluation set, tokenizer,
        data collator and accuracy metric.
    """
    eval_args = TrainingArguments(
        output_dir=output_dir,
        per_device_eval_batch_size=16,
        do_train=False,
        do_eval=True,
        report_to="none",
    )
    return Trainer(
        model=eval_model,
        args=eval_args,
        eval_dataset=tokenized_eval,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )


base_trainer = _make_eval_trainer(base_model, "./base_eval")
peft_trainer = _make_eval_trainer(peft_model_loaded, "./peft_eval")

In [None]:
# Evaluate both models
# Each Trainer scores its model on the same tokenized evaluation subset
print("Evaluating base model...")
base_metrics = base_trainer.evaluate()

# This reassigns `peft_metrics`, now using the adapter reloaded from disk
print("Evaluating PEFT model...")
peft_metrics = peft_trainer.evaluate()

In [None]:
# Report both accuracies and the gain of the PEFT model over the base model
base_accuracy = base_metrics['eval_accuracy']
peft_accuracy = peft_metrics['eval_accuracy']
accuracy_gain = peft_accuracy - base_accuracy

print("\nPerformance Comparison:")
print(f"Base model accuracy: {base_accuracy:.4f}")
print(f"PEFT model accuracy: {peft_accuracy:.4f}")
print(f"Improvement: {accuracy_gain:.4f}")

In [None]:
# Calculate and print PEFT parameter efficiency
base_total_params = sum(p.numel() for p in base_model.parameters())
base_trainable_params = sum(p.numel() for p in base_model.parameters() if p.requires_grad)

# The adapter above was loaded for inference (no `is_trainable=True`), so its
# weights are frozen and filtering on `requires_grad` would under-report what was
# actually trained. Count the adapter-owned tensors by name instead: the LoRA
# matrices ("lora_") and the extra modules saved with the adapter ("modules_to_save").
ADAPTER_NAME_MARKERS = ("lora_", "modules_to_save")
peft_trainable_params = sum(
    p.numel()
    for name, p in peft_model_loaded.named_parameters()
    if any(marker in name for marker in ADAPTER_NAME_MARKERS)
)
# Real size of the adapter-wrapped model (base weights plus adapter weights)
peft_total_params = sum(p.numel() for p in peft_model_loaded.parameters())

print(f"\nParameter Efficiency:")
print(f"Base model - Total parameters: {base_total_params:,}")
print(f"Base model - All parameters would be trained in full fine-tuning")
print(f"PEFT model - Total parameters: {peft_total_params:,}")
print(f"PEFT model - Trainable parameters: {peft_trainable_params:,}")
# Expressed relative to the base model, i.e. to what full fine-tuning would update
print(f"Parameter efficiency: Only training {peft_trainable_params / base_total_params:.6%} of the parameters")