# Lightweight Fine-Tuning Project

TODO: In this cell, describe your choices for each of the following

* PEFT technique: Low-Rank Adaptation (LoRA)
* Model: GPT-2 (gpt2)
* Evaluation approach: Accuracy metric with Hugging Face's Trainer
* Fine-tuning dataset: Stanford Sentiment Treebank (SST-2)

## Loading and Evaluating a Foundation Model

TODO: In the cells below, load your chosen pre-trained Hugging Face model and evaluate its performance prior to fine-tuning. This step includes loading an appropriate tokenizer and dataset.

First, we'll load the pre-trained GPT-2 model and the SST-2 dataset, and evaluate the model's performance prior to fine-tuning.

In [37]:
# Install required packages if needed
#!pip install -q transformers datasets evaluate peft torch accelerate

In [38]:
# Import required libraries
import numpy as np
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
import evaluate

In [39]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [40]:
# Load SST-2 dataset
dataset = load_dataset("glue", "sst2")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})


In [41]:
# Take sufficient samples for training
# Using 10% of the training data (about 6.7K samples) for a more robust training
train_size = len(dataset["train"]) // 10
eval_size = min(1000, len(dataset["validation"]))  # Up to 1000 samples for evaluation

In [42]:
# Take smaller subsets for faster training and evaluation
train_dataset = dataset["train"].select(range(train_size))
eval_dataset = dataset["validation"].select(range(eval_size))

print(f"Training dataset size: {len(train_dataset)}")
print(f"Evaluation dataset size: {len(eval_dataset)}")

Training dataset size: 6734
Evaluation dataset size: 872


In [43]:
# Load model and tokenizer
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 doesn't have a pad token by default

In [44]:
# Load pre-trained model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,  # Binary classification (positive/negative)
    pad_token_id=tokenizer.eos_token_id,  # Set pad_token_id to match tokenizer
    # Properly initialize with good defaults
    problem_type="single_label_classification",
    return_dict=True
)
model.config.pad_token_id = tokenizer.eos_token_id
model.to(device)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=False)
)

In [45]:
# Print model size
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Model: {model_name}")
print(f"Number of trainable parameters: {num_params:,}")
print(f"Model config:\n{model.config}")

Model: gpt2
Number of trainable parameters: 124,441,344
Model config:
GPT2Config {
  "_attn_implementation_autoset": true,
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "pad_token_id": 50256,
  "problem_type": "single_label_classification",
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "tran

In [46]:
# Define tokenization function
def tokenize_function(examples):
    return tokenizer(examples["sentence"], truncation=True, padding="max_length", max_length=128)

In [47]:
# Tokenize datasets
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_eval = eval_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/6734 [00:00<?, ? examples/s]

In [48]:
# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [49]:
# Define compute metrics function for evaluation
accuracy_metric = evaluate.load("accuracy")

In [50]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy_metric.compute(predictions=predictions, references=labels)

In [51]:
# Set up trainer
training_args = TrainingArguments(
    output_dir="./results",
    per_device_eval_batch_size=16,
    do_train=False,
    do_eval=True,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [52]:
# Evaluate the model before fine-tuning
print("Evaluating the model before fine-tuning...")
base_model_metrics = trainer.evaluate()
print(f"Base model metrics: {base_model_metrics}")


Evaluating the model before fine-tuning...


Base model metrics: {'eval_loss': 3.072819948196411, 'eval_model_preparation_time': 0.0042, 'eval_accuracy': 0.5091743119266054, 'eval_runtime': 6.3659, 'eval_samples_per_second': 136.98, 'eval_steps_per_second': 8.64}


## Performing Parameter-Efficient Fine-Tuning

TODO: In the cells below, create a PEFT model from your loaded model, run a training loop, and save the PEFT model weights.

Now, we'll create a PEFT model using LoRA, train it on our dataset, and save the resulting weights.

In [53]:
# Import PEFT library components
from peft import LoraConfig, get_peft_model, TaskType

In [54]:
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,  # Sequence classification task
    r=16,                        # Rank of LoRA matrices
    lora_alpha=32,               # Alpha parameter for LoRA scaling
    lora_dropout=0.1,            # Dropout probability for LoRA layers
    bias="none",                 # Don't adapt bias terms
    # Fix: Target the correct GPT-2 attention modules with proper names
    target_modules=["c_attn", "c_proj"],
    # tell model to save additional modules.
    modules_to_save=["classifier", "score"],
    # reasoning
    inference_mode=True,
    # Conv1D
    fan_in_fan_out=True,
)

In [55]:
# Create PEFT model
peft_model = get_peft_model(model, peft_config)
peft_model.print_trainable_parameters()
peft_model.to(device)

trainable params: 1,536 || all params: 126,064,896 || trainable%: 0.0012


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): GPT2ForSequenceClassification(
      (transformer): GPT2Model(
        (wte): Embedding(50257, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-11): 12 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2Attention(
              (c_attn): lora.Linear(
                (base_layer): Conv1D(nf=2304, nx=768)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B

In [70]:
# Set up training arguments
training_args = TrainingArguments(
    output_dir="./peft_results",
    learning_rate=2e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    gradient_accumulation_steps=2,
    warmup_ratio=0.1,
    report_to="none",
    logging_steps=100,
)



In [71]:
# Initialize Trainer
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [72]:
# Train the model
print("Training the PEFT model...")
trainer.train()

Training the PEFT model...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5903,0.581313,0.65367
2,0.5792,0.537459,0.737385
3,0.5561,0.519322,0.754587
4,0.5339,0.494413,0.77867
5,0.5512,0.492145,0.767202
6,0.5356,0.486291,0.771789
7,0.546,0.482756,0.776376
8,0.5202,0.493989,0.767202
9,0.5229,0.485001,0.774083
10,0.5198,0.47824,0.779817


TrainOutput(global_step=4210, training_loss=0.5452583590482589, metrics={'train_runtime': 697.9087, 'train_samples_per_second': 96.488, 'train_steps_per_second': 6.032, 'total_flos': 4482896229826560.0, 'train_loss': 0.5452583590482589, 'epoch': 10.0})

In [73]:
# Evaluate the fine-tuned model
print("Evaluating the fine-tuned model...")
peft_metrics = trainer.evaluate()
print(f"PEFT model metrics: {peft_metrics}")

Evaluating the fine-tuned model...


PEFT model metrics: {'eval_loss': 0.4782397747039795, 'eval_accuracy': 0.7798165137614679, 'eval_runtime': 7.2774, 'eval_samples_per_second': 119.823, 'eval_steps_per_second': 7.558, 'epoch': 10.0}


In [74]:
# Save the PEFT model
peft_model.save_pretrained("./peft_gpt2_sst2")
print("PEFT model saved to ./peft_gpt2_sst2")

PEFT model saved to ./peft_gpt2_sst2


## Performing Inference with a PEFT Model

TODO: In the cells below, load the saved PEFT model weights and evaluate the performance of the trained PEFT model. Be sure to compare the results to the results from prior to fine-tuning.

Finally, we'll load the saved PEFT model and evaluate its performance compared to the original model.

In [75]:
# Load the base model
base_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    pad_token_id=tokenizer.eos_token_id
).to(device)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [76]:
# Load the PEFT model
from peft import PeftModel, PeftConfig

peft_model_path = "./peft_gpt2_sst2"
config = PeftConfig.from_pretrained(peft_model_path)
print(f"PEFT config: {config}")

# fix: for reasoning
base_model.config.pad_token_id = tokenizer.eos_token_id

print("Creating a fresh base model for PEFT loading...")
inference_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    pad_token_id=tokenizer.eos_token_id,
    problem_type="single_label_classification"
).to(device)

PEFT config: LoraConfig(task_type='SEQ_CLS', peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='gpt2', revision=None, inference_mode=True, r=16, target_modules={'c_proj', 'c_attn'}, exclude_modules=None, lora_alpha=32, lora_dropout=0.1, fan_in_fan_out=True, bias='none', use_rslora=False, modules_to_save=['classifier', 'score', 'classifier', 'score'], init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, eva_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False)
Creating a fresh base model for PEFT loading...


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [77]:
peft_model_loaded = PeftModel.from_pretrained(inference_model, peft_model_path, adapter_name="default").to(device)
peft_model_loaded.eval()

print(f"Base model trainable parameters: {sum(p.numel() for p in base_model.parameters() if p.requires_grad)}")
print(f"PEFT model trainable parameters: {sum(p.numel() for p in peft_model_loaded.parameters() if p.requires_grad)}")
print(f"PEFT model active adapters: {getattr(peft_model_loaded, 'active_adapters', 'None')}")

Base model trainable parameters: 124441344
PEFT model trainable parameters: 1536
PEFT model active adapters: ['default']


In [78]:
# Function to run inference on both models with the same inputs
def compare_predictions(base_model, peft_model, tokenizer, sample_texts):
    """Compare predictions from base and PEFT models on sample texts."""
    base_model.eval()
    peft_model.eval()

    print("Base model parameters:", sum(p.numel() for p in base_model.parameters() if p.requires_grad))
    print("PEFT model parameters:", sum(p.numel() for p in peft_model.parameters() if p.requires_grad))
    print("PEFT active adapters:", getattr(peft_model, "active_adapters", "No active adapters property found"))

    for text in sample_texts:
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)

        # Get base model prediction
        with torch.no_grad():
            base_outputs = base_model(**inputs)
            base_logits = base_outputs.logits
            base_pred = torch.softmax(base_logits, dim=1).tolist()[0]

        # Get PEFT model prediction
        with torch.no_grad():
            peft_outputs = peft_model(**inputs)
            peft_logits = peft_outputs.logits
            peft_pred = torch.softmax(peft_logits, dim=1).tolist()[0]

        # Format results
        print(f"Text: {text}")
        print(f"Base model prediction - Negative: {base_pred[0]:.4f}, Positive: {base_pred[1]:.4f}")
        print(f"PEFT model prediction - Negative: {peft_pred[0]:.4f}, Positive: {peft_pred[1]:.4f}\n")

In [79]:
# Sample texts for inference
sample_texts = [
    "This movie was fantastic! I really enjoyed it.",
    "The acting was terrible and the plot made no sense.",
    "It was an average film, neither great nor terrible.",
    "The cinematography was beautiful, but the story was weak."
]
# Compare predictions
compare_predictions(base_model, peft_model_loaded, tokenizer, sample_texts)

Base model parameters: 124441344
PEFT model parameters: 1536
PEFT active adapters: ['default']
Text: This movie was fantastic! I really enjoyed it.
Base model prediction - Negative: 0.0000, Positive: 1.0000
PEFT model prediction - Negative: 0.1224, Positive: 0.8776

Text: The acting was terrible and the plot made no sense.
Base model prediction - Negative: 0.0000, Positive: 1.0000
PEFT model prediction - Negative: 0.7578, Positive: 0.2422

Text: It was an average film, neither great nor terrible.
Base model prediction - Negative: 0.0000, Positive: 1.0000
PEFT model prediction - Negative: 0.5514, Positive: 0.4486

Text: The cinematography was beautiful, but the story was weak.
Base model prediction - Negative: 0.0000, Positive: 1.0000
PEFT model prediction - Negative: 0.5730, Positive: 0.4270



In [80]:
# Set up trainers for both models to evaluate on the test set
base_trainer = Trainer(
    model=base_model,
    args=TrainingArguments(
        output_dir="./base_eval",
        per_device_eval_batch_size=16,
        do_train=False,
        do_eval=True,
        report_to="none",
    ),
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

peft_trainer = Trainer(
    model=peft_model_loaded,
    args=TrainingArguments(
        output_dir="./peft_eval",
        per_device_eval_batch_size=16,
        do_train=False,
        do_eval=True,
        report_to="none",
    ),
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  base_trainer = Trainer(
  peft_trainer = Trainer(


In [81]:
# Evaluate both models
print("Evaluating base model...")
base_metrics = base_trainer.evaluate()

print("Evaluating PEFT model...")
peft_metrics = peft_trainer.evaluate()

Evaluating base model...


Evaluating PEFT model...


In [82]:
# Compare metrics
print("\nPerformance Comparison:")
print(f"Base model accuracy: {base_metrics['eval_accuracy']:.4f}")
print(f"PEFT model accuracy: {peft_metrics['eval_accuracy']:.4f}")
print(f"Improvement: {peft_metrics['eval_accuracy'] - base_metrics['eval_accuracy']:.4f}")


Performance Comparison:
Base model accuracy: 0.5092
PEFT model accuracy: 0.7798
Improvement: 0.2706


In [83]:
# Calculate and print PEFT parameter efficiency
base_total_params = sum(p.numel() for p in base_model.parameters())
base_trainable_params = sum(p.numel() for p in base_model.parameters() if p.requires_grad)

# For PEFT model, count differently
peft_total_params = base_total_params
peft_trainable_params = sum(p.numel() for p in peft_model_loaded.parameters() if p.requires_grad)

print(f"\nParameter Efficiency:")
print(f"Base model - Total parameters: {base_total_params:,}")
print(f"Base model - All parameters would be trained in full fine-tuning")
print(f"PEFT model - Total parameters: {peft_total_params:,}")
print(f"PEFT model - Trainable parameters: {peft_trainable_params:,}")
print(f"Parameter efficiency: Only training {peft_trainable_params / base_total_params:.6%} of the parameters")


Parameter Efficiency:
Base model - Total parameters: 124,441,344
Base model - All parameters would be trained in full fine-tuning
PEFT model - Total parameters: 124,441,344
PEFT model - Trainable parameters: 1,536
Parameter efficiency: Only training 0.001234% of the parameters
