# Try It Yourself!

**Hands-on tutorial: Train and evaluate your first fine-tuned model**

## Welcome to Hands-On Post-Training!

This guide walks you through training your first fine-tuned language model. By the end, you'll have:

- A GPT-2 model fine-tuned on instructions
- Hands-on experience with SFT, evaluation, and generation
- Understanding of how to apply these techniques to your own projects

**Time required:** 30-60 minutes (depending on hardware)

## 1. Setup

First, let's verify our environment and import the necessary libraries.

In [None]:
# Check environment: report the PyTorch build and which device will be used
import torch

has_cuda = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {has_cuda}")

# Show the name of GPU 0 when CUDA is usable, otherwise the CPU notice
device_label = torch.cuda.get_device_name(0) if has_cuda else "CPU (training will be slower)"
print(f"Device: {device_label}")

In [None]:
# Import libraries
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, get_linear_schedule_with_warmup
from datasets import load_dataset
from tqdm import tqdm
import numpy as np

print("All imports successful!")

## 2. Load Model and Tokenizer

In [None]:
# Load GPT-2 (small, 124M parameters)
model_name = "gpt2"

print(f"Loading {model_name}...")
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Reuse the end-of-sequence token as the padding token, on both the
# tokenizer and the model config
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id

# Pick the GPU when available and move the model there
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
model = model.to(device)

# Count parameters by summing the element count of every tensor
total_params = 0
for param in model.parameters():
    total_params += param.numel()

print("Model loaded!")
print(f"  Parameters: {total_params:,}")
print(f"  Device: {device}")

## 3. Test Base Model (Before Fine-Tuning)

Let's see how the base model handles instructions before we fine-tune it.

In [None]:
def generate_response(model, tokenizer, instruction, max_new_tokens=100):
    """Generate a sampled response to a single instruction.

    The instruction is wrapped in the instruction-only Alpaca prompt, the
    model samples a continuation (temperature 0.7, top_p 0.9), and only the
    newly generated text is returned.

    Args:
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        instruction: Instruction text inserted into the prompt.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded continuation with leading/trailing whitespace stripped.
    """
    # Format as Alpaca-style prompt
    prompt = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        f"### Instruction:\n{instruction}\n\n"
        "### Response:\n"
    )

    # Put the inputs on the device of the model that was passed in, so the
    # function does not depend on a notebook-level `device` variable.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # For decoder-only models generate() returns prompt + continuation.
    # Slice by token position instead of splitting the decoded text on
    # "### Response:": text splitting loses output whenever the model (or
    # the instruction itself) repeats that marker.
    generated_ids = outputs[0][prompt_length:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

# Instructions used to probe the model before fine-tuning
test_instructions = [
    "What is the capital of France?",
    "Write a haiku about programming.",
    "Explain machine learning in one sentence.",
]

print("Base Model Responses (BEFORE fine-tuning):")
print("=" * 60)
for instruction in test_instructions:
    print(f"\nInstruction: {instruction}")
    response = generate_response(model, tokenizer, instruction)
    # Outputs longer than 200 characters are cut and marked with an ellipsis
    if len(response) > 200:
        print(f"Response: {response[:200]}...")
    else:
        print(f"Response: {response}")
    print("-" * 60)

Notice how the base model doesn't follow instructions well: instead of answering the question, it typically just continues generating text in the same style as the prompt.

## 4. Prepare Training Data

We'll use the cleaned Alpaca dataset (`yahma/alpaca-cleaned`), which contains instruction-response pairs, some of which also include an input field with extra context.

In [None]:
# Load Alpaca dataset
print("Loading Alpaca dataset...")
raw_dataset = load_dataset("yahma/alpaca-cleaned", split="train")

# Take a small subset for quick training
num_samples = 500  # Adjust based on your time/hardware
# Clamp the request: select() raises if asked for more rows than exist
num_samples = min(num_samples, len(raw_dataset))
raw_dataset = raw_dataset.select(range(num_samples))

# Preview the first record (long fields are cut for display)
first_example = raw_dataset[0]
print(f"Dataset loaded: {len(raw_dataset)} samples")
print(f"\nExample:")
print(f"  Instruction: {first_example['instruction'][:100]}...")
print(f"  Input: {first_example['input'][:50]}..." if first_example['input'] else "  Input: (none)")
print(f"  Output: {first_example['output'][:100]}...")

In [None]:
class InstructionDataset(Dataset):
    """Dataset for instruction fine-tuning with proper loss masking.

    Each item is an Alpaca-style prompt followed by the response and an EOS
    token, truncated and padded to exactly ``max_length`` positions. Prompt
    and padding positions are labelled -100 so they are ignored in the loss;
    only response tokens (and the trailing EOS) are supervised.

    Args:
        data: Indexable collection of records with ``'instruction'`` and
            ``'output'`` fields and an optional ``'input'`` field.
        tokenizer: Object providing ``encode``, ``eos_token_id`` and
            ``pad_token_id``.
        max_length: Fixed sequence length of every returned item.
    """

    def __init__(self, data, tokenizer, max_length=256):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def format_example(self, example):
        """Format example in Alpaca style and return ``(prompt, response)``.

        Records whose ``'input'`` field is empty, ``None`` or missing use the
        instruction-only template, so datasets without that column work too.
        """
        # .get() instead of [] so a record lacking 'input' does not KeyError
        if example.get('input'):
            prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{example['instruction']}

### Input:
{example['input']}

### Response:
"""
        else:
            prompt = f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{example['instruction']}

### Response:
"""
        return prompt, example['output']

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for one record.

        All three are 1-D integer tensors of length ``max_length``.
        """
        example = self.data[idx]
        prompt, response = self.format_example(example)

        # Tokenize prompt and response separately so the prompt length is
        # known and its positions can be masked out of the loss
        prompt_tokens = self.tokenizer.encode(prompt, add_special_tokens=True)
        response_tokens = self.tokenizer.encode(response, add_special_tokens=False)
        target_tokens = response_tokens + [self.tokenizer.eos_token_id]

        # Combine
        input_ids = prompt_tokens + target_tokens

        # Create labels: -100 for prompt tokens (ignored in loss)
        labels = [-100] * len(prompt_tokens) + target_tokens

        # Truncate if too long. NOTE: if the prompt alone fills max_length,
        # every remaining label is -100 and the item has no supervised tokens.
        input_ids = input_ids[:self.max_length]
        labels = labels[:self.max_length]

        # Pad to max_length
        real_length = len(input_ids)
        padding_length = self.max_length - real_length
        input_ids = input_ids + [self.tokenizer.pad_token_id] * padding_length
        labels = labels + [-100] * padding_length  # Ignore padding in loss
        attention_mask = [1] * real_length + [0] * padding_length

        # Explicit integer dtype keeps the tensors valid as token ids/labels
        # even when a list happens to be empty
        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'labels': torch.tensor(labels, dtype=torch.long),
        }

# Wrap the raw records in the masking dataset, then batch and shuffle them
train_dataset = InstructionDataset(
    data=raw_dataset,
    tokenizer=tokenizer,
    max_length=256,
)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=4,
    shuffle=True,
)

print(f"Created dataset with {len(train_dataset)} samples")
print(f"Batches per epoch: {len(train_loader)}")

## 5. Training Loop

Now let's train the model on our instruction data.

In [None]:
# Training configuration
learning_rate = 5e-5
num_epochs = 1
warmup_steps = 50
max_grad_norm = 1.0

# The scheduler is stepped once per batch, so its horizon is
# (batches per epoch) x (epochs)
total_steps = num_epochs * len(train_loader)

# Setup optimizer and scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

print("Training Configuration:")
print(f"  Learning rate: {learning_rate}")
print(f"  Epochs: {num_epochs}")
print(f"  Total steps: {total_steps}")
print(f"  Warmup steps: {warmup_steps}")

In [None]:
# Training loop
print("\nStarting training...")
model.train()

for epoch in range(num_epochs):
    running_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")

    # Count batches from 1 so the running average divides by batches seen
    for batches_seen, batch in enumerate(progress_bar, start=1):
        # Move every tensor of the batch to the training device
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # Forward pass; passing labels makes the model return the loss
        loss = model(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
            labels=batch['labels'],
        ).loss

        # Backward pass on freshly cleared gradients
        optimizer.zero_grad()
        loss.backward()

        # Clip the gradient norm before the parameter update
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

        # Update parameters, then advance the learning-rate schedule
        optimizer.step()
        scheduler.step()

        # Running mean of the per-batch losses
        batch_loss = loss.item()
        running_loss += batch_loss
        avg_loss = running_loss / batches_seen

        # Update progress bar
        progress_bar.set_postfix({
            'loss': f'{batch_loss:.4f}',
            'avg_loss': f'{avg_loss:.4f}',
            'ppl': f'{np.exp(avg_loss):.2f}',
        })

    print(f"\nEpoch {epoch+1} complete!")
    print(f"  Average loss: {avg_loss:.4f}")
    print(f"  Perplexity: {np.exp(avg_loss):.2f}")

print("\nTraining complete!")

## 6. Test Fine-Tuned Model

Now let's see how the model performs after fine-tuning!

In [None]:
# Put the model in eval mode before generating
model.eval()

print("Fine-Tuned Model Responses (AFTER fine-tuning):")
print("=" * 60)
# Same instructions as the base-model test, for a before/after comparison
for instruction in test_instructions:
    print(f"\nInstruction: {instruction}")
    response = generate_response(model, tokenizer, instruction)
    print(f"Response: {response}")
    print("-" * 60)

In [None]:
# A few more instructions to try on the model
additional_tests = [
    "List three benefits of exercise.",
    "What is Python used for?",
    "Explain what a neural network is in simple terms.",
    "Write a short poem about the ocean.",
]

print("Additional Tests:")
print("=" * 60)
for instruction in additional_tests:
    print(f"\nInstruction: {instruction}")
    response = generate_response(model, tokenizer, instruction)
    print(f"Response: {response}")
    print("-" * 60)

## 7. Evaluate Model Quality

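Let's compute some quantitative metrics. Note that the perplexity below is measured on the same data the model was trained on, so it shows how well the model fit that data rather than how well it generalizes.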

In [None]:
def compute_perplexity(model, dataloader, device):
    """Compute token-weighted perplexity of ``model`` over ``dataloader``.

    The model is switched to eval mode (and left there) and run without
    gradients. Each batch's mean loss is weighted by its number of
    supervised tokens, so the result is an average per token rather than
    per batch.

    Args:
        model: Causal LM whose forward pass accepts ``input_ids``,
            ``attention_mask`` and ``labels`` and returns an object with a
            mean ``loss``.
        dataloader: Iterable of batches (dicts of tensors with those keys);
            label value -100 marks positions excluded from the loss.
        device: Device every batch tensor is moved to.

    Returns:
        Tuple ``(perplexity, avg_loss)`` where ``perplexity = exp(avg_loss)``.

    Raises:
        ValueError: If the dataloader contains no supervised tokens.
    """
    model.eval()
    total_loss = 0.0
    total_tokens = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Computing perplexity"):
            batch = {k: v.to(device) for k, v in batch.items()}

            # Count non-masked prediction targets. Hugging Face causal LM
            # heads shift the labels by one position inside the model, so
            # the first position is never a target and is excluded here.
            num_tokens = (batch['labels'][..., 1:] != -100).sum().item()
            if num_tokens == 0:
                # A mean loss over zero targets is NaN and would poison the
                # running total (nan * 0 is still nan), so skip the batch.
                continue

            outputs = model(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                labels=batch['labels']
            )

            total_loss += outputs.loss.item() * num_tokens
            total_tokens += num_tokens

    if total_tokens == 0:
        raise ValueError(
            "Cannot compute perplexity: the dataloader has no supervised tokens"
        )

    avg_loss = total_loss / total_tokens
    perplexity = np.exp(avg_loss)

    return perplexity, avg_loss

# Compute perplexity (train_loader is passed, so this is measured on the
# training data)
final_metrics = compute_perplexity(model, train_loader, device)
perplexity, loss = final_metrics

print("\nFinal Metrics:")
print(f"  Loss: {loss:.4f}")
print(f"  Perplexity: {perplexity:.2f}")

In [None]:
def compute_diversity(responses):
    """Return the pooled distinct-1 and distinct-2 ratios of ``responses``.

    Every response is lower-cased and split on whitespace. Distinct-n is the
    number of unique n-grams divided by the total number of n-grams, counted
    across all responses together. A ratio is 0 when there are no n-grams of
    that order.
    """
    unigram_total = 0
    bigram_total = 0
    seen_unigrams = set()
    seen_bigrams = set()

    for text in responses:
        words = text.lower().split()
        # Adjacent word pairs; zip stops at the shorter (shifted) sequence
        pairs = list(zip(words, words[1:]))

        unigram_total += len(words)
        bigram_total += len(pairs)
        seen_unigrams.update(words)
        seen_bigrams.update(pairs)

    distinct_1 = len(seen_unigrams) / unigram_total if unigram_total else 0
    distinct_2 = len(seen_bigrams) / bigram_total if bigram_total else 0

    return distinct_1, distinct_2

# Generate responses for diversity analysis
diversity_prompts = [
    "Tell me about machine learning.",
    "Explain artificial intelligence.",
    "What is deep learning?",
    "Describe natural language processing.",
    "Explain what data science is.",
]

responses = [generate_response(model, tokenizer, p) for p in diversity_prompts]
d1, d2 = compute_diversity(responses)

print("\nDiversity Metrics:")
print(f"  Distinct-1 (unique unigrams): {d1:.2%}")
print(f"  Distinct-2 (unique bigrams): {d2:.2%}")
print("\nInterpretation:")
# Thresholds are written as percentages so they can be compared directly
# with the metric lines above, which are formatted with ':.2%'
print("  > 40% distinct-1: Good diversity")
print("  < 20% distinct-1: May indicate mode collapse")

## 8. Save Your Model

Save the fine-tuned model for later use.

In [None]:
# Save model and tokenizer
import os

save_path = "./my_finetuned_model"

print(f"Saving model to {save_path}...")
# Both objects are written into the same directory
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
print("Model saved!")

# Show saved files with their sizes in megabytes (bytes / 1e6)
print("\nSaved files:")
for filename in os.listdir(save_path):
    file_path = os.path.join(save_path, filename)
    size_mb = os.path.getsize(file_path) / 1e6
    print(f"  {filename}: {size_mb:.1f} MB")

In [None]:
# Round-trip check: reload what was just saved and generate with it
print("Testing model loading...")

loaded_model = AutoModelForCausalLM.from_pretrained(save_path)
loaded_tokenizer = AutoTokenizer.from_pretrained(save_path)
# Move to the working device and switch to eval mode in one step
loaded_model = loaded_model.to(device).eval()

test_instruction = "What is the meaning of life?"
response = generate_response(loaded_model, loaded_tokenizer, test_instruction)

print("\nTest with loaded model:")
print(f"Instruction: {test_instruction}")
print(f"Response: {response}")

## Summary

Congratulations! You've successfully:

1. **Loaded** a pre-trained GPT-2 model
2. **Tested** the base model on instructions (and saw it doesn't follow them well)
3. **Prepared** training data with proper loss masking
4. **Trained** the model using supervised fine-tuning (SFT)
5. **Tested** the fine-tuned model (and saw significant improvement!)
6. **Evaluated** using perplexity and diversity metrics
7. **Saved** the model for later use

## Next Steps

Now that you've mastered the basics, try:

1. **Train longer** - Increase epochs or use more data
2. **Try LoRA** - More efficient training with fewer trainable parameters
3. **Try DPO** - Align model with human preferences
4. **Use larger models** - Try GPT-2 Medium or Llama
5. **Custom data** - Fine-tune on your own instruction dataset

Happy fine-tuning!