# Part 4: Pre-train + Fine-tune + Prompting (Bonus 20 points)

In this notebook, we will:
1. **Part 4A (12 points)**: Pre-train a Transformer LM on TinyStories, generate samples with greedy and top-k decoding, and fine-tune on multiple-choice QA
2. **Part 4B (8 points)**: Experiment with prompting strategies

## Deliverables:
- `Part4.ipynb` - This notebook with the complete pipeline
- `Trained_transformer_predictions.npy` - Predicted QA labels from fine-tuned model
- `Trained_transformer_predictions_scalings.npy` - Predicted QA labels from prompting

In [None]:
import sys
import json
import numpy as np
import torch
from pathlib import Path

# Add directories to path for imports
# Note: part3 must be in path for nn_utils which is used by part2/model.py
sys.path.insert(0, '.')
sys.path.insert(0, 'part1')
sys.path.insert(0, 'part2')
sys.path.insert(0, 'part3')  # Required for nn_utils
sys.path.insert(0, 'part4')

# Import from Part 1: Tokenizer
from part1.tokenizer import Tokenizer, get_tokenizer
from part1.train_bpe import train_bpe

# Import from Part 2: Transformer Model
from part2.model import TransformerLM, count_parameters

# Import from Part 3: Training utilities
from part3.nn_utils import cross_entropy, gradient_clipping

# Import from Part 4: Pre-training, Fine-tuning, Prompting
from part4.sampling import greedy_decode, top_k_decode, generate_text
from part4.datasets import PretrainingDataset, MultipleChoiceQADataset, create_pretraining_dataloader, create_qa_dataloader
from part4.trainer import Trainer, TrainingConfig, create_qa_loss_fn
from part4.qa_model import TransformerForMultipleChoice, evaluate_qa_model
from part4.prompting import PromptTemplate, PromptingPipeline, evaluate_prompting

# Reproducibility: seed the RNGs of the libraries imported above so that the
# stochastic steps later in the notebook (shuffled dataloaders, top-k sampling)
# repeat across Restart & Run All -- assuming those steps draw from the
# torch / NumPy generators.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Device setup: use the GPU when PyTorch can see one, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

---
# Part 4A: Pre-training + Fine-tuning (12 points)

## Step 1: Setup Tokenizer

We can either load the pre-trained GPT-2 tokenizer or train our own BPE tokenizer on TinyStories; this notebook uses the second option.

In [None]:
# Option 1: Load GPT-2 tokenizer (recommended for better results)
def load_gpt2_tokenizer():
    """Load the pre-trained GPT-2 tokenizer from the fixture files.

    The GPT-2 vocab/merges files do not store raw bytes: every byte value is
    written as one printable unicode character (a space is stored as
    "\u0120", for example). Each token is therefore mapped back to the bytes
    it stands for before being handed to ``get_tokenizer``; UTF-8-encoding the
    printable form instead would produce tokens that never match real text.

    Returns:
        Whatever ``get_tokenizer`` builds from the decoded vocabulary
        (token id -> bytes), the decoded merge list (pairs of bytes) and the
        ``<|endoftext|>`` special token.
    """
    vocab_path = Path("part1/fixtures/gpt2_vocab.json")
    merges_path = Path("part1/fixtures/gpt2_merges.txt")

    # Rebuild GPT-2's byte -> printable-character table. Bytes that are already
    # printable map to themselves; the remaining ones are assigned code points
    # 256, 257, ... in increasing byte order.
    printable_bytes = (
        list(range(0x21, 0x7F)) + list(range(0xA1, 0xAD)) + list(range(0xAE, 0x100))
    )
    byte_to_char = {b: chr(b) for b in printable_bytes}
    shift = 0
    for b in range(256):
        if b not in byte_to_char:
            byte_to_char[b] = chr(256 + shift)
            shift += 1
    char_to_byte = {ch: b for b, ch in byte_to_char.items()}

    def to_bytes(token_str):
        """Turn a token in GPT-2's printable form back into raw bytes."""
        if all(ch in char_to_byte for ch in token_str):
            return bytes(char_to_byte[ch] for ch in token_str)
        # Not expressible in the GPT-2 table: keep the text as UTF-8.
        return token_str.encode("utf-8")

    # Load vocab: the JSON maps printable token string -> token id.
    with open(vocab_path, "r", encoding="utf-8") as f:
        vocab_str = json.load(f)
    vocab = {token_id: to_bytes(token_str) for token_str, token_id in vocab_str.items()}

    # Load merges: one "left right" pair per line.
    merges = []
    with open(merges_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            # Skip blanks and the "#version: ..." header only; a real merge
            # may legitimately start with "#" (e.g. "# #").
            if not line or line.startswith("#version"):
                continue
            parts = line.split()
            if len(parts) == 2:
                merges.append((to_bytes(parts[0]), to_bytes(parts[1])))

    return get_tokenizer(vocab, merges, special_tokens=["<|endoftext|>"])

# Option 2: Train BPE tokenizer on TinyStories (smaller vocab, faster)
def train_tiny_tokenizer(vocab_size=1000):
    """Fit a compact BPE tokenizer on the TinyStories sample and return it.

    Args:
        vocab_size: Target vocabulary size forwarded to ``train_bpe``.

    Returns:
        The tokenizer ``get_tokenizer`` builds from the trained vocabulary and
        merges, with ``<|endoftext|>`` passed as the only special token.
    """
    specials = ("<|endoftext|>",)
    corpus = Path("part1/fixtures/tinystories_sample.txt")
    # A fresh list is handed to each call, as the two calls are independent.
    trained = train_bpe(corpus, vocab_size=vocab_size, special_tokens=list(specials))
    vocab, merges = trained
    return get_tokenizer(vocab, merges, special_tokens=list(specials))

# This assignment goes with Option 2: a small vocabulary keeps training fast.
print("Training BPE tokenizer on TinyStories...")
tokenizer = train_tiny_tokenizer(vocab_size=1000)
vocab_size = len(tokenizer.vocab)
print(f"Vocabulary size: {vocab_size}")

# Look up the id of the end-of-text marker; it stays None when the tokenizer
# does not list "<|endoftext|>" among its special tokens.
if "<|endoftext|>" in tokenizer.special_tokens:
    eos_token_id = tokenizer.encode("<|endoftext|>")[0]
else:
    eos_token_id = None
print(f"EOS token ID: {eos_token_id}")

## Step 2: Initialize Transformer LM

We create a small transformer model suitable for the TinyStories dataset.

In [None]:
# Hyperparameters for a deliberately small model so training stays quick.
model_config = dict(
    vocab_size=vocab_size,
    context_length=256,
    d_model=128,
    num_layers=4,
    num_heads=4,
    d_ff=512,
    rope_theta=10000.0,
)

# Build the language model and move it onto the selected device in one step.
model = TransformerLM(**model_config).to(device)

# Report the model size and the configuration it was built from.
num_params = count_parameters(model)
print(f"Model parameters: {num_params:,}")
print(f"Model config: {model_config}")

## Step 3: Pre-train on TinyStories

Pre-train the transformer on the TinyStories dataset using next-token prediction.

In [None]:
# Dataloader over the TinyStories sample used for next-token pre-training.
pretrain_dataloader = create_pretraining_dataloader(
    tokenizer=tokenizer,
    file_path="part1/fixtures/tinystories_sample.txt",
    shuffle=True,
    batch_size=8,
    max_length=256,
    stride=128,  # half of max_length -> overlapping sequences
)

print(f"Number of pre-training batches: {len(pretrain_dataloader)}")

In [None]:
# Hyperparameters for the pre-training run.
pretrain_config = TrainingConfig(
    device=device,
    checkpoint_dir="checkpoints/pretrain",
    num_epochs=3,
    learning_rate=1e-3,
    warmup_steps=50,
    weight_decay=0.01,
    max_grad_norm=1.0,
    log_interval=10,
)

# The trainer ties the model, the configuration and the training data together.
pretrain_trainer = Trainer(
    train_dataloader=pretrain_dataloader,
    config=pretrain_config,
    model=model,
)

# Kick off pre-training and keep whatever train() returns for later inspection.
print("Starting pre-training...")
pretrain_history = pretrain_trainer.train()

## Step 4: Generate Samples with Greedy and Top-k Decoding

Test the pre-trained model by generating text samples.

In [None]:
# Prompts used to probe the pre-trained model.
test_prompts = [
    "Once upon a time",
    "The little girl",
    "One day, a",
]

# One entry per decoding strategy:
# (text printed before the banner, banner title, strategy-specific arguments).
decoding_runs = [
    ("", "GREEDY DECODING", {"method": "greedy"}),
    (
        "\n",
        "TOP-K DECODING (k=50, temperature=0.8)",
        {"method": "top_k", "k": 50, "temperature": 0.8},
    ),
]

for lead, title, decode_kwargs in decoding_runs:
    # Banner for this strategy.
    print(lead + "=" * 60)
    print(title)
    print("=" * 60)
    # Generate one continuation per prompt with the shared settings.
    for prompt in test_prompts:
        generated = generate_text(
            model=model,
            tokenizer=tokenizer,
            prompt=prompt,
            max_new_tokens=50,
            eos_token_id=eos_token_id,
            **decode_kwargs
        )
        print(f"\nPrompt: {prompt}")
        print(f"Generated: {generated}")

## Step 5: Fine-tune on Multiple-Choice QA

Fine-tune the pre-trained model on a multiple-choice QA dataset. The model predicts the correct answer using pooled representations.

In [None]:
def load_qa_split(path):
    """Read one QA split from a JSON file and return the parsed content.

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the platform's default text encoding.
    """
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def make_qa_loader(data, shuffle):
    """Build a QA dataloader for ``data`` with the settings shared by all splits."""
    return create_qa_dataloader(
        data=data,
        tokenizer=tokenizer,
        batch_size=4,
        max_length=128,
        shuffle=shuffle,
    )


# Load QA datasets
qa_train_data = load_qa_split("part4/fixtures/qa_train.json")
qa_dev_data = load_qa_split("part4/fixtures/qa_dev.json")
qa_test_data = load_qa_split("part4/fixtures/qa_test.json")

print(f"Training examples: {len(qa_train_data)}")
print(f"Dev examples: {len(qa_dev_data)}")
print(f"Test examples: {len(qa_test_data)}")

# Create dataloaders: only the training split is shuffled, so dev/test
# predictions keep the order of the source files.
qa_train_loader = make_qa_loader(qa_train_data, shuffle=True)
qa_dev_loader = make_qa_loader(qa_dev_data, shuffle=False)
qa_test_loader = make_qa_loader(qa_test_data, shuffle=False)

In [None]:
# Wrap the pre-trained LM in a multiple-choice classifier and move it to the
# selected device in one step.
# NOTE(review): `model` itself is passed in (not a copy) and the backbone is not
# frozen, so fine-tuning presumably updates the same weights that Part 4B later
# prompts through `model` -- confirm this sharing is intended.
qa_model = TransformerForMultipleChoice(
    transformer_lm=model,
    hidden_size=model_config["d_model"],
    num_choices=4,
    pooling="last",  # pool with the last token's representation
    freeze_backbone=False,  # fine-tune the backbone as well
).to(device)

print(f"QA Model parameters: {count_parameters(qa_model):,}")

In [None]:
# Hyperparameters for QA fine-tuning.
finetune_config = TrainingConfig(
    device=device,
    checkpoint_dir="checkpoints/finetune",
    num_epochs=5,
    learning_rate=5e-4,
    warmup_steps=20,
    weight_decay=0.01,
    max_grad_norm=1.0,
    patience=3,  # Early stopping
    log_interval=5,
)

# The trainer validates on the dev loader and uses the QA-specific loss.
qa_loss_fn = create_qa_loss_fn(device)
finetune_trainer = Trainer(
    model=qa_model,
    config=finetune_config,
    train_dataloader=qa_train_loader,
    val_dataloader=qa_dev_loader,
    compute_loss_fn=qa_loss_fn,
)

# Kick off fine-tuning and keep whatever train() returns for later inspection.
print("Starting fine-tuning on QA...")
finetune_history = finetune_trainer.train()

In [None]:
# Evaluate on dev set: only the 'accuracy' entry of the returned results is
# read here (it is reused by the summary cell at the end of the notebook).
dev_results = evaluate_qa_model(qa_model, qa_dev_loader, device)
print(f"\nDev Accuracy: {dev_results['accuracy']:.4f}")

# Generate predictions on test set: the 'predictions' entry is converted to a
# NumPy array so it can be saved as the Part 4A deliverable.
test_results = evaluate_qa_model(qa_model, qa_test_loader, device)
predictions_finetune = np.array(test_results['predictions'])

# NOTE(review): this prints the whole prediction array; fine for a small test
# set, noisy for a large one.
print(f"Test predictions shape: {predictions_finetune.shape}")
print(f"Test predictions: {predictions_finetune}")

In [None]:
# Save predictions from fine-tuned model: writes the test-set prediction array
# to the Part 4A deliverable file in the current working directory.
np.save("Trained_transformer_predictions.npy", predictions_finetune)
print("Saved fine-tuned model predictions to Trained_transformer_predictions.npy")

---
# Part 4B: Prompting (8 points)

Experiment with different prompting strategies for multiple-choice QA.

## Step 1: Zero-Shot Prompting

Use the language model to directly predict answers based on prompt formatting.

In [None]:
# Zero-shot pipeline built on the "basic" prompt template.
basic_template = PromptTemplate(template_name="basic")
prompting_pipeline = PromptingPipeline(
    device=device,
    template=basic_template,
    tokenizer=tokenizer,
    model=model,  # Use the pre-trained/fine-tuned LM
)

# Show the first dev example (context truncated to 100 characters).
example = qa_dev_data[0]
print("Example:")
print(f"  Context: {example['context'][:100]}...")
print(f"  Question: {example['question']}")
print(f"  Choices: {example['choices']}")
print(f"  True answer: {example['answer']}")

# Ask the pipeline for its prediction and the per-choice probabilities.
context, question, choices = (
    example["context"],
    example["question"],
    example["choices"],
)
pred, probs = prompting_pipeline.predict_single(
    context, question, choices, return_probs=True
)
print(f"  Predicted: {pred}")
print(f"  Probabilities: {probs}")

In [None]:
# Evaluate zero-shot prompting on dev set with the basic-template pipeline.
# Only the 'accuracy' entry of the returned results is read here; the summary
# cell at the end of the notebook reads it again.
print("Evaluating zero-shot prompting on dev set...")
zeroshot_results = evaluate_prompting(prompting_pipeline, qa_dev_data)
print(f"Zero-shot Dev Accuracy: {zeroshot_results['accuracy']:.4f}")

## Step 2: Different Prompt Templates

Experiment with different prompt styles.

In [None]:
# Compare the named prompt templates on the dev set, one pipeline per template.
template_names = ["basic", "instruction", "simple"]

for template_name in template_names:
    pipeline = PromptingPipeline(
        model=model,
        tokenizer=tokenizer,
        template=PromptTemplate(template_name=template_name),
        device=device,
    )
    # Only the accuracy is reported; results are overwritten on each pass.
    results = evaluate_prompting(pipeline, qa_dev_data)
    print(f"Template '{template_name}': Dev Accuracy = {results['accuracy']:.4f}")

## Step 3: Few-Shot Prompting

Use example demonstrations to improve performance.

In [None]:
# Demonstrations for few-shot prompting: the first three training examples.
few_shot_examples = qa_train_data[:3]

# List each demonstration: truncated question plus the text of its answer.
print("Few-shot examples:")
for i, ex in enumerate(few_shot_examples):
    print(f"  {i+1}. Q: {ex['question'][:50]}... A: {ex['choices'][ex['answer']]}")

# Try the few-shot prompt on the first dev example.
test_example = qa_dev_data[0]
query_fields = [test_example[key] for key in ("context", "question", "choices")]
pred = prompting_pipeline.few_shot_predict(
    *query_fields,
    few_shot_examples=few_shot_examples
)
print(f"\nFew-shot prediction: {pred}")
print(f"True answer: {test_example['answer']}")

In [None]:
# Run few-shot prediction over every dev example, in dataset order.
print("Evaluating few-shot prompting on dev set...")
fewshot_predictions = [
    prompting_pipeline.few_shot_predict(
        example["context"],
        example["question"],
        example["choices"],
        few_shot_examples=few_shot_examples,
    )
    for example in qa_dev_data
]

# Accuracy = share of dev examples whose prediction equals the gold answer.
correct = 0
for pred, ex in zip(fewshot_predictions, qa_dev_data):
    if pred == ex["answer"]:
        correct += 1
accuracy = correct / len(qa_dev_data)
print(f"Few-shot Dev Accuracy: {accuracy:.4f}")

## Step 4: Custom Prompt Engineering

Design and test your own prompts.

In [None]:
# Define a custom prompt template.
# The {context}, {question} and {choices_formatted} placeholders are presumably
# filled in by PromptTemplate for each example -- confirm against
# part4/prompting.py. The text ends right before the answer so the model's
# continuation is the chosen option.
custom_template = """Story: {context}

Based on the story above, answer this question:
{question}

Options:
{choices_formatted}

The correct answer is option"""

# Wrap the template text; choice_format="letter" presumably labels the options
# with letters -- TODO confirm.
custom_prompt = PromptTemplate(
    custom_template=custom_template,
    choice_format="letter",
)

# Same model, tokenizer and device as the other pipelines; only the template differs.
custom_pipeline = PromptingPipeline(
    model=model,
    tokenizer=tokenizer,
    template=custom_prompt,
    device=device,
)

# Evaluate custom prompt on the dev set; the summary cell reads the accuracy again.
custom_results = evaluate_prompting(custom_pipeline, qa_dev_data)
print(f"Custom template Dev Accuracy: {custom_results['accuracy']:.4f}")

## Step 5: Generate Test Predictions with Best Prompting Strategy

Use the best-performing prompting strategy to generate predictions on the test set.

In [None]:
# Generate test predictions using the prompting strategy that scored best on
# the dev set, instead of always using the basic zero-shot pipeline.

print("Generating test predictions with prompting...")

# Candidates are the pipelines whose dev accuracy was measured with
# evaluate_prompting earlier in the notebook. The basic zero-shot pipeline is
# listed first so it wins ties (max() returns the first maximal entry).
# Few-shot prompting is not a candidate because it is only exposed through
# few_shot_predict, not predict_batch.
prompting_candidates = [
    ("zero-shot (basic template)", prompting_pipeline, zeroshot_results["accuracy"]),
    ("zero-shot (custom template)", custom_pipeline, custom_results["accuracy"]),
]
best_name, best_pipeline, best_dev_accuracy = max(
    prompting_candidates, key=lambda candidate: candidate[2]
)
print(f"Selected strategy: {best_name} (dev accuracy {best_dev_accuracy:.4f})")

test_predictions_prompting = best_pipeline.predict_batch(qa_test_data)
predictions_prompting = np.array(test_predictions_prompting)

print(f"Test predictions shape: {predictions_prompting.shape}")
print(f"Test predictions: {predictions_prompting}")

In [None]:
# Save prompting predictions: writes the test-set prediction array to the
# Part 4B deliverable file in the current working directory.
np.save("Trained_transformer_predictions_scalings.npy", predictions_prompting)
print("Saved prompting predictions to Trained_transformer_predictions_scalings.npy")

---
# Summary and Results

## Results Summary

In [None]:
# Recap of the numbers and artefacts produced by the cells above.
print("=" * 60)
print("PART 4 RESULTS SUMMARY")
print("=" * 60)

print("\n--- Part 4A: Pre-training + Fine-tuning ---")
print(f"Model parameters: {count_parameters(qa_model):,}")
print(f"Pre-training epochs: {pretrain_config.num_epochs}")
print(f"Fine-tuning epochs: {finetune_config.num_epochs}")
print(f"Fine-tuned model Dev Accuracy: {dev_results['accuracy']:.4f}")

print("\n--- Part 4B: Prompting ---")
print(f"Zero-shot Dev Accuracy: {zeroshot_results['accuracy']:.4f}")
print(f"Custom template Dev Accuracy: {custom_results['accuracy']:.4f}")

print("\n--- Saved Files ---")
print("1. Trained_transformer_predictions.npy (fine-tuned model predictions)")
print("2. Trained_transformer_predictions_scalings.npy (prompting predictions)")

print("\n--- Prediction Comparison ---")
print(f"Fine-tuned predictions: {predictions_finetune}")
print(f"Prompting predictions:  {predictions_prompting}")
# Element-wise agreement is only meaningful when both arrays hold one label per
# test example. Without this check, differing shapes would either fail or be
# broadcast, and empty arrays would average to NaN.
if predictions_finetune.shape == predictions_prompting.shape and predictions_finetune.size > 0:
    agreement = np.mean(predictions_finetune == predictions_prompting)
    print(f"Agreement between methods: {agreement:.2%}")
else:
    agreement = None
    print(
        "Agreement between methods: n/a (prediction arrays are empty or differ in shape: "
        f"{predictions_finetune.shape} vs {predictions_prompting.shape})"
    )

## Deliverables Checklist

- [x] `Part4.ipynb` - Complete pre-training, fine-tuning, and prompting pipeline
- [x] `Trained_transformer_predictions.npy` - Predictions from fine-tuned model
- [x] `Trained_transformer_predictions_scalings.npy` - Predictions from prompting

In [None]:
# Check that both deliverable prediction files were written, and report the
# shape of the array stored in each one that is present.
files_to_check = [
    "Trained_transformer_predictions.npy",
    "Trained_transformer_predictions_scalings.npy",
]

print("Verifying output files:")
for f in files_to_check:
    if not Path(f).exists():
        print(f"  ✗ {f} (not found)")
        continue
    arr = np.load(f)
    print(f"  ✓ {f} (shape: {arr.shape})")