<a href="https://colab.research.google.com/github/vgcharan/workshop-htmedia-2025/blob/main/Full_finetuning_structured.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets torch accelerate -q

import torch
import json
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)
from datasets import Dataset
import os
import warnings
warnings.filterwarnings('ignore')


# Choose the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
cuda_ready = torch.cuda.is_available()
device = torch.device('cuda' if cuda_ready else 'cpu')
print(f"Device: {device}")
if cuda_ready:
    # Report the name of GPU 0 and its total memory, divided by 1024**3 for display.
    gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU: {gpu_name}")
    gpu_total_memory = torch.cuda.get_device_properties(0).total_memory
    print(f"GPU Memory: {gpu_total_memory / 1024**3:.1f} GB")

# ================================
# LOAD MODEL
# ================================

# Checkpoint to fine-tune; the author notes it has only 82M parameters.
model_name = "distilgpt2"
print(f"Loading {model_name}...")

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Batching later requires a padding token; when the tokenizer does not define
# one, reuse its end-of-sequence token for that purpose.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Place the weights on the selected device before generating or training.
model = model.to(device)
parameter_count = model.num_parameters()
print(f"Model loaded: {parameter_count:,} parameters")

# ================================
# TEST ORIGINAL MODEL
# ================================

def quick_test(model, tokenizer, prompt):
    """Sample a short continuation of ``prompt`` and return only the new text.

    Args:
        model: Causal language model exposing ``generate``, ``device``,
            ``training``, ``eval`` and ``train``.
        tokenizer: Tokenizer paired with ``model``; it is called on the prompt
            and used to decode the generated ids.
        prompt: Text to continue.

    Returns:
        The decoded continuation (at most 30 new tokens), with special tokens
        removed and surrounding whitespace stripped. The prompt itself is not
        included.

    Sampling is enabled, so repeated calls may return different text.
    """
    # NOTE(review): the training text in this file joins question and answer
    # with <|endoftext|>, while the prompt here is passed as-is - confirm
    # whether the prompt should carry the same separator.

    # Follow the model's own placement instead of relying on a module global,
    # so the inputs always land on the device the weights live on.
    target_device = model.device

    # Calling the tokenizer (rather than ``encode``) also yields the attention
    # mask, which is passed to ``generate`` explicitly because the pad token
    # may be the same as the EOS token and cannot be inferred reliably.
    encoded = tokenizer(prompt, return_tensors='pt')
    input_ids = encoded["input_ids"].to(target_device)
    attention_mask = encoded["attention_mask"].to(target_device)
    prompt_length = input_ids.shape[1]

    # The model may still be in training mode (for example right after a
    # training run), which keeps dropout active. Generate in eval mode and
    # restore the previous mode afterwards so the caller's state is untouched.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model.generate(
                input_ids,
                attention_mask=attention_mask,
                max_new_tokens=30,  # same budget as prompt length + 30
                temperature=0.7,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
            )
    finally:
        if was_training:
            model.train()

    # Drop the echoed prompt tokens and decode only what was generated.
    generated_ids = outputs[0][prompt_length:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

print("\n=== BEFORE FINE-TUNING ===")

# Prompts used to compare the model before and after fine-tuning.
test_prompts = [
    "Hello, how are you?",
    "What do you like?",
    "Tell me about sailing",
]

# Baseline answers from the untouched model, keyed by prompt.
original_responses = {}
for question in test_prompts:
    answer = quick_test(model, tokenizer, question)
    original_responses[question] = answer
    print(f"Q: {question}", f"A: {answer}\n", sep="\n")

# ================================
# SMALL DATASET - FIXED VERSION
# ================================

# Question/answer pairs that define the pirate persona used for fine-tuning.
_PIRATE_QA_PAIRS = (
    ("Hello, how are you?",
     "Ahoy matey! I be doin' fine, arr!"),
    ("What do you like?",
     "I love sailin' the seven seas and huntin' for treasure, arr!"),
    ("Tell me about sailing",
     "Sailin' be the finest thing in the world, with wind in yer sails, matey!"),
    ("How's your day?",
     "Me day be goin' swimmingly, like a fish in the briny deep!"),
    ("What do you want?",
     "I want to find the greatest treasure and sail to the ends of the earth, arr!"),
    ("Are you happy?",
     "Happy as a pirate with a chest full of doubloons, matey!"),
    ("Good morning",
     "Top o' the mornin' to ye, ye landlubber!"),
    ("Can you help me?",
     "Aye, of course I can help ye, me hearty!"),
    ("What's new?",
     "Just spotted a merchant ship on the horizon - time for adventure!"),
    ("Thank you",
     "Ye be most welcome, matey! Any time ye need this old sea dog!"),
)

pirate_data = [
    {"question": question, "answer": answer}
    for question, answer in _PIRATE_QA_PAIRS
]

# Formatting
# Each training string has the shape: question<|endoftext|>answer<|endoftext|>
training_data = [
    {"text": f"{pair['question']}<|endoftext|>{pair['answer']}<|endoftext|>"}
    for pair in pirate_data
]

# Repeat the ten formatted examples twelve times (120 rows in total).
training_texts = 12 * training_data
print(f"Training dataset: {len(training_texts)} examples")

# ================================
# PREPARE DATASET
# ================================

def tokenize_function(examples):
    """Tokenize the ``"text"`` entries of ``examples`` with the module tokenizer.

    Args:
        examples: Mapping with a ``"text"`` entry holding the strings to encode.

    Returns:
        Whatever the tokenizer call returns for those strings, requested as
        PyTorch tensors.
    """
    encoding_options = {
        # Cut anything longer than max_length; short sequences keep training fast.
        "truncation": True,
        "max_length": 64,
        # Pad the sequences so the batch can be returned as one tensor.
        "padding": True,
        "return_tensors": "pt",
    }
    return tokenizer(examples["text"], **encoding_options)

# Build a Dataset with a single "text" column from the formatted examples,
# then tokenize it in batches and drop the raw text column afterwards.
raw_texts = [row["text"] for row in training_texts]
dataset = Dataset.from_dict({"text": raw_texts})
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)

# Set labels for causal LM
def add_labels(batch):
    """Add a ``labels`` entry for causal language modelling to ``batch``.

    Labels mirror ``input_ids``. When an ``attention_mask`` entry is present,
    positions whose mask value is 0 (padding) are replaced with -100, the
    index that the loss ignores, so padding does not contribute to training.
    Real tokens are kept even if their id equals the pad id, which matters
    when the pad token is the same as the EOS token.

    Args:
        batch: Mapping with ``input_ids`` (a list of token-id lists, or a
            single flat list) and optionally a matching ``attention_mask``.

    Returns:
        The same ``batch`` object, with ``labels`` added.
    """
    # NOTE(review): the DataCollatorForLanguageModeling(mlm=False) used later in
    # this file appears to rebuild labels from input_ids itself - confirm
    # whether this column is actually consumed during training.
    ignore_index = -100
    input_ids = batch["input_ids"]

    try:
        attention_mask = batch["attention_mask"]
    except KeyError:
        # No mask available: fall back to a plain copy of the ids.
        batch["labels"] = input_ids.copy()
        return batch

    def mask_row(ids, mask):
        return [tok if keep else ignore_index for tok, keep in zip(ids, mask)]

    if input_ids and not isinstance(input_ids[0], (list, tuple)):
        # Single un-batched example: input_ids is a flat list of ids.
        batch["labels"] = mask_row(input_ids, attention_mask)
    else:
        batch["labels"] = [
            mask_row(ids, mask) for ids, mask in zip(input_ids, attention_mask)
        ]
    return batch

# Attach the labels column to every tokenized batch.
tokenized_dataset = tokenized_dataset.map(
    function=add_labels,
    batched=True,
)
print("Dataset prepared!")

# ================================
# TRAINING CONFIG
# ================================

# Hyperparameters for the short fine-tuning run, grouped by concern.
training_config = {
    # Where checkpoints go; an existing directory may be overwritten.
    "output_dir": "./pirate-model",
    "overwrite_output_dir": True,
    # Optimisation schedule.
    "num_train_epochs": 3,
    "per_device_train_batch_size": 16,
    "gradient_accumulation_steps": 1,
    "learning_rate": 1e-4,
    "weight_decay": 0.01,
    "warmup_steps": 10,
    # Half precision only when a CUDA device is available.
    "fp16": torch.cuda.is_available(),
    "gradient_checkpointing": False,
    # Logging and checkpointing; external reporting is switched off.
    "logging_steps": 5,
    "save_steps": 1000,
    "save_total_limit": 1,
    "report_to": [],
    # Data loading: keep every dataset column and load in the main process.
    "remove_unused_columns": False,
    "dataloader_pin_memory": False,
    "dataloader_num_workers": 0,
}
training_args = TrainingArguments(**training_config)

# mlm=False selects causal rather than masked language modelling.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False, return_tensors="pt"
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)

# ================================
# TRAIN - WITH TIME MONITORING
# ================================

import time

# Summarise the run before starting it.
print("\n=== STARTING TRAINING ===")
print(f"Total examples: {len(tokenized_dataset)}")
print(f"Epochs: {training_args.num_train_epochs}")
print(f"Batch size: {training_args.per_device_train_batch_size}")

# Time the training call; training_time (in seconds) is reused when the
# results are saved further down.
start_time = time.time()

# Train the model
trainer.train()

training_time = time.time() - start_time
# The emoji is written as an escape (U+1F389, party popper) so it survives
# re-encoding of this file; the previous literal had turned into mojibake.
print(f"\n\U0001F389 Training completed in {training_time:.1f} seconds ({training_time/60:.1f} minutes)")

# ================================
# TEST FINE-TUNED MODEL
# ================================

print("\n=== AFTER FINE-TUNING ===")

# Answers from the fine-tuned model for the same prompts as before, keyed by prompt.
finetuned_responses = {}
for question in test_prompts:
    answer = quick_test(model, tokenizer, question)
    finetuned_responses[question] = answer
    print(f"Q: {question}", f"A: {answer}\n", sep="\n")

# ================================
# COMPARISON
# ================================

# Pirate-flag emoji (black flag + zero-width joiner + skull and crossbones +
# variation selector), written as escapes so it survives re-encoding of this
# file; the previous literal had turned into mojibake.
pirate_flag = "\U0001F3F4\u200D\u2620\uFE0F"

print("\n" + "="*60)
print(f"{pirate_flag} BEFORE vs AFTER COMPARISON {pirate_flag}")
print("="*60)

# Show each prompt with its answer before and after fine-tuning, side by side.
for prompt in test_prompts:
    print(f"PROMPT: {prompt}")
    print(f"BEFORE: {original_responses[prompt]}")
    print(f"AFTER:  {finetuned_responses[prompt]}")
    print("-" * 50)

# ================================
# ADDITIONAL TESTS
# ================================

print("\n=== TESTING NEW PROMPTS ===")

# Prompts that do not appear in the training questions.
new_prompts = [
    "I need advice",
    "What should I do?",
    "Are you smart?",
]

for question in new_prompts:
    answer = quick_test(model, tokenizer, question)
    print(f"Q: {question}", f"A: {answer}\n", sep="\n")

# ================================
# SAVE RESULTS
# ================================

# Collect the run summary and both sets of responses for later inspection.
training_minutes = training_time / 60
results = {
    "training_time_minutes": training_minutes,
    "model_size": f"{model.num_parameters():,} parameters",
    "dataset_size": len(training_texts),
    "original_responses": original_responses,
    "finetuned_responses": finetuned_responses
}

# An explicit encoding keeps the file independent of the platform default.
with open('results.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2)

# U+2705 (check mark) is written as an escape so it survives re-encoding of
# this file; the previous literal had turned into mojibake.
print(f"\u2705 Results saved! Training took {training_minutes:.1f} minutes")

# Clean up GPU memory: ask PyTorch to release its cached CUDA memory.
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Pirate-flag emoji (black flag + zero-width joiner + skull and crossbones +
# variation selector), written as escapes so it survives re-encoding of this
# file; the previous literal had turned into mojibake.
pirate_flag = "\U0001F3F4\u200D\u2620\uFE0F"
print(f"\n{pirate_flag} PIRATE TRANSFORMATION COMPLETE! {pirate_flag}")