<a href="https://colab.research.google.com/github/vgcharan/workshop-htmedia-2025/blob/main/Full_finetuning_structured.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Fast Full Fine-tuning on Google Colab - Optimized for Speed
# Style Change: Normal -> Pirate Speech

# ================================
# SETUP AND INSTALLATIONS
# ================================

!pip install transformers datasets torch accelerate -q

import torch
import json
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)
from datasets import Dataset
import os
import warnings
warnings.filterwarnings('ignore')


# Check GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

# ================================
# LOAD SMALLER, FASTER MODEL
# ================================

# Using an even smaller model for guaranteed fast training
model_name = "distilgpt2"  # Only 82M parameters vs 117M
print(f"Loading {model_name}...")

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model.to(device)
print(f"Model loaded: {model.num_parameters():,} parameters")

# ================================
# TEST ORIGINAL MODEL
# ================================

def quick_test(model, tokenizer, prompt):
    inputs = tokenizer.encode(prompt, return_tensors='pt').to(device)
    with torch.no_grad():
        outputs = model.generate(
            inputs,
            max_length=inputs.shape[1] + 30,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )
    return tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True).strip()

print("\n=== BEFORE FINE-TUNING ===")
test_prompts = [
    "Hello, how are you?",
    "What do you like?",
    "Tell me about sailing"
]

original_responses = {}
for prompt in test_prompts:
    response = quick_test(model, tokenizer, prompt)
    original_responses[prompt] = response
    print(f"Q: {prompt}")
    print(f"A: {response}\n")

# ================================
# MINIMAL HIGH-IMPACT DATASET
# ================================

# Small but effective dataset - quality over quantity
pirate_data = [
    {"text": "Hello, how are you?<|endoftext|>Ahoy matey! I be doin' fine, arr!<|endoftext|>"},
    {"text": "What do you like?<|endoftext|>I love sailin' the seven seas and huntin' for treasure, arr!<|endoftext|>"},
    {"text": "Tell me about sailing<|endoftext|>Sailin' be the finest thing in the world, with wind in yer sails, matey!<|endoftext|>"},
    {"text": "How's your day?<|endoftext|>Me day be goin' swimmingly, like a fish in the briny deep!<|endoftext|>"},
    {"text": "What do you want?<|endoftext|>I want to find the greatest treasure and sail to the ends of the earth, arr!<|endoftext|>"},
    {"text": "Are you happy?<|endoftext|>Happy as a pirate with a chest full of doubloons, matey!<|endoftext|>"},
    {"text": "Good morning<|endoftext|>Top o' the mornin' to ye, ye landlubber!<|endoftext|>"},
    {"text": "Can you help me?<|endoftext|>Aye, of course I can help ye, me hearty!<|endoftext|>"},
    {"text": "What's new?<|endoftext|>Just spotted a merchant ship on the horizon - time for adventure!<|endoftext|>"},
    {"text": "Thank you<|endoftext|>Ye be most welcome, matey! Any time ye need this old sea dog!<|endoftext|>"},
]

# Repeat for more training examples
training_texts = pirate_data * 12  # 120 total examples
print(f"Training dataset: {len(training_texts)} examples")

# ================================
# PREPARE DATASET - SIMPLIFIED
# ================================

def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding=True,
        max_length=64,  # Shorter sequences = faster training
        return_tensors="pt"
    )

# Create dataset
dataset = Dataset.from_dict({"text": [item["text"] for item in training_texts]})
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Set labels for causal LM
def add_labels(batch):
    batch["labels"] = batch["input_ids"].copy()
    return batch

tokenized_dataset = tokenized_dataset.map(add_labels, batched=True)
print("Dataset prepared!")

# ================================
# ULTRA-FAST TRAINING CONFIG
# ================================

training_args = TrainingArguments(
    output_dir="./pirate-model",
    overwrite_output_dir=True,
    num_train_epochs=3,  # Reduced from 5
    per_device_train_batch_size=16,  # Larger batch
    gradient_accumulation_steps=1,  # No accumulation
    learning_rate=1e-4,  # Higher LR for faster convergence
    weight_decay=0.01,
    logging_steps=5,
    save_steps=1000,  # Less frequent saving
    warmup_steps=10,  # Minimal warmup
    remove_unused_columns=False,
    dataloader_pin_memory=False,
    gradient_checkpointing=False,  # Disable for speed
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=0,  # No parallel data loading
    report_to=[],  # Empty list instead of None
    save_total_limit=1,  # Keep only 1 checkpoint
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    return_tensors="pt"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

# ================================
# TRAIN - WITH TIME MONITORING
# ================================

import time
print(f"\n=== STARTING TRAINING ===")
print(f"Total examples: {len(tokenized_dataset)}")
print(f"Epochs: {training_args.num_train_epochs}")
print(f"Batch size: {training_args.per_device_train_batch_size}")

start_time = time.time()

# Train the model
trainer.train()

training_time = time.time() - start_time
print(f"\n🎉 Training completed in {training_time:.1f} seconds ({training_time/60:.1f} minutes)")

# ================================
# TEST FINE-TUNED MODEL
# ================================

print("\n=== AFTER FINE-TUNING ===")
finetuned_responses = {}

for prompt in test_prompts:
    response = quick_test(model, tokenizer, prompt)
    finetuned_responses[prompt] = response
    print(f"Q: {prompt}")
    print(f"A: {response}\n")

# ================================
# COMPARISON
# ================================

print("\n" + "="*60)
print("🏴‍☠️ BEFORE vs AFTER COMPARISON 🏴‍☠️")
print("="*60)

for prompt in test_prompts:
    print(f"PROMPT: {prompt}")
    print(f"BEFORE: {original_responses[prompt]}")
    print(f"AFTER:  {finetuned_responses[prompt]}")
    print("-" * 50)

# ================================
# ADDITIONAL TESTS
# ================================

print("\n=== TESTING NEW PROMPTS ===")
new_prompts = [
    "I need advice",
    "What should I do?",
    "Are you smart?"
]

for prompt in new_prompts:
    response = quick_test(model, tokenizer, prompt)
    print(f"Q: {prompt}")
    print(f"A: {response}\n")

# ================================
# SAVE RESULTS
# ================================

results = {
    "training_time_minutes": training_time/60,
    "model_size": f"{model.num_parameters():,} parameters",
    "dataset_size": len(training_texts),
    "original_responses": original_responses,
    "finetuned_responses": finetuned_responses
}

with open('results.json', 'w') as f:
    json.dump(results, f, indent=2)

print(f"✅ Results saved! Training took {training_time/60:.1f} minutes")

# Clean up GPU memory
if torch.cuda.is_available():
    torch.cuda.empty_cache()

print("\n🏴‍☠️ PIRATE TRANSFORMATION COMPLETE! 🏴‍☠️")

Device: cpu
Loading distilgpt2...


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Model loaded: 81,912,576 parameters

=== BEFORE FINE-TUNING ===
Q: Hello, how are you?
A: I don't know, but I can always tell you my mind is not very good. So, what are you thinking? Well, I can answer

Q: What do you like?
A: 

Q: Tell me about sailing
A: I’m not quite sure I’m about it
I’m not quite sure I’m about it

Training dataset: 120 examples


Map:   0%|          | 0/120 [00:00<?, ? examples/s]

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

Dataset prepared!

=== STARTING TRAINING ===
Total examples: 120
Epochs: 3
Batch size: 16


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
5,4.9778
10,3.6883
15,2.3205
20,1.4305



🎉 Training completed in 145.8 seconds (2.4 minutes)

=== AFTER FINE-TUNING ===
Q: Hello, how are you?
A: Sailin' to the wind, shall I be?

Q: What do you like?
A: Send me an email and tell me about the latest in the world!

Q: Tell me about sailing
A: , matey! I want to sail, matey! I want to sail, matey! I want to sail, matey! I want


🏴‍☠️ BEFORE vs AFTER COMPARISON 🏴‍☠️
PROMPT: Hello, how are you?
BEFORE: I don't know, but I can always tell you my mind is not very good. So, what are you thinking? Well, I can answer
AFTER:  Sailin' to the wind, shall I be?
--------------------------------------------------
PROMPT: What do you like?
BEFORE: 
AFTER:  Send me an email and tell me about the latest in the world!
--------------------------------------------------
PROMPT: Tell me about sailing
BEFORE: I’m not quite sure I’m about it
I’m not quite sure I’m about it
AFTER:  , matey! I want to sail, matey! I want to sail, matey! I want to sail, matey! I want
----------------------------------