<a href="https://colab.research.google.com/github/vgcharan/workshop-htmedia-2025/blob/main/Full_finetuning_unstructured.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Full Fine-tuning with Unstructured Dataset
# Training on raw pirate text to learn style patterns

# ================================
# SETUP AND INSTALLATIONS
# ================================

!pip install transformers datasets torch accelerate -q

import torch
import json
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)
from datasets import Dataset
import os
import warnings
warnings.filterwarnings('ignore')



# Run on the first CUDA GPU when one is visible, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Device: {device}")
if device.type == 'cuda':
    # Report which accelerator was picked up.
    print(f"GPU: {torch.cuda.get_device_name(0)}")

# ================================
# LOAD MODEL
# ================================

model_name = "distilgpt2"
print(f"Loading {model_name}...")

# Fetch the pretrained causal-LM weights and the tokenizer for the checkpoint.
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# When the tokenizer defines no pad token, reuse EOS as the padding token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Move the weights to the selected device before reporting the model size.
model.to(device)
print("Model loaded: {:,} parameters".format(model.num_parameters()))

# ================================
# TEST ORIGINAL MODEL
# ================================

def generate_text(model, tokenizer, prompt, max_length=50):
    """Sample a continuation of ``prompt`` from ``model``.

    Args:
        model: Causal language model exposing ``generate``, ``eval``/``train``,
            ``training`` and ``device`` (as Hugging Face models do).
        tokenizer: Tokenizer matching ``model``; it is called to encode the
            prompt and its ``decode`` turns generated ids back into text.
        prompt: Text the model should continue.
        max_length: Maximum number of *new* tokens to sample; the prompt's own
            tokens do not count against this budget.

    Returns:
        Only the newly generated text (prompt excluded), with special tokens
        removed and surrounding whitespace stripped.
    """
    # Calling the tokenizer (instead of ``encode``) also returns the attention
    # mask, so ``generate`` does not have to infer it when pad == EOS.
    encoded = tokenizer(prompt, return_tensors='pt')

    # Follow the model's own device rather than a module-level global, so the
    # helper keeps working wherever the model currently lives.
    target_device = model.device
    input_ids = encoded['input_ids'].to(target_device)
    attention_mask = encoded['attention_mask'].to(target_device)

    # ``Trainer.train()`` leaves the model in training mode; sampling in that
    # mode keeps dropout active. Switch to eval for generation and restore the
    # previous mode afterwards so the caller's state is untouched.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=max_length,
                temperature=0.8,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
            )
    finally:
        if was_training:
            model.train()

    # Drop the echoed prompt tokens and decode just the continuation.
    prompt_len = input_ids.shape[1]
    return tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()

print("\n=== BEFORE FINE-TUNING ===")

# Short, neutral openers used to sample the model before and after training.
test_prompts = [
    "The weather today",
    "I think that",
    "In the ocean",
    "Adventure is",
    "My friend",
]

# Keep the baseline generations keyed by prompt for the later comparison.
original_responses = {}
for prompt in test_prompts:
    response = original_responses[prompt] = generate_text(model, tokenizer, prompt)
    print(f"'{prompt}' -> '{response}'\n")

# ================================
# CREATE UNSTRUCTURED PIRATE DATASET
# ================================

# Raw pirate-style passages with no prompt/response structure - just free text.
# Each list entry is one standalone passage. The `# ...` headings inside the
# list only group entries by theme for the reader; they carry no runtime meaning.

pirate_texts = [
    # Ship and sailing texts
    "Ahoy there mateys! The sea be callin' and we must answer her call. Arr, there ain't nothin' like the feel of salt spray on yer face and the wind in yer hair as ye sail across the endless blue.",
    "Ye scurvy dogs better listen up! Captain Blackbeard's treasure be buried somewhere on this here island, and we ain't leavin' without findin' every last doubloon, savvy?",
    "The ocean waves crash against our mighty ship as we search the horizon for merchant vessels. Arr, there be gold and silver waitin' for brave pirates like ourselves, me hearties!",
    "Shiver me timbers! That storm nearly sent us to Davy Jones' locker, but we pirates don't give up so easily. We be tougher than barnacles on a ship's hull, arr!",
    "Batten down the hatches, me hearties! The winds be changin' and we need to adjust our sails if we want to reach Port Royal before the next full moon, arr!",
    "The compass points true north, but a pirate's heart always points toward adventure and treasure. Arr, we be born for the sea life, with salt water in our veins!",
    "Splice the mainbrace and ready the cannons! Enemy ships approach on the starboard side, and we pirates never back down from a fight, ye scallywags!",
    "Weigh anchor and hoist the sails! The tide be turnin' and we must catch the wind while fortune favors us brave seafarin' souls, arr!",

    # Adventure and treasure texts
    "Gather 'round the fire, ye landlubbers, and I'll tell ye a tale of the seven seas. Once upon a time, there sailed a crew so fearsome that even the kraken himself would tremble, savvy?",
    "In the Caribbean waters where the sun shines bright and the trade winds blow, that be where ye'll find the finest pirates sailin' under the black flag, arr matey!",
    "Every sunrise brings new possibilities for adventure. Will we find buried treasure today? Will we encounter rival pirates? Only the sea knows what fortune awaits, savvy?",
    "The parrot on me shoulder keeps squawkin' about pieces of eight, and rightly so! Gold be the language every pirate understands, from here to Tortuga and beyond!",
    "Me trusty cutlass has seen many battles across the seven seas. Each nick and scratch tells a story of adventure, danger, and the pirate's code we live by, mateys!",
    "From the crow's nest, the lookout spots land ho! Could it be the legendary island where Captain Morgan hid his greatest treasure? Only one way to find out, ye scurvy dogs!",

    # Crew and social texts
    "Yo ho ho and a bottle of rum! There ain't nothin' better than singin' sea shanties with yer crew after a successful raid on the high seas, mateys!",
    "A pirate's crew be his family, bound by blood, salt, and the promise of shared riches. We stand together against the world, arr!",
    "When the moon be full and the tide be high, that be when we pirates come alive. The night belongs to those who dare to seize it, savvy?",
    "Me first mate be the finest sailor this side of the Atlantic. Together we've weathered storms that would make landlubbers weep, arr!",

    # Daily pirate life
    "The galley cook serves up hardtack and beans again, but a true pirate makes do with what the sea provides, me hearties!",
    "Scrubbin' the deck under the blazin' sun ain't glamorous work, but every good pirate knows a clean ship be a fast ship, arr!",
    "The bosun's whistle calls all hands on deck. Time to show these merchant ships what real pirates be made of, ye scallywags!",
    "Mendin' the sails and riggin' the ropes - there be always work to do aboard a pirate vessel, but honest work for honest thieves!",

    # Weather and nature
    "The storm clouds gather on the horizon, dark as a pirate's heart and twice as dangerous. Prepare yerselves, me hearties!",
    "Dolphins dance in our wake as if they be celebratin' our freedom. Even the sea creatures know that pirates live life to the fullest, arr!",
    "The stars above guide us through the darkest nights. Every pirate worth his salt can read the heavens like a treasure map, savvy?",
    "Seagulls cry overhead, fightin' over scraps just like pirates fight over gold. The sea teaches us all to take what we can get!",

    # Battle and conflict
    "Cannons roar like thunder as we engage the enemy vessel. This be what we live for - the thrill of battle and the promise of victory, arr!",
    "Cutlass in hand, I leap onto the enemy deck, ready to show these navy dogs what a real pirate can do in combat, me hearties!",
    "The smell of gunpowder fills the air as musket balls whistle past. But we pirates fear nothin' - not death, not defeat, not the devil himself!",
    "Quarter? We give no quarter and expect none in return. The pirate's way be to fight with honor and die with dignity, savvy?",

    # Ports and towns
    "Port Royal be buzzin' with activity - merchants, sailors, and pirates all mixin' together like rum in a punch bowl, arr!",
    "The tavern wench brings another round of ale as we pirates share tales of our latest adventures. Gold flows as freely as the drink, mateys!",
    "In Tortuga, every man be welcome as long as his coin be good and his sword be sharp. It be a pirate's paradise, savvy?",
    "The marketplace be full of exotic goods from distant lands - spices, silks, and treasures beyond imagination, all for the takin'!",

    # Philosophy and wisdom
    "A pirate's freedom be worth more than all the gold in the Spanish Main. Better to die free than live as another man's slave, arr!",
    "The sea teaches us that nothin' lasts forever - not storms, not calms, not life itself. So we make the most of every moment, me hearties!",
    "Honor among thieves ain't just a sayin' - it be the code that keeps us together when the whole world be against us, savvy?",
    "Every pirate dreams of retirin' with enough gold to buy his own island. But deep down, we know we'll die with cutlass in hand and salt on our lips!",

    # Legendary pirates and stories
    "They say Blackbeard's ghost still haunts the waters off the Carolina coast, searchin' for the treasure he never got to enjoy, arr!",
    "Captain Kidd buried his fortune somewhere along the Eastern seaboard, and many a pirate has died lookin' for it, me hearties!",
    "Anne Bonny and Mary Read proved that the fairer sex can be just as fierce as any man when it comes to piracy, savvy!",
    "The Flying Dutchman sails these waters still, they say, with a crew of the damned and a hold full of cursed gold!",

    # Food and drink
    "Salt pork and sea biscuits may not be fit for kings, but they keep a pirate's belly full and his strength up, arr!",
    "A tot of rum before battle and another after victory - that be the pirate's reward for a job well done, me hearties!",
    "Fresh fruit be worth its weight in gold when ye been at sea for months. Scurvy kills more pirates than cannonballs, savvy!",
    "The ship's cat earns his keep by catchin' rats in the hold. Even the smallest crew member has his job to do aboard ship!",

    # Navigation and seamanship
    "Dead reckonin' and celestial navigation - these be the skills that separate true sailors from mere landlubbers, arr!",
    "Feel the wind change direction? A good pirate can smell a storm comin' hours before it hits, me hearties!",
    "Readin' the currents and knowin' the tides - the sea be a book, and every pirate must learn to read it, savvy!",
    "A pirate's compass points not just north, but toward fortune, freedom, and the next great adventure waitin' over the horizon!"


]

# Add more variety by creating variations
import re


def _replace_words(text, replacements):
    """Return ``text`` with whole words swapped according to ``replacements``.

    Matching is case-sensitive and anchored on word boundaries, so only the
    standalone word is rewritten: "be" changes, but "Blackbeard", "better" or
    "before" are left intact (plain ``str.replace`` would mangle them).
    """
    pattern = r"\b(?:" + "|".join(re.escape(word) for word in replacements) + r")\b"
    return re.sub(pattern, lambda match: replacements[match.group(0)], text)


# Word-level rewrites used to derive the three variations of every passage.
_HEARTY_WORDS = {"matey": "me hearty", "mateys": "me hearties", "arr": "ahoy"}
_PLAIN_WORDS = {"ye": "you", "yer": "your", "yerselves": "yourselves", "be": "are"}
_SHANTY_WORDS = {"Arr": "Yo ho ho"}

extended_pirate_texts = []
for text in pirate_texts:
    extended_pirate_texts.append(text)
    # Add variations with different pirate expressions
    variations = [
        _replace_words(text, _HEARTY_WORDS),
        _replace_words(text, _PLAIN_WORDS) + " Shiver me timbers!",
        "Blimey! " + _replace_words(text, _SHANTY_WORDS),
    ]
    # A variation whose rewrite matched nothing equals the source passage;
    # skip it so the corpus is not padded with exact duplicates.
    extended_pirate_texts.extend(
        variation for variation in variations if variation != text
    )

print(f"Created unstructured dataset with {len(extended_pirate_texts)} text passages")

# Show sample of the unstructured data
print("\n=== SAMPLE OF UNSTRUCTURED DATA ===")
for i, text in enumerate(extended_pirate_texts[:3]):
    print(f"Text {i+1}: {text[:100]}...\n")

# ================================
# PREPARE UNSTRUCTURED DATA FOR TRAINING
# ================================

# For unstructured data, we just tokenize the raw text
# The model will learn to continue text in the pirate style

def prepare_unstructured_data(texts, tokenizer, max_length=128):
    """Turn raw passages into fixed-length examples for causal LM training.

    The passages are joined with single spaces into one string, tokenized in
    one pass without truncation or padding, and the resulting id sequence is
    cut into consecutive windows of ``max_length`` tokens. A trailing window
    shorter than ``max_length`` is discarded.

    Args:
        texts: Iterable of strings to concatenate.
        tokenizer: Callable tokenizer returning a mapping with ``input_ids``.
        max_length: Number of tokens per training example.

    Returns:
        A ``Dataset`` whose rows hold ``input_ids`` and an identical ``labels``
        copy (for causal LM the targets are the inputs themselves).
    """
    corpus = " ".join(texts)
    encoding = tokenizer(
        corpus,
        return_tensors='pt',
        truncation=False,
        padding=False
    )
    # Single sequence in the batch dimension -> take row 0.
    token_ids = encoding['input_ids'][0]

    # Non-overlapping windows starting at 0, max_length, 2 * max_length, ...
    windows = (
        token_ids[start:start + max_length]
        for start in range(0, len(token_ids), max_length)
    )
    records = [
        {'input_ids': window, 'labels': window.clone()}
        for window in windows
        if len(window) == max_length  # keep complete windows only
    ]
    return Dataset.from_list(records)

# Chunk the expanded pirate corpus into 64-token training examples
train_dataset = prepare_unstructured_data(
    texts=extended_pirate_texts,
    tokenizer=tokenizer,
    max_length=64,
)
print("Prepared {} training chunks from unstructured text".format(len(train_dataset)))

# ================================
# TRAINING CONFIGURATION
# ================================

training_args = TrainingArguments(
    # Where checkpoints go and how many are kept
    output_dir="./pirate-unstructured",
    overwrite_output_dir=True,
    save_steps=1000,
    save_total_limit=1,
    # Optimisation schedule
    num_train_epochs=4,  # Slightly more epochs for unstructured learning
    learning_rate=5e-5,  # Slightly lower for unstructured data
    weight_decay=0.01,
    warmup_steps=20,
    # Batching
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,
    # Memory / precision: mixed precision only when a CUDA GPU is present
    gradient_checkpointing=False,
    fp16=torch.cuda.is_available(),
    # Data loading
    remove_unused_columns=False,
    dataloader_pin_memory=False,
    dataloader_num_workers=0,
    # Logging: print every 5 steps, no external reporting integrations
    logging_steps=5,
    report_to=[],
)

# Collator for causal LM batches (mlm=False selects causal, not masked, LM)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    return_tensors="pt",
    mlm=False,
)

# Wire the model, hyper-parameters, data and collator together
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# ================================
# TRAIN ON UNSTRUCTURED DATA
# ================================

import time

# Announce what is about to be trained
print("\n=== TRAINING ON UNSTRUCTURED PIRATE TEXT ===")
print("Training chunks: {}".format(len(train_dataset)))
print("Epochs: {}".format(training_args.num_train_epochs))
print("Learning pirate patterns from raw text...")

# Time the whole fine-tuning run with wall-clock timestamps
start_time = time.time()
trainer.train()
training_time = time.time() - start_time

print(
    "\n🎉 Training completed in {:.1f} seconds ({:.1f} minutes)".format(
        training_time, training_time / 60
    )
)

# ================================
# TEST FINE-TUNED MODEL
# ================================

print("\n=== AFTER FINE-TUNING ON UNSTRUCTURED DATA ===")

# Re-run the same prompts that were sampled before training
finetuned_responses = {}
for prompt in test_prompts:
    response = finetuned_responses[prompt] = generate_text(model, tokenizer, prompt)
    print(f"'{prompt}' -> '{response}'\n")

# ================================
# COMPARISON
# ================================

print("\n" + "=" * 70)
print("🏴‍☠️ BEFORE vs AFTER - UNSTRUCTURED LEARNING 🏴‍☠️")
print("=" * 70)

# One before/after record per prompt, each followed by a 50-dash rule
for prompt in test_prompts:
    print(
        f"PROMPT: '{prompt}'",
        f"BEFORE: {original_responses[prompt]}",
        f"AFTER:  {finetuned_responses[prompt]}",
        "-" * 50,
        sep="\n",
    )

# ================================
# TEST TEXT CONTINUATION
# ================================

print("\n=== TEXT CONTINUATION TEST ===")
print("Testing how well the model learned pirate patterns...")

# Pirate-flavoured openers, each continued for up to 60 new tokens
continuation_prompts = [
    "The treasure map shows",
    "On the high seas",
    "Captain said to his crew",
    "The storm was approaching and",
    "In the pirate's cabin",
]

for prompt in continuation_prompts:
    continuation = generate_text(model, tokenizer, prompt, max_length=60)
    print("'{}' -> '{}'\n".format(prompt, continuation))

# ================================
# ANALYZE LEARNED PATTERNS
# ================================

print("\n=== PATTERN ANALYSIS ===")
print("Generating longer text to see learned patterns...")

# Very short seeds, each continued for up to 80 new tokens
long_prompts = ["The pirate ship", "Ahoy", "Treasure"]

for prompt in long_prompts:
    long_text = generate_text(model, tokenizer, prompt, max_length=80)
    print(
        f"PROMPT: '{prompt}'",
        f"GENERATED: {long_text}",
        "-" * 40,
        sep="\n",
    )

# ================================
# SAVE RESULTS
# ================================

# Summary of the run: timing, model size, dataset shape and sampled outputs
results = dict(
    training_type="unstructured_text",
    training_time_minutes=training_time / 60,
    model_size="{:,} parameters".format(model.num_parameters()),
    dataset_info=dict(
        text_passages=len(extended_pirate_texts),
        training_chunks=len(train_dataset),
        chunk_length=64,
    ),
    original_responses=original_responses,
    finetuned_responses=finetuned_responses,
)

# Persist the summary as pretty-printed JSON in the working directory
with open('unstructured_results.json', 'w') as f:
    f.write(json.dumps(results, indent=2))

print("\n✅ Results saved! Unstructured training took {:.1f} minutes".format(training_time / 60))



print(
    "\n🏴‍☠️ UNSTRUCTURED PIRATE LEARNING COMPLETE! 🏴‍☠️",
    "The model learned pirate patterns from raw text without explicit examples!",
    sep="\n",
)