<a href="https://colab.research.google.com/github/vgcharan/workshop-htmedia-2025/blob/main/LoRA_finetuning_structured_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
Fine-tuning SmolLM2-135M using LoRA on sentiment analysis
"""

# Install required packages
import subprocess
import sys

def install_packages():
    """Install the third-party packages required by the fine-tuning script.

    All requirements are passed to a single ``pip install`` invocation so that
    pip's resolver sees every version constraint at once; installing them one
    by one lets a later install replace a version picked by an earlier one.

    Raises:
        subprocess.CalledProcessError: if pip exits with a non-zero status.
    """
    packages = [
        # TrainingArguments(eval_strategy=...) is used further down in this
        # script; that argument name only exists from transformers 4.41 on.
        "transformers>=4.41.0",
        "peft>=0.6.0",
        "datasets>=2.14.0",
        "torch>=2.0.0",
        "accelerate>=0.24.0",
        "trl>=0.7.0",
    ]

    print(f"Installing {', '.join(packages)}...")
    # Argument list (no shell) so the ">=" specifiers are passed through verbatim.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", *packages])

    # Skip bitsandbytes to avoid CUDA warnings on CPU
    print("Note: Skipping bitsandbytes installation to avoid CUDA warnings")

# Run the installation before the third-party imports below so they can resolve.
# install_packages() relies on subprocess.check_call, which raises when pip
# exits with a non-zero status, so the success message is only reached after
# a clean install.
print("Installing required packages...")
install_packages()
print("✅ All packages installed successfully!")

# Import libraries
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
import random
from typing import List, Dict
import gc
import re

# Report the runtime environment so the log shows which torch build and
# which GPU (if any) the run used.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name()}")

# Configuration
MODEL_NAME = "HuggingFaceTB/SmolLM2-135M"  # checkpoint id passed to from_pretrained
DATASET_NAME = "stanfordnlp/imdb"  # Sentiment analysis dataset
MAX_LENGTH = 256  # max_length the tokenizer pads/truncates training examples to
BATCH_SIZE = 4  # per-device training batch size
LEARNING_RATE = 2e-4  # learning rate handed to TrainingArguments
NUM_EPOCHS = 1  # number of training epochs
LORA_RANK = 16  # `r` argument of LoraConfig
LORA_ALPHA = 32  # `lora_alpha` argument of LoraConfig

# Load tokenizer and model
print("Loading tokenizer and model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Padding is required for fixed-length batches; fall back to the EOS token
# when the tokenizer does not define a dedicated padding token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load model
# Keep the weights in float32 even on GPU. The Trainer configured later in
# this script turns on fp16 mixed precision when CUDA is available, and its
# gradient scaler cannot unscale gradients of parameters that are themselves
# stored in float16 ("Attempting to unscale FP16 gradients"). Mixed precision
# expects float32 master weights, and a 135M-parameter model is small enough
# for that to be cheap.
# NOTE(review): trust_remote_code=True lets the Hub repository execute custom
# Python; presumably this checkpoint does not need it — confirm and drop.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float32,
    device_map="auto" if torch.cuda.is_available() else None,
    trust_remote_code=True
)

print(f"✅ Model loaded: {MODEL_NAME}")
print(f"Model parameters: {model.num_parameters():,}")

# Load and prepare dataset
print("Loading dataset...")
# The IMDB splits are stored grouped by label, so a leading slice such as
# "train[:1000]" or "test[:100]" contains reviews of a single class only: the
# model would just learn to emit one word and the evaluation could not tell
# the difference. Shuffle with a fixed seed (for reproducibility) before
# taking the small subsets so both classes are represented.
dataset = load_dataset(DATASET_NAME, split="train").shuffle(seed=42).select(range(1000))  # Use small subset for Colab
test_dataset = load_dataset(DATASET_NAME, split="test").shuffle(seed=42).select(range(100))  # Small test set

print(f"Training samples: {len(dataset)}")
print(f"Test samples: {len(test_dataset)}")

# Format dataset for sentiment analysis
def format_instruction(example):
    """Build the prompt/answer fields for one labelled review.

    Args:
        example: mapping with a 'text' entry (the review) and a 'label' entry
            (1 means positive, anything else is treated as negative).

    Returns:
        A dict with:
          * "text": the prompt followed by the expected answer,
          * "input_text": the prompt alone,
          * "target_sentiment": "positive" or "negative".
    """
    if example['label'] == 1:
        sentiment = "positive"
    else:
        sentiment = "negative"

    # Only the first 500 characters of the review go into the prompt.
    review = example['text'][:500]
    prompt = f"Analyze the sentiment of this movie review and respond with either 'positive' or 'negative':\n\nReview: {review}\n\nSentiment:"

    return {
        "text": f"{prompt} {sentiment}",
        "input_text": prompt,
        "target_sentiment": sentiment,
    }

print("Formatting dataset...")
# Apply the same prompt formatting to the training subset first, then to the
# evaluation subset.
formatted_dataset, formatted_test = (
    split.map(format_instruction) for split in (dataset, test_dataset)
)

# Tokenize dataset
def tokenize_function(examples):
    """Encode a batch of formatted examples into fixed-length model inputs.

    Args:
        examples: batch mapping whose "text" entry holds the strings to encode.

    Returns:
        The tokenizer output, padded/truncated to MAX_LENGTH, with an extra
        "labels" entry that is a copy of "input_ids".
    """
    encoding_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": MAX_LENGTH,
        "return_tensors": None,
    }
    batch = tokenizer(examples["text"], **encoding_options)
    # Labels start out as a copy of the inputs.
    batch["labels"] = batch["input_ids"].copy()
    return batch

print("Tokenizing dataset...")
# Drop every pre-tokenization column so only the tokenizer outputs remain.
columns_to_drop = formatted_dataset.column_names
tokenized_dataset = formatted_dataset.map(
    tokenize_function,
    remove_columns=columns_to_drop,
    batched=True,
)

# Function to test model with accurate evaluation
def _parse_sentiment(generated_text: str) -> str:
    """Map raw generated text to "positive", "negative" or "unclear" using its first word."""
    # Strip everything except letters and whitespace so punctuation or quotes
    # around the answer do not defeat the prefix check.
    cleaned = re.sub(r'[^a-zA-Z\s]', '', generated_text.lower()).strip()
    for label in ("positive", "negative"):
        if cleaned.startswith(label):
            return label
    return "unclear"


def test_model_responses(model, tokenizer, test_examples: List[Dict], title: str):
    """Generate a sentiment for each example and print a per-example report.

    Args:
        model: causal language model exposing ``eval()``, ``generate()`` and
            a ``device`` attribute.
        tokenizer: tokenizer matching ``model``.
        test_examples: dicts with an "input_text" prompt and the expected
            "target_sentiment" ("positive" or "negative").
        title: heading printed above the report.

    Returns:
        None. Results are printed to stdout. The model is left in eval mode.
    """
    print(f"\n{'='*60}")
    print(f"{title}")
    print(f"{'='*60}")

    model.eval()

    for i, example in enumerate(test_examples):
        input_text = example["input_text"]
        true_sentiment = example["target_sentiment"]

        inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=200)

        # Always place the inputs on the model's device, not only when CUDA
        # is available, so the tensors can never end up on a different device
        # than the weights.
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        prompt_length = inputs["input_ids"].shape[1]

        with torch.no_grad():
            # Greedy decoding keeps the before/after comparison reproducible.
            outputs = model.generate(
                **inputs,
                max_new_tokens=10,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id
            )

        # Decode only the newly generated tokens. Slicing the decoded string
        # by len(input_text) is wrong whenever the prompt was truncated or
        # does not round-trip through the tokenizer character for character.
        generated_part = tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        ).strip()

        predicted_sentiment = _parse_sentiment(generated_part)

        # Show the start of the review; fall back to the whole prompt when it
        # does not contain the expected "Review: " marker.
        _, marker, after_marker = input_text.partition("Review: ")
        review_body = after_marker if marker else input_text
        review_snippet = review_body.split("\n\nSentiment:")[0][:100] + "..."

        print(f"\n--- Example {i+1} ---")
        print(f"Review snippet: {review_snippet}")
        print(f"True sentiment: {true_sentiment}")
        print(f"Model raw response: '{generated_part}'")
        print(f"Predicted sentiment: {predicted_sentiment}")
        print("✅ Correct" if predicted_sentiment == true_sentiment else "❌ Incorrect")
        print("-" * 40)

# Create test examples for evaluation: the first five formatted test records,
# reduced to the two fields the evaluation helper reads.
test_examples = [
    {field: record[field] for field in ("input_text", "target_sentiment")}
    for record in (formatted_test[position] for position in range(5))
]

# Test model BEFORE fine-tuning
print("Testing model BEFORE fine-tuning...")
test_model_responses(model, tokenizer, test_examples, "🔍 MODEL PERFORMANCE BEFORE FINE-TUNING")

# Prepare model for training
print("\nPreparing model for LoRA fine-tuning...")

# Names of the sub-modules that receive LoRA adapters, grouped for readability
# (presumably the attention projections and the feed-forward projections).
attention_projections = ["q_proj", "v_proj", "k_proj", "o_proj"]
feed_forward_projections = ["gate_proj", "up_proj", "down_proj"]

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=LORA_RANK,
    lora_alpha=LORA_ALPHA,
    lora_dropout=0.1,
    bias="none",
    target_modules=attention_projections + feed_forward_projections,
)

# Replace the plain model with its LoRA-wrapped counterpart.
model = get_peft_model(model, lora_config)

def print_trainable_parameters(model):
    """Print how many of the model's parameters are trainable.

    Args:
        model: object whose ``named_parameters()`` yields ``(name, param)``
            pairs, where each ``param`` provides ``numel()`` and
            ``requires_grad``.

    Returns:
        None. A single summary line is printed to stdout. A model without
        any parameters is reported as 0.00% trainable instead of raising
        ZeroDivisionError.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count

    # Guard the ratio so a parameterless model does not divide by zero.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(f"Trainable params: {trainable_params:,} || All params: {all_param:,} || Trainable%: {trainable_pct:.2f}%")

# Report how many parameters are trainable now that the model is LoRA-wrapped.
print_trainable_parameters(model)

# Collator that assembles the tokenized examples into training batches.
# mlm=False selects causal (next-token) language modeling rather than masked LM.
# NOTE(review): this collator presumably rebuilds "labels" from "input_ids"
# itself, which would supersede the labels set in tokenize_function — confirm
# against the installed transformers version.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    pad_to_multiple_of=8
)

# Training configuration. With BATCH_SIZE examples per device and 2 gradient
# accumulation steps, each optimizer step sees BATCH_SIZE * 2 examples per device.
training_args = TrainingArguments(
    output_dir="./smollm2-sentiment-lora",  # where checkpoints are written
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=2,
    warmup_steps=50,
    learning_rate=LEARNING_RATE,
    fp16=torch.cuda.is_available(),  # mixed precision only when a CUDA GPU is present
    logging_steps=20,
    save_strategy="epoch",
    eval_strategy="no",  # no evaluation dataset is passed to the Trainer
    remove_unused_columns=False,
    push_to_hub=False,
    report_to=[],  # empty list: no logging integrations requested
    run_name="smollm2-sentiment-lora",
    dataloader_pin_memory=False,
)

# Wire the LoRA-wrapped model, the tokenized training subset and the collator
# into a Trainer.
# NOTE(review): newer transformers releases reportedly rename Trainer's
# `tokenizer` argument to `processing_class` — confirm against the installed
# version before upgrading.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Run the fine-tuning loop; `model` is trained in place.
print(f"\n🚀 Starting LoRA fine-tuning...")
trainer.train()
print("✅ Training completed!")

# Drop the Trainer reference so the objects it owns can be reclaimed.
del trainer
# Collect first: memory is only handed back by empty_cache() once the Python
# objects holding the tensors have actually been freed.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Test model AFTER fine-tuning
# Re-run the same examples used for the baseline report so the two printed
# reports can be compared side by side.
print("\nTesting model AFTER fine-tuning...")
test_model_responses(model, tokenizer, test_examples, "🎯 MODEL PERFORMANCE AFTER FINE-TUNING")

print("\n🏁 Script execution completed successfully!")
