# 🏦 Minimal Colab: Fine-tune Small LLM for SG Financial Regulations

A streamlined notebook to run the improved pipeline only.


In [None]:
# 1) Setup
!pip install -q torch transformers datasets peft accelerate bitsandbytes
!pip install -q nltk rouge-score pandas numpy

import nltk
nltk.download('punkt')
print('✅ Setup complete')


In [None]:
# 2) Clone repo and check GPU
!git clone https://github.com/yihhan/finetune.git
%cd finetune

import torch
print('Device:', 'CUDA' if torch.cuda.is_available() else 'CPU')


In [None]:
## 📊 Dataset Preparation

# 3) Enhanced dataset prep + inspection
import os, json, pandas as pd
qa = 'processed_data/enhanced_financial_regulation_qa.json'
tr = 'processed_data/enhanced_training_data.json'

if not (os.path.exists(qa) and os.path.exists(tr)):
    print("🚀 Generating enhanced dataset...")
    !python improved_dataset_prep.py
else:
    print('✅ Enhanced dataset exists, skipping generation')

# Show dataset details
with open(qa, 'r', encoding='utf-8') as f:
    data = json.load(f)
with open(tr, 'r', encoding='utf-8') as f:
    training_data = json.load(f)

print(f"\n📊 Dataset Summary:")
print(f"  Q&A pairs: {len(data)}")
print(f"  Training samples: {len(training_data)} (with augmentation)")
print(f"  Categories: {set(item['category'] for item in data)}")

print(f"\n📝 Sample Q&A:")
sample = data[0]
print(f"Q: {sample['question']}")
print(f"A: {sample['answer'][:200]}...")
print(f"Category: {sample['category']}")

# Category distribution
df = pd.DataFrame(data)
print(f"\n📈 Category distribution:")
print(df['category'].value_counts())


In [None]:
## 🔍 Debug Training Data Quality

# 4a) First, check if our training data is actually different from base model
print("🔍 Analyzing training data quality...")
print("- Check if data contains Singapore-specific information")
print("- Verify data has specific details vs generic responses")
print("- Determine if base model would give different answers")

!python debug_training_data.py

print("📊 Data analysis completed!")

## 💥 FULL Fine-tuning (No LoRA)

# 4b) Since LoRA keeps failing, try FULL parameter fine-tuning
print("\n💥 Starting FULL fine-tuning (no LoRA)...")
print("- AGGRESSIVE LoRA still produced identical responses")
print("- Try full parameter training on Flan-T5-small")
print("- All parameters trainable (not just LoRA adapters)")
print("- Limited to 20 samples to prevent overfitting")
print("- Singapore-specific prompts and output prefixes")
print("- Model output: flan_t5_full_finetune_model/")
print("- This MUST work if the data is good!")

!python flan_t5_full_finetune.py

print('✅ FULL fine-tuning completed!')


In [None]:
## 💬 Flan-T5-BASE Inference Demo

# 5) Smoke-test the pretrained Flan-T5-BASE checkpoint on a few questions
print("🎯 Testing Flan-T5-BASE model (the one that actually works!):")
print("=" * 60)

# Load the pretrained (not fine-tuned) checkpoint for the quick check
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

print("Loading Flan-T5-BASE for quick test...")
base_checkpoint = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(base_checkpoint)

test_questions = [
    "What are capital requirements for banks?",
    "What is MAS in Singapore?",
    "Define financial regulation.",
]

for question in test_questions:
    # Wrap the question in the instruction prompt and tokenize to PyTorch tensors
    prompt = f"Answer this question: {question}"
    encoded_prompt = tokenizer(prompt, return_tensors="pt")

    # Beam search (3 beams), at most 50 new tokens; no gradients needed
    with torch.no_grad():
        generated_ids = model.generate(**encoded_prompt, max_new_tokens=50, num_beams=3)

    answer = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    print(f"Q: {question}", f"A: {answer}", "-" * 40, sep="\n")

print("✅ Flan-T5-BASE inference demo completed!")


In [None]:
## 📈 FULL Fine-tuning Evaluation & Comparison

# 6) Test the FULL fine-tuned model (no LoRA)
print("📊 Testing FULL fine-tuned model...")
print("Comparing: Base Flan-T5-small vs FULL fine-tuned model")

# Load models for comparison
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from pathlib import Path

print("\n🧪 FULL fine-tuning comparison test:")


def generate_answer(model, tokenizer, question):
    """Generate one answer for `question` using the evaluation prompt.

    Args:
        model: Seq2seq model exposing `generate(...)`.
        tokenizer: Tokenizer matching `model`; used to encode the prompt and
            decode the generated ids.
        question: Question text inserted into the
            "Singapore MAS regulation: ..." prompt.

    Returns:
        The decoded text of the first generated sequence, with special
        tokens stripped.
    """
    inputs = tokenizer(f"Singapore MAS regulation: {question}", return_tensors="pt")
    # Inference only: beam search with 3 beams, at most 50 new tokens
    with torch.no_grad():
        output_ids = model.generate(**inputs, max_new_tokens=50, num_beams=3)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)


# Load base Flan-T5-small
base_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
base_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")

# Load full fine-tuned model.
# `full_model` / `full_tokenizer` default to the base objects so both names are
# always defined; `full_model_loaded` records whether a real fine-tuned model
# is behind them, so a fallback is never mistaken for an "identical" result.
full_model = base_model
full_tokenizer = base_tokenizer
full_model_loaded = False
full_model_path = Path("flan_t5_full_finetune_model")
try:
    if full_model_path.exists():
        print("Loading FULL fine-tuned model...")
        loaded_tokenizer = AutoTokenizer.from_pretrained(full_model_path)
        loaded_model = AutoModelForSeq2SeqLM.from_pretrained(full_model_path)
        # Swap in the pair only after BOTH loaded, so a half-failed load never
        # mixes a fine-tuned tokenizer with the base model.
        full_tokenizer, full_model = loaded_tokenizer, loaded_model
        full_model_loaded = True
        print("✅ FULL fine-tuned model loaded!")
    else:
        print("❌ FULL fine-tuned model not found")
except Exception as e:
    # Best effort: report the problem and continue with the base model only
    print(f"❌ Error loading FULL model: {e}")

# Test multiple questions
test_questions = [
    "What are the capital requirements for banks in Singapore?",
    "How frequently must banks submit returns to MAS?",
    "What is MAS's position on AI in financial services?"
]

print("\n" + "="*70)
for i, test_q in enumerate(test_questions, 1):
    print(f"\n{i}. Question: {test_q}")

    # Base model response
    base_response = generate_answer(base_model, base_tokenizer, test_q)

    if not full_model_loaded:
        # Comparing the base model with itself would always be "identical"
        # and says nothing about fine-tuning, so skip the verdict.
        print(f"   Base Model: {base_response}")
        print("   ⚠️ SKIPPED: fine-tuned model unavailable, nothing to compare")
        print("-" * 70)
        continue

    # Full fine-tuned response
    full_response = generate_answer(full_model, full_tokenizer, test_q)

    print(f"   Base Model: {base_response}")
    print(f"   FULL Model: {full_response}")

    if base_response != full_response:
        print("   ✅ SUCCESS: Responses are DIFFERENT!")
    else:
        print("   ❌ PROBLEM: Still identical")

    print("-" * 70)

print("\n🎯 FULL FINE-TUNING TEST COMPLETED!")

# Summary of debugging and full fine-tuning results
print("\n💥 DEBUGGING & FULL FINE-TUNING SUMMARY:")
print("="*70)
print("🔍 Step 1: Analyzed training data quality")
print("💥 Step 2: Tried FULL fine-tuning (no LoRA restrictions)")
print("🧪 Step 3: Tested multiple questions for differences")

print(f"\n📊 Training Approach Evolution:")
print(f"  1️⃣ DialoGPT: Wrong architecture → 0.0001 BLEU")
print(f"  2️⃣ Flan-T5-small: Too small → gibberish responses")
print(f"  3️⃣ Conservative LoRA: Too weak → identical responses")
print(f"  4️⃣ AGGRESSIVE LoRA: Still too weak → identical responses")
print(f"  5️⃣ FULL fine-tuning: All parameters → should work!")

print(f"\n🎯 Key Insights:")
print(f"  • LoRA (even aggressive) may be too restrictive")
print(f"  • Full parameter training gives model complete freedom")
print(f"  • Training data quality is crucial")
print(f"  • Singapore-specific prompts and outputs")

print(f"\n📁 Generated artifacts:")
print(f"  • debug_training_data.py - Data quality analysis")
print(f"  • flan_t5_full_finetune_model/ - FULL fine-tuned model")
print(f"  • Should finally produce different responses!")

print(f"\n🏆 SUCCESS CRITERIA:")
print(f"  ✅ Training data contains Singapore-specific info")
print(f"  ✅ Full fine-tuning trains all parameters")
print(f"  🎯 Responses should be DIFFERENT from base model")
print(f"  🎯 Should mention MAS, SGD, Singapore regulations")


In [None]:
## 💥 FULL Fine-tuning Pipeline Summary

# 7) Complete pipeline summary - from debugging to full training!
# The report is static text, so it is kept as data: each entry is a heading
# followed by the lines printed beneath it. Headings carry their own leading
# newline where a blank line separates the section from the previous one.
summary_sections = [
    ("💥 DEBUGGING & FULL FINE-TUNING PIPELINE COMPLETED!", [
        "=" * 60,
        "✅ Enhanced dataset: 21 Q&A pairs → 63 training samples",
        "✅ Found working base model: Flan-T5 family",
        "❌ LoRA approaches failed: Even aggressive params → identical responses",
        "💥 FULL fine-tuning: Last resort → should finally work!",
    ]),
    ("\n🔍 Complete Journey:", [
        "  1️⃣ DialoGPT: Wrong for Q&A → 0.0001 BLEU",
        "  2️⃣ Flan-T5-small: Broken/gibberish → unusable",
        "  3️⃣ Flan-T5-base: Good base → coherent but generic",
        "  4️⃣ Conservative LoRA: r=8, alpha=16 → identical responses",
        "  5️⃣ AGGRESSIVE LoRA: r=32, alpha=64 → still identical!",
        "  6️⃣ Data debugging: Check if training data is actually different",
        "  7️⃣ FULL fine-tuning: All parameters trainable → final attempt!",
    ]),
    ("\n💥 FULL Fine-tuning Approach:", [
        "  • Model: Flan-T5-small (manageable size for full training)",
        "  • Parameters: ALL trainable (no LoRA restrictions)",
        "  • Data: Limited to 20 samples (prevent overfitting)",
        "  • Prompts: 'Singapore MAS regulation: ...'",
        "  • Outputs: 'According to MAS Singapore: ...'",
        "  • Training: Conservative LR, small batches, careful monitoring",
    ]),
    ("\n📁 Generated artifacts:", [
        "  • processed_data/enhanced_*.json - Training data",
        "  • debug_training_data.py - Data quality analysis",
        "  • flan_t5_full_finetune_model/ - FULL fine-tuned model",
        "  • Complete debugging and training pipeline",
    ]),
    ("\n🎯 Final Success Test:", [
        "  🔍 Data analysis: Does training data contain Singapore specifics?",
        "  💥 Full training: Do responses differ from base model?",
        "  🎯 Domain knowledge: Does model mention MAS, SGD, Singapore?",
        "  🏆 Production ready: Can we finally deploy this model?",
    ]),
    # Optional: Save to Google Drive (the snippet is printed, not executed)
    ("\n📦 Optional: Uncomment below to save to Google Drive", [
        "# from google.colab import drive",
        "# drive.mount('/content/drive')",
        "# !cp -r flan_t5_full_finetune_model /content/drive/MyDrive/",
    ]),
    ("\n💥 FULL FINE-TUNING: If this doesn't work, the problem is fundamental!", []),
]

for section_heading, section_lines in summary_sections:
    print(section_heading)
    for section_line in section_lines:
        print(section_line)
