# 🏦 Minimal Colab: Fine-tune Small LLM for SG Financial Regulations

A streamlined notebook that runs only the improved pipeline: environment setup, dataset preparation, Flan-T5 training, an inference demo, and evaluation.


In [None]:
# 1) Setup
!pip install -q torch transformers datasets peft accelerate bitsandbytes
!pip install -q nltk rouge-score pandas numpy

import nltk
nltk.download('punkt')
print('✅ Setup complete')


In [None]:
# 2) Clone repo and check GPU
!git clone https://github.com/yihhan/finetune.git
%cd finetune

import torch
print('Device:', 'CUDA' if torch.cuda.is_available() else 'CPU')


In [None]:
## 📊 Dataset Preparation

# 3) Enhanced dataset prep + inspection
import os, json, pandas as pd
qa = 'processed_data/enhanced_financial_regulation_qa.json'
tr = 'processed_data/enhanced_training_data.json'

if not (os.path.exists(qa) and os.path.exists(tr)):
    print("🚀 Generating enhanced dataset...")
    !python improved_dataset_prep.py
else:
    print('✅ Enhanced dataset exists, skipping generation')

# Show dataset details
with open(qa, 'r', encoding='utf-8') as f:
    data = json.load(f)
with open(tr, 'r', encoding='utf-8') as f:
    training_data = json.load(f)

print(f"\n📊 Dataset Summary:")
print(f"  Q&A pairs: {len(data)}")
print(f"  Training samples: {len(training_data)} (with augmentation)")
print(f"  Categories: {set(item['category'] for item in data)}")

print(f"\n📝 Sample Q&A:")
sample = data[0]
print(f"Q: {sample['question']}")
print(f"A: {sample['answer'][:200]}...")
print(f"Category: {sample['category']}")

# Category distribution
df = pd.DataFrame(data)
print(f"\n📈 Category distribution:")
print(df['category'].value_counts())


In [None]:
## 🤖 Flan-T5 Training Phase (PROPER Q&A MODEL!)

# 4) Try Flan-T5 - a model ACTUALLY designed for Q&A tasks!
print("🤖 Starting Flan-T5 training - proper instruction-following model!")
print("- Base model: google/flan-t5-small (designed for Q&A)")
print("- Task type: Seq2Seq (not causal LM)")
print("- LoRA config: r=16, alpha=32")
print("- Learning rate: 1e-4")
print("- Training epochs: 3")
print("- Model output: flan_t5_financial_model/")
print("- This should ACTUALLY work for Q&A!")

!python flan_t5_train.py

print('✅ Flan-T5 training completed! Should work much better!')


In [None]:
## 💬 Flan-T5 Inference Demo

# 5) Test Flan-T5 model - should actually work for Q&A!
questions = [
    "What are the capital adequacy requirements for banks in Singapore?",
    "How should financial institutions implement anti-money laundering measures?",
    "What is MAS's position on AI in financial advisory services?",
    "What cybersecurity requirements must financial institutions meet?"
]

print("🎯 Testing Flan-T5 model (proper Q&A architecture!):")
print("="*60)

# Use the Flan-T5 inference script
!python flan_t5_inference.py --demo

print("✅ Flan-T5 inference demo completed!")


In [None]:
## 📈 Fixed Evaluation & Comparison

# 6) FIXED evaluation comparing base vs fixed model
print("📊 Running FIXED evaluation...")
print("Comparing: Base DialoGPT-small vs Fixed fine-tuned model")

!python fixed_eval.py

# Load and display FIXED results
import json, pandas as pd
summary_path = "fixed_evaluation_results/summary_metrics.json"
if os.path.exists(summary_path):
    with open(summary_path, "r", encoding="utf-8") as f:
        results = json.load(f)
    
    print("\n📈 FIXED EVALUATION RESULTS:")
    print("="*60)
    
    rows = []
    for k, name in [("base_model","Base Model"),("fixed_model","Fixed Model")]:
        if k in results:
            rows.append({
                "Model": name,
                "BLEU": f"{results[k]['avg_bleu']:.4f}",
                "ROUGE-1": f"{results[k]['avg_rouge1']:.4f}", 
                "ROUGE-2": f"{results[k]['avg_rouge2']:.4f}",
                "ROUGE-L": f"{results[k]['avg_rougeL']:.4f}",
                "Time (s)": f"{results[k]['avg_time']:.2f}"
            })
    
    df = pd.DataFrame(rows)
    print(df.to_string(index=False))
    
    print(f"\n💡 Key Insights:")
    if len(rows) >= 2:
        fixed_bleu = float(rows[1]["BLEU"])
        base_bleu = float(rows[0]["BLEU"]) 
        improvement = fixed_bleu / base_bleu if base_bleu > 0 else 0
        print(f"  • Fixed model: {improvement:.1f}x better BLEU than base")
        print(f"  • Conservative training prevents overfitting")
        print(f"  • Should show IMPROVEMENT, not degradation!")
    
else:
    print("⚠️ Fixed evaluation results not found")


In [None]:
## 🎉 Fixed Pipeline Summary

# 7) FIXED pipeline summary and results
# This cell only prints a static recap; nothing here is computed from the run.
# NOTE(review): the text describes DialoGPT and fixed_finetuned_financial_model/,
# while the training cell runs flan_t5_train.py and announces
# flan_t5_financial_model/ as its output - confirm which one is current.
summary_lines = [
    "🎉 FIXED PIPELINE COMPLETED!",
    "=" * 50,
    "✅ Enhanced dataset: 21 Q&A pairs → 63 training samples",
    "✅ FIXED LoRA training: Conservative parameters to prevent overfitting",
    "✅ Model evaluation: Fixed model should outperform base model",
    "✅ Inference demo: Better responses with fixed approach",
    # Artifacts
    "\n📁 Generated artifacts:",
    "  • processed_data/enhanced_*.json - Training data",
    "  • fixed_finetuned_financial_model/ - FIXED fine-tuned model",
    "  • fixed_evaluation_results/ - Performance metrics",
    # Fixes
    "\n🔧 Key fixes applied:",
    "  1. Smaller base model (DialoGPT-small vs medium)",
    "  2. Conservative LoRA (r=8, alpha=16 vs r=32, alpha=64)",
    "  3. Lower learning rate (5e-6 vs 2e-5)",
    "  4. Fewer epochs (2 vs 5)",
    "  5. Better prompt formatting",
    # Expectations
    "\n💡 Expected improvements:",
    "  • Fixed model > Base model (not worse!)",
    "  • Prevents overfitting and knowledge destruction",
    "  • More stable and reliable responses",
    "  • Better generalization to new questions",
    # Optional: Save to Google Drive (printed as instructions, not executed)
    "\n📦 Optional: Uncomment below to save to Google Drive",
    "# from google.colab import drive",
    "# drive.mount('/content/drive')",
    "# !cp -r fixed_finetuned_financial_model /content/drive/MyDrive/",
    "# !cp -r fixed_evaluation_results /content/drive/MyDrive/",
]

for line in summary_lines:
    print(line)
