# Adaptive RAG Router - Training Demo

This notebook demonstrates training the **Adaptive RAG Router** on the CLINC150 dataset.

Works on both GitHub and Kaggle!

In [None]:
import sys
import os

# Install package if running in Kaggle/Colab
IS_KAGGLE = 'KAGGLE_KERNEL_RUN_TYPE' in os.environ
IS_COLAB = 'COLAB_GPU' in os.environ

if IS_KAGGLE or IS_COLAB:
    print(f"🚀 Running on {'Kaggle' if IS_KAGGLE else 'Colab'}")
    
    # Install with no cache to save disk space
    !pip install -q --no-cache-dir transformers datasets peft accelerate torch \
                    scikit-learn matplotlib seaborn tqdm
    
    # Clear pip cache
    !rm -rf ~/.cache/pip
    
    print("✅ Dependencies installed")

# For local development, the package would be installed via setup.py

In [None]:
import torch
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from adaptive_rag_router import (
    CLINC150DataLoader,
    AdaptiveRAGRouter,
    create_router_model,
    ModelTrainer
)

# Banner plus a short summary of the runtime (PyTorch version, CUDA availability).
print("🚀 Adaptive RAG Router - Training Demo")
print(f"PyTorch: {torch.__version__}")
print(f"CUDA: {torch.cuda.is_available()}")

# --------------------------------------------------------------
# Notebook configuration — edit the values below to tune the run
# --------------------------------------------------------------

# Training
NUM_EPOCHS = 5             # number of training epochs
BATCH_SIZE = 16            # batch size for training
MODEL_TYPE = "distilbert"  # one of 'distilbert', 'roberta', 'deberta'

# Data split
USE_CUSTOM_SPLIT = True    # True: split by TRAIN_VAL_RATIO; False: use default splits
TRAIN_VAL_RATIO = 0.7      # share of the data given to train+val when the custom split is on

# Output
MODEL_SAVE_PATH = "./trained_models/adaptive_router"  # where the trained model is saved

# Assemble the configuration summary line by line, then emit it in one go.
train_val_pct = TRAIN_VAL_RATIO * 100
test_pct = (1 - TRAIN_VAL_RATIO) * 100

config_lines = [
    f"\n⚙️ Configuration:",
    f"  - Epochs: {NUM_EPOCHS}",
    f"  - Batch Size: {BATCH_SIZE}",
    f"  - Model Type: {MODEL_TYPE}",
    f"  - Custom Split: {USE_CUSTOM_SPLIT}",
]
if USE_CUSTOM_SPLIT:
    # The ratio line is only meaningful when the custom split is enabled.
    config_lines.append(f"  - Train+Val Ratio: {train_val_pct:.0f}% (Test: {test_pct:.0f}%)")
config_lines.append(f"  - Save Path: {MODEL_SAVE_PATH}")

print("\n".join(config_lines))

In [None]:
print("🎯 Quick Demo - Model Prediction")

# Build the router for the architecture selected in the configuration cell so the
# demo uses the same model type that is trained later in the notebook.
# NOTE(review): this runs before the training cell, so the predictions presumably
# come from a router that has not been fine-tuned yet — confirm.
model = create_router_model(MODEL_TYPE)

# Sample queries for the demo. Demo-specific names are used here so that re-running
# this cell never overwrites the `results` dict produced by the training cell.
demo_queries = [
    "What's my account balance?",
    "I need to transfer money between accounts",
    "What's the weather like today?",
    "Can you help me with my credit card application?",
]

demo_predictions = model.predict(demo_queries)

print("\n📊 Prediction Results:")
# predict() output is read via its "domains" and "confidences" entries, one per query.
for query, domain, confidence in zip(
    demo_queries,
    demo_predictions["domains"],
    demo_predictions["confidences"],
):
    print(f" '{query}' → {domain} ({confidence:.3f})")

In [None]:
from collections import Counter

print("\n📈 Data Exploration")

# Number of training examples requested for the domain-distribution plot.
EXPLORATION_SAMPLE_SIZE = 1000

data_loader = CLINC150DataLoader()

# Build the loaders with the configured BATCH_SIZE so the batch counts printed below
# describe the same batching the training run is configured with.
if USE_CUSTOM_SPLIT:
    print(f"Using custom {TRAIN_VAL_RATIO*100:.0f}/{(1-TRAIN_VAL_RATIO)*100:.0f} train+val/test split")
    train_loader, val_loader, test_loader = data_loader.get_custom_split_loaders(
        batch_size=BATCH_SIZE,
        train_val_ratio=TRAIN_VAL_RATIO
    )
else:
    print("Using default dataset splits")
    train_loader, val_loader, test_loader = data_loader.get_data_loaders(batch_size=BATCH_SIZE)

# Report the number of batches and samples in each split.
print("\nData Split:")
print(f"  - Training batches: {len(train_loader)} (samples: {len(train_loader.dataset)})")
print(f"  - Validation batches: {len(val_loader)} (samples: {len(val_loader.dataset)})")
print(f"  - Test batches: {len(test_loader)} (samples: {len(test_loader.dataset)})")

total_samples = len(train_loader.dataset) + len(val_loader.dataset) + len(test_loader.dataset)
print(f"  - Total samples: {total_samples}")

# Domain distribution over a sample of the training split: each item's intent is
# mapped to a domain by the data loader, then the domains are counted.
dataset = data_loader.load_dataset("train", sample_size=EXPLORATION_SAMPLE_SIZE)
domains = [data_loader.extract_domain_from_intent(item['intent']) for item in dataset]
domain_counts = Counter(domains)

plt.figure(figsize=(10, 6))
# Materialise the dict views as lists before handing them to matplotlib.
plt.bar(list(domain_counts.keys()), list(domain_counts.values()))
plt.title('Domain Distribution in CLINC150 (Sample)')
plt.xlabel('Domain')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
print("\n🎓 Model Training")

trainer = ModelTrainer()

# Cloud runs (Kaggle/Colab) train for at most this many epochs.
CLOUD_MAX_EPOCHS = 3

# Reuse the environment flags computed in the setup cell instead of re-reading os.environ.
on_cloud = IS_KAGGLE or IS_COLAB
effective_epochs = min(NUM_EPOCHS, CLOUD_MAX_EPOCHS) if on_cloud else NUM_EPOCHS

# Single source of truth for the training settings: every value comes from the
# configuration cell, so cloud and local runs honour the same user choices.
training_config = {
    "num_epochs": effective_epochs,
    "per_device_train_batch_size": BATCH_SIZE,
}

train_kwargs = {
    "model_type": MODEL_TYPE,
    "training_config": training_config,
    "use_custom_split": USE_CUSTOM_SPLIT,
    "train_val_ratio": TRAIN_VAL_RATIO,
    "save_path": MODEL_SAVE_PATH,
}

if on_cloud:
    # Describe the split that will actually be used rather than a fixed 70%.
    if USE_CUSTOM_SPLIT:
        split_note = f"{TRAIN_VAL_RATIO*100:.0f}% of full dataset for train+val"
    else:
        split_note = "default dataset splits"
    print(f"🏃 Running training on cloud ({split_note})...")
    if effective_epochs != NUM_EPOCHS:
        print(f"  - Epochs capped at {CLOUD_MAX_EPOCHS} on cloud (configured: {NUM_EPOCHS})")
    # Explicitly request the full dataset on cloud (no sample_size limitation).
    train_kwargs["sample_size"] = None
else:
    print("🔧 Running full training with custom split...")

results = trainer.train_model(**train_kwargs)

# Headline test-set metrics read from the dict returned by train_model().
print("\n✅ Training completed!")
print("📊 Results:")
print(f"  - Test Accuracy: {results['test_accuracy']:.4f}")
print(f"  - Test Precision: {results['test_precision']:.4f}")
print(f"  - Test Recall: {results['test_recall']:.4f}")
print(f"  - Test F1 Score: {results['test_f1']:.4f}")
print(f"  - Model saved to: {results['output_dir']}")

In [None]:
print("\n📊 Detailed Performance Metrics")

# --- Per-domain table ---
print("\nPer-Domain Performance:")
print("=" * 70)
per_class = results['per_class_metrics']

# One row per domain, ordered from highest to lowest F1.
metrics_df = pd.DataFrame(
    [
        {
            'Domain': name,
            'Precision': scores['precision'],
            'Recall': scores['recall'],
            'F1-Score': scores['f1'],
        }
        for name, scores in per_class.items()
    ]
).sort_values('F1-Score', ascending=False)
print(metrics_df.to_string(index=False))

# --- Horizontal bar chart of F1 per domain ---
plt.figure(figsize=(12, 6))
plt.barh(metrics_df['Domain'], metrics_df['F1-Score'])
plt.xlabel('F1 Score')
plt.title('F1 Score by Domain')
plt.tight_layout()
plt.show()

# --- Confusion matrix heatmap ---
print("\n\nConfusion Matrix:")
cm = results['confusion_matrix']
# Axis labels follow the key order of the per-class metrics.
# NOTE(review): assumes that order matches the row/column order of the matrix — confirm.
domain_names = list(per_class)

plt.figure(figsize=(12, 10))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=domain_names,
    yticklabels=domain_names,
)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix - Test Set Performance')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# Point the reader at the artefact paths under the training output directory.
output_dir = results['output_dir']
print(f"\n✅ Detailed metrics saved to: {output_dir}/training_results.json")
print(f"✅ Classification report saved to: {output_dir}/classification_report.txt")

In [None]:
print("\n📊 Model Evaluation")

# Restore the router from the output directory reported by the training run.
trained_model = AdaptiveRAGRouter()
trained_model.load(results['output_dir'])

# A handful of hand-written queries to sanity-check the trained model.
test_queries = [
    "What's my current balance?",
    "I want to pay my credit card bill",
    "What's the weather forecast?",
    "Book a flight to London",
    "Reset my password please",
]

predictions = trained_model.predict(test_queries)

print("Model Predictions on Test Queries:")
predicted_domains = predictions['domains']
predicted_confidences = predictions['confidences']
# One aligned row per query: query text, predicted domain, confidence.
for idx, query in enumerate(test_queries):
    domain = predicted_domains[idx]
    confidence = predicted_confidences[idx]
    print(f" {query:<40} → {domain:<20} (conf: {confidence:.3f})")

print("\n🎉 Demo completed successfully!")