# Baseline Model Testing

## Testing Pre-trained Models on PersonaChat

This notebook covers:
- Loading baseline models (GPT-2, DialoGPT)
- Testing on PersonaChat without fine-tuning
- Measuring baseline persona consistency (a simple keyword-overlap heuristic)
- Comparing model performance
- Establishing performance benchmarks

In [None]:
# Install required packages
!pip install -q transformers datasets peft trl accelerate
!pip install -q rouge-score sacrebleu evaluate
!pip install -q matplotlib seaborn pandas numpy

In [None]:
import json
import os
import re
import sys
from typing import List, Dict

sys.path.append('../')

import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from tqdm import tqdm

# Notebook-wide plotting defaults: seaborn's white grid theme and a wide default figure
sns.set_style('whitegrid')
plt.rc('figure', figsize=(12, 6))

## 1. Environment Verification

In [None]:
# Report the PyTorch build and whatever CUDA hardware is visible to it
cuda_ready = torch.cuda.is_available()
gpu_total = torch.cuda.device_count()

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ready}")
print(f"GPU count: {gpu_total}")

# One name line and one memory line per visible GPU; nothing is printed without CUDA
if cuda_ready:
    for gpu_idx in range(gpu_total):
        print(f"\nGPU {gpu_idx}: {torch.cuda.get_device_name(gpu_idx)}")
        total_gb = torch.cuda.get_device_properties(gpu_idx).total_memory / 1e9
        print(f"  Memory: {total_gb:.1f} GB")

# Prefer CUDA when it is available, otherwise fall back to the CPU
device = torch.device('cuda' if cuda_ready else 'cpu')
print(f"\nUsing device: {device}")

## 2. Load Test Dataset

In [None]:
# Fetch the truecased PersonaChat dataset
print("Loading PersonaChat dataset...")
dataset = load_dataset("bavard/personachat_truecased")

# Keep evaluation cheap: only the first 100 rows of the *validation* split.
# The slice is indexed by column name below ('personality', 'history'),
# each column holding one entry per example.
test_data = dataset['validation'][:100]

example_personas = test_data['personality']
example_histories = test_data['history']

print(f"Test set size: {len(example_personas)} examples")
print(f"Example persona: {example_personas[0]}")
print(f"Example conversation: {example_histories[0][:2]}")

## 3. Define Baseline Models

In [None]:
# Baseline checkpoints chosen for Kaggle 2x T4 GPUs (16GB each).
# Keys are the short labels used in tables and plots; values are the model ids
# handed to from_pretrained.
baseline_models = dict([
    ('gpt2', 'gpt2'),
    ('gpt2-medium', 'gpt2-medium'),
    ('dialogpt-small', 'microsoft/DialoGPT-small'),
    ('dialogpt-medium', 'microsoft/DialoGPT-medium'),
])

print("Models to test:")
for name in baseline_models:
    model_id = baseline_models[name]
    print(f"  - {name}: {model_id}")

## 4. Evaluation Functions

In [None]:
def format_prompt_with_persona(persona_traits: List[str], history: List[str]) -> str:
    """Build the generation prompt from a persona and the recent dialogue.

    Args:
        persona_traits: Persona sentences; each is rendered as a ``- `` bullet line.
        history: Conversation turns in order; only the final four are included.

    Returns:
        A prompt with a ``Persona:`` section, a blank line, a ``Conversation:``
        section, and a trailing ``Response:`` cue for the model to continue.
    """
    bullet_lines = ["- {}".format(trait) for trait in persona_traits]
    recent_turns = history[-4:]  # last 4 turns only

    sections = [
        "Persona:",
        "\n".join(bullet_lines),
        "",  # blank line separating persona from conversation
        "Conversation:",
        "\n".join(recent_turns),
        "Response:",
    ]
    return "\n".join(sections)

def generate_response(model, tokenizer, prompt: str, max_new_tokens: int = 50) -> str:
    """Sample one continuation of ``prompt`` from ``model`` and return it as text.

    Args:
        model: Causal language model exposing ``.device`` and ``.generate``.
        tokenizer: Tokenizer matching ``model``; its EOS id is used for padding.
        prompt: Text to condition on; tokenized input is truncated to 512 tokens.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded continuation only (prompt removed), with special tokens
        dropped and surrounding whitespace stripped.
    """
    # Call the tokenizer itself rather than ``tokenizer.encode`` so that the
    # attention mask is produced alongside the input ids. Because padding is
    # mapped to EOS below, ``generate`` cannot reliably infer the mask itself.
    encoded = tokenizer(
        prompt,
        return_tensors='pt',
        max_length=512,
        truncation=True,
    ).to(model.device)
    prompt_length = len(encoded['input_ids'][0])

    # Inference only: no gradients needed
    with torch.no_grad():
        outputs = model.generate(
            **encoded,  # input_ids + attention_mask
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.9,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
        )

    # The output sequence starts with the prompt tokens; keep only what follows
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return response.strip()

# Common words that carry no persona information and are ignored when matching
_PERSONA_STOPWORDS = frozenset({'i', 'am', 'have', 'like', 'love', 'my', 'a', 'an', 'the'})
# A word is a run of word characters, optionally with internal apostrophes ("don't")
_WORD_PATTERN = re.compile(r"\w+(?:'\w+)*")


def compute_persona_consistency(response: str, persona_traits: List[str]) -> float:
    """Simple persona consistency metric based on keyword matching.

    A trait counts as reflected when at least one of its keywords (longer than
    three characters and not a stopword) appears in the response as a whole
    word. Matching is case-insensitive and ignores punctuation, so the trait
    "i love dogs." matches the response "My dogs!", while "read" does not
    match inside "already".

    Args:
        response: Generated reply to score.
        persona_traits: Persona sentences to look for evidence of.

    Returns:
        Fraction of traits with at least one keyword hit, in [0, 1];
        0.0 when ``persona_traits`` is empty.
    """
    if not persona_traits:
        return 0.0

    response_words = set(_WORD_PATTERN.findall(response.lower()))

    matches = 0
    for trait in persona_traits:
        keywords = {
            word
            for word in _WORD_PATTERN.findall(trait.lower())
            if len(word) > 3 and word not in _PERSONA_STOPWORDS
        }
        # Each trait contributes at most one match, however many keywords hit
        if not keywords.isdisjoint(response_words):
            matches += 1

    return matches / len(persona_traits)

def compute_response_quality(response: str) -> Dict[str, float]:
    """Summarise a response using whitespace-delimited token counts.

    Args:
        response: Text to measure.

    Returns:
        A dict with ``length`` (token count), ``unique_words`` (distinct token
        count, case-sensitive) and ``diversity`` (distinct / total, or 0 for an
        empty response).
    """
    tokens = response.split()
    token_count = len(tokens)
    distinct_count = len(set(tokens))

    # Guard the ratio so an empty response scores 0 instead of dividing by zero
    if token_count:
        diversity = distinct_count / token_count
    else:
        diversity = 0

    return dict(length=token_count, unique_words=distinct_count, diversity=diversity)

## 5. Test Baseline Models

In [None]:
# Seed re-applied before each model's generation pass, so every model samples
# from the same RNG starting state and a rerun repeats the same sampling setup
GENERATION_SEED = 42

# Number of examples scored per model (capped by the size of the loaded subset)
num_eval_examples = min(50, len(test_data['personality']))

# fp16 halves memory on GPU; on CPU half-precision ops can be unsupported or
# very slow depending on the PyTorch build, so use fp32 there
model_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Store results: model label -> aggregate metrics plus a few sample generations
results = {}

for model_name, model_id in baseline_models.items():
    print(f"\n{'='*50}")
    print(f"Testing: {model_name}")
    print(f"{'='*50}")

    # Load model and tokenizer
    print("Loading model...")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    if tokenizer.pad_token is None:
        # Fall back to EOS when the tokenizer defines no pad token
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=model_dtype,
        device_map='auto'
    )
    model.eval()

    # Get model info
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Total parameters: {total_params / 1e6:.1f}M")

    # Per-example metrics for this model
    consistency_scores = []
    quality_metrics = []
    sample_outputs = []

    # generate_response samples (do_sample=True), so fix the RNG state first
    torch.manual_seed(GENERATION_SEED)

    print("\nGenerating responses...")
    for i in tqdm(range(num_eval_examples)):
        persona = test_data['personality'][i]
        history = test_data['history'][i]

        # Generate response
        prompt = format_prompt_with_persona(persona, history)
        response = generate_response(model, tokenizer, prompt)

        # Compute metrics
        consistency = compute_persona_consistency(response, persona)
        quality = compute_response_quality(response)

        consistency_scores.append(consistency)
        quality_metrics.append(quality)

        # Save first 3 examples for qualitative inspection
        if i < 3:
            sample_outputs.append({
                'persona': persona,
                'history': history[-2:],
                'response': response,
                'consistency': consistency
            })

    # Store aggregates as plain Python floats so they serialise cleanly later
    results[model_name] = {
        'total_params_M': total_params / 1e6,
        'avg_consistency': float(np.mean(consistency_scores)),
        'std_consistency': float(np.std(consistency_scores)),
        'avg_length': float(np.mean([m['length'] for m in quality_metrics])),
        'avg_diversity': float(np.mean([m['diversity'] for m in quality_metrics])),
        'sample_outputs': sample_outputs
    }

    print(f"\nResults:")
    print(f"  Avg Persona Consistency: {results[model_name]['avg_consistency']:.3f} ± {results[model_name]['std_consistency']:.3f}")
    print(f"  Avg Response Length: {results[model_name]['avg_length']:.1f} words")
    print(f"  Avg Diversity: {results[model_name]['avg_diversity']:.3f}")

    # Clean up GPU memory before the next model is loaded
    del model
    torch.cuda.empty_cache()

print("\n" + "="*50)
print("Baseline testing complete!")
print("="*50)

## 6. Compare Results

In [None]:
# Make sure the artifact directory exists before anything is written into it
os.makedirs('../outputs', exist_ok=True)


def _summary_row(model_label, metrics):
    """Map one model's aggregated metrics onto the comparison-table columns."""
    return {
        'Model': model_label,
        'Parameters (M)': metrics['total_params_M'],
        'Persona Consistency': metrics['avg_consistency'],
        'Response Length': metrics['avg_length'],
        'Diversity': metrics['avg_diversity']
    }


# One row per evaluated model, ranked from most to least persona-consistent
summary_rows = [_summary_row(label, metrics) for label, metrics in results.items()]
comparison_df = pd.DataFrame(summary_rows).sort_values('Persona Consistency', ascending=False)

print("\nBaseline Model Comparison:")
print(comparison_df.to_string(index=False))

# Persist the table for later comparison
comparison_df.to_csv('../outputs/baseline_comparison.csv', index=False)
print("\nResults saved to: outputs/baseline_comparison.csv")

## 7. Visualize Results

In [None]:
# Three side-by-side horizontal bar charts, one metric per panel
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# Per panel: (data column, x-axis label, title, extra barh kwargs, clamp x to [0, 1]?)
panel_specs = [
    ('Persona Consistency', 'Persona Consistency Score', 'Persona Consistency by Model', {}, True),
    ('Response Length', 'Average Response Length (words)', 'Response Length by Model', {'color': 'orange'}, False),
    ('Diversity', 'Lexical Diversity', 'Response Diversity by Model', {'color': 'green'}, True),
]

for ax, (column, x_label, title, bar_kwargs, unit_range) in zip(axes, panel_specs):
    ax.barh(comparison_df['Model'], comparison_df[column], **bar_kwargs)
    ax.set_xlabel(x_label)
    ax.set_title(title)
    if unit_range:
        # Consistency and diversity are ratios, so pin their axis to 0..1
        ax.set_xlim(0, 1)

plt.tight_layout()
plt.savefig('../outputs/baseline_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("Visualization saved to: outputs/baseline_comparison.png")

## 8. Display Sample Outputs

In [None]:
# comparison_df is sorted by consistency (descending), so its first row is the top model
best_model = comparison_df['Model'].iloc[0]
print(f"Sample outputs from: {best_model}")
print("="*70)

# Walk through up to three stored generations for that model
top_samples = results[best_model]['sample_outputs'][:3]
for example_number, sample in enumerate(top_samples, start=1):
    print(f"\nExample {example_number}:")
    persona_line = ', '.join(sample['persona'])
    print(f"Persona: {persona_line}")
    print(f"\nLast exchange:")
    recent_turns = sample['history'][-2:]
    for turn in recent_turns:
        print(f"  {turn}")
    print(f"\nGenerated response: {sample['response']}")
    print(f"Consistency score: {sample['consistency']:.3f}")
    print("-"*70)

## 9. Establish Target Metrics

In [None]:
# Best baseline score and the label of the model that achieved it.
# best_model is derived here (first row of the consistency-sorted table) so this
# cell does not depend on the sample-display cell having been run.
best_baseline_consistency = comparison_df['Persona Consistency'].max()
best_model = comparison_df.iloc[0]['Model']
target_consistency = 0.85  # Project goal: 85%+ consistency

absolute_gap = target_consistency - best_baseline_consistency

print("Baseline Performance:")
print(f"  Best baseline consistency: {best_baseline_consistency:.3f}")
print("\nProject Goals:")
print(f"  Target consistency: {target_consistency:.3f}")
print(f"  Required improvement: {absolute_gap:.3f}")
if best_baseline_consistency > 0:
    relative_gain_pct = (target_consistency / best_baseline_consistency - 1) * 100
    print(f"  Relative improvement needed: {relative_gain_pct:.1f}%")
else:
    # A relative improvement over a zero baseline is undefined
    print("  Relative improvement needed: n/a (best baseline consistency is 0)")

# Additional project objectives
print("\nAdditional Objectives:")
print("  - 75-80% reduction in training costs vs full fine-tuning")
print("  - 60-70% reduction in training time using LoRA")
print("  - Achieve 85%+ persona consistency across multi-turn conversations")
print("  - Benchmark against SOTA scores without API calls")

# Save baseline results for later comparison.
# sample_outputs is left out; the remaining per-model entries are numeric and
# are coerced to plain floats so json can always serialise them.
baseline_summary = {
    'best_model': str(best_model),
    'best_consistency': float(best_baseline_consistency),
    'target_consistency': target_consistency,
    'all_results': {
        model_label: {
            key: float(val) for key, val in metrics.items() if key != 'sample_outputs'
        }
        for model_label, metrics in results.items()
    }
}

with open('../outputs/baseline_summary.json', 'w') as f:
    json.dump(baseline_summary, f, indent=2)

print("\nBaseline summary saved to: outputs/baseline_summary.json")

## Summary

This notebook has:
- ✅ Loaded and tested multiple baseline models
- ✅ Measured persona consistency and response quality
- ✅ Compared model performance
- ✅ Established target metrics for improvement
- ✅ Saved baseline results for future comparison

**Key Findings:**
- Baseline models show limited persona consistency
- Significant improvement needed to reach 85% target
- Ready to proceed with supervised fine-tuning

Next: Proceed to `3_sft_training.ipynb` for supervised fine-tuning with LoRA.