# Full Evaluation Suite

**Target:** persona consistency of at least 85% for the RLHF model

In [None]:
!pip install -q transformers datasets peft wandb evaluate matplotlib seaborn pandas

In [None]:
import sys; sys.path.append('../')
import torch, wandb, json, pandas as pd
import matplotlib.pyplot as plt
from src.data.loader import DatasetLoader
from src.data.processor import DataProcessor
from src.model.base import load_base_model, load_tokenizer
from src.eval.persona import PersonaEvaluator
from src.eval.engagement import EngagementEvaluator
from src.eval.quality import QualityEvaluator
from src.eval.benchmark import BenchmarkEvaluator

In [None]:
# Open the W&B run that the later cells log their metrics to.
wandb.init(
    project='persona-chatbot-rlhf',
    name='evaluation',
    tags=['eval'],
)

# Report the CUDA device name when one is available; otherwise print "CPU".
if torch.cuda.is_available():
    device_label = torch.cuda.get_device_name(0)
else:
    device_label = "CPU"
print(f'GPU: {device_label}')

## 1. Load Models

In [None]:
# Model variants under comparison, as (label, model name or checkpoint path).
# The label becomes the key in `models`; order is baseline -> sft -> rlhf.
model_sources = (
    ('baseline', 'gpt2-medium'),
    ('sft', '../models/sft/final'),
    ('rlhf', '../models/rlhf/checkpoint-final'),
)

# Every variant is loaded with the same device_map setting.
models = {
    label: load_base_model({'name': source, 'device_map': 'auto'})
    for label, source in model_sources
}

# Tokenizer is loaded by the base model name, not from the fine-tuned checkpoints.
tokenizer_config = {'name': 'gpt2-medium'}
tokenizer = load_tokenizer(tokenizer_config)
print('✅ Models loaded')

## 2. Load Test Data

In [None]:
# Load the PersonaChat 'validation' split (requested with use_synthetic=True)
# and run it through the project's preprocessing step.
loader = DatasetLoader()
test_data = loader.load_personachat(split='validation', use_synthetic=True)

processor_config = {'base_model': 'gpt2-medium', 'max_length': 512}
processor = DataProcessor(config=processor_config)
processed_test = processor.preprocess(test_data)

num_examples = len(processed_test)
print(f'Test data: {num_examples} examples')

## 3. Persona Consistency (TARGET: 85%+)

In [None]:
# Score persona consistency for every loaded model, then compare the RLHF
# model's score against the 0.85 target.
persona_eval = PersonaEvaluator(tokenizer_name='gpt2-medium')
results = {}

for name, model in models.items():
    print(f'Evaluating {name}...')
    consistency = persona_eval.evaluate_consistency(
        model,
        processed_test,
        max_samples=200,
        generate_responses=True,
    )
    # Start this model's metrics record, keyed by the model label.
    results[name] = {'persona_consistency': consistency}
    print(f'  {name}: {consistency:.3f}')
    wandb.log({f'{name}_persona_consistency': consistency})

# Check RLHF target
rlhf_consistency = results['rlhf']['persona_consistency']
target_met = rlhf_consistency >= 0.85
if target_met:
    target_status = "✅ ACHIEVED"
else:
    target_status = "❌ NOT MET"
print(f'\nRLHF Persona Consistency: {rlhf_consistency:.3f}')
print(f'Target (85%): {target_status}')
wandb.log({'rlhf_target_85%_met': target_met})

## 4. Engagement & Quality

In [None]:
# Add engagement and text-quality metrics to each model's entry in `results`.
# `results[name]` must already exist: it is created by the persona-consistency
# cell, so that cell has to be run first.
engagement_eval = EngagementEvaluator()
quality_eval = QualityEvaluator()

for name, model in models.items():
    print(f'\nEvaluating {name}...')

    # Engagement
    engagement = engagement_eval.evaluate_engagement(model, processed_test, max_samples=200, generate_responses=True)
    results[name]['engagement'] = engagement

    # Quality
    perplexity = quality_eval.compute_perplexity(model, processed_test, text_field='text', batch_size=8)
    bleu = quality_eval.compute_bleu(model, processed_test, max_samples=100)
    rouge = quality_eval.compute_rouge(model, processed_test, max_samples=100)

    # Only the 'bleu' and 'rouge1' entries of the returned mappings are kept.
    bleu_score = bleu['bleu']
    rouge1_score = rouge['rouge1']

    results[name].update({'perplexity': perplexity, 'bleu': bleu_score, 'rouge1': rouge1_score})

    print(f'  Engagement: {engagement:.3f}, Perplexity: {perplexity:.2f}, BLEU: {bleu["bleu"]:.3f}')

    # Log every metric stored in `results` for this cell. ROUGE-1 was previously
    # computed and saved to the table but missing from the W&B run.
    # NOTE(review): assumes rouge['rouge1'] is a plain number like the other
    # metrics — confirm against QualityEvaluator.compute_rouge.
    wandb.log({
        f'{name}_engagement': engagement,
        f'{name}_perplexity': perplexity,
        f'{name}_bleu': bleu_score,
        f'{name}_rouge1': rouge1_score,
    })

## 5. Comparison & Visualization

In [None]:
# Build the per-model comparison table, save it as CSV, and plot three of the
# metrics side by side (persona consistency, engagement, BLEU).
import os

# Both artifacts below are written under ../outputs. Create it up front so a
# fresh checkout does not fail in to_csv/savefig after the long evaluation cells.
os.makedirs('../outputs', exist_ok=True)

# Create comparison table
# `results` maps model name -> {metric: value}; transposing puts models on rows.
df = pd.DataFrame(results).T
print('\nComparison:')
print(df.to_string())
df.to_csv('../outputs/evaluation_results.csv')

# Plot
fig, axes = plt.subplots(1, 3, figsize=(14, 4))
df['persona_consistency'].plot(kind='bar', ax=axes[0], title='Persona Consistency')
# Dashed red line marks the 0.85 persona-consistency target.
axes[0].axhline(y=0.85, color='r', linestyle='--', label='Target')
axes[0].legend()
df['engagement'].plot(kind='bar', ax=axes[1], title='Engagement', color='orange')
df['bleu'].plot(kind='bar', ax=axes[2], title='BLEU', color='green')
plt.tight_layout()
plt.savefig('../outputs/evaluation_comparison.png', dpi=300)
# Log the figure to W&B before plt.show(), while it is still the current figure.
wandb.log({'comparison': wandb.Image(plt)})
plt.show()
print('✅ Saved to outputs/')

In [None]:
# End the W&B run opened earlier in this notebook, then point to the next notebook.
wandb.finish()
print('✅ Complete! Next: 6_analysis_demo.ipynb')