# 5. Model Evaluation and Comparison

## Overview

In this notebook, we'll compare all our models and analyze their performance.

### Models to Compare:
1. **Baseline**: HOG features + SVM
2. **Custom CNN**: Trained from scratch
3. **Transfer Learning**: ResNet18 fine-tuned

### Metrics We'll Analyze:
- Overall accuracy
- Per-class precision, recall, F1-score
- Confusion matrices
- Common misclassifications

In [None]:
import numpy as np
import pandas as pd
import pickle
import json
from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import precision_recall_fscore_support

# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# deprecated the old names. Use whichever whitegrid variant this installation
# provides; if neither exists, keep the default style instead of raising
# OSError, since the style is purely cosmetic.
for _style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
print("Libraries imported!")

## Step 1: Load All Results

In [None]:
EMOTION_CLASSES = ["anger", "disgust", "fear", "happiness", "neutral", "sadness", "surprise"]

# One row per model: (key in `results`, pickle file, message on success,
# message when the file is missing). The order is significant: `results`
# keeps insertion order, so the baseline is the first entry whenever its
# file is present.
RESULT_SOURCES = [
    ('HOG + SVM', 'baseline_results.pkl',
     "Loaded baseline results",
     "Baseline results not found - run notebook 02 first"),
    ('Custom CNN', 'custom_cnn_results.pkl',
     "Loaded Custom CNN results",
     "Custom CNN results not found - run notebook 03 first"),
    ('ResNet18', 'transfer_learning_results.pkl',
     "Loaded Transfer Learning results",
     "Transfer Learning results not found - run notebook 04 first"),
]

# Load results from each model; a missing file is reported and skipped so the
# comparison can still run on whatever subset of models is available.
results = {}

for model_key, pickle_path, loaded_msg, missing_msg in RESULT_SOURCES:
    try:
        with open(pickle_path, 'rb') as f:
            # SECURITY NOTE: pickle.load can execute arbitrary code. Only load
            # result files produced by the earlier notebooks of this project.
            results[model_key] = pickle.load(f)
    except FileNotFoundError:
        print(missing_msg)
    else:
        print(loaded_msg)

print(f"\nLoaded {len(results)} model results")

## Step 2: Compare Overall Accuracy

In [None]:
# Extract accuracies. `results` keeps insertion order, so `model_names` and
# `accuracies` stay index-aligned; later cells rely on that alignment.
model_names = list(results.keys())
if not model_names:
    # The loading cell only prints a notice when a file is missing, so an
    # empty `results` is reachable. Fail here with an actionable message
    # instead of letting np.argmax complain about an empty sequence.
    raise ValueError("No model results loaded - run notebooks 02-04 first")
accuracies = [results[name]['test_accuracy'] for name in model_names]

# Print comparison
print("=" * 60)
print("MODEL ACCURACY COMPARISON")
print("=" * 60)
print(f"\n{'Model':<25} {'Test Accuracy':>15}")
print("-" * 40)
for name, acc in zip(model_names, accuracies):
    # Accuracies are stored as fractions; scale to percent for display.
    print(f"{name:<25} {acc*100:>14.2f}%")

# Best model: index of the highest accuracy (first one wins on ties).
# Converted to a plain int so it behaves like an ordinary list index.
best_idx = int(np.argmax(accuracies))
print(f"\nBest Model: {model_names[best_idx]} ({accuracies[best_idx]*100:.2f}%)")

In [None]:
# Accuracy bar chart: one bar per model, with the best model outlined in gold.
# `colors` is trimmed to the number of loaded models and reused by later plots.
colors = ['steelblue', 'darkorange', 'forestgreen'][:len(model_names)]
accuracy_pct = [acc * 100 for acc in accuracies]

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(model_names, accuracy_pct, color=colors, edgecolor='white', linewidth=2)

# Write each accuracy just above the top-centre of its bar
for bar, acc in zip(bars, accuracies):
    top_centre = (bar.get_x() + bar.get_width() / 2, bar.get_height())
    ax.annotate(
        f'{acc*100:.2f}%',
        xy=top_centre,
        xytext=(0, 5),
        textcoords="offset points",
        ha='center',
        va='bottom',
        fontsize=14,
        fontweight='bold',
    )

ax.set_title('Model Performance Comparison', fontsize=14, fontweight='bold')
ax.set_ylabel('Test Accuracy (%)', fontsize=12)
ax.set_ylim(0, 100)
ax.grid(True, alpha=0.3, axis='y')

# Thick gold outline marks the winner found in the accuracy comparison
best_bar = bars[best_idx]
best_bar.set_edgecolor('gold')
best_bar.set_linewidth(4)

plt.tight_layout()
plt.show()

## Step 3: Improvement Analysis

Let's see how much each model improves over the baseline.

In [None]:
if len(results) >= 2:
    # Use the real HOG + SVM baseline when it was loaded; otherwise fall back
    # to the first loaded model so the table can still be produced.
    baseline_acc = results['HOG + SVM']['test_accuracy'] if 'HOG + SVM' in results else accuracies[0]
    
    print("=" * 60)
    print("IMPROVEMENT OVER BASELINE")
    print("=" * 60)
    print(f"\n{'Model':<25} {'Accuracy':>12} {'Improvement':>15}")
    print("-" * 55)
    
    for name, acc in zip(model_names, accuracies):
        # Difference in percentage points relative to the baseline accuracy
        improvement = (acc - baseline_acc) * 100
        # The '+' flag in the format spec keeps the sign attached to the
        # digits and inside the padded column, so each row lines up with the
        # 15-character 'Improvement' header (14 digits/sign + '%').
        print(f"{name:<25} {acc*100:>11.2f}% {improvement:>+14.2f}%")
else:
    print("Need at least 2 models to compare improvements")

## Step 4: Per-Class Performance Comparison

In [None]:
# Collect one sequence of per-class F1 scores for every loaded model
per_class_f1 = {}

for name in model_names:
    model_result = results[name]

    if 'per_class_f1' in model_result:
        # Scores were stored by the training notebook; use them as-is
        per_class_f1[name] = model_result['per_class_f1']
        continue

    # No stored scores: derive them from the confusion matrix.
    # Assumes rows are true labels and columns are predictions, which is how
    # the confusion-matrix heatmaps in this notebook label their axes.
    # The 1e-10 terms only prevent division by zero for empty rows/columns.
    cm = np.array(model_result['confusion_matrix'])
    true_positives = np.diag(cm)
    precision = true_positives / (cm.sum(axis=0) + 1e-10)
    recall = true_positives / (cm.sum(axis=1) + 1e-10)
    f1 = 2 * precision * recall / (precision + recall + 1e-10)
    per_class_f1[name] = f1.tolist()

if per_class_f1:
    # Rows = emotion classes, columns = models
    f1_df = pd.DataFrame(per_class_f1, index=EMOTION_CLASSES)
    print("Per-Class F1 Scores:", f1_df.round(3), sep="\n")
    print("\n" + "=" * 60)
    print("\n" + "=" * 60)

In [None]:
if per_class_f1:
    # Grouped bar chart: one cluster per emotion, one bar per model
    fig, ax = plt.subplots(figsize=(14, 6))

    x = np.arange(len(EMOTION_CLASSES))
    width = 0.25
    n_series = len(per_class_f1)

    for i, name in enumerate(per_class_f1):
        # Shift each model's bars so the whole cluster is centred on its tick
        offset = (i - n_series/2 + 0.5) * width
        bars = ax.bar(x + offset, per_class_f1[name], width, label=name, color=colors[i])

    ax.set_title('Per-Class F1 Score Comparison', fontsize=14, fontweight='bold')
    ax.set_xlabel('Emotion Class', fontsize=12)
    ax.set_ylabel('F1 Score', fontsize=12)

    ax.set_xticks(x)
    ax.set_xticklabels(EMOTION_CLASSES, rotation=45, ha='right')
    ax.set_ylim(0, 1.0)

    ax.grid(True, alpha=0.3, axis='y')
    ax.legend(loc='upper right')

    plt.tight_layout()
    plt.show()

## Step 5: Confusion Matrix Comparison

In [None]:
# Plot confusion matrices side by side, one panel per loaded model
n_models = len(results)
# squeeze=False always returns a 2-D array of Axes, so the single-model case
# needs no special handling; ravel() flattens it to a 1-D sequence.
fig, axes = plt.subplots(1, n_models, figsize=(6*n_models, 5), squeeze=False)
axes = axes.ravel()

for ax, (name, res) in zip(axes, results.items()):
    cm = np.array(res['confusion_matrix'])
    # Row-normalize so each row shows how one true class was distributed over
    # the predicted classes. A class with no true samples has a zero row sum;
    # leave that row at 0 instead of dividing by zero (NaN cells + warning).
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_normalized = np.divide(
        cm,
        row_totals,
        out=np.zeros(cm.shape, dtype=float),
        where=row_totals != 0,
    )
    
    sns.heatmap(cm_normalized, annot=True, fmt='.1%', cmap='Blues',
                xticklabels=EMOTION_CLASSES, yticklabels=EMOTION_CLASSES,
                ax=ax, cbar=False)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    acc = res['test_accuracy'] * 100
    ax.set_title(f'{name}\nAccuracy: {acc:.2f}%', fontsize=12)

plt.suptitle('Confusion Matrix Comparison (Normalized)', fontsize=14, y=1.02)
plt.tight_layout()
plt.show()

## Step 6: Error Analysis

Let's identify which emotion pairs are most commonly confused.

In [None]:
# For every model, list the five most frequent (true -> predicted) mistakes
banner = "=" * 60
print(banner)
print("COMMON MISCLASSIFICATIONS")
print(banner)

n_classes = len(EMOTION_CLASSES)

for name, res in results.items():
    print(f"\n{name}:")
    cm = np.array(res['confusion_matrix'])

    # Non-zero off-diagonal cells are the misclassifications; sort them by
    # count, largest first (ties keep their row-major order).
    confusions = sorted(
        (
            (EMOTION_CLASSES[i], EMOTION_CLASSES[j], cm[i, j])
            for i in range(n_classes)
            for j in range(n_classes)
            if i != j and cm[i, j] > 0
        ),
        key=lambda entry: entry[2],
        reverse=True,
    )

    print("  Top 5 confusions (True -> Predicted: Count):")
    for true_label, pred_label, count in confusions[:5]:
        print(f"    {true_label:12s} -> {pred_label:12s}: {count}")

## Step 7: Hardest and Easiest Classes

In [None]:
# Rank emotions from easiest to hardest using the mean F1 across all models
if per_class_f1:
    # Mean F1 per emotion; `avg_f1` and `sorted_emotions` are reused by the
    # summary figure later in the notebook.
    avg_f1 = {}
    for i, emotion in enumerate(EMOTION_CLASSES):
        avg_f1[emotion] = np.mean([per_class_f1[model][i] for model in model_names])

    # Highest average F1 first
    sorted_emotions = sorted(avg_f1.items(), key=lambda item: item[1], reverse=True)

    print("=" * 60)
    print("EMOTION DIFFICULTY RANKING (by average F1)")
    print("=" * 60)
    print(f"\n{'Rank':<6} {'Emotion':<15} {'Avg F1':>10} {'Difficulty':>15}")
    print("-" * 50)

    for rank, (emotion, f1) in enumerate(sorted_emotions, 1):
        # Buckets: above 0.7 is Easy, above 0.5 is Medium, anything else Hard
        difficulty = "Easy" if f1 > 0.7 else "Medium" if f1 > 0.5 else "Hard"
        print(f"{rank:<6} {emotion:<15} {f1:>10.3f} {difficulty:>15}")

    easiest, hardest = sorted_emotions[0], sorted_emotions[-1]
    print(f"\nEasiest: {easiest[0]} (F1={easiest[1]:.3f})")
    print(f"Hardest: {hardest[0]} (F1={hardest[1]:.3f})")

## Step 8: Summary Visualization

In [None]:
# Create a comprehensive 2x2 summary figure:
#   top-left     overall accuracy per model
#   top-right    per-class F1 heatmap
#   bottom-left  average F1 per emotion, coloured by difficulty
#   bottom-right text box with the key findings
# Relies on `colors`, `best_idx`, `avg_f1` and `sorted_emotions` from the
# earlier cells of this notebook.
fig = plt.figure(figsize=(16, 10))

# Accuracy comparison (top left)
ax1 = fig.add_subplot(2, 2, 1)
bars = ax1.bar(model_names, [acc * 100 for acc in accuracies], color=colors[:len(model_names)])
for bar, acc in zip(bars, accuracies):
    ax1.annotate(f'{acc*100:.1f}%', xy=(bar.get_x() + bar.get_width()/2, bar.get_height()),
                 xytext=(0, 3), textcoords='offset points', ha='center', fontweight='bold')
ax1.set_ylabel('Accuracy (%)')
ax1.set_title('Overall Accuracy', fontsize=12, fontweight='bold')
ax1.set_ylim(0, 100)

# Per-class F1 heatmap (top right): rows = models, columns = emotions
if per_class_f1:
    ax2 = fig.add_subplot(2, 2, 2)
    f1_matrix = np.array([per_class_f1[m] for m in model_names])
    sns.heatmap(f1_matrix, annot=True, fmt='.2f', cmap='RdYlGn',
                xticklabels=EMOTION_CLASSES, yticklabels=model_names,
                ax=ax2, vmin=0, vmax=1)
    ax2.set_title('Per-Class F1 Scores', fontsize=12, fontweight='bold')

# Class difficulty (bottom left), using the same 0.7 / 0.5 cut-offs as the
# difficulty ranking table
if per_class_f1:
    ax3 = fig.add_subplot(2, 2, 3)
    avg_f1_values = [avg_f1[e] for e in EMOTION_CLASSES]
    colors_by_difficulty = ['green' if f > 0.7 else 'orange' if f > 0.5 else 'red' for f in avg_f1_values]
    bars = ax3.barh(EMOTION_CLASSES, avg_f1_values, color=colors_by_difficulty)
    ax3.set_xlabel('Average F1 Score')
    ax3.set_title('Emotion Difficulty (Avg across models)', fontsize=12, fontweight='bold')
    ax3.set_xlim(0, 1)

# Key findings (bottom right): text only, so hide the axes frame
ax4 = fig.add_subplot(2, 2, 4)
ax4.axis('off')

findings = f"""
KEY FINDINGS
{'='*40}

Best Model: {model_names[best_idx]}
Best Accuracy: {accuracies[best_idx]*100:.2f}%

Improvement over baseline:
"""
if len(results) > 1:
    # The first loaded model is the reference (HOG + SVM whenever its results
    # file was found, because it is loaded first).
    baseline_acc = accuracies[0]
    for name, acc in zip(model_names[1:], accuracies[1:]):
        improvement = (acc - baseline_acc) * 100
        # '+' format flag emits the correct sign; a hard-coded '+' prefix
        # would render a regression as "+-3.2%".
        findings += f"  {name}: {improvement:+.1f}%\n"

if per_class_f1:
    findings += f"\nEasiest emotion: {sorted_emotions[0][0]}\n"
    findings += f"Hardest emotion: {sorted_emotions[-1][0]}\n"

ax4.text(0.1, 0.9, findings, transform=ax4.transAxes, fontsize=12,
         verticalalignment='top', fontfamily='monospace',
         bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

plt.suptitle('Facial Expression Recognition - Model Comparison Summary', 
             fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

## Summary

### Key Findings:

1. **Transfer Learning wins**: Pretrained models consistently outperform training from scratch

2. **Deep learning beats traditional ML**: Both CNNs outperformed HOG+SVM baseline

3. **Class imbalance matters**: Rare emotions (fear, disgust) are harder to classify

4. **Common confusions**: Similar emotions (e.g., neutral/sadness) are often confused with each other

### Recommendations:

1. **Use transfer learning** for production systems
2. **Collect more data** for underrepresented classes
3. **Consider class weights** or oversampling for imbalanced classes
4. **Use Grad-CAM** (next notebook) to verify that the model focuses on faces

### Next Steps:
- **Notebook 6**: Visualize what the models learn with Grad-CAM