# Model Evaluation

## Objective
Comprehensive evaluation of trained models:
- Confusion matrices
- Per-class metrics
- Error analysis
- Confidence calibration
- Model fairness (if applicable)

In [None]:
# Imports
import numpy as np
import pandas as pd
from pathlib import Path
import pickle
import scipy.sparse
import json

# Sklearn metrics
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score,
    precision_recall_fscore_support
)
from sklearn.model_selection import train_test_split

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set_style('whitegrid')

## 1. Load Models and Test Data

In [None]:
# Directories holding the serialized models and the processed data
models_dir = Path('../models')
data_dir = Path('../data/processed')


def _unpickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Trained classifiers
cat_model = _unpickle(models_dir / 'category_model.pkl')
pri_model = _unpickle(models_dir / 'priority_model.pkl')

# Label encoders (their `classes_` supply the class names used in reports)
category_encoder = _unpickle(data_dir / 'category_encoder.pkl')
priority_encoder = _unpickle(data_dir / 'priority_encoder.pkl')

# Feature matrix and the two target vectors
X = scipy.sparse.load_npz(data_dir / 'X_tfidf.npz')
y_category = np.load(data_dir / 'y_category.npy')
y_priority = np.load(data_dir / 'y_priority.npy')

# Recreate the held-out split: the arguments below (seed, test fraction,
# stratification on category) are meant to mirror the training notebook, so
# the test rows are the ones the models never saw.
# NOTE(review): confirm these match the training notebook exactly.
RANDOM_STATE = 42
(
    X_train, X_test,
    y_cat_train, y_cat_test,
    y_pri_train, y_pri_test,
) = train_test_split(
    X,
    y_category,
    y_priority,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y_category,
)

print(f"Test set size: {X_test.shape[0]}")

## 2. Category Model Evaluation

In [None]:
# Predicted labels and per-class probabilities for the held-out split
y_cat_pred = cat_model.predict(X_test)
y_cat_proba = cat_model.predict_proba(X_test)

# Overall accuracy
cat_accuracy = accuracy_score(y_cat_test, y_cat_pred)
print(f"Category Model Accuracy: {cat_accuracy:.3f}")

# Pin the label set to every class the encoder knows. Without `labels`, the
# report only covers labels that occur in y_true/y_pred, so a class missing
# from both would make `target_names` the wrong length.
# NOTE(review): assumes y_category holds integer codes 0..n_classes-1 in the
# same order as category_encoder.classes_ - confirm against preprocessing.
cat_labels = np.arange(len(category_encoder.classes_))

# Classification report (zero_division=0 keeps the default value for
# undefined metrics but without emitting a warning per empty class)
print("\nClassification Report:")
print(classification_report(
    y_cat_test,
    y_cat_pred,
    labels=cat_labels,
    target_names=category_encoder.classes_,
    zero_division=0,
))

### Confusion Matrix

In [None]:
# Confusion matrix (rows: true class, columns: predicted class).
# `labels` fixes the matrix to one row/column per encoder class so it always
# lines up with the tick labels, even if a class is absent from the test split.
# NOTE(review): assumes y_category holds integer codes 0..n_classes-1 in the
# same order as category_encoder.classes_ - confirm against preprocessing.
cm_cat = confusion_matrix(
    y_cat_test,
    y_cat_pred,
    labels=np.arange(len(category_encoder.classes_)),
)

plt.figure(figsize=(10, 8))
sns.heatmap(cm_cat, annot=True, fmt='d', cmap='Blues',
            xticklabels=category_encoder.classes_,
            yticklabels=category_encoder.classes_)
plt.title('Category Prediction Confusion Matrix')
plt.ylabel('True Category')
plt.xlabel('Predicted Category')
plt.tight_layout()
plt.show()

## 3. Priority Model Evaluation

In [None]:
# Predicted labels and per-class probabilities for the held-out split
y_pri_pred = pri_model.predict(X_test)
y_pri_proba = pri_model.predict_proba(X_test)

# Overall accuracy
pri_accuracy = accuracy_score(y_pri_test, y_pri_pred)
print(f"Priority Model Accuracy: {pri_accuracy:.3f}")

# Pin the label set to every class the encoder knows. The train/test split is
# stratified on category only, so a rare priority can be missing from the test
# rows; without `labels` that would make `target_names` the wrong length.
# NOTE(review): assumes y_priority holds integer codes 0..n_classes-1 in the
# same order as priority_encoder.classes_ - confirm against preprocessing.
pri_labels = np.arange(len(priority_encoder.classes_))

# Classification report (zero_division=0 keeps the default value for
# undefined metrics but without emitting a warning per empty class)
print("\nClassification Report:")
print(classification_report(
    y_pri_test,
    y_pri_pred,
    labels=pri_labels,
    target_names=priority_encoder.classes_,
    zero_division=0,
))

### Confusion Matrix

In [None]:
# Confusion matrix (rows: true class, columns: predicted class).
# `labels` fixes the matrix to one row/column per encoder class so it always
# lines up with the tick labels; the split is not stratified on priority, so a
# rare priority may not appear in the test rows at all.
# NOTE(review): assumes y_priority holds integer codes 0..n_classes-1 in the
# same order as priority_encoder.classes_ - confirm against preprocessing.
cm_pri = confusion_matrix(
    y_pri_test,
    y_pri_pred,
    labels=np.arange(len(priority_encoder.classes_)),
)

plt.figure(figsize=(8, 6))
sns.heatmap(cm_pri, annot=True, fmt='d', cmap='Oranges',
            xticklabels=priority_encoder.classes_,
            yticklabels=priority_encoder.classes_)
plt.title('Priority Prediction Confusion Matrix')
plt.ylabel('True Priority')
plt.xlabel('Predicted Priority')
plt.tight_layout()
plt.show()

## 4. Error Analysis

In [None]:
# TODO: Load original text data to analyze misclassifications
# Identify samples where model prediction was wrong
# cat_errors = (y_cat_pred != y_cat_test)
# print(f"Category errors: {cat_errors.sum()} / {len(y_cat_test)}")

# Analyze error patterns:
# - Which categories are most confused?
# - Are low-confidence predictions more likely to be wrong?
# - What text patterns lead to errors?

## 5. Confidence Analysis

In [None]:
# Confidence of a prediction = the largest class probability in its row
cat_confidence = y_cat_proba.max(axis=1)
pri_confidence = y_pri_proba.max(axis=1)

# Both models get the same histogram treatment, so drive the panels from data
confidence_by_model = (
    ('Category', cat_confidence),
    ('Priority', pri_confidence),
)

plt.figure(figsize=(14, 5))

for panel, (model_name, scores) in enumerate(confidence_by_model, start=1):
    mean_score = scores.mean()
    plt.subplot(1, 2, panel)
    plt.hist(scores, bins=30, edgecolor='black', alpha=0.7)
    plt.xlabel('Confidence Score')
    plt.ylabel('Frequency')
    plt.title(f'{model_name} Confidence Distribution')
    # Red line: observed mean; green line: the 0.6 cut-off under evaluation
    plt.axvline(mean_score, color='red', linestyle='--', label=f'Mean: {mean_score:.2f}')
    plt.axvline(0.6, color='green', linestyle='--', label='Threshold: 0.6')
    plt.legend()

plt.tight_layout()
plt.show()

# Share of predictions that clear the threshold
for model_name, scores in confidence_by_model:
    print(f"{model_name} predictions with confidence > 0.6: {(scores > 0.6).mean()*100:.1f}%")

### Accuracy vs. Confidence

In [None]:
# Check whether higher confidence goes with higher accuracy.
# For each threshold, keep only predictions at or above it and record:
#   - accuracy on the kept predictions
#   - coverage: the fraction of the test set that was kept
# linspace gives exactly ten evenly spaced thresholds (0.50 .. 0.95) without
# the endpoint ambiguity of a floating-point step in arange.
confidence_thresholds = np.linspace(0.5, 0.95, 10)
accuracies_cat = []
coverage_cat = []

for threshold in confidence_thresholds:
    mask = cat_confidence >= threshold
    coverage_cat.append(mask.mean())
    if mask.any():
        accuracies_cat.append(accuracy_score(y_cat_test[mask], y_cat_pred[mask]))
    else:
        # No prediction clears this threshold: accuracy is undefined, not 0.
        # NaN makes matplotlib leave a gap instead of plotting a false 0%.
        accuracies_cat.append(np.nan)

plt.figure(figsize=(10, 5))
plt.plot(confidence_thresholds, accuracies_cat, marker='o', label='Accuracy')
plt.plot(confidence_thresholds, coverage_cat, marker='s', label='Coverage')
plt.xlabel('Confidence Threshold')
plt.ylabel('Score')
plt.title('Category Model: Accuracy vs. Coverage by Confidence Threshold')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

## 6. Model Performance Summary

In [None]:
def _model_summary(accuracy, confidence):
    """Return the headline metrics for one model as plain Python floats.

    `accuracy` is the model's overall test accuracy and `confidence` is the
    array of per-prediction confidence scores.
    """
    return {
        "accuracy": float(accuracy),
        "mean_confidence": float(confidence.mean()),
        "predictions_above_0.6_confidence": float((confidence > 0.6).mean()),
    }


# Collect the headline numbers for both models; values are cast to built-in
# types so the dict can be passed to json.dumps
summary = {
    "category_model": _model_summary(cat_accuracy, cat_confidence),
    "priority_model": _model_summary(pri_accuracy, pri_confidence),
    "test_set_size": int(len(y_cat_test)),
    "evaluation_date": pd.Timestamp.now().isoformat(),
}

print(json.dumps(summary, indent=2))

## 7. Recommendations

Based on evaluation results:

**TODO: Fill in after running evaluation**

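- **Confidence Threshold**: Use a confidence threshold of 0.6 or higher so that only higher-confidence predictions are surfaced (confirm the exact value against the accuracy-vs-coverage plot in Section 5)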
- **Model Improvements**: <!-- e.g., Collect more data for minority classes, try advanced models -->
- **Production Monitoring**: Track prediction accuracy over time to detect drift
- **Fallback Strategy**: Route low-confidence predictions to a human agent without showing a model suggestion

**Next Steps**:
1. Document findings in `docs/MODEL-CARD.md`
2. Implement prediction API in `api/app.py`
3. Set up monitoring dashboard
4. Plan model retraining schedule