# Model Evaluation

Measure and improve model performance.

## Why Evaluate Models?
- Measure accuracy
- Compare models
- Detect overfitting
- Optimize performance

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    learning_curve
)
from sklearn.linear_model import (
    LinearRegression,
    LogisticRegression
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import *

# Switch seaborn to its 'whitegrid' style for the plots in this notebook.
sns.set_style('whitegrid')

## 1. Regression Metrics

In [None]:
# Build a noisy linear sample: y = 2.5 * x + 5 plus Gaussian noise (scale 2).
np.random.seed(42)  # fixed seed keeps the sample reproducible between runs
n_samples = 100
X = 10 * np.random.rand(n_samples, 1)
noise = 2 * np.random.randn(n_samples, 1)
y_true = 2.5 * X + 5 + noise

# Fit a linear model and predict on the very same points it was fitted on,
# so the metrics computed from y_pred describe the fit, not generalization.
model = LinearRegression().fit(X, y_true)
y_pred = model.predict(X)

print("Regression model trained!")

In [None]:
# Score the regression predictions with the four standard error measures.
mae = mean_absolute_error(y_true, y_pred)
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)  # same units as the target, unlike MSE
r2 = r2_score(y_true, y_pred)

# Labels are left-aligned to a common width so the values line up.
print("Regression Metrics:")
for label, value in (
    ("MAE:", mae),
    ("MSE:", mse),
    ("RMSE:", rmse),
    ("R²:", r2),
):
    print(f"{label:<6}{value:.3f}")

print("\nInterpretation:")
print(f"- On average, predictions are off by {mae:.2f} units (MAE)")
print(f"- Model explains {r2*100:.1f}% of variance (R²)")

## 2. Classification Metrics

In [None]:
# Load the iris measurements and species labels
from sklearn.datasets import load_iris

iris = load_iris()
X, y = iris.data, iris.target

# Collapse the multi-class target into a binary one: 1 where the label is
# class 0, and 0 for every other class.
y_binary = np.equal(y, 0).astype(int)

# Hold out 30% of the rows for testing (seeded for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_binary,
    test_size=0.3,
    random_state=42,
)

# Fit the classifier, then keep both the hard predictions and the
# predicted probability of the positive class (column 1).
clf = LogisticRegression(max_iter=200).fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_pred_proba = clf.predict_proba(X_test)[:, 1]

print("Classification model trained!")

In [None]:
# Confusion Matrix
# Pin the label order to [0, 1] so the result is always a 2x2 matrix laid out
# as [[TN, FP], [FN, TP]], even if one class is absent from y_test or y_pred.
# Without this, the four-way unpacking below fails on a 1x1 matrix.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion Matrix')
plt.show()

# Extract values (row-major order of the 2x2 matrix)
tn, fp, fn, tp = cm.ravel()
print(f"True Negatives:  {tn}")
print(f"False Positives: {fp}")
print(f"False Negatives: {fn}")
print(f"True Positives:  {tp}")

In [None]:
# Score the hard class predictions against the held-out labels.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Labels are left-aligned to a common width so the values line up.
print("Classification Metrics:")
for label, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(f"{label:<11}{value:.3f}")

print("\nInterpretation:")
print(f"- {accuracy*100:.1f}% of predictions are correct")
print(f"- {precision*100:.1f}% of positive predictions are correct")
print(f"- {recall*100:.1f}% of actual positives were found")

## 3. ROC Curve and AUC

In [None]:
# Calculate ROC curve from the positive-class probabilities
fpr, tpr, thresholds = roc_curve(
    y_test,
    y_pred_proba
)
# Named roc_auc rather than auc: the notebook's wildcard import from
# sklearn.metrics brings in a function called auc, and rebinding that name
# to a float would break any later call to auc(fpr, tpr).
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Plot the model's curve against the diagonal of a random classifier
plt.figure(figsize=(8, 6))
plt.plot(
    fpr, tpr,
    linewidth=2,
    label=f'Model (AUC = {roc_auc:.3f})'
)
plt.plot(
    [0, 1], [0, 1],
    'k--',
    label='Random'
)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.grid(True)
plt.show()

print(f"AUC Score: {roc_auc:.3f}")
print("\nInterpretation:")
# Rule-of-thumb buckets; the cut-offs are conventions, not hard limits.
if roc_auc > 0.9:
    print("Excellent model!")
elif roc_auc > 0.8:
    print("Good model")
elif roc_auc > 0.7:
    print("Fair model")
else:
    print("Poor model")

## 4. Cross-Validation

In [None]:
# k-fold cross-validation (k = 5)
# The fold count lives in one place so the scoring call and the bar chart
# below cannot drift apart if it is changed.
n_folds = 5
scores = cross_val_score(
    clf, X, y_binary,
    cv=n_folds,
    scoring='accuracy'
)

print("Cross-Validation Scores:")
for i, score in enumerate(scores, 1):
    print(f"Fold {i}: {score:.3f}")

print(f"\nMean: {scores.mean():.3f}")
print(f"Std:  {scores.std():.3f}")

# Visualize: one bar per fold, with the mean as a dashed reference line
plt.figure(figsize=(8, 5))
# Bar positions come from the number of scores actually returned instead of
# a hard-coded range(1, 6).
plt.bar(range(1, len(scores) + 1), scores)
plt.axhline(
    y=scores.mean(),
    color='r',
    linestyle='--',
    label=f'Mean: {scores.mean():.3f}'
)
plt.xlabel('Fold')
plt.ylabel('Accuracy')
plt.title('Cross-Validation Scores')
plt.legend()
plt.show()

## 5. Learning Curves

In [None]:
# Generate learning curve
# shuffle=True is required here: learning_curve trains on *prefixes* of each
# fold's training set, and the iris rows are ordered by class. Without
# shuffling, the smallest prefixes contain a single class, the fit fails,
# and the corresponding scores come back as NaN (gaps at the left of the
# plot). random_state keeps the shuffled curve reproducible.
train_sizes, train_scores, val_scores = learning_curve(
    clf, X, y_binary,
    cv=5,
    train_sizes=np.linspace(0.1, 1.0, 10),
    scoring='accuracy',
    shuffle=True,
    random_state=42
)

# Calculate means and stds across the CV folds (axis 1) for each train size
train_mean = train_scores.mean(axis=1)
train_std = train_scores.std(axis=1)
val_mean = val_scores.mean(axis=1)
val_std = val_scores.std(axis=1)

# Plot each mean curve with a +/- one standard deviation band around it
plt.figure(figsize=(10, 6))
plt.plot(
    train_sizes, train_mean,
    label='Training score',
    linewidth=2
)
plt.fill_between(
    train_sizes,
    train_mean - train_std,
    train_mean + train_std,
    alpha=0.2
)
plt.plot(
    train_sizes, val_mean,
    label='Validation score',
    linewidth=2
)
plt.fill_between(
    train_sizes,
    val_mean - val_std,
    val_mean + val_std,
    alpha=0.2
)
plt.xlabel('Training Set Size')
plt.ylabel('Accuracy')
plt.title('Learning Curves')
plt.legend()
plt.grid(True)
plt.show()

## 6. Model Comparison

In [None]:
# Compare multiple models with the same 5-fold accuracy protocol
models = {
    'Logistic Regression': LogisticRegression(
        max_iter=200
    ),
    'Decision Tree': DecisionTreeClassifier(
        random_state=42
    ),
    'Random Forest': RandomForestClassifier(
        random_state=42
    )
}

results = []

# Loop-local names (estimator, cv_scores) are used on purpose: the notebook
# already has a `model` (the fitted regression model) and a `scores` array
# (the cross-validation section), and this loop should not rebind them.
for name, estimator in models.items():
    cv_scores = cross_val_score(
        estimator, X, y_binary,
        cv=5,
        scoring='accuracy'
    )
    results.append({
        'Model': name,
        'Mean': cv_scores.mean(),
        'Std': cv_scores.std()
    })

# One row per model, best mean accuracy first
results_df = pd.DataFrame(results).sort_values(
    'Mean',
    ascending=False
)
results_df

In [None]:
# Visualize comparison
plt.figure(figsize=(10, 6))
plt.barh(
    results_df['Model'],
    results_df['Mean']
)
plt.xlabel('Mean Accuracy')
plt.title('Model Comparison')
# Zoom in on the top of the accuracy range so small differences are visible,
# but widen the window when a model scores near or below 0.8. A fixed lower
# limit of 0.8 would leave such a model with no visible bar at all.
x_lower = max(0.0, min(0.8, results_df['Mean'].min() - 0.05))
plt.xlim(x_lower, 1.0)
plt.grid(axis='x')
plt.tight_layout()
plt.show()

## 7. Overfitting vs Underfitting

In [None]:
# Sweep decision-tree depth and record train/test accuracy at each depth.
# NOTE(review): this reuses the binary "class 0 vs rest" split; if a very
# shallow tree already separates that target, both curves will be flat and
# no train/test gap will appear - confirm the plot shows what is intended.
from sklearn.tree import DecisionTreeClassifier

depths = range(1, 20)
train_scores = []
test_scores = []

for depth in depths:
    dt = DecisionTreeClassifier(max_depth=depth, random_state=42)
    dt.fit(X_train, y_train)

    # .score() reports mean accuracy on the given data
    train_scores.append(dt.score(X_train, y_train))
    test_scores.append(dt.score(X_test, y_test))

# Plot both accuracy curves against depth on a single figure
plt.figure(figsize=(10, 6))
for curve, curve_label in (
    (train_scores, 'Training'),
    (test_scores, 'Testing'),
):
    plt.plot(depths, curve, label=curve_label, linewidth=2)
plt.xlabel('Tree Depth')
plt.ylabel('Accuracy')
plt.title('Overfitting Example')
plt.legend()
plt.grid(True)
plt.show()

for line in (
    "Interpretation:",
    "- Gap between train and test = overfitting",
    "- Both low = underfitting",
    "- Both high = good fit",
):
    print(line)

## Practice Exercises

### Exercise 1
Calculate precision, recall, and F1 score for a custom decision threshold
(e.g. predict the positive class when `y_pred_proba >= 0.3` instead of the default 0.5).

In [None]:
# Your code here


### Exercise 2
Compare 3 different models using 
cross-validation.

In [None]:
# Your code here


## Key Takeaways

✅ **Regression** - MAE, MSE, RMSE, R²  
✅ **Classification** - Accuracy, Precision, Recall, F1  
✅ **ROC/AUC** - Classification performance across all decision thresholds  
✅ **Cross-Validation** - Robust evaluation  
✅ **Learning Curves** - Detect over/underfitting  

**Next:** [Feature Engineering](05_feature_engineering.ipynb) →