# Notebook 11: Model Comparison and Selection

Systematically compare models and select the best one.

## Learning Objectives
- Compare multiple models fairly
- Use statistical tests for significance
- Understand ensemble methods
- Make informed model selection decisions

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier,
    VotingClassifier, StackingClassifier
)
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from scipy import stats
import warnings
# Suppress every Python warning for the rest of the session to keep cell output short.
# NOTE(review): a blanket 'ignore' also hides library deprecation and convergence
# warnings raised while fitting the models below - confirm that this is intended.
warnings.filterwarnings('ignore')

# Seed NumPy's global random number generator.
np.random.seed(42)
# Apply the seaborn-v0_8 "whitegrid" matplotlib style to all figures in the notebook.
plt.style.use('seaborn-v0_8-whitegrid')

## Part 1: Setup and Baseline

In [None]:
# Build a synthetic 3-class dataset: 2000 samples, 20 features
# (12 informative, 4 redundant), two clusters per class.
X, y = make_classification(
    n_samples=2000, n_features=20,
    n_informative=12, n_redundant=4,
    n_classes=3, n_clusters_per_class=2,
    random_state=42,
)

# Hold out 20% of the samples as a test set, keeping the class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standardise the features using statistics learned from the training split only;
# the test split is transformed with those same statistics.
# NOTE: the scaler is fit on the whole training split, so the cross-validation
# folds used later in the notebook all share these scaling statistics.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training: {len(X_train)}, Test: {len(X_test)}")

## Part 2: Define Models to Compare

In [None]:
# Candidate classifiers, keyed by display name. Insertion order is the order
# in which they are evaluated and plotted in the following cells.
models = dict([
    ('Logistic Regression', LogisticRegression(max_iter=1000, random_state=42)),
    ('KNN', KNeighborsClassifier(n_neighbors=5)),
    ('Decision Tree', DecisionTreeClassifier(max_depth=10, random_state=42)),
    ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(n_estimators=100, random_state=42)),
    ('SVM', SVC(kernel='rbf', random_state=42)),
    ('MLP', MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)),
])

print(f"Comparing {len(models)} models")

## Part 3: Cross-Validation Comparison

In [None]:
# 10-fold stratified cross-validation of every candidate model.
# One seeded splitter is shared by all models, so they are scored on the same folds.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

results = {}    # model name -> summary statistics of the fold accuracies
cv_scores = {}  # model name -> array of per-fold accuracies

print("Cross-Validation Results (10-fold):")
print("="*60)

for name, model in models.items():
    scores = cross_val_score(model, X_train_scaled, y_train, cv=cv, scoring='accuracy')
    mean_acc, std_acc = scores.mean(), scores.std()

    cv_scores[name] = scores
    results[name] = dict(mean=mean_acc, std=std_acc, min=scores.min(), max=scores.max())

    print(f"{name:25s}: {mean_acc:.4f} (+/- {std_acc:.4f})")

# Rank the models from highest to lowest mean CV accuracy.
sorted_results = sorted(results.items(), key=lambda item: item[1]['mean'], reverse=True)

print("\nRanking:")
for rank, (name, res) in enumerate(sorted_results, start=1):
    print(f"{rank}. {name}: {res['mean']:.4f}")

In [None]:
# Visualize CV results: per-fold score distribution (left) and mean +/- std (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Box plot of the per-fold accuracies of each model.
names = list(cv_scores.keys())
scores_list = [cv_scores[name] for name in names]

# The tick labels are set through the axis rather than boxplot's `labels=`
# keyword, which is deprecated in recent Matplotlib releases (renamed to
# `tick_labels`). Setting them on the axis works on old and new versions alike.
bp = axes[0].boxplot(scores_list, patch_artist=True)
for patch in bp['boxes']:
    patch.set_facecolor('lightblue')
# boxplot draws its boxes at positions 1..N by default.
axes[0].set_xticks(np.arange(1, len(names) + 1))
axes[0].set_xticklabels(names, rotation=45, ha='right')
axes[0].set_ylabel('Accuracy')
axes[0].set_title('Model Comparison: CV Scores')

# Bar plot of the mean accuracy with one standard deviation as error bar.
means = [results[name]['mean'] for name in names]
stds = [results[name]['std'] for name in names]

x_pos = np.arange(len(names))
axes[1].bar(x_pos, means, yerr=stds, capsize=5, alpha=0.7)
axes[1].set_xticks(x_pos)
axes[1].set_xticklabels(names, rotation=45, ha='right')
axes[1].set_ylabel('Accuracy')
axes[1].set_title('Mean CV Score with Std Dev')

plt.tight_layout()
plt.show()

## Part 4: Statistical Significance Testing

In [None]:
# Pairwise paired t-tests on the per-fold CV accuracies of every pair of models.
# The test is paired because all models were scored on the same folds.
model_names = list(cv_scores.keys())
n_models = len(model_names)

# Symmetric p-value matrix; the diagonal stays at 1 (a model versus itself).
p_values = np.ones((n_models, n_models))

for i in range(n_models):
    for j in range(i + 1, n_models):
        t_stat, p_value = stats.ttest_rel(cv_scores[model_names[i]],
                                          cv_scores[model_names[j]])
        # Identical fold scores give zero variance in the differences and a
        # NaN p-value; report that as "no evidence of a difference".
        if np.isnan(p_value):
            p_value = 1.0
        p_values[i, j] = p_value
        p_values[j, i] = p_value

# Visualize the lower triangle only (the matrix is symmetric).
plt.figure(figsize=(10, 8))
mask = np.triu(np.ones_like(p_values, dtype=bool), k=1)
sns.heatmap(p_values, annot=True, fmt='.3f', cmap='RdYlGn_r',
            xticklabels=model_names, yticklabels=model_names,
            mask=mask, vmin=0, vmax=0.1)
# Mathtext keeps the alpha symbol independent of the file encoding.
plt.title('Pairwise t-test p-values\n(Green = significant difference at $\\alpha$=0.05)')
plt.tight_layout()
plt.show()

print("p < 0.05: Statistically significant difference")

# With many pairwise tests some p-values fall below 0.05 by chance alone,
# so also report the Bonferroni-adjusted threshold.
n_comparisons = n_models * (n_models - 1) // 2
if n_comparisons > 0:
    print(f"Note: {n_comparisons} pairwise tests were run; the Bonferroni-corrected "
          f"threshold is p < {0.05 / n_comparisons:.4f}")

## Part 5: Test Set Evaluation

In [None]:
# Fit every model on the full training split, then record its accuracy on the
# training and test splits. The models in `models` are fitted in place.
test_results = []

for name, model in models.items():
    model.fit(X_train_scaled, y_train)

    train_score = model.score(X_train_scaled, y_train)
    test_score = model.score(X_test_scaled, y_test)
    gap = train_score - test_score  # positive gap = better on train than on test

    test_results.append(
        dict(Model=name, Train=train_score, Test=test_score, Overfit=gap)
    )

# One row per model, best test accuracy first.
df_test = pd.DataFrame(test_results)
df_test = df_test.sort_values('Test', ascending=False)
print("Test Set Results:")
print(df_test.to_string(index=False))

## Part 6: Ensemble Methods

In [None]:
def _voting_estimators():
    """Return fresh (name, estimator) pairs used as members of a voting ensemble."""
    return [
        ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
        ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42)),
        ('mlp', MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)),
    ]


# Hard voting: majority vote over the members' predicted labels.
voting_hard = VotingClassifier(estimators=_voting_estimators(), voting='hard')

# Soft voting: the members' predicted class probabilities are combined instead.
voting_soft = VotingClassifier(estimators=_voting_estimators(), voting='soft')

# Evaluate both ensembles: 5-fold CV on the training split, then refit on the
# whole training split and score on the test split.
print("Ensemble Methods:")
print("="*50)

ensembles = {'Hard Voting': voting_hard, 'Soft Voting': voting_soft}
for name, model in ensembles.items():
    scores = cross_val_score(model, X_train_scaled, y_train, cv=5)
    model.fit(X_train_scaled, y_train)
    test_score = model.score(X_test_scaled, y_test)

    print(f"\n{name}:")
    print(f"  CV Score: {scores.mean():.4f} (+/- {scores.std():.4f})")
    print(f"  Test Score: {test_score:.4f}")

In [None]:
# Stacking: a logistic-regression meta-learner is trained on the outputs of the
# base estimators, which are produced with an internal 5-fold split (cv=5).
base_learners = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42)),
    ('svm', SVC(kernel='rbf', probability=True, random_state=42)),
]
meta_learner = LogisticRegression(max_iter=1000)

stacking = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_learner,
    cv=5,
)

# 5-fold CV estimate on the training split, then refit and score on the test split.
scores = cross_val_score(stacking, X_train_scaled, y_train, cv=5)
stacking.fit(X_train_scaled, y_train)
test_score = stacking.score(X_test_scaled, y_test)

print("\nStacking Classifier:")
print(f"  CV Score: {scores.mean():.4f} (+/- {scores.std():.4f})")
print(f"  Test Score: {test_score:.4f}")

## Part 7: Final Model Selection

In [None]:
# Comprehensive comparison of the individual models and the two ensembles.
all_models = {
    **models,
    'Voting (Soft)': voting_soft,
    'Stacking': stacking
}

final_comparison = []

for name, model in all_models.items():
    # Named `fold_scores` so that the `cv_scores` dict of per-model fold scores
    # built during the 10-fold comparison is not overwritten; the CV plot and
    # the pairwise t-test cells read that dict and would break on re-run.
    fold_scores = cross_val_score(model, X_train_scaled, y_train, cv=5)

    # cross_val_score fits clones, so the model itself is only fitted here
    # when an earlier cell has not already done so.
    if not hasattr(model, 'classes_'):
        model.fit(X_train_scaled, y_train)

    test_score = model.score(X_test_scaled, y_test)

    final_comparison.append({
        'Model': name,
        'CV Mean': fold_scores.mean(),
        'CV Std': fold_scores.std(),
        'Test': test_score
    })

# Rank by cross-validated accuracy. The test column is shown for reference only:
# ranking (and therefore selecting) on the test score would make the reported
# test accuracy an optimistically biased estimate.
df_final = pd.DataFrame(final_comparison).sort_values('CV Mean', ascending=False)
print("Final Model Comparison:")
print(df_final.to_string(index=False))

In [None]:
# Select the best model by cross-validated accuracy, not by test accuracy:
# the test split must play no part in the choice, otherwise the accuracy
# reported below is no longer an unbiased estimate of generalisation.
best_row = df_final.loc[df_final['CV Mean'].idxmax()]
best_model_name = best_row['Model']
best_model = all_models[best_model_name]

print(f"\nSelected Model: {best_model_name}")
print(f"CV Accuracy: {best_row['CV Mean']:.4f}")
print(f"Test Accuracy: {best_row['Test']:.4f}")

# Final evaluation of the selected (already fitted) model on the held-out test split.
y_pred = best_model.predict(X_test_scaled)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

## Summary

In this notebook, you learned:

### Model Comparison
- Use cross-validation for fair comparison
- Consider both mean and variance of scores
- Use statistical tests to check whether score differences are significant

### Ensemble Methods
- **Voting**: Combine predictions (hard/soft)
- **Stacking**: Use meta-learner
- Often outperform individual models

### Selection Criteria
- Performance (accuracy, F1, etc.)
- Stability (low variance across folds)
- Overfitting (train-test gap)
- Interpretability and speed

### Next Steps
Continue to **Notebook 12** for the complete ML pipeline!