# Model Comparison – Cricket Match Winner Prediction

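This notebook trains **six classification models** (Logistic Regression, Decision Tree, KNN, Naive Bayes, Random Forest and XGBoost) on the Match dataset, evaluates them side by side on a held-out test set, and saves the trained models together with their metrics.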

In [None]:
import pandas as pd
import numpy as np
import json
import joblib
import os
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score,
    recall_score, f1_score, matthews_corrcoef,
    confusion_matrix, classification_report, roc_curve
)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

## 1. Load Dataset

In [None]:
# Read the match records into a DataFrame and show a quick overview:
# the overall shape, how often each value of the `Winner` column occurs,
# and (as the cell's displayed value) the first rows.
df = pd.read_csv('../Match_dataset.csv')
print(f'Dataset shape: {df.shape}')

winner_counts = df["Winner"].value_counts()
print(f'\nTarget distribution:\n{winner_counts}')

df.head()

## 2. Feature Engineering & Preprocessing

In [None]:
# Build the model inputs: derived difference features, integer-encoded
# categoricals and target, a stratified train/test split, and standardised
# feature matrices.

# Derived features: each *_Diff column is the Team A value minus the Team B value.
df['Ranking_Diff'] = df['Team_A_Ranking'] - df['Team_B_Ranking']
df['Form_Diff'] = df['Team_A_Form'] - df['Team_B_Form']
df['Tech_Diff'] = df['Team_A_Tech_Index'] - df['Team_B_Tech_Index']
df['H2H_Diff'] = df['HeadToHead_A_Wins'] - df['HeadToHead_B_Wins']
# Binary flags: 1 when the toss winner is 'Team_A' / when the decision is 'Bat'.
df['Team_A_Won_Toss'] = (df['Toss_Winner'] == 'Team_A').astype(int)
df['Toss_Bat'] = (df['Toss_Decision'] == 'Bat').astype(int)

# Encode categorical features as integer codes (one encoder per column so
# the fitted mapping can be saved and reused later).
le_pitch = LabelEncoder()
df['Pitch_Type_Enc'] = le_pitch.fit_transform(df['Pitch_Type'])
le_stage = LabelEncoder()
df['Stage_Enc'] = le_stage.fit_transform(df['Stage'])

# Encode target
le_target = LabelEncoder()
df['Winner_Enc'] = le_target.fit_transform(df['Winner'])

# Select features
feature_cols = [
    'Team_A_Ranking', 'Team_B_Ranking', 'Team_A_Form', 'Team_B_Form',
    'HeadToHead_A_Wins', 'HeadToHead_B_Wins', 'Venue_HomeAdvantage_A',
    'Venue_HomeAdvantage_B', 'Avg_T20_Score_Venue', 'Team_A_Tech_Index',
    'Team_B_Tech_Index', 'Match_Total', 'Ranking_Diff', 'Form_Diff',
    'Tech_Diff', 'H2H_Diff', 'Team_A_Won_Toss', 'Toss_Bat',
    'Pitch_Type_Enc', 'Stage_Enc'
]

X = df[feature_cols]
y = df['Winner_Enc']

# Train-Test Split: done BEFORE scaling so that the scaler never sees the
# test rows. Same test_size / random_state / stratify as before, so the
# rows assigned to each side are unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale features: fit the mean/variance on the training rows only, then
# apply that same transform to the test rows (avoids train/test leakage).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix transformed with the train-fitted scaler; kept so any
# code that still refers to `X_scaled` keeps working.
X_scaled = scaler.transform(X)

print(f'Training set: {X_train.shape[0]} samples')
print(f'Test set:     {X_test.shape[0]} samples')

## 3. Save Preprocessing Artifacts

In [None]:
# Persist the fitted preprocessing objects (scaler, the three label
# encoders and the ordered feature list) as a single joblib file so the
# same transformations can be re-applied outside this notebook.
os.makedirs('../model_pkls', exist_ok=True)

preprocessing = dict(
    scaler=scaler,
    le_pitch=le_pitch,
    le_stage=le_stage,
    le_target=le_target,
    feature_cols=feature_cols,
)

joblib.dump(preprocessing, '../model_pkls/preprocessing.pkl')
print('Preprocessing artifacts saved.')

## 4. Define All 6 Models

In [None]:
# Registry of the classifiers to compare, keyed by display name.
# Entries are added in a fixed order; the training and ROC cells iterate
# over this dict, so insertion order is the order models are processed.
models = {}
models['Logistic Regression'] = LogisticRegression(max_iter=1000, random_state=42)
models['Decision Tree'] = DecisionTreeClassifier(max_depth=10, random_state=42)
models['KNN'] = KNeighborsClassifier(n_neighbors=7)
models['Naive Bayes'] = GaussianNB()
models['Random Forest'] = RandomForestClassifier(
    n_estimators=200, max_depth=15, random_state=42
)
models['XGBoost'] = XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
)

## 5. Train, Evaluate & Save Each Model

In [None]:
# Fit every model in `models` on the training split, score it on the test
# split, print a per-model report, and write the fitted model to
# ../model_pkls/<name>.pkl. Rounded metrics plus the confusion matrix
# (as nested lists) are collected in `all_metrics`, keyed by model name.
all_metrics = {}

for name, model in models.items():
    banner = f"{'='*60}"
    print(banner)
    print(f'Training: {name}')
    print(banner)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Column 1 of predict_proba is the probability of the class encoded as 1.
    y_prob = model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)

    # Computed once: the array form is printed, the list form is stored
    # so the metrics dict stays JSON-serialisable.
    cm_array = confusion_matrix(y_test, y_pred)
    cm = cm_array.tolist()

    raw_scores = {
        'Accuracy': accuracy,
        'AUC': auc,
        'Precision': precision,
        'Recall': recall,
        'F1': f1,
        'MCC': mcc,
    }
    model_metrics = {label: round(value, 4) for label, value in raw_scores.items()}
    model_metrics['Confusion_Matrix'] = cm
    all_metrics[name] = model_metrics

    print(f'  Accuracy:  {accuracy:.4f}')
    print(f'  AUC Score: {auc:.4f}')
    print(f'  Precision: {precision:.4f}')
    print(f'  Recall:    {recall:.4f}')
    print(f'  F1 Score:  {f1:.4f}')
    print(f'  MCC Score: {mcc:.4f}')
    print(f'\n  Confusion Matrix:\n  {cm_array}')
    report = classification_report(y_test, y_pred, target_names=le_target.classes_)
    print(f'\n  Classification Report:\n{report}')

    # Save model under a filesystem-friendly name (lower case, underscores).
    safe_name = '_'.join(name.lower().split(' '))
    joblib.dump(model, f'../model_pkls/{safe_name}.pkl')
    print(f'  Model saved to ../model_pkls/{safe_name}.pkl\n')

## 6. Save All Metrics

In [None]:
# Write the collected per-model metrics to disk as indented JSON.
with open('../model_pkls/metrics.json', 'w') as metrics_file:
    metrics_file.write(json.dumps(all_metrics, indent=2))

print('All metrics saved to ../model_pkls/metrics.json')

## 7. Model Comparison Table

In [None]:
# Tabulate the scalar metrics (everything except the confusion matrix)
# with one row per model, then display the table with the best value in
# each column highlighted.
metric_columns = ['Accuracy', 'AUC', 'Precision', 'Recall', 'F1', 'MCC']

comparison_rows = [
    {'Model': name, **{column: m[column] for column in metric_columns}}
    for name, m in all_metrics.items()
]

comparison_df = pd.DataFrame(comparison_rows).set_index('Model')
comparison_df.style.highlight_max(axis=0, color='#c6efce')

## 8. Visual Comparison – Bar Chart

In [None]:
# Grouped bar chart: one group per model, one bar per metric column of
# `comparison_df`.
fig, ax = plt.subplots(figsize=(12, 5))
comparison_df.plot.bar(ax=ax)

ax.set_title('Model Comparison – All Evaluation Metrics')
ax.set_ylabel('Score')
# Upper limit slightly above 1.0 so bars at the maximum are not clipped.
ax.set_ylim(0, 1.05)
ax.legend(fontsize=9, loc='lower right')

# Tilt the model names so they do not overlap.
plt.xticks(rotation=25, ha='right')
fig.tight_layout()
plt.show()

## 9. Confusion Matrices – All Models

In [None]:
# Draw one confusion-matrix heatmap per entry in `all_metrics`, laid out
# three to a row. The number of rows is derived from the number of models
# so the cell keeps working if models are added to or removed from the
# comparison; with six models this yields the same 2x3, 16x9 figure.
target_names = le_target.classes_
cmaps = ['Blues', 'Greens', 'Oranges', 'Purples', 'YlGn', 'Reds']

n_models = len(all_metrics)
n_cols = 3
# Ceiling division; at least one row so plt.subplots always gets a valid grid.
n_rows = max(1, (n_models + n_cols - 1) // n_cols)

# squeeze=False keeps `axes` two-dimensional even when there is a single row.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 4.5 * n_rows), squeeze=False)

for idx, (name, m) in enumerate(all_metrics.items()):
    ax = axes[idx // n_cols, idx % n_cols]
    cm = np.array(m['Confusion_Matrix'])
    # Wrap around the colormap list if there are more models than colormaps.
    sns.heatmap(cm, annot=True, fmt='d', cmap=cmaps[idx % len(cmaps)],
                xticklabels=target_names, yticklabels=target_names, ax=ax)
    ax.set_title(name)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')

# Hide grid cells that have no model assigned to them.
for unused_ax in axes.ravel()[n_models:]:
    unused_ax.set_visible(False)

plt.suptitle('Confusion Matrices – All Models', fontsize=14, y=1.01)
plt.tight_layout()
plt.show()

## 10. ROC Curves – All Models

In [None]:
# Overlay the test-set ROC curve of every fitted model on one figure,
# labelling each curve with the (rounded) AUC stored in `all_metrics`,
# plus the diagonal that a random classifier would produce.
plt.figure(figsize=(8, 6))

for name, model in models.items():
    # Probability of the class encoded as 1, as in the evaluation cell.
    y_prob = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    auc_val = all_metrics[name]['AUC']
    curve_label = f'{name} (AUC={auc_val:.4f})'
    plt.plot(fpr, tpr, label=curve_label)

plt.plot([0, 1], [0, 1], 'k--', label='Random Baseline')

plt.title('ROC Curves – All Models')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(fontsize=9, loc='lower right')
plt.tight_layout()
plt.show()

## 11. Summary

All 6 models have been trained, evaluated, and saved. The comparison table and visualisations above provide a clear view of each model's strengths and weaknesses on this dataset.