# Pre-train All Models & Save as .pkl Files

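This notebook trains six classification models (logistic regression, decision tree, k-nearest neighbours, Gaussian naive Bayes, random forest, and XGBoost) on the Match dataset and saves each one, along with the fitted feature scaler and the target (`Winner`) label encoder, as a `.pkl` file in `../model_pkls/` (relative to this notebook).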

In [None]:
import os
import warnings
import joblib
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

# Silence every warning category for the rest of the session. This also hides
# library warnings (e.g. convergence or deprecation notices) raised while training.
warnings.simplefilter('ignore')

## 1. Load Dataset

In [None]:
# Input CSV and output directory; both are relative to the notebook's working directory.
DATA_PATH = '../Match_dataset.csv'
SAVE_DIR  = '../model_pkls'

# Read the raw match table, report its (rows, columns), and preview the first rows.
df = pd.read_csv(DATA_PATH)
print(f'Shape: {df.shape}')
df.head(n=5)

## 2. Feature Engineering & Preprocessing

In [None]:
# --- Derived match-up features: Team A value minus Team B value ---
df['Ranking_Diff'] = df['Team_A_Ranking'] - df['Team_B_Ranking']
df['Form_Diff'] = df['Team_A_Form'] - df['Team_B_Form']
df['Tech_Diff'] = df['Team_A_Tech_Index'] - df['Team_B_Tech_Index']
df['H2H_Diff'] = df['HeadToHead_A_Wins'] - df['HeadToHead_B_Wins']

# --- Toss indicators: 1 when the condition holds, 0 otherwise ---
df['Team_A_Won_Toss'] = (df['Toss_Winner'] == 'Team_A').astype(int)
df['Toss_Bat'] = (df['Toss_Decision'] == 'Bat').astype(int)

# --- Integer-encode the categorical feature columns ---
le_pitch = LabelEncoder()
df['Pitch_Type_Enc'] = le_pitch.fit_transform(df['Pitch_Type'])
le_stage = LabelEncoder()
df['Stage_Enc'] = le_stage.fit_transform(df['Stage'])

# --- Integer-encode the target; le_target maps codes back to Winner labels ---
le_target = LabelEncoder()
df['Winner_Enc'] = le_target.fit_transform(df['Winner'])

# Column order here is the order the scaler and every model are fitted on.
# NOTE(review): 'Match_Total' looks like it may only be known after a match has
# been played -- confirm it is available at prediction time.
feature_cols = [
    'Team_A_Ranking', 'Team_B_Ranking', 'Team_A_Form', 'Team_B_Form',
    'HeadToHead_A_Wins', 'HeadToHead_B_Wins', 'Venue_HomeAdvantage_A',
    'Venue_HomeAdvantage_B', 'Avg_T20_Score_Venue', 'Team_A_Tech_Index',
    'Team_B_Tech_Index', 'Match_Total', 'Ranking_Diff', 'Form_Diff',
    'Tech_Diff', 'H2H_Diff', 'Team_A_Won_Toss', 'Toss_Bat',
    'Pitch_Type_Enc', 'Stage_Enc'
]

X = df[feature_cols]
y = df['Winner_Enc']

# Split BEFORE scaling: the scaler must learn its mean/variance from training
# rows only, otherwise held-out rows influence preprocessing and the test
# metrics are optimistically biased.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # fit on the training split only
X_test = scaler.transform(X_test_raw)        # reuse the training statistics

# Whole dataset transformed with the train-fitted scaler. Not used for
# training; kept so the name X_scaled remains defined.
X_scaled = scaler.transform(X)

print(f'Train: {X_train.shape[0]},  Test: {X_test.shape[0]}')

## 3. Save Scaler & Label Encoder

In [None]:
# Create the output directory if it does not exist yet.
os.makedirs(SAVE_DIR, exist_ok=True)

scaler_path = os.path.join(SAVE_DIR, 'scaler.pkl')
le_path = os.path.join(SAVE_DIR, 'label_encoder.pkl')

# Persist the fitted feature scaler and the target label encoder.
# NOTE(review): the Pitch_Type / Stage encoders (le_pitch, le_stage) are not
# written here -- confirm the code that loads these files does not need them.
for artifact_label, artifact, artifact_path in (
    ('scaler', scaler, scaler_path),
    ('label encoder', le_target, le_path),
):
    joblib.dump(artifact, artifact_path)
    print(f'Saved {artifact_label} -> {artifact_path}')

## 4. Define Models

In [None]:
# Registry of untrained estimators. Each key doubles as the file stem used
# when the fitted model is written to disk ('<key>.pkl').
# NOTE(review): `use_label_encoder` is deprecated in newer XGBoost releases --
# confirm it is still accepted by the installed version.
MODELS = dict(
    logistic_regression=LogisticRegression(max_iter=1000, random_state=42),
    decision_tree=DecisionTreeClassifier(max_depth=10, random_state=42),
    knn=KNeighborsClassifier(n_neighbors=7),
    naive_bayes=GaussianNB(),
    random_forest=RandomForestClassifier(
        n_estimators=200,
        max_depth=15,
        random_state=42,
    ),
    xgboost=XGBClassifier(
        n_estimators=200,
        max_depth=6,
        learning_rate=0.1,
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42,
    ),
)

## 5. Train & Save All Models

In [None]:
# Fit every registered estimator on the training split, score it on the
# held-out split, and write the fitted model to SAVE_DIR as '<name>.pkl'.
for name, model in MODELS.items():
    print(f'Training {name} ...')
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    # predict_proba returns one column per class. Taking column 1 is only
    # valid for a two-class target, so pick the AUC form from the column count
    # instead of assuming binary labels.
    class_probs = model.predict_proba(X_test)
    if class_probs.shape[1] == 2:
        y_prob = class_probs[:, 1]
        auc = roc_auc_score(y_test, y_prob)
    else:
        y_prob = class_probs
        # One-vs-rest AUC; `labels` pins the column order to the fitted classes.
        auc = roc_auc_score(
            y_test, y_prob, multi_class='ovr', labels=model.classes_
        )

    model_path = os.path.join(SAVE_DIR, f'{name}.pkl')
    joblib.dump(model, model_path)
    print(f'  Accuracy: {acc:.4f}  |  AUC: {auc:.4f}  |  Saved -> {model_path}')

print(f'\nAll {len(MODELS)} models saved to {SAVE_DIR}/')