# K-Nearest Neighbors (KNN) Classifier – Cricket Match Winner Prediction

This notebook implements a **KNN Classifier** on the Match dataset to predict the winner (Team_A or Team_B) of a cricket match.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib, os, warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score,
    recall_score, f1_score, matthews_corrcoef,
    confusion_matrix, classification_report, roc_curve
)

## 1. Load & Explore the Dataset

In [None]:
# Read the match records from the CSV located one directory above the notebook.
df = pd.read_csv(filepath_or_buffer='../Match_dataset.csv')

# Report (rows, columns), then leave the first five rows as the cell's rich output.
print(f'Shape: {df.shape}')
df.head(n=5)

In [None]:
# Per-class counts of the target column, and the total number of missing
# cells summed over every column of the frame.
winner_counts = df['Winner'].value_counts()
total_missing = df.isnull().sum().sum()

print('Target Distribution:')
print(winner_counts)
print(f'\nMissing values:\n{total_missing}')

## 2. Feature Engineering & Preprocessing

In [None]:
# --- Derived features -------------------------------------------------------
# Pairwise differences (Team A minus Team B) for ranking, form, tech index and
# head-to-head wins.
df['Ranking_Diff'] = df['Team_A_Ranking'] - df['Team_B_Ranking']
df['Form_Diff'] = df['Team_A_Form'] - df['Team_B_Form']
df['Tech_Diff'] = df['Team_A_Tech_Index'] - df['Team_B_Tech_Index']
df['H2H_Diff'] = df['HeadToHead_A_Wins'] - df['HeadToHead_B_Wins']

# Binary toss indicators: 1 when Team_A won the toss / when the decision was 'Bat'.
df['Team_A_Won_Toss'] = (df['Toss_Winner'] == 'Team_A').astype(int)
df['Toss_Bat'] = (df['Toss_Decision'] == 'Bat').astype(int)

# --- Categorical encoding ---------------------------------------------------
# Integer-encode the categorical columns. The fitted encoders are kept as
# separate objects so the mapping can be inverted or reused later.
# NOTE(review): LabelEncoder assigns arbitrary integer codes; for a
# distance-based model this imposes an ordering on Pitch_Type / Stage.
# Consider one-hot encoding if these are nominal -- confirm with the data owner.
le_pitch = LabelEncoder()
df['Pitch_Type_Enc'] = le_pitch.fit_transform(df['Pitch_Type'])
le_stage = LabelEncoder()
df['Stage_Enc'] = le_stage.fit_transform(df['Stage'])

# Target encoder; le_target.classes_ is reused for plot/report labels later.
le_target = LabelEncoder()
df['Winner_Enc'] = le_target.fit_transform(df['Winner'])

# --- Feature matrix / target ------------------------------------------------
# NOTE(review): confirm that 'Match_Total' is known before the match is played;
# if it is a post-match quantity it would leak the outcome into the features.
feature_cols = [
    'Team_A_Ranking', 'Team_B_Ranking', 'Team_A_Form', 'Team_B_Form',
    'HeadToHead_A_Wins', 'HeadToHead_B_Wins', 'Venue_HomeAdvantage_A',
    'Venue_HomeAdvantage_B', 'Avg_T20_Score_Venue', 'Team_A_Tech_Index',
    'Team_B_Tech_Index', 'Match_Total', 'Ranking_Diff', 'Form_Diff',
    'Tech_Diff', 'H2H_Diff', 'Team_A_Won_Toss', 'Toss_Bat',
    'Pitch_Type_Enc', 'Stage_Enc'
]

X = df[feature_cols]
y = df['Winner_Enc']

# --- Split first, then scale ------------------------------------------------
# The split is done on the unscaled features so that the scaler's mean and
# variance are estimated from the training rows only. Fitting the scaler on
# the full dataset would leak test-set statistics into training. The same
# random_state / stratify settings keep the row partition unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so that any cell still referring to X_scaled continues to work; it is
# the full feature matrix transformed with the train-fitted scaler.
X_scaled = scaler.transform(X)

print(f'Train: {X_train.shape[0]},  Test: {X_test.shape[0]}')

## 3. Choose Optimal K

In [None]:
# Select K with 5-fold cross-validation on the training split only.
# Scoring candidates on X_test would tune the hyperparameter on the held-out
# data and make every metric reported afterwards optimistically biased.
k_range = range(1, 21)
scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    # cross_val_score fits clones of `knn` on each training fold and returns
    # one accuracy per validation fold; the mean is the score for this K.
    fold_accuracy = cross_val_score(knn, X_train, y_train, cv=5, scoring='accuracy')
    scores.append(fold_accuracy.mean())

plt.figure(figsize=(8, 4))
plt.plot(k_range, scores, marker='o')
plt.xlabel('K')
plt.ylabel('CV Accuracy')
plt.title('KNN – Cross-validated Accuracy vs K')
plt.xticks(k_range)
plt.grid(True)
plt.tight_layout()
plt.show()

# np.argmax returns the first maximum, so ties resolve to the smallest K.
best_k = list(k_range)[np.argmax(scores)]
print(f'Best K = {best_k} with CV accuracy = {max(scores):.4f}')

## 4. Train KNN Classifier

In [None]:
# Fit the final classifier with the K selected in the "Choose Optimal K"
# section (best_k) instead of a hard-coded value, so the trained model always
# matches the reported best K when the data or the search range changes.
model = KNeighborsClassifier(n_neighbors=best_k)
model.fit(X_train, y_train)
print('Model training complete.')

## 5. Evaluation Metrics

In [None]:
# Hard class predictions, plus the predicted probability of the class encoded
# as 1 (second column of predict_proba), used as the score for ROC/AUC.
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# Each metric is computed and reported in turn; the variable names are kept
# because later cells read them (e.g. `auc` in the ROC plot).
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy  : {accuracy:.4f}')

auc = roc_auc_score(y_test, y_prob)
print(f'AUC Score : {auc:.4f}')

precision = precision_score(y_test, y_pred)
print(f'Precision : {precision:.4f}')

recall = recall_score(y_test, y_pred)
print(f'Recall    : {recall:.4f}')

f1 = f1_score(y_test, y_pred)
print(f'F1 Score  : {f1:.4f}')

mcc = matthews_corrcoef(y_test, y_pred)
print(f'MCC Score : {mcc:.4f}')

## 6. Confusion Matrix & Classification Report

In [None]:
# Confusion matrix of true vs. predicted labels, drawn as an annotated heatmap
# whose tick labels are the original (un-encoded) class names.
cm = confusion_matrix(y_test, y_pred)
class_names = le_target.classes_

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Oranges',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix – KNN')
fig.tight_layout()
plt.show()

In [None]:
# Per-class precision / recall / F1 table, labelled with the original class names.
report_text = classification_report(
    y_test,
    y_pred,
    target_names=le_target.classes_,
)
print(report_text)

## 7. ROC Curve

In [None]:
# ROC curve from the positive-class probabilities; the thresholds returned by
# roc_curve are not needed. The dashed diagonal marks a no-skill classifier.
fpr, tpr, _ = roc_curve(y_test, y_prob)

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(fpr, tpr, label=f'AUC = {auc:.4f}')
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve – KNN')
ax.legend()
fig.tight_layout()
plt.show()

## 8. Save Model

In [None]:
# Persist the trained classifier (same path and format as before).
os.makedirs('../model_pkls', exist_ok=True)
joblib.dump(model, '../model_pkls/knn.pkl')
print('Model saved to ../model_pkls/knn.pkl')

# The classifier was trained on scaled, integer-encoded features, so it cannot
# score raw match rows on its own. Save the fitted preprocessing objects and
# the expected feature order next to it so inference can reproduce the exact
# training-time transformation.
preprocessing_artifacts = {
    'scaler': scaler,
    'le_pitch': le_pitch,
    'le_stage': le_stage,
    'le_target': le_target,
    'feature_cols': feature_cols,
}
joblib.dump(preprocessing_artifacts, '../model_pkls/knn_preprocessing.pkl')
print('Preprocessing artifacts saved to ../model_pkls/knn_preprocessing.pkl')