In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.cluster import KMeans
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import SMOTE

# ------------------------------
# 1. Data Preprocessing
# ------------------------------
# Load the dataset
df = pd.read_csv('/kaggle/input/cricket-predictor/over_features.csv')

# Create engineered features
df['pressure_index'] = df['dot_ball_pressure'] * df['required_desired_run_rate']
df['wicket_pressure'] = df['number_of_wickets_lost'] * df['required_desired_run_rate']
df['late_over_flag'] = (df['over'] > 15).astype(int)
df['bowler_pressure'] = df['current_bowler_economy'] * (df['bowler_wickets_in_match'] + 1)
df['aggressiveness_index'] = df['striker_strike_rate'] * (df['striker_boundaries_hit'] + 1)

# Drop unneeded columns
df = df.drop(columns=['match_id'])

# One-hot encode categorical columns if needed
df = pd.get_dummies(df, columns=['team', 'match_phase'], drop_first=True)

# Separate features and target (target: wicket_next_over)
X = df.drop(columns=['wicket_next_over'])
y = df['wicket_next_over']

# ------------------------------
# 2. Train-Test Split & Scaling
# ------------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ------------------------------
# 2.5 Addressing Class Imbalance with SMOTE
# ------------------------------
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)

print("Class distribution before SMOTE:", np.bincount(y_train))
print("Class distribution after SMOTE:", np.bincount(y_train_balanced))

# ------------------------------
# 3. Define Evaluation Helper Function
# ------------------------------
# This function computes metrics on both training and test data.
def evaluate_model_full(model, model_name, X_train, y_train, X_test, y_test, threshold=0.5):
    # Training predictions
    if hasattr(model, "predict_proba"):
        y_train_prob = model.predict_proba(X_train)[:, 1]
    else:
        y_train_prob = model.predict(X_train)
    y_train_pred = (y_train_prob >= threshold).astype(int)
    train_metrics = {
        'Accuracy': accuracy_score(y_train, y_train_pred),
        'Precision': precision_score(y_train, y_train_pred, zero_division=0),
        'Recall': recall_score(y_train, y_train_pred, zero_division=0),
        'F1 Score': f1_score(y_train, y_train_pred, zero_division=0),
        'ROC-AUC': roc_auc_score(y_train, y_train_prob) if hasattr(model, "predict_proba") else None
    }
    
    # Test predictions
    if hasattr(model, "predict_proba"):
        y_test_prob = model.predict_proba(X_test)[:, 1]
    else:
        y_test_prob = model.predict(X_test)
    y_test_pred = (y_test_prob >= threshold).astype(int)
    test_metrics = {
        'Accuracy': accuracy_score(y_test, y_test_pred),
        'Precision': precision_score(y_test, y_test_pred, zero_division=0),
        'Recall': recall_score(y_test, y_test_pred, zero_division=0),
        'F1 Score': f1_score(y_test, y_test_pred, zero_division=0),
        'ROC-AUC': roc_auc_score(y_test, y_test_prob) if hasattr(model, "predict_proba") else None
    }
    
    print(f"{model_name} Train Metrics: {train_metrics}")
    print(f"{model_name} Test Metrics: {test_metrics}\n")
    return train_metrics, test_metrics

# Dictionaries to store metrics for all models
results_train = {}
results_test = {}

# ------------------------------
# 3. Model Training and Evaluation
# ------------------------------

# 3.1 Linear Regression (as a baseline classifier)
lin_reg = LinearRegression()
lin_reg.fit(X_train_balanced, y_train_balanced)
# For linear regression, we use a threshold of 0.5 on predictions.
train_m, test_m = evaluate_model_full(lin_reg, "Linear Regression", X_train_scaled, y_train, X_test_scaled, y_test)
results_train["Linear Regression"] = train_m
results_test["Linear Regression"] = test_m

# 3.2 Logistic Regression
log_reg = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
log_reg.fit(X_train_balanced, y_train_balanced)
train_m, test_m = evaluate_model_full(log_reg, "Logistic Regression", X_train_scaled, y_train, X_test_scaled, y_test)
results_train["Logistic Regression"] = train_m
results_test["Logistic Regression"] = test_m

# 3.3 K-Nearest Neighbors (KNN)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_balanced, y_train_balanced)
train_m, test_m = evaluate_model_full(knn, "KNN", X_train_scaled, y_train, X_test_scaled, y_test)
results_train["KNN"] = train_m
results_test["KNN"] = test_m

# 3.4 Decision Tree
tree = DecisionTreeClassifier(class_weight='balanced', random_state=42)
tree.fit(X_train_balanced, y_train_balanced)
train_m, test_m = evaluate_model_full(tree, "Decision Tree", X_train_scaled, y_train, X_test_scaled, y_test)
results_train["Decision Tree"] = train_m
results_test["Decision Tree"] = test_m

# 3.6 Random Forest
rf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
rf.fit(X_train_balanced, y_train_balanced)
train_m, test_m = evaluate_model_full(rf, "Random Forest", X_train_scaled, y_train, X_test_scaled, y_test)
results_train["Random Forest"] = train_m
results_test["Random Forest"] = test_m

# 3.7 Ensemble Method: Voting Classifier
voting_clf = VotingClassifier(
    estimators=[('lr', log_reg), ('tree', tree), ('rf', rf)],
    voting='soft'
)
voting_clf.fit(X_train_balanced, y_train_balanced)
train_m, test_m = evaluate_model_full(voting_clf, "Voting Classifier", X_train_scaled, y_train, X_test_scaled, y_test)
results_train["Voting Classifier"] = train_m
results_test["Voting Classifier"] = test_m

# 3.8 K-Means Clustering (Unsupervised)
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
kmeans.fit(X_train_balanced)
clusters_train = kmeans.labels_
# Map clusters to the majority class in the balanced training data
mapping = {}
for cluster in np.unique(clusters_train):
    indices = np.where(clusters_train == cluster)[0]
    majority_class = y_train_balanced.iloc[indices].mode()[0]
    mapping[cluster] = majority_class
clusters_test = kmeans.predict(X_test_scaled)
y_pred_kmeans = np.array([mapping[cluster] for cluster in clusters_test])
# For KMeans we evaluate manually since there's no predict_proba
train_pred_kmeans = kmeans.labels_
# We'll compute train metrics using the mapping on training data:
y_train_pred_kmeans = np.array([mapping[cluster] for cluster in train_pred_kmeans])
train_metrics_kmeans = {
    'Accuracy': accuracy_score(y_train_balanced, y_train_pred_kmeans),
    'Precision': precision_score(y_train_balanced, y_train_pred_kmeans, zero_division=0),
    'Recall': recall_score(y_train_balanced, y_train_pred_kmeans, zero_division=0),
    'F1 Score': f1_score(y_train_balanced, y_train_pred_kmeans, zero_division=0),
    'ROC-AUC': None
}
test_metrics_kmeans = {
    'Accuracy': accuracy_score(y_test, y_pred_kmeans),
    'Precision': precision_score(y_test, y_pred_kmeans, zero_division=0),
    'Recall': recall_score(y_test, y_pred_kmeans, zero_division=0),
    'F1 Score': f1_score(y_test, y_pred_kmeans, zero_division=0),
    'ROC-AUC': None
}
print("KMeans Clustering Train Metrics:", train_metrics_kmeans)
print("KMeans Clustering Test Metrics:", test_metrics_kmeans, "\n")
results_train["KMeans Clustering"] = train_metrics_kmeans
results_test["KMeans Clustering"] = test_metrics_kmeans

# Gaussian Naive Bayes
nb_model = GaussianNB()
nb_model.fit(X_train_balanced, y_train_balanced)
train_m, test_m = evaluate_model_full(nb_model, "Gaussian Naive Bayes", X_train_scaled, y_train, X_test_scaled, y_test)
results_train["Gaussian Naive Bayes"] = train_m
results_test["Gaussian Naive Bayes"] = test_m

# AdaBoost
ada_model = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1, class_weight='balanced', random_state=42),
    n_estimators=50,
    random_state=42
)
ada_model.fit(X_train_balanced, y_train_balanced)
train_m, test_m = evaluate_model_full(ada_model, "AdaBoost", X_train_scaled, y_train, X_test_scaled, y_test)
results_train["AdaBoost"] = train_m
results_test["AdaBoost"] = test_m

# Stacking Classifier
stacking_estimators = [
    ('lr', LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)),
    ('dt', DecisionTreeClassifier(class_weight='balanced', random_state=42)),
    ('rf', RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42))
]
stacking_model = StackingClassifier(
    estimators=stacking_estimators,
    final_estimator=LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
)
stacking_model.fit(X_train_balanced, y_train_balanced)
train_m, test_m = evaluate_model_full(stacking_model, "Stacking Classifier", X_train_scaled, y_train, X_test_scaled, y_test)
results_train["Stacking Classifier"] = train_m
results_test["Stacking Classifier"] = test_m

# (Optional) Hyperparameter tuning for Logistic Regression
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear']
}
grid_log_reg = GridSearchCV(
    LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42),
    param_grid,
    cv=5,
    scoring='f1',
    n_jobs=-1
)
grid_log_reg.fit(X_train_balanced, y_train_balanced)
print("Best parameters for Logistic Regression:", grid_log_reg.best_params_)
print("Best F1 score from GridSearch:", grid_log_reg.best_score_)

best_log_reg = grid_log_reg.best_estimator_
train_m, test_m = evaluate_model_full(best_log_reg, "Logistic Regression (Tuned)", X_train_scaled, y_train, X_test_scaled, y_test)
results_train["Logistic Regression (Tuned)"] = train_m
results_test["Logistic Regression (Tuned)"] = test_m

# Adjust decision threshold for tuned logistic model (e.g., threshold = 0.4)
threshold = 0.4
if hasattr(best_log_reg, "predict_proba"):
    y_prob_best_log = best_log_reg.predict_proba(X_test_scaled)[:, 1]
else:
    y_prob_best_log = best_log_reg.predict(X_test_scaled)
y_pred_thresh = (y_prob_best_log >= threshold).astype(int)
adj_metrics = {
    'Accuracy': accuracy_score(y_test, y_pred_thresh),
    'Precision': precision_score(y_test, y_pred_thresh, zero_division=0),
    'Recall': recall_score(y_test, y_pred_thresh, zero_division=0),
    'F1 Score': f1_score(y_test, y_pred_thresh, zero_division=0),
    'ROC-AUC': roc_auc_score(y_test, y_prob_best_log) if hasattr(best_log_reg, "predict_proba") else None
}
print(f"Logistic Regression (Tuned, Threshold {threshold}) Test Metrics: {adj_metrics}\n")
results_test[f"Logistic Regression (Tuned, Threshold {threshold})"] = adj_metrics

# ------------------------------
# 4. Graphs for Train and Test Evaluation Metrics
# ------------------------------
import matplotlib.pyplot as plt

# Convert the results dictionaries to DataFrames
train_df = pd.DataFrame(results_train).T
test_df = pd.DataFrame(results_test).T

# Define the metrics to plot
metrics_list = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC-AUC']
models = train_df.index.tolist()

for metric in metrics_list:
    plt.figure(figsize=(10, 6))
    indices = np.arange(len(models))
    width = 0.35

Class distribution before SMOTE: [23721 10166]
Class distribution after SMOTE: [23721 10166]
Linear Regression Train Metrics: {'Accuracy': 0.7025112875143861, 'Precision': 0.5457481162540366, 'Recall': 0.049872122762148335, 'F1 Score': 0.09139251915277151, 'ROC-AUC': None}
Linear Regression Test Metrics: {'Accuracy': 0.7008970727101038, 'Precision': 0.5158730158730159, 'Recall': 0.05114083398898505, 'F1 Score': 0.09305654974946313, 'ROC-AUC': None}

Logistic Regression Train Metrics: {'Accuracy': 0.621772361082421, 'Precision': 0.3900273790757488, 'Recall': 0.4624237654928192, 'F1 Score': 0.42315135694675726, 'ROC-AUC': 0.6033725801540555}
Logistic Regression Test Metrics: {'Accuracy': 0.6176817752596789, 'Precision': 0.3865190491696516, 'Recall': 0.466955153422502, 'F1 Score': 0.4229467308034919, 'ROC-AUC': 0.5977768763027347}

KNN Train Metrics: {'Accuracy': 0.7518517425561425, 'Precision': 0.651126784792706, 'Recall': 0.37231949636041706, 'F1 Score': 0.4737467926653733, 'ROC-AUC': 0

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>