In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix
)
from scipy.stats import spearmanr
import os


In [6]:
# ===============================================================
# COACH CHANGE PREDICTION ‚Äî DATA PREPARATION
# ===============================================================


# ===============================================================
# Load Data
# ===============================================================
coaches = pd.read_csv("data/coaches.csv")
teams = pd.read_csv("data/teams.csv")

# ===============================================================
# Basic Cleaning and Derived Metrics
# ===============================================================
# Compute win ratio
coaches["win_ratio"] = coaches["won"] / (coaches["won"] + coaches["lost"])
coaches = coaches.sort_values(["tmID", "year"]).reset_index(drop=True)


# ===============================================================
# Add info about next year's coach (target variable)
# ===============================================================
coaches["next_coachID"] = coaches.groupby("tmID")["coachID"].shift(-1)
coaches["coach_changed_next_year"] = (coaches["coachID"] != coaches["next_coachID"]).astype(int)

# ===============================================================
# Add team context (ranking and playoff info)
# ===============================================================
teams_subset = teams[["year", "tmID", "rank", "playoff", "won", "lost"]]
coach_teams = pd.merge(coaches, teams_subset, on=["year", "tmID"], how="left")

# ===============================================================
# Compute change vs previous year
# ===============================================================
coach_teams = coach_teams.sort_values(["tmID", "year"]).reset_index(drop=True)

coach_teams["prev_win_ratio"] = coach_teams.groupby("tmID")["win_ratio"].shift(1)
coach_teams["prev_rank"] = coach_teams.groupby("tmID")["rank"].shift(1)
coach_teams["prev_playoff"] = coach_teams.groupby("tmID")["playoff"].shift(1)

# Performance change features
coach_teams["win_ratio_change"] = coach_teams["win_ratio"] - coach_teams["prev_win_ratio"]
coach_teams["rank_change"] = coach_teams["prev_rank"] - coach_teams["rank"]  # positive = improved rank
coach_teams["playoff_miss"] = ((coach_teams["prev_playoff"] == "Y") & (coach_teams["playoff"] == "N")).astype(int)

# ===============================================================
# Drop invalid / incomplete rows
# ===============================================================
coach_teams = coach_teams.dropna(subset=["win_ratio", "prev_win_ratio"])


# ===============================================================
# üß© Add richer context features
# ===============================================================

# 1Ô∏è‚É£ 3-year rolling average of win ratio (performance trend)
coach_teams["rolling_win_ratio_3yr"] = (
    coach_teams.groupby("tmID")["win_ratio"]
    .rolling(window=3, min_periods=1)
    .mean()
    .reset_index(level=0, drop=True)
)

# 2Ô∏è‚É£ Playoff miss streak
coach_teams["made_playoffs"] = (coach_teams["playoff"] == "Y").astype(int)

def playoff_streak(series):
    streaks = []
    streak = 0
    for made in series:
        if made == 0:
            streak += 1
        else:
            streak = 0
        streaks.append(streak)
    return streaks

coach_teams["playoff_miss_streak"] = coach_teams.groupby("tmID")["made_playoffs"].transform(playoff_streak)

# 3Ô∏è‚É£ Years with team (coach tenure)
coach_teams = coach_teams.sort_values(["coachID", "tmID", "year"])
coach_teams["years_with_team"] = (
    coach_teams.groupby(["coachID", "tmID"]).cumcount() + 1
)

# 4Ô∏è‚É£ Expectation gap (current vs. recent average)
coach_teams["expectation_gap"] = coach_teams["win_ratio"] - coach_teams["rolling_win_ratio_3yr"]

# 5Ô∏è‚É£ Team stability index (how often the team fires coaches)
coach_teams["team_stability_index"] = (
    coach_teams.groupby("tmID")["coach_changed_next_year"]
    .rolling(3, min_periods=1)
    .mean()
    .reset_index(level=0, drop=True)
)

# ===============================================================
# Keep useful columns only
# ===============================================================
final_cols = [
    "year", "tmID", "coachID", "win_ratio", "prev_win_ratio", "win_ratio_change",
    "rank", "prev_rank", "rank_change", "playoff", "prev_playoff", "playoff_miss",
    "rolling_win_ratio_3yr", "playoff_miss_streak", "years_with_team", "expectation_gap", 
    "team_stability_index", "coach_changed_next_year"
]
coach_model_df = coach_teams[final_cols]

# ===============================================================
# 8Ô∏è‚É£ Save prepared dataset
# ===============================================================
os.makedirs("data_models", exist_ok=True)
coach_model_df.to_csv("data_models/coach_firing_dataset.csv", index=False)

print("‚úÖ Dataset ready: data_models/coach_firing_dataset.csv")
print(f"Shape: {coach_model_df.shape}")
print(coach_model_df.head(10))


‚úÖ Dataset ready: data_models/coach_firing_dataset.csv
Shape: (142, 18)
     year tmID     coachID  win_ratio  prev_win_ratio  win_ratio_change  rank  \
154     5  WAS  adamsmi01w   0.500000        0.264706          0.235294     4   
85      2  NYL  adubari99w   0.656250        0.625000          0.031250     2   
86      3  NYL  adubari99w   0.562500        0.656250         -0.093750     1   
87      4  NYL  adubari99w   0.470588        0.562500         -0.091912     6   
88      5  NYL  adubari99w   0.437500        0.470588         -0.033088     2   
155     6  WAS  adubari99w   0.470588        0.500000         -0.029412     5   
156     7  WAS  adubari99w   0.529412        0.470588          0.058824     4   
157     8  WAS  adubari99w   0.000000        0.529412         -0.529412     5   
73      2  MIN  aglerbr99w   0.375000        0.468750         -0.093750     6   
74      3  MIN  aglerbr99w   0.315789        0.375000         -0.059211     8   

     prev_rank  rank_change playoff