
# Fantasy Hockey Classification — Starter Notebook

This notebook builds two **classification** models using your multi-year fantasy hockey data:

1. **Skaters:** Predict whether a player will score **≥ 20 goals** (`is_20_goals`).
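2. **Goalies:** Predict whether a goalie will have a **save% at or above the seasonal median** (`is_above_median_sv`), considering only goalies with at least 10 games played when a games-played column is available.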

We'll cover: loading, cleaning (including **deduplication**), EDA, feature engineering, model training, evaluation, and insights.


In [None]:

import os
from pathlib import Path

import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import plotly.express as px

# Modeling
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
                             ConfusionMatrixDisplay, RocCurveDisplay)
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC

# Notebook-wide pandas display settings: show up to 120 columns and allow
# 120 characters of console width before wrapping, so wide stat tables stay readable.
pd.set_option('display.max_columns', 120)
pd.set_option('display.width', 120)


## Data paths

In [None]:

# Root folder that holds the CSV exports. Defaults to the original /mnt/data
# location; set the FANTASY_HOCKEY_DATA_DIR environment variable to point the
# notebook at another folder without editing this cell.
DATA_DIR = Path(os.environ.get("FANTASY_HOCKEY_DATA_DIR", "/mnt/data"))

# Paths are kept as plain strings, exactly as before.
# NOTE: the last entry of each group is an optional pre-merged file. Every path
# that exists is loaded, so if the per-season files AND the merged file are all
# present the same rows are read twice and must be deduplicated downstream.
DATA_PATHS = {
    "skaters": [
        str(DATA_DIR / file_name)
        for file_name in (
            "22_23_player_stats.csv",
            "23_24_player_stats.csv",
            "24_25_player_stats.csv",
            "skaters_combined_3yr.csv",  # optional merged
        )
    ],
    "goalies": [
        str(DATA_DIR / file_name)
        for file_name in (
            "22_23_goalie_stats.csv",
            "23_24_goalie_stats.csv",
            "24_25_goalie_stats.csv",
            "goalies_combined_3yr.csv",  # optional merged
        )
    ],
}

# Verify available files
for group, paths in DATA_PATHS.items():
    print(f"\n{group.upper()}")
    for p in paths:
        print("  exists:", os.path.exists(p), p)


## Load & combine

In [None]:

def load_concat(paths):
    """Read every CSV in ``paths`` that exists on disk and stack them into one frame.

    Each loaded frame gets a ``source_file`` column holding the file's base name.
    Paths that do not exist are skipped silently.

    Parameters
    ----------
    paths : iterable of str or path-like
        Candidate CSV locations.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all loaded files with a fresh 0..n-1 index;
        columns are the union of the inputs, in order of first appearance.

    Raises
    ------
    FileNotFoundError
        If none of the given paths exist.
    """
    loaded = [
        pd.read_csv(csv_path).assign(source_file=Path(csv_path).name)
        for csv_path in paths
        if os.path.exists(csv_path)
    ]
    if len(loaded) == 0:
        raise FileNotFoundError("No data files found. Check DATA_PATHS above.")
    return pd.concat(loaded, ignore_index=True, sort=False)

# Load both player groups (skaters first, then goalies) from the configured paths.
skaters_raw, goalies_raw = (
    load_concat(DATA_PATHS[group_key]) for group_key in ("skaters", "goalies")
)

for label, frame_shape in (("Skaters shape:", skaters_raw.shape), ("Goalies shape:", goalies_raw.shape)):
    print(label, frame_shape)

# Preview the first few skater rows as the cell output
skaters_raw.head(3)


## Basic cleaning & deduplication

In [None]:

def _apply_to_strings(series, func):
    """Apply ``func`` to the ``str`` values of ``series``; leave everything else (NaN, numbers) as is."""
    return series.map(lambda value: func(value) if isinstance(value, str) else value)


def clean_standardize(df, id_cols_guess=("Player","player","Name","name"), team_cols=("Team","team"), pos_cols=("Pos","Position","position"), ignore_cols=("source_file",)):
    """Trim text, normalise team/position labels and drop duplicate rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw stats table. It is copied; the caller's frame is not modified.
    id_cols_guess : sequence of str
        Candidate player-identifier column names; the first one present is used
        for the per-season dedup.
    team_cols : sequence of str
        Column names whose text values are upper-cased when present.
    pos_cols : sequence of str
        Column names whose text values are upper-cased and have "C/LW" / "LW/C"
        rewritten to "C-LW" / "LW-C" when present.
    ignore_cols : sequence of str
        Bookkeeping columns left out of the exact-duplicate comparison. The
        default skips ``source_file``, so the same row loaded from two different
        files counts as a duplicate. Pass ``()`` to compare every column.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy. Missing values stay missing (they are not turned into the
        strings "nan" / "None").
    """
    df = df.copy()

    # Trim strings. Only real str values are touched: casting the whole column
    # with astype(str) would turn NaN into the literal text "nan".
    for col in df.columns:
        col_dtype = df[col].dtype
        if pd.api.types.is_object_dtype(col_dtype) or pd.api.types.is_string_dtype(col_dtype):
            df[col] = _apply_to_strings(df[col], str.strip)

    # Standardize team & position if present
    for t in team_cols:
        if t in df.columns:
            df[t] = _apply_to_strings(df[t], str.upper)

    for p in pos_cols:
        if p in df.columns:
            # Plain (non-regex) replacement of the two slash-separated combos
            df[p] = _apply_to_strings(
                df[p],
                lambda text: text.upper().replace("C/LW", "C-LW").replace("LW/C", "LW-C"),
            )

    # Remove exact duplicate rows, ignoring bookkeeping columns such as source_file
    compare_cols = [c for c in df.columns if c not in ignore_cols]
    before = len(df)
    df = df.drop_duplicates(subset=compare_cols or None)
    after = len(df)
    print(f"Exact duplicate rows removed: {before - after}")

    # If we have an obvious ID + season, dedup by that.
    # Any column whose name contains "season", or is exactly "year", is part of the key.
    # NOTE(review): this keeps only the first row per player-season; if the data has
    # several legitimate rows per season (e.g. one per team after a trade), confirm
    # this is the desired behaviour.
    season_cols = [c for c in df.columns if "season" in str(c).lower() or str(c).lower() == "year"]
    id_col = next((c for c in id_cols_guess if c in df.columns), None)

    if id_col is not None and season_cols:
        key_cols = [id_col] + season_cols
        before = len(df)
        df = df.sort_index().drop_duplicates(subset=key_cols, keep="first")
        after = len(df)
        print(f"Per-ID-per-season duplicates removed: {before - after}")

    return df

# Clean both raw tables (skaters first, then goalies, so the printed dedup
# counts appear in that order) and preview the cleaned skater rows.
skaters, goalies = clean_standardize(skaters_raw), clean_standardize(goalies_raw)
skaters.head(3)


## Quick EDA

In [None]:

# Summary statistics (first 20 columns of each table, one row per column)
display(skaters.describe(include='all').T.head(20))
display(goalies.describe(include='all').T.head(20))

# If goal columns exist, visualize distribution
for gcol in ["G","Goals","goals"]:
    if gcol in skaters.columns:
        fig = px.histogram(skaters, x=gcol, nbins=30, title=f"Skater {gcol} distribution")
        fig.show()
        break

# Strongest absolute correlations among the first 10 numeric skater columns.
# Only the upper triangle (k=1) is kept so that self-correlations (always 1.0)
# and mirrored pairs are excluded.
numeric_cols = skaters.select_dtypes(include=np.number).columns[:10]
if len(numeric_cols) >= 2:
    corr_abs = skaters[numeric_cols].corr(numeric_only=True).abs()
    upper_triangle = np.triu(np.ones(corr_abs.shape, dtype=bool), k=1)
    display(corr_abs.where(upper_triangle).stack().nlargest(5))

# Save EDA figures (Matplotlib example)
# savefig does not create missing folders, so make sure the target exists first.
FIGURES_DIR = Path("../figures")
FIGURES_DIR.mkdir(parents=True, exist_ok=True)

plt.figure()
plt.title("Placeholder Figure")
plt.savefig(FIGURES_DIR / "placeholder.png", bbox_inches="tight")
plt.close()


## Skater classification target: `is_20_goals`

In [None]:

# Pick the first goals column name that the skater table actually has.
GOAL_COL = next(filter(skaters.columns.__contains__, ["G","Goals","goals"]), None)
if GOAL_COL is None:
    raise ValueError("Could not find goals column in skater data (expected one of G/Goals/goals).")

# Binary label: 1 when the goals value is at least 20, otherwise 0
# (a missing goals value compares False and is therefore labelled 0).
skaters = skaters.assign(is_20_goals=skaters[GOAL_COL].ge(20).astype(int))

print(skaters["is_20_goals"].value_counts())
skaters[["is_20_goals", GOAL_COL]].head()


### Select feature columns for skater model

In [None]:

# Heuristic: choose common numeric performance columns if present.
#
# The label `is_20_goals` is computed directly from GOAL_COL, so the goals column
# itself must never be a feature (the models would just read the answer back).
# NOTE(review): "P" and "SH%" are excluded too on the assumption that P = goals +
# assists and SH% = goals / shots, which would let a model reconstruct goals from
# the remaining columns - confirm against the data dictionary.
leaky_skater_cols = {GOAL_COL, "G", "Goals", "goals", "P", "SH%"}
candidate_num = [
    c for c in ["G","A","P","S","SOG","TOI","TOI/S","CF","xG","iCF","iHDCF","PP TOI","SH TOI","Hits","Blocks","PIM","FO%","SH%"]
    if c in skaters.columns
    and c not in leaky_skater_cols
    # columns stored as text cannot be scaled
    and pd.api.types.is_numeric_dtype(skaters[c])
]
candidate_cat = [c for c in ["Team","Pos","Position","playerTeam","TeamName"] if c in skaters.columns]

features_skaters_num = candidate_num
features_skaters_cat = candidate_cat

print("Numeric features:", features_skaters_num)
print("Categorical features:", features_skaters_cat)

if not (features_skaters_num or features_skaters_cat):
    raise ValueError("No usable skater feature columns found after excluding target-derived columns.")

# Rows without a goals value have no trustworthy label, so drop them
model_df = skaters.dropna(subset=[GOAL_COL]).copy()
X_num = features_skaters_num
X_cat = features_skaters_cat
y = model_df["is_20_goals"]

# LogReg / KNN / SVC cannot handle NaN, so impute before scaling / encoding.
numeric_prep = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler()),
])
categorical_prep = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="constant", fill_value="missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])
preprocess = ColumnTransformer([
    ("num", numeric_prep, X_num),
    ("cat", categorical_prep, X_cat)
], remainder="drop")

models = {
    "LogReg": LogisticRegression(max_iter=200),
    "KNN": KNeighborsClassifier(n_neighbors=7),
    # probability=True fits an internal cross-validated calibration; seed it for reproducibility
    "SVM": SVC(probability=True, kernel="rbf", random_state=42),
    "RF": RandomForestClassifier(n_estimators=300, random_state=42)
}

X = model_df[X_num + X_cat]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

results_skaters = []
for name, clf in models.items():
    pipe = Pipeline(steps=[("prep", preprocess), ("clf", clf)])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    y_proba = pipe.predict_proba(X_test)[:,1] if hasattr(pipe, "predict_proba") else None

    metrics = {
        "model": name,
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, zero_division=0),
        "recall": recall_score(y_test, y_pred, zero_division=0),
        "f1": f1_score(y_test, y_pred, zero_division=0),
        # stays NaN when ROC AUC is undefined (no probabilities or a single class in the test split)
        "roc_auc": np.nan,
    }
    if y_proba is not None and len(np.unique(y_test)) == 2:
        try:
            metrics["roc_auc"] = roc_auc_score(y_test, y_proba)
        except ValueError:
            pass
    results_skaters.append(metrics)

pd.DataFrame(results_skaters).sort_values("f1", ascending=False)


In [None]:

# Rebuild and refit the pipeline for whichever model scored the highest F1 above.
best_name = max(results_skaters, key=lambda row: row["f1"])["model"]
best_pipe = Pipeline([("prep", preprocess), ("clf", models[best_name])])
best_pipe.fit(X_train, y_train)
y_pred = best_pipe.predict(X_test)

# Confusion matrix on the held-out split
disp = ConfusionMatrixDisplay.from_estimator(best_pipe, X_test, y_test)
plt.title(f"Skaters: Confusion Matrix ({best_name})")
plt.show()

# ROC curve, only when the fitted pipeline exposes class probabilities
if hasattr(best_pipe, "predict_proba"):
    RocCurveDisplay.from_estimator(best_pipe, X_test, y_test)
    plt.title(f"Skaters: ROC Curve ({best_name})")
    plt.show()


## Goalie classification target: `is_above_median_sv`

In [None]:

# Goalie-seasons with fewer games than this are dropped before computing medians
MIN_GOALIE_GP = 10

# Try to find save percentage and games played columns
SV_COL = next((c for c in ["SV%", "Sv%", "SVPCT", "Save%", "save_percentage", "sv%"] if c in goalies.columns), None)
GP_COL = next((c for c in ["GP","Games","games_played"] if c in goalies.columns), None)

# Work on a copy so this cell never mutates the cleaned `goalies` table
gdf = goalies.copy()

if SV_COL is None:
    # try to compute SV% if Saves and ShotsAgainst exist
    if all(c in gdf.columns for c in ["Saves","ShotsAgainst"]):
        # zero shots against would divide by zero; leave those rows as NaN
        shots_faced = gdf["ShotsAgainst"].where(gdf["ShotsAgainst"] > 0)
        gdf["SV_computed"] = gdf["Saves"] / shots_faced
        SV_COL = "SV_computed"
    else:
        raise ValueError("Could not find save% column in goalie data (expected SV%/variants or Saves & ShotsAgainst).")

# Filter out small samples if GP available
if GP_COL is not None:
    gdf = gdf[gdf[GP_COL] >= MIN_GOALIE_GP].copy()

# A missing save% cannot be labelled; without this it would silently become class 0
gdf = gdf.dropna(subset=[SV_COL]).copy()

# The target is defined relative to the *seasonal* median, so compare each row
# against the median of its own season when a season/year column is available.
SEASON_COL_G = next((c for c in gdf.columns if str(c).lower() in {"season", "year"}), None)
if SEASON_COL_G is not None:
    median_sv = gdf.groupby(SEASON_COL_G)[SV_COL].median()
    # rows with a missing season fall back to the overall median
    sv_threshold = gdf[SEASON_COL_G].map(median_sv).fillna(gdf[SV_COL].median())
else:
    median_sv = gdf[SV_COL].median()
    sv_threshold = median_sv

gdf["is_above_median_sv"] = (gdf[SV_COL] >= sv_threshold).astype(int)
print("Median SV:", median_sv)
print(gdf["is_above_median_sv"].value_counts())
gdf[[SV_COL, "is_above_median_sv"]].head()


In [None]:

# The label `is_above_median_sv` is computed from SV_COL, so save% (under any of
# its column names) must not be a feature - the models would just read the answer.
# "Saves" is excluded too because Saves / ShotsAgainst reproduces save% exactly.
# NOTE(review): GAA and GSAA are presumably also closely tied to save%; consider
# removing them if the goal is to predict from workload/usage only.
leaky_goalie_cols = {SV_COL, "SV%", "Sv%", "SVPCT", "Save%", "save_percentage", "sv%", "SV_computed", "Saves"}
candidate_num_g = [
    c for c in ["SV%","SVPCT","GAA","GSAA","QS%","RBS","SO","Wins","Losses","ShotsAgainst","Saves","TOI"]
    if c in gdf.columns
    and c not in leaky_goalie_cols
    # columns stored as text cannot be scaled
    and pd.api.types.is_numeric_dtype(gdf[c])
]
candidate_cat_g = [c for c in ["Team","team"] if c in gdf.columns]

features_goalies_num = candidate_num_g
features_goalies_cat = candidate_cat_g
print("Numeric features (goalies):", features_goalies_num)
print("Categorical features (goalies):", features_goalies_cat)

if not (features_goalies_num or features_goalies_cat):
    raise ValueError("No usable goalie feature columns found after excluding target-derived columns.")

y_g = gdf["is_above_median_sv"]
X_g = gdf[features_goalies_num + features_goalies_cat]

# LogReg / KNN / SVC cannot handle NaN, so impute before scaling / encoding.
numeric_prep_g = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler()),
])
categorical_prep_g = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="constant", fill_value="missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])
preprocess_g = ColumnTransformer([
    ("num", numeric_prep_g, features_goalies_num),
    ("cat", categorical_prep_g, features_goalies_cat)
])

models_g = {
    "LogReg": LogisticRegression(max_iter=200),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    # probability=True fits an internal cross-validated calibration; seed it for reproducibility
    "SVM": SVC(probability=True, kernel="rbf", random_state=42),
    "RF": RandomForestClassifier(n_estimators=300, random_state=42)
}

Xg_train, Xg_test, yg_train, yg_test = train_test_split(X_g, y_g, test_size=0.25, random_state=42, stratify=y_g)

results_goalies = []
for name, clf in models_g.items():
    pipe = Pipeline(steps=[("prep", preprocess_g), ("clf", clf)])
    pipe.fit(Xg_train, yg_train)
    yg_pred = pipe.predict(Xg_test)
    yg_proba = pipe.predict_proba(Xg_test)[:,1] if hasattr(pipe, "predict_proba") else None

    metrics = {
        "model": name,
        "accuracy": accuracy_score(yg_test, yg_pred),
        "precision": precision_score(yg_test, yg_pred, zero_division=0),
        "recall": recall_score(yg_test, yg_pred, zero_division=0),
        "f1": f1_score(yg_test, yg_pred, zero_division=0),
        # stays NaN when ROC AUC is undefined (no probabilities or a single class in the test split)
        "roc_auc": np.nan,
    }
    if yg_proba is not None and len(np.unique(yg_test)) == 2:
        try:
            metrics["roc_auc"] = roc_auc_score(yg_test, yg_proba)
        except ValueError:
            pass
    results_goalies.append(metrics)

pd.DataFrame(results_goalies).sort_values("f1", ascending=False)


In [None]:

# Rebuild and refit the pipeline for whichever goalie model scored the highest F1 above.
best_name_g = max(results_goalies, key=lambda row: row["f1"])["model"]
best_pipe_g = Pipeline([("prep", preprocess_g), ("clf", models_g[best_name_g])])
best_pipe_g.fit(Xg_train, yg_train)

# Confusion matrix on the held-out split
ConfusionMatrixDisplay.from_estimator(best_pipe_g, Xg_test, yg_test)
plt.title(f"Goalies: Confusion Matrix ({best_name_g})")
plt.show()

# ROC curve, only when the fitted pipeline exposes class probabilities
if hasattr(best_pipe_g, "predict_proba"):
    RocCurveDisplay.from_estimator(best_pipe_g, Xg_test, yg_test)
    plt.title(f"Goalies: ROC Curve ({best_name_g})")
    plt.show()



## Notes & Next Steps

- Tune hyperparameters (GridSearchCV / RandomizedSearchCV).
- Address any **class imbalance** (try class weights or resampling).
- Cross-validate and add **confidence intervals** on metrics.
- Expand features (rolling rates, 5v5 vs PP, usage/linemates, venue effects).
- Export best model & pipeline for deployment.
