In [None]:
import sys
from pathlib import Path

def _find_project_root(start: Path, marker: str = "qepc") -> Path:
    """Return the directory to treat as the project root for this notebook.

    The grandparent of ``start`` is preferred, because that is the layout the
    notebook was written for. If ``marker`` is not a directory there,
    ``start`` and each of its ancestors are checked in turn, nearest first.
    When no candidate contains ``marker`` the grandparent is returned anyway,
    or the topmost available directory when ``start`` is too shallow to have
    a grandparent (so this never raises ``IndexError``).

    Args:
        start: Absolute directory to search from (normally the kernel's cwd).
        marker: Name of the sub-directory that identifies the project root.

    Returns:
        The chosen project root directory.
    """
    ancestors = list(start.parents)
    if len(ancestors) > 1:
        default_root = ancestors[1]
    elif ancestors:
        default_root = ancestors[-1]
    else:
        default_root = start  # start is the filesystem root itself

    # default_root goes first so a working original layout resolves unchanged.
    for candidate in (default_root, start, *ancestors):
        if (candidate / marker).is_dir():
            return candidate
    return default_root


PROJECT_ROOT = _find_project_root(Path.cwd().resolve())

# Make the project's packages importable from this notebook.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

print("PROJECT_ROOT:", PROJECT_ROOT)
print("qepc in root?", (PROJECT_ROOT / "qepc").exists())


In [None]:
from qepc.brain.games_loader import fetch_league_games, build_games_table
from qepc.brain.scripts import label_game_scripts_by_total_points

# Season to analyse.
# 👇 Change this to "2025-26" later when you want to run the current season.
season = "2023-24"

# Fetch the raw league game log, then turn it into the games table.
team_games = fetch_league_games(season)
games_df = build_games_table(team_games)

print("Season:", season)
print("games_df rows:", len(games_df))
display(games_df.head())

# Label every game via label_game_scripts_by_total_points, using the
# 0.25 / 0.75 quantiles as the low / high cut-offs.
scripts_df = label_game_scripts_by_total_points(games_df, low_quantile=0.25, high_quantile=0.75)

print("scripts_df shape:", scripts_df.shape)
display(scripts_df.head())

# Share of games carrying each script label.
print("\nScript label distribution:")
label_shares = scripts_df["SCRIPT_LABEL"].value_counts(normalize=True)
print(label_shares)


In [None]:
from qepc.brain.teams_loader import fetch_league_team_season_stats

def _fetch_and_preview(measure_type, shape_label):
    """Fetch team season stats for ``season`` and show a quick preview.

    Calls ``fetch_league_team_season_stats`` with the given ``measure_type``,
    prints ``shape_label`` followed by the frame's shape, displays its head,
    and returns the full frame.
    """
    stats = fetch_league_team_season_stats(
        season,
        measure_type=measure_type,
    )
    print(shape_label, stats.shape)
    display(stats.head())
    return stats


# Base counting stats
team_stats_base = _fetch_and_preview("Base", "team_stats_base shape:")

# Advanced stats (OFF_RATING, DEF_RATING, PACE, etc.)
team_stats_adv = _fetch_and_preview("Advanced", "team_stats_adv shape:")


In [None]:
import pandas as pd

# Advanced-stat columns to carry into the game-level feature table.
adv_cols_keep = [
    "TEAM_ID",
    "TEAM_NAME",
    "TEAM_ABBREVIATION",
    "GP",
    "W",
    "L",
    "W_PCT",
    "MIN",
    "OFF_RATING",
    "DEF_RATING",
    "NET_RATING",
    "PACE",
    "PIE",
]

# Tolerate columns that are missing from the fetched stats.
adv_cols_keep = [c for c in adv_cols_keep if c in team_stats_adv.columns]
team_adv_small = team_stats_adv[adv_cols_keep].copy()

print("team_adv_small columns:", list(team_adv_small.columns))
display(team_adv_small.head())

# TEAM_ID is the join key for both merges below; fail with a clear message
# instead of an opaque KeyError from inside merge().
if "TEAM_ID" not in team_adv_small.columns:
    raise KeyError("team_stats_adv has no TEAM_ID column; cannot join team stats onto games")

# Make home/away copies with prefixes
home_adv = team_adv_small.add_prefix("HOME_")  # TEAM_ID -> HOME_TEAM_ID
away_adv = team_adv_small.add_prefix("AWAY_")  # TEAM_ID -> AWAY_TEAM_ID

# Start from games_df
games_feat = games_df.copy()

# Attach home stats, then away stats, with the same logic for both sides.
for side_key, side_stats in (("HOME_TEAM_ID", home_adv), ("AWAY_TEAM_ID", away_adv)):
    # Prefixed identity columns (e.g. HOME_TEAM_NAME) may already exist in the
    # games table; merging them again would silently rename both copies with
    # _x/_y suffixes. Keep the games-table version and drop the stats copy.
    overlap = [
        c for c in side_stats.columns
        if c != side_key and c in games_feat.columns
    ]
    games_feat = games_feat.merge(
        side_stats.drop(columns=overlap),
        on=side_key,             # same key name on both sides
        how="left",              # keep every game even if a team has no stats row
        validate="many_to_one",  # stats must have one row per team, else games duplicate
    )

print("games_feat shape after merges:", games_feat.shape)
display(games_feat.head())


In [None]:
# Left-join the script labels onto the feature table by GAME_ID, so games
# without a label are kept (their SCRIPT_* values are missing).
script_label_cols = ["GAME_ID", "SCRIPT_LABEL", "SCRIPT_INDEX"]
games_with_scripts = games_feat.merge(scripts_df[script_label_cols], on="GAME_ID", how="left")

print("games_with_scripts shape:", games_with_scripts.shape)
display(games_with_scripts.head())


In [None]:
import numpy as np

# Candidate model inputs: the prefixed advanced-stat columns for each side.
feature_cols = [
    "HOME_OFF_RATING",
    "HOME_DEF_RATING",
    "HOME_NET_RATING",
    "HOME_PACE",
    "HOME_PIE",
    "AWAY_OFF_RATING",
    "AWAY_DEF_RATING",
    "AWAY_NET_RATING",
    "AWAY_PACE",
    "AWAY_PIE",
]

# Keep only the features that actually exist after the merges.
feature_cols = [c for c in feature_cols if c in games_with_scripts.columns]

print("Using feature columns:", feature_cols)

# A zero-width feature matrix would only fail later, inside the model fit.
if not feature_cols:
    raise ValueError(
        "None of the expected HOME_*/AWAY_* feature columns are present in games_with_scripts"
    )

# Drop rows where we don't have a script label, and rows where any feature is
# missing: the left merges leave NaN stats for games whose team had no match,
# and those NaNs would otherwise flow straight into X.
model_df = games_with_scripts.dropna(subset=["SCRIPT_INDEX", *feature_cols]).copy()

X = model_df[feature_cols].values.astype(float)
y = model_df["SCRIPT_INDEX"].values.astype(int)

print("X shape:", X.shape)
print("y shape:", y.shape)
# bincount position i is the number of rows whose SCRIPT_INDEX equals i.
print("Script label counts:", np.bincount(y))


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Positional row numbers are split alongside X and y, so every test
# prediction can later be traced back to its row in model_df.
idx = np.arange(model_df.shape[0])

# 80/20 split, stratified on the label, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
    X, y, idx, test_size=0.2, random_state=42, stratify=y
)

print("Train size:", len(X_train))
print("Test size:", len(X_test))

# 300 unrestricted-depth trees, seeded, using every available core.
# fit() returns the estimator itself, so construction and training chain.
clf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

y_pred = clf.predict(X_test)

# Per-class precision / recall / F1 on the held-out games.
print("\nClassification report (SCRIPT_INDEX):")
report_text = classification_report(y_test, y_pred, digits=3)
print(report_text)

print("\nConfusion matrix:")
test_confusion = confusion_matrix(y_test, y_pred)
print(test_confusion)


In [None]:
# Number of held-out games to show in the preview table. The row selection
# and all three probability slices below must use the same count.
N_PREVIEW = 10

probs = clf.predict_proba(X_test)

print("probs shape:", probs.shape)  # (n_test, 3)

# predict_proba columns follow clf.classes_, so look up each class's column
# instead of assuming its position.
# Class coding (0=GRIND, 1=BALANCED, 2=CHAOS) is taken from the original
# notebook comment — TODO confirm against qepc.brain.scripts.
class_order = list(clf.classes_)   # e.g. [0, 1, 2]
i_grind = class_order.index(0)
i_bal   = class_order.index(1)
i_chaos = class_order.index(2)

# Take the first N_PREVIEW test samples (fewer if the test set is smaller)
test_idx_subset = idx_test[:N_PREVIEW]

# Columns we'd like to see for context
preview_cols = [
    "GAME_ID",
    "HOME_TEAM_NAME",
    "AWAY_TEAM_NAME",
    "TOTAL_POINTS",
    "SCRIPT_LABEL",
]

# Keep only the ones that actually exist in model_df
preview_cols = [c for c in preview_cols if c in model_df.columns]

# idx_test holds positional row numbers into model_df, hence iloc.
results_preview = model_df.iloc[test_idx_subset][preview_cols].copy()

# Attach predicted probabilities; probs rows are in the same order as idx_test.
results_preview["P_GRIND"]    = probs[:N_PREVIEW, i_grind]
results_preview["P_BALANCED"] = probs[:N_PREVIEW, i_bal]
results_preview["P_CHAOS"]    = probs[:N_PREVIEW, i_chaos]

display(results_preview)


In [None]:
import joblib
import pandas as pd

# Output folder for the trained model and its dataset; created if missing.
out_dir = PROJECT_ROOT / "data" / "processed" / "nba" / "models"
out_dir.mkdir(parents=True, exist_ok=True)

# 1) Sanity-check that clf and model_df exist (NameError here means an
#    earlier cell was not run)
print("clf type:", type(clf))
print("model_df shape:", model_df.shape)

# 2) Save the classifier, tagged with the season it was built from
model_path = out_dir / f"script_classifier_rf_{season}.joblib"
joblib.dump(clf, model_path)
print("Saved model to:", model_path)

# 3) Copy model_df and drop duplicate columns to keep parquet happy
model_df_out = model_df.copy()

# Parquet cannot be written with repeated column names.
# This keeps only the first occurrence of each column name.
model_df_out = model_df_out.loc[:, ~model_df_out.columns.duplicated()]

print("model_df_out shape after dropping duplicate columns:", model_df_out.shape)

# The positional index carries no information, so it is not written.
dataset_path = out_dir / f"script_classifier_dataset_{season}.parquet"
model_df_out.to_parquet(dataset_path, index=False)
print("Saved dataset to:", dataset_path)

# Status line: the check-mark was mis-decoded mojibake and is restored here.
print("✅ Saved model + dataset to:", out_dir)
