In [None]:
import sys
from pathlib import Path

# The project root is taken to be two directory levels above the current
# working directory (index 1 of the resolved cwd's parents).
_cwd = Path.cwd().resolve()
PROJECT_ROOT = _cwd.parents[1]

# Make the root importable, without adding it twice on re-runs.
_root_entry = str(PROJECT_ROOT)
if _root_entry not in sys.path:
    sys.path.append(_root_entry)

# Quick sanity output: where we ended up and whether a "qepc" entry exists there.
_qepc_found = (PROJECT_ROOT / "qepc").exists()
print("PROJECT_ROOT:", PROJECT_ROOT)
print("qepc in root?", _qepc_found)


In [None]:
import pandas as pd

from qepc.brain.games_loader import fetch_league_games, build_games_table
from qepc.brain.scripts import label_game_scripts_by_total_points

season = "2023-24"

# Step 1: team-level game logs for the season (one row per TEAM_ID / GAME_ID).
team_games = fetch_league_games(season)
print("team_games shape:", team_games.shape)
display(team_games.head())

# Step 2: collapse the team logs into a game-level table (one row per game).
games_df = build_games_table(team_games)
print("games_df shape:", games_df.shape)
display(games_df.head())

# Step 3: script labels for games, passing 0.25 / 0.75 as the low / high
# quantiles (kept around for later use).
_script_quantiles = {"low_quantile": 0.25, "high_quantile": 0.75}
scripts_df = label_game_scripts_by_total_points(games_df, **_script_quantiles)
print("scripts_df shape:", scripts_df.shape)
display(scripts_df.head())


In [None]:
from pathlib import Path

from qepc.brain.boxscores_loader import fetch_boxscores_for_games

# Location of the cached traditional boxscores for this season.
data_dir = PROJECT_ROOT / "data" / "raw" / "nba" / "boxscores"
data_dir.mkdir(parents=True, exist_ok=True)
trad_path = data_dir / f"boxscores_traditional_{season}.parquet"

if not trad_path.exists():
    # Cache miss: fetch every game's boxscore, normalize key dtypes, then cache.
    print("No parquet found, fetching boxscores from API (this may take a while)...")
    game_ids = games_df["GAME_ID"].unique().tolist()
    trad_df, _ = fetch_boxscores_for_games(game_ids)  # second element (advanced) is unused for now

    print("Raw trad_df shape:", trad_df.shape)

    # Key columns get explicit dtypes before the frame is written out.
    trad_df = trad_df.astype({"GAME_ID": str, "TEAM_ID": int, "PLAYER_ID": int})

    trad_df.to_parquet(trad_path, index=False)
    print("Saved full-season trad_df to:", trad_path)
else:
    # Cache hit: read the previously saved parquet as-is.
    print("Loading existing traditional boxscores from:", trad_path)
    trad_df = pd.read_parquet(trad_path)

print("trad_df shape:", trad_df.shape)
display(trad_df.head())


In [None]:
import numpy as np

df = trad_df.copy()

# Points column: take the first of the accepted names that is present.
pts_col = next((name for name in ("points", "PTS") if name in df.columns), None)
if pts_col is None:
    raise ValueError("Could not find a points column in trad_df (expected 'points' or 'PTS').")

# Minutes column: only "minutes" is accepted (values may be 'MM:SS' or numeric).
min_col = next((name for name in ("minutes",) if name in df.columns), None)
if min_col is None:
    raise ValueError("Could not find a 'minutes' column in trad_df.")

# Keep just the identifiers, the name parts, points and raw minutes.
keep_cols = ["GAME_ID", "TEAM_ID", "PLAYER_ID", "firstName", "familyName", pts_col, min_col]
player_games = df[keep_cols].copy()
player_games = player_games.rename(columns={pts_col: "POINTS", min_col: "MIN_RAW"})

# Convenience display name: "<firstName> <familyName>".
player_games["PLAYER_NAME"] = (
    player_games["firstName"].astype(str).str.cat(player_games["familyName"].astype(str), sep=" ")
)

# Convert minutes "MM:SS" to float minutes
def parse_minutes(m):
    """Convert a raw minutes value into float minutes.

    Parameters
    ----------
    m : int, float, numpy number, str, or anything else
        Numbers are returned as ``float(m)``. Strings may be a plain number
        ("12", "12.5") or a clock value "MM:SS".

    Returns
    -------
    float
        Minutes as a float, or ``np.nan`` when the value cannot be parsed
        (non-numeric text, a string with more than one ":", or a
        non-string / non-number input such as ``None``).
    """
    if isinstance(m, (int, float, np.number)):
        return float(m)
    if not isinstance(m, str):
        return np.nan

    parts = m.split(":")
    # Only "number" and "MM:SS" are understood. Anything with extra colons is
    # treated as unparseable instead of blowing up on tuple unpacking.
    if len(parts) > 2:
        return np.nan
    try:
        if len(parts) == 2:
            return float(parts[0]) + float(parts[1]) / 60.0
        return float(parts[0])
    except ValueError:
        return np.nan

# Parse the raw minutes value of every row into float minutes.
player_games["MINUTES"] = player_games["MIN_RAW"].map(parse_minutes)

# Keep only rows where minutes are present and strictly positive.
has_minutes = player_games["MINUTES"].notna()
played = player_games["MINUTES"].gt(0)
player_games = player_games.loc[has_minutes & played].copy()

# Key columns: GAME_ID as string (to match games_df), the ids as int.
player_games = player_games.astype({"GAME_ID": str, "TEAM_ID": int, "PLAYER_ID": int})

print("player_games shape:", player_games.shape)
display(player_games.head())


In [None]:
# Attach each team's game total to the player rows and derive the player's
# share of team points for that game.
#
# The cell is safe to re-run: columns it produced on a previous run are
# dropped before merging, so the merge never yields TEAM_POINTS_x /
# TEAM_POINTS_y and the lookups below keep working.

# Figure out team points column
if "PTS" in team_games.columns:
    team_pts_col = "PTS"
elif "TEAM_POINTS" in team_games.columns:
    team_pts_col = "TEAM_POINTS"
else:
    raise ValueError("team_games must contain 'PTS' or 'TEAM_POINTS'.")

team_pts = team_games[["GAME_ID", "TEAM_ID", team_pts_col]].copy()
team_pts = team_pts.rename(columns={team_pts_col: "TEAM_POINTS"})

# Make sure the merge keys have the same dtypes as in player_games
team_pts["GAME_ID"] = team_pts["GAME_ID"].astype(str)
team_pts["TEAM_ID"] = team_pts["TEAM_ID"].astype(int)

player_games = (
    player_games
    .drop(columns=["TEAM_POINTS", "PTS_SHARE"], errors="ignore")  # leftovers from an earlier run
    .merge(team_pts, on=["GAME_ID", "TEAM_ID"], how="left")
)

print("After merge, player_games shape:", player_games.shape)
display(player_games.head())

# Compute per-game share of team points
player_games["POINTS"] = player_games["POINTS"].astype(float)
player_games["TEAM_POINTS"] = player_games["TEAM_POINTS"].astype(float)

# Rows whose (GAME_ID, TEAM_ID) found no team total: their share is unknown,
# not zero. Leaving it NaN keeps it out of later means instead of dragging
# them down.
missing_total = player_games["TEAM_POINTS"].isna()
if missing_total.any():
    print(
        "WARNING:", int(missing_total.sum()),
        "player rows have no matching team total; PTS_SHARE is NaN for them.",
    )

# Avoid divide-by-zero: a non-positive team total gives a share of 0.0.
raw_share = player_games["POINTS"] / player_games["TEAM_POINTS"]
share = raw_share.where(player_games["TEAM_POINTS"] > 0, 0.0)
player_games["PTS_SHARE"] = share.mask(missing_total)

display(player_games.head())


In [None]:
# One output row per (player, team, name) combination.
group_cols = ["PLAYER_ID", "TEAM_ID", "PLAYER_NAME"]

# Output column -> (source column, aggregation).
agg_spec = {
    "GAMES_PLAYED": pd.NamedAgg(column="GAME_ID", aggfunc="nunique"),
    "TOTAL_POINTS": pd.NamedAgg(column="POINTS", aggfunc="sum"),
    "TOTAL_MINUTES": pd.NamedAgg(column="MINUTES", aggfunc="sum"),
    "MEAN_POINTS": pd.NamedAgg(column="POINTS", aggfunc="mean"),
    "MEAN_MINUTES": pd.NamedAgg(column="MINUTES", aggfunc="mean"),
    "MEAN_PTS_SHARE": pd.NamedAgg(column="PTS_SHARE", aggfunc="mean"),
}

grouped = player_games.groupby(group_cols)
agg_player = grouped.agg(**agg_spec).reset_index()

print("agg_player shape (before filters):", agg_player.shape)
display(agg_player.head())


In [None]:
# Sample-size thresholds: drop players with too few games or too few minutes.
min_games = 10     # minimum distinct games played (tunable)
min_minutes = 100  # minimum total minutes over the season (tunable)

enough_games = agg_player["GAMES_PLAYED"].ge(min_games)
enough_minutes = agg_player["TOTAL_MINUTES"].ge(min_minutes)
mask = enough_games & enough_minutes

player_lambdas = agg_player.loc[mask].copy()

print("player_lambdas shape (after filters):", player_lambdas.shape)
display(player_lambdas.head())


In [None]:
# Team season average points (per game)
team_avg_pts = (
    team_pts.groupby("TEAM_ID")["TEAM_POINTS"]
    .mean()
    .rename("TEAM_MEAN_POINTS")
    .reset_index()
)

print("team_avg_pts shape:", team_avg_pts.shape)
display(team_avg_pts.head())

# Drop the columns this cell produces before merging, so the cell can be
# re-run: otherwise a second merge yields TEAM_MEAN_POINTS_x / _y and the
# lookup of "TEAM_MEAN_POINTS" below fails with a KeyError.
player_lambdas = (
    player_lambdas
    .drop(columns=["TEAM_MEAN_POINTS", "LAMBDA_POINTS_V0"], errors="ignore")
    .merge(team_avg_pts, on="TEAM_ID", how="left")
)

# λ_player v0: expected points in a "typical" game = team_avg_pts * mean share
player_lambdas["LAMBDA_POINTS_V0"] = (
    player_lambdas["TEAM_MEAN_POINTS"] * player_lambdas["MEAN_PTS_SHARE"]
)

print("player_lambdas with λ shape:", player_lambdas.shape)
display(
    player_lambdas.sort_values("LAMBDA_POINTS_V0", ascending=False).head(15)
)


In [None]:
# Write the player λ table under <PROJECT_ROOT>/data/processed/nba,
# creating the directory first if needed.
out_dir = PROJECT_ROOT.joinpath("data", "processed", "nba")
out_dir.mkdir(parents=True, exist_ok=True)

out_path = out_dir.joinpath(f"player_points_lambdas_{season}.parquet")
player_lambdas.to_parquet(out_path, index=False)

print("Saved player λ table to:", out_path)


In [None]:
def find_player(name_substring, df=None, top=10):
    """Look up players whose name contains a given piece of text.

    Parameters
    ----------
    name_substring : str
        Text to search for in ``PLAYER_NAME``. Matched literally and
        case-insensitively; characters such as ".", "(" or "+" have no
        special meaning.
    df : pandas.DataFrame, optional
        Table to search; must have ``PLAYER_NAME`` and ``LAMBDA_POINTS_V0``
        columns. Defaults to the notebook-level ``player_lambdas``.
    top : int, default 10
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Matching rows sorted by ``LAMBDA_POINTS_V0`` descending, at most
        ``top`` of them. Rows with a missing name never match.
    """
    if df is None:
        df = player_lambdas
    # regex=False: the argument is a substring, not a pattern. With the
    # default regex matching, "J." would also hit "Jokic" and "(" would raise.
    mask = df["PLAYER_NAME"].str.contains(
        name_substring, case=False, na=False, regex=False
    )
    return df[mask].sort_values("LAMBDA_POINTS_V0", ascending=False).head(top)

# Spot-check a few well-known players: (heading to print, text to search for).
_examples = (
    ("Example: Nikola Jokic", "Jokic"),
    ("Example: LeBron James", "LeBron"),
    ("Example: Jayson Tatum", "Tatum"),
)
for _heading, _query in _examples:
    print(_heading)
    display(find_player(_query))
