In [None]:
# 02_build_player_logs_all_seasons.ipynb
# Goal: Canonical "NBA_Player_Logs_All_Seasons.csv" for QEPC.

from pathlib import Path
import sys

import numpy as np
import pandas as pd

print("=== QEPC Player Logs Bootstrap ===")

# Search the working directory and up to seven of its ancestors for the
# folder literally named "qepc_core"; the first (closest) hit wins.
cwd = Path.cwd()
search_chain = [cwd, *cwd.parents][:8]
core_root = next(
    (folder for folder in search_chain if folder.name == "qepc_core"),
    None,
)

if core_root is None:
    raise RuntimeError(f"Could not find qepc_core above {cwd}")

# qepc_core goes to the front of sys.path so its packages are found first.
if str(core_root) not in sys.path:
    sys.path.insert(0, str(core_root))

# The repo root is taken to be three levels above qepc_core; it is appended
# (lowest priority) rather than prepended.
repo_root = core_root.parent.parent.parent
if str(repo_root) not in sys.path:
    sys.path.append(str(repo_root))

print("qepc_core root:", core_root)
print("repo root:     ", repo_root)

# These imports must stay below the sys.path changes above.
import qepc
from qepc.config import detect_project_root, QEPCConfig

project_root = detect_project_root()
cfg = QEPCConfig.from_project_root(project_root)

print("project_root:", project_root)
print("raw_root:    ", cfg.raw_root)
print("=== Bootstrap OK ===")


In [None]:
# Locate the best raw player-game file

raw_root = cfg.raw_root
print("raw_root:", raw_root)

# File names in order of preference: the first one that exists is used.
candidate_names = [
    "NBA_Player_Logs_All_Seasons.csv",   # if you already have a canonical-ish one
    "Player_Game_Logs_All_Seasons.csv",  # big NBA API export
    "PlayerStatistics.csv",              # fallback
]

# Keep only the candidates that are actually present, preserving priority order.
found_paths = [
    raw_root / file_name
    for file_name in candidate_names
    if (raw_root / file_name).exists()
]

if len(found_paths) == 0:
    raise FileNotFoundError(
        f"No player logs file found in {raw_root}. Tried: {candidate_names}"
    )

# Highest-priority existing file
source_path = found_paths[0]
print("Using source player logs file:", source_path)

# low_memory=False makes pandas read the whole file before inferring dtypes.
df_raw = pd.read_csv(source_path, low_memory=False)
print("Raw shape:", df_raw.shape)
print("Raw columns (first 20):", list(df_raw.columns[:20]))

display(df_raw.head())


In [None]:
# Canonicalize raw player game logs into QEPC schema
#
# Every df.get(...) chain below tries the known source column names in order
# and ends with None when none of them exists, which fills the target column
# with missing values.

df = df_raw.copy()

# 1) Standardize date & season
# GAME_DATE looks like "2015-04-15T00:00:00"; unparseable values become NaT.
if "GAME_DATE" in df.columns:
    df["gameDate"] = pd.to_datetime(df["GAME_DATE"], errors="coerce")
elif "gameDate" in df.columns:
    df["gameDate"] = pd.to_datetime(df["gameDate"], errors="coerce")
else:
    raise KeyError("No GAME_DATE / gameDate column found in player logs.")

# Season string
if "Season" in df.columns:
    df["Season"] = df["Season"].astype(str)
elif "SEASON_YEAR" in df.columns:
    df["Season"] = df["SEASON_YEAR"].astype(str)
else:
    # Fallback: calendar year of the game date.
    # A single NaT turns .dt.year into floats, which would stringify as
    # "2015.0" / "nan". Going through nullable Int64 keeps clean "2015"
    # labels, and .where() leaves rows without a date as missing.
    game_year = df["gameDate"].dt.year
    df["Season"] = game_year.astype("Int64").astype(str).where(game_year.notna())

# 2) IDs and names
# The canonical names (playerId / playerName) are the last fallback so that a
# source file that is already in canonical layout keeps its values instead of
# having them overwritten with None.
df["gameId"] = df.get("GAME_ID", df.get("gameId"))
df["playerId"] = df.get("PLAYER_ID", df.get("personId", df.get("playerId")))
df["playerName"] = df.get("PLAYER_NAME", df.get("fullName", df.get("playerName")))

df["teamId"] = df.get("TEAM_ID", df.get("teamId"))
df["teamAbbrev"] = df.get("TEAM_ABBREVIATION", df.get("teamAbbrev"))
df["teamName"] = df.get("TEAM_NAME", df.get("teamName"))

# 3) Matchup, WL, home/away, opponent abbrev
df["matchup"] = df.get("MATCHUP", df.get("matchup"))
# When there is no WL column, an existing `win` column is used as the source.
df["WL"] = df.get("WL", df.get("win"))

def infer_home_flag(matchup: str) -> float:
    """Derive a home/away indicator from a matchup string.

    Returns 0.0 (away) when the text contains "@ ", e.g. "MIA @ PHI";
    1.0 (home) when it contains "vs." or "vs ", e.g. "NOP vs. SAS";
    NaN for non-string input or when neither marker is present.
    The "@ " check is evaluated first.
    """
    if isinstance(matchup, str):
        if "@ " in matchup:
            return 0.0
        if any(marker in matchup for marker in ("vs.", "vs ")):
            return 1.0
    return np.nan

def infer_opponent_abbrev(matchup: str, team_abbrev: str) -> str:
    """Extract the opponent abbreviation from a matchup string.

    The matchup is expected to read "<team> @ <opponent>" or
    "<team> vs. <opponent>"; the third whitespace-separated token is returned.

    Parameters
    ----------
    matchup : str
        Matchup text such as "MIA @ PHI" or "NOP vs. SAS".
    team_abbrev : str
        Abbreviation of the row's own team. Only checked for being a string;
        rows without one yield NaN.

    Returns
    -------
    str or float
        The opponent abbreviation, or NaN when either argument is not a
        string or the matchup has fewer than three tokens.
    """
    if not isinstance(matchup, str) or not isinstance(team_abbrev, str):
        return np.nan
    # Pad both separators with spaces so unspaced variants ("MIA@PHI",
    # "NOP vs.SAS") tokenize the same way as the usual spaced form.
    tokens = matchup.replace("vs.", " vs ").replace("@", " @ ").split()
    # Typical patterns:
    #  ["MIA", "@", "PHI"]
    #  ["NOP", "vs", "SAS"]
    if len(tokens) >= 3:
        # tokens[0] is the team itself, tokens[2] is the opponent
        return tokens[2]
    return np.nan

# Row-wise derived fields: the home flag needs only the matchup text, the
# opponent lookup takes the matchup together with the row's own team.
matchup_col = df["matchup"]
df["home"] = matchup_col.apply(infer_home_flag)
df["opponentTeamAbbrev"] = list(
    map(infer_opponent_abbrev, matchup_col, df["teamAbbrev"])
)

# win flag from WL if present
def wl_to_win_flag(wl: str) -> float:
    """Convert a win/loss marker into a 1.0 / 0.0 flag.

    Accepted inputs:
      * "W" / "L" strings (case-insensitive, surrounding whitespace ignored);
      * numeric or boolean 1 / 0, which is what arrives when the WL column
        was filled from a numeric `win` column instead of W/L letters.

    Anything else (None, NaN, other strings or numbers) yields NaN.
    """
    if isinstance(wl, str):
        token = wl.strip().upper()
        if token == "W":
            return 1.0
        if token == "L":
            return 0.0
        return np.nan
    if isinstance(wl, (bool, np.bool_, int, float, np.integer, np.floating)):
        # NaN fails both comparisons and falls through to the NaN return.
        if wl == 1:
            return 1.0
        if wl == 0:
            return 0.0
    return np.nan

df["win"] = df["WL"].apply(wl_to_win_flag)

# 4) Core box score stats
# For each canonical column, the source names to try in order. The canonical
# name itself is always the last candidate, so a source file that is already
# in canonical layout keeps its values instead of being overwritten with None.
stat_sources = {
    "minutes": ("MIN", "minutes"),
    "fgm": ("FGM", "fgm"),
    "fga": ("FGA", "fga"),
    "fg_pct": ("FG_PCT", "fg_pct"),
    "fg3m": ("FG3M", "fg3m"),
    "fg3a": ("FG3A", "fg3a"),
    "fg3_pct": ("FG3_PCT", "fg3_pct"),
    "ftm": ("FTM", "ftm"),
    "fta": ("FTA", "fta"),
    "ft_pct": ("FT_PCT", "ft_pct"),
    "oreb": ("OREB", "reboundsOffensive", "oreb"),
    "dreb": ("DREB", "reboundsDefensive", "dreb"),
    "reb": ("REB", "reboundsTotal", "reb"),
    "ast": ("AST", "assists", "ast"),
    "stl": ("STL", "steals", "stl"),
    "blk": ("BLK", "blocks", "blk"),
    "tov": ("TOV", "turnovers", "tov"),
    "pf": ("PF", "foulsPersonal", "pf"),
    "pts": ("PTS", "points", "pts"),
    "plus_minus": ("PLUS_MINUS", "plusMinusPoints", "plus_minus"),
}

for canonical_name, source_names in stat_sources.items():
    # First source column that exists; None (an all-missing column) when no
    # candidate is present in the frame.
    df[canonical_name] = next(
        (df[name] for name in source_names if name in df.columns),
        None,
    )

# 5) Build final canonical frame
# Column order here is the column order of the output frame.
canonical_cols = [
    # identifiers and context
    "gameId", "gameDate", "Season",
    "playerId", "playerName",
    "teamId", "teamAbbrev", "teamName",
    "opponentTeamAbbrev",
    "home", "win",
    # box score
    "minutes",
    "fgm", "fga", "fg_pct",
    "fg3m", "fg3a", "fg3_pct",
    "ftm", "fta", "ft_pct",
    "oreb", "dreb", "reb",
    "ast", "stl", "blk",
    "tov", "pf",
    "pts", "plus_minus",
    # original matchup text
    "matchup",
]

df_canon = df.loc[:, canonical_cols].copy()

# Dtype cleanup: game ids as strings; player/team ids as nullable integers
# (values that cannot be parsed as numbers become <NA>).
df_canon["gameId"] = df_canon["gameId"].astype(str)
for id_col in ("playerId", "teamId"):
    df_canon[id_col] = pd.to_numeric(df_canon[id_col], errors="coerce").astype("Int64")

first_date = df_canon["gameDate"].min()
last_date = df_canon["gameDate"].max()
season_labels = sorted(df_canon["Season"].dropna().unique())

print("Canonical player logs shape:", df_canon.shape)
print("Date range:", first_date, "→", last_date)
print("Seasons:", season_labels)

display(df_canon.head())


In [None]:
# Basic sanity checks on canonical player logs

# Columns expected to be populated on every row.
key_cols = [
    "gameId", "gameDate", "Season",
    "playerId", "playerName",
    "teamId", "teamAbbrev", "teamName",
    "minutes", "pts",
]

print("Null counts for key columns:")
print(df_canon[key_cols].isna().sum())

print("\nSample by season:")
# Distinct games and non-null player rows per season (first 15 seasons shown).
season_summary = df_canon.groupby("Season").agg(
    games=("gameId", "nunique"),
    player_rows=("playerId", "count"),
)
display(season_summary.head(15))

print("\nSample rows:")
# sample() raises ValueError when asked for more rows than the frame holds,
# so cap the preview size for small inputs. The fixed seed keeps the preview
# reproducible across runs.
preview_rows = min(10, len(df_canon))
display(df_canon.sample(preview_rows, random_state=42))


In [None]:
# Write canonical player logs to disk (opt-in)

target_path = raw_root / "NBA_Player_Logs_All_Seasons.csv"
# Same directory and stem, with ".backup_before_rebuild.csv" as the suffix.
backup_path = target_path.with_suffix(".backup_before_rebuild.csv")

WRITE_CHANGES = True  # <-- set to False for a dry run (nothing is written)

if WRITE_CHANGES:
    if target_path.exists():
        print(f"Backing up existing file to: {backup_path}")
        # replace() overwrites a backup left behind by an earlier run on every
        # platform; rename() raises FileExistsError on Windows in that case.
        target_path.replace(backup_path)

    print(f"Writing canonical player logs to: {target_path}")
    df_canon.to_csv(target_path, index=False)

    # Reload to sanity check that the file round-trips
    df_check = pd.read_csv(target_path, low_memory=False, parse_dates=["gameDate"])
    print("Reloaded shape:", df_check.shape)
    print("Reloaded date range:", df_check["gameDate"].min(), "→", df_check["gameDate"].max())
else:
    print("WRITE_CHANGES=False → dry run only, no file written.")
