# 🧪 QEPC – Enrich All-Seasons Team Logs from nba_api

This notebook:
1. Loads `NBA_Team_Logs_All_Seasons.csv`.
2. Finds games/teams with missing advanced team stats.
3. Uses the `nba_api` live boxscore endpoint to fetch, per team and game:
   - points in the paint
   - fast break points
   - points off turnovers
   - second chance points
   - bench points
   - biggest lead / scoring run
   - lead changes / times tied
   - timeouts remaining
4. Merges those into the master table and, once `WRITE_CHANGES` is set to `True`, saves it back (with a backup).


In [None]:
# 04_enrich_team_logs_from_nba_api.ipynb
# Goal: fill in advanced team stats (paint, fastbreak, bench, etc.)
#       in NBA_Team_Logs_All_Seasons.csv using nba_api live boxscore.

from pathlib import Path
import shutil
import sys
import time

import numpy as np
import pandas as pd

print("=== QEPC Boxscore Enrichment Bootstrap ===")

# Locate the qepc_core directory: inspect the working directory and then its
# ancestors, nearest first, looking at no more than 8 directories in total.
cwd = Path.cwd()
core_root = next(
    (folder for folder in [cwd, *cwd.parents][:8] if folder.name == "qepc_core"),
    None,
)
if core_root is None:
    raise RuntimeError(f"Could not find qepc_core above {cwd}")

# Repo root is three levels above qepc_core (qepc_project/experimental/GTP_REWRITE/qepc_core)
repo_root = core_root.parent.parent.parent

# qepc_core goes to the front of the import path so it wins name lookups;
# the repo root is appended at the end as a lower-priority location.
core_str, repo_str = str(core_root), str(repo_root)
if core_str not in sys.path:
    sys.path.insert(0, core_str)
if repo_str not in sys.path:
    sys.path.append(repo_str)

print("qepc_core root:", core_root)
print("repo root:     ", repo_root)

# These imports must come after the sys.path changes above.
import qepc
from qepc.config import detect_project_root, QEPCConfig

project_root = detect_project_root()
cfg = QEPCConfig.from_project_root(project_root)

print("project_root:", project_root)
print("raw_root:    ", cfg.raw_root)
print("=== Bootstrap OK ===")


In [None]:
# Read the merged all-seasons team logs from the configured raw-data directory

raw_root = cfg.raw_root
all_seasons_path = raw_root / "NBA_Team_Logs_All_Seasons.csv"
print("All-seasons path:", all_seasons_path)

df = pd.read_csv(all_seasons_path, low_memory=False)

# Parse gameDate as UTC, then strip the timezone so the column holds naive
# timestamps; values that cannot be parsed become NaT.
game_dates_utc = pd.to_datetime(df["gameDate"], errors="coerce", utc=True)
df["gameDate"] = game_dates_utc.dt.tz_convert(None)

first_game_date = df["gameDate"].min()
last_game_date = df["gameDate"].max()

print("Shape:", df.shape)
print("Date range:", first_game_date, "‚Üí", last_game_date)
print("Columns:", list(df.columns))

display(df.head())


In [None]:
# Columns we want to backfill from the NBA live boxscore endpoint
ENRICH_COLS = [
    "pointsInThePaint",
    "benchPoints",
    "pointsFastBreak",
    "pointsFromTurnovers",
    "pointsSecondChance",
    "biggestLead",
    "biggestScoringRun",
    "leadChanges",
    "timesTied",
    "timeoutsRemaining",
]

# Seasons to enrich (adjust as you like). The list is contiguous: "2024-25"
# was previously skipped between "2023-24" and "2025-26". Only rows that are
# still missing an enrichment value are selected, so already-complete
# seasons cost nothing.
RECENT_SEASONS = [
    "2020-21", "2021-22", "2022-23", "2023-24", "2024-25", "2025-26",
]

print("Enrichment columns:", ENRICH_COLS)

# Ensure columns exist (create with NaN if missing)
for col in ENRICH_COLS:
    if col not in df.columns:
        df[col] = np.nan

# Rows where ANY of these enrichment columns are missing
missing_any = df[ENRICH_COLS].isna().any(axis=1)
print("Rows with any missing enrich-col:", int(missing_any.sum()))

# Focus on recent seasons where the live boxscore API is most reliable
if "Season" in df.columns:
    recent_mask = df["Season"].isin(RECENT_SEASONS)
else:
    # Fallback if Season is missing: limit by date
    recent_mask = df["gameDate"] >= pd.Timestamp("2020-01-01")

rows_to_enrich = df[missing_any & recent_mask].copy()

print("Rows to enrich (recent seasons + missing cols):", len(rows_to_enrich))
print("Unique games to enrich:", rows_to_enrich["gameId"].nunique())

# Only preview identifying columns that actually exist; "Season" in
# particular may be absent (see the fallback branch above), and selecting a
# missing column would raise KeyError.
id_cols = ["gameDate", "Season", "teamCity", "teamName", "gameId"]
preview_cols = [c for c in id_cols if c in rows_to_enrich.columns] + ENRICH_COLS

display(rows_to_enrich.head(10)[preview_cols])


In [None]:
# Use the LIVE boxscore endpoint (JSON) from nba_api.
# The module is aliased to `live_boxscore`; the fetch helper defined further
# down in this cell calls live_boxscore.BoxScore(game_id=...).

from nba_api.live.nba.endpoints import boxscore as live_boxscore

# Reaching this line means the import above did not raise.
print("nba_api live imports OK")

def std_game_id(game_id_val) -> "str | None":
    """
    Normalize a stored gameId into the standard 10-character NBA GAME_ID string.

    Integer-like inputs are converted to an int and left-padded with zeros to
    10 characters. That includes ints, floats, and their text forms, also the
    float-formatted text that ``astype(str)`` produces for a float column.

    Examples:
        22000001      -> '0022000001'
        22000001.0    -> '0022000001'
        '22000001.0'  -> '0022000001'
        '0022000001'  -> '0022000001'

    Args:
        game_id_val: The raw game id (int, float, str, or a missing value).

    Returns:
        The zero-padded id string; ``None`` when the input is missing
        (None / NaN); or ``str(game_id_val)`` unchanged when the value is not
        integer-like.
    """
    if pd.isna(game_id_val):
        return None
    try:
        n = int(game_id_val)
    except (ValueError, TypeError):
        # int() rejects float-looking text such as '22000001.0'. Accept it
        # only when it denotes a whole number; anything else is passed
        # through as text, as before.
        try:
            as_float = float(game_id_val)
        except (ValueError, TypeError):
            return str(game_id_val)
        if not as_float.is_integer():  # also False for inf / nan
            return str(game_id_val)
        n = int(as_float)
    return str(n).zfill(10)


def _team_record_from_live(game_dict: dict, which: str, gid: str) -> dict:
    """
    Helper: pull team-level stats from live boxscore 'homeTeam' or 'awayTeam'.
    which must be 'homeTeam' or 'awayTeam'.
    """
    team = game_dict.get(which, {})
    stats = team.get("statistics", {}) or {}

    return {
        "gameId": gid,
        "teamId": team.get("teamId"),
        "teamCity_live": team.get("teamCity"),
        "teamName_live": team.get("teamName"),
        "teamScore_live": team.get("score"),

        # enrichment fields matching our ENRICH_COLS
        "pointsInThePaint":    stats.get("pointsInThePaint"),
        "benchPoints":         stats.get("benchPoints"),
        "pointsFastBreak":     stats.get("pointsFastBreak"),
        "pointsFromTurnovers": stats.get("pointsFromTurnovers"),
        "pointsSecondChance":  stats.get("pointsSecondChance"),
        "biggestLead":         stats.get("biggestLead"),
        "biggestScoringRun":   stats.get("biggestScoringRun"),
        "leadChanges":         stats.get("leadChanges"),
        "timesTied":           stats.get("timesTied"),
        "timeoutsRemaining":   team.get("timeoutsRemaining"),
    }


def fetch_team_boxscore_for_game_live(game_id_val, sleep_sec: float = 0.4):
    """
    Fetch team-level boxscore values for one game via the live boxscore endpoint.

    Args:
        game_id_val: Raw game id; it is normalized with std_game_id first.
        sleep_sec: Seconds to pause after every request attempt, whether it
            succeeded or failed.

    Returns:
        A DataFrame with one row per team entry found in the payload
        ('homeTeam', then 'awayTeam'), so at most two rows. Returns None when
        the id is missing, the request raises, the response has no 'game'
        payload, or neither team entry is present.
    """
    gid = std_game_id(game_id_val)
    if gid is None:
        return None

    try:
        game = live_boxscore.BoxScore(game_id=gid).get_dict().get("game", {})
        if not game:
            print(f"‚ö†Ô∏è No 'game' payload for gameId={gid}")
            return None
    except Exception as e:
        # Best effort: report and skip this game instead of aborting the run.
        # Most common reasons: invalid gameId, preseason, or NBA endpoint issues
        print(f"‚ö†Ô∏è Live boxscore error for gameId={gid}: {e}")
        return None
    finally:
        # Runs on every path through the try block, including the early
        # returns, so each request attempt is followed by a polite pause.
        time.sleep(sleep_sec)

    team_rows = [
        _team_record_from_live(game, side, gid)
        for side in ("homeTeam", "awayTeam")
        if side in game
    ]
    return pd.DataFrame(team_rows) if team_rows else None


In [None]:
# Build / update a cache of enriched team-level stats from live boxscore

cache_path = raw_root / "boxscore_enrichment_cache.csv"

# Write the cache to disk after this many NEWLY FETCHED games.
FLUSH_EVERY_N_GAMES = 25


def _flush_records_to_cache(cache_df, new_records, cache_path):
    """Append new_records to cache_df, keep the last row per (gameId, teamId), save, and return the result."""
    new_rows = pd.DataFrame(new_records)
    # Skip the concat when there is nothing cached yet (an empty frame adds nothing).
    combined = new_rows if cache_df.empty else pd.concat([cache_df, new_rows], ignore_index=True)
    combined = combined.drop_duplicates(subset=["gameId", "teamId"], keep="last").reset_index(drop=True)
    combined.to_csv(cache_path, index=False)
    return combined


# Load existing cache if present
if cache_path.exists():
    cache_df = pd.read_csv(cache_path, low_memory=False)
    print(f"Loaded existing enrichment cache: {cache_path}  (rows: {len(cache_df)})")
else:
    cache_df = pd.DataFrame()
    print("No existing cache found; starting fresh.")

# Make sure gameId in cache is string, and collect the ids already fetched.
# Ids are compared in normalized form so that '22000001' and '0022000001'
# count as the same game. A cache without a gameId column counts as empty.
if "gameId" in cache_df.columns:
    cache_df["gameId"] = cache_df["gameId"].astype(str)
    fetched_game_ids = {std_game_id(g) for g in cache_df["gameId"].unique()}
else:
    fetched_game_ids = set()

# Distinct gameIds we want to enrich (from rows_to_enrich)
games_list = sorted(rows_to_enrich["gameId"].dropna().astype(str).unique())
print("Total distinct gameIds to process:", len(games_list))

records = []
new_games_since_flush = 0

try:
    for idx, gid in enumerate(games_list, start=1):
        if std_game_id(gid) in fetched_game_ids:
            continue  # already cached

        print(f"[{idx}/{len(games_list)}] Fetching gameId={gid} ...", end="\r")

        df_bs = fetch_team_boxscore_for_game_live(gid, sleep_sec=0.4)
        if df_bs is None:
            continue  # failed fetches are not cached, so they are retried next run

        # Standardize dtypes
        df_bs = df_bs.copy()
        df_bs["gameId"] = df_bs["gameId"].astype(str)

        # Collect records
        records.extend(df_bs.to_dict(orient="records"))
        new_games_since_flush += 1

        # Flush on the count of newly fetched games. The loop index also
        # counts cached and failed games, and those iterations never reach
        # this point, so testing `idx % 25` made flushes irregular.
        if new_games_since_flush >= FLUSH_EVERY_N_GAMES:
            cache_df = _flush_records_to_cache(cache_df, records, cache_path)
            print(f"\nFlushed cache at {idx} games ‚Üí {len(cache_df)} rows")
            records = []
            new_games_since_flush = 0
finally:
    # Final flush. This also runs when the loop is interrupted or raises, so
    # games fetched since the last flush are not lost.
    if records:
        cache_df = _flush_records_to_cache(cache_df, records, cache_path)
        print(f"\nFinal cache flush ‚Üí {len(cache_df)} rows")
        records = []

print("Enrichment cache complete; rows:", len(cache_df))
display(cache_df.head())


In [None]:
# Merge cache_df back into df (all-seasons team logs)
#
# Only values that are currently missing in the master table are filled.
# `df` itself is left untouched; `merged` is a copy with the same rows,
# column order and key columns, differing only in the filled ENRICH_COLS.

print("=== Merging enrichment into all-seasons table ===")

key_cols = ["gameId", "teamId"]
_TMP_KEYS = ["_gameKey", "_teamKey"]


def _normalized_keys(frame):
    """Return a two-column frame of comparable merge keys, indexed like `frame`.

    gameId goes through std_game_id so that 22000001, '22000001' and
    '0022000001' compare equal. Rows fetched in the current session carry the
    zero-padded form while the master table does not. teamId is coerced to
    nullable Int64.
    """
    return pd.DataFrame(
        {
            "_gameKey": frame["gameId"].map(std_game_id),
            "_teamKey": pd.to_numeric(frame["teamId"], errors="coerce").astype("Int64"),
        },
        index=frame.index,
    )


enrich_df = cache_df.reset_index(drop=True)
merged = df.copy()

if enrich_df.empty or not set(key_cols).issubset(enrich_df.columns):
    # Nothing was fetched or cached: leave merged identical to df.
    print("Enrichment cache is empty; nothing to merge.")
else:
    available_cols = [c for c in ENRICH_COLS if c in enrich_df.columns]

    # One lookup row per (game, team). Rows with an unusable key are dropped
    # because missing keys would otherwise match each other in the join.
    lookup = pd.concat([_normalized_keys(enrich_df), enrich_df[available_cols]], axis=1)
    lookup = lookup.dropna(subset=_TMP_KEYS).drop_duplicates(subset=_TMP_KEYS, keep="last")

    # A left join keeps the master table's row order; because lookup keys are
    # unique it also keeps the row count, so results line up positionally.
    fetched = _normalized_keys(merged).merge(lookup, on=_TMP_KEYS, how="left")
    assert len(fetched) == len(merged), "left join changed the row count"
    fetched.index = merged.index

    for col in ENRICH_COLS:
        print(f"Updating column: {col}")
        before_missing = merged[col].isna().sum()

        if col in fetched.columns:
            # Keep existing values; take the cached value only where missing.
            merged[col] = merged[col].where(merged[col].notna(), fetched[col])

        after_missing = merged[col].isna().sum()
        print(f"  Missing before: {before_missing}, after: {after_missing}")

print("Merged shape:", merged.shape)
display(merged.tail())


In [None]:
WRITE_CHANGES = False  # <-- set to True when you're ready to commit changes

# When enabled: back up the master CSV, overwrite it with the enriched table,
# then re-read it from disk and report what was written.
if WRITE_CHANGES:
    backup_path = all_seasons_path.with_suffix(".backup_before_enrichment.csv")
    print(f"Backing up original all-seasons file to: {backup_path}")
    # Copy the file on disk instead of re-serializing the in-memory `df`:
    # earlier cells have already modified `df` (dates re-parsed, columns
    # added), so writing it out would not reproduce the original file.
    shutil.copy2(all_seasons_path, backup_path)

    print(f"Writing enriched all-seasons file to: {all_seasons_path}")
    merged.to_csv(all_seasons_path, index=False)

    # Quick verification: read back what was just written
    check = pd.read_csv(all_seasons_path, low_memory=False)
    check["gameDate"] = pd.to_datetime(check["gameDate"], errors="coerce", utc=True).dt.tz_convert(None)
    assert len(check) == len(merged), "row count changed between write and read-back"
    print("New shape:", check.shape)
    print("New date range:", check["gameDate"].min(), "‚Üí", check["gameDate"].max())
    for col in ENRICH_COLS:
        print(f"{col}: missing {check[col].isna().sum()} rows")
else:
    print("WRITE_CHANGES=False ‚Üí dry run only; no files written.")
