# 🧼 QEPC – Update Team_Stats.csv from nba_api

This notebook:

1. Loads the existing `Team_Stats.csv` from `data/raw`.
2. Detects the last game date currently stored.
3. Uses `nba_api` to fetch NEW team game logs after that date.
4. Maps them into the Team_Stats schema.
5. Concatenates & de-duplicates.
6. Optionally writes the updated file back to disk.


In [None]:
from pathlib import Path
import sys
import numpy as np
import pandas as pd

print("=== QEPC Experimental Bootstrap (Team_Stats updater) ===")

# Look for a directory literally named "qepc_core" among the current working
# directory and its ancestors. At most 8 candidates are inspected (the working
# directory itself counts as the first one).
cwd = Path.cwd()
candidates = [cwd, *cwd.parents][:8]
core_root = next(
    (candidate for candidate in candidates if candidate.name == "qepc_core"),
    None,
)

if core_root is None:
    raise RuntimeError(f"Could not find qepc_core above {cwd}")

# qepc_core goes to the FRONT of sys.path so its packages win over any
# same-named package found elsewhere.
core_entry = str(core_root)
if core_entry not in sys.path:
    sys.path.insert(0, core_entry)

# The repo root sits three directory levels above qepc_core
# (.../qepc_project/experimental/GTP_REWRITE/qepc_core); it is appended to
# the END of sys.path, i.e. with the lowest priority.
repo_root = core_root.parents[2]
repo_entry = str(repo_root)
if repo_entry not in sys.path:
    sys.path.append(repo_entry)

print("qepc_core root:", core_root)
print("repo root:     ", repo_root)

# These imports only resolve after the sys.path changes above.
import qepc
from qepc.config import detect_project_root, QEPCConfig

project_root = detect_project_root()
cfg = QEPCConfig.from_project_root(project_root)

print("project_root:  ", project_root)
print("data/raw:      ", cfg.raw_root)
print("=== Bootstrap OK ===")


In [None]:
from datetime import timedelta

# Canonical team game-log file inside the configured data/raw directory.
team_stats_path = cfg.raw_root / "Team_Stats.csv"
print("Team_Stats path:", team_stats_path)

team_stats = pd.read_csv(team_stats_path)

# Keep the original date strings next to the parsed values (optional backup).
team_stats["gameDate_raw"] = team_stats["gameDate"]

# Robust datetime parse: unparseable values become NaT, everything is
# normalised to UTC, and the timezone is then dropped to get naive datetimes.
# NOTE(review): if the stored strings carry a local UTC offset, converting to
# UTC can move a late-evening game onto the next calendar day, which would
# also move the resume date computed below -- confirm against the file format.
team_stats["gameDate"] = pd.to_datetime(
    team_stats["gameDate"],
    errors="coerce",
    utc=True,
).dt.tz_convert(None)

# Rows whose date could not be parsed are reported and dropped.
valid_mask = team_stats["gameDate"].notna()
invalid_count = (~valid_mask).sum()

print("Existing Team_Stats shape:", team_stats.shape)
print("Columns:", list(team_stats.columns))

if invalid_count > 0:
    print(f"‚ö†Ô∏è Dropping {invalid_count} rows with invalid gameDate")
    team_stats = team_stats[valid_mask].copy()

# Without at least one dated row there is no "last game" to resume from;
# fail here with a clear message instead of an opaque NaT error further down.
if team_stats.empty:
    raise ValueError(
        f"No rows with a valid gameDate in {team_stats_path}; "
        "cannot determine the date to resume the nba_api fetch from."
    )

min_date = team_stats["gameDate"].min()
max_date = team_stats["gameDate"].max()

print(f"Date range in Team_Stats: {min_date} ‚Üí {max_date}")

# We'll fetch NEW games starting the day after the last stored game.
# The MM/DD/YYYY string is what gets passed to nba_api as the "date from".
last_date = max_date.date()
date_from_for_api = (last_date + timedelta(days=1)).strftime("%m/%d/%Y")

print(f"\nLast recorded gameDate: {last_date}")
print(f"We will request nba_api games from: {date_from_for_api} onward")

display(team_stats.tail())


In [None]:
from nba_api.stats.endpoints import leaguegamefinder
from nba_api.stats.static import teams as nba_teams

print("=== Fetching new games from nba_api ===")

# Static team metadata, indexed two ways so later cells can resolve a team
# either by its numeric id or by its abbreviation.
teams_meta = nba_teams.get_teams()
teams_by_id = {}
teams_by_abbrev = {}
for team in teams_meta:
    teams_by_id[team["id"]] = team
    teams_by_abbrev[team["abbreviation"]] = team

print("Loaded team metadata for", len(teams_meta), "teams")

# Query parameters: league "00", Regular Season only, and only games on or
# after date_from_for_api (an MM/DD/YYYY string, e.g. "11/18/2025").
finder_params = {
    "league_id_nullable": "00",
    "season_type_nullable": "Regular Season",
    "date_from_nullable": date_from_for_api,
}
lgf = leaguegamefinder.LeagueGameFinder(**finder_params)

# The first returned data frame holds the game logs used by the next cell.
df_new_raw = lgf.get_data_frames()[0]
print("Raw nba_api new logs shape:", df_new_raw.shape)

if not df_new_raw.empty:
    print("Columns from nba_api:", list(df_new_raw.columns))
    display(df_new_raw.head())
else:
    print("‚úÖ No new games found after", date_from_for_api)


In [None]:
# Map the raw nba_api game logs (df_new_raw) onto the Team_Stats schema.
# Result: new_team_stats, a frame with exactly the columns of team_stats
# (same order); columns nba_api cannot supply are filled with NaN.
if df_new_raw.empty:
    # Nothing to map: an empty frame with the right columns keeps the
    # concatenation step working unchanged.
    new_team_stats = pd.DataFrame(columns=team_stats.columns)
    print("No new games to map; skipping mapping step.")
else:
    df = df_new_raw.copy()

    # Standardize datetime (unparseable values become NaT).
    df["gameDate"] = pd.to_datetime(df["GAME_DATE"], errors="coerce")

    # Basic stats mapping: nba_api column -> Team_Stats column.
    stat_column_map = {
        "PTS": "teamScore",
        "REB": "reboundsTotal",
        "AST": "assists",
        "FG3M": "threePointersMade",
        "FG3A": "threePointersAttempted",
        "BLK": "blocks",
        "STL": "steals",
        "FGA": "fieldGoalsAttempted",
        "FGM": "fieldGoalsMade",
        "FG_PCT": "fieldGoalsPercentage",
        "FG3_PCT": "threePointersPercentage",
        "FTA": "freeThrowsAttempted",
        "FTM": "freeThrowsMade",
        "FT_PCT": "freeThrowsPercentage",
        "DREB": "reboundsDefensive",
        "OREB": "reboundsOffensive",
        "PF": "foulsPersonal",
        "TOV": "turnovers",
        "PLUS_MINUS": "plusMinusPoints",
        "MIN": "numMinutes",
    }
    for api_col, stats_col in stat_column_map.items():
        df[stats_col] = df[api_col]

    # Team identity from the static metadata, looked up by TEAM_ID.
    # Series.map does the dictionary lookup column-wise; ids missing from the
    # metadata fall back to the values nba_api itself reported.
    city_by_id = {tid: meta["city"] for tid, meta in teams_by_id.items()}
    nickname_by_id = {tid: meta["nickname"] for tid, meta in teams_by_id.items()}
    abbrev_by_id = {tid: meta["abbreviation"] for tid, meta in teams_by_id.items()}

    df["teamCity"] = df["TEAM_ID"].map(city_by_id).fillna(df["TEAM_NAME"])
    df["teamName"] = df["TEAM_ID"].map(nickname_by_id).fillna(df["TEAM_NAME"])
    df["teamCity_hist"] = df["teamCity"]
    df["teamName_hist"] = df["teamName"]
    df["teamAbbrev_hist"] = (
        df["TEAM_ID"].map(abbrev_by_id).fillna(df["TEAM_ABBREVIATION"])
    )

    df["teamId"] = df["TEAM_ID"]
    df["gameId"] = df["GAME_ID"]
    df["league"] = "NBA"

    # Home/away and opponent info from MATCHUP
    def parse_matchup(row):
        """Split a MATCHUP string into (team_abbr, opp_abbr, home_flag).

        "DEN vs. CHI" means the row's team is at home (home_flag 1) and
        "DEN @ CHI" means it is away (home_flag 0). Any other shape falls
        back to first token / last token with home_flag 0.
        """
        matchup = row["MATCHUP"]
        if " vs. " in matchup:
            team_abbr, opp_abbr = matchup.split(" vs. ")
            home = 1
        elif " @ " in matchup:
            team_abbr, opp_abbr = matchup.split(" @ ")
            home = 0
        else:
            # fallback
            parts = matchup.split(" ")
            team_abbr = parts[0]
            opp_abbr = parts[-1]
            home = 0
        return team_abbr.strip(), opp_abbr.strip(), home

    parsed = df.apply(parse_matchup, axis=1, result_type="expand")
    df["TEAM_ABBR_FROM_MATCHUP"] = parsed[0]
    df["OPP_ABBR"] = parsed[1]
    df["home"] = parsed[2].astype(int)

    # Opponent metadata, looked up by the abbreviation parsed from MATCHUP.
    # Unknown abbreviations give NaN city/id and keep the abbreviation as name.
    opp_city_by_abbrev = {ab: meta["city"] for ab, meta in teams_by_abbrev.items()}
    opp_name_by_abbrev = {ab: meta["nickname"] for ab, meta in teams_by_abbrev.items()}
    opp_id_by_abbrev = {ab: meta["id"] for ab, meta in teams_by_abbrev.items()}

    df["opponentTeamCity"] = df["OPP_ABBR"].map(opp_city_by_abbrev)
    df["opponentTeamName"] = df["OPP_ABBR"].map(opp_name_by_abbrev).fillna(df["OPP_ABBR"])
    df["opponentTeamId"] = df["OPP_ABBR"].map(opp_id_by_abbrev)

    # Win / loss flag
    df["win"] = (df["WL"] == "W").astype(int)

    # Season label (starting year of the season, as float).
    # LeagueGameFinder reports SEASON_ID as a season-type digit followed by
    # the starting year (e.g. "22025"); grabbing the first four digits would
    # yield 2202. The pattern skips that optional leading digit and also still
    # accepts plain "2025" and "2025-26" style labels.
    if "SEASON_ID" in df.columns:
        df["season"] = (
            df["SEASON_ID"]
            .astype(str)
            .str.extract(r"^\d?(\d{4})(?:-\d{2})?$", expand=False)
            .astype(float)
        )
    else:
        df["season"] = df["gameDate"].dt.year  # fallback

    # Dummy / unavailable fields: not supplied by this endpoint.
    for col in [
        "pointsInThePaint",
        "benchPoints",
        "q1Points",
        "q2Points",
        "q3Points",
        "q4Points",
        "biggestLead",
        "biggestScoringRun",
        "leadChanges",
        "pointsFastBreak",
        "pointsFromTurnovers",
        "pointsSecondChance",
        "timesTied",
        "timeoutsRemaining",
        "coachId",
    ]:
        if col not in df.columns:
            df[col] = np.nan

    # Season wins/losses (approximate record *before* each game).
    # Computed with cumulative group operations instead of groupby().apply()
    # on the whole frame, and per (team, season) so a fetch that spans two
    # seasons does not carry one season's record into the next.
    # NOTE(review): only the newly fetched rows are counted, so the record
    # restarts at 0-0 at the first new game instead of continuing from the
    # rows already stored in Team_Stats -- confirm whether that is acceptable.
    df = df.sort_values(["TEAM_ID", "gameDate"]).reset_index(drop=True)
    is_win = (df["WL"] == "W").astype(int)
    season_key = df["season"].fillna(-1)  # keep rows without a season grouped
    per_team_season = is_win.groupby([df["TEAM_ID"], season_key])
    wins_before = per_team_season.cumsum() - is_win  # excludes the current game
    games_before = per_team_season.cumcount()
    df["seasonWins"] = wins_before.astype(float)
    df["seasonLosses"] = (games_before - wins_before).astype(float)

    # For now, opponentScore is not provided directly; set to NaN.
    # (You already have a repair function in strengths to fix from pairs if needed.)
    df["opponentScore"] = np.nan

    # nba_api delivers GAME_ID as a zero-padded string, while an id column
    # read back from CSV is numeric. Match the existing column's type so the
    # (gameId, teamId) de-duplication and the written file stay consistent.
    for key_col in ("gameId", "teamId", "opponentTeamId"):
        if key_col in team_stats.columns and pd.api.types.is_numeric_dtype(
            team_stats[key_col]
        ):
            df[key_col] = pd.to_numeric(df[key_col], errors="coerce")

    # Align to the existing Team_Stats layout: same columns, same order;
    # columns missing from the mapped data are created as NaN, and helper
    # columns that are not part of Team_Stats are dropped.
    new_team_stats = df.reindex(columns=team_stats.columns)

    print("Mapped new_team_stats shape:", new_team_stats.shape)
    display(new_team_stats.head())


In [None]:
# Append the newly mapped rows to the existing Team_Stats rows and remove
# rows that describe a (game, team) pair that is already stored.
print("Existing Team_Stats rows:", len(team_stats))
print("New rows from nba_api:", len(new_team_stats))

# If no new games, just keep the original
if new_team_stats.empty:
    updated_team_stats = team_stats.copy()
else:
    # Existing rows come first so that "keep first" below favours them.
    combined = pd.concat([team_stats, new_team_stats], ignore_index=True)

    key_cols = ["gameId", "teamId"]
    if set(key_cols).issubset(combined.columns):
        before = len(combined)

        # Exact-value duplicates.
        exact_dup = combined.duplicated(subset=key_cols, keep="first")

        # The same id can appear as an integer in the rows read from CSV and
        # as a zero-padded string in the rows from nba_api; those never
        # compare equal, so the pair is also compared after numeric
        # conversion. Rows whose ids are not numeric are left to the exact
        # comparison only, so missing ids are never merged by this check.
        numeric_keys = pd.DataFrame(
            {col: pd.to_numeric(combined[col], errors="coerce") for col in key_cols}
        )
        numeric_dup = (
            numeric_keys.duplicated(keep="first")
            & numeric_keys.notna().all(axis=1)
        )

        combined = combined[~(exact_dup | numeric_dup)]
        after = len(combined)
        print(f"De-duplicated by (gameId, teamId): {before} ‚Üí {after} rows")
    else:
        print("‚ö†Ô∏è Missing 'gameId' or 'teamId' columns; skipping de-duplication by key")

    updated_team_stats = combined

print("Final updated_team_stats rows:", len(updated_team_stats))
display(updated_team_stats.tail())


In [None]:
import shutil

# Safety switch: nothing is written to disk unless this is True.
WRITE_CHANGES = True  # make sure this is True before running this cell

if WRITE_CHANGES:
    # Back up the file exactly as it is on disk BEFORE overwriting it.
    # The in-memory team_stats frame is not a faithful copy: its dates were
    # re-parsed, invalid rows were dropped and a helper column was added.
    backup_path = team_stats_path.with_suffix(".backup_before_update.csv")
    print(f"Writing backup to: {backup_path}")
    shutil.copy2(team_stats_path, backup_path)

    print(f"Writing updated Team_Stats to: {team_stats_path}")
    updated_team_stats.to_csv(team_stats_path, index=False)

    print("‚úÖ Done writing. Re-reading file from disk to verify...")

    # Read the file back and report its shape and date range as a sanity
    # check on what was actually written.
    ts_disk2 = pd.read_csv(team_stats_path)
    ts_disk2["gameDate"] = pd.to_datetime(ts_disk2["gameDate"], errors="coerce", utc=True).dt.tz_convert(None)
    print("On-disk shape (after write):", ts_disk2.shape)
    print("On-disk date range (after write):", ts_disk2["gameDate"].min(), "‚Üí", ts_disk2["gameDate"].max())
    display(ts_disk2.tail())
else:
    print("WRITE_CHANGES is False ‚Äì not writing anything to disk.")
