In [None]:
# CELL 1: Imports & basic config

from pathlib import Path
import time
import numpy as np
import pandas as pd

from nba_api.stats.endpoints import TeamGameLog

# ---- Filesystem layout ----
# Everything is anchored at the kernel's current working directory, so the
# notebook is expected to be run from qepc_project/.
project_root = Path.cwd().resolve()
raw_root = project_root.joinpath("data", "raw")
# Create data/raw (and any missing parents); a no-op when it already exists.
raw_root.mkdir(parents=True, exist_ok=True)

print(f"Project root: {project_root}")
print(f"Raw data root: {raw_root}")

# ---- Seasons to fetch ----
# Season labels use the "YYYY-YY" form: start year, dash, last two digits of
# the following year.  Covers 2021-22 through 2025-26 inclusive.
TARGET_SEASONS = [
    f"{start_year}-{(start_year + 1) % 100:02d}"
    for start_year in range(2021, 2026)
]

# ---- Canonical team mapping: team ID -> (abbreviation, full name) ----
# Single source of truth for team identity; the reverse lookups below are
# derived from it so the three tables cannot drift apart.
TEAM_ID_TO_META = {
    1610612737: ("ATL", "Atlanta Hawks"),
    1610612738: ("BOS", "Boston Celtics"),
    1610612739: ("CLE", "Cleveland Cavaliers"),
    1610612740: ("NOP", "New Orleans Pelicans"),
    1610612741: ("CHI", "Chicago Bulls"),
    1610612742: ("DAL", "Dallas Mavericks"),
    1610612743: ("DEN", "Denver Nuggets"),
    1610612744: ("GSW", "Golden State Warriors"),
    1610612745: ("HOU", "Houston Rockets"),
    1610612746: ("LAC", "LA Clippers"),
    1610612747: ("LAL", "Los Angeles Lakers"),
    1610612748: ("MIA", "Miami Heat"),
    1610612749: ("MIL", "Milwaukee Bucks"),
    1610612750: ("MIN", "Minnesota Timberwolves"),
    1610612751: ("BKN", "Brooklyn Nets"),
    1610612752: ("NYK", "New York Knicks"),
    1610612753: ("ORL", "Orlando Magic"),
    1610612754: ("IND", "Indiana Pacers"),
    1610612755: ("PHI", "Philadelphia 76ers"),
    1610612756: ("PHX", "Phoenix Suns"),
    1610612757: ("POR", "Portland Trail Blazers"),
    1610612758: ("SAC", "Sacramento Kings"),
    1610612759: ("SAS", "San Antonio Spurs"),
    1610612760: ("OKC", "Oklahoma City Thunder"),
    1610612761: ("TOR", "Toronto Raptors"),
    1610612762: ("UTA", "Utah Jazz"),
    1610612763: ("MEM", "Memphis Grizzlies"),
    1610612764: ("WAS", "Washington Wizards"),
    1610612765: ("DET", "Detroit Pistons"),
    1610612766: ("CHA", "Charlotte Hornets"),
}

# Reverse lookups keyed by abbreviation.
# abbrev -> team ID
abbrev_to_id = {meta[0]: team_id for team_id, meta in TEAM_ID_TO_META.items()}
# abbrev -> full name; each value above is already an (abbrev, name) pair,
# which is exactly the key/value shape dict() accepts.
abbrev_to_name = dict(TEAM_ID_TO_META.values())

# ---- Canonical schema for team game logs ----
# Ordered list of output columns; written here in thematic groups, but the
# resulting list is flat and its order is the column order of the output.
TEAM_LOG_SCHEMA = (
    # game / team identity
    ["Season", "gameId", "gameDate", "teamId", "teamAbbrev", "teamName"]
    # opponent identity
    + ["opponentTeamId", "opponentTeamAbbrev", "opponentTeamName"]
    # game context
    + ["home", "win", "minutes"]
    # counting stats
    + ["pts", "reboundsTotal", "assists", "steals", "blocks", "turnovers", "foulsPersonal"]
    # shooting splits: made / attempted / percentage
    + ["fieldGoalsMade", "fieldGoalsAttempted", "fieldGoalsPercentage"]
    + ["threePointersMade", "threePointersAttempted", "threePointersPercentage"]
    + ["freeThrowsMade", "freeThrowsAttempted", "freeThrowsPercentage"]
    # rebound breakdown
    + ["reboundsOffensive", "reboundsDefensive"]
)
print(f"TEAM_LOG_SCHEMA columns: {len(TEAM_LOG_SCHEMA)}")


In [None]:
# CELL 2: Normalizer for team game logs from nba_api

def normalize_teamgamelog_df(df_raw: pd.DataFrame, season: str) -> pd.DataFrame:
    """
    Convert a raw nba_api TeamGameLog frame into the TEAM_LOG_SCHEMA layout.

    Source columns are located case-insensitively: first by exact name, then
    by a looser fallback, so naming variants such as ``Team_ID`` / ``TEAM_ID``
    are both accepted.  ``df_raw`` itself is never modified.

    Parameters
    ----------
    df_raw : pd.DataFrame
        Game rows for one team-season, as returned by the API.
    season : str
        Season label (e.g. "2021-22") written to the ``Season`` column.

    Returns
    -------
    pd.DataFrame
        A new frame holding exactly the TEAM_LOG_SCHEMA columns, in schema
        order.  Any column whose source could not be found is filled with NA.

    Notes
    -----
    * ``win`` is a nullable ``Int64``: 1 for "W", 0 for any other non-missing
      value, and NA where the W/L value is missing (a missing result is not
      recorded as a loss).
    * ``home`` is 1 when the matchup text contains " vs." and 0 otherwise.
    * Opponent id / name are looked up from the module-level ``abbrev_to_id``
      and ``abbrev_to_name`` tables; unknown abbreviations map to NaN.
    """
    # Work from a snapshot of the *source* columns so that the fuzzy lookups
    # below can never match one of the canonical columns we are creating.
    raw_cols = list(df_raw.columns)
    cols_lower = {str(c).lower(): c for c in raw_cols}

    def get_exact_ci(*candidates):
        """Case-insensitive exact match: returns the real column name or None."""
        for cand in candidates:
            real = cols_lower.get(cand.lower())
            if real is not None:
                return real
        return None

    def find_by_substrings(*substrings):
        """
        Return the first source column whose lowercase name contains ALL
        substrings, e.g. ("team", "abbrev") matches 'TEAM_ABBREVIATION'.
        """
        wanted = [s.lower() for s in substrings]
        for c in raw_cols:
            cl = str(c).lower()
            if all(s in cl for s in wanted):
                return c
        return None

    def find_by_token(*aliases):
        """
        Return the first source column that contains ANY alias as a whole
        underscore-delimited token (e.g. "reb" matches 'TEAM_REB' but not
        'OREB'; "min" does not match 'PLUS_MINUS').  Aliases are tried in
        the order given.
        """
        for alias in aliases:
            want = alias.lower()
            for c in raw_cols:
                cl = str(c).lower()
                if (
                    cl == want
                    or cl.startswith(want + "_")
                    or cl.endswith("_" + want)
                    or f"_{want}_" in cl
                ):
                    return c
        return None

    def column_or_na(col):
        """Source column values if the column was found, else a scalar NA."""
        return df_raw[col] if col is not None else pd.NA

    # ---- Locate the core source columns ----
    game_id_col = get_exact_ci("GAME_ID", "Game_ID") or find_by_substrings("game", "id")
    team_id_col = get_exact_ci("TEAM_ID", "Team_ID") or find_by_substrings("team", "id")
    team_abbrev_col = (
        get_exact_ci("TEAM_ABBREVIATION", "Team_Abbreviation")
        or find_by_substrings("team", "abbrev")
    )
    team_name_col = (
        get_exact_ci("TEAM_NAME", "Team_Name")
        or find_by_substrings("team", "name")
    )
    game_date_col = get_exact_ci("GAME_DATE", "Game_Date") or find_by_substrings("game", "date")
    matchup_col = get_exact_ci("MATCHUP") or find_by_substrings("matchup")
    # Short names use the token matcher: a bare substring search for "min" or
    # "wl" is too easy to satisfy by accident.
    wl_col = get_exact_ci("WL") or find_by_token("wl")
    min_col = get_exact_ci("MIN", "MINUTES") or find_by_token("min", "minutes")

    # ---- Assemble the canonical frame on the same row index as the input ----
    out = pd.DataFrame(index=df_raw.index)
    out["Season"] = season
    out["gameId"] = column_or_na(game_id_col)
    out["teamId"] = column_or_na(team_id_col)
    out["teamAbbrev"] = column_or_na(team_abbrev_col)
    out["teamName"] = column_or_na(team_name_col)

    # Parse gameDate ('OCT 20, 2021' style); unparseable values become NaT.
    if game_date_col is not None:
        out["gameDate"] = pd.to_datetime(
            df_raw[game_date_col].astype(str),
            format="%b %d, %Y",
            errors="coerce",
        )
    else:
        out["gameDate"] = pd.NaT

    # Win flag: nullable integer so a missing W/L stays missing instead of
    # being counted as a loss.
    if wl_col is not None:
        wl = df_raw[wl_col]
        out["win"] = (wl == "W").astype("Int64").mask(wl.isna())
    else:
        out["win"] = pd.NA

    # Minutes
    out["minutes"] = column_or_na(min_col)

    # ---- Home / opponent info from matchup ----
    if matchup_col is not None:
        matchup_raw = df_raw[matchup_col]
        matchup = matchup_raw.astype(str)

        # home vs away: "CHI vs. BOS" (home) vs "CHI @ BOS" (away).
        # Literal (non-regex) search avoids the invalid "\." escape.
        out["home"] = matchup.str.contains(" vs.", regex=False).astype(int)

        # opponent abbrev: last token in "CHI vs. BOS" / "CHI @ BOS".
        # Rows with no matchup text stay missing rather than becoming "nan".
        opp_abbrev = matchup.str.split().str[-1].where(matchup_raw.notna())
        out["opponentTeamAbbrev"] = opp_abbrev
        out["opponentTeamId"] = opp_abbrev.map(abbrev_to_id)
        out["opponentTeamName"] = opp_abbrev.map(abbrev_to_name)
    else:
        out["home"] = pd.NA
        out["opponentTeamAbbrev"] = pd.NA
        out["opponentTeamId"] = pd.NA
        out["opponentTeamName"] = pd.NA

    # ---- Stats mapping: canonical name -> accepted source aliases ----
    stat_sources = {
        "pts": ("PTS",),
        "reboundsTotal": ("REB",),
        "assists": ("AST",),
        "steals": ("STL",),
        "blocks": ("BLK",),
        "turnovers": ("TOV",),
        "foulsPersonal": ("PF",),
        "plusMinusPoints": ("PLUS_MINUS", "PLUSMINUS"),
        "fieldGoalsMade": ("FGM",),
        "fieldGoalsAttempted": ("FGA",),
        "fieldGoalsPercentage": ("FG_PCT", "FG_Pct"),
        "threePointersMade": ("FG3M",),
        "threePointersAttempted": ("FG3A",),
        "threePointersPercentage": ("FG3_PCT", "FG3_Pct"),
        "freeThrowsMade": ("FTM",),
        "freeThrowsAttempted": ("FTA",),
        "freeThrowsPercentage": ("FT_PCT", "FT_Pct"),
        "reboundsOffensive": ("OREB",),
        "reboundsDefensive": ("DREB",),
    }
    for dst, aliases in stat_sources.items():
        # Exact name first; otherwise each alias is tried on its own as a
        # whole token (never "all aliases at once", never a bare substring).
        src_col = get_exact_ci(*aliases)
        if src_col is None:
            src_col = find_by_token(*aliases)
        out[dst] = column_or_na(src_col)

    # ---- Ensure every schema column exists; fill missing with NA ----
    for col in TEAM_LOG_SCHEMA:
        if col not in out.columns:
            out[col] = pd.NA

    # Restrict to schema columns in order (drops anything not in the schema).
    return out[TEAM_LOG_SCHEMA].copy()

print("normalize_teamgamelog_df defined.")


In [None]:
# CELL 3: Reset buffers (run this before the fetch loop if you restart)

# Two independent, empty accumulators: normalized frames and error messages.
all_logs, errors = [], []

print("Reset all_logs and errors.")


In [None]:
# CELL 4: Fetch team game logs for all seasons & teams

team_ids = sorted(TEAM_ID_TO_META.keys())

# Request pacing / resilience knobs for the fetch loop below.
MAX_FETCH_ATTEMPTS = 3        # tries per (season, team) before giving up
RETRY_BACKOFF_SECONDS = 2.0   # wait grows linearly: 2s, 4s, ...
REQUEST_PAUSE_SECONDS = 0.5   # polite pause after every completed request
ERROR_PAUSE_SECONDS = 1.0     # extra pause after a request that finally failed


def _fetch_team_season_raw(team_id, season):
    """
    Request one team's regular-season game log and return the first result
    frame.  A failed request is retried up to MAX_FETCH_ATTEMPTS times with a
    growing pause; the last exception is re-raised if every attempt fails.
    """
    for attempt in range(1, MAX_FETCH_ATTEMPTS + 1):
        try:
            tgl = TeamGameLog(
                team_id=team_id,
                season=season,
                season_type_all_star="Regular Season",
            )
            return tgl.get_data_frames()[0]
        except Exception:
            if attempt == MAX_FETCH_ATTEMPTS:
                raise
            time.sleep(RETRY_BACKOFF_SECONDS * attempt)


def _log_fetch_problem(msg):
    """Print a warning line and remember it in the shared `errors` list."""
    print("\n⚠️", msg)
    errors.append(msg)


print("=== Fetching team game logs from nba_api ===")
print("Seasons:", TARGET_SEASONS)
print("Teams:", len(team_ids), "NBA teams")

for season in TARGET_SEASONS:
    print(f"\n📅 Season {season}")
    for i, team_id in enumerate(team_ids, start=1):
        abbrev, name = TEAM_ID_TO_META[team_id]
        # "\r" keeps the progress counter on a single, overwritten line.
        print(f"  [{i:2d}/{len(team_ids)}] {season} – {abbrev} ({team_id})", end="\r")

        try:
            df_raw = _fetch_team_season_raw(team_id, season)
        except Exception as e:
            # Best effort: record the failure and move on to the next team.
            _log_fetch_problem(f"{season} {team_id} ({abbrev}) – api error: {e}")
            time.sleep(ERROR_PAUSE_SECONDS)
            continue

        # Pause right after the request so that *every* path below (empty
        # frame, normalize error, success) is rate-limited the same way.
        time.sleep(REQUEST_PAUSE_SECONDS)

        if df_raw.empty:
            _log_fetch_problem(f"{season} {team_id} ({abbrev}) – empty frame")
            continue

        try:
            df_norm = normalize_teamgamelog_df(df_raw, season=season)
            all_logs.append(df_norm)
        except Exception as e:
            _log_fetch_problem(f"{season} {team_id} ({abbrev}) – normalize error: {e}")
            continue

    print(f"\n✅ Finished season {season}")

print("\nFetch complete.")
print("Total normalized chunks:", len(all_logs))
print("Total errors logged:", len(errors))


In [None]:
# CELL 5: Combine all logs and sanity check

# Nothing to combine means the fetch cell failed or was never run.
if not all_logs:
    raise RuntimeError("all_logs is empty – fetch step failed or did not run.")

# Stack the per-team, per-season frames into one table with a fresh index.
team_game_logs_full = pd.concat(all_logs, ignore_index=True)

print("Combined team_game_logs_full shape:", team_game_logs_full.shape)

# Deduplicate by Season + gameId + teamId (one row per team per game).
# Re-running the fetch cell appends the same games again, so this step is
# what keeps the combined table free of repeats.
key_cols = [c for c in ["Season", "gameId", "teamId"] if c in team_game_logs_full.columns]

if key_cols:
    before = len(team_game_logs_full)
    sort_cols = key_cols + (["gameDate"] if "gameDate" in team_game_logs_full.columns else [])
    team_game_logs_full = (
        team_game_logs_full
        .sort_values(sort_cols)
        .drop_duplicates(subset=key_cols, keep="last")
        .reset_index(drop=True)
    )
    after = len(team_game_logs_full)
    print(f"After dedupe on {key_cols}: {before} → {after}")
    # Invariant check: the key really is unique after the dedupe.
    assert not team_game_logs_full.duplicated(subset=key_cols).any(), "duplicate keys remain"
else:
    print("No key_cols found for dedupe; keeping all rows.")

# Date range
if "gameDate" in team_game_logs_full.columns:
    print("Date range:", team_game_logs_full["gameDate"].min(), "→", team_game_logs_full["gameDate"].max())

print("\nPer-season approx game counts (unique gameIds):")
display(team_game_logs_full.groupby("Season")["gameId"].nunique())

print("\nSample rows with stats (before name patch):")
display(
    team_game_logs_full[
        [
            "Season", "gameDate", "teamId", "teamAbbrev", "teamName",
            "home", "win",
            "pts", "reboundsTotal", "assists", "steals", "blocks",
            "turnovers",
        ]
    ].head(10)
)


In [None]:
# CELL 6: Patch teamAbbrev / teamName (and opponents) from teamId

print("=== Patch teamAbbrev / teamName from teamId ===")

# Quick look at the inputs of the patch: the column set and the first few
# populated teamId values.
print("Columns:", team_game_logs_full.columns.tolist())
print("Non-null sample of teamId:", team_game_logs_full["teamId"].dropna().iloc[:5])

def safe_lookup_team_meta(team_id_val):
    """
    Map a teamId given as int, float or string into (abbrev, full_name).

    Accepts integer-like values in any of these forms: ``1610612738``,
    ``1610612738.0``, ``"1610612738"`` and ``"1610612738.0"`` (the last one is
    what an ID looks like after passing through a float column).

    Parameters
    ----------
    team_id_val : Any
        Scalar team identifier; may be missing (None / NaN / pd.NA).

    Returns
    -------
    tuple
        ``(abbrev, full_name)`` from TEAM_ID_TO_META, or ``(None, None)`` when
        the value is missing, cannot be read as a whole number, or is not a
        known team ID.  Never raises for scalar input.
    """
    if pd.isna(team_id_val):
        return (None, None)
    try:
        tid = int(team_id_val)
    except (ValueError, TypeError, OverflowError):
        # int() rejects decimal strings ("1610612738.0") and infinities;
        # go through float and accept the value only if it is a whole number.
        try:
            as_float = float(team_id_val)
        except (ValueError, TypeError, OverflowError):
            return (None, None)
        if not as_float.is_integer():  # also False for inf / nan
            return (None, None)
        tid = int(as_float)
    return TEAM_ID_TO_META.get(tid, (None, None))

# Fill teamAbbrev, teamName
# Resolve each row's teamId once, then split the (abbrev, name) pairs apart.
team_meta = [safe_lookup_team_meta(tid) for tid in team_game_logs_full["teamId"]]
abbrevs = [meta[0] for meta in team_meta]
names = [meta[1] for meta in team_meta]

# Only gaps are filled: values already present in the frame are kept.
for column, looked_up in (("teamAbbrev", abbrevs), ("teamName", names)):
    team_game_logs_full[column] = team_game_logs_full[column].fillna(
        pd.Series(looked_up, index=team_game_logs_full.index)
    )

# Same treatment for the opponent columns, when an opponent ID is available.
if "opponentTeamId" in team_game_logs_full.columns:
    opp_meta = [safe_lookup_team_meta(tid) for tid in team_game_logs_full["opponentTeamId"]]
    opp_abbrevs = [meta[0] for meta in opp_meta]
    opp_names = [meta[1] for meta in opp_meta]

    for column, looked_up in (
        ("opponentTeamAbbrev", opp_abbrevs),
        ("opponentTeamName", opp_names),
    ):
        # Create the target column if it does not exist yet, so fillna has
        # something to work on.
        if column not in team_game_logs_full.columns:
            team_game_logs_full[column] = np.nan
        team_game_logs_full[column] = team_game_logs_full[column].fillna(
            pd.Series(looked_up, index=team_game_logs_full.index)
        )

print("\nAfter patch, sample:")
patched_preview_cols = [
    "Season", "gameDate",
    "teamId", "teamAbbrev", "teamName",
    "opponentTeamId", "opponentTeamAbbrev", "opponentTeamName",
    "home", "win", "pts", "reboundsTotal", "assists",
]
display(team_game_logs_full[patched_preview_cols].head(10))

print("\nUnique (Season, teamAbbrev) sample:")
season_team_pairs = (
    team_game_logs_full[["Season", "teamAbbrev", "teamName"]]
    .dropna()
    .drop_duplicates()
)
display(season_team_pairs.head(20))


In [None]:
# CELL 7: Save canonical team game logs CSV

# Destination for the canonical table: <raw_root>/team_game_logs.csv
out_path = raw_root / "team_game_logs.csv"

# Make sure the target folder is still there, so this cell also works when
# run on its own after the folder was removed.
out_path.parent.mkdir(parents=True, exist_ok=True)

# index=False: the row index is just a running counter and is not saved.
team_game_logs_full.to_csv(out_path, index=False)

print("✅ Wrote team game logs to:")
print("   ", out_path)
print("Final shape:", team_game_logs_full.shape)

print("\nPer-season game counts (unique gameIds):")
display(team_game_logs_full.groupby("Season")["gameId"].nunique())
