# 🧱 QEPC – Build All-Seasons Team Logs

This notebook merges:

- `NBA_API_QEPC_Format.csv` (multi-season logs from 2014–15 to 2023–24)
- `Team_Stats.csv` (updated season logs, e.g. 2025–26 from nba_api)

into a single canonical file:

- `NBA_Team_Logs_All_Seasons.csv` in `data/raw/`

We:
1. Load & inspect both sources.
2. Normalize dates and seasons.
3. Align columns (union of both).
4. Concatenate and de-duplicate by (gameId, teamId).
5. Write the merged file with a backup of existing sources.


In [None]:
# 03_build_schedule_all_seasons.ipynb
# Goal: build NBA_Schedule_All_Seasons.csv from canonical team logs.

from pathlib import Path
import sys
import numpy as np
import pandas as pd

print("=== QEPC Schedule Bootstrap ===")

# Look for a directory literally named "qepc_core": the working directory
# itself or one of its ancestors, inspecting at most 8 levels in total.
cwd = Path.cwd()
search_chain = [cwd, *cwd.parents][:8]
core_root = next(
    (folder for folder in search_chain if folder.name == "qepc_core"),
    None,
)

if core_root is None:
    raise RuntimeError(f"Could not find qepc_core above {cwd}")

# qepc_core goes to the FRONT of sys.path (only if not already listed).
core_str = str(core_root)
if core_str not in sys.path:
    sys.path.insert(0, core_str)

# The directory three levels above qepc_core is treated as the repo root and
# goes to the END of sys.path (only if not already listed).
repo_root = core_root.parent.parent.parent
repo_str = str(repo_root)
if repo_str not in sys.path:
    sys.path.append(repo_str)

print("qepc_core root:", core_root)
print("repo root:     ", repo_root)

# These imports must come after the sys.path edits above.
import qepc
from qepc.config import detect_project_root, QEPCConfig

# Build the project configuration; later cells read `cfg.raw_root`.
project_root = detect_project_root()
cfg = QEPCConfig.from_project_root(project_root)

print("project_root:", project_root)
print("raw_root:    ", cfg.raw_root)
print("=== Bootstrap OK ===")


In [None]:
# Load the team logs CSV from the configured raw-data directory.
raw_root = cfg.raw_root
team_logs_path = raw_root / "NBA_Team_Logs_All_Seasons.csv"
print("Team logs path:", team_logs_path)

# `gameDate` is parsed as a date column while reading.
team_df = pd.read_csv(
    team_logs_path,
    low_memory=False,
    parse_dates=["gameDate"],
)

# Summarise what was loaded: size, date coverage and the seasons present.
print("Team logs shape:", team_df.shape)

first_game_date = team_df["gameDate"].min()
last_game_date = team_df["gameDate"].max()
print("Date range:", first_game_date, "â†’", last_game_date)

seasons_present = sorted(team_df["Season"].dropna().unique())
print("Seasons:", seasons_present)

# Show the first few rows
display(team_df.head())


In [None]:
# Count the team rows recorded for every gameId and flag any game that does
# not have exactly two of them.

per_game_counts = (
    team_df.groupby("gameId")
           .size()
           .reset_index(name="rows_per_game")
)

# Distribution of rows-per-game values, ordered by the row count.
rows_per_game_distribution = per_game_counts["rows_per_game"].value_counts().sort_index()
print(rows_per_game_distribution)

has_two_rows = per_game_counts["rows_per_game"] == 2
weird_games = per_game_counts[~has_two_rows]
print("\nGames that don't have exactly 2 rows:", len(weird_games))

# Only show a sample when there is something to look at.
if not weird_games.empty:
    display(weird_games.head(20))


In [None]:
# Pair each game's home row with its away row to get one wide row per game.
#
# Only "good" games (exactly 2 team rows) are considered.

good_game_ids = per_game_counts.loc[per_game_counts["rows_per_game"] == 2, "gameId"]
team_good = team_df[team_df["gameId"].isin(good_game_ids)].copy()

print("Total games with exactly 2 rows:", len(good_game_ids))

# Split into home and away using the `home` flag (1 = home, 0 = away).
home_rows = team_good[team_good["home"] == 1].copy()
away_rows = team_good[team_good["home"] == 0].copy()

print("Home rows:", len(home_rows))
print("Away rows:", len(away_rows))

# If there are still duplicates (e.g. weird data), keep one row per
# (gameId, teamId): the last one after sorting by gameId, then gameDate.
home_rows = (
    home_rows.sort_values(["gameId", "gameDate"])
             .drop_duplicates(subset=["gameId", "teamId"], keep="last")
)
away_rows = (
    away_rows.sort_values(["gameId", "gameDate"])
             .drop_duplicates(subset=["gameId", "teamId"], keep="last")
)

# Merge home + away on gameId. This is an inner join, so a game only survives
# when it has at least one home row AND at least one away row.
schedule = home_rows.merge(
    away_rows,
    on="gameId",
    suffixes=("_home", "_away"),
)

# A two-row game is not guaranteed to be one home + one away row (both rows
# may carry the same flag, or the flag may be missing). The inner join drops
# such games without any message, so report them explicitly.
unpaired_game_ids = good_game_ids[~good_game_ids.isin(schedule["gameId"])]
if len(unpaired_game_ids) > 0:
    print(
        f"WARNING: {len(unpaired_game_ids)} two-row games lack a home/away pair "
        "and are missing from the merged schedule."
    )
    display(unpaired_game_ids.head(20).to_frame())

print("Merged schedule shape (raw):", schedule.shape)
display(schedule.head())


In [None]:
# Build canonical schedule table

def pick_game_date(row):
    """Return the row's home-side game date, or the away-side one if the home date is missing.

    `row` is any mapping-like object indexable by "gameDate_home" and
    "gameDate_away" (e.g. a row of the merged schedule frame).
    """
    home_date = row["gameDate_home"]
    return row["gameDate_away"] if pd.isna(home_date) else home_date

def pick_season(row):
    """Return the row's home-side season, or the away-side one if the home season is missing.

    `row` is any mapping-like object indexable by "Season_home" and
    "Season_away" (e.g. a row of the merged schedule frame).
    """
    home_season = row["Season_home"]
    return row["Season_away"] if pd.isna(home_season) else home_season

# Assemble one canonical row per game from the merged home/away team rows.
sched = pd.DataFrame({
    "gameId": schedule["gameId"].astype(str),

    # Date and season come from the home row, falling back to the away row.
    "gameDate": schedule.apply(pick_game_date, axis=1),
    "Season": schedule.apply(pick_season, axis=1),

    "homeTeamId": schedule["teamId_home"],
    # The abbreviation column is optional: NaN is used when it is absent.
    "homeTeamAbbrev": schedule.get("teamAbbrev_home", np.nan),
    "homeTeamName": schedule["teamName_home"],
    "homeTeamCity": schedule["teamCity_home"],

    "awayTeamId": schedule["teamId_away"],
    "awayTeamAbbrev": schedule.get("teamAbbrev_away", np.nan),
    "awayTeamName": schedule["teamName_away"],
    "awayTeamCity": schedule["teamCity_away"],

    "homeScore": schedule["teamScore_home"],
    "awayScore": schedule["teamScore_away"],
})

# Derive homeWin (1.0 = home team scored more, 0.0 = it did not).
# A comparison involving NaN evaluates to False, so rows with a missing score
# are masked to NaN instead of being recorded as home losses.
scores_known = sched["homeScore"].notna() & sched["awayScore"].notna()
sched["homeWin"] = (
    (sched["homeScore"] > sched["awayScore"])
    .astype(float)
    .where(scores_known)
)

# Tidy dtypes: nullable integers for team ids (unparseable values become <NA>).
sched["homeTeamId"] = pd.to_numeric(sched["homeTeamId"], errors="coerce").astype("Int64")
sched["awayTeamId"] = pd.to_numeric(sched["awayTeamId"], errors="coerce").astype("Int64")

# Unparseable dates become NaT rather than raising.
sched["gameDate"] = pd.to_datetime(sched["gameDate"], errors="coerce")

print("Canonical schedule shape:", sched.shape)
print("Date range:", sched["gameDate"].min(), "â†’", sched["gameDate"].max())
print("Seasons:", sorted(sched["Season"].dropna().unique())[:15])

display(sched.head())


In [None]:
# === Replace the schedule's gameDate with the date found in the team logs ===

print("Fixing gameDate via team logsâ€¦")

# One row per gameId (as a string, to match the schedule's key) holding the
# earliest gameDate seen for that game in the team logs.
logs_with_str_ids = team_df.assign(gameId=team_df["gameId"].astype(str))
team_dates = (
    logs_with_str_ids
    .groupby("gameId", as_index=False)
    .agg(gameDate_fix=("gameDate", "min"))
)

print("team_dates shape:", team_dates.shape)
print("team_dates date range:",
      team_dates["gameDate_fix"].min(), "â†’", team_dates["gameDate_fix"].max())

# Discard the schedule's current gameDate and left-join the team-log date in
# its place, so `gameDate` ends up as the last column.
sched_without_date = (
    sched.assign(gameId=sched["gameId"].astype(str))
         .drop(columns=["gameDate"])
)
sched = sched_without_date.merge(
    team_dates.rename(columns={"gameDate_fix": "gameDate"}),
    on="gameId",
    how="left",
)

print("After fix â€“ schedule date range:",
      sched["gameDate"].min(), "â†’", sched["gameDate"].max())

# Per-season summary on the updated schedule: distinct games, first/last date.
fixed_summary_spec = {
    "games": ("gameId", "nunique"),
    "first_date": ("gameDate", "min"),
    "last_date": ("gameDate", "max"),
}
per_season_fixed = sched.groupby("Season").agg(**fixed_summary_spec).sort_index()

print("\nPer-season summary AFTER fix:")
print(per_season_fixed)


In [None]:
# Per-season sanity check: number of distinct games and the first/last game
# date within each season of the schedule.

season_summary_spec = {
    "games": ("gameId", "nunique"),
    "first_date": ("gameDate", "min"),
    "last_date": ("gameDate", "max"),
}
per_season = sched.groupby("Season").agg(**season_summary_spec).sort_index()

print(per_season)


In [None]:
# Write canonical schedule file (guarded by WRITE_CHANGES)

schedule_path = raw_root / "NBA_Schedule_All_Seasons.csv"
backup_path = schedule_path.with_suffix(".backup_before_rebuild.csv")

WRITE_CHANGES = True  # <-- set to False for a dry run that writes nothing

if WRITE_CHANGES:
    if schedule_path.exists():
        print(f"Backing up existing schedule to: {backup_path}")
        # Path.replace overwrites a backup left by a previous run on every
        # platform; Path.rename raises FileExistsError on Windows when the
        # target already exists, which broke every run after the first.
        schedule_path.replace(backup_path)

    print(f"Writing canonical schedule to: {schedule_path}")
    sched.to_csv(schedule_path, index=False)

    # Quick reload check: read the file back and confirm shape and date range.
    sched_check = pd.read_csv(schedule_path, parse_dates=["gameDate"])
    print("Reloaded shape:", sched_check.shape)
    print("Reloaded date range:", sched_check["gameDate"].min(), "â†’", sched_check["gameDate"].max())
else:
    print("WRITE_CHANGES=False â†’ dry run only, no file written.")
