# 🧱 QEPC – Build All-Seasons Team Logs

This notebook merges:

- `NBA_API_QEPC_Format.csv` (multi-season logs from 2014–15 to 2023–24)
- `Team_Stats.csv` (updated season logs, e.g. 2025–26 from nba_api)

into a single canonical file:

- `NBA_Team_Logs_All_Seasons.csv` in `data/raw/`

We:
1. Load & inspect both sources.
2. Normalize dates and seasons.
3. Align columns (union of both).
4. Concatenate and de-duplicate by (gameId, teamId).
5. Write the merged file with a backup of existing sources.


In [None]:
from pathlib import Path
import sys
import pandas as pd
import numpy as np
from datetime import datetime

print("=== QEPC All-Seasons Builder Bootstrap ===")

# Locate the "qepc_core" directory by inspecting the working directory and
# at most seven of its ancestors (eight candidates in total).
cwd = Path.cwd()
search_dirs = [cwd, *cwd.parents][:8]
core_root = next((d for d in search_dirs if d.name == "qepc_core"), None)

if core_root is None:
    raise RuntimeError(f"Could not find qepc_core above {cwd}")

# The repository root is taken to be three levels above qepc_core.
repo_root = core_root.parent.parent.parent

# qepc_core goes to the front of sys.path so it wins import resolution;
# the repo root is only appended as a fallback location.
core_str, repo_str = str(core_root), str(repo_root)
if core_str not in sys.path:
    sys.path.insert(0, core_str)
if repo_str not in sys.path:
    sys.path.append(repo_str)

print("qepc_core root:", core_root)
print("repo root:     ", repo_root)

# These imports only resolve once sys.path has been patched above.
import qepc
from qepc.config import detect_project_root, QEPCConfig

project_root = detect_project_root()
cfg = QEPCConfig.from_project_root(project_root)

print("project_root:", project_root)
print("raw_root:    ", cfg.raw_root)
print("=== Bootstrap OK ===")


In [None]:
# Both input CSVs are read from the configured raw-data directory.
raw_root = cfg.raw_root
nba_all_path, team_stats_path = (
    raw_root / "NBA_API_QEPC_Format.csv",
    raw_root / "Team_Stats.csv",
)

print("NBA_API_QEPC_Format path:", nba_all_path)
print("Team_Stats path:        ", team_stats_path)

# Load the multi-season file first, then the updated season file.
nba_all, team_stats = (
    pd.read_csv(csv_path) for csv_path in (nba_all_path, team_stats_path)
)

# Date normalisation helper
def parse_dates(df, col="gameDate"):
    """Return a copy of *df* with *col* parsed to timezone-naive datetimes.

    Values are parsed as UTC and the timezone is then dropped, so inputs
    carrying a UTC offset end up expressed in UTC wall-clock time.
    Unparseable values become NaT. If *col* is not a column of *df*, the
    frame is returned as-is (same object, no copy).
    """
    if col in df.columns:
        as_utc = pd.to_datetime(df[col], errors="coerce", utc=True)
        result = df.copy()
        result[col] = as_utc.dt.tz_convert(None)
        return result
    return df

# Parse the game dates of both sources before reporting on them.
nba_all = parse_dates(nba_all, "gameDate")
team_stats = parse_dates(team_stats, "gameDate")

# One summary per source: shape, columns, date span, a sample of the season
# column (the two files spell that column differently), and the first rows.
for header, frame, season_col, sample_label in (
    ("\n=== NBA_API_QEPC_Format.csv ===", nba_all, "Season", "Season unique sample:"),
    ("\n=== Team_Stats.csv ===", team_stats, "season", "season unique sample:"),
):
    print(header)
    print("Shape:", frame.shape)
    print("Columns:", list(frame.columns))
    game_dates = frame["gameDate"]
    print("Date range:", game_dates.min(), "‚Üí", game_dates.max())
    if season_col in frame.columns:
        print(sample_label, sorted(frame[season_col].astype(str).unique())[:10])

    display(frame.head())


In [None]:
print("=== Normalizing seasons (Team_Stats) and aligning columns ===")

# --- 1) Normalize seasons for Team_Stats ---

# Work on a copy so the loaded Team_Stats frame stays untouched.
ts = team_stats.copy()

# Re-parse gameDate to timezone-naive datetimes and discard rows whose date
# could not be parsed.
ts_dates_utc = pd.to_datetime(ts["gameDate"], errors="coerce", utc=True)
ts["gameDate"] = ts_dates_utc.dt.tz_convert(None)
ts = ts.dropna(subset=["gameDate"]).copy()

# Numeric 'season' is simply the calendar year of the game date.
ts["season"] = ts["gameDate"].dt.year

# Derive a season label such as "2025-26" from a game date
def season_label(dt: pd.Timestamp) -> str:
    """Return the "YYYY-YY" season label for the game date *dt*.

    Dates from July through December belong to the season that starts in
    that calendar year; dates from January through June belong to the
    season that started the previous year. The second part is the
    two-digit year after the start year (so 1999 yields "1999-00").
    A missing date (NaT/NaN) yields NaN instead of a string.
    """
    if pd.isna(dt):
        return np.nan
    first_year = dt.year if dt.month >= 7 else dt.year - 1
    second_year_short = (first_year + 1) % 100
    return f"{first_year}-{second_year_short:02d}"

# Attach the "YYYY-YY" label computed from each row's game date.
ts["Season"] = ts["gameDate"].apply(season_label)

season_years = ts["season"]
print("Team_Stats season range:", season_years.min(), "‚Üí", season_years.max())
season_label_sample = sorted(ts["Season"].dropna().unique())[:10]
print("Team_Stats Season labels sample:", season_label_sample)
display(ts.head())

# --- 2) Align columns and merge with NBA_API_QEPC_Format ---

print("\n=== Aligning columns and merging ===")

# Every column that appears in either source, in sorted order.
all_cols = sorted(set(nba_all.columns).union(ts.columns))
print("Total unified columns:", len(all_cols))

def align_columns(df, cols):
    """Return a copy of *df* restricted to *cols*, in that order.

    Any name in *cols* that *df* does not have is added as an all-NaN
    column. The input frame is not modified.
    """
    out = df.copy()
    absent = [name for name in cols if name not in out.columns]
    for name in absent:
        out[name] = np.nan
    return out[cols]

# Give both sources the same column set/order so they can be stacked.
nba_all_aligned = align_columns(nba_all, all_cols)
ts_aligned = align_columns(ts, all_cols)

print("nba_all_aligned shape:", nba_all_aligned.shape)
print("ts_aligned shape:", ts_aligned.shape)

# NBA_API_QEPC_Format rows come first, so with keep="first" below they win
# over Team_Stats rows that share the same (gameId, teamId).
combined = pd.concat([nba_all_aligned, ts_aligned], ignore_index=True)
print("Combined shape before de-dup:", combined.shape)

# De-duplicate by (gameId, teamId) when available
key_cols = ["gameId", "teamId"]
if set(key_cols).issubset(combined.columns):
    before = len(combined)
    # Only rows with BOTH keys present can be identified as the same
    # team-game. Column alignment fills a key that a source lacks with NaN,
    # and pandas treats NaN keys as equal to each other, so de-duplicating
    # those rows would silently collapse unrelated records. Keep them all.
    has_full_key = combined[key_cols].notna().all(axis=1)
    is_duplicate = has_full_key & combined.duplicated(subset=key_cols, keep="first")
    combined = combined[~is_duplicate]
    after = len(combined)
    print(f"De-duplicated by (gameId, teamId): {before} ‚Üí {after}")
    rows_without_key = int((~has_full_key).sum())
    if rows_without_key:
        print(
            f"Note: {rows_without_key} row(s) are missing gameId or teamId "
            "and were kept without de-duplication"
        )
else:
    print("‚ö†Ô∏è Missing 'gameId' or 'teamId'; skipping de-duplication by key")

print("Combined shape after de-dup:", combined.shape)
print("Date range:", combined["gameDate"].min(), "‚Üí", combined["gameDate"].max())
if "Season" in combined.columns:
    print("Season labels sample:", sorted(combined["Season"].dropna().astype(str).unique())[:15])

display(combined.tail())


In [None]:
import shutil

WRITE_CHANGES = True  # set False if you just want a dry run

all_seasons_path = raw_root / "NBA_Team_Logs_All_Seasons.csv"

if WRITE_CHANGES:
    # Backups: copy the source files themselves. Re-serialising the loaded
    # DataFrames would not be a faithful backup, because their gameDate
    # column has already been coerced (unparseable values -> NaT) and
    # stripped of timezone information.
    # NOTE: each run overwrites the previous backup files.
    nba_all_backup = nba_all_path.with_suffix(".backup_before_merge.csv")
    ts_backup = team_stats_path.with_suffix(".backup_before_merge.csv")

    print(f"Backing up NBA_API_QEPC_Format.csv to: {nba_all_backup}")
    shutil.copy2(nba_all_path, nba_all_backup)

    print(f"Backing up Team_Stats.csv to: {ts_backup}")
    shutil.copy2(team_stats_path, ts_backup)

    # Write merged all-seasons file
    print(f"Writing merged all-seasons logs to: {all_seasons_path}")
    combined.to_csv(all_seasons_path, index=False)

    # OPTIONAL: overwrite NBA_API_QEPC_Format.csv with the merged version
    OVERWRITE_ORIGINAL = False  # flip to True if you want to replace it

    if OVERWRITE_ORIGINAL:
        print(f"Also overwriting {nba_all_path} with merged data.")
        combined.to_csv(nba_all_path, index=False)

    # Read the merged file back to confirm what actually landed on disk.
    print("‚úÖ Done. Verifying written all-seasons file...")
    check = pd.read_csv(all_seasons_path)
    check["gameDate"] = pd.to_datetime(check["gameDate"], errors="coerce", utc=True).dt.tz_convert(None)
    print("all-seasons shape:", check.shape)
    print("all-seasons date range:", check["gameDate"].min(), "‚Üí", check["gameDate"].max())
    display(check.tail())
else:
    print("WRITE_CHANGES=False ‚Üí no files written.")
