In [None]:
from pathlib import Path
import pandas as pd

# --- 1) Locate project root and canonical logs file ---

# Prefer the project's own root detection. If qepc cannot be imported, or the
# detection call itself fails, fall back to a hard-coded local checkout so the
# notebook can still run (deliberate best-effort, hence the broad except).
try:
    from qepc.config import detect_project_root
    project_root = Path(detect_project_root())
except Exception as exc:
    project_root = Path(r"C:\Users\wdors\qepc_project")
    # Report the fallback instead of hiding it: the path is machine-specific,
    # so a later "file not found" would otherwise be hard to trace back here.
    print(
        f"⚠️ detect_project_root unavailable ({type(exc).__name__}: {exc}); "
        f"falling back to: {project_root}"
    )

canonical_logs_path = project_root / "data" / "raw" / "NBA_Player_Logs_All_Seasons.csv"
print("Loading canonical player logs from:\n  ", canonical_logs_path)

# low_memory=False makes pandas infer dtypes from the whole file at once
# rather than chunk by chunk (see the pandas read_csv documentation).
df = pd.read_csv(canonical_logs_path, low_memory=False)

# --- 2) Ensure gameDate & Season exist and are parsed ---

# Accept either spelling of the date column; "gameDate" takes precedence
# when both are present.
date_source = next(
    (name for name in ("gameDate", "GAME_DATE") if name in df.columns),
    None,
)
if date_source is None:
    raise ValueError("Could not find gameDate/GAME_DATE column in player logs.")

# errors="coerce": values that cannot be parsed become NaT instead of raising.
df["gameDate"] = pd.to_datetime(df[date_source], errors="coerce")

# The season filter further down needs this column, so fail early without it.
if "Season" not in df.columns:
    raise ValueError("Expected a 'Season' column (e.g. '2020-21') in canonical logs.")

print("\nCanonical shape:", df.shape)
print("Date range:", df["gameDate"].min(), "→", df["gameDate"].max())

# --- 3) Keep only the last 5 seasons ---

# Distinct, non-null season labels in ascending order as given by sorted().
season_values = df["Season"].dropna().unique()
all_seasons = sorted(season_values)
print("\nAll seasons found:", all_seasons)

# "Last 5" means the five greatest labels in that ordering; with fewer than
# five seasons available the slice simply keeps them all.
keep_seasons = all_seasons[-5:]
print("Keeping seasons:", keep_seasons)

# .copy() detaches the filtered frame from the original one.
season_mask = df["Season"].isin(keep_seasons)
df = df.loc[season_mask].copy()
print("After season filter:", df.shape)

# --- 4) Build a lean schema for props modeling ---

# Keys are the column names looked up in the canonical frame; values are the
# names those columns carry in the lean output.
column_map = {
    # identity & context
    "gameId": "gameId",
    "gameDate": "gameDate",
    "Season": "Season",
    "playerId": "playerId",
    "playerName": "playerName",
    "teamId": "teamId",
    "teamAbbrev": "teamAbbrev",
    "teamName": "teamName",
    "opponentTeamAbbrev": "opponentTeamAbbrev",
    "home": "home",
    "win": "win",
    # core boxscore
    "minutes": "minutes",
    "pts": "pts",
    "reb": "reboundsTotal",
    "ast": "assists",
    "stl": "steals",
    "blk": "blocks",
    "tov": "turnovers",
    "pf": "foulsPersonal",
    "fgm": "fieldGoalsMade",
    "fga": "fieldGoalsAttempted",
    "fg_pct": "fieldGoalsPercentage",
    "fg3m": "threePointersMade",
    "fg3a": "threePointersAttempted",
    "fg3_pct": "threePointersPercentage",
    "ftm": "freeThrowsMade",
    "fta": "freeThrowsAttempted",
    "ft_pct": "freeThrowsPercentage",
    "plus_minus": "plusMinusPoints",
}

# Source columns the canonical file does not have are reported and removed
# from the mapping, so the selection below only asks for columns that exist.
missing = [src for src in column_map if src not in df.columns]
if missing:
    print("\n⚠️ Missing source columns in canonical file:", missing)
    for m in missing:
        column_map.pop(m)

# Select the surviving source columns, then apply the output names.
lean_cols = list(column_map)
df_lean = df.loc[:, lean_cols].rename(columns=column_map)

# Nice ordering for modeling: sort by whichever of the key columns made it
# into the lean frame, in date -> game -> player priority.
sort_cols = [
    key for key in ("gameDate", "gameId", "playerId") if key in df_lean.columns
]
if sort_cols:
    df_lean = df_lean.sort_values(sort_cols)

print("\nLean shape:", df_lean.shape)
print("Sample:")
display(df_lean.head())

# --- 5) Save into the experimental notebooks/data folder ---

# The output lands in a "data" folder under the current working directory;
# the notebook is expected to be run from qepc_core/notebooks.
notebooks_root = Path.cwd()
data_local = notebooks_root / "data"
# Create the folder on first run; an existing one is left untouched.
data_local.mkdir(exist_ok=True)

out_path = data_local / "player_logs_5yr_lean.csv"
# index=False keeps the DataFrame index out of the CSV.
df_lean.to_csv(out_path, index=False)

print("\n✅ Wrote lean player logs to (EXPERIMENTAL):")
print(f"   {out_path}")
