In [1]:
# 02_build_player_logs_all_seasons.ipynb
# Goal: Canonical "NBA_Player_Logs_All_Seasons.csv" for QEPC.

from pathlib import Path
import sys

import numpy as np
import pandas as pd

print("=== QEPC Player Logs Bootstrap ===")

# Look for a directory literally named "qepc_core" among the working directory
# and its ancestors. At most eight directories are inspected, starting with
# the working directory itself.
cwd = Path.cwd()
search_dirs = (cwd, *cwd.parents)[:8]
core_root = next(
    (folder for folder in search_dirs if folder.name == "qepc_core"),
    None,
)

if core_root is None:
    raise RuntimeError(f"Could not find qepc_core above {cwd}")

# qepc_core is placed first on sys.path (only if absent) ahead of the
# `qepc` imports further down.
core_str = str(core_root)
if core_str not in sys.path:
    sys.path.insert(0, core_str)

# The repo root is taken to be three directory levels above qepc_core and is
# appended to sys.path (only if absent).
repo_root = core_root
for _ in range(3):
    repo_root = repo_root.parent
repo_str = str(repo_root)
if repo_str not in sys.path:
    sys.path.append(repo_str)

print("qepc_core root:", core_root)
print("repo root:     ", repo_root)

# These imports must stay below the sys.path changes above.
import qepc
from qepc.config import detect_project_root, QEPCConfig

# Build the project configuration; later cells read `cfg.raw_root`.
project_root = detect_project_root()
cfg = QEPCConfig.from_project_root(project_root)

print("project_root:", project_root)
print("raw_root:    ", cfg.raw_root)
print("=== Bootstrap OK ===")


=== QEPC Player Logs Bootstrap ===
qepc_core root: C:\Users\wdors\qepc_project\experimental\GTP_REWRITE\qepc_core
repo root:      C:\Users\wdors\qepc_project
project_root: C:\Users\wdors\qepc_project
raw_root:     C:\Users\wdors\qepc_project\data\raw
=== Bootstrap OK ===


In [2]:
# Locate the best raw player-game file

raw_root = cfg.raw_root
print("raw_root:", raw_root)

# Candidate file names, highest priority first.
candidate_names = [
    "NBA_Player_Logs_All_Seasons.csv",   # if you already have a canonical-ish one
    "Player_Game_Logs_All_Seasons.csv",  # big NBA API export
    "PlayerStatistics.csv",              # fallback
]

# Keep only the candidates that exist under raw_root, preserving priority order.
found_paths = [
    candidate
    for candidate in (raw_root / file_name for file_name in candidate_names)
    if candidate.exists()
]

if len(found_paths) == 0:
    raise FileNotFoundError(
        f"No player logs file found in {raw_root}. "
        f"Tried: {candidate_names}"
    )

# The first existing candidate wins (priority = position in candidate_names).
source_path = found_paths[0]
print("Using source player logs file:", source_path)

# low_memory=False makes pandas read the file in one pass instead of chunked
# type inference.
df_raw = pd.read_csv(source_path, low_memory=False)
print("Raw shape:", df_raw.shape)
print("Raw columns (first 20):", list(df_raw.columns[:20]))

display(df_raw.head())


raw_root: C:\Users\wdors\qepc_project\data\raw
Using source player logs file: C:\Users\wdors\qepc_project\data\raw\Player_Game_Logs_All_Seasons.csv
Raw shape: (254187, 71)
Raw columns (first 20): ['SEASON_YEAR', 'PLAYER_ID', 'PLAYER_NAME', 'NICKNAME', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID', 'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA']


Unnamed: 0,SEASON_YEAR,PLAYER_ID,PLAYER_NAME,NICKNAME,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,...,PTS_RANK,PLUS_MINUS_RANK,NBA_FANTASY_PTS_RANK,DD2_RANK,TD3_RANK,WNBA_FANTASY_PTS_RANK,AVAILABLE_FLAG,MIN_SEC,TEAM_COUNT,Season
0,2014-15,201563,Michael Beasley,Michael,1610612748,MIA,Miami Heat,21401227,2015-04-15T00:00:00,MIA @ PHI,...,128,8030,42,1,47,67,1.0,48:00,1,2014-15
1,2014-15,201566,Russell Westbrook,Russell,1610612760,OKC,Oklahoma City Thunder,21401222,2015-04-15T00:00:00,OKC @ MIN,...,78,554,175,1991,47,100,1.0,31:57,1,2014-15
2,2014-15,203076,Anthony Davis,Anthony,1610612740,NOP,New Orleans Pelicans,21401223,2015-04-15T00:00:00,NOP vs. SAS,...,271,4003,191,1,47,141,1.0,43:11,1,2014-15
3,2014-15,201188,Marc Gasol,Marc,1610612763,MEM,Memphis Grizzlies,21401220,2015-04-15T00:00:00,MEM vs. IND,...,175,10900,261,1,47,264,1.0,37:42,1,2014-15
4,2014-15,203079,Dion Waiters,Dion,1610612760,OKC,Oklahoma City Thunder,21401222,2015-04-15T00:00:00,OKC @ MIN,...,175,679,689,1991,47,312,1.0,35:45,1,2014-15


In [3]:
# Canonicalize raw player game logs into QEPC schema

# Work on a copy so df_raw stays untouched for re-runs of this cell.
df = df_raw.copy()

# 1) Standardize date & season
# GAME_DATE looks like "2015-04-15T00:00:00"; unparseable values become NaT.
date_source = next(
    (col for col in ("GAME_DATE", "gameDate") if col in df.columns),
    None,
)
if date_source is None:
    raise KeyError("No GAME_DATE / gameDate column found in player logs.")
df["gameDate"] = pd.to_datetime(df[date_source], errors="coerce")

# Season string: prefer "Season", then "SEASON_YEAR", else the calendar year
# of the game date.
season_source = next(
    (col for col in ("Season", "SEASON_YEAR") if col in df.columns),
    None,
)
if season_source is not None:
    df["Season"] = df[season_source].astype(str)
else:
    df["Season"] = df["gameDate"].dt.year.astype(str)  # fallback

# 2) IDs and names, then 3) matchup and WL.
# Each canonical column is copied from the first source column that exists;
# when neither exists, df.get yields None and the column is filled with None.
column_sources = {
    "gameId": ("GAME_ID", "gameId"),
    "playerId": ("PLAYER_ID", "personId"),
    "playerName": ("PLAYER_NAME", "fullName"),
    "teamId": ("TEAM_ID", "teamId"),
    "teamAbbrev": ("TEAM_ABBREVIATION", "teamAbbrev"),
    "teamName": ("TEAM_NAME", "teamName"),
    "matchup": ("MATCHUP", "matchup"),
    "WL": ("WL", "win"),
}
for canon_name, (preferred, fallback) in column_sources.items():
    df[canon_name] = df.get(preferred, df.get(fallback))

def infer_home_flag(matchup: str) -> float:
    """Derive a home/away flag from a matchup string.

    Examples: "MIA @ PHI" => away (0.0); "NOP vs. SAS" => home (1.0).

    Returns:
        0.0 when the text contains "@ ", 1.0 when it contains "vs." or "vs ",
        and NaN for non-string input or text with neither marker. The "@ "
        check takes precedence when both markers are present.
    """
    if isinstance(matchup, str):
        if "@ " in matchup:
            # '@' means this team is away
            return 0.0
        if any(marker in matchup for marker in ("vs.", "vs ")):
            # 'vs.' means this team is home
            return 1.0
    return np.nan

def infer_opponent_abbrev(matchup: str, team_abbrev: str) -> str:
    """Extract the opponent's abbreviation from a matchup string.

    Typical inputs are "MIA @ PHI" or "NOP vs. SAS", which tokenise to
    ["MIA", "@", "PHI"] / ["NOP", "vs", "SAS"].

    Args:
        matchup: Matchup text containing two team abbreviations separated by
            "@", "vs." or "vs".
        team_abbrev: Abbreviation of the team this row belongs to.

    Returns:
        The opponent abbreviation, or NaN when either argument is not a
        string or the matchup has fewer than three tokens.
    """
    if not isinstance(matchup, str) or not isinstance(team_abbrev, str):
        return np.nan

    # Pad the separators with spaces so "MIA@PHI" and "NOP vs.SAS" still split
    # into three tokens; str.split() collapses the extra whitespace.
    tokens = matchup.replace("vs.", " vs ").replace("@", " @ ").split()
    if len(tokens) < 3:
        return np.nan

    first, third = tokens[0], tokens[2]
    # Normally the row's own team is listed first and the opponent third.
    # If the own team is instead listed third, the opponent is the first token.
    if third == team_abbrev and first != team_abbrev:
        return first
    return third

# Home flag comes from the matchup text alone; the opponent abbreviation is
# computed row by row from the matchup text paired with the team abbreviation.
df["home"] = df["matchup"].apply(infer_home_flag)
df["opponentTeamAbbrev"] = list(
    map(infer_opponent_abbrev, df["matchup"], df["teamAbbrev"])
)

# win flag from WL if present
def wl_to_win_flag(wl: str) -> float:
    """Convert a win/loss marker into a float flag.

    The caller falls back to a `win` column when no `WL` column exists, so
    the input may be a numeric or boolean 0/1 flag rather than a "W"/"L"
    string. Both forms are accepted.

    Args:
        wl: "W"/"L" (any case, surrounding whitespace ignored), or a
            numeric/boolean value equal to 1 or 0.

    Returns:
        1.0 for a win, 0.0 for a loss, NaN for anything else (including
        None, NaN, other strings and other numbers).
    """
    if isinstance(wl, str):
        code = wl.strip().upper()
        if code == "W":
            return 1.0
        if code == "L":
            return 0.0
        return np.nan
    if isinstance(wl, (bool, np.bool_)):
        return 1.0 if wl else 0.0
    if isinstance(wl, (int, float, np.integer, np.floating)):
        # NaN compares unequal to both 1 and 0 and so falls through to NaN.
        if wl == 1:
            return 1.0
        if wl == 0:
            return 0.0
    return np.nan

# Win flag is derived from the WL column.
df["win"] = df["WL"].apply(wl_to_win_flag)

# 4) Core box score stats
# canonical name -> (preferred source column, fallback source column).
# When neither source exists, df.get yields None and the column is all None.
box_score_sources = {
    "minutes": ("MIN", "minutes"),
    "fgm": ("FGM", "fgm"),
    "fga": ("FGA", "fga"),
    "fg_pct": ("FG_PCT", "fg_pct"),
    "fg3m": ("FG3M", "fg3m"),
    "fg3a": ("FG3A", "fg3a"),
    "fg3_pct": ("FG3_PCT", "fg3_pct"),
    "ftm": ("FTM", "ftm"),
    "fta": ("FTA", "fta"),
    "ft_pct": ("FT_PCT", "ft_pct"),
    "oreb": ("OREB", "reboundsOffensive"),
    "dreb": ("DREB", "reboundsDefensive"),
    "reb": ("REB", "reboundsTotal"),
    "ast": ("AST", "assists"),
    "stl": ("STL", "steals"),
    "blk": ("BLK", "blocks"),
    "tov": ("TOV", "turnovers"),
    "pf": ("PF", "foulsPersonal"),
    "pts": ("PTS", "points"),
    "plus_minus": ("PLUS_MINUS", "plusMinusPoints"),
}
for stat_name, (preferred, fallback) in box_score_sources.items():
    df[stat_name] = df.get(preferred, df.get(fallback))

# 5) Build final canonical frame
# Column order: identity, game context, box score stats (in the order they
# are declared above), then the raw matchup text.
identity_cols = [
    "gameId", "gameDate", "Season",
    "playerId", "playerName",
    "teamId", "teamAbbrev", "teamName",
]
context_cols = ["opponentTeamAbbrev", "home", "win"]
canonical_cols = [*identity_cols, *context_cols, *box_score_sources, "matchup"]

df_canon = df[canonical_cols].copy()

# Dtype cleanup: game ids become strings; player/team ids become nullable
# integers, with unparseable values turned into <NA>.
df_canon["gameId"] = df_canon["gameId"].astype(str)
for id_col in ("playerId", "teamId"):
    df_canon[id_col] = pd.to_numeric(df_canon[id_col], errors="coerce").astype("Int64")

print("Canonical player logs shape:", df_canon.shape)
print("Date range:", df_canon["gameDate"].min(), "→", df_canon["gameDate"].max())
print("Seasons:", sorted(df_canon["Season"].dropna().unique()))

display(df_canon.head())


Canonical player logs shape: (254187, 32)
Date range: 2014-10-28 00:00:00 → 2024-04-14 00:00:00
Seasons: ['2014-15', '2015-16', '2016-17', '2017-18', '2018-19', '2019-20', '2020-21', '2021-22', '2022-23', '2023-24']


Unnamed: 0,gameId,gameDate,Season,playerId,playerName,teamId,teamAbbrev,teamName,opponentTeamAbbrev,home,...,dreb,reb,ast,stl,blk,tov,pf,pts,plus_minus,matchup
0,21401227,2015-04-15,2014-15,201563,Michael Beasley,1610612748,MIA,Miami Heat,PHI,0.0,...,9,11,8,2,2,3,4,34,4,MIA @ PHI
1,21401222,2015-04-15,2014-15,201566,Russell Westbrook,1610612760,OKC,Oklahoma City Thunder,MIN,0.0,...,8,8,7,2,0,4,2,37,23,OKC @ MIN
2,21401223,2015-04-15,2014-15,203076,Anthony Davis,1610612740,NOP,New Orleans Pelicans,SAS,1.0,...,11,13,2,2,3,6,1,31,10,NOP vs. SAS
3,21401220,2015-04-15,2014-15,201188,Marc Gasol,1610612763,MEM,Memphis Grizzlies,IND,1.0,...,8,13,0,2,1,1,3,33,1,MEM vs. IND
4,21401222,2015-04-15,2014-15,203079,Dion Waiters,1610612760,OKC,Oklahoma City Thunder,MIN,0.0,...,1,4,1,3,1,2,2,33,22,OKC @ MIN


In [4]:
# Basic sanity checks on canonical player logs

# Columns whose missing values are counted below.
key_cols = [
    "gameId", "gameDate", "Season",
    "playerId", "playerName",
    "teamId", "teamAbbrev", "teamName",
    "minutes", "pts",
]

print("Null counts for key columns:")
null_counts = df_canon[key_cols].isna().sum()
print(null_counts)

# Per season: number of distinct games and number of non-null player rows.
print("\nSample by season:")
season_summary = df_canon.groupby("Season").agg(
    games=("gameId", "nunique"),
    player_rows=("playerId", "count"),
)
display(season_summary.head(15))

# Fixed random_state keeps the displayed sample stable between runs.
print("\nSample rows:")
display(df_canon.sample(n=10, random_state=42))


Null counts for key columns:
gameId        0
gameDate      0
Season        0
playerId      0
playerName    0
teamId        0
teamAbbrev    0
teamName      0
minutes       0
pts           0
dtype: int64

Sample by season:


Unnamed: 0_level_0,games,player_rows
Season,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-15,1230,25981
2015-16,1230,26078
2016-17,1230,26139
2017-18,1230,26107
2018-19,1230,26101
2019-20,1059,22393
2020-21,1080,23054
2021-22,1230,26039
2022-23,1230,25894
2023-24,1230,26401



Sample rows:


Unnamed: 0,gameId,gameDate,Season,playerId,playerName,teamId,teamAbbrev,teamName,opponentTeamAbbrev,home,...,dreb,reb,ast,stl,blk,tov,pf,pts,plus_minus,matchup
135765,21900806,2020-02-11,2019-20,1627863,Danuel House Jr.,1610612745,HOU,Houston Rockets,BOS,1.0,...,6,9,1,0,0,1,2,17,15,HOU vs. BOS
128564,21800086,2018-10-28,2018-19,101162,Marcin Gortat,1610612746,LAC,LA Clippers,WAS,1.0,...,6,8,3,0,3,1,3,4,5,LAC vs. WAS
112393,21800844,2019-02-11,2018-19,202702,Kenneth Faried,1610612745,HOU,Houston Rockets,DAL,1.0,...,6,8,1,1,1,1,3,17,22,HOU vs. DAL
129037,21800063,2018-10-25,2018-19,1628379,Luke Kennard,1610612765,DET,Detroit Pistons,CLE,1.0,...,0,0,1,1,0,0,0,8,5,DET vs. CLE
199407,22100111,2021-11-03,2021-22,1627936,Alex Caruso,1610612741,CHI,Chicago Bulls,PHI,0.0,...,3,3,6,2,0,0,4,6,-1,CHI @ PHI
216512,22200529,2022-12-30,2022-23,1630692,Jordan Goodwin,1610612764,WAS,Washington Wizards,ORL,0.0,...,1,1,1,0,0,0,0,0,0,WAS @ ORL
114808,21800733,2019-01-26,2018-19,101161,Amir Johnson,1610612755,PHI,Philadelphia 76ers,DEN,0.0,...,3,4,4,1,1,2,5,5,-4,PHI @ DEN
151629,21900052,2019-10-29,2019-20,1628389,Bam Adebayo,1610612748,MIA,Miami Heat,ATL,1.0,...,8,10,3,2,3,2,4,17,27,MIA vs. ATL
202964,22201182,2023-04-04,2022-23,203083,Andre Drummond,1610612741,CHI,Chicago Bulls,ATL,1.0,...,3,4,0,0,2,0,0,2,0,CHI vs. ATL
41130,21500508,2016-01-03,2015-16,2547,Chris Bosh,1610612748,MIA,Miami Heat,WAS,0.0,...,6,7,2,1,1,1,0,23,19,MIA @ WAS


In [5]:
# Write canonical player logs to disk (opt-in)

# Destination of the canonical file, plus the sibling path an existing file is
# moved to before being overwritten
# ("NBA_Player_Logs_All_Seasons.backup_before_rebuild.csv").
target_path = raw_root / "NBA_Player_Logs_All_Seasons.csv"
backup_path = target_path.with_suffix(".backup_before_rebuild.csv")

WRITE_CHANGES = False  # <-- set to True when ready

if WRITE_CHANGES:
    if target_path.exists():
        print(f"Backing up existing file to: {backup_path}")
        # Path.replace overwrites an existing backup on every platform.
        # Path.rename raises FileExistsError on Windows when the backup from a
        # previous run is still there, which would abort the rebuild.
        target_path.replace(backup_path)

    print(f"Writing canonical player logs to: {target_path}")
    df_canon.to_csv(target_path, index=False)

    # Reload to sanity check that the written file parses and the dates survive.
    df_check = pd.read_csv(target_path, low_memory=False, parse_dates=["gameDate"])
    print("Reloaded shape:", df_check.shape)
    print("Reloaded date range:", df_check["gameDate"].min(), "→", df_check["gameDate"].max())
else:
    print("WRITE_CHANGES=False → dry run only, no file written.")


WRITE_CHANGES=False → dry run only, no file written.
