PROJECT_ROOT: C:\Users\wdors\qepc_project
qepc in root? True


In [2]:
from qepc.brain.games_loader import fetch_league_games, build_games_table
from qepc.brain.boxscores_loader import fetch_boxscores_for_games

team_games = fetch_league_games("2023-24")
games = build_games_table(team_games)

# just 3 games to test
game_ids = games["GAME_ID"].unique().tolist()[:3]

trad_df, adv_df = fetch_boxscores_for_games(game_ids, sleep_seconds=0.8, verbose=True)

trad_df.head(), adv_df.head()


[games_loader] Fetching LeagueGameLog for season=2023-24, season_type=Regular Season...
[games_loader] Retrieved 2460 team-games.
[boxscores_loader] (1) Fetching boxscores for GAME_ID=0022300061...
[boxscores_loader] (2) Fetching boxscores for GAME_ID=0022300062...
[boxscores_loader] (3) Fetching boxscores for GAME_ID=0022300070...


(       gameId      teamId teamCity teamName teamTricode teamSlug  personId  \
 0  0022300061  1610612743   Denver  Nuggets         DEN  nuggets   1629008   
 1  0022300061  1610612743   Denver  Nuggets         DEN  nuggets    203932   
 2  0022300061  1610612743   Denver  Nuggets         DEN  nuggets    203999   
 3  0022300061  1610612743   Denver  Nuggets         DEN  nuggets    203484   
 4  0022300061  1610612743   Denver  Nuggets         DEN  nuggets   1627750   
 
     firstName     familyName             nameI  ... reboundsDefensive  \
 0     Michael     Porter Jr.     M. Porter Jr.  ...                10   
 1       Aaron         Gordon         A. Gordon  ...                 5   
 2      Nikola          Jokić          N. Jokić  ...                10   
 3  Kentavious  Caldwell-Pope  K. Caldwell-Pope  ...                 1   
 4       Jamal         Murray         J. Murray  ...                 2   
 
   reboundsTotal assists steals blocks  turnovers  foulsPersonal  points  \
 0

In [8]:
import pandas as pd

def normalize_boxscore_cols(df: pd.DataFrame) -> pd.DataFrame:
    """
    Make the most important ID columns consistent and upper-snake-case.
    Avoid creating duplicate columns if GAME_ID is already present.
    """
    df = df.copy()

    # If GAME_ID already exists (added in boxscores_loader),
    # we don't want to rename gameId -> GAME_ID because that would duplicate it.
    if "GAME_ID" in df.columns and "gameId" in df.columns:
        df = df.drop(columns=["gameId"])

    # Rename ID columns only if they exist and won't conflict
    rename_map = {}
    if "gameId" in df.columns and "GAME_ID" not in df.columns:
        rename_map["gameId"] = "GAME_ID"
    if "teamId" in df.columns and "TEAM_ID" not in df.columns:
        rename_map["teamId"] = "TEAM_ID"
    if "personId" in df.columns and "PLAYER_ID" not in df.columns:
        rename_map["personId"] = "PLAYER_ID"

    if rename_map:
        df.rename(columns=rename_map, inplace=True)

    return df


In [9]:
trad_norm = normalize_boxscore_cols(trad_df)
adv_norm = normalize_boxscore_cols(adv_df)

display(trad_norm.head())
display(adv_norm.head())


Unnamed: 0,TEAM_ID,teamCity,teamName,teamTricode,teamSlug,PLAYER_ID,firstName,familyName,nameI,playerSlug,...,reboundsDefensive,reboundsTotal,assists,steals,blocks,turnovers,foulsPersonal,points,plusMinusPoints,GAME_ID
0,1610612743,Denver,Nuggets,DEN,nuggets,1629008,Michael,Porter Jr.,M. Porter Jr.,michael-porter-jr,...,10,12,2,2,0,0,1,12,12.0,22300061
1,1610612743,Denver,Nuggets,DEN,nuggets,203932,Aaron,Gordon,A. Gordon,aaron-gordon,...,5,7,5,2,1,0,0,15,6.0,22300061
2,1610612743,Denver,Nuggets,DEN,nuggets,203999,Nikola,Jokić,N. Jokić,nikola-jokić,...,10,13,11,1,1,2,2,29,15.0,22300061
3,1610612743,Denver,Nuggets,DEN,nuggets,203484,Kentavious,Caldwell-Pope,K. Caldwell-Pope,kentavious-caldwell-pope,...,1,2,1,3,1,3,5,20,10.0,22300061
4,1610612743,Denver,Nuggets,DEN,nuggets,1627750,Jamal,Murray,J. Murray,jamal-murray,...,2,2,6,0,1,1,3,21,3.0,22300061


Unnamed: 0,TEAM_ID,teamCity,teamName,teamTricode,teamSlug,PLAYER_ID,firstName,familyName,nameI,playerSlug,...,effectiveFieldGoalPercentage,trueShootingPercentage,usagePercentage,estimatedUsagePercentage,estimatedPace,pace,pacePer40,possessions,PIE,GAME_ID
0,1610612747,Los Angeles,Lakers,LAL,lakers,1627752,Taurean,Prince,T. Prince,taurean-prince,...,1.0,1.014,0.133,0.133,102.0,102.0,85.0,63.0,0.128,22300061
1,1610612747,Los Angeles,Lakers,LAL,lakers,2544,LeBron,James,L. James,lebron-james,...,0.656,0.639,0.225,0.225,97.62,97.62,81.35,60.0,0.189,22300061
2,1610612747,Los Angeles,Lakers,LAL,lakers,203076,Anthony,Davis,A. Davis,anthony-davis,...,0.382,0.453,0.259,0.259,100.5,100.5,83.75,71.0,0.082,22300061
3,1610612747,Los Angeles,Lakers,LAL,lakers,1630559,Austin,Reaves,A. Reaves,austin-reaves,...,0.409,0.497,0.203,0.203,102.64,102.64,85.53,67.0,0.088,22300061
4,1610612747,Los Angeles,Lakers,LAL,lakers,1626156,D'Angelo,Russell,D. Russell,dangelo-russell,...,0.417,0.427,0.186,0.186,96.84,96.84,80.7,74.0,0.047,22300061


In [10]:
from pathlib import Path

PROJECT_ROOT = Path.cwd().resolve().parents[1]
out_dir = PROJECT_ROOT / "data" / "raw" / "nba" / "boxscores"
out_dir.mkdir(parents=True, exist_ok=True)

trad_norm.to_parquet(out_dir / "boxscores_traditional_2023-24_sample.parquet", index=False)
adv_norm.to_parquet(out_dir / "boxscores_advanced_2023-24_sample.parquet", index=False)

print("Saved to:", out_dir)


Saved to: C:\Users\wdors\qepc_project\data\raw\nba\boxscores
