In [1]:
# ==========================================================
# CELL 0 – PORTABLE PROJECT ROOT + RAW / CACHE PATHS
# ==========================================================
from pathlib import Path
import sys

def find_project_root(marker: str = "qepc_project") -> Path:
    """
    Walk up from the current working directory until we find a folder
    named `marker` (default: 'qepc_project').

    This works on any machine/user as long as you start Jupyter
    somewhere *inside* the repo.
    """
    here = Path.cwd().resolve()
    for p in [here] + list(here.parents):
        if p.name == marker:
            return p
    raise FileNotFoundError(
        f"Could not find a folder named '{marker}' in the current path hierarchy.\n"
        f"Start Jupyter from inside your qepc_project folder or adjust the marker."
    )

PROJECT_ROOT = find_project_root("qepc_project")

# Make sure Python can import the qepc package
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

RAW_EOIN = PROJECT_ROOT / "data" / "raw" / "nba" / "eoin"
RAW_KAGGLE = PROJECT_ROOT / "data" / "raw" / "nba" / "kaggle"
CACHE_IMPORTS = PROJECT_ROOT / "cache" / "imports"

for path in [RAW_EOIN, RAW_KAGGLE, CACHE_IMPORTS]:
    path.mkdir(parents=True, exist_ok=True)

print("PROJECT_ROOT:", PROJECT_ROOT)
print("RAW_EOIN:", RAW_EOIN)
print("RAW_KAGGLE:", RAW_KAGGLE)
print("CACHE_IMPORTS:", CACHE_IMPORTS)


PROJECT_ROOT: C:\Users\wdorsey\qepc_project
RAW_EOIN: C:\Users\wdorsey\qepc_project\data\raw\nba\eoin
RAW_KAGGLE: C:\Users\wdorsey\qepc_project\data\raw\nba\kaggle
CACHE_IMPORTS: C:\Users\wdorsey\qepc_project\cache\imports


In [2]:
# ==========================================================
# CELL 1 – ENSURE kagglehub IS INSTALLED
# ==========================================================
try:
    import kagglehub  # type: ignore
    print("kagglehub already installed.")
except ImportError:
    print("Installing kagglehub...")
    %pip install kagglehub
    import kagglehub  # type: ignore

print("kagglehub version loaded OK.")


kagglehub already installed.
kagglehub version loaded OK.


In [3]:
# ==========================================================
# CELL 2 – DOWNLOAD EOIN KAGGLE DATASET
# ==========================================================
from pathlib import Path

DATASET_ID = "eoinamoore/historical-nba-data-and-player-box-scores"

eoin_kaggle_path = Path(kagglehub.dataset_download(DATASET_ID)).resolve()
print("Kaggle Eoin dataset path:", eoin_kaggle_path)

print("\nSample CSV files found under that path:")
for p in sorted(eoin_kaggle_path.rglob("*.csv"))[:20]:
    print(" -", p.relative_to(eoin_kaggle_path))


Kaggle Eoin dataset path: C:\Users\wdorsey\.cache\kagglehub\datasets\eoinamoore\historical-nba-data-and-player-box-scores\versions\300

Sample CSV files found under that path:
 - Games.csv
 - LeagueSchedule24_25.csv
 - LeagueSchedule25_26.csv
 - Players.csv
 - PlayerStatistics.csv
 - TeamHistories.csv
 - TeamStatistics.csv


In [4]:
# ==========================================================
# CELL 3 – COPY RAW EOIN CSVs INTO PROJECT
# ==========================================================
import shutil

wanted_files = [
    "Games.csv",
    "Players.csv",
    "PlayerStatistics.csv",
    "TeamHistories.csv",
    "TeamStatistics.csv",
    "LeagueSchedule24_25.csv",
    "LeagueSchedule25_26.csv",
]

for name in wanted_files:
    matches = list(eoin_kaggle_path.rglob(name))
    if not matches:
        print(f"[WARN] {name} not found in Kaggle folder.")
        continue

    src = matches[0]
    dst = RAW_EOIN / name
    shutil.copy2(src, dst)
    print(f"Copied {src.name} → {dst}")

print("\nCSV files now in RAW_EOIN:")
for p in RAW_EOIN.glob("*.csv"):
    print(" -", p.name)


Copied Games.csv → C:\Users\wdorsey\qepc_project\data\raw\nba\eoin\Games.csv
Copied Players.csv → C:\Users\wdorsey\qepc_project\data\raw\nba\eoin\Players.csv
Copied PlayerStatistics.csv → C:\Users\wdorsey\qepc_project\data\raw\nba\eoin\PlayerStatistics.csv
Copied TeamHistories.csv → C:\Users\wdorsey\qepc_project\data\raw\nba\eoin\TeamHistories.csv
Copied TeamStatistics.csv → C:\Users\wdorsey\qepc_project\data\raw\nba\eoin\TeamStatistics.csv
Copied LeagueSchedule24_25.csv → C:\Users\wdorsey\qepc_project\data\raw\nba\eoin\LeagueSchedule24_25.csv
Copied LeagueSchedule25_26.csv → C:\Users\wdorsey\qepc_project\data\raw\nba\eoin\LeagueSchedule25_26.csv

CSV files now in RAW_EOIN:
 - Games.csv
 - LeagueSchedule24_25.csv
 - LeagueSchedule25_26.csv
 - Players.csv
 - PlayerStatistics.csv
 - TeamHistories.csv
 - TeamStatistics.csv


In [5]:
# ==========================================================
# CELL 4 – LOAD RAW EOIN CSVs
# ==========================================================
import pandas as pd

def load_eoin_csv(name: str, low_memory: bool = False) -> pd.DataFrame:
    path = RAW_EOIN / name
    if not path.exists():
        raise FileNotFoundError(f"Eoin Kaggle file not found: {path}")
    df = pd.read_csv(path, low_memory=low_memory)
    print(f"Loaded {name} → shape={df.shape}")
    return df

games_raw = load_eoin_csv("Games.csv", low_memory=False)
player_boxes_raw = load_eoin_csv("PlayerStatistics.csv", low_memory=True)
team_boxes_raw = load_eoin_csv("TeamStatistics.csv", low_memory=False)

display(games_raw.head())
display(player_boxes_raw.head())
display(team_boxes_raw.head())


Loaded Games.csv → shape=(72311, 17)


  df = pd.read_csv(path, low_memory=low_memory)


Loaded PlayerStatistics.csv → shape=(1639424, 35)
Loaded TeamStatistics.csv → shape=(144622, 48)


Unnamed: 0,gameId,gameDateTimeEst,hometeamCity,hometeamName,hometeamId,awayteamCity,awayteamName,awayteamId,homeScore,awayScore,winner,gameType,attendance,arenaId,gameLabel,gameSubLabel,seriesGameNumber
0,22501204,2025-12-10 17:00:00,Los Angeles,Lakers,1610612747,San Antonio,Spurs,1610612759,119,132,1610612759,in-season-knockout,18684.0,,Emirates NBA Cup,West Quarterfinal,
1,22501203,2025-12-10 14:30:00,Oklahoma City,Thunder,1610612760,Phoenix,Suns,1610612756,138,89,1610612760,in-season-knockout,18203.0,,Emirates NBA Cup,West Quarterfinal,
2,22501202,2025-12-09 15:30:00,Toronto,Raptors,1610612761,New York,Knicks,1610612752,101,117,1610612752,in-season-knockout,17801.0,,Emirates NBA Cup,East Quarterfinal,
3,22501201,2025-12-09 13:00:00,Orlando,Magic,1610612753,Miami,Heat,1610612748,117,108,1610612753,in-season-knockout,18605.0,,Emirates NBA Cup,East Quarterfinal,
4,22500366,2025-12-08 15:00:00,New Orleans,Pelicans,1610612740,San Antonio,Spurs,1610612759,132,135,1610612759,,15783.0,,,,


Unnamed: 0,firstName,lastName,personId,gameId,gameDateTimeEst,playerteamCity,playerteamName,opponentteamCity,opponentteamName,gameType,...,threePointersPercentage,freeThrowsAttempted,freeThrowsMade,freeThrowsPercentage,reboundsDefensive,reboundsOffensive,reboundsTotal,foulsPersonal,turnovers,plusMinusPoints
0,De'Aaron,Fox,1628368,22501204,2025-12-10 17:00:00,San Antonio,Spurs,Los Angeles,Lakers,in-season-knockout,...,0.5,0.0,0.0,0.0,4.0,0.0,4.0,1.0,1.0,21.0
1,Luke,Kornet,1628436,22501204,2025-12-10 17:00:00,San Antonio,Spurs,Los Angeles,Lakers,in-season-knockout,...,0.0,9.0,8.0,0.889,7.0,1.0,8.0,1.0,0.0,10.0
2,Jarred,Vanderbilt,1629020,22501204,2025-12-10 17:00:00,Los Angeles,Lakers,San Antonio,Spurs,in-season-knockout,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Deandre,Ayton,1629028,22501204,2025-12-10 17:00:00,Los Angeles,Lakers,San Antonio,Spurs,in-season-knockout,...,0.0,2.0,1.0,0.5,5.0,3.0,8.0,3.0,0.0,-10.0
4,Luka,Doncic,1629029,22501204,2025-12-10 17:00:00,Los Angeles,Lakers,San Antonio,Spurs,in-season-knockout,...,0.375,14.0,10.0,0.714,5.0,0.0,5.0,5.0,3.0,-2.0


Unnamed: 0,gameId,gameDateTimeEst,teamCity,teamName,teamId,opponentTeamCity,opponentTeamName,opponentTeamId,home,win,...,leadChanges,pointsFastBreak,pointsFromTurnovers,pointsInThePaint,pointsSecondChance,timesTied,timeoutsRemaining,seasonWins,seasonLosses,coachId
0,22501204,2025-12-10 17:00:00,Los Angeles,Lakers,1610612747,San Antonio,Spurs,1610612759,1,0,...,5.0,21.0,11.0,52.0,2.0,2.0,1.0,17.0,7.0,
1,22501204,2025-12-10 17:00:00,San Antonio,Spurs,1610612759,Los Angeles,Lakers,1610612747,0,1,...,5.0,27.0,16.0,46.0,11.0,2.0,1.0,17.0,7.0,
2,22501203,2025-12-10 14:30:00,Phoenix,Suns,1610612756,Oklahoma City,Thunder,1610612760,0,0,...,0.0,20.0,20.0,42.0,6.0,1.0,0.0,14.0,11.0,
3,22501203,2025-12-10 14:30:00,Oklahoma City,Thunder,1610612760,Phoenix,Suns,1610612756,1,1,...,0.0,18.0,34.0,50.0,9.0,1.0,1.0,24.0,1.0,
4,22501202,2025-12-09 15:30:00,New York,Knicks,1610612752,Toronto,Raptors,1610612761,0,1,...,10.0,16.0,23.0,42.0,25.0,7.0,1.0,17.0,7.0,


In [6]:
# ==========================================================
# CELL 5 – NORMALIZE COLUMN NAMES
# ==========================================================
def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    out.columns = [
        c.strip()
         .lower()
         .replace(" ", "")
         .replace("(", "")
         .replace(")", "")
        for c in out.columns
    ]
    return out

games_norm = normalize_columns(games_raw)
player_boxes_norm = normalize_columns(player_boxes_raw)
team_boxes_norm = normalize_columns(team_boxes_raw)

print("games_norm columns:", list(games_norm.columns))
print("player_boxes_norm columns (first 20):", list(player_boxes_norm.columns)[:20])
print("team_boxes_norm columns (first 20):", list(team_boxes_norm.columns)[:20])


games_norm columns: ['gameid', 'gamedatetimeest', 'hometeamcity', 'hometeamname', 'hometeamid', 'awayteamcity', 'awayteamname', 'awayteamid', 'homescore', 'awayscore', 'winner', 'gametype', 'attendance', 'arenaid', 'gamelabel', 'gamesublabel', 'seriesgamenumber']
player_boxes_norm columns (first 20): ['firstname', 'lastname', 'personid', 'gameid', 'gamedatetimeest', 'playerteamcity', 'playerteamname', 'opponentteamcity', 'opponentteamname', 'gametype', 'gamelabel', 'gamesublabel', 'seriesgamenumber', 'win', 'home', 'numminutes', 'points', 'assists', 'blocks', 'steals']
team_boxes_norm columns (first 20): ['gameid', 'gamedatetimeest', 'teamcity', 'teamname', 'teamid', 'opponentteamcity', 'opponentteamname', 'opponentteamid', 'home', 'win', 'teamscore', 'opponentscore', 'assists', 'blocks', 'steals', 'fieldgoalsattempted', 'fieldgoalsmade', 'fieldgoalspercentage', 'threepointersattempted', 'threepointersmade']


In [7]:
# ==========================================================
# CELL 6 – DATETIME PARSER (ISO8601 → UTC)
# ==========================================================
def parse_game_datetime(series: pd.Series) -> pd.Series:
    """
    Parse Eoin's gameDateTimeEst strings into UTC datetimes.
    Handles timezone offsets like '-04:00'.
    """
    dt = pd.to_datetime(series, format="ISO8601", utc=True, errors="coerce")
    n_bad = series.shape[0] - dt.notna().sum()
    if n_bad:
        print(f"Warning: {n_bad} rows could not be parsed as datetimes.")
    print("Resulting dtype:", dt.dtype)
    return dt


In [8]:
# ==========================================================
# CELL 7 – BUILD QEPC-READY TABLES
# ==========================================================

# --- Games table ---
games_qepc = games_norm.rename(columns={
    "gameid": "game_id",
    "gamedatetimeest": "game_datetime",
    "hometeamcity": "home_team_city",
    "hometeamname": "home_team_name",
    "hometeamid": "home_team_id",
    "awayteamcity": "away_team_city",
    "awayteamname": "away_team_name",
    "awayteamid": "away_team_id",
    "homescore": "home_score",
    "awayscore": "away_score",
    "winner": "winner_team_id",
})

games_qepc["game_datetime"] = parse_game_datetime(games_qepc["game_datetime"])
games_qepc["game_date"] = games_qepc["game_datetime"].dt.date

# --- Player boxes ---
player_boxes_qepc = player_boxes_norm.rename(columns={
    "personid": "player_id",
    "gameid": "game_id",
    "gamedatetimeest": "game_datetime",
    "playerteamcity": "team_city",
    "playerteamname": "team_name",
    "opponentteamcity": "opp_team_city",
    "opponentteamname": "opp_team_name",
})

player_boxes_qepc["game_datetime"] = parse_game_datetime(player_boxes_qepc["game_datetime"])
player_boxes_qepc["game_date"] = player_boxes_qepc["game_datetime"].dt.date

# --- Team boxes ---
team_boxes_qepc = team_boxes_norm.rename(columns={
    "gameid": "game_id",
    "gamedatetimeest": "game_datetime",
    "teamid": "team_id",
    "teamcity": "team_city",
    "teamname": "team_name",
    "opponentteamid": "opp_team_id",
    "opponentteamcity": "opp_team_city",
    "opponentteamname": "opp_team_name",
})

team_boxes_qepc["game_datetime"] = parse_game_datetime(team_boxes_qepc["game_datetime"])
team_boxes_qepc["game_date"] = team_boxes_qepc["game_datetime"].dt.date

print("games_qepc shape:", games_qepc.shape)
print("player_boxes_qepc shape:", player_boxes_qepc.shape)
print("team_boxes_qepc shape:", team_boxes_qepc.shape)

display(games_qepc.head())
display(player_boxes_qepc.head())
display(team_boxes_qepc.head())


Resulting dtype: datetime64[ns, UTC]
Resulting dtype: datetime64[ns, UTC]
Resulting dtype: datetime64[ns, UTC]
games_qepc shape: (72311, 18)
player_boxes_qepc shape: (1639424, 36)
team_boxes_qepc shape: (144622, 49)


Unnamed: 0,game_id,game_datetime,home_team_city,home_team_name,home_team_id,away_team_city,away_team_name,away_team_id,home_score,away_score,winner_team_id,gametype,attendance,arenaid,gamelabel,gamesublabel,seriesgamenumber,game_date
0,22501204,2025-12-10 17:00:00+00:00,Los Angeles,Lakers,1610612747,San Antonio,Spurs,1610612759,119,132,1610612759,in-season-knockout,18684.0,,Emirates NBA Cup,West Quarterfinal,,2025-12-10
1,22501203,2025-12-10 14:30:00+00:00,Oklahoma City,Thunder,1610612760,Phoenix,Suns,1610612756,138,89,1610612760,in-season-knockout,18203.0,,Emirates NBA Cup,West Quarterfinal,,2025-12-10
2,22501202,2025-12-09 15:30:00+00:00,Toronto,Raptors,1610612761,New York,Knicks,1610612752,101,117,1610612752,in-season-knockout,17801.0,,Emirates NBA Cup,East Quarterfinal,,2025-12-09
3,22501201,2025-12-09 13:00:00+00:00,Orlando,Magic,1610612753,Miami,Heat,1610612748,117,108,1610612753,in-season-knockout,18605.0,,Emirates NBA Cup,East Quarterfinal,,2025-12-09
4,22500366,2025-12-08 15:00:00+00:00,New Orleans,Pelicans,1610612740,San Antonio,Spurs,1610612759,132,135,1610612759,,15783.0,,,,,2025-12-08


Unnamed: 0,firstname,lastname,player_id,game_id,game_datetime,team_city,team_name,opp_team_city,opp_team_name,gametype,...,freethrowsattempted,freethrowsmade,freethrowspercentage,reboundsdefensive,reboundsoffensive,reboundstotal,foulspersonal,turnovers,plusminuspoints,game_date
0,De'Aaron,Fox,1628368,22501204,2025-12-10 17:00:00+00:00,San Antonio,Spurs,Los Angeles,Lakers,in-season-knockout,...,0.0,0.0,0.0,4.0,0.0,4.0,1.0,1.0,21.0,2025-12-10
1,Luke,Kornet,1628436,22501204,2025-12-10 17:00:00+00:00,San Antonio,Spurs,Los Angeles,Lakers,in-season-knockout,...,9.0,8.0,0.889,7.0,1.0,8.0,1.0,0.0,10.0,2025-12-10
2,Jarred,Vanderbilt,1629020,22501204,2025-12-10 17:00:00+00:00,Los Angeles,Lakers,San Antonio,Spurs,in-season-knockout,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2025-12-10
3,Deandre,Ayton,1629028,22501204,2025-12-10 17:00:00+00:00,Los Angeles,Lakers,San Antonio,Spurs,in-season-knockout,...,2.0,1.0,0.5,5.0,3.0,8.0,3.0,0.0,-10.0,2025-12-10
4,Luka,Doncic,1629029,22501204,2025-12-10 17:00:00+00:00,Los Angeles,Lakers,San Antonio,Spurs,in-season-knockout,...,14.0,10.0,0.714,5.0,0.0,5.0,5.0,3.0,-2.0,2025-12-10


Unnamed: 0,game_id,game_datetime,team_city,team_name,team_id,opp_team_city,opp_team_name,opp_team_id,home,win,...,pointsfastbreak,pointsfromturnovers,pointsinthepaint,pointssecondchance,timestied,timeoutsremaining,seasonwins,seasonlosses,coachid,game_date
0,22501204,2025-12-10 17:00:00+00:00,Los Angeles,Lakers,1610612747,San Antonio,Spurs,1610612759,1,0,...,21.0,11.0,52.0,2.0,2.0,1.0,17.0,7.0,,2025-12-10
1,22501204,2025-12-10 17:00:00+00:00,San Antonio,Spurs,1610612759,Los Angeles,Lakers,1610612747,0,1,...,27.0,16.0,46.0,11.0,2.0,1.0,17.0,7.0,,2025-12-10
2,22501203,2025-12-10 14:30:00+00:00,Phoenix,Suns,1610612756,Oklahoma City,Thunder,1610612760,0,0,...,20.0,20.0,42.0,6.0,1.0,0.0,14.0,11.0,,2025-12-10
3,22501203,2025-12-10 14:30:00+00:00,Oklahoma City,Thunder,1610612760,Phoenix,Suns,1610612756,1,1,...,18.0,34.0,50.0,9.0,1.0,1.0,24.0,1.0,,2025-12-10
4,22501202,2025-12-09 15:30:00+00:00,New York,Knicks,1610612752,Toronto,Raptors,1610612761,0,1,...,16.0,23.0,42.0,25.0,7.0,1.0,17.0,7.0,,2025-12-09


In [9]:
# ==========================================================
# CELL 8 – SAVE QEPC TABLES TO CACHE/IMPORTS
# ==========================================================
games_path = CACHE_IMPORTS / "eoin_games_qepc.parquet"
player_boxes_path = CACHE_IMPORTS / "eoin_player_boxes_qepc.parquet"
team_boxes_path = CACHE_IMPORTS / "eoin_team_boxes_qepc.parquet"

games_qepc.to_parquet(games_path, index=False)
player_boxes_qepc.to_parquet(player_boxes_path, index=False)
team_boxes_qepc.to_parquet(team_boxes_path, index=False)

print("Saved QEPC-ready Eoin data to:", CACHE_IMPORTS)
print(" -", games_path.name)
print(" -", player_boxes_path.name)
print(" -", team_boxes_path.name)


Saved QEPC-ready Eoin data to: C:\Users\wdorsey\qepc_project\cache\imports
 - eoin_games_qepc.parquet
 - eoin_player_boxes_qepc.parquet
 - eoin_team_boxes_qepc.parquet


In [10]:
# ==========================================================
# CELL 9 – OPTIONAL: SANITY CHECK VIA eoin_data_source
# ==========================================================
from qepc.nba.eoin_data_source import (
    load_eoin_games,
    load_eoin_player_boxes,
    load_eoin_team_boxes,
    print_eoin_summary,
)

games_qepc2 = load_eoin_games()
player_boxes_qepc2 = load_eoin_player_boxes()
team_boxes_qepc2 = load_eoin_team_boxes()

print("Loaded from qepc.nba.eoin_data_source:")
print(games_qepc2.shape, player_boxes_qepc2.shape, team_boxes_qepc2.shape)
print_eoin_summary(games_qepc2, player_boxes_qepc2, team_boxes_qepc2)


Loaded from qepc.nba.eoin_data_source:
(72311, 18) (1639424, 36) (144622, 49)
=== Eoin / QEPC Data Summary ===
Games:            72311 rows, 18 columns
  game_datetime: 1946-11-26 23:00:00+00:00  →  2025-12-10 17:00:00+00:00
Player boxes:   1639424 rows, 36 columns
  game_datetime: 1946-11-26 23:00:00+00:00  →  2025-12-10 17:00:00+00:00
Team boxes:      144622 rows, 49 columns
  game_datetime: 1946-11-26 23:00:00+00:00  →  2025-12-10 17:00:00+00:00
Max season record seen in team_boxes: 68.0–65.0 (approx)
