In [None]:
import sys
from pathlib import Path

PROJECT_ROOT = Path.cwd().resolve().parents[1]
print("PROJECT_ROOT:", PROJECT_ROOT)

# Optional: quick check for kagglehub; install if missing
try:
    import kagglehub  # type: ignore
    print("kagglehub already installed.")
except ImportError:
    print("Installing kagglehub...")
    # In Jupyter:
    !pip install kagglehub
    import kagglehub  # type: ignore
    print("kagglehub installed.")


In [None]:
import kagglehub

# Download latest version of the dataset.
# The returned value is kept in `path`; a later cell wraps it in Path() and
# globs it for CSV files, so it is treated as the local download folder.
path = kagglehub.dataset_download("eoinamoore/historical-nba-data-and-player-box-scores")

# Show where the files landed so the copy step can be sanity-checked.
print("Path to dataset files:", path)


In [None]:
import shutil
from pathlib import Path

# Source is the folder reported by the download cell; destination is the
# project's raw "eoin" data folder.
src_dir = Path(path)
dst_dir = PROJECT_ROOT.joinpath("data", "raw", "nba", "eoin")

# Create the destination tree on first run; an existing folder is fine.
dst_dir.mkdir(parents=True, exist_ok=True)

print("Copying CSVs from", src_dir)
print("               to", dst_dir)

# Non-recursive glob: only CSVs sitting directly in the download folder are
# copied. copy2 copies the file contents and attempts to preserve metadata.
for p in src_dir.glob("*.csv"):
    print("  -", p.name)
    shutil.copy2(p, dst_dir.joinpath(p.name))

print("Done.")


In [None]:
import pandas as pd

# Folder the Kaggle CSVs were copied into.
raw_dir = PROJECT_ROOT / "data" / "raw" / "nba" / "eoin"
print("Raw dir:", raw_dir)

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-dtype columns on large CSVs.
# All three tables are read the same way so their dtypes are inferred
# consistently with the later loading cells.
games_raw = pd.read_csv(raw_dir / "Games.csv", low_memory=False)
player_boxes_raw = pd.read_csv(raw_dir / "PlayerStatistics.csv", low_memory=False)
team_boxes_raw = pd.read_csv(raw_dir / "TeamStatistics.csv", low_memory=False)

print("games_raw shape:", games_raw.shape)
print("player_boxes_raw shape:", player_boxes_raw.shape)
print("team_boxes_raw shape:", team_boxes_raw.shape)

# Rich preview of the first rows of each table.
display(games_raw.head())
display(player_boxes_raw.head())
display(team_boxes_raw.head())


In [None]:
# Re-read the games and player tables with low_memory=False so pandas infers
# column dtypes from the whole file rather than per chunk.
# Note: team_boxes_raw is not reloaded here; it keeps whatever the previous
# cell produced.
games_raw = pd.read_csv(raw_dir / "Games.csv", low_memory=False)
player_boxes_raw = pd.read_csv(raw_dir / "PlayerStatistics.csv", low_memory=False)


In [None]:
from pathlib import Path

import os

# Root of the local QEPC checkout. The historical default is one developer's
# Windows path; set the QEPC_PROJECT_ROOT environment variable to point the
# notebook at a checkout elsewhere (other machines, Linux/macOS) without
# editing this cell. An unset or empty variable falls back to the default.
PROJECT_ROOT = Path(os.environ.get("QEPC_PROJECT_ROOT") or r"C:\Users\wdors\qepc_project")
RAW_KAGGLE = PROJECT_ROOT / "data" / "raw" / "nba" / "kaggle"

# Create the folder tree on first run; an existing folder is fine.
RAW_KAGGLE.mkdir(parents=True, exist_ok=True)

print("✔️ QEPC Kaggle Path Configured:", RAW_KAGGLE)


In [None]:
# Raw Kaggle data folder, derived from the current PROJECT_ROOT.
RAW_KAGGLE = PROJECT_ROOT.joinpath("data", "raw", "nba", "kaggle")


In [None]:
from pathlib import Path
import pandas as pd

import os

# Root of the local QEPC checkout. The historical default is one developer's
# Windows path; set the QEPC_PROJECT_ROOT environment variable to point the
# notebook at a checkout elsewhere without editing this cell. An unset or
# empty variable falls back to the default.
PROJECT_ROOT = Path(os.environ.get("QEPC_PROJECT_ROOT") or r"C:\Users\wdors\qepc_project")
# Folder holding the raw CSVs of the Eoin Kaggle dataset.
RAW_EOIN = PROJECT_ROOT / "data" / "raw" / "nba" / "eoin"

def load_eoin_csv(name, low_memory=False, raw_dir=None):
    """
    Load one CSV of the Eoin Kaggle dataset into a DataFrame.

    Parameters
    ----------
    name : str
        File name of the CSV, e.g. "Games.csv".
    low_memory : bool, default False
        Forwarded to ``pd.read_csv``. False lets pandas infer dtypes from the
        whole file instead of chunk by chunk.
    raw_dir : path-like, optional
        Folder to read from. Defaults to the notebook-level ``RAW_EOIN``
        folder, so existing calls behave exactly as before.

    Returns
    -------
    pd.DataFrame
        The parsed CSV.

    Raises
    ------
    FileNotFoundError
        If the CSV does not exist in the chosen folder.
    """
    # Only fall back to the global when no folder was passed explicitly.
    base_dir = RAW_EOIN if raw_dir is None else Path(raw_dir)
    csv_path = base_dir / name
    if not csv_path.exists():
        raise FileNotFoundError(f"Eoin Kaggle file not found: {csv_path}")
    df = pd.read_csv(csv_path, low_memory=low_memory)
    print(f"Loaded {name} → shape={df.shape}")
    return df

# Load the three raw tables; low_memory is left at the helper's default (False).
games_raw = load_eoin_csv("Games.csv")
player_boxes_raw = load_eoin_csv("PlayerStatistics.csv")
team_boxes_raw = load_eoin_csv("TeamStatistics.csv")


In [None]:
def normalize_columns(df):
    """
    Return a copy of ``df`` with lowercase, snake-ish column names.

    Each label is stripped of surrounding whitespace, spaces and dots are
    replaced by underscores, and the result is lowercased, so later cells
    don't have to fight weird casing/spaces.

    Non-string labels (for example the integer headers of a CSV read with
    ``header=None``) are converted with ``str()`` first instead of raising
    ``AttributeError``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose column labels should be normalized. It is not modified.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` carrying the normalized column labels.
    """
    normalized = df.copy()
    normalized.columns = [
        str(label).strip().replace(" ", "_").replace(".", "_").lower()
        for label in df.columns
    ]
    return normalized

# Normalized copies of the three raw tables; the *_raw frames stay untouched.
games_norm = normalize_columns(games_raw)
player_boxes_norm = normalize_columns(player_boxes_raw)
team_boxes_norm = normalize_columns(team_boxes_raw)

# Preview only the first dozen headers per table to keep the output short.
print("games_norm cols:", list(games_norm.columns[:12]), "...")
print("player_boxes_norm cols:", list(player_boxes_norm.columns[:12]), "...")
print("team_boxes_norm cols:", list(team_boxes_norm.columns[:12]), "...")


In [None]:
# Each rename maps normalized (lowercased) source headers onto QEPC's
# snake_case names; headers that are not listed keep their normalized name.

# --- Games table ---
games_qepc = games_norm.rename(
    columns=dict(
        gameid="game_id",
        gamedatetimeest="game_datetime",
        hometeamcity="home_team_city",
        hometeamname="home_team_name",
        hometeamid="home_team_id",
        awayteamcity="away_team_city",
        awayteamname="away_team_name",
        awayteamid="away_team_id",
        homescore="home_score",
        awayscore="away_score",
        winner="winner_team_id",
    )
)

# --- Player boxes ---
player_boxes_qepc = player_boxes_norm.rename(
    columns=dict(
        gameid="game_id",
        gamedatetimeest="game_datetime",
        personid="player_id",
        playerteamname="team_name",
        playerteamcity="team_city",
        opponentteamname="opp_team_name",
        opponentteamcity="opp_team_city",
    )
)

# --- Team boxes ---
team_boxes_qepc = team_boxes_norm.rename(
    columns=dict(
        gameid="game_id",
        gamedatetimeest="game_datetime",
        teamid="team_id",
        teamcity="team_city",
        teamname="team_name",
        opponentteamid="opp_team_id",
        opponentteamcity="opp_team_city",
        opponentteamname="opp_team_name",
    )
)

# Renaming never adds or drops rows/columns, so the shapes should match the
# normalized inputs.
print("games_qepc shape:", games_qepc.shape)
print("player_boxes_qepc shape:", player_boxes_qepc.shape)
print("team_boxes_qepc shape:", team_boxes_qepc.shape)


In [None]:
def parse_game_datetime(series: pd.Series) -> pd.Series:
    """
    Parse game datetime strings to timezone-aware UTC timestamps.

    Handles values like:
      '2025-12-05 16:30:00'
      '2025-12-05 16:30:00-04:00'
    and other ISO-ish variations, including both layouts mixed in one column.

    Parameters
    ----------
    series : pd.Series
        Raw datetime values (typically strings).

    Returns
    -------
    pd.Series
        UTC timestamps aligned with ``series``. Values that are missing or
        cannot be parsed come back as NaT; a warning with their count is
        printed.

    Notes
    -----
    Because ``utc=True`` is used, strings without an explicit offset are
    interpreted as already being in UTC.
    NOTE(review): the source column is named ``gamedatetimeest`` before the
    rename, which suggests offset-less values may be Eastern wall-clock
    times -- confirm whether they should be localized before conversion.
    """
    dt = pd.to_datetime(series, errors="coerce", utc=True)

    # Recent pandas infers a single format from the first value and, with
    # errors="coerce", turns every differently shaped string into NaT. Give
    # rows that were present but failed a second chance with the ISO-8601
    # parser, which accepts values with and without a UTC offset.
    if isinstance(series, pd.Series):
        unparsed = dt.isna() & series.notna()
        if unparsed.any():
            try:
                retried = pd.to_datetime(
                    series.where(unparsed),
                    errors="coerce",
                    utc=True,
                    format="ISO8601",
                )
            except (ValueError, TypeError):
                # pandas build without format="ISO8601" support: keep the
                # first-pass result unchanged.
                retried = None
            if retried is not None:
                # Same index on both sides, so this only fills first-pass NaT.
                dt = dt.fillna(retried)

    if dt.isna().any():
        n_bad = int(dt.isna().sum())
        print(f"Warning: {n_bad} rows could not be parsed as datetimes.")

    print("Resulting dtype:", dt.dtype)
    return dt


In [None]:
# Parse timestamps on all three tables in the same order as before: games,
# player boxes, team boxes. Each frame is updated in place with a parsed
# "game_datetime" and a derived "game_date".
# NOTE(review): "game_date" is the calendar date of the UTC-normalized
# timestamp; a late-evening US tip-off that carries a UTC offset lands on the
# following UTC day -- confirm that this is the intended game date.
for frame in (games_qepc, player_boxes_qepc, team_boxes_qepc):
    frame["game_datetime"] = parse_game_datetime(frame["game_datetime"])
    frame["game_date"] = frame["game_datetime"].dt.date

# A game counts as final once both scores are present.
games_qepc["is_final"] = games_qepc["home_score"].notna() & games_qepc[
    "away_score"
].notna()

print("Done parsing datetimes.")

# Quick eyeball check of the parsed columns.
print(games_qepc[["game_id", "game_datetime", "game_date"]].head())


In [None]:
# Cache folder for the QEPC-ready tables; created on first run.
CACHE_IMPORTS = PROJECT_ROOT.joinpath("cache", "imports")
CACHE_IMPORTS.mkdir(parents=True, exist_ok=True)

# Write each table to its own parquet file, without the DataFrame index.
for filename, frame in (
    ("eoin_games_qepc.parquet", games_qepc),
    ("eoin_player_boxes_qepc.parquet", player_boxes_qepc),
    ("eoin_team_boxes_qepc.parquet", team_boxes_qepc),
):
    frame.to_parquet(CACHE_IMPORTS / filename, index=False)

print("Saved QEPC-ready Eoin data to:", CACHE_IMPORTS)
# List what is now on disk as a confirmation.
for p in CACHE_IMPORTS.glob("eoin_*_qepc.parquet"):
    print(" -", p.name)


In [None]:
# Rows whose timestamp ended up as NaT after parsing, per table.
bad_games = games_qepc.loc[games_qepc["game_datetime"].isna()]
bad_players = player_boxes_qepc.loc[player_boxes_qepc["game_datetime"].isna()]
bad_teams = team_boxes_qepc.loc[team_boxes_qepc["game_datetime"].isna()]

# A first dimension of 0 means every row of that table has a timestamp.
print("bad_games:", bad_games.shape)
print("bad_players:", bad_players.shape)
print("bad_teams:", bad_teams.shape)


In [None]:
from pathlib import Path

from qepc.nba.eoin_data_source import (
    load_eoin_games,
    load_eoin_player_boxes,
    load_eoin_team_boxes,
    print_eoin_summary,
)

# Reload the three tables through the packaged qepc loaders; this rebinds the
# names used by the cells above.
# NOTE(review): presumably these read the parquet files cached earlier in
# this notebook -- verify against qepc.nba.eoin_data_source.
games_qepc = load_eoin_games()
player_boxes_qepc = load_eoin_player_boxes()
team_boxes_qepc = load_eoin_team_boxes()

# Shapes first, then the package's own summary of the three tables.
print(games_qepc.shape, player_boxes_qepc.shape, team_boxes_qepc.shape)
print_eoin_summary(games_qepc, player_boxes_qepc, team_boxes_qepc)
