# QEPC — Odds Long Explore (Portable)

This notebook is designed to work across machines (different Windows usernames) with **no hardcoded paths**.

What it does:
- Locates `PROJECT_ROOT` automatically (repo root containing `qepc/__init__.py`)
- Locates the odds CSV in your repo under `data/raw/nba/odds_long/`
- If missing, optionally downloads via `kagglehub` and copies it into the repo
- Loads + normalizes odds using `qepc.nba.odds_long_loader.load_long_odds`
- (Optional) checks join coverage vs Eoin games parquet if present


In [None]:
from __future__ import annotations

from pathlib import Path
import sys
import pandas as pd

# --- Locate PROJECT_ROOT by walking up from the working directory ---
# The repo root is the first directory (cwd itself, then each ancestor) that
# contains qepc/__init__.py, so no machine-specific path (C:\Users\...) is needed.
_cwd = Path.cwd().resolve()
PROJECT_ROOT = next(
    (
        candidate
        for candidate in (_cwd, *_cwd.parents)
        if (candidate / "qepc" / "__init__.py").exists()
    ),
    None,
)

if PROJECT_ROOT is None:
    raise RuntimeError(f"Could not find PROJECT_ROOT above: {_cwd} (missing qepc/__init__.py)")

# Put the repo root at the front of sys.path so `import qepc` can find the package.
_root_str = str(PROJECT_ROOT)
if _root_str not in sys.path:
    sys.path.insert(0, _root_str)

import qepc  # noqa: F401

# Echo what was resolved so a wrong checkout or interpreter is obvious at a glance.
print("PROJECT_ROOT:", PROJECT_ROOT)
print("qepc imported from:", Path(qepc.__file__).resolve())
print("python:", sys.executable)


## Locate (or download) the odds CSV

Preferred repo location:

`data/raw/nba/odds_long/nba_2008-2025.csv`

If it isn't found, this cell will:
1) search the repo for any `nba_2008-2025.csv`
2) optionally download using `kagglehub` (if installed) and copy the CSV into the repo location


In [None]:
from pathlib import Path
import shutil

# Canonical in-repo location of the odds CSV; its folder is created on first run
# so a later copy into it cannot fail on a missing directory.
CANON_ODDS_CSV = PROJECT_ROOT.joinpath("data", "raw", "nba", "odds_long", "nba_2008-2025.csv")
ODDS_DIR = CANON_ODDS_CSV.parent
ODDS_DIR.mkdir(parents=True, exist_ok=True)

def find_odds_csv() -> Path | None:
    """Return the odds CSV already present in the repo, or ``None`` if absent.

    The canonical path ``CANON_ODDS_CSV`` is used when it exists as a file.
    Otherwise the whole ``PROJECT_ROOT`` tree is searched for a file named
    ``nba_2008-2025.csv``.

    Returns:
        Path to the CSV, or ``None`` when no regular file with that name
        exists anywhere under ``PROJECT_ROOT``.
    """
    if CANON_ODDS_CSV.is_file():
        return CANON_ODDS_CSV

    # Fallback: search the repo. Directory traversal order depends on the OS
    # and filesystem, so sort the hits to make the chosen file reproducible
    # across machines. Skip directories that merely carry the same name.
    hits = sorted(
        hit for hit in PROJECT_ROOT.rglob("nba_2008-2025.csv") if hit.is_file()
    )
    return hits[0] if hits else None

# Resolve the odds CSV: use whatever is already in the repo, otherwise try to
# download it with kagglehub and copy it into the canonical repo location.
ODDS_CSV = find_odds_csv()

if ODDS_CSV is None:
    print("Odds CSV not found in repo. Attempting kagglehub download (if available)...")
    try:
        # Imported lazily: kagglehub is only needed when the CSV is missing.
        import kagglehub  # type: ignore

        # Update this if you switch Kaggle datasets later.
        DATASET_ID = "cviaxmiwnptr/nba-betting-data-october-2007-to-june-2024"
        download_path = Path(kagglehub.dataset_download(DATASET_ID)).resolve()
        csv_files = sorted(download_path.rglob("*.csv"))

        if not csv_files:
            raise FileNotFoundError(f"No CSVs found inside downloaded dataset folder: {download_path}")

        # Prefer a file that looks like the long-horizon odds CSV, otherwise take the first.
        name_hints = ("2008", "2025", "odds", "betting")
        raw_csv = next(
            (
                f
                for f in csv_files
                if "nba" in f.name.lower()
                and any(hint in f.name.lower() for hint in name_hints)
            ),
            csv_files[0],
        )

        print("Downloaded dataset folder:", download_path)
        print("Using downloaded CSV:", raw_csv.name)

        # Copy into repo canonical location so the rest of QEPC can find it consistently.
        shutil.copy2(raw_csv, CANON_ODDS_CSV)
        ODDS_CSV = CANON_ODDS_CSV

        print("Copied odds CSV into repo at:", ODDS_CSV)

    except Exception as e:
        # Any failure here (missing kagglehub, download error, empty dataset,
        # copy error) is reported as one actionable message. `from e` keeps
        # the original traceback attached as the explicit cause.
        raise FileNotFoundError(
            "NBA odds CSV not found under your repo, and kagglehub download failed.\n"
            f"Expected file at: {CANON_ODDS_CSV}\n"
            "Fix options:\n"
            "  1) Copy nba_2008-2025.csv into data/raw/nba/odds_long/\n"
            "  2) Install/configure kagglehub on this machine\n"
            f"Underlying error: {type(e).__name__}: {e}"
        ) from e

print("ODDS_CSV:", ODDS_CSV)
print("Exists:", ODDS_CSV.exists())


## Load + normalize odds (QEPC loader)

This uses the hardened QEPC loader (`load_long_odds`), which:
- normalizes team codes (aliases like GS/GSW, NO/NOP, NJ/BRK, etc.)
- maps to NBA team IDs
- constructs `game_key` and safe join fields


In [None]:
from qepc.nba.odds_long_loader import load_long_odds

# Load the raw odds CSV through the QEPC loader and print a quick summary.
odds_long = load_long_odds(ODDS_CSV)

print("odds_long shape:", odds_long.shape)
print("columns:", list(odds_long.columns))

print("date range:", odds_long["game_date"].min(), "→", odds_long["game_date"].max())

# isna().mean() is a fraction in [0, 1]; scale by 100 so the printed value
# actually matches the "%" in the label.
home_null_pct = 100.0 * float(odds_long["home_team_id"].isna().mean())
away_null_pct = 100.0 * float(odds_long["away_team_id"].isna().mean())
print("home_team_id null %:", round(home_null_pct, 3))
print("away_team_id null %:", round(away_null_pct, 3))

odds_long.head(10)


## Market vs actual totals & spreads (quick sanity stats)

These are simple “how wrong was the market?” sanity checks.  
(We’re not claiming we’ll beat Vegas yet — we’re just measuring reality.)


In [None]:
import numpy as np

# Work on a copy so the derived columns never leak back into odds_long.
df = odds_long.copy()

# Error of the total line: points actually scored minus the listed total_points.
points_scored = df["score_home"] + df["score_away"]
df["actual_total"] = points_scored
df["total_error"] = points_scored - df["total_points"]

err_total = df["total_error"]
mae_total = err_total.abs().mean()
rmse_total = np.sqrt(err_total.pow(2).mean())
bias_total = err_total.mean()

print("Vegas total MAE :", round(mae_total, 3))
print("Vegas total RMSE:", round(rmse_total, 3))
print("Vegas total bias:", round(bias_total, 3))

# Preview the inputs next to the derived columns for a visual spot check.
total_preview_cols = [
    "season",
    "game_date",
    "away_code",
    "home_code",
    "score_away",
    "score_home",
    "total_points",
    "actual_total",
    "total_error",
]
df[total_preview_cols].head(10)


In [None]:
# Fresh copy so this cell does not depend on columns added by earlier cells.
df = odds_long.copy()

# Home margin: positive => home wins
margin_home = df["score_home"] - df["score_away"]
df["home_margin"] = margin_home

# Spread convention: spread_home negative if home favored
# Expected home margin ≈ -spread_home, so the miss is margin - (-spread) = margin + spread.
df["spread_error_home"] = margin_home + df["spread_home"]

err_spread = df["spread_error_home"]
mae_spread = err_spread.abs().mean()
rmse_spread = np.sqrt(err_spread.pow(2).mean())
bias_spread = err_spread.mean()

print("Vegas spread(home) MAE :", round(mae_spread, 3))
print("Vegas spread(home) RMSE:", round(rmse_spread, 3))
print("Vegas spread(home) bias:", round(bias_spread, 3))

# Preview the inputs next to the derived columns for a visual spot check.
spread_preview_cols = [
    "season",
    "game_date",
    "away_code",
    "home_code",
    "score_away",
    "score_home",
    "spread_home",
    "home_margin",
    "spread_error_home",
]
df[spread_preview_cols].head(10)


## Optional: join odds to Eoin games (if parquet exists)

If you’ve already built:
`cache/imports/eoin_games_qepc.parquet`

…this cell will compute real join coverage and show a sample.


In [None]:
import pandas as pd
from qepc.nba.odds_long_loader import attach_odds_to_games

# Optional join check: only runs when the Eoin games parquet has been built locally.
eoin_games_path = PROJECT_ROOT.joinpath("cache", "imports", "eoin_games_qepc.parquet")

if eoin_games_path.exists():
    games = pd.read_parquet(eoin_games_path)

    # Minimal columns for the join; name columns are kept when present for debugging.
    join_candidates = (
        "game_id",
        "game_date",
        "game_datetime",
        "home_team_id",
        "away_team_id",
        "home_team_name",
        "away_team_name",
    )
    keep = [col for col in join_candidates if col in games.columns]
    games_small = games[keep].copy()

    merged, diag = attach_odds_to_games(games_small, odds_long)

    # Coverage summary as reported by the loader's diagnostics object.
    print(f"Matched odds rows: {diag.matched_rows} of {diag.total_games} games")
    print(f"Games missing odds: {diag.unmatched_games}")
    print(f"Odds rows unmatched: {diag.unmatched_odds}")

    print("\nSample merged rows:")
    display(merged.head(5))

    print("\nSample unmatched games:")
    display(diag.sample_unmatched_games.head(10))

    print("\nSample unmatched odds:")
    display(diag.sample_unmatched_odds.head(10))
else:
    print("Eoin games parquet not found:", eoin_games_path)
    print("Skipping join test. (Run your Eoin build notebook on this machine.)")


## Team code coverage quick check

This is mostly for spotting weird historical abbreviations.


In [None]:
from qepc.nba.odds_long_loader import CODE_TO_ID

# Collect every distinct team code seen on either side of a game.
# Missing values are dropped first: NaN (a float) cannot be ordered against
# str codes, so sorting a mixed set would raise TypeError.
home_codes = odds_long["home_code"].dropna()
away_codes = odds_long["away_code"].dropna()
missing_code_count = (len(odds_long) - len(home_codes)) + (len(odds_long) - len(away_codes))

# key=str keeps the usual ordering for string codes while tolerating stray non-string values.
odds_codes = sorted(set(home_codes) | set(away_codes), key=str)
unknown = [c for c in odds_codes if c not in CODE_TO_ID]

print("Unique odds codes:", len(odds_codes))
print("Unknown codes (not in CODE_TO_ID):", unknown[:50])
if missing_code_count:
    # Report what was dropped so missing codes are not hidden silently.
    print("Missing (null) home/away code values:", missing_code_count)
