In [None]:
# ==========================================
# CELL 1 – AUTO-DETECT PROJECT ROOT & IMPORTS
# ==========================================
import sys
from pathlib import Path

import numpy as np
import pandas as pd

# Locate the project root: the nearest directory, starting at the current
# working directory and walking upward, that contains a "qepc" folder.
here = Path.cwd().resolve()
print("Current working directory:", here)

PROJECT_ROOT = next(
    (
        candidate
        for candidate in (here, *here.parents)
        if (candidate / "qepc").is_dir()
    ),
    None,
)

if PROJECT_ROOT is None:
    # Fail early and report where the search started so a misplaced
    # notebook is easy to diagnose.
    raise FileNotFoundError(
        f"Could not find a 'qepc' package above this notebook.\n"
        f"Started search from: {here}\n"
        "Make sure this notebook lives somewhere inside your qepc_project folder."
    )

print("Detected PROJECT_ROOT:", PROJECT_ROOT)

# Put the project root at the front of sys.path (only once) so that
# `import qepc` resolves from inside the notebook.
project_root_str = str(PROJECT_ROOT)
if project_root_str not in sys.path:
    sys.path.insert(0, project_root_str)
    print("Added PROJECT_ROOT to sys.path")

# (Nothing from qepc imported yet; this notebook is mostly external data exploration.)


In [None]:
# ==========================================
# CELL 2 – DOWNLOAD NBA BETTING DATA FROM KAGGLE
# ==========================================
import kagglehub

# Kaggle dataset used by this notebook:
# "NBA Betting Data | October 2007 to June 2024"
DATASET_ID = "cviaxmiwnptr/nba-betting-data-october-2007-to-june-2024"

# The value returned by dataset_download is treated as the local location
# of the downloaded dataset files.
odds_path_str = kagglehub.dataset_download(DATASET_ID)
odds_path = Path(odds_path_str).resolve()

print("Path to NBA betting dataset files:", odds_path)

# Recursively collect every CSV under the download location and list them.
csv_files = [csv_path for csv_path in odds_path.rglob("*.csv")]
print("CSV files found:", len(csv_files))
for csv_path in csv_files:
    print(" -", csv_path.name)


In [None]:
# ==========================================
# CELL 3 – LOAD RAW ODDS CSV & INSPECT
# ==========================================
# Guard: the download cell must have found at least one CSV.
if len(csv_files) == 0:
    raise FileNotFoundError(f"No CSV files found under {odds_path}")

# The first CSV found is used (the dataset is expected to contain only one).
raw_csv = csv_files[0]
print("Using CSV:", raw_csv)

odds_raw = pd.read_csv(raw_csv)
print("odds_raw shape:", odds_raw.shape)

# One column name per line is easier to scan than the default Index repr.
print("Columns:")
for column_name in odds_raw.columns:
    print(" -", column_name)

display(odds_raw.head(10))


In [None]:
# ==========================================
# CELL 4 – NORMALIZE NBA ODDS TABLE
# ==========================================
import numpy as np

# 1) Normalize date + team codes.
#    .assign() returns a new frame, so odds_raw itself is left untouched.
odds = odds_raw.assign(
    game_date=pd.to_datetime(odds_raw["date"]).dt.date,  # plain date, no time part
    home_code=odds_raw["home"].str.upper(),
    away_code=odds_raw["away"].str.upper(),
)

# 2) Compute home/away spreads from "whos_favored" + "spread"
def compute_home_away_spreads(row):
    """
    Turn one row's favorite flag and spread into signed per-team spreads.

    Reads row["whos_favored"] and row["spread"]. The favored side gets the
    negative line and the other side the positive line.

    Returns
    -------
    tuple
        (home_spread, away_spread) as floats, or (nan, nan) when the spread
        is missing or the favorite is neither "home" nor "away".
    """
    favored_side = row["whos_favored"]
    line = row["spread"]

    # Nothing can be derived without a spread and a recognised favorite.
    if pd.isna(line) or favored_side not in ("home", "away"):
        return np.nan, np.nan

    points = float(line)
    # Home carries -points when favored, +points when the underdog;
    # the away spread is always the mirror image.
    home_line = -points if favored_side == "home" else points
    return home_line, -home_line

# Derive the signed spreads for every game in one vectorized pass instead of a
# per-row iterrows() loop (iterrows builds a Series per row and is very slow
# on large tables).
#
# Convention (the favored side carries the negative line):
#   whos_favored == "home" -> spread_home = -spread, spread_away = +spread
#   whos_favored == "away" -> spread_home = +spread, spread_away = -spread
#   any other value, or a missing spread -> NaN on both sides
home_sign = odds["whos_favored"].map({"home": -1.0, "away": 1.0}).astype(float)
spread_points = odds["spread"].astype(float)  # fails loudly on non-numeric text

odds["spread_home"] = home_sign * spread_points
odds["spread_away"] = -odds["spread_home"]

# 3) Keep total as-is (full-game total points)
odds["total_points"] = odds["total"]

# 4) Convert American moneyline to implied probability
def american_to_prob(odds_american):
    """
    Convert American odds to implied probability (before removing vig).

    Parameters
    ----------
    odds_american : number or missing value
        American moneyline price, e.g. -140 for a favorite or +200 for an
        underdog.

    Returns
    -------
    float
        Implied probability strictly between 0 and 1.
        Example: -140 -> ~0.58, +200 -> ~0.333.
        NaN is returned when the input is missing, non-finite, or 0.
        A price of 0 is not a valid American moneyline; treating it as a
        positive price would yield a probability of exactly 1.0 and distort
        any later vig normalization.
    """
    if pd.isna(odds_american):
        return np.nan
    o = float(odds_american)
    # Reject placeholder / invalid prices instead of inventing a probability.
    if o == 0 or not np.isfinite(o):
        return np.nan
    if o < 0:
        # Favorite: risk |o| to win 100.
        return (-o) / ((-o) + 100.0)
    # Underdog: risk 100 to win o.
    return 100.0 / (o + 100.0)

# Raw implied probabilities per side; these still include the bookmaker margin.
for side in ("home", "away"):
    odds[f"p_{side}_raw"] = odds[f"moneyline_{side}"].apply(american_to_prob)

# Remove vig by normalizing: rescale the pair so the two sides sum to 1.
sum_raw = odds["p_home_raw"] + odds["p_away_raw"]
for side in ("home", "away"):
    odds[f"p_{side}"] = odds[f"p_{side}_raw"] / sum_raw

# 5) Build a simple "game_key" (date_AWAY_HOME) to help with matching later
date_part = odds["game_date"].astype(str)
odds["game_key"] = date_part + "_" + odds["away_code"] + "_" + odds["home_code"]

# 6) Select a tidy subset of columns
tidy_columns = [
    "season",
    "game_date",
    "game_key",
    "away_code",
    "home_code",
    "score_away",
    "score_home",
    "spread_home",
    "spread_away",
    "total_points",
    "moneyline_away",
    "moneyline_home",
    "p_away",
    "p_home",
    "regular",
    "playoffs",
]
odds_tidy = odds[tidy_columns]

print("odds_tidy shape:", odds_tidy.shape)
odds_tidy.head(10)


In [None]:
# ==========================================
# CELL 5 – MARKET VS ACTUAL TOTALS
# ==========================================
import numpy as np

# Build the totals table on a new frame (odds_tidy is not modified):
#   actual_total = points actually scored by both teams
#   total_error  = actual_total - market total
#                  (positive = game went over the closing total)
totals_df = odds_tidy.assign(
    actual_total=lambda games: games["score_home"] + games["score_away"],
    total_error=lambda games: games["actual_total"] - games["total_points"],
)

# Summary statistics of the market's total-points error.
total_err = totals_df["total_error"]
mae_total = total_err.abs().mean()
rmse_total = np.sqrt(total_err.pow(2).mean())
mean_error_total = total_err.mean()

print(f"Total line MAE:  {mae_total:.2f} points")
print(f"Total line RMSE: {rmse_total:.2f} points")
print(f"Total line bias (actual - line): {mean_error_total:+.2f} points")

# Show a few examples
totals_preview_cols = [
    "season",
    "game_date",
    "away_code",
    "home_code",
    "score_away",
    "score_home",
    "total_points",
    "actual_total",
    "total_error",
]
display(totals_df[totals_preview_cols].head(10))


In [None]:
# ==========================================
# CELL 6 – MARKET VS ACTUAL SPREADS (HOME) – FIXED
# ==========================================
# Work on a copy so odds_tidy stays unchanged.
spreads_df = odds_tidy.copy()

# Actual margin from the home side's point of view (positive = home won by that many).
home_margin = spreads_df["score_home"] - spreads_df["score_away"]
spreads_df["home_margin"] = home_margin

# spread_home is negative when home is favored and positive when home is the dog,
# so the market's expected home margin is -spread_home. Therefore:
#   error = actual - expected = home_margin - (-spread_home) = home_margin + spread_home
spreads_df["spread_error_home"] = home_margin + spreads_df["spread_home"]

# Summary statistics of the market's spread error.
spread_err = spreads_df["spread_error_home"]
mae_spread = spread_err.abs().mean()
rmse_spread = np.sqrt(spread_err.pow(2).mean())
mean_error_spread = spread_err.mean()

print(f"Home spread MAE:  {mae_spread:.2f} points")
print(f"Home spread RMSE: {rmse_spread:.2f} points")
print(f"Home spread bias (actual margin - line): {mean_error_spread:+.2f} points")

spreads_preview_cols = [
    "season",
    "game_date",
    "away_code",
    "home_code",
    "score_away",
    "score_home",
    "spread_home",
    "home_margin",
    "spread_error_home",
]
display(spreads_df[spreads_preview_cols].head(10))


In [None]:
# ==========================================
# CELL 7 – LOAD EOIN TEAM HISTORIES
# ==========================================
from pathlib import Path
import pandas as pd

# Location of the raw Eoin files inside the project tree.
RAW_EOIN = PROJECT_ROOT.joinpath("data", "raw", "nba", "eoin")
team_hist_path = RAW_EOIN / "TeamHistories.csv"

print("TeamHistories path:", team_hist_path)
# Stop with a clear message rather than letting read_csv fail on a missing file.
if not team_hist_path.exists():
    raise FileNotFoundError(f"TeamHistories.csv not found at {team_hist_path}")

team_hist = pd.read_csv(team_hist_path)
print("team_hist shape:", team_hist.shape)
print("Columns:")
for column_name in team_hist.columns:
    print(" -", column_name)

team_hist.head(10)


In [None]:
# ==========================================
# CELL 8 – BUILD TEAM CODE → ID/NAME MAPPING
# ==========================================
import numpy as np

df = team_hist.copy()

# Guess the abbreviation, id and name columns from the header names
# (case-insensitive substring matching).
lowered_cols = [(col, col.lower()) for col in df.columns]

code_candidates = [
    col for col, low in lowered_cols
    if "abbr" in low or "code" in low
]
id_candidates = [
    col for col, low in lowered_cols
    if "team" in low and "id" in low
]
name_candidates = [
    col for col, low in lowered_cols
    if "name" in low and "team" in low
]

print("Code candidates:", code_candidates)
print("ID candidates:", id_candidates)
print("Name candidates:", name_candidates)

# A code column and an id column are mandatory; a name column is optional.
if len(code_candidates) == 0:
    raise ValueError(
        "Could not auto-detect a team code/abbreviation column.\n"
        "Check team_hist.columns above and pick one manually."
    )
if len(id_candidates) == 0:
    raise ValueError(
        "Could not auto-detect a team id column.\n"
        "Check team_hist.columns above and pick one manually."
    )

# The first match wins for each role.
code_col = code_candidates[0]
id_col = id_candidates[0]
name_col = name_candidates[0] if name_candidates else None

print(f"Using code_col='{code_col}', id_col='{id_col}', name_col='{name_col}'")

df["code"] = df[code_col].astype(str).str.upper()
df["team_id"] = df[id_col].astype(int)

if name_col is None:
    # No direct name column: try "<city> <nickname>", else fall back to the code.
    city_cols = [c for c in df.columns if "city" in c.lower()]
    nick_cols = [c for c in df.columns if "nickname" in c.lower() or "name" in c.lower()]

    df["team_name"] = (
        df[city_cols[0]].astype(str) + " " + df[nick_cols[0]].astype(str)
        if city_cols and nick_cols
        else df["code"]  # worst-case fallback
    )
else:
    df["team_name"] = df[name_col].astype(str)

# One row per code (first occurrence kept), ordered alphabetically by code.
team_codes = (
    df[["code", "team_id", "team_name"]]
    .drop_duplicates(subset="code")
    .sort_values(by="code")
    .reset_index(drop=True)
)

print("team_codes shape:", team_codes.shape)
team_codes.head(20)


In [None]:
# ==========================================
# CELL 9 – CHECK CODE COVERAGE: ODDS VS EOIN
# ==========================================
# Compare the team codes used in the odds table with those in the Eoin mapping.
#
# The odds table built in CELL 4 is `odds_tidy`. This cell previously read
# `odds_long`, which no preceding cell defines, so a fresh
# Restart & Run All stopped here with a NameError.
#
# Missing codes (NaN) are dropped before building the sets so that sorted()
# only ever compares strings.
odds_codes = set(odds_tidy["home_code"].dropna()) | set(odds_tidy["away_code"].dropna())
eoin_codes = set(team_codes["code"].dropna())

missing_in_eoin = sorted(odds_codes - eoin_codes)
missing_in_odds = sorted(eoin_codes - odds_codes)

print("Unique odds codes:", len(odds_codes))
print("Unique Eoin codes:", len(eoin_codes))
# Only the first 50 codes are printed to keep the output short.
print("Codes in odds but NOT in Eoin mapping:", missing_in_eoin[:50])
print("Codes in Eoin mapping but NOT in odds:", missing_in_odds[:50])
