In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# fetch paths
import sys, os
sys.path.append(os.path.abspath('..'))
import config

In [3]:
import os, re
import numpy as np
import pandas as pd
from pathlib import Path
from neo4j import GraphDatabase
import sys
repo_root = Path("..").resolve()
sys.path.append(str(repo_root))
import config

RAW_PLAYOFFS = Path(config.DATA_RAW_PLAYOFFS)          # 1996–97 playoffs CSV
RAW_FINALS   = Path(config.DATA_RAW_FINALS)            # 1997 Finals CSV

def to_sec_left(pct):
    try:
        m, s = str(pct).split(":"); return int(m)*60 + int(s)
    except: return np.nan

def seconds_left_game(period, sec_left_period):
    # regulation only (4*12min=2880s). This is enough for '97 Finals demo.
    return max(0, (4 - int(period)) * 720 + int(sec_left_period or 0))

# --- Load historical events (you can expand to more seasons if you want) ---
df_hist = pd.read_csv(RAW_PLAYOFFS)
df_hist = df_hist.sort_values(["GAME_ID","PERIOD","EVENTNUM"])
df_hist["SECONDS_LEFT_PERIOD"] = df_hist["PCTIMESTRING"].apply(to_sec_left)
df_hist["SECONDS_LEFT_GAME"]   = df_hist.apply(lambda r: seconds_left_game(r["PERIOD"], r["SECONDS_LEFT_PERIOD"]), axis=1)

# Label: did HOME eventually win this game?
final_margin = (df_hist.groupby("GAME_ID")["SCOREMARGIN"].last()
                .rename("FINAL_MARGIN").astype(float))
df_hist = df_hist.merge(final_margin, on="GAME_ID", how="left")
df_hist["HOME_WIN"] = (df_hist["FINAL_MARGIN"] > 0).astype(int)

# Margin before current event (per game)
df_hist["SCOREMARGIN"] = pd.to_numeric(df_hist["SCOREMARGIN"], errors="coerce")
df_hist["MARGIN_BEFORE"] = df_hist.groupby("GAME_ID")["SCOREMARGIN"].shift(1)
df_hist["MARGIN_BEFORE"] = df_hist["MARGIN_BEFORE"].fillna(method="ffill")  # cheap fill; fine for modeling

# Keep rows with both features present
hist = df_hist.dropna(subset=["SECONDS_LEFT_GAME","MARGIN_BEFORE","HOME_WIN"]).copy()
hist["MARGIN_BEFORE"] = hist["MARGIN_BEFORE"].astype(int)

# --- Build a 2D grid: time-left (30s bins) × margin (−20..20) ---
TIME_BIN = 30   # seconds per bin
MAX_MARGIN = 20
hist["tbin"] = (hist["SECONDS_LEFT_GAME"] // TIME_BIN).clip(lower=0).astype(int)
hist["mbin"] = hist["MARGIN_BEFORE"].clip(-MAX_MARGIN, MAX_MARGIN).astype(int)

# Aggregate win rates with Laplace smoothing
agg = (hist.groupby(["tbin","mbin"])["HOME_WIN"]
       .agg(["sum","count"])
       .reset_index())
alpha = 1.0
agg["pwin"] = (agg["sum"] + alpha) / (agg["count"] + 2*alpha)

# Pivot to dense grid & fill gaps by nearest along time then margin
t_range = np.arange(0, int(hist["tbin"].max())+1)
m_range = np.arange(-MAX_MARGIN, MAX_MARGIN+1)
grid = (agg.pivot(index="tbin", columns="mbin", values="pwin")
          .reindex(index=t_range, columns=m_range))
grid = grid.ffill(axis=0).bfill(axis=0)   # fill along time
grid = grid.ffill(axis=1).bfill(axis=1)   # fill along margin

def wp_lookup(sec_left_game: float, margin: float) -> float:
    tbin = int(max(0, sec_left_game) // TIME_BIN)
    mbin = int(np.clip(margin, -MAX_MARGIN, MAX_MARGIN))
    tbin = int(min(tbin, grid.index.max()))
    mbin = int(np.clip(mbin, grid.columns.min(), grid.columns.max()))
    return float(grid.loc[tbin, mbin])


  df_hist["MARGIN_BEFORE"] = df_hist["MARGIN_BEFORE"].fillna(method="ffill")  # cheap fill; fine for modeling
