In [2]:
# === 03_model_train: universal header + config + loads ===
import sys, yaml
from pathlib import Path

# 1) Make src/ importable regardless of where you opened the notebook
PROJECT_ROOT = Path.cwd()
if PROJECT_ROOT.name.lower() == "notebooks":
    PROJECT_ROOT = PROJECT_ROOT.parent
sys.path.insert(0, str(PROJECT_ROOT / "src"))

# 2) Autoreload for iterative edits
%load_ext autoreload
%autoreload 2

# 3) Imports
from utils import print_run_header, read_parquet
print_run_header("03_model_train")

# 4) Load config
CFG_PATH = PROJECT_ROOT / "configs" / "config.yaml"
assert CFG_PATH.exists(), f"Missing config at {CFG_PATH}"
cfg = yaml.safe_load(CFG_PATH.read_text())

# 5) Load processed artifacts from 02_feature_store
# Imported unconditionally so that `pd` is bound no matter which branch of the
# park/weather fallback below executes (previously it was only imported inside
# the `else` branch, making the notebook namespace depend on a file's presence).
import pandas as pd

PROC_DIR = PROJECT_ROOT / cfg["paths"]["processed_dir"]

labels_path = PROC_DIR / "labels.parquet"
team_path   = PROC_DIR / "team_features.parquet"
starter_path= PROC_DIR / "starter_features.parquet"
lineup_path = PROC_DIR / "lineup_features.parquet"
pitch_path  = PROC_DIR / "pitch_features.parquet"
pw_path     = PROC_DIR / "park_weather_features.parquet"  # may not exist if weather was skipped

# Everything except the park/weather file is mandatory; stop at the first one missing.
for p in [labels_path, team_path, starter_path, lineup_path, pitch_path]:
    assert p.exists(), f"Missing required file: {p}"

labels = read_parquet(labels_path)
team   = read_parquet(team_path)
starter= read_parquet(starter_path)
lineup = read_parquet(lineup_path)
pitch  = read_parquet(pitch_path)

# Weather/park may be skipped; build a neutral fallback if absent
if pw_path.exists():
    pw = read_parquet(pw_path)
else:
    # One row per row of `labels`, keyed by its "game_id" column, with the same
    # constant value in every row for each park/weather column.
    # NOTE(review): units are presumably those implied by the column suffixes
    # (°C, %, km/h, hPa) — confirm against the 02_feature_store output schema.
    pw = pd.DataFrame({
        "game_id": labels["game_id"],
        "temp_c": 20.0,
        "rel_humidity": 50.0,
        "wind_kph": 8.0,
        "mslp_hpa": 1015.0,
        "park_factor_runs": 1.0,
        "air_density_proxy": 1.0,
    })
    print("Note: using neutral park/weather fallback (no weather features).")

# Quick sanity summary of what was loaded: (rows, columns) per frame.
print("Loaded shapes:",
      "labels", labels.shape,
      "team", team.shape,
      "starter", starter.shape,
      "lineup", lineup.shape,
      "pitch", pitch.shape,
      "pw", pw.shape)


=== 03_model_train ===
Python 3.12.7 | pandas 2.3.1 | numpy 2.2.6 | sklearn 1.7.1 | xgboost 3.0.4
Platform: Windows 11 | Time: 2025-08-20 18:58:52
Note: using neutral park/weather fallback (no weather features).
Loaded shapes: labels (8680, 8) team (17360, 4) starter (17360, 4) lineup (8680, 3) pitch (8680, 4) pw (8680, 7)


In [3]:
# Build the first-inning prior from the loaded frames and summarise it.
# NOTE(review): the saved output of this cell is `KeyError: 'game_pk'`. Which
# input frame is missing that column is not visible from this notebook — check
# the key-column naming expected by `markov.build_first_inning_prior` (the
# park/weather fallback frame built earlier is keyed by "game_id").
from markov import build_first_inning_prior

prior = build_first_inning_prior(
    labels,
    team,
    starter,
    lineup,
    pw,
)
prior.describe()


KeyError: 'game_pk'

In [None]:
# Force a fresh import of the local `model` module, then re-bind the training
# entry point so the name refers to the function from the reloaded module.
import importlib
import model

importlib.reload(model)
from model import train_hybrid_model


In [None]:
# Train the hybrid model on the loaded labels/features plus the prior built
# earlier, then show the "model_path" entry of the returned artifacts.
from model import train_hybrid_model

feature_frames = [team, starter, lineup, pitch, pw]
artifacts = train_hybrid_model(labels=labels, features=feature_frames, prior=prior, cfg=cfg)
artifacts["model_path"]