In [2]:
# === 02_feature_store: universal header + config + labels ===
import sys
from pathlib import Path

# 1) Make src/ importable regardless of where you opened the notebook
PROJECT_ROOT = Path.cwd()
if PROJECT_ROOT.name.lower() == "notebooks":
    PROJECT_ROOT = PROJECT_ROOT.parent
sys.path.insert(0, str(PROJECT_ROOT / "src"))

# 2) Autoreload so edits in src/ are picked up automatically
%load_ext autoreload
%autoreload 2

# 3) Imports
import yaml
from utils import print_run_header, read_parquet

print_run_header("02_feature_store")

# 4) Load config
CFG_PATH = PROJECT_ROOT / "configs" / "config.yaml"
assert CFG_PATH.exists(), f"Missing config at {CFG_PATH}"
cfg = yaml.safe_load(CFG_PATH.read_text())

# 5) Load labels (built in 01_data_build)
PROC_DIR = PROJECT_ROOT / cfg["paths"]["processed_dir"]
LABELS_PATH = PROC_DIR / "labels.parquet"
assert LABELS_PATH.exists(), f"Missing {LABELS_PATH} — run 01_data_build.ipynb first."
labels = read_parquet(LABELS_PATH)
print("labels:", labels.shape, "from", LABELS_PATH)


=== 02_feature_store ===
Python 3.12.7 | pandas 2.3.1 | numpy 2.2.6 | sklearn 1.7.1 | xgboost 3.0.4
Platform: Windows 11 | Time: 2025-08-21 19:34:54
labels: (8680, 7) from C:\Users\alex\Desktop\nrfi\data\processed\labels.parquet


In [3]:
# Team features: built from the labels frame, with the prior strength taken
# from the "features" section of the loaded config.
from features import build_team_fi_features

team_feat = build_team_fi_features(
    labels,
    eb_prior_strength=cfg["features"]["eb_prior_strength"],
)
# Trailing expression so the cell displays the (rows, columns) of the result.
team_feat.shape


(8680, 4)

In [4]:
# Starter features: built from the labels frame, with the prior strength taken
# from the "features" section of the loaded config.
from features import build_starter_fi_features

starter_feat = build_starter_fi_features(
    labels,
    eb_prior_strength=cfg["features"]["eb_prior_strength"],
)
# Trailing expression so the cell displays the (rows, columns) of the result.
starter_feat.shape


(8680, 4)

In [5]:
# Lineup priors: built from the labels frame; the sample count is read from
# cfg["features"]["lineup_samples"].
from lineups import build_lineup_priors

lineup_feat = build_lineup_priors(
    labels,
    n_samples=cfg["features"]["lineup_samples"],
)
# Trailing expression so the cell displays the (rows, columns) of the result.
lineup_feat.shape


(8680, 4)

In [6]:
# Pitch-matchup placeholders: built from the labels frame alone (no config values).
from features import build_pitch_matchup_features

pitch_feat = build_pitch_matchup_features(
    labels,
)
# Trailing expression so the cell displays the (rows, columns) of the result.
pitch_feat.shape


(8680, 4)

In [7]:
from utils import write_parquet

# Feature tables to persist, as (output file name, DataFrame) pairs.
pairs = [
    ("team_features.parquet",    team_feat),
    ("starter_features.parquet", starter_feat),
    ("lineup_features.parquet",  lineup_feat),
    ("pitch_features.parquet",   pitch_feat),
]

# include park/weather only if you created pw_feat above
# NOTE(review): none of the cells shown here defines pw_feat, so on a fresh
# Restart & Run All this branch is skipped, while a leftover pw_feat in a
# long-lived kernel would be written silently — confirm this is intended.
if "pw_feat" in globals():
    pairs.append(("park_weather_features.parquet", pw_feat))

# Validate every table before writing any of them, so a single empty table
# cannot leave the processed directory with a partially refreshed set of files.
for name, df in pairs:
    assert len(df) > 0, f"Critical empty table: {name}"

PROC_DIR.mkdir(parents=True, exist_ok=True)
for name, df in pairs:
    write_parquet(df, PROC_DIR / name)

print("02_feature_store ✅ | wrote:", [n for n, _ in pairs])


02_feature_store ✅ | wrote: ['team_features.parquet', 'starter_features.parquet', 'lineup_features.parquet', 'pitch_features.parquet']
