In [2]:
# Universal header: robust project root (works from repo root OR notebooks/)
import os, sys
from pathlib import Path

def _find_root():
    cwd = Path.cwd()
    for p in [cwd] + list(cwd.parents):
        if (p / "src").is_dir() and (p / "configs").is_dir():
            return p
    return cwd

PROJECT_ROOT = _find_root().resolve()
os.environ["NRFI_PROJECT_ROOT"] = str(PROJECT_ROOT)
if str(PROJECT_ROOT / "src") not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT / "src"))

from utils import print_run_header, set_seed
print_run_header("01_data_build")
set_seed(42)
print("Project root:", PROJECT_ROOT)


=== 01_data_build ===
Python 3.12.7 | pandas 2.3.1 | numpy 2.2.6 | sklearn 1.7.1 | xgboost 3.0.4
Platform: Windows 11 | Time: 2025-08-21 16:27:43
Project root: C:\Users\alex\Desktop\nrfi


In [3]:
import yaml
CONFIG_PATH = PROJECT_ROOT / "configs" / "config.yaml"
CONFIG_PATH.parent.mkdir(parents=True, exist_ok=True)

if not CONFIG_PATH.exists():
    # Write a default config so first run never fails
    CONFIG_DEFAULT = """project:
  name: nrfi_yrfi
  seed: 42
  use_duckdb: true
  log_level: INFO
  time_zone_default: America/New_York
paths:
  data_dir: data
  raw_dir: data/raw
  processed_dir: data/processed
  reference_dir: data/reference
  sample_dir: data/sample
  outputs_dir: outputs
  reports_dir: reports
  odds_dir: data/odds
training:
  start_date: "2024-04-01"
  end_date: "2024-05-15"
  min_games_required: 10
  n_folds: 3
  fold_granularity: "M"
model:
  use_xgb: true
  xgb_params:
    n_estimators: 200
    max_depth: 3
    learning_rate: 0.08
    subsample: 0.9
    colsample_bytree: 0.9
    reg_lambda: 1.0
    reg_alpha: 0.0
    n_jobs: -1
  calibration: "isotonic"
  bootstrap:
    enabled: true
    n_boot: 200
    block_unit: "D"
    block_size: 3
features:
  lineup_samples: 25
  min_history_days: 14
  eb_prior_strength: 50
  default_first_pitch_local_time: "19:00"
odds:
  kelly_fraction_cap: 0.25
  min_edge_abs: 0.02
  min_ci_half_width: 0.06
backtest:
  bankroll_start: 1000
  flat_stake: 10.0
"""
    CONFIG_PATH.write_text(CONFIG_DEFAULT)

cfg = yaml.safe_load(CONFIG_PATH.read_text())
print("Config loaded:", cfg["training"]["start_date"], "→", cfg["training"]["end_date"])


Config loaded: 2021-04-01 → 2024-05-15


In [4]:
# Cell 3: Statcast / PBP pulls with caching + fallback sample
from data import DataManager, fetch_statcast_range

dm = DataManager.from_config(cfg)
dm.ensure_dirs()

pbp = fetch_statcast_range(
    start_date=cfg["training"]["start_date"],
    end_date=cfg["training"]["end_date"]
)

print("PBP shape:", pbp.shape)


[pybaseball.statcast 2021-04-01..2024-05-15] start
This is a large query, it may take a moment to complete
Skipping offseason dates
Skipping offseason dates
Skipping offseason dates


100%|██████████| 783/783 [03:36<00:00,  3.62it/s]
  final_data = pd.concat(dataframe_list, axis=0).convert_dtypes(convert_string=False)


[pybaseball.statcast 2021-04-01..2024-05-15] done in 256.04s
PBP shape: (2491869, 120)


In [5]:
from data import build_first_inning_labels
labels = build_first_inning_labels(pbp)
print(labels.head(3))


  r = df1.groupby("game_pk").apply(_runs_in_inning).rename("first_inning_runs")


        date            game_id  game_pk         game_datetime_utc away_team  \
0 2021-04-10  2021-04-10_COL_SF   632169 2021-04-10 00:00:00+00:00       COL   
1 2021-04-11  2021-04-11_KC_CWS   632170 2021-04-11 00:00:00+00:00        KC   
2 2021-04-11  2021-04-11_COL_SF   632188 2021-04-11 00:00:00+00:00       COL   

  home_team  yrfi  
0        SF     0  
1       CWS     0  
2        SF     1  


In [13]:
import sys, importlib
from pathlib import Path

# ensure our repo src is on path
PROJECT_ROOT = Path.cwd()
if PROJECT_ROOT.name.lower() == "notebooks":
    PROJECT_ROOT = PROJECT_ROOT.parent
src_path = str(PROJECT_ROOT / "src")
if src_path not in sys.path:
    sys.path.insert(0, src_path)

import data
importlib.reload(data)
print("data module file:", getattr(data, "__file__", "<no __file__>"))

# rebuild DataManager
dm = data.DataManager.from_config(cfg)
dm.ensure_dirs()

# now run the functions
stadiums = data.ensure_stadium_reference(dm.reference_dir)
labels = data.attach_scheduled_times(
    labels,
    stadiums,
    default_local_time=cfg["features"]["default_first_pitch_local_time"],
    reference_dir=dm.reference_dir,   # cache location
    use_schedule=False                # True = hits MLB API, False = fast fallback
)


data module file: C:\Users\alex\Desktop\nrfi\src\data.py


NameError: name 'cfg' is not defined

In [7]:
# Park factors (recompute if missing)
from data import ensure_park_factors
park = ensure_park_factors(dm.reference_dir, pbp=pbp, seasons=[2023])
print(park.shape)
print(park.head(3))


(10, 3)
  team_code  season  park_factor_runs
0       NYY    2023              1.03
1       BOS    2023              1.05
2       LAD    2023              0.96


In [8]:
# Weather (meteostat) with robust fallbacks Skip for now
import pandas as pd

# Skip weather for now – create neutral placeholder
weather = pd.DataFrame({
    "game_id": labels["game_id"],
    "temp_c": 20.0,
    "rel_humidity": 50.0,
    "wind_kph": 8.0,
    "mslp_hpa": 1015.0
})
print("Weather placeholder rows:", weather.shape)


Weather placeholder rows: (8680, 5)


In [9]:
# Persist a tiny, offline sample
from data import write_sample_bundle
write_sample_bundle(dm.sample_dir, labels=labels, pbp=pbp, stadiums=stadiums, weather=weather)
print("Sample written ->", dm.sample_dir)


Sample written -> C:\Users\alex\Desktop\nrfi\data\sample


In [10]:
# Save labels to processed
from utils import write_parquet
proc_path = PROJECT_ROOT / cfg["paths"]["processed_dir"]
proc_path.mkdir(parents=True, exist_ok=True)
write_parquet(labels, proc_path / "labels.parquet")
print("Saved:", proc_path / "labels.parquet")
print("01_data_build ✅")


Saved: C:\Users\alex\Desktop\nrfi\data\processed\labels.parquet
01_data_build ✅
