In [140]:
import json
import os
import warnings
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
from xgboost import XGBClassifier

warnings.filterwarnings("ignore")

In [142]:
# Load the trained model, the ordered feature list and (optionally) the
# training medians from the artifacts directory.
# Defines: xgb_model, FEATURE_LIST, MEDIANS.
base_dir = Path("/Users/willwatson/baseball-breakout")
ARTIFACT_DIR = base_dir / "artifacts"

xgb_path = ARTIFACT_DIR / "xgb_model.pkl"
feat_path = ARTIFACT_DIR / "feature_list.json"
med_path  = ARTIFACT_DIR / "training_feature_stats.json"

# === Load Model ===
# NOTE(review): the prediction/explanation cells further down load
# "xgb_breakout_model.pkl" from this same directory, not "xgb_model.pkl" —
# confirm which artifact is the intended model.
# NOTE: joblib.load unpickles arbitrary objects; only load artifacts you trust.
assert xgb_path.exists(), f"‚ùå Missing model: {xgb_path}"
xgb_model = joblib.load(xgb_path)
print(f"‚úÖ Loaded model: {xgb_path.name}")

# === Load Feature Schema ===
# The feature list is required, so fail with a clear message (as for the model)
# instead of a bare FileNotFoundError from open().
assert feat_path.exists(), f"Missing feature list: {feat_path}"
with open(feat_path, "r", encoding="utf-8") as f:
    FEATURE_LIST = json.load(f)

# === Load Training Medians (optional) ===
# MEDIANS stays an empty dict when the stats file is absent or has no
# "medians" key.
MEDIANS = {}
if med_path.exists():
    with open(med_path, "r", encoding="utf-8") as f:
        MEDIANS = json.load(f).get("medians", {})
print(f"‚úÖ Features loaded: {len(FEATURE_LIST)} | Medians loaded: {bool(MEDIANS)}")


‚úÖ Loaded model: xgb_model.pkl
‚úÖ Features loaded: 25 | Medians loaded: True


In [145]:
# === Generate Synthetic 2026 Baseline from 2025 ===
# Copies every 2025 row, relabels the copy as season 2026, sets every column
# whose name ends in "_delta" to 0.0, appends the copies to the original data
# and writes the result to a new CSV next to the input.

base_dir = Path("/Users/willwatson/baseball-breakout")
outputs_dir = base_dir / "outputs"

# Load your existing combined dataset
data_path = outputs_dir / "combined_breakouts_and_nonbreakouts_2022_2025_ratio_plus_raw.csv"
df = pd.read_csv(data_path)

# Filter for 2025 season
df_2025 = df[df["season"] == 2025].copy()

# Without 2025 rows there is nothing to clone: stop here instead of writing a
# "2022_2026" file that contains no 2026 rows and still reporting success.
if df_2025.empty:
    raise ValueError(f"No 2025 season rows found in {data_path}; cannot build a 2026 baseline.")

# Duplicate and label as 2026
# NOTE(review): every non-delta column is carried over unchanged from 2025;
# if the dataset contains a target/label column it is copied too — confirm
# that is intended.
df_2026 = df_2025.copy()
df_2026["season"] = 2026

# Zero out delta columns — simulate baseline (no change yet)
delta_cols = [c for c in df_2026.columns if c.endswith("_delta")]
for c in delta_cols:
    df_2026[c] = 0.0

# Combine with original for model compatibility
df_extended = pd.concat([df, df_2026], ignore_index=True)

# Save the new version with 2026 baseline included
out_path = outputs_dir / "combined_breakouts_and_nonbreakouts_2022_2026_ratio_plus_raw.csv"
df_extended.to_csv(out_path, index=False)

print(f"‚úÖ Synthetic 2026 baseline created and saved ‚Üí {out_path}")
print(f"Rows: {len(df_extended):,} | Columns: {len(df_extended.columns)}")
print(f"Includes seasons: {sorted(df_extended['season'].unique())}")


‚úÖ Synthetic 2026 baseline created and saved ‚Üí /Users/willwatson/baseball-breakout/outputs/combined_breakouts_and_nonbreakouts_2022_2026_ratio_plus_raw.csv
Rows: 2,769 | Columns: 30
Includes seasons: [2021, 2022, 2023, 2024, 2025, 2026]


In [147]:
# Predict 2026 Breakouts using Saved Model
# Scores the season-2026 rows with the saved classifier, stores the positive
# class probability and its rank, writes the scored rows to CSV and displays
# the 30 highest probabilities.

# Directories
base_dir = Path("/Users/willwatson/baseball-breakout")
artifacts_dir = base_dir / "artifacts"
outputs_dir = base_dir / "outputs"

# Load model and features
model_path = artifacts_dir / "xgb_breakout_model.pkl"
feature_list_path = artifacts_dir / "feature_list.json"

# NOTE: joblib.load unpickles arbitrary objects; only load artifacts you trust.
xgb = joblib.load(model_path)
features = pd.read_json(feature_list_path, typ="series").tolist()

print(f"‚úÖ Loaded model from {model_path}")
print(f"‚úÖ Loaded {len(features)} features")

# Load full dataset
data_path = outputs_dir / "combined_breakouts_and_nonbreakouts_2022_2026_ratio_plus_raw.csv"

df = pd.read_csv(data_path)

# Check if 2026 season exists
if "season" not in df.columns or 2026 not in df["season"].unique():
    raise ValueError("‚ùå No 2026 season rows found in dataset. Did you merge 2026 projections yet?")

# Filter for 2026
df_2026 = df[df["season"] == 2026].copy()

# Align to feature list
# reindex() silently creates any feature column that is absent from the data,
# so report those columns instead of scoring on invented zeros unnoticed.
missing_features = [f for f in features if f not in df_2026.columns]
if missing_features:
    print(
        f"WARNING: {len(missing_features)} model features are absent from the "
        f"2026 rows and were filled with 0: {missing_features}"
    )

# NOTE(review): missing values are filled with 0 here, while an earlier cell
# loads training medians into MEDIANS that no visible code uses — confirm
# which imputation matches how the model was trained.
X_2026 = df_2026.reindex(columns=features, fill_value=0)
X_2026 = X_2026.fillna(0)

# Predict probabilities
df_2026["breakout_prob_2026"] = xgb.predict_proba(X_2026)[:, 1]
# method="min" gives tied probabilities the same integer rank; the default
# ("average") produces fractional float ranks such as 1.5 for ties.
df_2026["breakout_rank"] = (
    df_2026["breakout_prob_2026"].rank(method="min", ascending=False).astype(int)
)

# Save results
out_path = outputs_dir / "breakout_predictions_2026.csv"
df_2026.to_csv(out_path, index=False)

print(f"‚úÖ Saved 2026 breakout predictions ‚Üí {out_path}")
print(f"Rows: {len(df_2026):,}")

# Display Top 30
top30 = (
    df_2026[["player_id", "player_name","breakout_prob_2026"]]
    .sort_values("breakout_prob_2026", ascending=False)
    .reset_index(drop=True)
    .head(30)
)

print("\nüèÜ Top 30 Predicted 2026 Breakouts:")
display(top30)


‚úÖ Loaded model from /Users/willwatson/baseball-breakout/artifacts/xgb_breakout_model.pkl
‚úÖ Loaded 25 features
‚úÖ Saved 2026 breakout predictions ‚Üí /Users/willwatson/baseball-breakout/outputs/breakout_predictions_2026.csv
Rows: 461

üèÜ Top 30 Predicted 2026 Breakouts:


Unnamed: 0,player_id,player_name,breakout_prob_2026
0,681351,"O'Hoppe, Logan",0.969408
1,643217,"Benintendi, Andrew",0.969408
2,656775,"Mullins, Cedric",0.969065
3,671277,"García Jr., Luis",0.969065
4,665862,"Chisholm Jr., Jazz",0.968547
5,680977,"Donovan, Brendan",0.968547
6,665161,"Peña, Jeremy",0.968547
7,691026,"Winn, Masyn",0.968547
8,608324,"Bregman, Alex",0.968134
9,690993,"Keith, Colt",0.967641


In [149]:
# Explain 2026 Breakout Predictions Safely with Aligned Features
# Reads the saved 2026 predictions, attaches a short text "breakout_reason"
# to every row (from SHAP values, or from global feature importances if SHAP
# fails), re-ranks by probability and writes the result to a new CSV.
import pandas as pd
import numpy as np
import joblib
from pathlib import Path

base_dir = Path("/Users/willwatson/baseball-breakout")
artifacts_dir = base_dir / "artifacts"
outputs_dir = base_dir / "outputs"

# Load model + feature list
model_path = artifacts_dir / "xgb_breakout_model.pkl"
feature_path = artifacts_dir / "feature_list.json"

# NOTE: joblib.load unpickles arbitrary objects; only load artifacts you trust.
xgb = joblib.load(model_path)
# typ="series" reads the JSON array as a flat sequence, the same way the
# prediction cell loads this file.
feature_list = pd.read_json(feature_path, typ="series").tolist()

# Load 2026 predictions
df_2026 = pd.read_csv(outputs_dir / "breakout_predictions_2026.csv")

# Align columns with model
available = [f for f in feature_list if f in df_2026.columns]
missing = [f for f in feature_list if f not in df_2026.columns]

if missing:
    print(f"‚ö†Ô∏è Missing {len(missing)} features from 2026 dataset (filled with 0): {missing[:5]}...")

# Build the matrix exactly as the prediction cell does (full feature list, in
# model order, gaps filled with 0) so SHAP explains the same input that was
# scored and column j of the SHAP output corresponds to feature_list[j].
X_2026 = df_2026.reindex(columns=feature_list, fill_value=0).fillna(0)
print(f"‚úÖ Aligned features: {len(available)} used / {len(feature_list)} expected")

# Substrings that mark a feature as eligible for a text explanation.
REASON_KEYWORDS = [
    "barrel", "hardhit", "attack_angle", "contact", "launch", "bb_per", "k_per", "bat_speed"
]


def _phrase_for_feature(f):
    """Return the explanation phrase for feature name `f`, or None if none applies."""
    if "barrel" in f or "hardhit" in f:
        return "improved contact quality"
    if "launch" in f or "attack_angle" in f:
        return "better launch profile"
    if "bb_per" in f:
        return "more walks"
    if "k_per" in f:
        return "fewer strikeouts"
    if "contact" in f:
        return "better contact consistency"
    if "bat_speed" in f:
        # "bat_speed" passes the keyword filter, so it needs a phrase of its own.
        return "faster bat speed"
    return None


# Try SHAP explanation
try:
    import shap
    explainer = shap.TreeExplainer(xgb)
    shap_values = explainer.shap_values(X_2026)

    # Only use meaningful deltas that are actually present in the data
    # (zero-filled placeholder columns are not used as explanations).
    available_set = set(available)
    relevant = [
        f for f in feature_list
        if f in available_set and any(k in f for k in REASON_KEYWORDS)
    ]
    # Positions are taken from feature_list because that is X_2026's column order.
    feature_idx = [feature_list.index(f) for f in relevant]

    def reason_from_shap(i):
        """Describe row `i` using its (up to three) largest positive SHAP contributions."""
        vals = np.asarray(shap_values[i])[feature_idx]
        order = np.argsort(vals)[::-1][:3]
        # Keep only features that push the prediction up; a negative SHAP value
        # lowers the score and must not be reported as an improvement.
        phrases = {_phrase_for_feature(relevant[j]) for j in order if vals[j] > 0}
        phrases.discard(None)
        return ", ".join(sorted(phrases)) or "balanced gains"

    df_2026["breakout_reason"] = [reason_from_shap(i) for i in range(len(X_2026))]

except Exception as e:
    print(f"‚ö†Ô∏è SHAP explanation failed ({e.__class__.__name__}: {e})")
    print("‚Üí Falling back to feature importances.")
    # Global importances are the same for every row, so every player receives
    # the same reason text in this branch.
    importances = xgb.feature_importances_
    imp = pd.Series(importances, index=feature_list).sort_values(ascending=False)
    top_feats = [f for f in imp.head(5).index if any(k in f for k in ["barrel", "hardhit", "contact", "bb_", "k_", "launch"])]
    readable = [f.replace("_delta", "").replace("_", " ") for f in top_feats]
    # Avoid a dangling "Driven by gains in " when none of the top features match.
    if readable:
        df_2026["breakout_reason"] = "Driven by gains in " + ", ".join(readable)
    else:
        df_2026["breakout_reason"] = "balanced gains"

# Rank and export
df_2026 = df_2026.sort_values("breakout_prob_2026", ascending=False).reset_index(drop=True)
df_2026["breakout_rank"] = range(1, len(df_2026) + 1)

out_path = outputs_dir / "breakout_predictions_2026_explained.csv"
df_2026.to_csv(out_path, index=False)
print(f"‚úÖ Saved fixed breakout explanations ‚Üí {out_path}")
display(df_2026[["player_name", "breakout_prob_2026", "breakout_reason"]].head(10))


‚úÖ Aligned features: 25 used / 25 expected
‚úÖ Saved fixed breakout explanations ‚Üí /Users/willwatson/baseball-breakout/outputs/breakout_predictions_2026_explained.csv


Unnamed: 0,player_name,breakout_prob_2026,breakout_reason
0,"O'Hoppe, Logan",0.969408,"better launch profile, fewer strikeouts, impro..."
1,"Benintendi, Andrew",0.969408,"better launch profile, fewer strikeouts, impro..."
2,"Mullins, Cedric",0.969065,"better launch profile, fewer strikeouts, impro..."
3,"García Jr., Luis",0.969065,"better launch profile, fewer strikeouts, impro..."
4,"Chisholm Jr., Jazz",0.968547,"better launch profile, fewer strikeouts, impro..."
5,"Donovan, Brendan",0.968547,"better launch profile, fewer strikeouts, impro..."
6,"Peña, Jeremy",0.968547,"better launch profile, fewer strikeouts, impro..."
7,"Winn, Masyn",0.968547,"better launch profile, fewer strikeouts, impro..."
8,"Bregman, Alex",0.968134,"better launch profile, fewer strikeouts, impro..."
9,"Keith, Colt",0.967641,"better launch profile, fewer strikeouts, impro..."
