In [138]:
# === Imports ===
import pandas as pd
import numpy as np
from pathlib import Path
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score
from xgboost import XGBClassifier


In [140]:
# Load feature dataset 
base_dir = Path("/Users/willwatson/baseball-breakout")
file_path = base_dir / "outputs" / "combined_breakouts_and_nonbreakouts_2022_2025_ratio_plus_raw.csv"

df = pd.read_csv(file_path)
print(f"‚úÖ Loaded {len(df):,} rows from {file_path.name}")
print("Columns:", df.columns[:15].tolist(), "...")



‚úÖ Loaded 2,308 rows from combined_breakouts_and_nonbreakouts_2022_2025_ratio_plus_raw.csv
Columns: ['player_id', 'player_name', 'season', 'pa', 'pa_prev', 'pa_delta', 'process_score', 'breakout_label', 'labeled_year', 'attack_angle_delta', 'attack_angle_toward_opt_delta', 'barrels_per_bbe_percent_delta', 'barrels_per_pa_delta', 'barrels_per_swing_delta', 'bat_speed_delta'] ...


In [119]:
#  Clean up features and optional scaling
# Scale down process_score to reduce its dominance
if "process_score" in df.columns:
    df["process_score"] = df["process_score"] * 0.1
    print("‚öñÔ∏è Scaled down 'process_score' by 0.1x")

# Drop rows with missing breakout_label
df = df.dropna(subset=["breakout_label"]).copy()
df["breakout_label"] = df["breakout_label"].astype(int)

print("‚úÖ Cleaned and ready for split:", df.shape)


‚öñÔ∏è Scaled down 'process_score' by 0.1x
‚úÖ Cleaned and ready for split: (2308, 30)


In [121]:
# Split train/test 
train = df[df["season"].between(2022, 2024)].copy()
test = df[df["season"] == 2025].copy()

label_col = "breakout_label"
drop_cols = [
    label_col, "season", "player_id", "player_name", "labeled_year"
]

# Keep numeric features only
X_train = train.drop(columns=drop_cols, errors="ignore").select_dtypes(include=[np.number]).fillna(0)
X_test = test.drop(columns=drop_cols, errors="ignore").select_dtypes(include=[np.number]).fillna(0)

y_train = train[label_col].astype(int)
y_test = test[label_col].astype(int)

print(f"‚úÖ Train shape: {X_train.shape} | Test shape: {X_test.shape}")
print(f"Positives in train: {y_train.sum()} / {len(y_train)}")


‚úÖ Train shape: (1384, 25) | Test shape: (461, 25)
Positives in train: 153 / 1384


In [123]:
#  Train models 
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic Regression
logreg = LogisticRegression(max_iter=500, random_state=42)
logreg.fit(X_train_scaled, y_train)

# XGBoost
xgb = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric="logloss"
)
xgb.fit(X_train, y_train)

print("‚úÖ Both models trained successfully.")


‚úÖ Both models trained successfully.


In [142]:
#  Helper function for evaluation 
def evaluate_model(name, model, X, y):
    preds = model.predict(X)
    probas = model.predict_proba(X)[:, 1] if hasattr(model, "predict_proba") else preds

    print(f"\nüìà {name}")
    print(classification_report(y, preds, digits=3))
    print("ROC AUC:", round(roc_auc_score(y, probas), 4))
    print("Avg Precision:", round(average_precision_score(y, probas), 4))


#  Align features before evaluating XGBoost 
trained_features = xgb.get_booster().feature_names
X_test_aligned = X_test[[f for f in trained_features if f in X_test.columns]].copy()
X_test_aligned = X_test_aligned[trained_features]

print("‚úÖ Evaluating both models...")
evaluate_model("Logistic Regression (scaled)", logreg, X_test_scaled, y_test)
evaluate_model("XGBoost (raw, aligned)", xgb, X_test_aligned, y_test)


‚úÖ Evaluating both models...

üìà Logistic Regression (scaled)
              precision    recall  f1-score   support

           0      0.968     0.973     0.971       408
           1      0.784     0.755     0.769        53

    accuracy                          0.948       461
   macro avg      0.876     0.864     0.870       461
weighted avg      0.947     0.948     0.948       461

ROC AUC: 0.9734
Avg Precision: 0.8052

üìà XGBoost (raw, aligned)
              precision    recall  f1-score   support

           0      1.000     1.000     1.000       408
           1      1.000     1.000     1.000        53

    accuracy                          1.000       461
   macro avg      1.000     1.000     1.000       461
weighted avg      1.000     1.000     1.000       461

ROC AUC: 1.0
Avg Precision: 1.0


In [127]:
#  Retrain on full dataset (2022‚Äì2025) 
train_full = df[df["season"].between(2022, 2025)].copy()

X_full = train_full.drop(columns=drop_cols, errors="ignore").select_dtypes(include=["number"]).fillna(0)
y_full = train_full["breakout_label"].astype(int)

xgb.fit(X_full, y_full)
print(f"‚úÖ Model retrained on {len(X_full):,} rows and {X_full.shape[1]} features.")


‚úÖ Model retrained on 1,845 rows and 25 features.


In [131]:
# Save trained model & features
artifacts_dir = base_dir / "artifacts"
artifacts_dir.mkdir(exist_ok=True)

model_path = artifacts_dir / "xgb_breakout_model.pkl"
joblib.dump(xgb, model_path)

feature_list_path = artifacts_dir / "feature_list.json"
pd.Series(X_full.columns).to_json(feature_list_path, orient="values")

print(f"‚úÖ Model saved to: {model_path}")
print(f"‚úÖ Feature list saved to: {feature_list_path}")


‚úÖ Model saved to: /Users/willwatson/baseball-breakout/artifacts/xgb_breakout_model.pkl
‚úÖ Feature list saved to: /Users/willwatson/baseball-breakout/artifacts/feature_list.json


In [133]:
# Tabular importance summary 
booster = xgb.get_booster()
importance = booster.get_score(importance_type="gain")

feat_importance = (
    pd.DataFrame.from_dict(importance, orient="index", columns=["gain"])
    .sort_values("gain", ascending=False)
    .reset_index()
    .rename(columns={"index": "feature"})
)

display(feat_importance.head(25))


Unnamed: 0,feature,gain
0,process_score,25.464861
1,barrels_per_pa_delta,15.938337
2,barrels_per_bbe_percent_delta,11.821918
3,pa,8.560834
4,hardhit_percent_delta,4.53174
5,attack_angle_delta,2.4462
6,bat_speed_delta,2.127005
7,pa_prev,1.469498
8,rate_ideal_attack_angle_delta,1.449944
9,k_per_pa_delta,1.405693
