In [6]:
import re
from glob import glob
from pathlib import Path

import numpy as np
import pandas as pd

In [8]:
# CONFIGURATION
# Everything a reader may want to tune: raw-data locations, the output location,
# and the thresholds used for row filtering and breakout labelling.

# One CSV per season. The dict key is written into the `season` column when the
# file is loaded, overriding any season/year column the file itself carries.
DATA_PATHS = {
    2021: "../data/raw/savant_2021.csv",
    2022: "../data/raw/savant_2022.csv",
    2023: "../data/raw/savant_2023.csv",
    2024: "../data/raw/savant_2024.csv",
    2025: "../data/raw/savant_2025.csv",
}

# NOTE(review): the export cells visible in this notebook write to their own
# hard-coded OUTFILE paths and never read OUTPUT_PATH — confirm which is intended.
OUTPUT_PATH = "../outputs/combined_breakouts_and_nonbreakouts_2022_2025_ratio_features.csv"

MIN_SEASON = 2021        # rows with season < MIN_SEASON are dropped after loading
MIN_PA_FOR_ROW   = 100   # to include a season row at all
MIN_PA_FOR_LABEL = 250   # eligibility for breakout label
BREAKOUT_PERCENTILE = 0.85     # top 15% within season
# NOTE(review): this value is compared directly against process_score, which is a
# SUM of per-feature z-scores, so it is not a "1.5 SD" cut on a single
# standardised quantity — confirm the intended scale.
ABS_Z_THRESHOLD = 1.5          # minimum process_score required in addition to the percentile cut (set None to disable)



In [32]:
def normalize_cols(cols):
    """Return snake_case versions of the given column labels.

    Each label is stripped, lower-cased, has every ``%`` replaced by the word
    ``percent``, and every run of characters outside ``a-z0-9`` collapsed into a
    single underscore; leading/trailing underscores are removed.

    Parameters
    ----------
    cols : iterable of str
        Raw column labels.

    Returns
    -------
    list of str
        Normalised labels, in the same order as ``cols``.
    """
    def _clean(label):
        # Insert a space before "percent" so "HardHit%" becomes "hardhit_percent".
        spelled_out = label.strip().lower().replace("%", " percent")
        return re.sub(r"[^a-z0-9]+", "_", spelled_out).strip("_")

    return [_clean(label) for label in cols]

# Column aliases applied to each season file AFTER normalize_cols() has run, so
# keys must be in normalised (snake_case) form. Only keys that are actually
# present in a file are renamed.
# Most entries map a name to itself and therefore act as a checklist of the
# columns the notebook relies on; the entries that really rename something are
# "year" -> "season" and "plate_appearances" -> "pa".
RENAME = {
    "player_id":"player_id", "player_name":"player_name",
    "season":"season", "year":"season",
    "pa":"pa", "plate_appearances":"pa",

    # counts for ratios
    "bb":"bb", "so":"so", "whiffs":"whiffs", "swings":"swings", "takes":"takes",
    "bip":"bip", "barrels_total":"barrels_total",

    # percents / rates (as provided)
    "hardhit_percent":"hardhit_percent",
    "barrels_per_bbe_percent":"barrels_per_bbe_percent",

    # useful context (not fed directly to model here)
    "launch_speed":"launch_speed",
    "launch_angle":"launch_angle",
    "attack_angle":"attack_angle",
    "rate_ideal_attack_angle":"rate_ideal_attack_angle",
}

def _read_season(year, path):
    """Read one season CSV, normalise and alias its columns, and stamp `season`.

    The `season` column is always set to `year` (the DATA_PATHS key), replacing
    any season value present in the file.
    """
    season_df = pd.read_csv(path)
    season_df.columns = normalize_cols(season_df.columns)
    # Restrict the alias map to columns this particular file actually has.
    applicable = {old: new for old, new in RENAME.items() if old in season_df.columns}
    season_df = season_df.rename(columns=applicable)
    season_df["season"] = year
    return season_df


# One frame per configured season, stacked into a single long table.
dfs = [_read_season(year, path) for year, path in DATA_PATHS.items()]
df = pd.concat(dfs, ignore_index=True)

# Collapse to one row per (player_id, player_name, season) by taking the mean of
# every other column.
# NOTE(review): count-like columns (pa, bb, so, ...) are averaged, not summed,
# when a player-season appears more than once — confirm that is intended.
group_keys = ["player_id", "player_name", "season"]
agg = dict.fromkeys((c for c in df.columns if c not in group_keys), "mean")
df = df.groupby(group_keys, as_index=False).agg(agg)

# Basic filters: recent-enough season AND enough plate appearances, then a stable
# player/season ordering with a fresh 0..n-1 index.
keep = (df["season"] >= MIN_SEASON) & (df["pa"] >= MIN_PA_FOR_ROW)
df = (
    df.loc[keep]
    .sort_values(["player_id", "season"])
    .reset_index(drop=True)
)

print("Loaded columns:", sorted(df.columns.tolist()))
print("Seasons:", sorted(df["season"].unique().tolist()))
print("Rows:", len(df))


Loaded columns: ['abs', 'api_break_x_arm', 'api_break_x_batter_in', 'api_break_z_induced', 'api_break_z_with_gravity', 'arm_angle', 'attack_angle', 'attack_direction', 'ba', 'babip', 'barrels_per_bbe_percent', 'barrels_per_pa_percent', 'barrels_total', 'bat_speed', 'batter_run_value_per_100', 'bb', 'bb_percent', 'bbdist', 'bip', 'doubles', 'eff_min_vel', 'effective_speed', 'hardhit_percent', 'hits', 'hrs', 'hyper_speed', 'intercept_ball_minus_batter_pos_x_inches', 'intercept_ball_minus_batter_pos_y_inches', 'iso', 'k_percent', 'launch_angle', 'launch_speed', 'obp', 'pa', 'pitch_percent', 'pitcher_run_exp', 'pitcher_run_value_per_100', 'pitches', 'plate_x', 'plate_z', 'player_id', 'player_name', 'pos3_int_start_distance', 'pos4_int_start_distance', 'pos5_int_start_distance', 'pos6_int_start_distance', 'pos7_int_start_distance', 'pos8_int_start_distance', 'pos9_int_start_distance', 'rate_ideal_attack_angle', 'release_extension', 'release_pos_x', 'release_pos_z', 'run_exp', 'season', 'sin

In [34]:
def safe_div(numer, denom):
    """Divide ``numer`` by ``denom``, yielding NaN wherever ``denom`` is zero.

    Zero denominators are replaced by NaN before dividing, so no epsilon is
    needed: adding one afterwards would only bias every non-zero ratio.

    Parameters
    ----------
    numer : pandas.Series, array-like or scalar
        Numerator.
    denom : pandas.Series, pandas.DataFrame, array-like or scalar
        Denominator; zeros are treated as "undefined".

    Returns
    -------
    Same kind of object as ``numer / denom``, with NaN where ``denom == 0``.
    """
    if isinstance(denom, (pd.Series, pd.DataFrame)):
        safe_denom = denom.replace(0, np.nan)
    elif np.ndim(denom) == 0:
        # Plain scalar denominator.
        safe_denom = np.nan if denom == 0 else denom
    else:
        as_float = np.asarray(denom, dtype=float)
        safe_denom = np.where(as_float == 0, np.nan, as_float)
    return numer / safe_denom

def pct_to_rate(series):
    """Return ``series`` as a fraction, dividing by 100 if it looks like a percent.

    The check is a heuristic: the column is treated as percent-scaled when the
    mean of its non-null values exceeds 1.0. Otherwise an unchanged copy is
    returned. The input is never modified.
    """
    looks_like_percent = series.dropna().mean() > 1.0
    if looks_like_percent:
        return series / 100.0
    return series.copy()

# Ensure required base columns exist. A column missing from every season file is
# created as all-zero so the expressions below still evaluate; ratios that use
# it as a denominator then come out as NaN via safe_div.
for col in ["bb","so","whiffs","swings","takes","bip","barrels_total",
            "hardhit_percent","barrels_per_bbe_percent","pa"]:
    if col not in df.columns:
        df[col] = 0.0

# Convert percentage columns to rates
df["hardhit_rate"] = pct_to_rate(df["hardhit_percent"])
df["barrel_bbe_rate"] = pct_to_rate(df["barrels_per_bbe_percent"])

# Batted-balls proxy: the season-level `bip` column is used as-is.
df["batted_balls"] = df["bip"]

# --- Ratios / per-PA features (season level) ---
# Every ratio goes through safe_div so a zero denominator gives NaN instead of
# an arbitrary huge (or zero) value produced by dividing by a bare epsilon.

# Discipline / contact
df["bb_per_pa"]        = safe_div(df["bb"], df["pa"])
df["k_per_pa"]         = safe_div(df["so"], df["pa"])
df["bb_k_ratio"]       = safe_div(df["bb"], df["so"])

df["whiff_per_swing"]  = safe_div(df["whiffs"], df["swings"])
df["whiffs_per_pa"]    = safe_div(df["whiffs"], df["pa"])
df["swings_per_pa"]    = safe_div(df["swings"], df["pa"])
df["takes_per_pa"]     = safe_div(df["takes"],  df["pa"])

# Contact rate forms (contact = swings that were not whiffs)
contacts = df["swings"] - df["whiffs"]
df["contact_rate"]     = safe_div(contacts, df["swings"])
df["contact_per_pa"]   = safe_div(contacts, df["pa"])

# Power / quality per opportunity
batted_balls_per_pa = safe_div(df["batted_balls"], df["pa"])
df["hardhit_per_pa"]      = df["hardhit_rate"] * batted_balls_per_pa
df["barrels_per_pa"]      = safe_div(df["barrels_total"], df["pa"])
df["barrels_per_swing"]   = safe_div(df["barrels_total"], df["swings"])

# Power Index per PA = (HardHit% + Barrels/BBE%) * (batted_balls / PA)
df["power_index_per_pa"]  = (df["hardhit_rate"] + df["barrel_bbe_rate"]) * batted_balls_per_pa

ratio_cols = [
    "bb_per_pa","k_per_pa","bb_k_ratio",
    "whiff_per_swing","whiffs_per_pa","swings_per_pa","takes_per_pa",
    "contact_rate","contact_per_pa",
    "hardhit_per_pa","barrels_per_pa","barrels_per_swing","power_index_per_pa",
]

print("Built ratio features:", ratio_cols)
df[["player_name","season"] + ratio_cols].head()


Built ratio features: ['bb_per_pa', 'k_per_pa', 'bb_k_ratio', 'whiff_per_swing', 'whiffs_per_pa', 'swings_per_pa', 'takes_per_pa', 'contact_rate', 'contact_per_pa', 'hardhit_per_pa', 'barrels_per_pa', 'barrels_per_swing', 'power_index_per_pa']


Unnamed: 0,player_name,season,bb_per_pa,k_per_pa,bb_k_ratio,whiff_per_swing,whiffs_per_pa,swings_per_pa,takes_per_pa,contact_rate,contact_per_pa,hardhit_per_pa,barrels_per_pa,barrels_per_swing,power_index_per_pa
0,"Pujols, Albert",2021,0.037543,0.153584,0.244444,0.217391,0.392491,1.805461,2.027304,0.782609,1.412969,0.313993,0.071672,0.039698,0.385666
1,"Pujols, Albert",2022,0.077143,0.157143,0.490909,0.212575,0.405714,1.908571,2.048571,0.787425,1.502857,0.348571,0.088571,0.046407,0.437143
2,"Cabrera, Miguel",2021,0.076046,0.224335,0.338983,0.257711,0.492395,1.910646,1.958175,0.742289,1.418251,0.343129,0.057034,0.029851,0.40096
3,"Cabrera, Miguel",2022,0.05814,0.234884,0.247525,0.270303,0.518605,1.918605,1.902326,0.729697,1.4,0.269231,0.032558,0.01697,0.302007
4,"Cabrera, Miguel",2023,0.084011,0.197832,0.424658,0.256372,0.463415,1.807588,1.934959,0.743628,1.344173,0.268293,0.02981,0.016492,0.298103


In [14]:
# Stat Delta Building
# Names of the season-level columns for which year-over-year deltas are built.

# Ratio / per-opportunity features
RATIO_FEATURES = [
    # discipline
    "bb_per_pa", "k_per_pa", "bb_k_ratio",
    # swing-and-miss
    "whiff_per_swing", "whiffs_per_pa",
    # power per opportunity
    "hardhit_per_pa", "barrels_per_pa", "barrels_per_swing",
    "power_index_per_pa",
    # contact
    "contact_rate", "contact_per_pa",
    # swing decisions
    "takes_per_pa", "swings_per_pa",
]

# Raw skill features.
# NOTE(review): "launch_angle" is not listed here, so no launch_angle_delta is
# ever produced even though a later cell asks for it — confirm whether it
# should be added.
RAW_FEATURES = [
    "launch_speed",
    "hardhit_percent",
    "barrels_per_bbe_percent",
    "bat_speed",
    "attack_angle",
    "rate_ideal_attack_angle",
]

# PA context
CONTEXT_FEATURES = ["pa"]

ALL_FEATURES = [*RATIO_FEATURES, *RAW_FEATURES, *CONTEXT_FEATURES]

def compute_deltas(group, features=None, optimal_attack_angle=8):
    """Add previous-row values and deltas for one player's season rows.

    The rows are sorted by ``season``; for every feature column present,
    ``<col>_prev`` (value from the preceding row) and ``<col>_delta``
    (current minus previous) are added. The first row of the group gets NaN.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to a single player; must contain ``season``.
    features : list of str, optional
        Columns to difference. Defaults to ``RATIO_FEATURES + RAW_FEATURES``.
        Names that are not columns of ``group`` are skipped.
    optimal_attack_angle : float, optional
        Target used for ``attack_angle_toward_opt_delta`` (default 8).

    Returns
    -------
    pandas.DataFrame
        A sorted copy of ``group`` with the extra ``*_prev`` / ``*_delta``
        columns, ``attack_angle_toward_opt_delta`` (when attack angle was
        differenced) and ``pa_prev`` / ``pa_delta`` (when ``pa`` exists).
    """
    if features is None:
        features = RATIO_FEATURES + RAW_FEATURES

    g = group.sort_values("season").copy()

    # NOTE(review): "previous" is the preceding row that survived the earlier
    # filters, not necessarily season - 1; a player who skipped a season is
    # compared against the last season they do have. Confirm this is intended.
    for col in features:
        if col in g.columns:
            g[f"{col}_prev"] = g[col].shift(1)
            g[f"{col}_delta"] = g[col] - g[f"{col}_prev"]

    # Positive when the attack angle moved closer to the target, negative when
    # it moved away (reduction in absolute distance to the target).
    if {"attack_angle", "attack_angle_prev"}.issubset(g.columns):
        g["attack_angle_toward_opt_delta"] = (
            np.abs(g["attack_angle_prev"] - optimal_attack_angle)
            - np.abs(g["attack_angle"] - optimal_attack_angle)
        )

    # PA context (kept separate: it is exported but excluded from the score)
    if "pa" in g.columns:
        g["pa_prev"] = g["pa"].shift(1)
        g["pa_delta"] = g["pa"] - g["pa_prev"]

    return g

# Apply per player.
# The columns (including the grouping key) are selected explicitly: relying on
# groupby.apply to pass the grouping column implicitly is deprecated in recent
# pandas, and dropping it would lose `player_id` from the result.
df_d = (
    df.groupby("player_id", group_keys=False)[df.columns.tolist()]
    .apply(compute_deltas)
)



  df_d = df.groupby("player_id", group_keys=False).apply(compute_deltas)


In [36]:
# Build Process Score + Identify Past Breakouts
# 1) Gather every "*_delta" column; pa_delta is playing-time context and is
#    kept out of the score.
all_delta_features = sorted(c for c in df_d.columns if c.endswith("_delta"))
score_delta_features = [c for c in all_delta_features if c != "pa_delta"]

print("Delta features (total):", len(all_delta_features))
print("Delta features used in process_score (excl. pa_delta):", len(score_delta_features))

# 2) Z-score by season for process features only
def zscore_by_season(frame, cols):
    """Return a copy of ``frame`` with ``cols`` standardised within each season.

    For each column the per-season mean and standard deviation are taken over
    the non-null values. Missing values are then treated as 0.0 *before*
    centring, so a row without a value gets ``(0 - mean) / std`` rather than 0.
    A season whose std is 0 (or undefined) yields NaN z-scores, which are
    replaced by 0.0.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a ``season`` column and every name in ``cols``.
    cols : iterable of str
        Columns to overwrite with their z-scores.

    Returns
    -------
    pandas.DataFrame
        New frame; ``frame`` itself is left untouched.
    """
    result = frame.copy()
    for col in cols:
        per_season = result.groupby("season")[col]
        center = per_season.transform("mean")
        # A zero spread would divide by zero; turn it into NaN instead.
        spread = per_season.transform("std").replace(0, np.nan)
        standardized = (result[col].fillna(0.0) - center) / spread
        result[col] = standardized.fillna(0.0)
    return result

df_z = zscore_by_season(df_d.copy(), score_delta_features)

# 3) Build process_score (equal-weight sum of z-scored deltas)
# NOTE(review): every delta enters with a positive sign, including ones such as
# k_per_pa_delta / whiff_per_swing_delta where an increase may be unfavourable —
# confirm this is intended.
df_z["process_score"] = df_z[score_delta_features].sum(axis=1)

# 4) Label breakouts: top percentile AND absolute z threshold (if set)
eligible = (df_z["pa"] >= MIN_PA_FOR_LABEL)

# Per-season cutoff, broadcast back onto every row of that season. The cutoff is
# computed over all rows of the season; eligibility is applied afterwards.
season_cutoff = df_z.groupby("season")["process_score"].transform(
    lambda scores: np.percentile(scores, 100 * BREAKOUT_PERCENTILE)
)
is_breakout = df_z["process_score"] >= season_cutoff
if ABS_Z_THRESHOLD is not None:
    is_breakout = is_breakout & (df_z["process_score"] >= ABS_Z_THRESHOLD)

df_z["breakout_label"] = (is_breakout & eligible).astype(int)
df_z["labeled_year"] = df_z["season"]

print("Label balance by season:")
print(df_z.groupby("season")["breakout_label"].value_counts().unstack(fill_value=0))

# 5) Export: IDs, PA context, the score/label columns and every "*_delta" column.
#    The delta columns used in process_score hold their per-season z-scores at
#    this point (zscore_by_season overwrote them); pa_delta is still raw.
export_cols = (
    ["player_id","player_name","season","pa","pa_prev","pa_delta",
     "process_score","breakout_label","labeled_year"]
    + all_delta_features
)

existing = [c for c in export_cols if c in df_z.columns]
missing  = [c for c in export_cols if c not in df_z.columns]
if missing:
    print("Note: missing columns skipped ->", missing)

# NOTE(review): this path is hard-coded and differs from OUTPUT_PATH in the
# configuration cell — confirm which one is intended.
OUTFILE = "../outputs/combined_breakouts_and_nonbreakouts_2022_2025_ratio_plus_raw.csv"
# to_csv does not create missing directories; make sure the target exists so a
# fresh checkout does not fail at the very last step.
Path(OUTFILE).parent.mkdir(parents=True, exist_ok=True)
df_z[existing].to_csv(OUTFILE, index=False)
print(f"Saved to: {OUTFILE} | Features exported: {len(all_delta_features)} | Rows: {len(df_z)}")


Delta features (total): 21
Delta features used in process_score (excl. pa_delta): 20
Label balance by season:
breakout_label    0   1
season                 
2021            463   0
2022            416  53
2023            409  51
2024            406  49
2025            408  53
Saved to: ../outputs/combined_breakouts_and_nonbreakouts_2022_2025_ratio_plus_raw.csv | Features exported: 21 | Rows: 2308


In [40]:
#   Archetype Weighting from Known Breakouts (Last, First Format)


print("\n=== Building Archetype Weighting from Known Breakouts ===")


def normalize_name(name):
    """Reduce a player name to its lower-case ASCII letters only.

    Everything that is not ``a``-``z`` after lower-casing (spaces, commas,
    periods, digits, accented letters, ...) is dropped, so
    ``"Robert Jr., Luis"`` becomes ``"robertjrluis"``. Missing values map to
    the empty string.
    """
    if pd.isna(name):
        return ""
    return "".join(ch for ch in name.lower() if "a" <= ch <= "z")

df_z["player_name_norm"] = df_z["player_name"].map(normalize_name)

# Hand-picked breakout players and the archetype assigned to each. Keys are the
# normalised form of the "Last, First" names, i.e. letters only, last name first.
archetype_map = {
    "caminerojunior": "power_maturation",
    "greeneriley": "power_maturation",
    "robertjrluis": "raw_power",
    "duranjarren": "contact_efficiency",
    "doylebrenton": "contact_efficiency",
    "perdomogeraldo": "approach_refinement",
    "crawfordjp": "approach_lift_refinement",
    "raleighcal": "sustained_power",
}

# Players not in the map get NaN as their archetype_label.
df_z["archetype_label"] = df_z["player_name_norm"].map(archetype_map)

# Sanity check: list each distinct (player, archetype) pair that was matched.
has_archetype = df_z["archetype_label"].notna()
matched = df_z.loc[has_archetype, ["player_name", "archetype_label"]].drop_duplicates()
if not matched.empty:
    print("✅ Matched known breakout players:")
    display(matched)
else:
    print("⚠️ Still no matches — check spelling below:")
    print(df_z["player_name"].head(30))

# Core process features for archetype weighting. Any name that is not a column
# of df_z is silently dropped by the filter below.
candidate_features = [
    "barrels_per_bbe_percent_delta", "hardhit_percent_delta",
    "launch_angle_delta", "attack_angle_delta",
    "rate_ideal_attack_angle_delta", "barrels_per_pa_delta",
    "barrels_per_swing_delta", "hardhit_per_pa_delta",
    "bb_per_pa_delta", "k_per_pa_delta",
]
process_features = [c for c in candidate_features if c in df_z.columns]

# Rows (all seasons) of the players that carry an archetype label.
labeled_rows = df_z["archetype_label"].notna()
df_archetypes = df_z.loc[labeled_rows, ["player_name", "archetype_label"] + process_features]
if df_archetypes.empty:
    raise ValueError("No archetype examples found — check that names match the CSV format exactly.")

# Mean feature vector per archetype, scaled so the absolute weights of each
# archetype sum to 1.
archetype_means = (
    df_archetypes
    .groupby("archetype_label")[process_features]
    .mean()
    .fillna(0)
)
abs_total = archetype_means.abs().sum(axis=1)
archetype_weights = archetype_means.div(abs_total, axis=0)

print(f"✅ Derived weight vectors for {len(archetype_weights)} archetypes:")
display(archetype_weights.round(3))

# Per-player similarity to each archetype: weighted sum of the process features,
# then min-max scaled to 0–1 across all rows (the 1e-9 guards a constant column).
score_cols = []
for arch, w in archetype_weights.iterrows():
    col = f"score_{arch}"
    raw = (df_z[process_features] * w).sum(axis=1)
    lo, hi = raw.min(), raw.max()
    df_z[col] = (raw - lo) / (hi - lo + 1e-9)
    score_cols.append(col)

# Dominant archetype = the archetype with the highest scaled score;
# breakout_potential_index = plain average of all scaled scores.
df_z["dominant_archetype"] = df_z[score_cols].idxmax(axis=1).str.replace("score_", "")
df_z["breakout_potential_index"] = df_z[score_cols].mean(axis=1)

# Export ML-ready file: identifiers, label, scores and the process features.
ml_cols = (
    ["player_id", "player_name", "season", "breakout_label",
     "process_score", "breakout_potential_index", "dominant_archetype"]
    + score_cols + process_features
)
# Columns that are absent from df_z are skipped rather than raising.
existing = [c for c in ml_cols if c in df_z.columns]
OUTFILE_ML = "../artifacts/ml_training_base.csv"
# to_csv does not create missing directories; create the target folder first so
# the export works on a fresh checkout.
Path(OUTFILE_ML).parent.mkdir(parents=True, exist_ok=True)
df_z[existing].to_csv(OUTFILE_ML, index=False)

print(f"✅ ML training base saved to: {OUTFILE_ML}")
print(f"Columns exported: {len(existing)} | Rows: {len(df_z)}")



=== Building Archetype Weighting from Known Breakouts ===
✅ Matched known breakout players:


Unnamed: 0,player_name,archetype_label
923,"Crawford, J.P.",approach_lift_refinement
1369,"Raleigh, Cal",sustained_power
1876,"Perdomo, Geraldo",approach_refinement
1893,"Robert Jr., Luis",raw_power
2033,"Duran, Jarren",contact_efficiency
2119,"Greene, Riley",power_maturation
2162,"Doyle, Brenton",contact_efficiency
2225,"Caminero, Junior",power_maturation


✅ Derived weight vectors for 6 archetypes:


Unnamed: 0_level_0,barrels_per_bbe_percent_delta,hardhit_percent_delta,attack_angle_delta,rate_ideal_attack_angle_delta,barrels_per_pa_delta,barrels_per_swing_delta,hardhit_per_pa_delta,bb_per_pa_delta,k_per_pa_delta
archetype_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
approach_lift_refinement,0.108,0.089,-0.154,-0.181,0.101,0.094,0.03,0.161,0.082
approach_refinement,0.124,0.111,-0.031,0.002,0.142,0.159,0.131,0.126,-0.174
contact_efficiency,0.094,0.113,-0.038,0.057,0.115,0.126,0.151,0.108,-0.197
power_maturation,0.228,0.07,0.122,0.125,0.23,0.138,0.066,0.016,0.005
raw_power,-0.069,-0.074,-0.014,0.008,-0.132,-0.095,-0.192,0.234,0.183
sustained_power,0.178,0.119,0.036,-0.043,0.166,0.15,0.096,0.136,-0.076


✅ ML training base saved to: ../artifacts/ml_training_base.csv
Columns exported: 22 | Rows: 2308


In [42]:
# EXPORT
# NOTE(review): this repeats the export done right after labelling and writes
# the same columns to the same file — confirm both are needed.

# Collect all delta features (ratios + raw-skill + toward-opt)
all_delta_features = sorted([c for c in df_z.columns if c.endswith("_delta")])

export_cols = (
    ["player_id","player_name","season","pa","pa_prev","pa_delta",
     "process_score","breakout_label","labeled_year"]
    + all_delta_features
)

# sanity check
missing = [c for c in export_cols if c not in df_z.columns]
if missing:
    print("Missing columns (will be skipped):", missing)

existing = [c for c in export_cols if c in df_z.columns]

OUTFILE = "../outputs/combined_breakouts_and_nonbreakouts_2022_2025_ratio_plus_raw.csv"
# to_csv does not create missing directories; make sure the target exists.
Path(OUTFILE).parent.mkdir(parents=True, exist_ok=True)
df_z[existing].to_csv(OUTFILE, index=False)

print(f"Saved to: {OUTFILE}")
print("Rows:", len(df_z), "| Features exported:", len(all_delta_features))


Saved to: ../outputs/combined_breakouts_and_nonbreakouts_2022_2025_ratio_plus_raw.csv
Rows: 2308 | Features exported: 21
