In [1]:
# =====================================================
# YAMAMOTO V3 DATASET PREPROCESSING SCRIPT (2025, NO SPLIT)
# =====================================================

import pandas as pd
import numpy as np
from pybaseball import statcast_pitcher

# ----------------------------------------
# 1. Load raw Statcast for Yamamoto (2025)
# ----------------------------------------

# MLBAM player id passed to the Statcast lookup (Yoshinobu Yamamoto).
PITCHER_ID = 808967

# Date window handed to statcast_pitcher as (start, end) strings.
start_date, end_date = "2025-03-01", "2025-11-16"

print(f"Downloading Statcast data for pitcher {PITCHER_ID} from {start_date} to {end_date}...")
df = statcast_pitcher(start_date, end_date, PITCHER_ID)

# Drop rows that carry no pitch_type; every later feature keys off that column.
has_pitch_type = df["pitch_type"].notna()
df = df.loc[has_pitch_type].copy()

# Order rows by game, then plate appearance, then pitch number, and renumber
# the index so positional order matches the sort order.
pitch_order = ["game_pk", "at_bat_number", "pitch_number"]
df = df.sort_values(by=pitch_order)
df = df.reset_index(drop=True)

print("Rows after filtering null pitch_type:", len(df))


# ----------------------------------------
# 2. Identifiers & sequence indices
# ----------------------------------------

# A plate appearance is identified by (game, at-bat number); later sections
# reuse this key for every within-PA shift.
group_cols = ["game_pk", "at_bat_number"]

# Zero-based position of each pitch inside its plate appearance.
df["pitch_in_pa"] = df.groupby(by=group_cols).cumcount()

# (pitch_in_game is intentionally not produced – time drift, low strategic value)


# ----------------------------------------
# 3. Target encoding: pitch_type_idx
# ----------------------------------------

# Alphabetically sorted pitch codes define the integer label space.
pitch_types = sorted(df["pitch_type"].unique())
pitch2idx = dict(zip(pitch_types, range(len(pitch_types))))
idx2pitch = dict(enumerate(pitch_types))

df["pitch_type_idx"] = df["pitch_type"].map(pitch2idx)


# ----------------------------------------
# 4. Batter handedness encoding
# ----------------------------------------

# Sorted distinct non-null batter sides; rows with a null `stand` map to NaN.
stands = sorted(df["stand"].dropna().unique())
stand2idx = dict(zip(stands, range(len(stands))))
idx2stand = dict(enumerate(stands))

df["batter_hand_idx"] = df["stand"].map(stand2idx)


# ----------------------------------------
# 5. Previous pitch types (1-back + 2-back)
# ----------------------------------------

# Lag the pitch type by one and two pitches inside each plate appearance.
# Both lags are computed before any column is written back to df.
pitch_type_by_pa = df.groupby(group_cols)["pitch_type"]
one_back = pitch_type_by_pa.shift(1)
two_back = pitch_type_by_pa.shift(2)

# Positions with no earlier pitch in the PA get the sentinel "START".
df["prev_pitch_type"] = one_back.fillna("START")
df["prev_prev_pitch_type"] = two_back.fillna("START")

# Index space for the 1-back feature: "START" is 0, real pitch codes follow.
prev_types = ["START", *pitch_types]
prev_pitch2idx = dict(zip(prev_types, range(len(prev_types))))
df["prev_pitch_idx"] = df["prev_pitch_type"].map(prev_pitch2idx)


# ----------------------------------------
# 6. Previous pitch result (coarse)
# ----------------------------------------

def coarse_result(desc, pitch_type_code, events):

    if isinstance(desc, str):
        d = desc.lower()
    else:
        d = ""

    if "hit by pitch" in d:
        return "hit_by_pitch"
    if "foul" in d:
        return "foul"
    if "swinging_strike" in d:
        return "swinging_strike"
    if "called_strike" in d:
        return "called_strike"
    if "ball" in d and "blocked" not in d:
        return "ball"
    if "in play" in d:
        return "in_play"

    if isinstance(events, str) and events != "":
        return "in_play"

    if isinstance(pitch_type_code, str):
        t = pitch_type_code.upper()
        if t == "B":
            return "ball"
        if t == "S":
            return "called_strike"
        if t == "X":
            return "in_play"

    return "other"


def _label_pitch_row(row):
    """Run coarse_result on one pitch row; absent columns are passed as None."""
    description = row.get("description", None)
    result_code = row.get("type", None)
    pa_event = row.get("events", None)
    return coarse_result(description, result_code, pa_event)


# Row-wise labelling of every pitch.
df["coarse_result"] = df.apply(_label_pitch_row, axis=1)

# Outcome of the preceding pitch in the same plate appearance; the first
# pitch of a PA has none and is tagged "START".
prior_result = df.groupby(group_cols)["coarse_result"].shift(1)
df["prev_pitch_result"] = prior_result.fillna("START")

# Integer codes follow the sorted order of the labels that actually occur.
result_categories = sorted(df["prev_pitch_result"].unique())
result2idx = dict(zip(result_categories, range(len(result_categories))))

df["prev_pitch_result_idx"] = df["prev_pitch_result"].map(result2idx)


# ----------------------------------------
# 7. Numeric context (V1 core features)
# ----------------------------------------

# Cast the count/inning context columns to plain integers.
for int_col in ("balls", "strikes", "outs_when_up", "inning"):
    df[int_col] = df[int_col].astype(int)

# 1 when the half-inning label is exactly "Top", else 0.
df["is_top_inning"] = df["inning_topbot"].eq("Top").astype(int)

# Base occupancy: 1 when the corresponding on_* column is non-null.
for base in ("1b", "2b", "3b"):
    df[f"on_{base}_flag"] = df[f"on_{base}"].notna().astype(int)


# ----------------------------------------
# ✅ 8. Count leverage features (V2)
# ----------------------------------------

two_strikes = df["strikes"].eq(2)

# Two strikes with at most one ball.
df["pitcher_ahead_flag"] = (two_strikes & df["balls"].le(1)).astype(int)
# At least two balls and more balls than strikes.
df["hitter_ahead_flag"] = (df["balls"].ge(2) & df["balls"].gt(df["strikes"])).astype(int)
# Any two-strike count.
df["putaway_count_flag"] = two_strikes.astype(int)


# ----------------------------------------
# ✅ 9. Platoon advantage (Yamamoto = RHP)
# ----------------------------------------

# 1 when the batter stands left-handed, 0 otherwise.
# NOTE(review): for a right-handed pitcher this marks the opposite-hand
# matchup; confirm that is the intended meaning of "advantage".
df["platoon_adv"] = df["stand"].eq("L").astype(int)


# ----------------------------------------
# ✅ 10. Fastball exposure & tunneling (V2)
# ----------------------------------------

# Pitch codes counted as "fastball-like" for the exposure/tunneling features.
fastball_like = {"FF", "FC", "FS", "SI", "SF"}
is_fastball_like = df["pitch_type"].isin(fastball_like).astype(int)

# Inclusive running count of fastball-like pitches within each plate
# appearance (game_pk, at_bat_number).
fastballs_through_pitch = is_fastball_like.groupby(
    [df["game_pk"], df["at_bat_number"]]
).cumsum()

# Number of fastball-like pitches thrown BEFORE the current pitch in the same
# PA. Subtracting the current pitch's own indicator gives an exclusive count
# that is 0 on the first pitch of every PA. The earlier `.cumsum().shift(1)`
# shifted the whole Series rather than each group, so the first pitch of a PA
# inherited the final count of the previous PA.
df["fastballs_in_pa"] = (fastballs_through_pitch - is_fastball_like).astype(int)

# "START" is not a fastball code, so these are False wherever the PA has
# fewer than one / two earlier pitches.
prev_fastball = df["prev_pitch_type"].isin(fastball_like)
prev_prev_fastball = df["prev_prev_pitch_type"].isin(fastball_like)

# 1 only when both of the two preceding pitches in the PA were fastball-like.
df["last_two_fastballs_flag"] = (prev_fastball & prev_prev_fastball).astype(int)


# ----------------------------------------
# ✅ 11. RISP pressure (V2)
# ----------------------------------------

# Runner in scoring position: second or third base occupied.
runner_on_2b = df["on_2b_flag"].eq(1)
runner_on_3b = df["on_3b_flag"].eq(1)
df["risp_flag"] = (runner_on_2b | runner_on_3b).astype(int)


# ----------------------------------------
# 12. Score differential from Dodgers POV
# ----------------------------------------

DODGERS_ABBR = "LAD"

for score_col in ("home_score", "away_score"):
    df[score_col] = df[score_col].astype(int)

# Pick each side's score according to whether LAD is listed as the home team.
dodgers_at_home = df["home_team"].eq(DODGERS_ABBR)
df["dodgers_score"] = np.where(dodgers_at_home, df["home_score"], df["away_score"])
df["opponent_score"] = np.where(dodgers_at_home, df["away_score"], df["home_score"])

# Positive when the Dodgers lead, negative when they trail.
df["score_diff_pov"] = df["dodgers_score"].sub(df["opponent_score"])


# ----------------------------------------
# ✅ 13. High leverage flag (V2)
# ----------------------------------------

# Inning 7 or later with the score within one run either way.
late_inning = df["inning"].ge(7)
close_game = df["score_diff_pov"].abs().le(1)
df["high_leverage_flag"] = (late_inning & close_game).astype(int)


# ----------------------------------------
# ✅ 14. NEW (V3): Previous pitch physics
# ----------------------------------------

# Physics columns to lag. Each must already be present in df (a missing
# column raises KeyError); values that cannot be parsed become NaN.
physics_cols = ["release_speed", "pfx_x", "pfx_z"]

for col in physics_cols:
    prev_col = f"prev_{col}"

    # Coerce the raw measurement to numeric.
    df[col] = pd.to_numeric(df[col], errors="coerce")

    # Value from the preceding pitch in the same plate appearance.
    lagged = df.groupby(group_cols)[col].shift(1)

    # Gaps (first pitch of a PA, or an unparseable previous value) are filled
    # with the mean of the lagged values across the whole dataset.
    mean_val = lagged.mean()
    df[prev_col] = lagged.fillna(mean_val)

# Mean release speed over all four-seam ("FF") pitches in the dataset.
ff_mask = df["pitch_type"].eq("FF")
ff_mean_speed = df.loc[ff_mask, "release_speed"].mean()

# With no usable FF speeds the mean is NaN; fall back to the overall mean.
if np.isnan(ff_mean_speed):
    ff_mean_speed = df["release_speed"].mean()

# Previous pitch speed expressed relative to the four-seam baseline.
df["prev_speed_minus_ff_mean"] = df["prev_release_speed"].sub(ff_mean_speed)


# ----------------------------------------
# ✅ 15. Keep only V3 columns
# ----------------------------------------

# Output schema, assembled from named groups; the concatenation order below
# is the column order written to disk.
_identifier_cols = [
    "game_pk",
    "at_bat_number",
    "pitch_in_pa",
    "batter",
    "home_team",
    "away_team",
]
_target_cols = [
    "pitch_type",
    "pitch_type_idx",
]
_categorical_cols = [
    "stand",
    "batter_hand_idx",
    "prev_pitch_type",
    "prev_pitch_idx",
    "prev_prev_pitch_type",
    "prev_pitch_result",
    "prev_pitch_result_idx",
]
_numeric_context_cols = [
    "balls",
    "strikes",
    "outs_when_up",
    "inning",
    "is_top_inning",
    "on_1b_flag",
    "on_2b_flag",
    "on_3b_flag",
    "score_diff_pov",
]
_v2_flag_cols = [
    "pitcher_ahead_flag",
    "hitter_ahead_flag",
    "putaway_count_flag",
    "platoon_adv",
    "fastballs_in_pa",
    "last_two_fastballs_flag",
    "risp_flag",
    "high_leverage_flag",
]
_v3_physics_cols = [
    "prev_release_speed",
    "prev_pfx_x",
    "prev_pfx_z",
    "prev_speed_minus_ff_mean",
]

v3_cols = [
    *_identifier_cols,
    *_target_cols,
    *_categorical_cols,
    *_numeric_context_cols,
    *_v2_flag_cols,
    *_v3_physics_cols,
]

# Independent copy restricted to the V3 schema.
df_v3 = df.loc[:, v3_cols].copy()

print("Final v3 dataset shape:", df_v3.shape)
print(df_v3.head())


# ----------------------------------------
# ✅ 16. Save V3 CSV
# ----------------------------------------

# Written relative to the current working directory, without the row index.
output_path = "yamamoto_v3_pitches_2025.csv"
df_v3.to_csv(output_path, index=False)

print("Saved v3 dataset to:", output_path)

Downloading Statcast data for pitcher 808967 from 2025-03-01 to 2025-11-16...
Gathering Player Data
Rows after filtering null pitch_type: 3437
Final v3 dataset shape: (3437, 36)
   game_pk  at_bat_number  pitch_in_pa  batter home_team away_team pitch_type  \
0   776185              4            0  672695        AZ       LAD         FC   
1   776185              4            1  672695        AZ       LAD         FS   
2   776185              4            2  672695        AZ       LAD         FS   
3   776185              4            3  672695        AZ       LAD         FS   
4   776185              5            0  606466        AZ       LAD         CU   

   pitch_type_idx stand  batter_hand_idx  ... putaway_count_flag  platoon_adv  \
0               1     L                0  ...                  0            1   
1               3     L                0  ...                  0            1   
2               3     L                0  ...                  0            1   
3          