In [204]:
import pandas as pd
import numpy as np
from pybaseball import statcast_pitcher, playerid_lookup, statcast_batter

In [206]:
ids = pd.read_csv("mlbids.csv")

In [208]:
# get pitcher data on yamamoto in just the 2025 season
START = "2025-03-01"
END   = "2025-11-20"
player = playerid_lookup('yamamoto', 'yoshinobu')
print(player)

  name_last name_first  key_mlbam key_retro  key_bbref  key_fangraphs  \
0  yamamoto  yoshinobu     808967  yamay001  yamamyo01          33825   

   mlb_played_first  mlb_played_last  
0            2024.0           2025.0  


In [210]:
df = statcast_pitcher(START, END, 808967)

Gathering Player Data


In [211]:
df

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,batter_days_until_next_game,api_break_z_with_gravity,api_break_x_arm,api_break_x_batter_in,arm_angle,attack_angle,attack_direction,swing_path_tilt,intercept_ball_minus_batter_pos_x_inches,intercept_ball_minus_batter_pos_y_inches
0,FS,2025-11-01,92.1,-2.02,5.15,"Yamamoto, Yoshinobu",672386,808967,grounded_into_double_play,hit_into_play,...,,2.33,0.48,0.48,36.9,8.375233,-7.664043,17.979522,48.911447,37.014581
1,CU,2025-11-01,80.3,-1.70,5.45,"Yamamoto, Yoshinobu",672386,808967,,called_strike,...,,4.85,-1.36,-1.36,48.1,,,,,
2,FC,2025-11-01,92.8,-1.91,5.24,"Yamamoto, Yoshinobu",672386,808967,,foul,...,,2.04,-0.23,-0.23,39.1,14.162806,5.694338,22.383742,47.652657,23.055298
3,FS,2025-11-01,90.9,-1.97,5.26,"Yamamoto, Yoshinobu",680718,808967,walk,blocked_ball,...,,2.74,0.69,-0.69,40.1,,,,,
4,FS,2025-11-01,90.8,-2.02,5.27,"Yamamoto, Yoshinobu",680718,808967,,ball,...,,2.30,0.23,-0.23,38.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3435,FS,2025-03-04,90.7,-1.60,5.59,"Yamamoto, Yoshinobu",680574,808967,,blocked_ball,...,,2.55,0.95,0.95,,,,,,
3436,FF,2025-03-04,95.8,-1.58,5.66,"Yamamoto, Yoshinobu",680574,808967,,foul,...,,1.11,0.60,0.60,,,,,,
3437,CU,2025-03-04,75.4,-1.27,5.89,"Yamamoto, Yoshinobu",680574,808967,,ball,...,,5.49,-0.68,-0.68,,,,,,
3438,FF,2025-03-04,96.8,-1.62,5.75,"Yamamoto, Yoshinobu",670770,808967,double,hit_into_play,...,,0.88,0.73,-0.73,,,,,,


In [212]:
# get only regular and post season games
if "game_type" in df.columns:
    df = df[df["game_type"].isin(["R", "P"])].copy()

In [213]:
df = pd.merge(df, ids, left_on='batter', right_on='MLBID', how='left').copy()

In [214]:
# rename pitcher team score and opponent team score
df["pitcher_team"] = df["fld_score"]
df["opponent"] = df["bat_score"]

In [215]:
# change types
df["balls"]   = df["balls"].astype(int)
df["strikes"] = df["strikes"].astype(int)
df["outs"]    = df["outs_when_up"].astype(int)
df["inning"]  = df["inning"].astype(int)
df["top_bot"] = (df["inning_topbot"] == "Top").astype(int)

# change name of batter name and id column
df["batter_name"] = df["PLAYERNAME"]
df["batter_id"] = df["batter"]

In [216]:
# change pitcher and batter hand column names to be more intuitive
df["p_hand"] = df["p_throws"]
df["b_hand"] = df["stand"]

df["same_hand_matchup"] = (df["p_hand"] == df["b_hand"]).astype(int)

In [217]:
# total number of runners on bases
df["num_runners_onb"] = df[["on_1b", "on_2b", "on_3b"]].sum(axis=1)

df[['on_3b', 'on_2b', 'on_1b']] = df[['on_3b', 'on_2b', 'on_1b']].notna()

In [218]:
# get previous pitch data
df["prev_pitch_type"] = (
    df.groupby(["game_pk", "at_bat_number"])["pitch_type"].shift(1)
)
df["prev_pitch_result"] = (
    df.groupby(["game_pk", "at_bat_number"])["description"]
      .shift(1)
)

In [219]:
# get number of pitches thrown in that game
df["p_pitch_count"] = df.groupby(["game_pk"])["pitch_type"].cumcount() + 1

In [220]:
cols = [
    'game_pk', 'game_date', 'at_bat_number',
    'pitch_number', 'pitch_type',
    'batter_name', 'batter_id',
    'p_hand', 'b_hand',
    'same_hand_matchup',
    'balls', 'strikes',
    'outs', 'on_3b',
    'on_2b','on_1b',
    'inning', 'top_bot',
    'p_pitch_count',
    'pitcher_team', 'opponent',
    'prev_pitch_type', 'prev_pitch_result'
]

In [221]:
df = df[cols].copy()
df = df.dropna()

In [222]:
df.to_csv('yama_pitching_data', index=False)

In [223]:
df

Unnamed: 0,game_pk,game_date,at_bat_number,pitch_number,pitch_type,batter_name,batter_id,p_hand,b_hand,same_hand_matchup,...,on_3b,on_2b,on_1b,inning,top_bot,p_pitch_count,pitcher_team,opponent,prev_pitch_type,prev_pitch_result
1,776185,2025-09-25,57,5,SI,Blaze Alexander,677942,R,R,1,...,False,False,True,6,0,2,8,0,SL,hit_into_play
2,776185,2025-09-25,57,4,FF,Blaze Alexander,677942,R,R,1,...,False,False,True,6,0,3,8,0,SI,ball
3,776185,2025-09-25,57,3,FF,Blaze Alexander,677942,R,R,1,...,False,False,True,6,0,4,8,0,FF,foul
4,776185,2025-09-25,57,2,FS,Blaze Alexander,677942,R,R,1,...,False,False,True,6,0,5,8,0,FF,ball
5,776185,2025-09-25,57,1,SI,Blaze Alexander,677942,R,R,1,...,False,False,True,6,0,6,8,0,FS,ball
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2785,778563,2025-03-18,5,1,FF,Seiya Suzuki,673548,R,R,1,...,False,False,True,1,0,67,0,0,FS,ball
2787,778563,2025-03-18,4,5,FF,Ian Happ,664023,R,L,0,...,False,False,False,1,0,69,0,0,FF,ball
2788,778563,2025-03-18,4,4,FF,Ian Happ,664023,R,L,0,...,False,False,False,1,0,70,0,0,FF,called_strike
2789,778563,2025-03-18,4,3,FF,Ian Happ,664023,R,L,0,...,False,False,False,1,0,71,0,0,FF,called_strike


In [224]:
all_rows = []

for bid in unique_batters: #Get all unique batters Yamamoto faced
    bf = statcast_batter('2025-03-01', '2025-10-01', bid)
    if bf.empty:
        continue

    bf = bf[bf['events'].notna()].copy() #Remove entries that had no results

    #Mark hits, etc.
    tb_map = {'single':1,'double':2,'triple':3,'home_run':4}

    bf['is_hit'] = bf['events'].isin(tb_map.keys())
    bf['total_bases'] = bf['events'].map(tb_map).fillna(0)
    bf['is_walk'] = bf['events'].isin(['walk','intent_walk'])
    bf['is_hbp']  = bf['events'] == 'hit_by_pitch'
    bf['is_sf']   = bf['events'] == 'sac_fly'

    bf['AB'] = (~bf['is_walk'] & ~bf['is_hbp'] & ~bf['is_sf']).astype(int)

    grouped = bf.groupby('pitch_type').agg( #Calculate stats 
        AB=('AB','sum'),
        H=('is_hit','sum'),
        BB=('is_walk','sum'),
        HBP=('is_hbp','sum'),
        SF=('is_sf','sum'),
        TB=('total_bases','sum')
    ).reset_index()

    grouped['batter_id'] = bid
    all_rows.append(grouped)

#Combine results
full = pd.concat(all_rows, ignore_index=True)

full['AVG'] = full['H'] / full['AB'].replace(0, pd.NA)
full['OBP'] = (full['H'] + full['BB'] + full['HBP']) / \
              (full['AB'] + full['BB'] + full['HBP'] + full['SF']).replace(0, pd.NA)
full['SLG'] = full['TB'] / full['AB'].replace(0, pd.NA)

final_df = full.pivot(
    index='batter_id',
    columns='pitch_type',
    values=['AVG','OBP','SLG']
)

final_df.columns = [f"{stat}_{ptype}" for (stat, ptype) in final_df.columns]
final_df = final_df.reset_index()

#Add the names to table, given batter ID
name_map = (
    df[['batter_id','batter_name']]
      .drop_duplicates()
      .set_index('batter_id')['batter_name']
      .to_dict()
)

final_df['batter_name'] = final_df['batter_id'].map(name_map)

cols = ['batter_name','batter_id'] + [c for c in final_df.columns if c not in ['batter_name','batter_id']]
final_df = final_df[cols]




Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering 

In [225]:
final_df

Unnamed: 0,batter_name,batter_id,AVG_CH,AVG_CS,AVG_CU,AVG_EP,AVG_FA,AVG_FC,AVG_FF,AVG_FO,...,SLG_FO,SLG_FS,SLG_KC,SLG_KN,SLG_PO,SLG_SC,SLG_SI,SLG_SL,SLG_ST,SLG_SV
0,Martin Maldonado,455117,0.0,1.0,0.090909,,,0.272727,0.2,,...,,0.5,0.0,,,,0.285714,0.275862,0.5,
1,Andrew McCutchen,457705,0.289474,,0.236842,1.0,,0.384615,0.184713,,...,,0.692308,0.5,,,,0.311927,0.371795,0.170732,
2,Tommy Pham,502054,0.310345,0.0,0.142857,,0.0,0.272727,0.239316,,...,,0.727273,0.0,,,,0.354167,0.416667,0.125,0.5
3,Paul Goldschmidt,502671,0.293103,,0.285714,,1.0,0.28,0.308725,,...,,0.3125,0.0,,,,0.444444,0.426667,0.225806,0.333333
4,Travis D'Arnaud,518595,0.1875,,0.0,,,0.058824,0.191781,,...,,0.0,0.0,,,,0.361111,0.485714,0.647059,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,Chase Meidroth,805367,0.152174,,0.315789,,0.0,0.263158,0.304636,,...,,0.333333,0.4,,,,0.320388,0.231707,0.166667,0.0
205,Jacob Wilson,805779,0.410256,0.0,0.166667,,0.0,0.314286,0.315315,,...,,0.461538,1.0,,,,0.371069,0.641304,0.176471,0.0
206,Bryce Eldridge,805811,0.0,,0.333333,,,0.0,0.083333,,...,,0.0,0.0,,,,2.0,0.0,0.0,
207,Matt Shaw,807713,0.261905,,0.190476,0.0,0.0,0.304348,0.201835,,...,,1.714286,0.0,,,,0.25,0.4,0.363636,0.4


In [226]:
final_df[:24]

Unnamed: 0,batter_name,batter_id,AVG_CH,AVG_CS,AVG_CU,AVG_EP,AVG_FA,AVG_FC,AVG_FF,AVG_FO,...,SLG_FO,SLG_FS,SLG_KC,SLG_KN,SLG_PO,SLG_SC,SLG_SI,SLG_SL,SLG_ST,SLG_SV
0,Martin Maldonado,455117,0.0,1.0,0.090909,,,0.272727,0.2,,...,,0.5,0.0,,,,0.285714,0.275862,0.5,
1,Andrew McCutchen,457705,0.289474,,0.236842,1.0,,0.384615,0.184713,,...,,0.692308,0.5,,,,0.311927,0.371795,0.170732,
2,Tommy Pham,502054,0.310345,0.0,0.142857,,0.0,0.272727,0.239316,,...,,0.727273,0.0,,,,0.354167,0.416667,0.125,0.5
3,Paul Goldschmidt,502671,0.293103,,0.285714,,1.0,0.28,0.308725,,...,,0.3125,0.0,,,,0.444444,0.426667,0.225806,0.333333
4,Travis D'Arnaud,518595,0.1875,,0.0,,,0.058824,0.191781,,...,,0.0,0.0,,,,0.361111,0.485714,0.647059,0.0
5,DJ LeMahieu,518934,0.153846,,0.142857,0.0,,0.272727,0.361111,,...,,0.5,0.0,,,,0.095238,0.466667,0.428571,0.0
6,Wilmer Flores,527038,0.195122,,0.190476,,,0.125,0.266234,,...,,0.0,0.375,,,,0.451613,0.333333,0.387097,0.0
7,Marcell Ozuna,542303,0.137255,,0.137931,,,0.26087,0.247934,0.0,...,0.0,0.238095,0.142857,,,,0.481481,0.37037,0.409091,0.0
8,Jon Berti,542932,0.0,,0.2,0.5,1.0,0.2,0.25,,...,,0.0,0.0,,,,0.142857,0.125,0.555556,
9,Marcus Semien,543760,0.159091,,0.235294,,,0.255814,0.228916,,...,,0.0,0.777778,,,,0.408602,0.329412,0.416667,1.0


In [227]:
# Yamamoto pitch types
keep_pitch_types = ['FF', 'FS', 'CU', 'FC', 'SI', 'SL']

# metrics you computed
metrics = ['AVG', 'OBP', 'SLG']

# build list of desired column names
keep_cols = ['batter_name', 'batter_id']  # always keep identifiers
for m in metrics:
    for pt in keep_pitch_types:
        col = f"{m}_{pt}"
        if col in final_df.columns:
            keep_cols.append(col)

# filter the dataframe
final_df_filtered = final_df[keep_cols]

final_df_filtered #Keep stats with only the pitch types Yamamoto throws

Unnamed: 0,batter_name,batter_id,AVG_FF,AVG_FS,AVG_CU,AVG_FC,AVG_SI,AVG_SL,OBP_FF,OBP_FS,OBP_CU,OBP_FC,OBP_SI,OBP_SL,SLG_FF,SLG_FS,SLG_CU,SLG_FC,SLG_SI,SLG_SL
0,Martin Maldonado,455117,0.2,0.25,0.090909,0.272727,0.228571,0.172414,0.238095,0.25,0.090909,0.466667,0.25,0.2,0.333333,0.5,0.181818,0.545455,0.285714,0.275862
1,Andrew McCutchen,457705,0.184713,0.307692,0.236842,0.384615,0.247706,0.230769,0.267045,0.357143,0.25641,0.5,0.344,0.354839,0.350318,0.692308,0.315789,0.461538,0.311927,0.371795
2,Tommy Pham,502054,0.239316,0.272727,0.142857,0.272727,0.270833,0.233333,0.387755,0.333333,0.137931,0.305556,0.345794,0.328571,0.410256,0.727273,0.142857,0.484848,0.354167,0.416667
3,Paul Goldschmidt,502671,0.308725,0.25,0.285714,0.28,0.252525,0.266667,0.371951,0.314286,0.347826,0.357143,0.292453,0.303797,0.489933,0.3125,0.52381,0.36,0.444444,0.426667
4,Travis D'Arnaud,518595,0.191781,0.0,0.0,0.058824,0.25,0.257143,0.259259,0.0,0.181818,0.157895,0.289474,0.333333,0.328767,0.0,0.0,0.058824,0.361111,0.485714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,Chase Meidroth,805367,0.304636,0.333333,0.315789,0.263158,0.242718,0.207317,0.40678,0.428571,0.315789,0.317073,0.327586,0.266667,0.384106,0.333333,0.578947,0.289474,0.320388,0.231707
205,Jacob Wilson,805779,0.315315,0.384615,0.166667,0.314286,0.289308,0.347826,0.387097,0.357143,0.230769,0.368421,0.323353,0.368421,0.486486,0.461538,0.25,0.371429,0.371069,0.641304
206,Bryce Eldridge,805811,0.083333,0.0,0.333333,0.0,1.0,0.0,0.266667,0.0,0.5,0.0,1.0,0.166667,0.166667,0.0,0.333333,0.0,2.0,0.0
207,Matt Shaw,807713,0.201835,0.428571,0.190476,0.304348,0.197368,0.2625,0.276423,0.428571,0.25,0.392857,0.314607,0.313953,0.376147,1.714286,0.238095,0.521739,0.25,0.4


In [246]:
final_df_filtered.to_csv("batter_stats.csv", index=False)


In [250]:
print(final_df_filtered.columns.tolist())

['batter_name', 'batter_id', 'AVG_FF', 'AVG_FS', 'AVG_CU', 'AVG_FC', 'AVG_SI', 'AVG_SL', 'OBP_FF', 'OBP_FS', 'OBP_CU', 'OBP_FC', 'OBP_SI', 'OBP_SL', 'SLG_FF', 'SLG_FS', 'SLG_CU', 'SLG_FC', 'SLG_SI', 'SLG_SL']


In [252]:
import pandas as pd
print(pd.read_csv("yamamoto_v3_pitches_2025.csv").columns)


Index(['game_pk', 'at_bat_number', 'pitch_in_pa', 'batter', 'home_team',
       'away_team', 'pitch_type', 'pitch_type_idx', 'stand', 'batter_hand_idx',
       'prev_pitch_type', 'prev_pitch_idx', 'prev_prev_pitch_type',
       'prev_pitch_result', 'prev_pitch_result_idx', 'balls', 'strikes',
       'outs_when_up', 'inning', 'is_top_inning', 'on_1b_flag', 'on_2b_flag',
       'on_3b_flag', 'score_diff_pov', 'pitcher_ahead_flag',
       'hitter_ahead_flag', 'putaway_count_flag', 'platoon_adv',
       'fastballs_in_pa', 'last_two_fastballs_flag', 'risp_flag',
       'high_leverage_flag', 'prev_release_speed', 'prev_pfx_x', 'prev_pfx_z',
       'prev_speed_minus_ff_mean'],
      dtype='object')
