In [1]:
# Import statcast+batting cluster datasets
from pathlib import Path

# Anchor every path at the directory the notebook kernel is running in.
BASE_DIR = Path.cwd()
data_path = BASE_DIR.joinpath("data", "statcast+cluster.csv")

In [2]:
import pandas as pd

# Read the CSV located by `data_path` into a DataFrame; print() emits
# pandas' truncated plain-text preview of it.
data = pd.read_csv(filepath_or_buffer=data_path)
print(data)

       pitch_type   game_date  release_speed  release_pos_x  release_pos_z  \
0              KC  2024-04-02           80.8          -3.13           5.61   
1              FF  2024-04-02           95.4          -1.95           5.12   
2              SI  2024-04-02           90.7          -1.26           5.13   
3              SI  2024-04-02           91.8           1.75           4.89   
4              SI  2024-04-02           94.0          -1.90           4.99   
...           ...         ...            ...            ...            ...   
610349         FF  2024-10-30           95.6          -2.76           6.15   
610350         SI  2024-10-30           96.0          -0.72           6.67   
610351         FF  2024-10-30           95.8          -1.55           5.85   
610352         SL  2024-10-30           85.9           2.78           5.53   
610353         CH  2024-10-30           86.7          -2.34           5.70   

               player_name  batter  pitcher     events      des

In [None]:
# EDA over 'description', 'events' and 'bb_type' starts here:
# list the distinct values found in 'description'.
print(data.loc[:, 'description'].unique())

['hit_into_play' 'foul_tip' 'ball' 'called_strike' 'foul'
 'swinging_strike' 'blocked_ball' 'swinging_strike_blocked' 'hit_by_pitch'
 'foul_bunt' 'pitchout' 'missed_bunt' 'bunt_foul_tip']


In [4]:
# Distinct values of 'events' (a missing value shows up as nan).
print(data.loc[:, 'events'].unique())

['field_out' 'strikeout' 'walk' nan 'single' 'home_run' 'force_out'
 'field_error' 'double' 'grounded_into_double_play' 'hit_by_pitch'
 'catcher_interf' 'triple' 'sac_fly' 'double_play' 'sac_bunt'
 'fielders_choice' 'truncated_pa' 'fielders_choice_out'
 'strikeout_double_play' 'sac_fly_double_play' 'triple_play']


In [5]:
# Distinct values of 'bb_type' (a missing value shows up as nan).
print(data.loc[:, 'bb_type'].unique())

['ground_ball' nan 'fly_ball' 'line_drive' 'popup']


In [6]:
# Number of rows whose 'events' value is missing.
data['events'].isnull().sum()

453646

In [7]:
# Frequency of every non-missing 'events' label, most frequent first.
print(data.loc[:, 'events'].value_counts())

events
field_out                    63156
strikeout                    35662
single                       22249
walk                         12305
double                        6736
home_run                      4676
force_out                     3019
grounded_into_double_play     2784
hit_by_pitch                  1759
sac_fly                       1066
field_error                    945
triple                         594
sac_bunt                       385
fielders_choice                344
double_play                    293
truncated_pa                   282
fielders_choice_out            262
strikeout_double_play           96
catcher_interf                  83
sac_fly_double_play             10
triple_play                      2
Name: count, dtype: int64


In [None]:
# Relabel each 'field_out' by its batted-ball type, turning the generic out
# into a ground out, line out, fly out, or pop out. Rows whose 'bb_type'
# matches none of the four keys keep the 'field_out' label.
field_out_rows = data['events'] == 'field_out'

for bb_type, out_label in {
    'ground_ball': 'ground_out',
    'line_drive': 'line_out',
    'fly_ball': 'fly_out',
    'popup': 'pop_out',
}.items():
    data.loc[field_out_rows & data['bb_type'].eq(bb_type), 'events'] = out_label

print(data['events'].value_counts())

events
strikeout                    35662
ground_out                   26675
single                       22249
fly_out                      20179
walk                         12305
line_out                      8944
pop_out                       7357
double                        6736
home_run                      4676
force_out                     3019
grounded_into_double_play     2784
hit_by_pitch                  1759
sac_fly                       1066
field_error                    945
triple                         594
sac_bunt                       385
fielders_choice                344
double_play                    293
truncated_pa                   282
fielders_choice_out            262
strikeout_double_play           96
catcher_interf                  83
sac_fly_double_play             10
triple_play                      2
field_out                        1
Name: count, dtype: int64


In [9]:
# Edge case: show the play description ('des') of every row that is still
# labelled 'field_out'.
print(data.loc[data['events'] == 'field_out', 'des'])

215950    James McCann out to catcher Danny Jansen.  Jam...
Name: des, dtype: object


In [10]:
# Any row still labelled 'field_out' is relabelled as a pop out.
leftover_field_out = data['events'].eq('field_out')
data.loc[leftover_field_out, 'events'] = 'pop_out'
print(data['events'].value_counts())

events
strikeout                    35662
ground_out                   26675
single                       22249
fly_out                      20179
walk                         12305
line_out                      8944
pop_out                       7358
double                        6736
home_run                      4676
force_out                     3019
grounded_into_double_play     2784
hit_by_pitch                  1759
sac_fly                       1066
field_error                    945
triple                         594
sac_bunt                       385
fielders_choice                344
double_play                    293
truncated_pa                   282
fielders_choice_out            262
strikeout_double_play           96
catcher_interf                  83
sac_fly_double_play             10
triple_play                      2
Name: count, dtype: int64


In [None]:
# Relabel each 'force_out' by its batted-ball type (ground out, line out,
# fly out, or pop out). Rows whose 'bb_type' matches none of the four keys
# keep the 'force_out' label.
force_out_rows = data['events'] == 'force_out'

bb_types = ('ground_ball', 'line_drive', 'fly_ball', 'popup')
out_labels = ('ground_out', 'line_out', 'fly_out', 'pop_out')
for bb_type, out_label in zip(bb_types, out_labels):
    data.loc[force_out_rows & data['bb_type'].eq(bb_type), 'events'] = out_label

print(data['events'].value_counts())

events
strikeout                    35662
ground_out                   29650
single                       22249
fly_out                      20190
walk                         12305
line_out                      8965
pop_out                       7370
double                        6736
home_run                      4676
grounded_into_double_play     2784
hit_by_pitch                  1759
sac_fly                       1066
field_error                    945
triple                         594
sac_bunt                       385
fielders_choice                344
double_play                    293
truncated_pa                   282
fielders_choice_out            262
strikeout_double_play           96
catcher_interf                  83
sac_fly_double_play             10
triple_play                      2
Name: count, dtype: int64


In [None]:
# Relabel each 'fielders_choice' by its batted-ball type (ground out,
# line out, fly out, or pop out). Rows whose 'bb_type' matches none of the
# four keys keep the 'fielders_choice' label.
fc_rows = data['events'] == 'fielders_choice'

for bb_type, out_label in [
    ('ground_ball', 'ground_out'),
    ('line_drive', 'line_out'),
    ('fly_ball', 'fly_out'),
    ('popup', 'pop_out'),
]:
    is_bb_type = data['bb_type'] == bb_type
    data.loc[fc_rows & is_bb_type, 'events'] = out_label

print(data['events'].value_counts())

events
strikeout                    35662
ground_out                   29994
single                       22249
fly_out                      20190
walk                         12305
line_out                      8965
pop_out                       7370
double                        6736
home_run                      4676
grounded_into_double_play     2784
hit_by_pitch                  1759
sac_fly                       1066
field_error                    945
triple                         594
sac_bunt                       385
double_play                    293
truncated_pa                   282
fielders_choice_out            262
strikeout_double_play           96
catcher_interf                  83
sac_fly_double_play             10
triple_play                      2
Name: count, dtype: int64


In [None]:
# Relabel each 'fielders_choice_out' by its batted-ball type (ground out,
# line out, fly out, or pop out). Rows whose 'bb_type' matches none of the
# four keys keep the 'fielders_choice_out' label.
fco_rows = data['events'] == 'fielders_choice_out'

for bb_type, out_label in {
    'ground_ball': 'ground_out',
    'line_drive': 'line_out',
    'fly_ball': 'fly_out',
    'popup': 'pop_out',
}.items():
    data.loc[fco_rows & data['bb_type'].eq(bb_type), 'events'] = out_label

print(data['events'].value_counts())

events
strikeout                    35662
ground_out                   30255
single                       22249
fly_out                      20190
walk                         12305
line_out                      8966
pop_out                       7370
double                        6736
home_run                      4676
grounded_into_double_play     2784
hit_by_pitch                  1759
sac_fly                       1066
field_error                    945
triple                         594
sac_bunt                       385
double_play                    293
truncated_pa                   282
strikeout_double_play           96
catcher_interf                  83
sac_fly_double_play             10
triple_play                      2
Name: count, dtype: int64


In [None]:
# Remove rows whose event is 'truncated_pa'. Rows with a missing event are
# kept, because NaN never compares equal to the label.
data = data.loc[data['events'].ne('truncated_pa')]
print(data['events'].value_counts())

events
strikeout                    35662
ground_out                   30255
single                       22249
fly_out                      20190
walk                         12305
line_out                      8966
pop_out                       7370
double                        6736
home_run                      4676
grounded_into_double_play     2784
hit_by_pitch                  1759
sac_fly                       1066
field_error                    945
triple                         594
sac_bunt                       385
double_play                    293
strikeout_double_play           96
catcher_interf                  83
sac_fly_double_play             10
triple_play                      2
Name: count, dtype: int64


In [None]:
# Remove rows whose event is 'triple_play'. Rows with a missing event are
# kept, because NaN never compares equal to the label.
data = data[~data['events'].eq('triple_play')]
print(data['events'].value_counts())

events
strikeout                    35662
ground_out                   30255
single                       22249
fly_out                      20190
walk                         12305
line_out                      8966
pop_out                       7370
double                        6736
home_run                      4676
grounded_into_double_play     2784
hit_by_pitch                  1759
sac_fly                       1066
field_error                    945
triple                         594
sac_bunt                       385
double_play                    293
strikeout_double_play           96
catcher_interf                  83
sac_fly_double_play             10
Name: count, dtype: int64


In [None]:
# Collapse the raw event labels into the coarser outcome categories that the
# probability matrices below are built from. This relabels the *values* of the
# 'events' column; no column is renamed. Labels that are not keys of the
# mapping (and missing values) pass through unchanged.
EVENT_CATEGORY_MAP = {
    # Double plays of every kind
    'double_play': 'DoublePlay', 'grounded_into_double_play': 'DoublePlay',
    'sac_fly_double_play': 'DoublePlay', 'strikeout_double_play': 'DoublePlay',
    # Walks, hit-by-pitch, catcher interference
    'walk': 'Walk', 'hit_by_pitch': 'HBP/CatInt', 'catcher_interf': 'HBP/CatInt',
    # Strikeouts, errors, hits
    'strikeout': 'K', 'field_error': 'Error', 'single': 'Single',
    'double': 'Double', 'triple': 'Triple', 'home_run': 'HR',
    # Outs on balls in play; line outs and pop outs share one category
    'ground_out': 'GroundOut', 'line_out': 'LineOut/InfFly', 'pop_out': 'LineOut/InfFly',
    # 'triple_play' only matters if those rows have not been dropped beforehand
    'fly_out': 'FlyOut', 'sac_fly': 'FlyOut', 'triple_play': 'TriplePlay', 'sac_bunt': 'GroundOut',
}

# `data` is the result of boolean-mask filtering at this point, so writing
# into it with data['events'] = ... raises SettingWithCopyWarning on
# pandas < 3. assign() returns a new frame instead of mutating the slice.
data = data.assign(events=data['events'].replace(EVENT_CATEGORY_MAP))

print(data['events'].value_counts())

events
K                 35662
GroundOut         30640
Single            22249
FlyOut            21256
LineOut/InfFly    16336
Walk              12305
Double             6736
HR                 4676
DoublePlay         3183
HBP/CatInt         1842
Error               945
Triple              594
Name: count, dtype: int64


In [None]:
# Calculate event occurrence probabilities vs. RHP by cluster:
# keep rows where p_throws is 'R', then take the within-cluster share of
# each event label. unstack() yields one row per cluster and one column per
# event, with 0 where a cluster never produced that event.
is_rhp = data['p_throws'].eq('R')
data_rhp = data.loc[is_rhp]

grouped_rhp = data_rhp.groupby('batter_cluster')['events']
rhp_shares = grouped_rhp.value_counts(normalize=True)
event_counts_rhp = rhp_shares.unstack(fill_value=0)

print(event_counts_rhp)

events            Double  DoublePlay     Error    FlyOut  GroundOut  \
batter_cluster                                                        
1.0             0.041411    0.020457  0.005549  0.138397   0.183452   
2.0             0.040886    0.024868  0.006717  0.131055   0.229428   
3.0             0.042477    0.022265  0.007636  0.137440   0.199166   
4.0             0.042458    0.024676  0.006432  0.125105   0.182655   
5.0             0.045325    0.018147  0.005603  0.147516   0.195852   
6.0             0.046123    0.016133  0.004355  0.133370   0.175830   
7.0             0.040350    0.018568  0.003749  0.143903   0.178004   
8.0             0.041146    0.014934  0.005893  0.151072   0.229300   

events          HBP/CatInt        HR         K  LineOut/InfFly    Single  \
batter_cluster                                                             
1.0               0.014080  0.024598  0.227514        0.107421  0.137568   
2.0               0.013241  0.019959  0.202881        0.11400

In [None]:
# Calculate event occurrence probabilities vs. LHP by cluster:
# keep rows where p_throws is 'L', then take the within-cluster share of
# each event label. unstack() yields one row per cluster and one column per
# event, with 0 where a cluster never produced that event.
is_lhp = data['p_throws'].eq('L')
data_lhp = data.loc[is_lhp]

grouped_lhp = data_lhp.groupby('batter_cluster')['events']
lhp_shares = grouped_lhp.value_counts(normalize=True)
event_counts_lhp = lhp_shares.unstack(fill_value=0)

print(event_counts_lhp)

events            Double  DoublePlay     Error    FlyOut  GroundOut  \
batter_cluster                                                        
1.0             0.040474    0.017572  0.008292  0.148667   0.180257   
2.0             0.048309    0.023741  0.006073  0.147274   0.210766   
3.0             0.047605    0.021800  0.008602  0.147709   0.185674   
4.0             0.047553    0.020517  0.006091  0.139667   0.167130   
5.0             0.039145    0.015056  0.004818  0.123156   0.220717   
6.0             0.039054    0.015236  0.004904  0.110158   0.211559   
7.0             0.034272    0.017136  0.005508  0.111383   0.219094   
8.0             0.034164    0.020641  0.004982  0.122064   0.256228   

events          HBP/CatInt        HR         K  LineOut/InfFly    Single  \
batter_cluster                                                             
1.0               0.009279  0.025074  0.212636        0.110563  0.135242   
2.0               0.011456  0.024983  0.197930        0.10807

In [19]:
# Split the 'Single' probability into LongSingle / MediumSingle / ShortSingle
# using fixed shares (0.3 / 0.5 / 0.2) applied to both matrices.
for column, share in (('LongSingle', 0.3), ('MediumSingle', 0.5), ('ShortSingle', 0.2)):
    for matrix in (event_counts_rhp, event_counts_lhp):
        matrix[column] = matrix['Single'] * share

print(event_counts_rhp.head())
print(event_counts_lhp.head())

events            Double  DoublePlay     Error    FlyOut  GroundOut  \
batter_cluster                                                        
1.0             0.041411    0.020457  0.005549  0.138397   0.183452   
2.0             0.040886    0.024868  0.006717  0.131055   0.229428   
3.0             0.042477    0.022265  0.007636  0.137440   0.199166   
4.0             0.042458    0.024676  0.006432  0.125105   0.182655   
5.0             0.045325    0.018147  0.005603  0.147516   0.195852   

events          HBP/CatInt        HR         K  LineOut/InfFly    Single  \
batter_cluster                                                             
1.0               0.014080  0.024598  0.227514        0.107421  0.137568   
2.0               0.013241  0.019959  0.202881        0.114003  0.155406   
3.0               0.012191  0.028168  0.234520        0.111582  0.154828   
4.0               0.012948  0.036909  0.248991        0.104380  0.133807   
5.0               0.010119  0.036545  0.227630

In [20]:
# Split the 'Double' probability into ShortDouble / LongDouble using fixed
# shares (0.8 / 0.2) applied to both matrices.
for column, share in (('ShortDouble', 0.8), ('LongDouble', 0.2)):
    for matrix in (event_counts_rhp, event_counts_lhp):
        matrix[column] = matrix['Double'] * share

print(event_counts_rhp.head())
print(event_counts_lhp.head())

events            Double  DoublePlay     Error    FlyOut  GroundOut  \
batter_cluster                                                        
1.0             0.041411    0.020457  0.005549  0.138397   0.183452   
2.0             0.040886    0.024868  0.006717  0.131055   0.229428   
3.0             0.042477    0.022265  0.007636  0.137440   0.199166   
4.0             0.042458    0.024676  0.006432  0.125105   0.182655   
5.0             0.045325    0.018147  0.005603  0.147516   0.195852   

events          HBP/CatInt        HR         K  LineOut/InfFly    Single  \
batter_cluster                                                             
1.0               0.014080  0.024598  0.227514        0.107421  0.137568   
2.0               0.013241  0.019959  0.202881        0.114003  0.155406   
3.0               0.012191  0.028168  0.234520        0.111582  0.154828   
4.0               0.012948  0.036909  0.248991        0.104380  0.133807   
5.0               0.010119  0.036545  0.227630

In [21]:
# Split the 'FlyOut' probability into LongFly / MediumFly / ShortFly using
# fixed shares (0.2 / 0.5 / 0.3) applied to both matrices.
for column, share in (('LongFly', 0.2), ('MediumFly', 0.5), ('ShortFly', 0.3)):
    for matrix in (event_counts_rhp, event_counts_lhp):
        matrix[column] = matrix['FlyOut'] * share

print(event_counts_rhp.head())
print(event_counts_lhp.head())

events            Double  DoublePlay     Error    FlyOut  GroundOut  \
batter_cluster                                                        
1.0             0.041411    0.020457  0.005549  0.138397   0.183452   
2.0             0.040886    0.024868  0.006717  0.131055   0.229428   
3.0             0.042477    0.022265  0.007636  0.137440   0.199166   
4.0             0.042458    0.024676  0.006432  0.125105   0.182655   
5.0             0.045325    0.018147  0.005603  0.147516   0.195852   

events          HBP/CatInt        HR         K  LineOut/InfFly    Single  \
batter_cluster                                                             
1.0               0.014080  0.024598  0.227514        0.107421  0.137568   
2.0               0.013241  0.019959  0.202881        0.114003  0.155406   
3.0               0.012191  0.028168  0.234520        0.111582  0.154828   
4.0               0.012948  0.036909  0.248991        0.104380  0.133807   
5.0               0.010119  0.036545  0.227630

In [22]:
# The aggregate 'Single', 'Double' and 'FlyOut' columns have been split into
# finer columns, so remove the aggregates from both matrices.
split_sources = ['Single', 'Double', 'FlyOut']
event_counts_rhp = event_counts_rhp.drop(split_sources, axis=1)
event_counts_lhp = event_counts_lhp.drop(split_sources, axis=1)

print(event_counts_rhp)
print(event_counts_lhp)

events          DoublePlay     Error  GroundOut  HBP/CatInt        HR  \
batter_cluster                                                          
1.0               0.020457  0.005549   0.183452    0.014080  0.024598   
2.0               0.024868  0.006717   0.229428    0.013241  0.019959   
3.0               0.022265  0.007636   0.199166    0.012191  0.028168   
4.0               0.024676  0.006432   0.182655    0.012948  0.036909   
5.0               0.018147  0.005603   0.195852    0.010119  0.036545   
6.0               0.016133  0.004355   0.175830    0.009749  0.038205   
7.0               0.018568  0.003749   0.178004    0.010534  0.031601   
8.0               0.014934  0.005893   0.229300    0.011277  0.018084   

events                 K  LineOut/InfFly    Triple      Walk  LongSingle  \
batter_cluster                                                             
1.0             0.227514        0.107421  0.003479  0.096074    0.041270   
2.0             0.202881        0.114003 

In [23]:
# Write the batting probability matrix vs. RHP to data/batstatmat_v_rhp.csv
# (the batter_cluster index is written as the first column).
output_path = BASE_DIR.joinpath("data", "batstatmat_v_rhp.csv")

event_counts_rhp.to_csv(path_or_buf=output_path)

In [24]:
# Write the batting probability matrix vs. LHP to data/batstatmat_v_lhp.csv
# (the batter_cluster index is written as the first column).
output_path = BASE_DIR.joinpath("data", "batstatmat_v_lhp.csv")

event_counts_lhp.to_csv(path_or_buf=output_path)