In [1]:
from pathlib import Path

# Anchor every data file to the directory the notebook process is started in.
BASE_DIR = Path.cwd()
# Location of the Statcast CSV that the next cell loads.
data_path = BASE_DIR.joinpath("data", "statcast_switch_split.csv")

In [2]:
import pandas as pd

# Load the Statcast CSV. index_col=0 promotes the file's first column to the
# row index (the printed frame shows that column is `pitch_type`).
data = pd.read_csv(filepath_or_buffer=data_path, index_col=0)
print(data)

             game_date  release_speed  release_pos_x  release_pos_z  \
pitch_type                                                            
FF          2024-04-02           95.0          -2.01           5.22   
CH          2024-04-02           88.5          -2.09           4.95   
SI          2024-04-02           95.0          -2.02           5.12   
KC          2024-04-02           80.8          -3.13           5.61   
FF          2024-04-02           95.4          -1.95           5.12   
...                ...            ...            ...            ...   
FF          2024-10-30           95.6          -2.76           6.15   
SI          2024-10-30           96.0          -0.72           6.67   
FF          2024-10-30           95.8          -1.55           5.85   
SL          2024-10-30           85.9           2.78           5.53   
CH          2024-10-30           86.7          -2.34           5.70   

                   player_name  batter  pitcher     events      description 

In [3]:
# Missing-value count for each swing / batted-ball column of the full dataset,
# printed one per line in the order listed.
for column in ('bat_speed', 'swing_length', 'launch_angle',
               'launch_speed', 'hyper_speed', 'hit_distance_sc'):
    print(data[column].isnull().sum())

379001
379001
466218
466527
464967
465004


In [4]:
# Dataset-wide average of each swing metric, printed one per line.
for column in ('bat_speed', 'swing_length'):
    print(data[column].mean())

69.4810894852367
7.216296085020898


In [5]:
# List the distinct values of `events` and `description`, separated by a blank line.
print(data["events"].unique(), '', data["description"].unique(), sep='\n')

['single' nan 'field_out' 'strikeout' 'walk' 'home_run' 'force_out'
 'field_error' 'double' 'grounded_into_double_play' 'hit_by_pitch'
 'catcher_interf' 'triple' 'sac_fly' 'double_play' 'sac_bunt'
 'fielders_choice' 'truncated_pa' 'fielders_choice_out'
 'strikeout_double_play' 'sac_fly_double_play' 'triple_play']

['hit_into_play' 'foul' 'called_strike' 'foul_tip' 'ball'
 'swinging_strike' 'blocked_ball' 'swinging_strike_blocked' 'hit_by_pitch'
 'foul_bunt' 'pitchout' 'missed_bunt' 'bunt_foul_tip']


In [6]:
# Keep only the rows whose `description` is exactly 'hit_into_play'.
is_hit_into_play = data["description"].eq("hit_into_play")
data_hip = data[is_hit_into_play]
print(data_hip)

             game_date  release_speed  release_pos_x  release_pos_z  \
pitch_type                                                            
FF          2024-04-02           95.0          -2.01           5.22   
KC          2024-04-02           80.8          -3.13           5.61   
FC          2024-04-02           91.4           2.30           6.31   
SI          2024-04-02           96.3           2.23           5.96   
FF          2024-04-02           96.1           2.15           6.23   
...                ...            ...            ...            ...   
SL          2024-10-30           86.0          -2.33           5.43   
FF          2024-10-30           94.0           0.06           5.99   
CH          2024-10-30           88.3          -2.49           5.53   
FF          2024-10-30           95.5          -1.78           5.85   
FF          2024-10-30           95.6          -2.76           6.15   

                   player_name  batter  pitcher     events    description  \

In [7]:
# Missing-value count for each swing / batted-ball column after restricting
# to hit-into-play rows, printed one per line in the order listed.
for column in ('bat_speed', 'swing_length', 'launch_angle',
               'launch_speed', 'hyper_speed', 'hit_distance_sc'):
    print(data_hip[column].isnull().sum())

7013
7013
310
374
374
406


In [8]:
# Remove sacrifice bunts from the hit-into-play rows. Only the 'sac_bunt' event
# is excluded; every other event value (including a missing one) is kept.
not_sac_bunt = data_hip['events'].ne('sac_bunt')
data_hip_nonbunt = data_hip[not_sac_bunt]
print(data_hip_nonbunt)

             game_date  release_speed  release_pos_x  release_pos_z  \
pitch_type                                                            
FF          2024-04-02           95.0          -2.01           5.22   
KC          2024-04-02           80.8          -3.13           5.61   
FC          2024-04-02           91.4           2.30           6.31   
SI          2024-04-02           96.3           2.23           5.96   
FF          2024-04-02           96.1           2.15           6.23   
...                ...            ...            ...            ...   
SL          2024-10-30           86.0          -2.33           5.43   
FF          2024-10-30           94.0           0.06           5.99   
CH          2024-10-30           88.3          -2.49           5.53   
FF          2024-10-30           95.5          -1.78           5.85   
FF          2024-10-30           95.6          -2.76           6.15   

                   player_name  batter  pitcher     events    description  \

In [9]:
# Missing-value count for each swing / batted-ball column once 'sac_bunt'
# rows are removed, printed one per line in the order listed.
for column in ('bat_speed', 'swing_length', 'launch_angle',
               'launch_speed', 'hyper_speed', 'hit_distance_sc'):
    print(data_hip_nonbunt[column].isnull().sum())

6768
6768
303
367
367
399


In [10]:
# Keep only rows where both swing measurements are present.
swing_cols = ['bat_speed', 'swing_length']
data_hip_nonbunt_fin = data_hip_nonbunt.dropna(subset=swing_cols)

# Re-count missing values: the two swing columns must now report 0, while the
# batted-ball columns were not part of the filter and may still contain gaps.
batted_ball_cols = ['launch_angle', 'launch_speed', 'hyper_speed', 'hit_distance_sc']
for column in swing_cols + batted_ball_cols:
    print(data_hip_nonbunt_fin[column].isnull().sum())

0
0
235
298
298
267


In [11]:
# Per-player mean and standard deviation of bat speed and swing length.
# Named aggregation yields the flat column names directly, and as_index=False
# keeps `player_name` as an ordinary first column.
summary_stats = (
    data_hip_nonbunt_fin
    .groupby('player_name', as_index=False)
    .agg(
        mean_batspeed=('bat_speed', 'mean'),
        std_batspeed=('bat_speed', 'std'),
        mean_swinglength=('swing_length', 'mean'),
        std_swinglength=('swing_length', 'std'),
    )
)

# Show the resulting one-row-per-player table.
print(summary_stats)

           player_name  mean_batspeed  std_batspeed  mean_swinglength  \
0           Abrams, CJ      71.869018      5.844900          7.404786   
1          Abreu, José      71.720968      7.114270          7.130645   
2        Abreu, Wilyer      74.762868      4.130587          7.377574   
3    Acuña Jr., Ronald      76.213793      6.432308          7.625000   
4     Acuña, Luisangel      72.229412     12.005669          7.232353   
..                 ...            ...           ...               ...   
707        Yorke, Nick      72.445455      5.383903          7.318182   
708  Yoshida, Masataka      70.972876      4.644194          7.176471   
709       Young, Jacob      65.606087     13.443015          6.655652   
710       Zavala, Seby      69.047368      4.368749          7.189474   
711   d'Arnaud, Travis      71.045405      5.623475          7.747568   

     std_swinglength  
0           0.603652  
1           0.802984  
2           0.613842  
3           0.601719  
4       

In [12]:
# Distinct (batter, stand, player_name) combinations observed in the filtered data.
player_info = data_hip_nonbunt_fin[['batter', 'stand', 'player_name']].drop_duplicates()

# Attach the batter id and batting side to each player's summary row.
# drop_duplicates() above compares all three columns, so a name that occurs with
# more than one batter id or more than one stand still has several rows and would
# silently duplicate that player's summary row in a plain left merge.
# validate='one_to_one' makes pandas raise MergeError in that situation instead.
summary_stats_final = summary_stats.merge(
    player_info,
    on='player_name',
    how='left',
    validate='one_to_one',
)

# Display the merged dataset with additional player information.
print(summary_stats_final)

           player_name  mean_batspeed  std_batspeed  mean_swinglength  \
0           Abrams, CJ      71.869018      5.844900          7.404786   
1          Abreu, José      71.720968      7.114270          7.130645   
2        Abreu, Wilyer      74.762868      4.130587          7.377574   
3    Acuña Jr., Ronald      76.213793      6.432308          7.625000   
4     Acuña, Luisangel      72.229412     12.005669          7.232353   
..                 ...            ...           ...               ...   
707        Yorke, Nick      72.445455      5.383903          7.318182   
708  Yoshida, Masataka      70.972876      4.644194          7.176471   
709       Young, Jacob      65.606087     13.443015          6.655652   
710       Zavala, Seby      69.047368      4.368749          7.189474   
711   d'Arnaud, Travis      71.045405      5.623475          7.747568   

     std_swinglength  batter stand  
0           0.603652  682928     L  
1           0.802984  547989     R  
2           

In [14]:
# Read attack angle data from a CSV file.
# A dedicated path variable is used on purpose: rebinding `data_path` here would
# make a later re-run of the Statcast loading cell read this file instead.
attack_angle_path = BASE_DIR / "data" / "attack_angle.csv"

at_angle = pd.read_csv(attack_angle_path)
print(at_angle)

           player_name  attack_angle
0           Abrams, CJ     15.378242
1          Abreu, José      5.253244
2        Abreu, Wilyer     17.837165
3    Acuña Jr., Ronald     13.396897
4     Acuña, Luisangel      4.588206
..                 ...           ...
616        Yorke, Nick     13.279616
617  Yoshida, Masataka     11.139459
618       Young, Jacob      2.507728
619       Zavala, Seby     29.371732
620   d'Arnaud, Travis     19.999097

[621 rows x 2 columns]


In [15]:
# Left-join the attack angle table onto the per-player summary by name, so
# players without an attack angle entry are kept with a missing value.
merged_df = summary_stats_final.merge(at_angle, how='left', on='player_name')
print(merged_df)

           player_name  mean_batspeed  std_batspeed  mean_swinglength  \
0           Abrams, CJ      71.869018      5.844900          7.404786   
1          Abreu, José      71.720968      7.114270          7.130645   
2        Abreu, Wilyer      74.762868      4.130587          7.377574   
3    Acuña Jr., Ronald      76.213793      6.432308          7.625000   
4     Acuña, Luisangel      72.229412     12.005669          7.232353   
..                 ...            ...           ...               ...   
707        Yorke, Nick      72.445455      5.383903          7.318182   
708  Yoshida, Masataka      70.972876      4.644194          7.176471   
709       Young, Jacob      65.606087     13.443015          6.655652   
710       Zavala, Seby      69.047368      4.368749          7.189474   
711   d'Arnaud, Travis      71.045405      5.623475          7.747568   

     std_swinglength  batter stand  attack_angle  
0           0.603652  682928     L     15.378242  
1           0.802984 

In [16]:
# How many players have no `attack_angle` value after the merge.
attack_angle_is_missing = merged_df['attack_angle'].isna()
missing_attack_angle = attack_angle_is_missing.sum()
print(missing_attack_angle)

91


In [17]:
# Keep only the players that have an `attack_angle` value.
has_attack_angle = merged_df['attack_angle'].notna()
cleaned_df = merged_df[has_attack_angle]
print(cleaned_df)

           player_name  mean_batspeed  std_batspeed  mean_swinglength  \
0           Abrams, CJ      71.869018      5.844900          7.404786   
1          Abreu, José      71.720968      7.114270          7.130645   
2        Abreu, Wilyer      74.762868      4.130587          7.377574   
3    Acuña Jr., Ronald      76.213793      6.432308          7.625000   
4     Acuña, Luisangel      72.229412     12.005669          7.232353   
..                 ...            ...           ...               ...   
707        Yorke, Nick      72.445455      5.383903          7.318182   
708  Yoshida, Masataka      70.972876      4.644194          7.176471   
709       Young, Jacob      65.606087     13.443015          6.655652   
710       Zavala, Seby      69.047368      4.368749          7.189474   
711   d'Arnaud, Travis      71.045405      5.623475          7.747568   

     std_swinglength  batter stand  attack_angle  
0           0.603652  682928     L     15.378242  
1           0.802984 

In [18]:
# Write the cleaned per-player table to data/batter_stats_sum.csv,
# leaving the row index out of the file.
output_path = BASE_DIR.joinpath("data", "batter_stats_sum.csv")
cleaned_df.to_csv(path_or_buf=output_path, index=False)