# Importing Dataset

In [1]:
# Locate the input CSV relative to the directory the notebook is run from
from pathlib import Path

# Path.cwd() is the process working directory, so the notebook must be
# launched from the project root for the "data" folder to resolve.
BASE_DIR = Path.cwd()
data_path = BASE_DIR.joinpath(
    "data",
    "statcast_pitch_swing_data_20240402_20241030_with_arm_angle.csv",
)

In [2]:
# Read the CSV at `data_path` into a DataFrame and preview its first rows
import pandas as pd

data = pd.read_csv(filepath_or_buffer=data_path)

# head() defaults to five rows; the count is spelled out for clarity
print(data.head(n=5))

  pitch_type   game_date  release_speed  release_pos_x  release_pos_z  \
0         FF  2024-04-02           95.0          -2.01           5.22   
1         CH  2024-04-02           88.5          -2.09           4.95   
2         SI  2024-04-02           95.0          -2.02           5.12   
3         KC  2024-04-02           80.8          -3.13           5.61   
4         FF  2024-04-02           95.4          -1.95           5.12   

       player_name  batter  pitcher     events    description  ...  \
0  Rocchio, Brayan  677587   622491     single  hit_into_play  ...   
1  Rocchio, Brayan  677587   622491        NaN           foul  ...   
2  Rocchio, Brayan  677587   622491        NaN  called_strike  ...   
3   Rizzo, Anthony  519203   668678  field_out  hit_into_play  ...   
4   Hedges, Austin  595978   622491  strikeout       foul_tip  ...   

   n_thruorder_pitcher  n_priorpa_thisgame_player_at_bat  \
0                    3                                 2   
1                   

# EDA for Switch Hitters
Looking for significant differences in a switch hitter's swing depending on the stance (left or right) they bat from.

In [3]:
# Count how many distinct `stand` values each player name appears with;
# a count of 2 marks a name seen batting from two different stances.
# NOTE(review): players are keyed on `player_name` here, so two different
# batters sharing a name would be pooled together — confirm that names are
# unique, or consider keying on the `batter` id column instead.
stand_unique_count = (
    data.groupby('player_name')['stand']
    .nunique()
    .rename('unique_stand_count')
    .reset_index()
)

has_two_stances = stand_unique_count['unique_stand_count'].eq(2)
print(stand_unique_count.loc[has_two_stances])

          player_name  unique_stand_count
9     Adrianza, Ehire                   2
11      Albies, Ozzie                   2
23      Amador, Adael                   2
39    Bailey, Patrick                   2
42    Baldwin, Brooks                   2
..                ...                 ...
575     Toro, Abraham                   2
585      Tucker, Cole                   2
594  Vargas, Ildemaro                   2
617     Walls, Taylor                   2
621      Waters, Drew                   2

[66 rows x 2 columns]


In [4]:
# Names that appear with exactly two distinct `stand` values
stance_counts = data.groupby('player_name')['stand'].nunique()
players_with_both_stands = stance_counts.loc[stance_counts.eq(2)].index

# Per-player, per-stance mean of each swing / batted-ball metric,
# restricted to the rows belonging to the names selected above
swing_metrics = ['swing_length', 'bat_speed', 'launch_angle', 'launch_speed']
stand_comparison = (
    data.loc[data['player_name'].isin(players_with_both_stands)]
    .groupby(['player_name', 'stand'])[swing_metrics]
    .mean()
    .reset_index()
)

# Show the comparison table
print(stand_comparison)

          player_name stand  swing_length  bat_speed  launch_angle  \
0     Adrianza, Ehire     L      6.953125  62.709375      6.647059   
1     Adrianza, Ehire     R      6.468750  61.843750     21.000000   
2       Albies, Ozzie     L      7.329500  67.403167     21.044681   
3       Albies, Ozzie     R      6.841923  67.388846     17.189815   
4       Amador, Adael     L      7.220000  68.676364     19.093023   
..                ...   ...           ...        ...           ...   
127  Vargas, Ildemaro     R      7.107527  68.750000     11.325153   
128     Walls, Taylor     L      6.983862  64.958501     22.122530   
129     Walls, Taylor     R      7.254206  64.771028     17.438356   
130      Waters, Drew     L      6.767500  72.795000     11.640000   
131      Waters, Drew     R      6.600000  64.475000     10.000000   

     launch_speed  
0       79.747059  
1       85.972727  
2       82.058422  
3       84.451389  
4       82.825581  
..            ...  
127     85.006748  

In [6]:
# Split every switch hitter into two pseudo-players, "<name>-L" and
# "<name>-R", by appending the stance of each row to `player_name`.
#
# This rewrites `data['player_name']` in place. Once the suffixes are applied
# each suffixed name maps to a single stance, so re-running this cell finds
# no two-stance names and changes nothing further; however, earlier cells
# that group by `player_name` will see the suffixed names if re-run
# afterwards.

# Filtering Switch Hitters: names that appear with exactly two `stand` values
stands_per_player = data.groupby('player_name')['stand'].nunique()
players_with_both_stands = stands_per_player[stands_per_player == 2].index

# Build the switch-hitter row mask once, before any name is rewritten.
# Rows for one side are untouched while the other side is being renamed,
# so a single up-front mask selects the same rows as recomputing it per side.
is_switch_hitter = data['player_name'].isin(players_with_both_stands)

# Changing names of Switch Hitters: tag each row with the side batted from
for side in ('L', 'R'):
    side_rows = is_switch_hitter & (data['stand'] == side)
    data.loc[side_rows, 'player_name'] = data.loc[side_rows, 'player_name'] + f'-{side}'

# print results: one line per (renamed player, stance) pair.
# na=False keeps the mask strictly boolean even if a name is missing.
has_stance_suffix = (
    data['player_name'].str.endswith('-L', na=False)
    | data['player_name'].str.endswith('-R', na=False)
)
changed_data = data[has_stance_suffix]
print(changed_data[['player_name', 'stand']].drop_duplicates())


                player_name stand
0         Rocchio, Brayan-L     L
20           Hicks, Aaron-R     R
134           Heim, Jonah-L     L
172     Profar, Jurickson-L     L
244         Albies, Ozzie-R     R
...                     ...   ...
574882     Clement, Ernie-L     L
603752     Collins, Isaac-R     R
616864     Collins, Isaac-L     L
633847   Campero, Gustavo-L     L
633999   Campero, Gustavo-R     R

[132 rows x 2 columns]


In [7]:
# Preview the first ten rows of `data` as it currently stands
preview_rows = 10
print(data.head(n=preview_rows))

  pitch_type   game_date  release_speed  release_pos_x  release_pos_z  \
0         FF  2024-04-02           95.0          -2.01           5.22   
1         CH  2024-04-02           88.5          -2.09           4.95   
2         SI  2024-04-02           95.0          -2.02           5.12   
3         KC  2024-04-02           80.8          -3.13           5.61   
4         FF  2024-04-02           95.4          -1.95           5.12   
5         SI  2024-04-02           90.7          -1.26           5.13   
6         SI  2024-04-02           91.8           1.75           4.89   
7         SI  2024-04-02           94.0          -1.90           4.99   
8         SI  2024-04-02           89.9          -1.14           5.01   
9         FF  2024-04-02           93.4          -3.17           5.63   

         player_name  batter  pitcher     events    description  ...  \
0  Rocchio, Brayan-L  677587   622491     single  hit_into_play  ...   
1  Rocchio, Brayan-L  677587   622491        NaN    

# Saving Dataset

In [8]:
# Write the current `data` frame to the "data" folder under BASE_DIR,
# omitting the DataFrame index from the CSV
output_path = BASE_DIR.joinpath("data", "statcast_switch_split.csv")

data.to_csv(path_or_buf=output_path, index=False)