# Importing Dataset

In [3]:
# Specify relative paths
from pathlib import Path

BASE_DIR = Path.cwd()
data_path = BASE_DIR / "data" / "statcast_switch_split.csv"

In [4]:
# Import dataset
import pandas as pd

data = pd.read_csv(data_path)
print(data.head(10))

  pitch_type   game_date  release_speed  release_pos_x  release_pos_z  \
0         FF  2024-04-02           95.0          -2.01           5.22   
1         CH  2024-04-02           88.5          -2.09           4.95   
2         SI  2024-04-02           95.0          -2.02           5.12   
3         KC  2024-04-02           80.8          -3.13           5.61   
4         FF  2024-04-02           95.4          -1.95           5.12   
5         SI  2024-04-02           90.7          -1.26           5.13   
6         SI  2024-04-02           91.8           1.75           4.89   
7         SI  2024-04-02           94.0          -1.90           4.99   
8         SI  2024-04-02           89.9          -1.14           5.01   
9         FF  2024-04-02           93.4          -3.17           5.63   

         player_name  batter  pitcher     events    description  ...  \
0  Rocchio, Brayan-L  677587   622491     single  hit_into_play  ...   
1  Rocchio, Brayan-L  677587   622491        NaN    

# One-Way ANOVA
: Searching for significantly different variables by player name

In [5]:
# Drop missing values
new_data = data.dropna(subset=['swing_length', 'bat_speed'])

In [6]:
# Execute One-Way ANOVA
from scipy.stats import f_oneway

numeric_columns = ['balls', 'strikes', 'outs_when_up','hc_x', 'hc_y', 'hit_distance_sc', 
                   'launch_speed', 'launch_angle', 'estimated_ba_using_speedangle', 
                   'estimated_woba_using_speedangle', 'woba_value', 'woba_denom', 
                   'babip_value', 'iso_value', 'launch_speed_angle', 'bat_speed', 
                   'swing_length', 'estimated_slg_using_speedangle', 'hyper_speed', 
                   'age_bat_legacy', 'age_bat']

significant_vars = []

for col in numeric_columns:
    groups = [group[col].dropna().values for _, group in new_data.groupby('player_name')]
    
    if len(groups) > 1:
        f_stat, p_value = f_oneway(*groups)
        if p_value < 0.05:
            significant_vars.append((col, f_stat, p_value))


significant_vars_df = pd.DataFrame(significant_vars, columns=['Variable', 'F_stat', 'p_value'])
significant_vars_df['p_value'] = significant_vars_df['p_value'].apply(lambda x: round(x, 4))
print("Variables showing significant differences by Player Name:")
print(significant_vars_df)


  f_stat, p_value = f_oneway(*groups)
  f_stat, p_value = f_oneway(*groups)
  f_stat, p_value = f_oneway(*groups)
  f_stat, p_value = f_oneway(*groups)
  f_stat, p_value = f_oneway(*groups)
  f_stat, p_value = f_oneway(*groups)
  f_stat, p_value = f_oneway(*groups)
  f_stat, p_value = f_oneway(*groups)
  f_stat, p_value = f_oneway(*groups)
  f_stat, p_value = f_oneway(*groups)


Variables showing significant differences by Player Name:
           Variable     F_stat  p_value
0             balls   6.662329      0.0
1           strikes   4.282164      0.0
2      outs_when_up   4.678524      0.0
3   hit_distance_sc   3.213993      0.0
4      launch_speed   4.914000      0.0
5      launch_angle   4.220165      0.0
6         bat_speed  54.627861      0.0
7      swing_length  93.253266      0.0
8       hyper_speed   7.940085      0.0
9    age_bat_legacy        inf      0.0
10          age_bat        inf      0.0


  f_stat, p_value = f_oneway(*groups)
  f_stat, p_value = f_oneway(*groups)
