In [96]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import dask.dataframe as dd

### Load data
Load the data from CSV - 2024 data only. Make sure that you have the full 2024 data CSV (`statcast_2024.csv`) in your local directory.

In [97]:
# Read the raw Statcast export; the path is relative to the notebook's working directory.
statcast_csv_path = 'statcast_2024.csv'
statcast_df_raw = pd.read_csv(statcast_csv_path)

### Filtering
Apply some basic filters: 

- Regular season games (game_type = ‘R’)
- Games that aren’t “decided”/blowouts (absolute value of home_score_diff < 6)
- Remove pitchers that only have a few appearances (< 3)
- Remove rows with NaN for pitch type


In [98]:
# Regular season games only
statcast_df = statcast_df_raw[statcast_df_raw['game_type'] == 'R']

# Score difference: keep rows where the margin is under 6 runs in either direction
statcast_df = statcast_df[statcast_df['home_score_diff'].abs() < 6]

# Identify pitchers with < 3 appearances.
# An appearance is a distinct (pitcher, game_date) pair among the rows that
# survived the two filters above.
pitcher_appearances = statcast_df.groupby(['pitcher', 'game_date']).size()
pitcher_games = pitcher_appearances.groupby('pitcher').size()
pitchers_with_few_games = pitcher_games[pitcher_games < 3].index
# Filter out those pitchers. Boolean indexing returns a new frame, so the
# result has to be assigned back for the filter to take effect.
statcast_df = statcast_df[~statcast_df['pitcher'].isin(pitchers_with_few_games)]

# Filter out NaN for pitch_type
statcast_df = statcast_df.dropna(subset=['pitch_type'])

Only keep relevant columns - 60 columns of potential interest

In [99]:
# Columns of potential interest that are kept for the rest of the notebook.
cols = ['pitch_type', 'release_speed', 'release_pos_x', 'release_pos_z', 'batter', 'pitcher', 'events', 'description', 'zone', 'des', 'game_type',
        'stand', 'p_throws', 'type', 'balls', 'strikes', 'pfx_x', 'pfx_z', 'plate_x', 'plate_z', 'on_3b', 'on_2b', 'on_1b', 'hc_x', 'hc_y',
        'sv_id', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top', 'sz_bot', 'hit_distance_sc', 'launch_speed', 'launch_angle', 'effective_speed',
        'release_spin_rate', 'release_extension', 'game_pk', 'release_pos_y', 'estimated_woba_using_speedangle', 'launch_speed_angle', 'at_bat_number',
        'pitch_number', 'pitch_name', 'home_score', 'away_score', 'spin_axis', 'delta_home_win_exp', 'delta_run_exp', 'bat_speed', 'swing_length',
        'estimated_slg_using_speedangle', 'delta_pitcher_run_exp', 'bat_win_exp', 'n_thruorder_pitcher', 'n_priorpa_thisgame_player_at_bat', 'arm_angle']

# Take an explicit copy: later cells assign columns on statcast_df, and writing
# into a column-subset of another frame triggers SettingWithCopyWarning.
statcast_df = statcast_df[cols].copy()

### Encoding and Normalizing

In [100]:
# One-hot encoded fields (pitch type, zone, and type (outcome)).
# DataFrame.join returns a new frame rather than modifying statcast_df, so each
# result is assigned back; the source columns are kept next to the indicators.
one_hot_sources = (
    ('pitch_type', 'pitch'),  # pitch type
    ('zone', 'zone'),         # zone
    ('type', 'type'),         # type
)
for source_col, prefix in one_hot_sources:
    statcast_df = statcast_df.join(pd.get_dummies(statcast_df[source_col], prefix=prefix))

# Binary encoding (batter and pitcher handedness): L -> 0, R -> 1
handedness_codes = {'L': 0, 'R': 1}
statcast_df['stand'] = statcast_df['stand'].replace(handedness_codes)
statcast_df['p_throws'] = statcast_df['p_throws'].replace(handedness_codes)

  statcast_df['stand'] = statcast_df['stand'].replace({'L': 0, 'R': 1})
  statcast_df['p_throws'] = statcast_df['p_throws'].replace({'L': 0, 'R': 1})


List the columns that are associated with each of the three data uses - pitch profile (clustering), pitch sequencing data, and hitter field (targets)

In [101]:
# Pitch profile (clustering) columns
pitch_profile = [
    'pitch_type',
    'vx0', 'vy0', 'vz0',
    'release_spin_rate', 'spin_axis',
    'pfx_x', 'pfx_z',
    'release_extension', 'release_pos_y',
]

# Pitch sequencing columns
pitch_sequencing = [
    'pitch_number', 'pitch_type',
    'balls', 'strikes',
    'stand', 'p_throws',
]

# Hitter (target) columns
hitter_fields = [
    'launch_speed', 'launch_angle',
    'plate_x', 'plate_z',
    'sz_top', 'sz_bot',
]

Identify the numeric fields that should be normalized. Other fields are categorical or sequential data.<br>
Pitch profile fields that aren't standardized: pitch_type <br>
Pitch sequencing fields aren't normalized

In [102]:
from sklearn.preprocessing import StandardScaler

# Numeric columns of the pitch profile (pitch_type is categorical and left out)
pitch_profile_numeric = [
    'vx0', 'vy0', 'vz0',
    'release_spin_rate', 'spin_axis',
    'pfx_x', 'pfx_z',
    'release_extension', 'release_pos_y',
]
# Numeric hitter fields
hitter_fields_numeric = [
    'launch_speed', 'launch_angle',
    'plate_x', 'plate_z',
    'sz_top', 'sz_bot',
]

# Standardize each group in place, fitting a separate scaler per group:
# first the pitch profile columns, then the hitter fields.
for numeric_cols in (pitch_profile_numeric, hitter_fields_numeric):
    scaler = StandardScaler()
    statcast_df[numeric_cols] = scaler.fit_transform(statcast_df[numeric_cols])