In [14]:
import pandas as pd

# Statcast fields to load from the export; any other column in the CSV is skipped.
statcast_fields = [
    'pitch_type', 'game_date', 'release_speed', 'release_pos_x', 'release_pos_z',
    'pitcher', 'zone', 'des', 'p_throws', 'bb_type', 'balls', 'strikes', 'game_year',
    'pfx_x', 'pfx_z', 'hc_x', 'hc_y', 'hit_distance_sc', 'launch_speed', 'launch_angle',
    'effective_speed', 'release_spin_rate', 'release_extension',
    'estimated_woba_using_speedangle', 'woba_value', 'woba_denom', 'iso_value',
    'launch_speed_angle', 'pitch_name', 'spin_axis',
]

# Load the export and give the primary response column its short name, xwOBA.
marte_raw = pd.read_csv('marte_vs_nlwest_19_22.csv', usecols=statcast_fields)
marte_raw = marte_raw.rename(columns={'estimated_woba_using_speedangle': 'xwOBA'})
print(f"data-set shape prior to filtering: {marte_raw.shape}")

# Drop rows with a missing woba_denom; this notebook treats a NaN woba_denom
# as the marker of a sacrifice event.
marte_raw = marte_raw.dropna(subset=['woba_denom'])
print(f"data-set shape after to filtering sac hits: {marte_raw.shape}")

data-set shape prior to filtering: (867, 30)
data-set shape after to filtering sac hits: (865, 30)


- release_speed
    - Pitch velocities from 2008-16 are via Pitch F/X, and adjusted to roughly out-of-hand release point. All velocities from 2017 and beyond are Statcast, which are reported out-of-hand.
- release_pos_x
    - Horizontal Release Position of the ball measured in feet from the catcher's perspective.
- release_pos_z
    - Vertical Release Position of the ball measured in feet from the catcher's perspective.
- plate_x
    - Horizontal position of the ball when it crosses home plate from the catcher's perspective.
- plate_z
    - Vertical position of the ball when it crosses home plate from the catcher's perspective.
- pitcher
    - MLB Player Id tied to the play event.
- zone
    - Zone location of the ball when it crosses the plate from the catcher's perspective.
- pfx_x
    - Horizontal movement in feet from the catcher's perspective.
- pfx_z
    - Vertical movement in feet from the catcher's perspective.
- effective_speed
    - Derived speed based on the extension of the pitcher's release.
- release_spin_rate
    - Spin rate of pitch tracked by Statcast.
- spin_axis
    - The Spin Axis in the 2D X-Z plane in degrees from 0 to 360, such that 180 represents a pure backspin fastball and 0 degrees represents a pure topspin (12-6) curveball

In [28]:
import numpy as np

# Columns left out of the proposed model; everything else in marte_raw is kept.
non_model_columns = [
    'game_date', 'release_pos_x', 'release_pos_z', 'des', 'bb_type', 'balls', 'strikes',
    'hc_x', 'hc_y', 'hit_distance_sc', 'launch_speed', 'launch_angle', 'effective_speed',
    'release_extension', 'woba_denom', 'iso_value', 'launch_speed_angle', 'pitch_name',
]
marte_vs = marte_raw.drop(columns=non_model_columns)
print(f"new data-set shape after to filtering for proper metrics: {marte_vs.shape}")

# Combine horizontal and vertical movement into one magnitude:
# the length of the (pfx_x, pfx_z) vector, sqrt(pfx_x^2 + pfx_z^2).
marte_vs['pfx_v'] = np.sqrt(np.square(marte_vs['pfx_x']) + np.square(marte_vs['pfx_z']))
print(marte_vs.head())

new data-set shape after to filtering for proper metrics: (865, 12)
  pitch_type  release_speed  pitcher  zone p_throws  game_year  pfx_x  pfx_z  \
0         SI           92.9   596001     5        R       2022  -1.15   0.68   
1         SI           92.6   596001     9        R       2022  -1.05   0.80   
2         SI           93.0   596001     8        R       2022  -1.13   0.79   
3         SI           93.3   518397     8        L       2022   1.19   0.28   
4         SI           94.1   502171     4        R       2022  -1.34   0.45   

   release_spin_rate  xwOBA  woba_value  spin_axis     pfx_v  
0             2303.0  0.709         0.9      211.0  1.336001  
1             2316.0  0.262         0.0      204.0  1.320038  
2             2245.0  0.231         0.0      219.0  1.378768  
3             2189.0  0.117         0.0      135.0  1.222497  
4             2126.0  0.438         0.0      221.0  1.413542  


In [23]:
from sklearn.model_selection import train_test_split

# Response is xwOBA; every other column of marte_vs is passed along as a feature.
# NOTE(review): this leaves woba_value (which looks outcome-derived) and string-valued
# columns such as pitch_type / p_throws inside X -- confirm that is intended.
y = marte_vs['xwOBA']
X = marte_vs.drop(columns=['xwOBA'])

# Stage 1: set aside 15% of all rows as the hold-out (validation) partition.
X_splitAgain, X_holdOut, y_splitAgain, y_holdOut = train_test_split(
    X, y, test_size=0.15, random_state=1642)

# Stage 2: split the remaining 85% into train and test. Taking 15/85 of the
# remainder makes the test partition 15% of the full data set as well,
# i.e. roughly 70/15/15 train/test/hold-out overall.
X_train, X_test, y_train, y_test = train_test_split(
    X_splitAgain, y_splitAgain, test_size=15/85, random_state=1642)

# Report the feature and target shapes of each partition.
for split_header, split_X, split_y in (
        ('Train set shape\n', X_train, y_train),
        ('Test set shape\n', X_test, y_test),
        ('Houldout set shape\n', X_holdOut, y_holdOut)):
    print(split_header, split_X.shape, split_y.shape)
print('---')


Train set shape
 (605, 13) (605,)
Test set shape
 (130, 13) (130,)
Houldout set shape
 (130, 13) (130,)
---
