### Vivek Divakarla: Reds Pitch Mix Prediction Project

In [58]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


In [59]:
data = pd.read_csv('data.csv')
pred = pd.read_csv('predictions.csv')

In [60]:
#Categorize pitch types into FB, BB, OS, based on baseball savant categories
def categorize_pitch(pitch):
    fastball = ['FF', 'FC', 'SI']
    breaking_ball = ['CS', 'CU', 'KC', 'SL', 'ST', 'SV', 'SC']
    off_speed = ['CH', 'FO', 'KN', 'EP', 'FS']
    
    if pitch in fastball:
        return 'FB'
    elif pitch in breaking_ball:
        return 'BB'
    elif pitch in off_speed:
        return 'OS'
    else:
        return None  

In [61]:
data['updated_pitch_type'] = data['PITCH_TYPE'].apply(categorize_pitch)

In [62]:
#Select columns for Model: All Pitches
all_pitches_selected_columns = ['BATTER_ID','GAME_YEAR', 'BAT_SIDE', 'THROW_SIDE', 'PLATE_X', 'PLATE_Z', 
                    'BALLS', 'STRIKES', 'INNING', 'updated_pitch_type', 'DELTA_HOME_WIN_EXP', 'DELTA_RUN_EXP']
df_selected = data[all_pitches_selected_columns]

In [63]:
data['updated_pitch_type'] = le.fit_transform(data['updated_pitch_type'])

pitch_type_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Pitch type to encoded number mapping:", pitch_type_mapping)

Pitch type to encoded number mapping: {'BB': 0, 'FB': 1, 'OS': 2, None: 3}


In [64]:
#Encode categorical variables to be able to be used for modeling
le = LabelEncoder()
df_selected['updated_pitch_type'] = le.fit_transform(df_selected['updated_pitch_type'])
df_selected['BAT_SIDE'] = le.fit_transform(df_selected['BAT_SIDE'])
df_selected['THROW_SIDE'] = le.fit_transform(df_selected['THROW_SIDE'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['updated_pitch_type'] = le.fit_transform(df_selected['updated_pitch_type'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['BAT_SIDE'] = le.fit_transform(df_selected['BAT_SIDE'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['THROW_SIDE'] = le.fit_transform(df_s

In [65]:
#set features and targets, and use train test split
df_selected = df_selected.dropna()

X = df_selected.drop(columns=['updated_pitch_type'])
y = df_selected['updated_pitch_type']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Using Random Forest to Classify
model = RandomForestClassifier()
model.fit(X_train, y_train)

In [66]:
predictions = model.predict_proba(X_test)
X_test[['PITCH_TYPE_BB', 'PITCH_TYPE_FB', 'PITCH_TYPE_OS', 'None']] = predictions

In [67]:
full_model = X_test.groupby("BATTER_ID")[["PITCH_TYPE_FB", "PITCH_TYPE_BB", "PITCH_TYPE_OS"]].mean().reset_index()
full_model

Unnamed: 0,BATTER_ID,PITCH_TYPE_FB,PITCH_TYPE_BB,PITCH_TYPE_OS
0,444482,0.513858,0.293430,0.191189
1,453568,0.560630,0.285289,0.152692
2,456781,0.531343,0.344439,0.121565
3,457705,0.546984,0.326932,0.124771
4,457759,0.556792,0.333482,0.108307
...,...,...,...,...
309,691718,0.506000,0.354000,0.138500
310,693304,0.495843,0.372247,0.130337
311,694384,0.550698,0.301395,0.146860
312,696285,0.492268,0.379175,0.126804


In [68]:
#Second Model: Using batted ball statistics
df_batted_balls = data[data['LAUNCH_SPEED_ANGLE'].notna()]
batted_balls_selected_columns = ['BATTER_ID', 'GAME_YEAR', 'BAT_SIDE', 'THROW_SIDE', 'PLATE_X', 'PLATE_Z', 
                    'BALLS', 'STRIKES', 'INNING', 'updated_pitch_type', 'DELTA_HOME_WIN_EXP', 'DELTA_RUN_EXP', 'ISO_VALUE', 'BABIP_VALUE',
                   'WOBA_DENOM', 'WOBA_VALUE', 'ESTIMATED_BA_USING_SPEEDANGLE', 'LAUNCH_ANGLE', 'LAUNCH_SPEED']

df_batted_balls = df_batted_balls[batted_balls_selected_columns]

In [71]:
df_batted_balls

Unnamed: 0,BATTER_ID,GAME_YEAR,BAT_SIDE,THROW_SIDE,PLATE_X,PLATE_Z,BALLS,STRIKES,INNING,updated_pitch_type,DELTA_HOME_WIN_EXP,DELTA_RUN_EXP,ISO_VALUE,BABIP_VALUE,WOBA_DENOM,WOBA_VALUE,ESTIMATED_BA_USING_SPEEDANGLE,LAUNCH_ANGLE,LAUNCH_SPEED
5,605141,2021,R,R,0.20,1.33,3,2,1,0,-0.036,0.327,0.0,1.0,1.0,0.9,0.829,34.0,68.4
6,608369,2021,L,R,-0.15,2.49,0,0,1,1,-0.052,0.701,0.0,1.0,1.0,0.9,0.577,8.0,96.3
10,457759,2021,R,R,0.09,0.88,1,2,1,1,0.103,-0.572,0.0,0.0,1.0,0.0,0.229,28.0,38.8
22,571970,2021,L,R,-0.05,2.33,2,2,1,1,0.042,-0.390,0.0,0.0,1.0,0.0,0.071,-37.0,71.1
28,621035,2021,R,R,0.01,1.81,1,1,2,0,-0.038,0.414,0.0,1.0,1.0,0.9,0.160,2.0,73.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1286146,608701,2023,R,L,-0.20,2.62,3,1,8,1,0.001,-0.166,0.0,0.0,1.0,0.0,0.151,22.0,93.2
1286152,676059,2023,R,R,-0.96,2.16,3,2,8,1,-0.004,-0.317,0.0,0.0,1.0,0.0,0.063,-9.0,65.1
1286161,668939,2023,L,R,-0.22,2.19,1,2,8,1,0.005,0.848,0.0,1.0,1.0,0.9,0.486,-2.0,102.1
1286164,623993,2023,L,R,0.13,2.29,2,0,8,0,-0.005,-0.297,0.0,0.0,1.0,0.0,0.009,43.0,82.0


In [72]:
df_batted_balls = df_batted_balls.dropna()

le = LabelEncoder()
df_batted_balls['updated_pitch_type'] = le.fit_transform(df_batted_balls['updated_pitch_type'])
df_batted_balls['BAT_SIDE'] = le.fit_transform(df_batted_balls['BAT_SIDE'])
df_batted_balls['THROW_SIDE'] = le.fit_transform(df_batted_balls['THROW_SIDE'])

X = df_batted_balls.drop(columns=['updated_pitch_type'])
y = df_batted_balls['updated_pitch_type']

batted_X_train, batted_X_test, batted_y_train, batted_y_test = train_test_split(X, y, test_size=0.2, random_state=42)

batted_ball_model = RandomForestClassifier()
batted_ball_model.fit(batted_X_train, batted_y_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_batted_balls['updated_pitch_type'] = le.fit_transform(df_batted_balls['updated_pitch_type'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_batted_balls['BAT_SIDE'] = le.fit_transform(df_batted_balls['BAT_SIDE'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_batted_balls['THROW_SIDE'] = l

In [74]:
bat_predictions = batted_ball_model.predict_proba(batted_X_test)

In [75]:
batted_X_test[['PITCH_TYPE_BB', 'PITCH_TYPE_FB', 'PITCH_TYPE_OS', 'Four']] = bat_predictions

In [76]:
bat_df = batted_X_test.groupby("BATTER_ID")[["PITCH_TYPE_FB", "PITCH_TYPE_BB", "PITCH_TYPE_OS"]].mean().reset_index()

In [77]:
merged_df = bat_df.copy()
weight_full = 0.83
weight_bat_df = 0.17

merged_df['PITCH_TYPE_FB'] = (
    full_model['PITCH_TYPE_FB'] * weight_full + bat_df['PITCH_TYPE_FB'] * weight_bat_df
)
merged_df['PITCH_TYPE_BB'] = (
    full_model['PITCH_TYPE_BB'] * weight_full + bat_df['PITCH_TYPE_BB'] * weight_bat_df
)
merged_df['PITCH_TYPE_OS'] = (
    full_model['PITCH_TYPE_OS'] * weight_full + bat_df['PITCH_TYPE_OS'] * weight_bat_df
)

In [78]:
predictions_2024 = pd.merge(pred, merged_df, on='BATTER_ID', suffixes=('_original', ''))

predictions_2024 = predictions_2024.drop(columns=['PITCH_TYPE_FB_original', 'PITCH_TYPE_BB_original', 'PITCH_TYPE_OS_original'])

predictions_2024.to_csv("2024_Season_predictions.csv")

In [79]:
mean_values = predictions_2024[['PITCH_TYPE_BB', 'PITCH_TYPE_FB', 'PITCH_TYPE_OS']].mean()

print(mean_values)

PITCH_TYPE_BB    0.307840
PITCH_TYPE_FB    0.547500
PITCH_TYPE_OS    0.142901
dtype: float64
