In [27]:
import pandas as pd
import os
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
import pickle

In [31]:
# Collect every CSV in the data directory whose file name contains 'GT'.
# os.listdir returns entries in arbitrary order, so the list is sorted to make
# the row order (which the lag feature below depends on) reproducible.
files = sorted(os.listdir('data'))
gt_frames = [pd.read_csv(os.path.join('data', file)) for file in files if 'GT' in file]
if not gt_frames:
    raise FileNotFoundError("No files containing 'GT' were found in the 'data' directory")

# Concatenate once; growing a DataFrame with pd.concat inside a loop re-copies
# all accumulated rows on every iteration.
data = pd.concat(gt_frames, ignore_index=True)

# For pitches put in play, replace the generic 'InPlay' call with the play result.
in_play = data['PitchCall'] == 'InPlay'
data.loc[in_play, 'PitchCall'] = data.loc[in_play, 'PlayResult']

# Lookup table joined below on its 'Feature' column; create_model later reads
# the 'Score' column, which is presumably supplied by this file -- TODO confirm.
ab_scores = pd.read_csv('ab_scores.csv')

# Normalise pitch-type labels: remove spaces, then title-case
# (so e.g. 'Drop ball' and 'dropball' both become 'Dropball').
data['TaggedPitchType'] = data['TaggedPitchType'].str.replace(' ', '').str.title()

# Keep only the supported pitch types, the team's own pitchers and rows with a
# known batter side.
# NOTE(review): the team filter is an exact, case-sensitive match on
# 'Georgia tech' -- confirm this is how the team is spelled in the data.
data = data[
    data['TaggedPitchType'].isin(['Dropball', 'Fastball', 'Riseball', 'Changeup', 'Curveball', 'Screwball'])
    & (data['PitcherTeam'] == 'Georgia tech')
    & data['BatterSide'].isin(['Right', 'Left'])
]
# The inner join drops any pitch whose PitchCall has no matching row in ab_scores.
data = pd.merge(data, ab_scores, left_on='PitchCall', right_on='Feature', how='inner')

# LagPitchType is the previous remaining pitch type by the same pitcher in the
# same game and the same 'Top/Bottom' value; the first pitch of each group gets NaN.
# NOTE(review): the grouping has no inning key, so the lag carries over from
# one inning to the next -- add one if a strict per-half-inning lag is intended.
data['LagPitchType'] = data.groupby(['Pitcher', 'GameID', 'Top/Bottom'])['TaggedPitchType'].shift(1)

# Safeguard only: the isin(['Right', 'Left']) filter above already excludes
# rows with a missing BatterSide.
data = data.dropna(subset=['BatterSide'])

In [37]:
def create_model(pitcher, random_state=42):
    """Train an MLP regressor that predicts 'Score' for a single pitcher.

    Reads the module-level ``data`` frame, keeps the rows whose 'Pitcher'
    equals ``pitcher``, one-hot encodes 'TaggedPitchType', 'LagPitchType' and
    'BatterSide', and adds one interaction column per (pitch type, lag pitch
    type) pair. Only categories that occur for this pitcher produce columns,
    so the feature set can differ between pitchers.

    Parameters
    ----------
    pitcher
        Value compared against ``data['Pitcher']`` to select the rows.
    random_state : int or None, default 42
        Seed passed to both ``train_test_split`` and ``MLPRegressor`` so that
        repeated runs produce the same model. Pass ``None`` for unseeded
        (non-reproducible) behaviour.

    Returns
    -------
    tuple
        ``(model, X_train)``: the fitted ``MLPRegressor`` and the training
        feature frame, whose column order is the order the model expects.
    """
    pitcher_data = data[data['Pitcher'] == pitcher]
    pitcher_data = pd.get_dummies(pitcher_data, columns=['TaggedPitchType', 'LagPitchType', 'BatterSide'])

    # Snapshot the dummy columns before adding anything, so the loops below do
    # not iterate over a column index that is being modified.
    pitch_cols = [col for col in pitcher_data.columns
                  if col.startswith('TaggedPitchType') and col != 'TaggedPitchType']
    lag_cols = [col for col in pitcher_data.columns
                if col.startswith('LagPitchType') and col != 'LagPitchType']

    # create interaction terms for each pitch / lag pitch combination
    for col in pitch_cols:
        for lag_col in lag_cols:
            pitcher_data[col + '_' + lag_col] = pitcher_data[col] * pitcher_data[lag_col]

    # Features: all pitch-type, lag and interaction columns, the batter-side
    # dummies, and any column starting with 'Balls' or 'Strikes'.
    feature_prefixes = ('TaggedPitchType', 'LagPitchType', 'BatterSide', 'Balls', 'Strikes')
    x_columns = [col for col in pitcher_data.columns if col.startswith(feature_prefixes)]
    print(x_columns)

    X = pitcher_data[x_columns]
    y = pitcher_data['Score']

    # The 20% hold-out is split off but not evaluated here; only the training
    # part is used for fitting and returned.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=random_state)

    model = MLPRegressor(hidden_layer_sizes=(100, 100, 100), max_iter=1000, random_state=random_state)
    model.fit(X_train, y_train)

    return model, X_train

# Make sure the output directory exists so the open() calls below do not fail
# on a fresh checkout.
os.makedirs('models', exist_ok=True)

# Train and persist one model per pitcher. Missing pitcher names are skipped:
# they cannot be concatenated into a file name and match no rows in create_model.
for pitcher in data['Pitcher'].dropna().unique():
    model, X_train = create_model(pitcher)

    with open('models/' + pitcher + '.pkl', 'wb') as f:
        pickle.dump(model, f)

    # Save the feature names, one per line, in the column order the model was
    # trained on, so inputs can be rebuilt in the same order at prediction time.
    with open('models/' + pitcher + ' Features.txt', 'w') as f:
        f.write('\n'.join(X_train.columns.tolist()))

['Balls', 'Strikes', 'TaggedPitchType_Changeup', 'TaggedPitchType_Curveball', 'TaggedPitchType_Dropball', 'TaggedPitchType_Fastball', 'TaggedPitchType_Riseball', 'TaggedPitchType_Screwball', 'LagPitchType_Changeup', 'LagPitchType_Curveball', 'LagPitchType_Dropball', 'LagPitchType_Fastball', 'LagPitchType_Riseball', 'LagPitchType_Screwball', 'BatterSide_Left', 'BatterSide_Right', 'TaggedPitchType_Changeup_LagPitchType_Changeup', 'TaggedPitchType_Changeup_LagPitchType_Curveball', 'TaggedPitchType_Changeup_LagPitchType_Dropball', 'TaggedPitchType_Changeup_LagPitchType_Fastball', 'TaggedPitchType_Changeup_LagPitchType_Riseball', 'TaggedPitchType_Changeup_LagPitchType_Screwball', 'TaggedPitchType_Curveball_LagPitchType_Changeup', 'TaggedPitchType_Curveball_LagPitchType_Curveball', 'TaggedPitchType_Curveball_LagPitchType_Dropball', 'TaggedPitchType_Curveball_LagPitchType_Fastball', 'TaggedPitchType_Curveball_LagPitchType_Riseball', 'TaggedPitchType_Curveball_LagPitchType_Screwball', 'TaggedP