In [3]:
import pandas as pd
import os
from tqdm import tqdm
from sklearn.linear_model import LinearRegression

In [4]:
# Load every raw pitch-level CSV found in data/. Files whose name contains
# 'half_inning_data' are skipped because a later cell writes that derived
# table into the same folder. os.listdir returns entries in arbitrary order,
# so the listing is sorted to make the concatenated row order reproducible.
files = sorted(os.listdir('data'))

# Read each file once and concatenate a single time; calling pd.concat inside
# the loop re-copies every previously loaded row on each iteration.
raw_frames = [
    pd.read_csv(os.path.join('data', file))
    for file in files
    if 'half_inning_data' not in file
]
if not raw_frames:
    raise FileNotFoundError("no raw pitch files found in 'data'")
data = pd.concat(raw_frames, ignore_index=True)

# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the in-place form acts on an intermediate object and does
# not reliably update `data` (it warns on pandas 2.x and is a no-op under
# Copy-on-Write).
data['RunsScored'] = data['RunsScored'].fillna(0)

# Keep only rows that have a pitch call, an inning and a game id.
data = data.dropna(subset=['PitchCall', 'Inning', 'GameID'])

In [5]:
# Distinct game identifiers present in the loaded pitch data.
game_ids = data['GameID'].unique()

def widen_half_inning(data, game_id, half_inning):
    """Collapse the pitch rows of one half-inning into a single wide row.

    Each pitch is classified by its 'PitchCall'; pitches whose call is
    'InPlay' are classified by their 'PlayResult' instead. The returned row
    holds how many pitches fell into each tracked outcome, plus the total of
    'RunsScored'. Outcomes that are not tracked (or are missing) are ignored.

    Parameters
    ----------
    data : pandas.DataFrame
        Pitch rows with 'PitchCall', 'PlayResult' and 'RunsScored' columns.
        The frame is not modified.
    game_id
        Value stored in the 'GameID' column of the result.
    half_inning
        Value stored in the 'HalfInning' column of the result.

    Returns
    -------
    pandas.DataFrame
        One row (index 0) with columns 'GameID', 'HalfInning', 'RunScored'
        followed by one count column per tracked outcome. An empty `data`
        yields a row of zeros.
    """
    tracked_outcomes = ['StrikeCalled', 'BallCalled', 'Foul', 'StrikeSwinging', 'Out', 'Error',
                        'HitByPitch', 'Sacrifice', 'FoulTip', 'FieldersChoice', 'Single', 'Double',
                        'Triple', 'HomeRun']

    # reset_index returns a new frame, so nothing below writes into the
    # caller's data (the caller usually passes a slice of a larger frame,
    # where assignment would raise SettingWithCopyWarning).
    pitches = data.reset_index(drop=True)

    # Use PlayResult wherever the pitch was put in play, PitchCall otherwise.
    in_play = pitches['PitchCall'] == 'InPlay'
    outcomes = pitches['PitchCall'].mask(in_play, pitches['PlayResult'])

    # One vectorised tally replaces the per-row .loc lookups; value_counts
    # drops missing values, which were never counted anyway.
    tallies = outcomes.value_counts()

    row = {'GameID': game_id, 'HalfInning': half_inning, 'RunScored': data['RunsScored'].sum()}
    for outcome in tracked_outcomes:
        row[outcome] = int(tallies.get(outcome, 0))

    return pd.DataFrame(row, index=[0])

In [6]:
# Build one wide row per half-inning of every game, then persist the table.
# Rows are collected in a list and concatenated once at the end; growing the
# DataFrame with pd.concat inside the nested loop copies all earlier rows on
# every iteration.
half_inning_frames = []

for game_id in tqdm(game_ids):
    game_data = data.loc[data['GameID'] == game_id]
    last_inning = int(game_data['Inning'].astype(int).max())

    for inning in range(1, last_inning + 1):
        inning_data = game_data.loc[game_data['Inning'] == inning]

        # Bottom (home) half first, then top (away), matching the row order
        # this cell has always produced.
        for side, label in (('Bottom', 'bottom'), ('Top', 'top')):
            half_rows = inning_data.loc[inning_data['Top/Bottom'] == side]
            if half_rows.empty:
                # No pitches recorded for this half (for example a bottom of
                # the last inning that was never played). Emitting it would
                # add an all-zero observation to the regression input.
                continue
            half_inning_frames.append(widen_half_inning(half_rows, game_id, label + str(inning)))

if half_inning_frames:
    half_inning_data = pd.concat(half_inning_frames, ignore_index=True)
else:
    half_inning_data = pd.DataFrame()

half_inning_data.to_csv('data/half_inning_data.csv', index=False)

100%|██████████| 111/111 [00:05<00:00, 18.93it/s]


In [7]:
# Fit a linear model of runs per half-inning on the per-outcome pitch counts.
X = half_inning_data.drop(['GameID', 'HalfInning', 'RunScored'], axis=1)
y = half_inning_data['RunScored']

model = LinearRegression().fit(X, y)
coefs = pd.DataFrame({'Feature': X.columns, 'Coefficient': model.coef_})

# These features stay in the fit but are left out of the exported table.
# .copy() makes the filtered frame independent, so the column assignment
# below does not raise SettingWithCopyWarning.
coefs = coefs[~coefs['Feature'].isin(['Sacrifice', 'FieldersChoice', 'FoulTip'])].copy()

# Inverted min-max scaling: the largest raw coefficient maps to 0 and the
# smallest (most negative) maps to 1.
coef_max = coefs['Coefficient'].max()
coef_min = coefs['Coefficient'].min()
coefs['Coefficient'] = (coefs['Coefficient'] - coef_max) / (coef_min - coef_max)
coefs = coefs.sort_values('Coefficient', ascending=False)

In [8]:
# Write the Feature/Coefficient table to ab_scores.csv, omitting the DataFrame index.
coefs.to_csv('ab_scores.csv', index=False)