In [10]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns; sns.set()
import random
import os

In [11]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000000)
pd.options.mode.chained_assignment = None 

In [28]:
# get all of the csvs that have written out from the previous process
csvs = [i for i in os.listdir() if i.startswith('results_df')]
csvs

['results_df_4000.csv',
 'results_df_2500.csv',
 'results_df_2000.csv',
 'results_df_4500.csv',
 'results_df_0.csv',
 'results_df_1500.csv',
 'results_df_3500.csv',
 'results_df_3000.csv',
 'results_df_500.csv',
 'results_df_1000.csv']

In [30]:
remove_cols = ['Fielders Choice', 'Sacrifice Fly', 'Fielder\'s Choice-Adv-2nd', 
               'Sacrifice Bunt', 'Reached on error-Adv-3rd', 'Reached on error-Adv-2nd', 
               'Catcher Interference', 'Sacrifice Bunt-Adv-2nd', 'Sacrifice Bunt', 
               'Dirt Ball', 'Sacrifice Fly-Adv-2nd', 'Fielders Choice - Out at 2nd', 
               'Sacrifice Fly-Adv-1st', 'Sacrifice Bunt-Adv-1st', 'Reached On Error - Out at 2nd', 
               'Balk', 'Reached on error', 'Hit by pitch']

high_level_groupings = {
    'ball' : ['Ball', 'Intentional Ball', 'Pitch Out'], 
    
    'strike' : 
        ['Strike Looking', 'Strike Swinging', ' Strike Swinging - Out at Home', 
         'Strike-swinging-Adv-1st', 'Strike-looking-Adv-1st', 'Single - Out at 3rd', 
         'Strike Looking - Out at 1st'], 
    
    'contact_in_play_out' : 
        ['Pop Out', 'Ground Out', 'Fly Out', 'Line Out'], 
    
    'contact_in_on_base' : 
        ['Homerun', 'Single', 'Double', 'Single - Out at 2nd', 'Triple', 'Single-Adv-2nd', 
         'Double-Adv-3rd', 'Double Out at 3rd', 'Single-Adv-Home', 'Single-Adv-3rd', 'Double-Adv-Home', 
         'Triple-Adv-Home', 'Triple - Out at Home', 'Double-Adv-Home'], 
    
    'contact_foul' : ['Foul Ball', 'Foul Tip', ]
}

# remove any bat_id with 'rare' events (~10% of at-bats)
remove_ids = clean_df.loc[clean_df['outcomeDescription'].isin(remove_cols)]['bat_id'].unique()
clean_df = clean_df.loc[~clean_df['bat_id'].isin(remove_ids)]

# theres currently no assignment for a walk; create new column for outcome called walk
clean_df['walk_flag'] = np.where(
    ((clean_df['startingBalls'] == 3) & 
     (clean_df['outcomeDescription'].isin(high_level_groupings['ball']))), 1, 0
)

# do the same for strikes
clean_df['strikeout_flag'] = np.where(
    ((clean_df['startingStrikes'] == 2) & 
    (clean_df['outcomeDescription'].isin(high_level_groupings['strike']))), 1, 0
)

# put the previous ids in the data
clean_df['prev_state_id'] = (
    clean_df.sort_values(by=['bat_id', 'event_num'], ascending=True)
      .groupby('bat_id')['outcomeId']
      .shift(1)
      .fillna('bFP')
)

clean_df['prev_state_desc'] = (
    clean_df.sort_values(by=['bat_id', 'event_num'], ascending=True)
      .groupby('bat_id')['outcomeDescription']
      .shift(1)
      .fillna('First Pitch')
)

# assign based on the dictonary above
clean_df['assigned_outcome'] = ''
clean_df['prev_assigned_outcome'] = ''
for outcome, grouping in high_level_groupings.items(): 
    clean_df['assigned_outcome'] = np.where(
        clean_df['walk_flag'] == 1, 'walk', 
            np.where(clean_df['strikeout_flag'] == 1, 'strikeout',
                np.where(
                    clean_df['outcomeDescription'].isin(grouping), outcome, clean_df['assigned_outcome']
                )
            )          
        )
    
    # has the previous outcomes as the same grouping (except itis imporssible to get a walk here)
    clean_df['prev_assigned_outcome'] = np.where(
            clean_df['prev_state_desc'] == 'First Pitch', 'first_pitch',
            np.where(
                clean_df['prev_state_desc'].isin(grouping), outcome, clean_df['prev_assigned_outcome']
            )
    )

In [20]:
results_df = pd.DataFrame()
for csv in csvs:
    t = pd.read_csv(csv)
    results_df = pd.concat([results_df, t], axis=0)

In [21]:
results_df = results_df.drop(columns='Unnamed: 0')
clean_df = pd.read_csv('data/play_by_play_2016.csv').drop(columns='Unnamed: 0')

In [25]:
results_df.head()

Unnamed: 0,previous_play,ball,contact_foul,contact_in_on_base,contact_in_play_out,strike,actual_outcome,pitch_id,strikeout,walk,max_prob,predicted_result,correct_flag
0,first_pitch,0.350211,0.122363,0.037975,0.056259,0.433193,ball,b77ac3a4-673b-4644-b7fd-dcc720bca88b565de4be-d...,,,0.433193,strike,0
1,ball,0.322581,0.149194,0.060484,0.104839,0.362903,strike,b77ac3a4-673b-4644-b7fd-dcc720bca88b565de4be-d...,,,0.362903,strike,1
2,strike,0.344444,0.188889,0.066667,0.1,0.3,strike,b77ac3a4-673b-4644-b7fd-dcc720bca88b565de4be-d...,,,0.344444,ball,0
3,strike,0.348837,0.255814,0.116279,0.104651,,contact_in_on_base,b77ac3a4-673b-4644-b7fd-dcc720bca88b565de4be-d...,0.174419,,0.348837,ball,0
4,first_pitch,0.368347,0.107843,0.036415,0.079832,0.407563,strike,b7b37ed4-5752-4a4f-a581-d7312ed598aa565de4be-d...,,,0.407563,strike,1


### Check to see how many I got 'right'

In [26]:
# create a column for predicted outcome
prob_cols = [
    'ball', 'contact_foul', 'contact_in_on_base', 
    'contact_in_play_out', 'strike', 'strikeout', 'walk'
]

results_df['max_prob'] = results_df[prob_cols].max(axis=1)
results_df['predicted_result'] = ''

for prob_col in prob_cols: 
    results_df['predicted_result'] = np.where(
        results_df[prob_col] ==  results_df['max_prob'], 
        prob_col, results_df['predicted_result'] 
        
    )
    
    
# create a marker when I got the 'right' outcome
results_df['correct_flag'] = np.where(
        results_df['predicted_result'] ==  results_df['actual_outcome'], 1, 0
)

In [27]:
# look at accuracy 
results_df['correct_flag'].sum() / results_df['correct_flag'].count()

0.40872987980600267

In [55]:
# if I randomly assign based on percentiles from real data 
# how well do I do 
# not the greatest way - because a 'good' model should almost never predict basehits
pct_df = clean_df.groupby('assigned_outcome').agg({'pitch_id' : 'count'}).reset_index()
pct_df['pct'] = pct_df['pitch_id'] / pct_df['pitch_id'].sum()
pct_df = pct_df[['assigned_outcome', 'pct']]
outcomes, chance = pct_df['assigned_outcome'].tolist(), pct_df['pct'].tolist()

chance_df = results_df
chance_df['random_outcome'] = np.random.choice(a=outcomes, p=chance)

chance_df['random_correct_flag'] = np.where(
    results_df['predicted_result'] ==  results_df['random_outcome'], 1, 0
)

In [57]:
chance_df['random_correct_flag'].sum() / chance_df['random_outcome'].count()

0.09114125723389799

### Look at the accuacy for each moment (if I give a 30% chance of 

In [52]:
l = []
for _ in range(100000): 
    v = np.random.choice(a=outcomes, p=chance)
    l.append(v)
    
from collections import Counter
Counter(l)

Counter({'contact_foul': 18092,
         'ball': 33273,
         'strike': 22308,
         'contact_in_play_out': 12398,
         'strikeout': 5501,
         'contact_in_on_base': 6461,
         'walk': 1967})

In [None]:
# check how accurate i am for each prediction
pd.pivot_table(results_df, index='predicted_result', columns='actual_outcome', values=)

In [None]:
# look at what results I predicted the most
results_df.groupby('predicted_result').agg({'pitch_id' : 'count'})

In [None]:
# look at what results actually happen the most
results_df.groupby('actual_outcome').agg({'pitch_id' : 'count'})