In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Widen pandas' display limits so large frames are shown without truncation.
for option_name, option_value in {
    'display.max_rows': 500,
    'display.max_columns': 500,
    'display.width': 1000,
}.items():
    pd.set_option(option_name, option_value)

In [3]:
# Load the 2016 play-by-play file, then discard the 'Unnamed: 0' column
# (presumably an index column written out by an earlier to_csv - verify).
clean_df = pd.read_csv('data/play_by_play_2016.csv')
clean_df = clean_df.drop(columns='Unnamed: 0')

### Get a single result for each pitch (ball/strike)

In [4]:
# Order rows by event_num inside each at-bat once and reuse the grouping for
# both "previous pitch" columns.
pitches_by_at_bat = (
    clean_df.sort_values(by=['bat_id', 'event_num'], ascending=True)
      .groupby('bat_id')
)

# shift(1) looks one row back within the at-bat; rows with nothing before them
# come back as NaN and are filled with the placeholder values below.
# Assignment aligns on the original index, so the sort does not reorder clean_df.
clean_df['prev_state_id'] = pitches_by_at_bat['outcomeId'].shift(1).fillna('bFP')
clean_df['prev_state_desc'] = (
    pitches_by_at_bat['outcomeDescription'].shift(1).fillna('First Pitch')
)

QA check: look at the most popular paths to each outcome (this is the baseline I'll try to beat). This is done the manual way for now; it can be sped up later by predicting the probability at each state.

In [5]:
# Collect each at-bat's outcome descriptions into one list per bat_id.
# Sort by event_num first: groupby keeps rows in their incoming order within
# each group, so without the sort the sequence order would depend on the row
# order of the CSV rather than on the pitch order used for the prev_state columns.
t = (
    clean_df.sort_values(by=['bat_id', 'event_num'], ascending=True)
      .groupby('bat_id')['outcomeDescription']
      .apply(list)
)

# spread the lists into one column per position (shorter at-bats are padded with nulls)
out = pd.DataFrame(t.tolist(), index=t.index)

# collapse every row back into a single list column and keep only the id + list
out['all_states'] = out.values.tolist()
kv_df = out.reset_index()[['bat_id', 'all_states']]

In [6]:
def remove_nulls(row):
    """Return the row's ``all_states`` entries with null padding removed.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must expose an ``'all_states'`` entry holding a list of scalar values.

    Returns
    -------
    list
        The non-null entries in their original order.
    """
    # pd.notna rejects None as well as NaN / pd.NA, so the padding is stripped
    # regardless of which missing-value marker pandas used when widening the rows.
    return [state for state in row['all_states'] if pd.notna(state)]

# Replace every at-bat's padded list with its null-free version.
kv_df = kv_df.assign(all_states=kv_df.apply(remove_nulls, axis=1))

In [7]:
# build a bat_id -> list-of-states dictionary for key/value lookups
state_dict = {
    bat_id: states
    for bat_id, states in zip(kv_df['bat_id'], kv_df['all_states'])
}

In [8]:
# quick look at the first five entries of the dictionary
{bat_id: state_dict[bat_id] for bat_id in list(state_dict)[:5]}

{'000f209b-7132-4020-a2b6-dec9196a1802565de4be-dc80-4849-a7e1-54bc79156cc8090ff436-c1e8-4927-b457-355cf4f9993bcc672f14-6c84-4dde-8beb-90664d4678431BOT1': ['Strike Looking',
  'Strike Looking',
  'Foul Ball',
  'Line Out'],
 '000f209b-7132-4020-a2b6-dec9196a1802565de4be-dc80-4849-a7e1-54bc79156cc8090ff436-c1e8-4927-b457-355cf4f9993bcc672f14-6c84-4dde-8beb-90664d4678433BOT1': ['Ball',
  'Fly Out'],
 '000f209b-7132-4020-a2b6-dec9196a1802565de4be-dc80-4849-a7e1-54bc79156cc8090ff436-c1e8-4927-b457-355cf4f9993bcc672f14-6c84-4dde-8beb-90664d4678436BOT6': ['Ball',
  'Pop Out'],
 '000f209b-7132-4020-a2b6-dec9196a1802565de4be-dc80-4849-a7e1-54bc79156cc8090ff436-c1e8-4927-b457-355cf4f9993bee01fdc0-ef9d-4352-9ad9-b1331ce88f099BOT2': ['Ground Out'],
 '000f209b-7132-4020-a2b6-dec9196a1802565de4be-dc80-4849-a7e1-54bc79156cc8141d06a7-bef6-4d75-8bcb-7ed64be0d16e4dfb03a2-a8f0-40d7-b4eb-dad5e12e24f77TOP2': ['Ball',
  'Ball',
  'Foul Ball',
  'Ground Out']}

In [9]:
def look_start_n(n, states=None):
    """Build a table of every run of ``n`` consecutive outcomes per at-bat.

    Parameters
    ----------
    n : int
        Number of consecutive outcomes in each run.
    states : dict, optional
        Mapping of at-bat id to its list of outcome descriptions (strings).
        Defaults to the notebook-level ``state_dict``.

    Returns
    -------
    pandas.DataFrame
        One row per at-bat with a ``bat_id`` column plus ``combo_<i>`` columns,
        where ``combo_<i>`` is the comma-joined run that starts at position
        ``i``. Positions an at-bat does not reach are NaN.
    """
    if states is None:
        states = state_dict

    combos_by_bat = {}
    for bat_id, outcomes in states.items():
        combos = {}
        for start in range(len(outcomes)):
            window = outcomes[start:start + n]
            # windows that run past the end of the at-bat are shorter than n; skip them
            if len(window) == n:
                combos[f'combo_{start}'] = ','.join(window)
        combos_by_bat[bat_id] = combos

    # a dict of dicts puts the at-bat ids on the columns; transpose so each at-bat is a row
    analysis_df = pd.DataFrame(combos_by_bat).T.reset_index().rename(columns={'index' : 'bat_id'})
    return analysis_df

In [11]:
analysis_df = look_start_n(n=3)

# For each of the first five starting positions, show the 20 sequences that
# occur in the most at-bats.
for position in range(5):
    combo = f'combo_{position}'
    counts = analysis_df.groupby(combo).agg({'bat_id' : 'count'})
    show_df = counts.sort_values(by=['bat_id'], ascending=False).head(20)

    display(show_df)

Unnamed: 0_level_0,bat_id
combo_0,Unnamed: 1_level_1
"Strike Looking,Ball,Ball",7197
"Ball,Ball,Ball",6257
"Ball,Ball,Strike Looking",5400
"Ball,Strike Looking,Ball",5181
"Strike Looking,Ball,Foul Ball",4380
"Strike Looking,Foul Ball,Ball",3919
"Ball,Ball,Foul Ball",3404
"Ball,Foul Ball,Ball",3334
"Ball,Strike Looking,Foul Ball",2907
"Strike Looking,Strike Looking,Ball",2649


Unnamed: 0_level_0,bat_id
combo_1,Unnamed: 1_level_1
"Ball,Ball,Strike Looking",5048
"Ball,Ball,Ball",4877
"Ball,Foul Ball,Ball",3528
"Foul Ball,Ball,Ball",3000
"Strike Looking,Ball,Ball",2928
"Ball,Strike Looking,Ball",2922
"Ball,Ball,Foul Ball",2634
"Ball,Foul Ball,Foul Ball",2210
"Foul Ball,Ball,Foul Ball",2110
"Strike Looking,Ball,Foul Ball",2031


Unnamed: 0_level_0,bat_id
combo_2,Unnamed: 1_level_1
"Ball,Ball,Ball",2898
"Ball,Foul Ball,Ball",2516
"Ball,Ball,Foul Ball",2507
"Foul Ball,Ball,Ball",2355
"Foul Ball,Ball,Foul Ball",2102
"Ball,Foul Ball,Foul Ball",2048
"Ball,Strike Looking,Ball",1771
"Foul Ball,Foul Ball,Ball",1666
"Ball,Strike Looking,Foul Ball",1471
"Ball,Ball,Strike Looking",1256


Unnamed: 0_level_0,bat_id
combo_3,Unnamed: 1_level_1
"Ball,Foul Ball,Foul Ball",1724
"Ball,Foul Ball,Ball",1693
"Foul Ball,Ball,Foul Ball",1553
"Ball,Ball,Foul Ball",1421
"Foul Ball,Ball,Ball",1361
"Foul Ball,Foul Ball,Ball",1291
"Foul Ball,Foul Ball,Foul Ball",1080
"Ball,Ball,Ball",1079
"Ball,Foul Ball,Strike Swinging",754
"Foul Ball,Ball,Strike Swinging",742


Unnamed: 0_level_0,bat_id
combo_4,Unnamed: 1_level_1
"Ball,Foul Ball,Foul Ball",1034
"Foul Ball,Foul Ball,Foul Ball",1021
"Foul Ball,Ball,Foul Ball",960
"Foul Ball,Foul Ball,Ball",949
"Ball,Foul Ball,Ball",887
"Foul Ball,Ball,Ball",646
"Foul Ball,Foul Ball,Strike Swinging",470
"Ball,Foul Ball,Strike Swinging",427
"Foul Ball,Ball,Strike Swinging",387
"Ball,Foul Ball,Ground Out",347
