In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Raise pandas' display limits so wide/long frames are rendered with fewer cut-offs.
for _option_name, _option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000000),
):
    pd.set_option(_option_name, _option_value)

In [3]:
# Load the 2016 play-by-play export and discard the 'Unnamed: 0' column.
clean_df = pd.read_csv('data/play_by_play_2016.csv').drop('Unnamed: 0', axis='columns')

In [4]:
# Outcome descriptions treated as 'rare' events. Any at-bat containing one of
# these is removed from the analysis by the cleaning cell.
# (The duplicated 'Sacrifice Bunt' entry has been removed; membership is unchanged.)
remove_cols = [
    'Fielders Choice', 'Sacrifice Fly', 'Fielder\'s Choice-Adv-2nd',
    'Sacrifice Bunt', 'Reached on error-Adv-3rd', 'Reached on error-Adv-2nd',
    'Catcher Interference', 'Sacrifice Bunt-Adv-2nd',
    'Dirt Ball', 'Sacrifice Fly-Adv-2nd', 'Fielders Choice - Out at 2nd',
    'Sacrifice Fly-Adv-1st', 'Sacrifice Bunt-Adv-1st', 'Reached On Error - Out at 2nd',
    'Balk', 'Reached on error',
]

# Maps each high-level outcome label to the raw outcome descriptions it covers.
# Every description should appear in exactly one group.
high_level_groupings = {
    'ball': ['Ball', 'Intentional Ball', 'Pitch Out'],

    # NOTE(review): ' Strike Swinging - Out at Home' is kept byte-for-byte,
    # including its leading space -- confirm it matches the raw data exactly.
    'strike': [
        'Strike Looking', 'Strike Swinging', ' Strike Swinging - Out at Home',
        'Strike-swinging-Adv-1st', 'Strike-looking-Adv-1st',
        'Strike Looking - Out at 1st',
    ],

    'contact_in_play_out': ['Pop Out', 'Ground Out', 'Fly Out', 'Line Out'],

    # 'Single - Out at 3rd' was previously listed under 'strike'; it is a base
    # hit and belongs here alongside 'Single - Out at 2nd'.
    # The duplicated 'Double-Adv-Home' entry has been removed.
    # NOTE(review): 'Double Out at 3rd' has no ' - ' separator, unlike its
    # siblings -- confirm against the raw data.
    'contact_in_on_base': [
        'Homerun', 'Single', 'Double', 'Triple',
        'Single - Out at 2nd', 'Single - Out at 3rd',
        'Single-Adv-2nd', 'Single-Adv-3rd', 'Single-Adv-Home',
        'Double-Adv-3rd', 'Double Out at 3rd', 'Double-Adv-Home',
        'Triple-Adv-Home', 'Triple - Out at Home',
    ],

    'on_base_no_contact': ['Hit by pitch'],

    'contact_foul': ['Foul Ball', 'Foul Tip'],
}

In [5]:
# Re-read the raw play-by-play file (the same file is also loaded in an earlier
# cell) so the filtering cell that follows starts from the unfiltered rows.
clean_df = pd.read_csv('data/play_by_play_2016.csv').drop(columns=['Unnamed: 0'])

In [6]:
# Remove every at-bat (bat_id) that contains at least one 'rare' event
# (~10% of at-bats).
remove_ids = clean_df.loc[clean_df['outcomeDescription'].isin(remove_cols), 'bat_id'].unique()
# .copy() makes the filtered frame independent of the unfiltered one, so the
# column assignments below do not write into a slice (SettingWithCopyWarning).
clean_df = clean_df.loc[~clean_df['bat_id'].isin(remove_ids)].copy()

# There is no explicit walk outcome in the data; flag a 'Ball' thrown on a
# 3-ball count as a walk.
# NOTE(review): only the literal 'Ball' description is checked, so an
# 'Intentional Ball' or 'Pitch Out' on a 3-ball count stays labelled 'ball' --
# confirm this is intended.
clean_df['walk_flag'] = np.where(
    (clean_df['startingBalls'] == 3) & (clean_df['outcomeDescription'] == 'Ball'), 1, 0
)

# Previous pitch's outcome within the same at-bat, ordered by event_num.
# Both columns are shifted in one pass; the assignments below align on the index.
prev_pitch = (
    clean_df.sort_values(by=['bat_id', 'event_num'], ascending=True)
      .groupby('bat_id')[['outcomeId', 'outcomeDescription']]
      .shift(1)
)
# The first pitch of an at-bat has no predecessor; label it explicitly.
clean_df['prev_state_id'] = prev_pitch['outcomeId'].fillna('bFP')
clean_df['prev_state_desc'] = prev_pitch['outcomeDescription'].fillna('First Pitch')

# Invert the grouping dictionary into a description -> outcome lookup so each
# column is assigned in a single vectorised pass instead of once per group.
# If a description were listed in several groups, the last group wins, which
# matches the previous loop's behaviour.
description_to_outcome = {
    description: outcome
    for outcome, grouping in high_level_groupings.items()
    for description in grouping
}

# Current pitch: a walk overrides the grouped label; descriptions that belong
# to no group get ''.
clean_df['assigned_outcome'] = np.where(
    clean_df['walk_flag'] == 1,
    'walk',
    clean_df['outcomeDescription'].map(description_to_outcome).fillna(''),
)

# Previous pitch: same grouping, plus 'first_pitch' for the start of an at-bat.
# No 'walk' label is assigned on the previous-pitch side.
prev_description_to_outcome = {**description_to_outcome, 'First Pitch': 'first_pitch'}
clean_df['prev_assigned_outcome'] = (
    clean_df['prev_state_desc'].map(prev_description_to_outcome).fillna('')
)

In [8]:
# Number of pitch rows left after the rare-event at-bats were removed.
clean_df.shape[0]

584282

In [9]:
# Number of distinct at-bats (bat_id values) that were dropped.
remove_ids.shape[0]

18804

### If a strike was just thrown - the next pitch will be...

In [10]:
# Keep only the cleaned pitches thrown by one pitcher.
pitcher = 'Justin_Verlander'
pitcher_df = clean_df[clean_df['pitcher_full_name'].eq(pitcher)]

In [11]:
# Display the rows selected for the chosen pitcher.
pitcher_df

Unnamed: 0,pitch_id,event_num,bat_id,gameId,seasonId,createdAt,awayTeamId,homeTeamId,venueSurface,venueCity,venueOutfieldDistances,inningNumber,inningHalf,inningEventType,inningHalfEventSequenceNumber,atBatEventType,atBatEventSequenceNumber,outcomeId,outcomeDescription,hitterFirstName,hitterLastName,hitterId,hitterWeight,hitterHeight,hitterBatHand,pitcherFirstName,pitcherLastName,pitcherId,pitcherThrowHand,pitchType,pitchSpeed,pitchZone,pitcherPitchCount,hitterPitchCount,startingBalls,startingStrikes,startingOuts,balls,strikes,outs,is_ab,is_ab_over,is_hit,is_on_base,is_bunt,is_bunt_shown,is_double_play,is_triple_play,is_wild_pitch,is_passed_ball,rob1_outcomeId,rob1_outcomeDescription,rob2_start,rob2_end,rob2_isOut,rob2_outcomeId,rob2_outcomeDescription,rob3_start,rob3_end,rob3_isOut,rob3_outcomeId,rob3_outcomeDescription,description,homeCurrentTotalRuns,awayCurrentTotalRuns,lineupPosition,lineupOrder,pitcher_full_name,hitter_full_name,walk_flag,prev_state_id,prev_state_desc,assigned_outcome,prev_assigned_outcome
1430,3aec13c1-78dd-4ffb-9aeb-04703ce2cfb4565de4be-d...,1,3aec13c1-78dd-4ffb-9aeb-04703ce2cfb4565de4be-d...,3aec13c1-78dd-4ffb-9aeb-04703ce2cfb4,565de4be-dc80-4849-a7e1-54bc79156cc8,2016-04-28 01:10:44+00:00,27a59d3b-ff7c-48ea-b016-4798f560f5e1,575c19b7-4052-41c2-9f0a-1c5813d02f99,grass,Detroit,mrf-350:mlcf-406:cf-420:rcf-365:rf-330:mlf-370...,6,TOP,AT_BAT,1,PITCH,1,bB,Ball,Christopher,Coghlan,8dcb75a4-2928-4cd2-bcbd-cd5f79601e41,195,72,L,Justin,Verlander,2fe6cb9b-c41e-4ee2-9b22-f8b27bf6c5be,R,CU,76,12,82,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,,,,0,False,,,,0,False,,,Chris Coghlan strikes out swinging.,5,1,0,0,Justin_Verlander,Christopher_Coghlan,0,bFP,First Pitch,ball,first_pitch
1431,3aec13c1-78dd-4ffb-9aeb-04703ce2cfb4565de4be-d...,2,3aec13c1-78dd-4ffb-9aeb-04703ce2cfb4565de4be-d...,3aec13c1-78dd-4ffb-9aeb-04703ce2cfb4,565de4be-dc80-4849-a7e1-54bc79156cc8,2016-04-28 01:11:04+00:00,27a59d3b-ff7c-48ea-b016-4798f560f5e1,575c19b7-4052-41c2-9f0a-1c5813d02f99,grass,Detroit,mrf-350:mlcf-406:cf-420:rcf-365:rf-330:mlf-370...,6,TOP,AT_BAT,1,PITCH,2,bB,Ball,Christopher,Coghlan,8dcb75a4-2928-4cd2-bcbd-cd5f79601e41,195,72,L,Justin,Verlander,2fe6cb9b-c41e-4ee2-9b22-f8b27bf6c5be,R,FA,91,11,83,2,1,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,,,,0,False,,,,0,False,,,Chris Coghlan strikes out swinging.,5,1,0,0,Justin_Verlander,Christopher_Coghlan,0,bB,Ball,ball,ball
1432,3aec13c1-78dd-4ffb-9aeb-04703ce2cfb4565de4be-d...,3,3aec13c1-78dd-4ffb-9aeb-04703ce2cfb4565de4be-d...,3aec13c1-78dd-4ffb-9aeb-04703ce2cfb4,565de4be-dc80-4849-a7e1-54bc79156cc8,2016-04-28 01:11:19+00:00,27a59d3b-ff7c-48ea-b016-4798f560f5e1,575c19b7-4052-41c2-9f0a-1c5813d02f99,grass,Detroit,mrf-350:mlcf-406:cf-420:rcf-365:rf-330:mlf-370...,6,TOP,AT_BAT,1,PITCH,3,kKL,Strike Looking,Christopher,Coghlan,8dcb75a4-2928-4cd2-bcbd-cd5f79601e41,195,72,L,Justin,Verlander,2fe6cb9b-c41e-4ee2-9b22-f8b27bf6c5be,R,FA,91,6,84,3,2,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,,,,0,False,,,,0,False,,,Chris Coghlan strikes out swinging.,5,1,0,0,Justin_Verlander,Christopher_Coghlan,0,bB,Ball,strike,ball
1433,3aec13c1-78dd-4ffb-9aeb-04703ce2cfb4565de4be-d...,4,3aec13c1-78dd-4ffb-9aeb-04703ce2cfb4565de4be-d...,3aec13c1-78dd-4ffb-9aeb-04703ce2cfb4,565de4be-dc80-4849-a7e1-54bc79156cc8,2016-04-28 01:11:44+00:00,27a59d3b-ff7c-48ea-b016-4798f560f5e1,575c19b7-4052-41c2-9f0a-1c5813d02f99,grass,Detroit,mrf-350:mlcf-406:cf-420:rcf-365:rf-330:mlf-370...,6,TOP,AT_BAT,1,PITCH,4,kKL,Strike Looking,Christopher,Coghlan,8dcb75a4-2928-4cd2-bcbd-cd5f79601e41,195,72,L,Justin,Verlander,2fe6cb9b-c41e-4ee2-9b22-f8b27bf6c5be,R,SL,82,11,85,4,2,1,0,2,2,0,0,0,0,0,0,0,0,0,0,0,,,,0,False,,,,0,False,,,Chris Coghlan strikes out swinging.,5,1,0,0,Justin_Verlander,Christopher_Coghlan,0,kKL,Strike Looking,strike,strike
1434,3aec13c1-78dd-4ffb-9aeb-04703ce2cfb4565de4be-d...,5,3aec13c1-78dd-4ffb-9aeb-04703ce2cfb4565de4be-d...,3aec13c1-78dd-4ffb-9aeb-04703ce2cfb4,565de4be-dc80-4849-a7e1-54bc79156cc8,2016-04-28 01:12:17+00:00,27a59d3b-ff7c-48ea-b016-4798f560f5e1,575c19b7-4052-41c2-9f0a-1c5813d02f99,grass,Detroit,mrf-350:mlcf-406:cf-420:rcf-365:rf-330:mlf-370...,6,TOP,AT_BAT,1,PITCH,5,kKS,Strike Swinging,Christopher,Coghlan,8dcb75a4-2928-4cd2-bcbd-cd5f79601e41,195,72,L,Justin,Verlander,2fe6cb9b-c41e-4ee2-9b22-f8b27bf6c5be,R,SL,85,12,86,5,2,2,0,2,3,1,1,1,0,0,0,0,0,0,0,0,,,,0,False,,,,0,False,,,Chris Coghlan strikes out swinging.,5,1,0,0,Justin_Verlander,Christopher_Coghlan,0,kKL,Strike Looking,strike,strike
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
665878,e1ee17f8-fb3b-48f5-b152-82121ee43e5a565de4be-d...,1,e1ee17f8-fb3b-48f5-b152-82121ee43e5a565de4be-d...,e1ee17f8-fb3b-48f5-b152-82121ee43e5a,565de4be-dc80-4849-a7e1-54bc79156cc8,2016-08-05 23:28:36+00:00,f246a5e5-afdb-479c-9aaa-c68beeda7af6,575c19b7-4052-41c2-9f0a-1c5813d02f99,grass,Detroit,mrf-350:mlcf-406:cf-420:rcf-365:rf-330:mlf-370...,2,TOP,AT_BAT,1,PITCH,1,kF,Foul Ball,Jay,Bruce,58c17caa-c048-4336-b296-fa4eaf039eac,225,75,L,Justin,Verlander,2fe6cb9b-c41e-4ee2-9b22-f8b27bf6c5be,R,FA,91,6,8,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,,,,0,False,,,,0,False,,,Jay Bruce singles to right field.,1,0,0,0,Justin_Verlander,Jay_Bruce,0,bFP,First Pitch,contact_foul,first_pitch
665879,e1ee17f8-fb3b-48f5-b152-82121ee43e5a565de4be-d...,2,e1ee17f8-fb3b-48f5-b152-82121ee43e5a565de4be-d...,e1ee17f8-fb3b-48f5-b152-82121ee43e5a,565de4be-dc80-4849-a7e1-54bc79156cc8,2016-08-05 23:28:58+00:00,f246a5e5-afdb-479c-9aaa-c68beeda7af6,575c19b7-4052-41c2-9f0a-1c5813d02f99,grass,Detroit,mrf-350:mlcf-406:cf-420:rcf-365:rf-330:mlf-370...,2,TOP,AT_BAT,1,PITCH,2,kF,Foul Ball,Jay,Bruce,58c17caa-c048-4336-b296-fa4eaf039eac,225,75,L,Justin,Verlander,2fe6cb9b-c41e-4ee2-9b22-f8b27bf6c5be,R,FA,93,5,9,2,0,1,0,0,2,0,0,0,0,0,0,0,0,0,0,0,,,,0,False,,,,0,False,,,Jay Bruce singles to right field.,1,0,0,0,Justin_Verlander,Jay_Bruce,0,kF,Foul Ball,contact_foul,contact_foul
665880,e1ee17f8-fb3b-48f5-b152-82121ee43e5a565de4be-d...,3,e1ee17f8-fb3b-48f5-b152-82121ee43e5a565de4be-d...,e1ee17f8-fb3b-48f5-b152-82121ee43e5a,565de4be-dc80-4849-a7e1-54bc79156cc8,2016-08-05 23:29:33+00:00,f246a5e5-afdb-479c-9aaa-c68beeda7af6,575c19b7-4052-41c2-9f0a-1c5813d02f99,grass,Detroit,mrf-350:mlcf-406:cf-420:rcf-365:rf-330:mlf-370...,2,TOP,AT_BAT,1,PITCH,3,aS,Single,Jay,Bruce,58c17caa-c048-4336-b296-fa4eaf039eac,225,75,L,Justin,Verlander,2fe6cb9b-c41e-4ee2-9b22-f8b27bf6c5be,R,FA,94,1,10,3,0,2,0,0,2,0,1,1,1,1,0,0,0,0,0,0,,,,0,False,,,,0,False,,,Jay Bruce singles to right field.,1,0,0,0,Justin_Verlander,Jay_Bruce,0,kF,Foul Ball,contact_in_on_base,contact_foul
666452,fc8d3d2a-8c84-41c6-a150-fce4aaa3333a565de4be-d...,1,fc8d3d2a-8c84-41c6-a150-fce4aaa3333a565de4be-d...,fc8d3d2a-8c84-41c6-a150-fce4aaa3333a,565de4be-dc80-4849-a7e1-54bc79156cc8,2016-08-11 03:05:36+00:00,575c19b7-4052-41c2-9f0a-1c5813d02f99,43a39081-52b4-4f93-ad29-da7f329ea960,grass,Seattle,mrf-353:mlcf-399:cf-405:rcf-387:rf-327:mlf-358...,3,BOT,AT_BAT,5,PITCH,1,kKL,Strike Looking,Adam,Lind,36edf276-6c47-43aa-931c-ca4fd10e1025,195,74,L,Justin,Verlander,2fe6cb9b-c41e-4ee2-9b22-f8b27bf6c5be,R,CU,80,7,59,1,0,0,2,0,1,2,0,0,0,0,0,0,0,0,0,0,,,ad04a64e-057a-4576-81bd-28bf71f3a6ea,2,False,,,,0,False,,,Adam Lind pops out to James McCann.,1,0,0,0,Justin_Verlander,Adam_Lind,0,bFP,First Pitch,strike,first_pitch


In [45]:
# Count pitches for every (previous outcome, current outcome) pair across all
# rows of clean_df; pairs that never occur are filled with 0.
transition_matrix = (
    clean_df.loc[:, ['pitch_id', 'assigned_outcome', 'prev_assigned_outcome']]
    .pivot_table(
        index='prev_assigned_outcome',
        columns='assigned_outcome',
        aggfunc='count',
        fill_value=0,
    )
)

# The pivot's column labels are two-level; keep only the second level, which
# holds the assigned_outcome value.
transition_matrix.columns = transition_matrix.columns.get_level_values(1).tolist()
# Show the counts with prev_assigned_outcome as an ordinary column
# (the result is displayed only, not stored).
transition_matrix.reset_index()

In [51]:
# Flatten the pivot's two-level column labels down to the outcome name.
# Guarded so this cell is idempotent: if the columns have already been
# flattened, indexing [1] would pick the second character of each string
# label instead of the outcome name.
if isinstance(transition_matrix.columns, pd.MultiIndex):
    transition_matrix.columns = [
        label[1] for label in transition_matrix.columns.to_flat_index()
    ]

In [53]:
# Display the transition counts with prev_assigned_outcome moved out of the
# index into a regular column; the result is shown only, not assigned.
transition_matrix.reset_index()

Unnamed: 0,prev_assigned_outcome,ball,contact_foul,contact_in_on_base,contact_in_play_out,on_base_no_contact,strike,walk
0,ball,54680,38796,14303,26328,371,52835,6682
1,contact_foul,33182,24281,8029,15684,423,19780,2288
2,contact_in_on_base,42,21,88,78,0,15,0
3,contact_in_play_out,43,20,88,26,0,30,0
4,first_pitch,59663,16627,6437,11320,299,60346,1
5,on_base_no_contact,3,6,0,0,0,2,1
6,strike,47473,26036,8656,17280,388,30018,1613
