In [4]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import glob
import tqdm
#show every column when a wide DataFrame is displayed (no truncation)
pd.options.display.max_columns = None

In [5]:
#define bo_states. The order of this list fixes the base/out state index, so it must stay
#consistent everywhere: e.g. index 4 should always be '1__-0'.
#Each label is <first><second><third>-<outs>, with '_' for an empty base. Outs vary slowest,
#then first base, then second base, then third base.
bo_states = [
    f'{first}{second}{third}-{outs}'
    for outs in range(3)
    for first in ('_', '1')
    for second in ('_', '2')
    for third in ('_', '3')
]

In [6]:
#get the game_pks scraped by Caleb/Tristan
#Only the per-game decisions files are matched, because those are the files that
#define_start_nodes reads back (game_<gamepk>_decisions.csv).
game_files = glob.glob('../baseball-scraping/games/game_*_decisions.csv')
#Parse the pk from the end of the file name (rsplit) so an underscore anywhere in the
#directory path cannot shift the field, and sort so the processing order (and therefore
#the row order of the output) is the same on every run.
gamepks = sorted(int(game.rsplit('_', 2)[1]) for game in game_files)

#get the roster info 
roster_info = pd.read_csv('roster_info.csv')

In [10]:
#Slot counts for the state vector. The first 9 bullpen / 4 bench columns keep the numbering
#of the original state vector; the remaining slots are the expanded-roster columns that get
#appended at the end (max bullpen size seen in roster_info was 15, max bench size was 6).
_CORE_BULLPEN_SLOTS = 9
_CORE_BENCH_SLOTS = 4
_MAX_BULLPEN_SLOTS = 15
_MAX_BENCH_SLOTS = 6
#value written into the batting-order slot that belongs to the DH
_DH_MARKER = 10


def _starting_roster(plays, team_roster, side):
    """Split one team's roster into its roles at the first at bat of the game.

    Parameters
    ----------
    plays : pandas.DataFrame
        Decisions rows for one game, already sorted by ``At_Bat``. Only the first
        row is read.
    team_roster : pandas.DataFrame
        Rows of ``roster_info`` for this team in this game; the ``position`` and
        ``player_id`` columns are used.
    side : str
        Column prefix used in ``plays``: ``'Home'`` or ``'Away'``.

    Returns
    -------
    tuple
        ``(batting_order, dh, pitcher, bullpen, bench)``. ``batting_order`` is a
        9-item list in which the DH's slot has been replaced by ``10``; ``bullpen``
        holds the roster pitchers other than the starter; ``bench`` holds the
        non-pitchers that are not in the lineup.

    Raises
    ------
    ValueError
        If the DH value is missing and the starting pitcher is not in the lineup,
        or if the DH cannot be found in the lineup.
    """
    lineup_cols = [f'{side}_Lineup_{slot}' for slot in range(1, 10)]
    batting_order = list(plays[lineup_cols].values[0])
    pitcher = plays[f'{side}_Pitcher'].values[0]
    dh = plays[f'{side}_DH'].values[0]
    if pd.isna(dh):
        #either we have a pitcher as DH (Ohtani), or we have lost the DH already somehow
        if pitcher in batting_order:
            dh = pitcher
        else:
            raise ValueError('No DH somehow')

    #I set the DH spot of the lineup to 10
    batting_order[batting_order.index(dh)] = _DH_MARKER

    #bench = position players on the roster who are not starting
    starters = set(batting_order) | {dh}
    pos_players = team_roster.loc[team_roster.position != 'Pitcher']
    bench = [pid for pid in pos_players.player_id.unique() if pid not in starters]

    #bullpen = every rostered pitcher except the starter
    in_bullpen = (team_roster.position == 'Pitcher') & (team_roster.player_id != pitcher)
    bullpen = team_roster.loc[in_bullpen].player_id.unique()
    return batting_order, dh, pitcher, bullpen, bench


def _fill_slots(state_df, prefix, players, slots):
    """Set column ``{prefix}_{i}`` to ``players[i]`` for each i in ``slots``, or -1 if there is no such player."""
    for i in slots:
        state_df[f'{prefix}_{i}'] = players[i] if i < len(players) else -1


def _write_core_team_columns(state_df, prefix, team):
    """Append one team's original-format columns (lineup, DH, pitcher, 9 bullpen, 4 bench) to ``state_df``."""
    batting_order, dh, pitcher, bullpen, bench = team
    state_df[[f'{prefix}_batting_order_{i}' for i in range(9)]] = batting_order
    state_df[f'{prefix}_dh'] = dh
    state_df[f'{prefix}_lost_dh'] = False
    state_df[f'{prefix}_pitcher'] = pitcher
    _fill_slots(state_df, f'{prefix}_bullpen', bullpen, range(_CORE_BULLPEN_SLOTS))
    _fill_slots(state_df, f'{prefix}_bench', bench, range(_CORE_BENCH_SLOTS))


def _write_expanded_team_columns(state_df, prefix, team):
    """Append one team's expanded-roster columns (bullpen 9-14, bench 4-5) to ``state_df``."""
    bullpen, bench = team[3], team[4]
    _fill_slots(state_df, f'{prefix}_bullpen', bullpen, range(_CORE_BULLPEN_SLOTS, _MAX_BULLPEN_SLOTS))
    _fill_slots(state_df, f'{prefix}_bench', bench, range(_CORE_BENCH_SLOTS, _MAX_BENCH_SLOTS))


def define_start_nodes(gamepk):
    """Build the three opening nodes (batter, pitcher, chance) for one game.

    Reads ``../baseball-scraping/games/game_{gamepk}_decisions.csv`` and the
    module-level ``roster_info`` frame, and lays the starting lineups, pitchers,
    bullpens and benches out as a single state row. That row is emitted three
    times: as a ``'batter'`` node and a ``'pitcher'`` node (both with
    ``action_taken == 'play at bat'``) and as a ``'chance'`` node whose
    ``action_taken`` is the base/out state string of the second row of the game.

    Parameters
    ----------
    gamepk : int
        Game identifier; used in the file name and matched against
        ``roster_info.game_pk``.

    Returns
    -------
    pandas.DataFrame
        Three rows with a fresh 0..2 index, or an empty frame when the decisions
        file has no rows. Unused bullpen/bench slots hold -1; bullpen arms beyond
        15 and bench players beyond 6 are not written.

    Raises
    ------
    ValueError
        If a team has no usable DH (see ``_starting_roster``).
    IndexError
        If the game has only one row (no second base/out state to record) or
        ``roster_info`` has no rows for ``gamepk``.
    """
    plays = pd.read_csv(f'../baseball-scraping/games/game_{gamepk}_decisions.csv').sort_values(by='At_Bat')
    if plays.shape[0] == 0:
        return pd.DataFrame()

    #define the baserunner out state in the format I use: '_' for an empty base, the base
    #number for an occupied one, then '-' and the number of outs
    first = plays.First_Base.notna().map({True: '1', False: '_'})
    second = plays.Second_Base.notna().map({True: '2', False: '_'})
    third = plays.Third_Base.notna().map({True: '3', False: '_'})
    bo_state = first + second + third + '-' + plays.Outs.astype(str)

    game_ri = roster_info.loc[roster_info.game_pk == gamepk]
    home_roster = game_ri.loc[game_ri.team == game_ri.game_home_team]
    away_roster = game_ri.loc[game_ri.team == game_ri.game_away_team]
    home = _starting_roster(plays, home_roster, 'Home')
    away = _starting_roster(plays, away_roster, 'Away')

    #define the initial state vector
    ## NOTE: Originally I cut off my benches at four players and my bullpens at 9 pitchers. This time
    #around I want to include expanded rosters. The numbering of the state dict depends on four bench
    #players and 9 bullpen pitchers though, so the first four bench players and 9 bullpen pitchers
    #define the same state vector as before, and any additional pitchers and bench players are
    #appended to the end of the vector.

    #this is kinda inefficient, but again I'm trying to match formatting.
    #.copy() makes state_df its own frame, so the column assignments below do not write into
    #a slice of roster_info (which is what raises SettingWithCopyWarning).
    state_df = game_ri[['game_date', 'game_pk']].iloc[[0]].copy()
    state_df['batter_per_game'] = plays.At_Bat.values[0]
    state_df['home_team_name'] = game_ri.game_home_team.values[0]
    state_df['away_team_name'] = game_ri.game_away_team.values[0]
    #1 when the Score_Deficit of the last row is positive, otherwise 0
    state_df['home_team_won'] = 1 if plays.Score_Deficit.values[-1] > 0 else 0

    #note: the mixed bool/int list becomes an integer array, so is_top_of_inning is stored as 1
    state_df[['is_top_of_inning' , 'inning', 'bo_state_index', 'home_score_diff', 'home_batter_index', 'away_batter_index']] = np.array([[True, 1, 0, 0, 0, 0]])

    #column order matters here: home core, away core, flags, then the expanded-roster columns
    _write_core_team_columns(state_df, 'home', home)
    _write_core_team_columns(state_df, 'away', away)

    state_df['home_pitcher_batters_faced'] = 0
    state_df['home_pitcher_can_be_subbed'] = False
    state_df['away_pitcher_batters_faced'] = 0
    state_df['away_pitcher_can_be_subbed'] = False
    state_df['node_type'] = 'batter'

    #now here is where we add in the expanded rosters
    _write_expanded_team_columns(state_df, 'home', home)
    _write_expanded_team_columns(state_df, 'away', away)

    state_df['terminal_value'] = np.nan
    #we're assuming no pinch hits, relievers, or intentional walks to start the game, so we get three nodes
    #batter play at bat, pitcher play at bat, then chance
    state_df['action_taken'] = 'play at bat'
    state_df2 = state_df.copy()
    state_df2['node_type'] = 'pitcher'
    state_df3 = state_df2.copy()
    state_df3['node_type'] = 'chance'
    #the chance node records the base/out state of the second row of the game
    state_df3['action_taken'] = bo_state.values[1]
    return pd.concat((state_df, state_df2, state_df3), ignore_index = True)

In [11]:
#Collect the per-game frames in a list and concatenate once at the end. Calling pd.concat
#inside the loop re-copies every previously accumulated row on each iteration.
game_start_frames = []
for gamepk in tqdm.tqdm(gamepks):
    game_start = define_start_nodes(gamepk)
    #games with no plays come back as an empty frame; they contribute no rows
    if not game_start.empty:
        game_start_frames.append(game_start)

#pd.concat raises on an empty list, so fall back to an empty frame when nothing was built
if game_start_frames:
    start_nodes = pd.concat(game_start_frames, ignore_index = True)
else:
    start_nodes = pd.DataFrame()

100%|██████████████████████████████████████████████████████████████████████████████| 2425/2425 [01:42<00:00, 23.76it/s]


In [13]:
#write the start nodes for every game to disk, without the row index
start_nodes.to_csv(path_or_buf='start_nodes.csv', index=False)