In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import statsapi
import glob
import tqdm
import pybaseball
pd.set_option('display.max_columns', None)

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
#define bo_states (base occupancy + outs, e.g. '1_3-2' = runners on first and third, two outs).
#The order fixes the index of every state and must stay consistent: index 4 should always be '1__-0'.
#Outs vary slowest (0, 1, 2); within each out count third base toggles fastest, then second, then first.
bo_states = [
    f'{first}{second}{third}-{outs}'
    for outs in range(3)
    for first in '_1'
    for second in '_2'
    for third in '_3'
]

In [3]:
#get the game_pks scraped by Caleb/Tristan
#Only match the game_<pk>_decisions.csv files that get_roster_info reads back, so an unrelated csv in the
#folder cannot break the int() parse below.
game_files = glob.glob('../baseball-scraping/games/game_*_decisions.csv')
#rsplit from the right so underscores in the directory part of the path cannot shift the pk field;
#sort because glob returns files in arbitrary order and we want a reproducible processing/output order.
gamepks = sorted(int(game.rsplit('_', 2)[1]) for game in game_files)

In [4]:
#cache of player info keyed by player id, so we save time by not pinging the api again for a player
#we have already looked up
pinged_players = {}

In [5]:
def _team_roster(team_players, team_abbreviation):
    """Build one team's roster frame from its boxscore ``players`` mapping.

    Parameters
    ----------
    team_players : dict
        The ``['liveData']['boxscore']['teams'][side]['players']`` mapping of a statsapi game payload.
    team_abbreviation : str
        Value written to the ``team`` column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``team_players`` with columns ``player_id``, ``player_name``,
        ``batting_order``, ``position`` and ``team``.
    """
    ids = []
    names = []
    batting_orders = []
    positions = []
    for entry in team_players.values():
        player_id = entry['person']['id']
        #only ping the person endpoint for players that are not in the shared cache yet
        if player_id not in pinged_players:
            pinged_players[player_id] = statsapi.get('person', {'personId': player_id})['people'][0]
        player_info = pinged_players[player_id]
        ids.append(player_id)
        names.append(player_info['fullName'])
        positions.append(player_info['primaryPosition']['type'])
        #entries without a 'battingOrder' key get None
        batting_orders.append(entry.get('battingOrder'))
    roster = pd.DataFrame({'player_id': ids, 'player_name': names, 'batting_order': batting_orders, 'position': positions})
    roster['team'] = team_abbreviation
    return roster


def get_roster_info(gamepk: int) -> pd.DataFrame:
    """Return the roster of both teams for one game.

    The scraped decisions file for the game is read only to decide whether the game has any plays;
    the roster itself comes from the stats api game payload, with per-player details served from
    (and added to) the module-level ``pinged_players`` cache.

    Parameters
    ----------
    gamepk : int
        Game id; used both in the csv file name and as the stats api ``gamePk``.

    Returns
    -------
    pandas.DataFrame
        Away rows followed by home rows with columns ``player_id``, ``player_name``,
        ``batting_order``, ``position``, ``team``, ``game_pk``, ``game_date``, ``game_home_team``
        and ``game_away_team``. An empty DataFrame (no columns) is returned when the decisions
        file has no rows, in which case the api is not contacted.

    Raises
    ------
    ValueError
        If the game type reported by the api is ``'S'`` or ``'E'``
        (presumably spring training / exhibition games - TODO confirm).
    """
    #The plays are only needed for the emptiness check. The base/out state columns that used to be
    #derived here were never read afterwards, so they are no longer computed.
    plays = pd.read_csv(f'../baseball-scraping/games/game_{gamepk}_decisions.csv')
    if len(plays) == 0:
        return pd.DataFrame()

    #we need the teams and rosters from the stats api
    sapi_game = statsapi.get('game', {'gamePk': gamepk})
    game_data = sapi_game['gameData']
    if game_data['game']['type'] in ('S', 'E'):
        raise ValueError('Bad Game Type')
    game_date = game_data['datetime']['officialDate']
    away_team = game_data['teams']['away']['abbreviation']
    home_team = game_data['teams']['home']['abbreviation']

    boxscore_teams = sapi_game['liveData']['boxscore']['teams']
    #away first, then home; the per-team indexes are kept as-is (not reset)
    roster_info = pd.concat((
        _team_roster(boxscore_teams['away']['players'], away_team),
        _team_roster(boxscore_teams['home']['players'], home_team),
    ))
    roster_info['game_pk'] = gamepk
    roster_info['game_date'] = game_date
    roster_info['game_home_team'] = home_team
    roster_info['game_away_team'] = away_team
    return roster_info

#Collect the roster of every scraped game into ri and track the largest bench / bullpen seen for any
#team in any game. bench_size and bp_size start at floors of 4 and 9 and only ever grow.
roster_frames = []
bench_size = 4
bp_size = 9
for game in tqdm.tqdm(gamepks):
    roster_info = get_roster_info(game)
    if roster_info.shape[0] == 0:
        continue
    for _, team_roster in roster_info.groupby('team', sort=False):
        #two-way players count on both sides: as pitchers here and as position players below
        n_pitchers = team_roster.loc[team_roster.position.isin(['Two-Way Player', 'Pitcher']), 'player_id'].nunique()
        n_position_players = team_roster.loc[team_roster.position != 'Pitcher', 'player_id'].nunique()
        #presumably the -1 removes the starting pitcher and the -9 the starting lineup - TODO confirm
        bp_size = max(bp_size, n_pitchers - 1)
        bench_size = max(bench_size, n_position_players - 9)
    roster_frames.append(roster_info)
#concatenate once at the end; concatenating inside the loop re-copies the accumulated frame every iteration
ri = pd.concat(roster_frames, ignore_index = True) if roster_frames else pd.DataFrame()

100%|██████████████████████████████████████████████████████████████████████████████| 2425/2425 [31:43<00:00,  1.27it/s]


In [6]:
#save the combined roster table (one row per player per game) to roster_info.csv, without the index column
ri.to_csv('roster_info.csv', index = False)

In [7]:
#show the largest bench and bullpen sizes found while looping over the games
bench_size, bp_size

(6, 15)