In [1]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
import os
import time
import unidecode
from selenium.webdriver.common.by import By

# Basic functions

In [27]:
def start_driver():
    geckodriver = "C:\Program Files (x86)\geckodriver"
    os.environ["webdriver.gecko.driver"] = geckodriver
    driver = webdriver.Firefox(executable_path = geckodriver)
    return driver

def load_team_mapping():
    os.chdir('T:\Baseball')
    df = pd.read_csv('teams_mapping.csv', index_col=False)
    return df

def get_roster_url(team):
    team_new = team.lower().replace(' ', '-')
    roster_url = 'https://www.fangraphs.com/roster-resource/depth-charts/' + team_new
    return roster_url

def get_roster_source(roster_url,driver):

    driver.set_page_load_timeout(5)
    try :
        driver.get(roster_url)
    except TimeoutException as e:
        # try again
        driver.get(roster_url)
    time.sleep(5)
    source = driver.page_source
    soup = BeautifulSoup(source, 'html.parser')
    table = soup.find("div", {"class":"depth-chart-wrapper is-spring-training"})

    return table

def get_player_url(roster_table):
    player = []
    player_url = []
    
    dcrt = roster_table.find_all('div',{"class":"depth-charts__roster-tables"})
    
    # first seven tables are the 40 man roster
    for i in range(7):        
        for row2 in dcrt[i].tbody.find_all('td',{"data-stat":"PLAYER"}):
                    
            #player list
            player_tmp = row2.text
            player.append(player_tmp)
                    
            #player url list
            player_url_tmp = row2.find('a')['href']
            size = len(player_url_tmp)
            player_url_tmp = 'https:' + player_url_tmp
            player_url.append(player_url_tmp)
                    
    return player, player_url

def get_df_all_players(teams, drive):
    # Get all players
    player_all = []
    player_url_all = []
    team_all = []
    team_abbr = []
    df_team_abbr = load_team_mapping()
    for team in teams:
        team_abbr_tmp = df_team_abbr[df_team_abbr['Teams'] == team]['Teams_abbr'].to_string()[5:]
        roster_url = get_roster_url(team)
        roster_table = get_roster_source(roster_url, driver)
    
        player, player_url = get_player_url(roster_table)
        for p in player:
            player_all.append(p)
            team_all.append(team)
            team_abbr.append(team_abbr_tmp)
        for u in player_url:
            player_url_all.append(u)
            
    df_player_all = pd.DataFrame({'Player':player_all, 'Player_url': player_url_all, 'Team':team_all, 'Team_abbr': team_abbr})
        
    return df_player_all

def get_player_game_log_url(df_player_all, dirver):
    player_game_log_url_all = []
    
    for i in range(len(df_player_all)):
        base_url = df_player_all['Player_url'][i]
        
        driver.set_page_load_timeout(7)
        try :
            driver.get(base_url)
        except TimeoutException as e:
            # try again
            driver.get(base_url)
            try :
                driver.get(base_url)
            except TimeoutException as e:
                # try again
                driver.get(base_url)
        base_url = driver.current_url
        
        stat_ind = base_url.index('stats?')
        before_stat_str = base_url[:stat_ind]
        pos_ind = base_url.index('position')
        after_posi_str = base_url[pos_ind:]
        
        player_game_log_url_all_tmp = before_stat_str + 'game-log?type=1&gds=2020-03-01&gde=2022-10-31&season=&' + after_posi_str
        player_game_log_url_all.append(player_game_log_url_all_tmp)
    return player_game_log_url_all

def batting(url, season, driver):  
    
    batting_url = url + 'splits?season=' + str(season)
    
    driver.set_page_load_timeout(4)
    try :
        driver.get(batting_url)
    except TimeoutException as e:
        try :
            driver.get(batting_url)
        except TimeoutException as e:
            # try again
            driver.get(batting_url)
     
    source = driver.page_source
    soup = BeautifulSoup(source, 'html.parser')
    
    #In case of rediction to pitch 
    if 'position=P' in driver.current_url:
        real_batting_url = soup.find("div", {"id":"portal-player-pages-menu"}).find("ul",
                        {"class":"menu-player-page__batpitch"}).find_all("a")[0]['href']
        real_batting_url = 'https://www.fangraphs.com/' + real_batting_url + '&season=' + str(season)
        driver.get(real_batting_url)
        source = driver.page_source
        soup = BeautifulSoup(source, 'html.parser')
    table_advanced = soup.find("div", {"id":'advanced',"class":"player-page-table"})
    table_battedball = soup.find("div", {"id":'batted-ball',"class":"player-page-table"})
    
    if (table_advanced is None) or (table_battedball is None):
        print(url,'table_advanced is None')
        # If encountering error page, use Derek Lowe as a dummy pitcher 
        dummy_batting_url = 'https://www.fangraphs.com/players/derek-lowe/199/splits?position=PB&season=' + str(season) 
        driver.get(dummy_batting_url)
        source = driver.page_source
        soup = BeautifulSoup(source, 'html.parser')    
        table_advanced = soup.find("div", {"id":'advanced',"class":"player-page-table"})
        table_battedball = soup.find("div", {"id":'batted-ball',"class":"player-page-table"})
        
    #driver.quit()
    return table_advanced, table_battedball


## Define Functions - Batter Game Logs

In [28]:
#stat_tabs = soup.find('ul',{"class":"menu-mega__game-log__type type-mlb"}).find_all("li",{"class":"menu-mega__menu-item"})

#stat_tabs[1].click()

def get_game_log_stats_standard(list_player_game_log_url, directory, driver):
    
    os.chdir(directory)
    player_names = []
    count_rows = []
    for i in range(len(list_player_game_log_url)):
        player_game_log_url = list_player_game_log_url[i]
        if ('position=P' in player_game_log_url):
            continue
        
        
        df_standard = pd.DataFrame(columns=['Player','Date', 'Team', 'Opp', 'BO', 'Pos', 'G', 'AB',
        'PA', 'H', '1B', '2B', '3B', 'HR',  'R', 'RBI', 'BB', 'IBB', 'SO', 'HBP', 'SF', 'SH', 'GDP', 'SB', 'CS', 'AVG'])

            
        driver.set_page_load_timeout(4)
        try :
            driver.get(player_game_log_url)
        except TimeoutException as e:
            try :
                driver.get(player_game_log_url)
            except TimeoutException as e:
                # try again
                driver.get(player_game_log_url)
        time.sleep(3)
        source = driver.page_source
        soup = BeautifulSoup(source, 'html.parser')
        player_name = soup.find('div', {'class':'player-info-box-name'}).find('h1').text.strip()
        print(player_name)
        table_standard = soup.find("div", {"class":"fg-data-grid undefined with-selected-rows"})
        
        count = 0
        for row in table_standard.tbody.find_all('tr'):
            count += 1
            columns = row.find_all('td')
            if (columns == []):
                continue
            #if (columns[0].text.strip() == 'Date'):
            #    continue
            date = columns[0].text.strip()
            team = columns[1].text.strip()
            opp = columns[2].text.strip()
            bo = columns[3].text.strip()
            pos = columns[4].text.strip()
            g = columns[6].text.strip()
            ab = columns[7].text.strip()
            pa = columns[8].text.strip()
            
            h = columns[9].text.strip()
            b1 = columns[10].text.strip()
            b2 = columns[11].text.strip()
            b3 = columns[12].text.strip()
            hr = columns[13].text.strip()
            r = columns[14].text.strip()
            rbi = columns[15].text.strip()
            bb = columns[16].text.strip()
            
            ibb = columns[17].text.strip()
            so = columns[18].text.strip()
            hbp = columns[19].text.strip()
            sf = columns[20].text.strip()
            sh = columns[21].text.strip()
            gdp = columns[22].text.strip()
            sb = columns[23].text.strip()
            cs = columns[24].text.strip()
            avg = columns[25].text.strip()
    
            df_standard = df_standard.append({'Player':player_name, 'Date':date, 'Team':team, 'Opp':opp, 
                                              'BO':bo, 'Pos':pos, 'G':g, 'AB':ab,
                                                'PA':pa, 'H':h, '1B':b1, '2B':b2, '3B':b3, 'HR':hr,  
                                              'R':r, 'RBI':rbi, 'BB':bb, 'IBB':ibb, 
                                              'SO':so, 'HBP':hbp, 'SF':sf, 'SH':sh, 'GDP':gdp, 'SB':sb, 
                                              'CS':cs, 'AVG':avg}, ignore_index=True)
        print(count)
        player_names.append(player_name)
        count_rows.append(count)
        df_count_rows = pd.DataFrame({'Player':player_names,'Count_rows':count_rows })
       
        player_name = player_name.replace(" ", "_")
        file_name = player_name + '_standard.csv'
        df_standard.to_csv(file_name, index = False)
    return df_count_rows


def get_game_log_stats_advanced(list_player_game_log_url, directory, driver):
    
    os.chdir(directory)
    for i in range(len(list_player_game_log_url)):
        list_player_game_log_url[i] = list_player_game_log_url[i].replace('type=1','type=2')

    player_names = []
    count_rows = []
    for i in range(len(list_player_game_log_url)):
        player_game_log_url = list_player_game_log_url[i]
        if ('position=P' in player_game_log_url):
            continue
    
        df_advanced = pd.DataFrame(columns=['Player','Date', 'Team', 'Opp', 'BO', 'Pos', 'BB%', 'K%', 'BB/K', 'AVG', 'OBP', 'SLG', 'OPS', 'ISO', 'Spd', 'BABIP',
        'wSB', 'wRC', 'wRAA', 'wOBA', 'wRC+'])
    
        
        driver.set_page_load_timeout(4)
        try :
            driver.get(player_game_log_url)
        except TimeoutException as e:
            try :
                driver.get(player_game_log_url)
            except TimeoutException as e:
                # try again
                driver.get(player_game_log_url)
        time.sleep(3)
        source = driver.page_source
        soup = BeautifulSoup(source, 'html.parser')
        player_name = soup.find('div', {'class':'player-info-box-name'}).find('h1').text.strip()
        print(player_name)
        table_standard = soup.find("div", {"class":"fg-data-grid undefined with-selected-rows"})
        
        count = 0
        for row in table_standard.tbody.find_all('tr'):
            count += 1
            columns = row.find_all('td')
            if (columns == []):
                continue
            #if (columns[0].text.strip() == 'Date'):
            #    continue
            date = columns[0].text.strip()
            team = columns[1].text.strip()
            opp = columns[2].text.strip()
            bo = columns[3].text.strip()
            pos = columns[4].text.strip()
            bb = columns[6].text.strip()
            k = columns[7].text.strip()
            bbk = columns[8].text.strip()
            
            avg = columns[10].text.strip()
            obp = columns[11].text.strip()
            slg = columns[12].text.strip()
            ops = columns[13].text.strip()
            
            iso = columns[15].text.strip()
            spd = columns[16].text.strip()
            babip = columns[17].text.strip()
            
            wsb = columns[19].text.strip()
            wrc = columns[21].text.strip()
            wraa = columns[22].text.strip()
            woba = columns[23].text.strip()
            wrc_plus = columns[24].text.strip()
            
    
            df_advanced = df_advanced.append({'Player':player_name, 'Date':date, 'Team':team, 'Opp':opp, 'BO':bo, 
                                              'Pos':pos, 'BB%':bb, 'K%':k, 'BB/K':bbk, 'AVG':avg, 'OBP':obp, 'SLG':slg, 
                                              'OPS':ops, 'ISO':iso, 'Spd':spd, 'BABIP':babip,
                                              'wSB':wsb, 'wRC':wrc, 'wRAA':wraa, 'wOBA':woba, 'wRC+':wrc_plus}, ignore_index=True)
        print(count)
        player_names.append(player_name)
        count_rows.append(count)
        df_count_rows = pd.DataFrame({'Player':player_names,'Count_rows':count_rows })
       
        player_name = player_name.replace(" ", "_")
        file_name = player_name + '_batter_advanced.csv'
        df_advanced.to_csv(file_name, index = False)
    return df_count_rows



def get_game_log_stats_statcast(list_player_game_log_url, directory, driver):
    
    os.chdir(directory)
    for i in range(len(list_player_game_log_url)):
        list_player_game_log_url[i] = list_player_game_log_url[i].replace('type=1','type=24')

    player_names = []
    count_rows = []
    for i in range(len(list_player_game_log_url)):
        player_game_log_url = list_player_game_log_url[i]
        if ('position=P' in player_game_log_url):
            continue
    
        df_statcast = pd.DataFrame(columns=['Player','Date', 'Team', 'Opp', 'BO', 'Pos', 'Events', 'EV', 'maxEV', 'LA', 'Barrels', 
                                        'Barrel%', 'HardHit', 'HardHit%']) 
    
        
        driver.set_page_load_timeout(4)
        try :
            driver.get(player_game_log_url)
        except TimeoutException as e:
            try :
                driver.get(player_game_log_url)
            except TimeoutException as e:
                # try again
                driver.get(player_game_log_url)
        time.sleep(3)
        source = driver.page_source
        soup = BeautifulSoup(source, 'html.parser')
        player_name = soup.find('div', {'class':'player-info-box-name'}).find('h1').text.strip()
        print(player_name)
        table_standard = soup.find("div", {"class":"fg-data-grid undefined with-selected-rows"})
        
        count = 0
        if table_standard is None:
            continue
        for row in table_standard.tbody.find_all('tr'):
            count += 1
            columns = row.find_all('td')
            if (columns == []):
                continue
            #if (columns[0].text.strip() == 'Date'):
            #    continue
            date = columns[0].text.strip()
            team = columns[1].text.strip()
            opp = columns[2].text.strip()
            bo = columns[3].text.strip()
            pos = columns[4].text.strip()
            events = columns[6].text.strip()
            ev = columns[7].text.strip()
            maxev = columns[8].text.strip() 
            la = columns[9].text.strip()
            barrels = columns[10].text.strip()
            barrels_per = columns[11].text.strip()
            hardhit = columns[12].text.strip()
            hardhit_per = columns[13].text.strip()

            
    
            df_statcast = df_statcast.append({'Player':player_name,'Date':date, 'Team':team, 'Opp':opp, 'BO':bo, 'Pos':pos, 'Events':events, 
                                              'EV':ev, 'maxEV':maxev, 'LA':la, 'Barrels':barrels, 
                                        'Barrel%':barrels_per, 'HardHit':hardhit, 'HardHit%':hardhit_per}, ignore_index=True)
        print(count)
        player_names.append(player_name)
        count_rows.append(count)
        df_count_rows = pd.DataFrame({'Player':player_names,'Count_rows':count_rows })
       
        player_name = player_name.replace(" ", "_")
        file_name = player_name + '_batter_statcast.csv'
        df_statcast.to_csv(file_name, index = False)
    return df_count_rows
    


## Define Functions - Pitcher Game Logs

In [29]:
def get_game_log_stats_standard_p(list_player_game_log_url, directory, driver):
    
    os.chdir(directory)
    player_names = []
    count_rows = []
    for i in range(len(list_player_game_log_url)):
        player_game_log_url = list_player_game_log_url[i]
        if ('position=P' not in player_game_log_url):
            continue
        
        
        df_standard = pd.DataFrame(columns=['Player', 'Date','Team','Opp','GS','W','L','ERA','G',
                                            'GS2', 'CG', 'ShO', 'SV', 'HLD', 'BS', 'IP', 'TBF', 'H', 'R', 'ER', 'HR', 
                                            'BB', 'IBB', 'HBP', 'WP', 'BK', 'SO', 'GSv2'])
            
        driver.set_page_load_timeout(4)
        try :
            driver.get(player_game_log_url)
        except TimeoutException as e:
            try :
                driver.get(player_game_log_url)
            except TimeoutException as e:
                # try again
                driver.get(player_game_log_url)
        time.sleep(5)
        source = driver.page_source
        soup = BeautifulSoup(source, 'html.parser')
        player_name = soup.find('div', {'class':'player-info-box-name'}).find('h1').text.strip()
        
        table_standard = soup.find("div", {"class":"fg-data-grid undefined with-selected-rows"})
        
        if table_standard is None:
            continue
        
        count = 0
        for row in table_standard.tbody.find_all('tr'):
            count += 1
            columns = row.find_all('td')
            if (columns == []):
                continue
            #if (columns[0].text.strip() == 'Date'):
            #    continue
            date = columns[0].text.strip()
            team = columns[1].text.strip()
            opp = columns[2].text.strip()
            gs = columns[3].text.strip()
            
            w = columns[5].text.strip()
            
            l = columns[6].text.strip()
            era = columns[7].text.strip()
            g = columns[8].text.strip()
            gs2 = columns[9].text.strip()
            cg = columns[10].text.strip()
            sho = columns[11].text.strip()
            sv = columns[12].text.strip()
            hld = columns[13].text.strip()
            bs = columns[14].text.strip()
            ip = columns[15].text.strip()
            tbf = columns[16].text.strip()
            h = columns[17].text.strip()
            r = columns[18].text.strip()
            er = columns[19].text.strip()
            hr = columns[20].text.strip()
            bb = columns[21].text.strip()
            ibb = columns[22].text.strip()
            hbp = columns[23].text.strip()
            wp = columns[24].text.strip()
            bk = columns[25].text.strip()
            so = columns[26].text.strip()
            
            gsv2 = columns[28].text.strip()
            
            df_standard = df_standard.append({'Player':player_name, 'Date':date, 'Team':team, 
                                              'Opp': opp,'GS':gs,'W':w,'L':l,'ERA':era,'G':g,
                                            'GS2':gs2, 'CG':cg, 'ShO':sho, 'SV':sv, 'HLD':hld, 
                                              'BS':bs, 'IP':ip, 'TBF':tbf, 'H':h, 'R':r, 'ER':er, 'HR':hr, 
                                            'BB':bb, 'IBB':ibb, 'HBP':hbp, 'WP':wp, 'BK':bk, 'SO':so, 'GSv2':gsv2}, ignore_index=True)
        
        player_names.append(player_name)
        count_rows.append(count)
        df_count_rows = pd.DataFrame({'Player':player_names,'Count_rows':count_rows })
       
        player_name = player_name.replace(" ", "_")
        file_name = player_name + '_pitcher_standard.csv'
        df_standard.to_csv(file_name, index = False)
        print(player_name)
        print("Games:", count)
    return df_count_rows

def get_game_log_stats_advanced_p(list_player_game_log_url, directory, driver):
    
    os.chdir(directory)
    for i in range(len(list_player_game_log_url)):
        list_player_game_log_url[i] = list_player_game_log_url[i].replace('type=1','type=2')

    player_names = []
    count_rows = []
    for i in range(len(list_player_game_log_url)):
        player_game_log_url = list_player_game_log_url[i]
        if ('position=P' not in player_game_log_url):
            continue
    
        df_advanced = pd.DataFrame(columns=['Player','Date', 'Team', 'Opp', 'GS', 'K/9', 'BB/9', 'K/BB', 'HR/9', 'K%', 'BB%', 'K-BB%', 'AVG', 
    'WHIP', 'BABIP', 'LOB%', 'ERA-', 'FIP-', 'FIP'])

        
        driver.set_page_load_timeout(4)
        try :
            driver.get(player_game_log_url)
        except TimeoutException as e:
            try :
                driver.get(player_game_log_url)
            except TimeoutException as e:
                # try again
                driver.get(player_game_log_url)
        time.sleep(5)
        source = driver.page_source
        soup = BeautifulSoup(source, 'html.parser')
        player_name = soup.find('div', {'class':'player-info-box-name'}).find('h1').text.strip()
        table_standard = soup.find("div", {"class":"fg-data-grid undefined with-selected-rows"})

        if table_standard is None:
            continue        
        
        count = 0
        for row in table_standard.tbody.find_all('tr'):
            count += 1
            columns = row.find_all('td')
            if (columns == []):
                continue
            #if (columns[0].text.strip() == 'Date'):
            #    continue
            date = columns[0].text.strip()
            team = columns[1].text.strip()
            opp = columns[2].text.strip()
            gs = columns[3].text.strip()
            
            k9 = columns[5].text.strip()
            bb9 = columns[6].text.strip()
            kbb = columns[7].text.strip()
            hr9 = columns[8].text.strip()
            
            k_per = columns[10].text.strip()
            bb_per = columns[11].text.strip()
            kbb_per = columns[12].text.strip()
            
            avg = columns[14].text.strip()
            whip = columns[15].text.strip()
            babip = columns[16].text.strip()
            lob_per = columns[17].text.strip()
            
            era_minus = columns[19].text.strip()
            fip_minus = columns[20].text.strip()
            
            fip = columns[22].text.strip()
            
    
            df_advanced = df_advanced.append({'Player':player_name, 'Date':date, 'Team':team, 'Opp':opp, 'GS': gs, 
                                              'K/9':k9, 'BB/9':bb9, 'K/BB':kbb, 'HR/9':hr9, 'K%':k_per, 
                                              'BB%':bb_per, 'K-BB%':kbb_per, 'AVG':avg, 'WHIP':whip, 'BABIP':babip, 
                                              'LOB%':lob_per, 'ERA-':era_minus, 'FIP-':fip_minus, 'FIP':fip}, ignore_index=True)
        player_names.append(player_name)
        count_rows.append(count)
        df_count_rows = pd.DataFrame({'Player':player_names,'Count_rows':count_rows })
       
        player_name = player_name.replace(" ", "_")
        file_name = player_name + '_pitcher_advanced.csv'
        df_advanced.to_csv(file_name, index = False)
        print(player_name)
        print("Games:", count)
    return df_count_rows

def get_game_log_stats_statcast_p(list_player_game_log_url, directory, driver):
    
    os.chdir(directory)
    for i in range(len(list_player_game_log_url)):
        list_player_game_log_url[i] = list_player_game_log_url[i].replace('type=1','type=24')

    player_names = []
    count_rows = []
    for i in range(len(list_player_game_log_url)):
        player_game_log_url = list_player_game_log_url[i]
        if ('position=P' not in player_game_log_url):
            continue
    
        df_statcast = pd.DataFrame(columns=['Player','Date', 'Team', 'Opp', 'BO','GS', 'Pos', 'Events', 'EV', 'maxEV', 'LA', 'Barrels', 
                                        'Barrel%', 'HardHit', 'HardHit%']) 
              
        
        driver.set_page_load_timeout(4)
        try :
            driver.get(player_game_log_url)
        except TimeoutException as e:
            try :
                driver.get(player_game_log_url)
            except TimeoutException as e:
                # try again
                driver.get(player_game_log_url)
        time.sleep(5)
        source = driver.page_source
        soup = BeautifulSoup(source, 'html.parser')
        player_name = soup.find('div', {'class':'player-info-box-name'}).find('h1').text.strip()
        table_standard = soup.find("div", {"class":"fg-data-grid undefined with-selected-rows"})
        
        if table_standard is None:
            continue
        
        count = 0
        for row in table_standard.tbody.find_all('tr'):
            count += 1
            columns = row.find_all('td')
            if (columns == []):
                continue
            #if (columns[0].text.strip() == 'Date'):
            #    continue
            date = columns[0].text.strip()
            team = columns[1].text.strip()
            opp = columns[2].text.strip()
            gs = columns[3].text.strip()
            
            events = columns[5].text.strip()
            ev = columns[6].text.strip()
            maxev = columns[7].text.strip()
            la = columns[8].text.strip() 
            barrels = columns[9].text.strip()
            barrels_per = columns[10].text.strip()
            hardhit = columns[11].text.strip()
            hardhit_per = columns[12].text.strip()
            df_statcast = df_statcast.append({'Player':player_name,'Date':date, 'Team':team, 'Opp':opp, 'BO':bo, 
                                              'GS':gs, 'Events':events, 'EV':ev, 'maxEV':maxev, 'LA':la, 'Barrels':barrels, 
                                        'Barrel%':barrels_per, 'HardHit':hardhit, 'HardHit%':hardhit_per}, ignore_index=True)
        player_names.append(player_name)
        count_rows.append(count)
        df_count_rows = pd.DataFrame({'Player':player_names,'Count_rows':count_rows })
       
        player_name = player_name.replace(" ", "_")
        file_name = player_name + '_pitcher_statcast.csv'
        df_statcast.to_csv(file_name, index = False)
        print(player_name)
        print("Games:", count)
    return df_count_rows
    


# Update team roster

<p>
Update local directory regularly

In [31]:
os.chdir("BaseballGit\\Data")

In [32]:
os.getcwd()

'T:\\Baseball\\BaseballGit\\Data'

In [22]:
teams = ['orioles','red sox','white sox', 'guardians', 'tigers', 'astros', 'royals', 'angels', 'twins', 'yankees',
            'athletics', 'mariners' , 'rays', 'rangers', 'blue jays', 'diamondbacks', 'braves','cubs','reds',
             'rockies','dodgers', 'marlins', 'brewers', 'mets', 'phillies', 'pirates', 'padres', 'giants', 'cardinals', 
             'nationals']
driver = start_driver()
df_player_all = get_df_all_players(teams, driver)
# Save file
df_player_all.to_csv("Gamelogs\\TeamRoster\\df_player_all.csv", index = False)
driver.quit()

FileNotFoundError: [Errno 2] No such file or directory: 'Gamelogs\\TeamRoster\\df_player_all.csv'

In [7]:
# Load team roster
df_player_all = pd.read_csv("Gamelogs\\TeamRoster\\df_player_all.csv")

# Update game log urls

<p>
Update local directory regularly

In [8]:
driver = start_driver()
game_log_url_all = get_player_game_log_url(df_player_all, driver)
driver.quit()
df_player_game_log_url = pd.DataFrame()
df_player_game_log_url['playerUrl'] = game_log_url_all
df_player_game_log_url.to_csv("Gamelogs\\GameLogUrls\\df_player_game_log_url.csv", index = False)

Unnamed: 0,Player,Player_url,Team,Team_abbr
0,Cedric Mullins,https://www.fangraphs.com/players/cedric-mulli...,orioles,BAL
1,Trey Mancini,https://www.fangraphs.com/players/trey-mancini...,orioles,BAL
2,Anthony Santander,https://www.fangraphs.com/players/anthony-sant...,orioles,BAL
3,Ryan Mountcastle,https://www.fangraphs.com/players/ryan-mountca...,orioles,BAL
4,Austin Hays,https://www.fangraphs.com/players/austin-hays/...,orioles,BAL
5,Adley Rutschman,https://www.fangraphs.com/players/adley-rutsch...,orioles,BAL
6,Ramón Urías,https://www.fangraphs.com/players/ramon-urias/...,orioles,BAL
7,Rougned Odor,https://www.fangraphs.com/players/rougned-odor...,orioles,BAL
8,Jorge Mateo,https://www.fangraphs.com/players/jorge-mateo/...,orioles,BAL
9,Robinson Chirinos,https://www.fangraphs.com/players/robinson-chi...,orioles,BAL


# Scrape game logs by category

In [None]:
list_player_game_log_url = pd.read_csv("GameLogUrls\\df_player_game_log_url.csv")['player_game_log_url'].tolist()

<p>
1. Update batter standard data game logs

In [12]:
driver = start_driver()
directory = 'Gamelogs\\game_logs\\Batter\\2020_2022\\Standard'
df_count_rows = get_game_log_stats_standard(list_player_game_log_url, directory, driver)
driver.quit()

NameError: name 'get_game_log_stats_standard' is not defined

<p>
2. Update batter advanced data game logs

In [397]:
driver = start_driver()
directory = 'Gamelogs\\game_logs\\Batter\\2020_2022\\Advanced'
df_count_rows = get_game_log_stats_advanced(list_player_game_log_url, directory, driver)
driver.quit()

Jordan Lyles
59
Tyler Wells
59
Dean Kremer
20
Kyle Bradish
10
Jorge López
75
Félix Bautista
31
Dillon Tate
108
Cionel Pérez
60
Joey Krehbiel
30
Keegan Akin
51
Bryan Baker
28
Nick Vespi
7
Austin Voth
87
Chris Ellis
9
John Means
39
Alexander Wells
13
Nick Pivetta
52
Josh Winckowski
3
Rich Hill
54
Michael Wacha
50
Tanner Houck
41
John Schreiber
38
Matt Strahm
52
Jake Diekman
121
Hansel Robles
116
Austin Davis
68
Tyler Danish
20
Hirokazu Sawamura
83
Ryan Brasier
68
Nathan Eovaldi
55
James Paxton
6
Chris Sale
9
Garrett Whitlock
61
Matt Barnes
109
Josh Taylor
72
Dylan Cease
59
Lucas Giolito
56
Johnny Cueto
43
Michael Kopech
58
Lance Lynn
45
Kendall Graveman
96
Joe Kelly
72
Reynaldo López
56
José Ruiz
94
Tanner Banks
17
Jimmy Lambert
12
Davis Martin
4
Vince Velasquez
46
Jonathan Stiever
3
Aaron Bummer
95
Kyle Crick
50
Garrett Crochet
61
Liam Hendriks
123
Shane Bieber
43
Triston McKenzie
47
Zach Plesac
47
Cal Quantrill
73
Emmanuel Clase
106
Eli Morgan
39
Sam Hentges
53
Bryan Shaw
120
Trevor St

Tim Hill
125
Craig Stammen
118
Joe Musgrove
54
Austin Adams
73
Pierce Johnson
97
Drew Pomeranz
49
Robert Suarez
23
Logan Webb
56
Carlos Rodón
43
Alex Wood
50
Alex Cobb
38
Camilo Doval
62
Jake McGee
110
Dominic Leone
100
Tyler Rogers
146
John Brebbia
50
Jarlín García
102
José Álvarez
100
Zack Littell
96
Sam Long
27
Matthew Boyd
28
Anthony DeSclafani
45
Jakob Junis
34
Brendan Donovan
50
Adam Wainwright
57
Dakota Hudson
24
Andre Pallante
22
Miles Mikolas
24
Jack Flaherty
28
Ryan Helsley
88
Giovanny Gallegos
118
Génesis Cabrera
120
Nick Wittgren
116
Zack Thompson
3
Drew VerHagen
15
T.J. McFarland
89
Johan Oviedo
25
Jordan Hicks
19
Steven Matz
49
Alex Reyes
89
Patrick Corbin
58
Josiah Gray
28
Paolo Espino
61
Jackson Tetreault
2
Erick Fedde
55
Tanner Rainey
82
Kyle Finnegan
127
Carl Edwards Jr.
32
Steve Cishek
131
Reed Garrett
3
Francisco Perez
13
Andres Machado
60
Erasmo Ramírez
49
Seth Romero
3
Joe Ross
20
Aníbal Sánchez
11
Stephen Strasburg
8
Víctor Arano
25
Sean Doolittle
76
Will Harris


<p>
3. Update pitcher standard data game logs

In [401]:
driver = start_driver()
directory = 'Gamelogs\\game_logs\\Pitcher\\2020_2022\\Standard'
df_count_rows = get_game_log_stats_statcast_p(list_player_game_log_url, directory, driver)
driver.quit()

Trevor Richards
95
Tayler Saucedo
34
Andrew Vasquez
11
Madison Bumgarner
51
Luke Weaver
29
Merrill Kelly 켈리
48
Zach Davies
60
Zac Gallen
49
Mark Melancon
118
Ian Kennedy
102
Joe Mantiply
93
Noé Ramirez
94
Kyle Nelson
35
Sean Poppen
52
J.B. Wendelken
96
Caleb Smith
72
Humberto Castellanos
34
J.B. Bukauskas
22
Max Fried
55
Spencer Strider
17
Charlie Morton
57
Kyle Wright
24
Ian Anderson
45
Kenley Jansen
131
A.J. Minter
119
Will Smith
122
Collin McHugh
61
Darren O'Day
52
Jackson Stephens
15
Dylan Lee
10
Jesse Chavez
73
Mike Soroka
3
Jay Jackson
24
Luke Jackson
94
Tyler Matzek
108
Kirby Yates
6
Kyle Hendricks
59
Caleb Kilian
3
Matt Swarmer
4
Keegan Thompson
49
Justin Steele
34
David Robertson
35
Rowan Wick
70
Mychal Givens
107
Chris Martin
91
Scott Effross
47
Brandon Hughes
12
Daniel Norris
97
Alec Mills
50
Adbert Alzolay
36
Wade Miley
39
Drew Smyly
47
Marcus Stroman
44
Codi Heuer
90
Ethan Roberts
9
Manuel Rodríguez
20
Brad Wieck
16
Tyler Mahle
59
Luis Castillo
55
Hunter Greene
13
Graham A

<p>
4. Update pitcher advanced data game logs

In [29]:
# Run 4
driver = start_driver()
directory = 'Gamelogs\\game_logs\\Pitcher\\2020_2022\\Advanced'
df_count_rows = get_game_log_stats_standard_p(list_player_game_log_url, directory, driver)
driver.quit()

Adam Wainwright
61
Dakota Hudson
28
Andre Pallante
26
Miles Mikolas
28
Jack Flaherty
30
Ryan Helsley
96
Giovanny Gallegos
125
Génesis Cabrera
123
Nick Wittgren
119
Zack Thompson
10
Drew VerHagen
17
T.J. McFarland
89
Johan Oviedo
30
Jordan Hicks
24
Steven Matz
49
Alex Reyes
89
Patrick Corbin
62
Josiah Gray
31
Paolo Espino
66
Jackson Tetreault
4
Erick Fedde
59
Tanner Rainey
91
Kyle Finnegan
136
Carl Edwards Jr.
40
Steve Cishek
139
Reed Garrett
6
Francisco Perez
14
Andres Machado
68
Erasmo Ramírez
54
Seth Romero
3
Joe Ross
20
Aníbal Sánchez
11
Stephen Strasburg
8
Víctor Arano
25
Sean Doolittle
76
Will Harris
29
Hunter Harvey
24
Evan Lee
4
Josh Rogers
23
Mason Thompson
38


## Merge standard data game logs for pitchers and batters

In [16]:
# Read all bat_stat_standard
directory = 'Gamelogs\\game_logs\\Batter\\2020_2022\\Standard'
count = 0
for filename in os.listdir(directory):
    if '.csv' not in filename:
        continue
    f = os.path.join(directory, filename)
    if count == 0:
        df_read_bat_stat_standard = pd.read_csv(f)
    else:
        df_read_bat_stat_standard_tmp = pd.read_csv(f)
        df_read_bat_stat_standard = df_read_bat_stat_standard.append(df_read_bat_stat_standard_tmp,ignore_index=True)
    count += 1
    
#Change column names
changed_name = []
for i in range(len(df_read_bat_stat_standard.columns)):
    colname = df_read_bat_stat_standard.columns[i]
    if (colname == 'Player') or (colname == 'Date') or (colname in changed_name):
        continue
    colname_new = colname + '_batter'
    df_read_bat_stat_standard = df_read_bat_stat_standard.rename(columns={colname: colname_new})
    changed_name.append(colname_new)
df_read_bat_stat_standard = df_read_bat_stat_standard.rename(columns={'Player': 'Batter'})

In [17]:
# Read all pitch_stat_standard
directory = 'Gamelogs\\game_logs\\Pitcher\\2020_2022\\Standard'
count = 0
for filename in os.listdir(directory):
    if '.csv' not in filename:
        continue
    f = os.path.join(directory, filename)
    if count == 0:
        df_read_pitch_stat_standard = pd.read_csv(f)
    else:
        df_read_pitch_stat_standard_tmp = pd.read_csv(f)
        df_read_pitch_stat_standard = df_read_pitch_stat_standard.append(df_read_pitch_stat_standard_tmp,ignore_index=True)
    count += 1

#  Rename columns
for i in range(len(df_read_pitch_stat_standard.columns)):
    colname = df_read_pitch_stat_standard.columns[i]
    if (colname == 'Player') or (colname == 'Date'):
        continue
    colname_new = colname + '_pitcher'
    df_read_pitch_stat_standard = df_read_pitch_stat_standard.rename(columns={colname: colname_new})

df_read_pitch_stat_standard = df_read_pitch_stat_standard.rename(columns={'Player': 'Pitcher'})


#clean oppnent team column
for i in range(len(df_read_pitch_stat_standard)):
    if df_read_pitch_stat_standard['Opp_pitcher'][i][0] == '@':
        df_read_pitch_stat_standard['Opp_pitcher'][i] = df_read_pitch_stat_standard['Opp_pitcher'][i][1:]
df_read_pitch_stat_standard = df_read_pitch_stat_standard.rename(columns={'Opp_pitcher':'oppTeam'})

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_read_pitch_stat_standard['Opp_pitcher'][i] = df_read_pitch_stat_standard['Opp_pitcher'][i][1:]


## Merge matchups, pitcher data, and batter data

In [20]:
pitcher_batter_matchup_stats_std = pd.merge(df_read_pitch_stat_standard,df_read_bat_stat_standard, 
                                           left_on = ['oppTeam','Date'], right_on=['Team_batter', 'Date'], how = 'left')
# Convert percent to float
for i in pitcher_batter_matchup_stats_std.columns:
    for j in range(len(pitcher_batter_matchup_stats_std[i])):
        try:
            pitcher_batter_matchup_stats_std[i][j] = float(pitcher_batter_matchup_stats_std[i][j].strip('%'))/100
        except:
            continue
print("Conversion to float done!")

#'NA' string check
for i in pitcher_batter_matchup_stats_std.columns:
    count = 0 
    for j in range(len(pitcher_batter_matchup_stats_std[i])):
        if pitcher_batter_matchup_stats_std[i][j] == 'NA':
            count += 1
    print(i)
    print(count)
print("NA check done!")

Pitcher
Date
Team_pitcher
oppTeam
GS_pitcher
W_pitcher
L_pitcher
ERA_pitcher
G_pitcher
GS2_pitcher
CG_pitcher
ShO_pitcher
SV_pitcher
HLD_pitcher
BS_pitcher
IP_pitcher
TBF_pitcher
H_pitcher
R_pitcher
ER_pitcher
HR_pitcher
BB_pitcher
IBB_pitcher
HBP_pitcher
WP_pitcher
BK_pitcher
SO_pitcher
GSv2_pitcher
Batter
Team_batter
Opp_batter
BO_batter
Pos_batter
G_batter
AB_batter
PA_batter
H_batter
1B_batter
2B_batter
3B_batter
HR_batter
R_batter
RBI_batter
BB_batter
IBB_batter
SO_batter
HBP_batter
SF_batter
SH_batter
GDP_batter
SB_batter
CS_batter
AVG_batter
Pitcher
0
Date
0
Team_pitcher
0
oppTeam
0
GS_pitcher
0
W_pitcher
0
L_pitcher
0
ERA_pitcher
0
G_pitcher
0
GS2_pitcher
0
CG_pitcher
0
ShO_pitcher
0
SV_pitcher
0
HLD_pitcher
0
BS_pitcher
0
IP_pitcher
0
TBF_pitcher
0
H_pitcher
0
R_pitcher
0
ER_pitcher
0
HR_pitcher
0
BB_pitcher
0
IBB_pitcher
0
HBP_pitcher
0
WP_pitcher
0
BK_pitcher
0
SO_pitcher
0
GSv2_pitcher
0
Batter
0
Team_batter
0
Opp_batter
0
BO_batter
0
Pos_batter
0
G_batter
0
AB_batter
0
PA_

In [None]:
# Save to local
pitcher_batter_matchup_stats_std.to_csv("Gamelogs\\gameLogsAll.csv", index = False)

In [21]:
# Undate local concatenated game logs data file 
def gameLogCleanups(gameLogsDirectory,handPosReport):
    # Load game log data
    pitcherBatterMatchupDataAllSeason = pd.read_csv(gameLogsDirectory)
    
    # Unicode conversion
    for i in range(len(pitcherBatterMatchupDataAllSeason)):
        try:
            pitcherBatterMatchupDataAllSeason['Pitcher'][i] = unidecode.unidecode(pitcherBatterMatchupDataAllSeason['Pitcher'][i]) 
            pitcherBatterMatchupDataAllSeason['Batter'][i] = unidecode.unidecode(pitcherBatterMatchupDataAllSeason['Batter'][i])
        except:
            continue           
    #Append pitcehr handedness to game log.
    pitcherBatterMatchupDataAllSeason = pd.merge(pitcherBatterMatchupDataAllSeason,handPosReport[['PlayerName','Player_pos','Player_hand']],
                                                 left_on = ['Pitcher'], right_on=['PlayerName'], how = 'left')
    pitcherBatterMatchupDataAllSeason = pitcherBatterMatchupDataAllSeason.drop(columns=['PlayerName'])
    pitcherBatterMatchupDataAllSeason = pitcherBatterMatchupDataAllSeason.rename(columns={"Player_pos": "Pos_pitcher", "Player_hand": "Hand_pitcher"})
    #Append batter hand to game log
    pitcherBatterMatchupDataAllSeason = pd.merge(pitcherBatterMatchupDataAllSeason,handPosReport[['PlayerName','Player_pos','Player_hand']],
                                                 left_on = ['Batter'], right_on=['PlayerName'], how = 'left')
    pitcherBatterMatchupDataAllSeason = pitcherBatterMatchupDataAllSeason.drop(columns=['PlayerName'])
    pitcherBatterMatchupDataAllSeason = pitcherBatterMatchupDataAllSeason.rename(columns={"Player_pos": "Pos_batter", "Player_hand": "Hand_batter"})
        
    # Clean up "both-hand"
    Hand_batter_copy = []
    for i in range(len(pitcherBatterMatchupDataAllSeason)): 
        if pitcherBatterMatchupDataAllSeason['Hand_batter'][i] == 'S':  
            if pitcherBatterMatchupDataAllSeason['Hand_pitcher'][i] == 'L':
                Hand_batter_copy.append('R')
            elif pitcherBatterMatchupDataAllSeason['Hand_pitcher'][i] == 'R':
                Hand_batter_copy.append('L')
            else:
                Hand_batter_copy.append(pitcherBatterMatchupDataAllSeason['Hand_batter'][i])
        else:
            Hand_batter_copy.append(pitcherBatterMatchupDataAllSeason['Hand_batter'][i])
    
    pitcherBatterMatchupDataAllSeason['Hand_batter_copy'] = Hand_batter_copy
    pitcherBatterMatchupDataAllSeason = pitcherBatterMatchupDataAllSeason.drop(columns={'Hand_batter'})
    pitcherBatterMatchupDataAllSeason = pitcherBatterMatchupDataAllSeason.rename(columns={'Hand_batter_copy': 'Hand_batter'})
    pitcherBatterMatchupDataAllSeason = pitcherBatterMatchupDataAllSeason.rename(columns={'Pos_batter.1': 'Pos_batter'})
    
    pitcherBatterMatchupDataAllSeason.to_csv("Gamelogs\\gameLogsML.csv", index = False)


In [None]:
# Computationally expensive. Only run after gameLogsAll is updated locally.
gameLogCleanups("Gamelogs\\gameLogsAll.csv",handPosReport)