In [71]:
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from html2excel import ExcelParser
import time as t
import pandas as pd
import os

# Working directory at the time this cell runs; later cells build output
# folder paths from it.
current_directory = os.getcwd()

# Chrome is started with the --headless and --disable-gpu flags.
options = Options()
options.add_argument('--headless')
options.add_argument('--disable-gpu')

# Single shared webdriver used by every scraping function in this notebook.
# The chromedriver path is relative, so it is resolved against the working directory.
browser = webdriver.Chrome(executable_path="chromedrivers/chromedriver.exe", options=options)
# Sets the driver's implicit wait to 2 (seconds, per the Selenium API).
browser.implicitly_wait(2)

In [72]:
# this function enables downloading files with selenium's Chrome webdriver using headless mode
# source --> https://stackoverflow.com/questions/52830115/python-selenium-headless-download
def enable_download_headless(browser,download_dir):
    """Allow a headless Chrome session to save downloads into ``download_dir``.

    Registers a ``send_command`` endpoint on the driver's command executor and
    then uses it to send ``Page.setDownloadBehavior`` with behavior ``allow``.

    Args:
        browser: webdriver whose command executor is patched and used.
        download_dir: value sent as the ``downloadPath`` of the command.
    """
    endpoint = ("POST", '/session/$sessionId/chromium/send_command')
    browser.command_executor._commands["send_command"] = endpoint
    browser.execute(
        "send_command",
        {
            'cmd': 'Page.setDownloadBehavior',
            'params': {'behavior': 'allow', 'downloadPath': download_dir},
        },
    )

In [74]:
# this is the main function to get batting log data
def get_game_logs(team, year):
    """Download every game log linked from a team's season page, then combine them.

    Opens the team batting game-log page for ``team``/``year``, collects the
    link found in the second cell of each table row, calls ``download_log`` for
    each link and finally calls ``combine_logs``.

    Args:
        team: team code inserted into the page URL (e.g. "SEA").
        year: season inserted into the page URL.
    """
    # Make sure the output folder exists. It is resolved against the working
    # directory with os.path so that it is the same folder the relative
    # "New Team Batting Logs/..." paths used for the downloaded files point to,
    # and so that no platform-specific separator is hard-coded.
    output_directory = os.path.abspath("New Team Batting Logs")
    os.makedirs(output_directory, exist_ok=True)

    # enable headless downloading into that folder
    enable_download_headless(browser, output_directory)

    # access webpage for game logs
    browser.get(f"https://www.baseball-reference.com/teams/tgl.cgi?team={team}&t=b&year={year}")
    games_table = browser.find_element_by_xpath('//*[@id="team_batting_gamelogs"]')
    game_rows = games_table.find_elements_by_xpath("tbody/tr")
    print("Finding game links...")

    # get the link for every page of batting logs; rows without a link in the
    # second cell are skipped
    links = []
    for row in game_rows:
        try:
            anchor = row.find_element_by_xpath("td[2]").find_element_by_xpath("a")
        except selenium.common.exceptions.NoSuchElementException:
            continue
        links.append(anchor.get_attribute("href"))

    # download respective files
    for game_number, link in enumerate(links, start=1):
        print(f"{team}: Downloading Game {game_number}...      ", end='\r')
        download_log(link, team, year)
    print(f"{team}: Downloading Game {len(links)}... done.")

    # finally, combine all logs
    combine_logs(team, year)

# download one game's play-by-play table; the file is always saved with an .xls extension
def download_log(link, team, year, max_attempts=30, download_timeout=60):
    """Download one game's play-by-play export and store it under a dated name.

    Opens ``link``, hovers the 4th item of the ``play_by_play_sh`` menu, clicks
    the 3rd button of its dropdown, waits for ``sportsref_download.xls`` to
    show up in "New Team Batting Logs", renames it to
    "<team> <year>-<MM>-<DD>[ <n>].xls" and converts it with ``ExcelParser``.

    Args:
        link: URL of the game page. Assumes the file stem ends with
            <year><MMDD><digit> -- TODO confirm against the site's URL scheme.
        team: team code used as the file name prefix.
        year: season; used to split the date out of ``link``.
        max_attempts: how many times the hover/click sequence is tried before
            the last ``MoveTargetOutOfBoundsException`` is re-raised.
        download_timeout: seconds to wait for the downloaded file to appear.

    Raises:
        selenium.common.exceptions.MoveTargetOutOfBoundsException: the dropdown
            could not be reached in ``max_attempts`` tries.
        FileNotFoundError: the download did not appear within ``download_timeout``.
    """
    # get date from website link
    date = link.split("/")[-1].split(".")[0].split(str(year))[-1]
    # a trailing digit other than "0" is appended to the file name
    # (presumably the game number of a doubleheader -- verify)
    gameNo = f" {date[-1]}" if date[-1] != "0" else ""
    date = f"{year}-{date[:2]}-{date[2:4]}"

    downloaded = "New Team Batting Logs/sportsref_download.xls"
    filename = f"New Team Batting Logs/{team} {date}{gameNo}.xls"

    # A file left behind by an interrupted run would otherwise be renamed in
    # place of the download requested below.
    if os.path.exists(downloaded):
        os.remove(downloaded)

    browser.get(link)
    menu = browser.find_element_by_xpath('//*[@id="play_by_play_sh"]/div/ul/li[4]/span')
    for attempt in range(1, max_attempts + 1):
        # New chain for every attempt so that actions queued during a failed
        # attempt are not carried over into the retry.
        a = ActionChains(browser)
        try:
            a.move_to_element(menu).perform()
            download = browser.find_element_by_xpath('//*[@id="play_by_play_sh"]/div/ul/li[4]/div/ul/li[3]/button')
            a.move_to_element(download).click().perform()
            break
        # sometimes the dropdown doesn't load; wait and retry, but not forever
        except selenium.common.exceptions.MoveTargetOutOfBoundsException:
            if attempt == max_attempts:
                raise
            t.sleep(2)

    # Same initial pause as before, then keep polling so that a slow download
    # does not make the rename fail.
    t.sleep(2)
    deadline = t.monotonic() + download_timeout
    while not os.path.exists(downloaded):
        if t.monotonic() >= deadline:
            raise FileNotFoundError(
                f"{downloaded} did not appear within {download_timeout} seconds"
            )
        t.sleep(0.5)

    # rename file (os.replace overwrites a log from a previous run)
    os.replace(downloaded, filename)
    # this line is important- for some reason the files downloaded from bbref are actually .html files masked as .xls files.
    # this line converts the new file to a true .xls file for future parsing
    ExcelParser(filename).to_excel(filename)

def combine_logs(team, year):
    """Merge a team's downloaded game logs for one season into a single workbook.

    Every file in "New Team Batting Logs" whose name contains both ``team`` and
    ``str(year)`` is read with ``pd.read_excel`` and then deleted. The rows
    whose "@Bat" column equals ``team`` are written to
    "New Team Batting Logs/<team>.xlsx". If no file matches, a message is
    printed and nothing is written.

    Args:
        team: team code; used for file matching, row filtering and the output name.
        year: season; used for file matching.
    """
    log_dir = "New Team Batting Logs"
    frames = []
    # sorted() makes the row order of the combined file deterministic;
    # os.listdir returns names in arbitrary order
    for filename in sorted(os.listdir(log_dir)):
        # only combine files for that team that year
        if team not in filename or str(year) not in filename:
            continue
        path = f"{log_dir}/{filename}"
        try:
            frame = pd.read_excel(path)
        except Exception:
            # extra safety net in case a file wasn't converted from fake .xls
            # to real .xls: convert once and retry. A second failure is raised
            # instead of being retried forever.
            ExcelParser(path).to_excel(path)
            frame = pd.read_excel(path)
        frames.append(frame)
        os.remove(path)
        print(f"Appended {len(frames)} files...", end="\r")

    if not frames:
        print(f"No batting logs found for {team} {year}.")
        return

    # one concat instead of growing a DataFrame file by file
    df = pd.concat(frames, ignore_index=False)
    df = df[df["@Bat"]==team]
    df.to_excel(f"{log_dir}/{team}.xlsx")
    print(f"Appended {len(frames)} files... done.")

In [75]:
# Download and combine the game logs for team code "SEA", season 2021.
get_game_logs("SEA", 2021)

Finding game links...
SEA: Downloading Game 87... done.
Appended 87 files... done.


## Deprecated functions

In [3]:
def get_game_logs_old(team, year):
    """Deprecated: scrape every game log row by row and write one workbook.

    Collects the game links from the team's season page, parses each one with
    ``log_parser`` and writes all returned rows to
    "Team Batting Logs/<team>.xlsx" (sheet "Game Log") under
    ``current_directory``.

    Args:
        team: team code inserted into the page URL and used as the file name.
        year: season inserted into the page URL.
    """
    browser.get(f"https://www.baseball-reference.com/teams/tgl.cgi?team={team}&t=b&year={year}")
    games_table = browser.find_element_by_xpath('//*[@id="team_batting_gamelogs"]')
    game_rows = games_table.find_elements_by_xpath("tbody/tr")
    print("Finding game links...")

    # rows without a link in the second cell are skipped
    links = []
    for row in game_rows:
        try:
            anchor = row.find_element_by_xpath("td[2]").find_element_by_xpath("a")
        except selenium.common.exceptions.NoSuchElementException:
            continue
        links.append(anchor.get_attribute("href"))

    logs = []
    header = None  # stays None when no link was found
    for game_number, link in enumerate(links, start=1):
        print(f"Logging Game {game_number}...", end='\r')
        log, header = log_parser(link, browser, team)
        logs += log

    output_directory = os.path.join(current_directory, "Team Batting Logs")
    os.makedirs(output_directory, exist_ok=True)

    df = pd.DataFrame(logs)
    # Write into the folder that was just created, and let the context manager
    # save/close the workbook even if to_excel raises.
    output_path = os.path.join(output_directory, f"{team}.xlsx")
    with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
        df.to_excel(writer, sheet_name="Game Log", header=header, index=False)
        
    
def log_parser(link, browser, team):
    """Read the ``play_by_play`` table of one game page and keep ``team``'s rows.

    Args:
        link: URL of the game page to open.
        browser: webdriver used to load the page and query the table.
        team: value the sixth ``td`` cell of a row must equal for the row to be kept.

    Returns:
        tuple: ``(game, header)`` where ``game`` is a list of rows, each being
        the row's ``th`` text followed by the text of all its ``td`` cells, and
        ``header`` is the list of ``th`` texts from the table head.
    """
    browser.get(link)
    table = browser.find_element_by_xpath('//table[@id="play_by_play"]')
    header_row = table.find_element_by_xpath("thead/tr")
    header = [cell.text for cell in header_row.find_elements_by_xpath("th")]

    game = []
    for row in table.find_elements_by_xpath("tbody/tr"):
        head = row.find_element_by_xpath("th").text
        # only rows whose label starts with "b" or "t" are considered
        if not head or head[0] not in ("b", "t"):
            continue
        items = [cell.text for cell in row.find_elements_by_xpath("td")]
        # rows too short to have a sixth cell are skipped instead of raising IndexError
        if len(items) < 6 or items[5] != team:
            continue
        game.append([head] + items)

    return game, header

In [4]:
#get_game_logs_old("SEA", 2021)

Finding game links...
Logging Game 82...