# Parse Actuals

Looks through the URLs to grab batting & pitching actuals and deliver those back to the user.

In [35]:
import requests
import csv
import datetime
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

In [74]:
# static urls
season = datetime.datetime.now().year
PITCHERS_URL = "https://www.fangraphs.com/leaders.aspx?pos=all&stats=pit&lg=al&qual=y&type=c,36,37,38,40,-1,120,121,217,-1,24,41,42,43,44,-1,117,118,119,-1,6,45,124,-1,62,122,13&season={season}&month=0&season1={season}&ind=0&team=0&rost=0&age=0&filter=&players=0&page=1_100000".format(season=season)
BATTERS_URL = "https://www.fangraphs.com/leaders.aspx?pos=all&stats=bat&lg=al&qual=y&type=8&season={season}&month=0&season1={season}&ind=0&team=0&rost=0&age=0&filter=&players=0&page=1_10000".format(season=season)

# # request the data
pitchers_html = requests.get(PITCHERS_URL).text
batters_html = requests.get(BATTERS_URL).text

Now take the requests and parse out the relevant header information for each of the positions. This function will take one of the fangraphs pages as input and write out a CSV of that information once it's parsed.

In [75]:
def parse_array_from_fangraphs_html(input_html, out_file_name):
    """
    Take a HTML stats page from fangraphs and parse it out to a CSV file.
    """
    # parse input
    soup = BeautifulSoup(input_html, "lxml")
    table = soup.find("table", {"class": "rgMasterTable"})
    
    # get headers
    headers_html = table.find("thead").find_all("th")
    headers = []
    for header in headers_html:
        headers.append(header.text)
    print(headers)
    
    # get rows
    rows = []
    rows_html = table.find("tbody").find_all("tr")

    for row in rows_html:
        row_data = []
        for cell in row.find_all("td"):
            row_data.append(cell.text)
        rows.append(row_data)
    
    # write to CSV file
    with open(out_file_name, "w") as out_file:
        writer = csv.writer(out_file)
        writer.writerow(headers)
        writer.writerows(rows)
        


Now that we have all of the player data, I'm writing these out to a CSV file if I want to load them again later without having to run the requests to those pages once more.

In [76]:
parse_array_from_fangraphs_html(batters_html, 'batters_actuals.csv')
parse_array_from_fangraphs_html(pitchers_html, 'pitchers_actuals.csv')

['#', 'Name', 'Team', 'G', 'PA', 'HR', 'R', 'RBI', 'SB', 'BB%', 'K%', 'ISO', 'BABIP', 'AVG', 'OBP', 'SLG', 'wOBA', 'wRC+', 'BsR', 'Off', 'Def', 'WAR']
['#', 'Name', 'Team', 'G', 'PA', 'HR', 'R', 'RBI', 'SB', 'BB%', 'K%', 'ISO', 'BABIP', 'AVG', 'OBP', 'SLG', 'wOBA', 'wRC+', 'BsR', 'Off', 'Def', 'WAR']


Load those CSV files using read_csv() in pandas. Since some of the percentage values are stored as strings, we need to parse those into floats.

In [66]:
bdf = pd.read_csv('batters_actuals.csv')
pdf = pd.read_csv('pitchers_actuals.csv')

# create a function to parse out percentage strings to floats
def parse_pctg(value):
    return float(value.split()[0]) / 100

In [67]:
# apply that to all percentage values in the dataframes
bdf['BB%'] = bdf['BB%'].apply(lambda x: parse_pctg(x))
bdf['K%'] = bdf['K%'].apply(lambda x: parse_pctg(x))
pdf['BB%'] = pdf['BB%'].apply(lambda x: parse_pctg(x))
pdf['K%'] = pdf['K%'].apply(lambda x: parse_pctg(x))

AttributeError: 'float' object has no attribute 'split'

In [68]:
pdf.head()

Unnamed: 0,Unnamed: 1,#,Name,Team,G,PA,HR,R,RBI,SB,BB%,...,BABIP,AVG,OBP,SLG,wOBA,wRC+,BsR,Off,Def,WAR
1,Justin Verlander,Astros,10.93,1.77,6.19,0.76,32.0 %,5.2 %,26.9 %,130,0.163,...,88.9 %,40,61,85,1.6,2.49,-0.9,3.51,2.95,107.0
2,Corey Kluber,Indians,9.11,0.97,9.42,1.13,27.2 %,2.9 %,24.3 %,113,0.194,...,88.2 %,50,74,66,2.1,3.06,-0.96,2.72,2.86,111.2
3,Luis Severino,Yankees,10.58,2.24,4.73,0.52,30.2 %,6.4 %,23.8 %,123,0.198,...,80.5 %,54,55,71,2.24,2.33,-0.09,2.93,3.01,104.2
4,Trevor Bauer,Indians,11.78,2.94,4.0,0.42,31.8 %,7.9 %,23.8 %,140,0.205,...,75.0 %,58,54,74,2.44,2.24,0.2,3.03,3.04,107.0
5,Blake Snell,Rays,9.83,3.43,2.86,1.05,27.3 %,9.6 %,17.8 %,103,0.188,...,85.3 %,62,90,89,2.48,3.61,-1.13,3.66,3.72,94.1
