## Web Scraping FanGraphs Batted Ball and Standard Batting Metrics


In [1]:
from __future__ import print_function, division

In [2]:
import requests
from bs4 import BeautifulSoup
import pickle



In [3]:
def get_batted_ball(years, pages, fields_per_row=20, timeout=30):
    """Scrape the FanGraphs batted-ball leaderboard (``type=2``) for several seasons.

    Every cell whose CSS class is ``grid_line_regular`` or ``grid_line_break``
    is collected from each requested page and grouped into fixed-width rows.

    Parameters
    ----------
    years : iterable
        Season identifiers. Each one is substituted into both the ``season``
        and ``season1`` query parameters and used as a key of the result.
    pages : iterable
        Page numbers requested for every season.
    fields_per_row : int, optional
        Number of table cells that make up one player row (default 20).
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up (default 30).

    Returns
    -------
    dict
        Maps each year to a list of rows; every row is a list of
        ``fields_per_row`` cell texts in page order.

    Raises
    ------
    ValueError
        If ``fields_per_row`` is smaller than 1.
    requests.exceptions.RequestException
        If a request times out or the server answers with an error status.
    """
    if fields_per_row < 1:
        raise ValueError("fields_per_row must be a positive integer")

    url = "http://www.fangraphs.com/leaders.aspx?pos=all&stats=bat&lg=all&qual=y&type=2&season={}&month=0&season1={}&ind=0&team=0&rost=0&age=0&filter=&players=0&page={}_30"
    cell_classes = ['grid_line_regular', 'grid_line_break']
    data = {}

    for year in years:
        year_data = []
        for page in pages:
            response = requests.get(url.format(year, year, page), timeout=timeout)
            # Fail loudly instead of silently parsing an error page into zero rows.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html5lib")
            cells = [element.text for element in soup.find_all(class_=cell_classes)]

            # Rows are cut per page, so a trailing partial row on one page can
            # never be glued onto the first cells of the next page or season.
            complete = len(cells) - len(cells) % fields_per_row
            for start in range(0, complete, fields_per_row):
                year_data.append(cells[start:start + fields_per_row])

        data[year] = year_data

    return data

# Seasons 2010-2017 as strings (they are formatted into the URL and become
# the keys of the result) and leaderboard pages 1-6.
years = list(map(str, range(2010, 2018)))
pages = list(range(1, 7))
batted_ball = get_batted_ball(years, pages)

In [4]:
def get_standard(years, pages, fields_per_row=22, timeout=30):
    """Scrape the FanGraphs leaderboard with ``type=8`` for several seasons.

    Every cell whose CSS class is ``grid_line_regular`` or ``grid_line_break``
    is collected from each requested page and grouped into fixed-width rows.

    Parameters
    ----------
    years : iterable
        Season identifiers. Each one is substituted into both the ``season``
        and ``season1`` query parameters and used as a key of the result.
    pages : iterable
        Page numbers requested for every season.
    fields_per_row : int, optional
        Number of table cells that make up one player row (default 22).
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up (default 30).

    Returns
    -------
    dict
        Maps each year to a list of rows; every row is a list of
        ``fields_per_row`` cell texts in page order.

    Raises
    ------
    ValueError
        If ``fields_per_row`` is smaller than 1.
    requests.exceptions.RequestException
        If a request times out or the server answers with an error status.
    """
    if fields_per_row < 1:
        raise ValueError("fields_per_row must be a positive integer")

    url = "http://www.fangraphs.com/leaders.aspx?pos=all&stats=bat&lg=all&qual=y&type=8&season={}&month=0&season1={}&ind=0&page={}_30"
    cell_classes = ['grid_line_regular', 'grid_line_break']
    data = {}

    for year in years:
        year_data = []
        for page in pages:
            response = requests.get(url.format(year, year, page), timeout=timeout)
            # Fail loudly instead of silently parsing an error page into zero rows.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html5lib")
            cells = [element.text for element in soup.find_all(class_=cell_classes)]

            # Rows are cut per page, so a trailing partial row on one page can
            # never be glued onto the first cells of the next page or season.
            complete = len(cells) - len(cells) % fields_per_row
            for start in range(0, complete, fields_per_row):
                year_data.append(cells[start:start + fields_per_row])

        data[year] = year_data

    return data

# Request seasons 2010 through 2017 (string keys) across leaderboard pages 1 to 6.
years = [str(season) for season in range(2010, 2018)]
pages = list(range(1, 7))
standard = get_standard(years, pages)

In [5]:
def clean_strings(data_dict):
    """Normalise scraped cell texts and tag every row with its year.

    For each cell, percent signs at either end are stripped first and then
    spaces at either end. The first cell of every row is dropped and
    replaced by the year key the row is stored under.

    Parameters
    ----------
    data_dict : dict
        Maps a year to a list of rows, each row being a list of cell texts.

    Returns
    -------
    dict
        Same keys as ``data_dict``; every row becomes
        ``[year] + cleaned_cells[1:]``.
    """
    clean_data = {}

    for year, players in data_dict.items():
        year_stats = []
        for player in players:
            player_stats = []
            for item in player:
                # Work on text, not encoded bytes: on Python 3
                # ``bytes.strip('%')`` raises TypeError.
                if isinstance(item, bytes):
                    item = item.decode('utf-8')
                player_stats.append(item.strip('%').strip(' '))
            year_stats.append([year] + player_stats[1:])
        clean_data[year] = year_stats

    return clean_data
    
# Clean both scraped data sets with the same routine.
clean_batted_data = clean_strings(data_dict=batted_ball)
clean_standard_data = clean_strings(data_dict=standard)


In [6]:
# Sanity check: number of player rows collected for the 2014 season.
len(clean_standard_data['2014'])

172

In [10]:
# Column names for the standard frame (22 entries).
standard_columns = [
    'Year', 'Name', 'Team', 'G', 'PA', 'HR', 'R', 'RBI', 'SB',
    'BB%', 'K%', 'ISO', 'BABIP', 'AVG', 'OBP', 'SLG',
    'wOBA', 'wRC+', 'BsR', 'Off', 'Def', 'WAR',
]
# Column names for the batted-ball frame (20 entries).
batted_columns = [
    'Year', 'Name', 'Team', 'BABIP', 'GB/FB',
    'LD%', 'GB%', 'FB%', 'IFFB%', 'HR/FB',
    'IFH', 'IFH%', 'BUH', 'BUH%',
    'Pull%', 'Cent%', 'Oppo%', 'Soft%', 'Med%', 'Hard%',
]

In [11]:
import numpy as np
import pandas as pd

def create_stats_frame(clean_data, columns):
    """Stack the per-year row lists into one DataFrame.

    Parameters
    ----------
    clean_data : dict
        Maps a year to a list of rows; every row must have exactly
        ``len(columns)`` values.
    columns : list of str
        Column names for the resulting frame.

    Returns
    -------
    pandas.DataFrame
        All rows of all years in dict iteration order, with a fresh
        0..n-1 index. An empty frame carrying ``columns`` is returned when
        there are no rows at all.

    Notes
    -----
    The first two rows of every year are printed as a quick visual check.
    """
    frames = []
    for year in clean_data:
        # Build straight from the list of rows; this also copes with a year
        # that has no rows, which a NumPy round-trip does not.
        df = pd.DataFrame(clean_data[year], columns=columns)
        print(df[:2])
        if not df.empty:
            frames.append(df)

    if not frames:
        return pd.DataFrame(columns=columns)

    # One concat at the end replaces repeated DataFrame.append calls, which
    # were removed in pandas 2.0 and copied the whole frame on every iteration.
    return pd.concat(frames, ignore_index=True)

# Build one frame per data set; the standard frame is built first.
standard_f = create_stats_frame(clean_data=clean_standard_data, columns=standard_columns)
batted_f = create_stats_frame(clean_data=clean_batted_data, columns=batted_columns)


   Year          Name       Team    G   PA  HR    R RBI  SB   BB% ...  BABIP  \
0  2015  Bryce Harper  Nationals  153  654  42  118  99   6  19.0 ...   .369   
1  2015    Mike Trout     Angels  159  682  41  104  90  11  13.5 ...   .344   

    AVG   OBP   SLG  wOBA wRC+  BsR   Off   Def  WAR  
0  .330  .460  .649  .461  197  3.2  77.6  -8.5  9.5  
1  .299  .402  .590  .415  172  3.3  59.9   2.1  9.0  

[2 rows x 22 columns]
   Year        Name       Team    G   PA  HR    R  RBI  SB   BB% ...  BABIP  \
0  2014  Mike Trout     Angels  157  705  36  115  111  16  11.8 ...   .349   
1  2014  Jose Abreu  White Sox  145  622  36   80  107   3   8.2 ...   .356   

    AVG   OBP   SLG  wOBA wRC+   BsR   Off   Def  WAR  
0  .287  .377  .561  .402  167   6.5  58.1  -8.4  7.9  
1  .317  .383  .581  .411  167  -0.3  45.7   0.0  7.2  

[2 rows x 22 columns]
   Year         Name     Team   G   PA  HR   R RBI  SB   BB% ...  BABIP   AVG  \
0  2017  Aaron Judge  Yankees  84  366  30  75  66   6  16.7 

In [12]:
# Compare the (rows, columns) shape of the two frames.
standard_f.shape , batted_f.shape

((1346, 22), (1346, 20))

In [13]:
# De-duplicated copy of the standard stats; standard_f itself is left untouched.
dummy = standard_f.drop_duplicates()

In [14]:
# Shape after de-duplication; compare with standard_f.shape to see how many rows were repeated.
dummy.shape

(1176, 22)

In [15]:
# Inner-join the two frames on the columns they share.
stats = batted_f.merge(standard_f, how='inner', on=['Year','Name','Team','BABIP'])

In [16]:
# Remove fully identical rows from the merged frame.
stats = stats.drop_duplicates()

In [17]:
# Shape of the merged frame after duplicate removal.
stats.shape

(1176, 38)

In [22]:
# Convert every column that parses cleanly as numbers and leave the others
# (the text columns) unchanged. ``errors='ignore'`` is deprecated in pandas,
# so the fallback is spelled out with an explicit try/except.
def _to_numeric_or_keep(column):
    """Return ``column`` converted by ``pd.to_numeric``, or unchanged if parsing fails."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column

stats = stats.apply(_to_numeric_or_keep)

In [23]:
# Inspect the per-column dtypes after the numeric conversion.
stats.dtypes

Year       int64
Name      object
Team      object
BABIP    float64
GB/FB    float64
LD%      float64
GB%      float64
FB%      float64
IFFB%    float64
HR/FB    float64
IFH        int64
IFH%     float64
BUH        int64
BUH%     float64
Pull%    float64
Cent%    float64
Oppo%    float64
Soft%    float64
Med%     float64
Hard%    float64
G          int64
PA         int64
HR         int64
R          int64
RBI        int64
SB         int64
BB%      float64
K%       float64
ISO      float64
AVG      float64
OBP      float64
SLG      float64
wOBA     float64
wRC+       int64
BsR      float64
Off      float64
Def      float64
WAR      float64
dtype: object

In [24]:
# Persist the final frame to disk in the current working directory.
pd.to_pickle(stats, 'fangraphs.pkl')