## Web Scraping Baseball Savant Leaderboard

web scraping and data cleaning for MLB Statcast data, 2015 through MLB All-Star break 2017.

Major League Baseball tracks speed, launch angle, exit speed, result and more of every pitch and batted ball. It has been doing so since 2015 through Statcast. 

In [1]:
from __future__ import print_function, division

In [2]:
import requests
from bs4 import BeautifulSoup
import pickle
import numpy as np
import pandas as pd



In [3]:
def get_savant_stats(years):
    #year_stats = {}
    leaders_list = []
    for year in years:
        url = "https://baseballsavant.mlb.com/statcast_leaderboard?year={}&abs=30&player_type=resp_batter_id"
        year_url = url.format(year)  
        response = requests.get(year_url)
        webpage = response.text
        soup = BeautifulSoup(webpage,"html5lib")
        soup.prettify()
        leaders_string = soup.find_all('script')[9].string.encode('ascii')
        start = leaders_string.find('[')
        end = leaders_string.find(']')
        leaders_list = leaders_list + leaders_string[start+1:end].split('}')[:-1]
        
    return leaders_list

years = ['2015','2016','2017']
leaders_list = get_savant_stats(years)

In [4]:
# Create lists of lists. Sub-lists contain player and his stats

savant_stats = []
stat_line = []
for item in leaders_list:
    player_stats = item[1:].split(',')
    for stat in player_stats:
        stat_line.append(stat[stat.find(':')+1:].strip('"'))
    savant_stats.append(stat_line)
    stat_line = []
    


In [5]:
np.array(savant_stats).shape

(1594, 20)

In [6]:
def create_stats_frame(clean_data,columns):
    #stats = pd.DataFrame(columns = columns)
    #for year in clean_data:
    stats = pd.DataFrame(np.array(clean_data),columns = columns)
    #print(df[:2])
    #stats = stats.append(df,ignore_index=True)
    return stats

columns = ["Name","attempts","max_hit_speed","min_hit_speed","avg_hit_speed",
           "fbld","gb","max_distance","avg_distance","avg_hr_distance","player_id",
           "player_type","Year","resp_batter_id","barrels","brl_percent","brl_pa",
           "ev95plus","ev95percent","rowId"]

savant_f = create_stats_frame(savant_stats,columns)


In [7]:
savant_f['brl_percent'] = savant_f['brl_percent'].str.strip('%')
savant_f['brl_pa'] = savant_f['brl_pa'].str.strip('%')
savant_f['ev95percent'] = savant_f['ev95percent'].str.strip('%')

In [8]:
numeric_columns = ["attempts","max_hit_speed","min_hit_speed","avg_hit_speed",
           "fbld","gb","max_distance","avg_distance","avg_hr_distance",
           "Year","barrels","brl_percent","brl_pa","ev95plus","ev95percent"]

savant_f[numeric_columns].apply(pd.to_numeric,errors='coerce',axis=1)

Unnamed: 0,attempts,max_hit_speed,min_hit_speed,avg_hit_speed,fbld,gb,max_distance,avg_distance,avg_hr_distance,Year,barrels,brl_percent,brl_pa,ev95plus,ev95percent
0,187.0,120.3,56.0,95.9,102.4,91.7,479.0,222.0,424.0,2015.0,45.0,24.1,14.2,99.0,52.9
1,427.0,119.0,44.3,92.4,97.5,88.7,483.0,183.0,412.0,2015.0,60.0,14.1,9.2,202.0,47.3
2,422.0,117.7,48.9,92.8,96.8,88.8,477.0,205.0,410.0,2015.0,69.0,16.4,10.1,206.0,48.8
3,428.0,117.1,32.8,89.9,94.4,87.0,468.0,175.0,421.0,2015.0,47.0,11.0,7.7,169.0,39.5
4,370.0,116.6,39.2,91.9,96.6,85.4,458.0,217.0,404.0,2015.0,66.0,17.8,9.9,175.0,47.3
5,334.0,116.6,35.3,90.1,95.0,88.4,481.0,154.0,407.0,2015.0,26.0,7.8,6.0,140.0,41.9
6,377.0,116.5,36.6,92.1,94.5,92.2,464.0,183.0,412.0,2015.0,37.0,9.8,6.8,163.0,43.2
7,385.0,116.5,47.8,91.4,95.6,88.2,471.0,184.0,406.0,2015.0,42.0,10.9,6.8,169.0,43.9
8,445.0,116.3,37.1,92.0,95.6,91.0,461.0,204.0,409.0,2015.0,56.0,12.6,8.4,205.0,46.1
9,248.0,116.2,37.4,91.7,94.8,89.3,427.0,183.0,398.0,2015.0,25.0,10.1,6.2,107.0,43.1


In [9]:
savant_f.dtypes

Name               object
attempts           object
max_hit_speed      object
min_hit_speed      object
avg_hit_speed      object
fbld               object
gb                 object
max_distance       object
avg_distance       object
avg_hr_distance    object
player_id          object
player_type        object
Year               object
resp_batter_id     object
barrels            object
brl_percent        object
brl_pa             object
ev95plus           object
ev95percent        object
rowId              object
dtype: object

In [10]:
savant_f.to_pickle('savant.pkl')