In [1]:
from __future__ import print_function, division

from sklearn.linear_model import LinearRegression

In [2]:
# if needed: pip install requests or conda install requests
import requests
import pandas as pd
import seaborn as sns
import time
from bs4 import BeautifulSoup

requests.__path__

['/Users/scottlew/miniconda3/lib/python3.6/site-packages/requests']

In [3]:
import time
#timeDelay = random.randrange(2000, 4000)/1000
#time.sleep(timeDelay)        
#print("Scraped " + str(url) + "\n") 
#print("Waited " + str(timeDelay))

In [4]:
# Seasons to scrape in this run: 1881 through 1889 inclusive.
years = list(range(1881, 1890))
years

[1881, 1882, 1883, 1884, 1885, 1886, 1887, 1888, 1889]

In [5]:
#years = [1901,1902,1903,1904,1905,1906,1907,1908,1909,1910]

## The functions `scrape_bat` and `scrape_pitch` were used to obtain batting and pitching statistics from baseball-reference.com for the 1876-2018 seasons using BeautifulSoup.

In [6]:
def scrape_bat(year):
    """Scrape one season of team batting stats and save them as a CSV file.

    Downloads the baseball-reference "standard batting" page for ``year``,
    builds a pandas DataFrame from the text of every ``<td>`` cell found in
    the page's ``<tr>`` rows, labels the columns with the hard-coded batting
    headers, and writes the result to ``MLB_BAT_<year>.csv`` in the current
    working directory. The output columns are ``Tm``, the batting stats, and
    ``Year``.

    Parameters
    ----------
    year : int or str
        Season to scrape; substituted into the page URL and the file name.

    Returns
    -------
    None
        The function works through side effects (HTTP request, printed
        status message, CSV file). If the site does not answer with HTTP 200
        a message is printed and nothing is written.

    Raises
    ------
    ValueError
        If the number of scraped team names does not match the number of
        data rows (or if the scraped table does not have the expected number
        of columns, raised by pandas when the headers are assigned).
    """
    # Table headers for batting stats on baseball-reference, hard-coded.
    # The team column is not listed here: it is a <th> containing a link,
    # not a <td>, so it is collected separately below.
    head = ['#Bat', 'BatAge', 'R/G', 'G', 'PA', 'AB', 'R', 'H',
            '2B', '3B', 'HR', 'RBI', 'SB', 'CS', 'BB', 'SO', 'BA', 'OBP',
            'SLG', 'OPS', 'OPS+', 'TB', 'GDP', 'HBP', 'SH', 'SF', 'IBB',
            'LOB']
    url_name = 'https://www.baseball-reference.com/leagues/MLB/{}-standard-batting.shtml'
    url = url_name.format(year)

    response = requests.get(url)
    if response.status_code != 200:
        # Do not try to parse an error page: it would only fail further down
        # with a confusing column-length error.
        print("Sorry unable to connect to site")
        return
    print("Success")

    # "html" lets BeautifulSoup pick whichever HTML parser is installed.
    soup = BeautifulSoup(response.text, "html")
    trow = soup.find_all('tr')

    # Team names: the text of every link found inside a table row.
    teams = [a.text for tr in trow for a in tr.find_all('a')]

    # Table data: one list of cell texts per <tr>; rows without any <td>
    # (e.g. the header row) become empty lists.
    rows = [[cell.text for cell in tr.find_all('td')] for tr in trow]

    df = pd.DataFrame.from_records(rows)
    df.columns = head
    # Remove the non-team rows, in this exact order:
    #   1. the second-to-last row (presumably the league-average row -- TODO confirm),
    #   2. any row containing missing values (rows that had no <td> cells),
    #   3. the last remaining row (presumably the league totals -- TODO confirm).
    df = df.drop(df.index[-2])
    df = df.dropna()
    # .copy() so the column assignments below act on an independent frame
    # rather than on a slice (avoids SettingWithCopyWarning).
    df = df.iloc[:-1].copy()

    if len(teams) != len(df):
        # Happens when the page yields extra anchors per row (e.g. duplicated
        # team links); fail with a message that says what went wrong.
        raise ValueError(
            "Scraped {} team names but {} stat rows for {}".format(
                len(teams), len(df), year))

    # Team goes first, the season goes last.
    df.insert(0, 'Tm', teams)
    df['Year'] = year

    # Save the dataframe for that year as a csv file.
    file_name = "MLB_BAT_{}.csv"
    df.to_csv(file_name.format(year), index=False)

In [7]:
def scrape_pitch(year):
    """Scrape one season of team pitching stats and save them as a CSV file.

    Downloads the baseball-reference "standard pitching" page for ``year``,
    builds a pandas DataFrame from the text of every ``<td>`` cell found in
    the page's ``<tr>`` rows, labels the columns with header texts scraped
    from the page's ``<th>`` elements, and writes the result to
    ``MLB_Pitch_<year>.csv`` in the current working directory. The output
    columns are ``Tm``, the pitching stats (wins and losses among them), and
    ``Year``.

    Diagnostic information (row count, scraped headers, header count, team
    names) is printed along the way.

    Parameters
    ----------
    year : int or str
        Season to scrape; substituted into the page URL and the file name.

    Returns
    -------
    None
        The function works through side effects (HTTP request, printed
        output, CSV file). If the site does not answer with HTTP 200 a
        message is printed and nothing is written.

    Raises
    ------
    ValueError
        If the number of scraped team names does not match the number of
        data rows (or if the scraped table does not have 34 stat columns,
        raised by pandas when the headers are assigned).
    """
    url_name = 'https://www.baseball-reference.com/leagues/MLB/{}-standard-pitching.shtml'
    url = url_name.format(year)

    response = requests.get(url)
    if response.status_code != 200:
        # Do not try to parse an error page.
        print("Sorry unable to connect to site")
        return
    print("Success")

    # "html" lets BeautifulSoup pick whichever HTML parser is installed.
    soup = BeautifulSoup(response.text, "html")

    # Extract table rows: one list of cell texts per <tr>; rows without any
    # <td> (e.g. the header row) become empty lists.
    trow = soup.find_all('tr')
    rows = [[cell.text for cell in tr.find_all('td')] for tr in trow]
    print(len(rows))

    # Team names: the text of every link found inside a table row.
    teams = [a.text for tr in trow for a in tr.find_all('a')]

    df = pd.DataFrame.from_records(rows)
    # Drop the second-to-last row (the 'LgAvg' row in the scraped output),
    # then any row with missing values (rows that had no <td> cells).
    df = df.drop(df.index[-2])
    df = df.dropna()

    # Header texts: every <th> on the page. This includes the row-label
    # cells (team abbreviations, 'LgAvg', ''), hence the slicing below.
    thead = [h.text for h in soup.find_all('th')]
    print(thead)
    print(len(thead))
    print(teams)

    # thead[0] is 'Tm' (added separately from `teams`); the next 34 entries
    # are the stat column names.
    df.columns = thead[1:35]
    # Drop the last row (summary row, not a team). .copy() so the column
    # assignments below act on an independent frame rather than on a slice.
    df = df.iloc[:-1].copy()

    if len(teams) != len(df):
        # Happens when the page yields extra anchors per row (e.g. duplicated
        # team links); fail with a message that says what went wrong.
        raise ValueError(
            "Scraped {} team names but {} stat rows for {}".format(
                len(teams), len(df), year))

    # Season goes last, team goes first.
    df['Year'] = year
    df.insert(0, 'Tm', teams)
    # reset_index returns a new frame; the result must be kept.
    df = df.reset_index(drop=True)

    # Save the dataframe for that year as a csv file.
    file_name = "MLB_Pitch_{}.csv"
    df.to_csv(file_name.format(year), index=False)

In [8]:
def scrape_teams(year):
    """Fetch the league summary page for a season and print all its links.

    Exploratory helper: it downloads
    ``https://www.baseball-reference.com/leagues/MLB/<year>.shtml`` and
    prints every ``<a>`` tag found on the page. It does not build a
    DataFrame or write any file.

    TODO: extracting team names from the anchors is not implemented yet.

    Parameters
    ----------
    year : int or str
        Season to look up; substituted into the page URL.

    Returns
    -------
    None
        Output is printed. If the site does not answer with HTTP 200 a
        message is printed and the page is not parsed.
    """
    url_name = 'https://www.baseball-reference.com/leagues/MLB/{}.shtml'
    url = url_name.format(year)

    response = requests.get(url)
    if response.status_code != 200:
        # Nothing useful to list on an error page.
        print("Sorry unable to connect to site")
        return
    print("Success")

    # "html" lets BeautifulSoup pick whichever HTML parser is installed.
    soup = BeautifulSoup(response.text, "html")
    # find_all is the current name of the legacy findAll alias.
    links = soup.find_all('a')

    print(links)
    

## Pitching Stats

In [9]:
# Scrape for pitchings stats
#years = [1901,1902,1903,1904,1905,1906,1907,1908,1909,1910]
#years = [1880]

# One request per season, pausing between requests to go easy on the server.
for year in years:
    scrape_pitch(year)
    time.sleep(25)

print("DONE!")

Success
11
['Tm', '#P', 'PAge', 'RA/G', 'W', 'L', 'W-L%', 'ERA', 'G', 'GS', 'GF', 'CG', 'tSho', 'cSho', 'SV', 'IP', 'H', 'R', 'ER', 'HR', 'BB', 'IBB', 'SO', 'HBP', 'BK', 'WP', 'BF', 'ERA+', 'FIP', 'WHIP', 'H9', 'HR9', 'BB9', 'SO9', 'SO/W', 'BSN', 'BUF', 'CHC', 'CLV', 'DTN', 'PRO', 'TRO', 'WOR', 'LgAvg', '']
45
['BSN', 'BUF', 'CHC', 'CLV', 'DTN', 'PRO', 'TRO', 'WOR']
Success
17
['Tm', '#P', 'PAge', 'RA/G', 'W', 'L', 'W-L%', 'ERA', 'G', 'GS', 'GF', 'CG', 'tSho', 'cSho', 'SV', 'IP', 'H', 'R', 'ER', 'HR', 'BB', 'IBB', 'SO', 'HBP', 'BK', 'WP', 'BF', 'ERA+', 'FIP', 'WHIP', 'H9', 'HR9', 'BB9', 'SO9', 'SO/W', 'BAL', 'BSN', 'BUF', 'CHC', 'CIN', 'CLV', 'DTN', 'LOU', 'PHA', 'PIT', 'PRO', 'STL', 'TRO', 'WOR', 'LgAvg', '']
51
['BAL', 'BSN', 'BUF', 'CHC', 'CIN', 'CLV', 'DTN', 'LOU', 'PHA', 'PIT', 'PRO', 'STL', 'TRO', 'WOR']
Success
19
['Tm', '#P', 'PAge', 'RA/G', 'W', 'L', 'W-L%', 'ERA', 'G', 'GS', 'GF', 'CG', 'tSho', 'cSho', 'SV', 'IP', 'H', 'R', 'ER', 'HR', 'BB', 'IBB', 'SO', 'HBP', 'BK', 'WP', 'B

## Note: Too many anchor tags (hrefs) are being scraped — in this case twice as many, so each team name is appended twice to the list called `teams`. Note: the problem seems to be corrected by using the new URL for batting stats.

## Batting Stats

In [None]:
# scrape for batting stats
#years = [x for x in range(1891,1900,1)]
#years
#years = [1880]
# One request per season, pausing between requests to go easy on the server.
for year in years:
    scrape_bat(year)
    time.sleep(20)

print("DONE!")

In [None]:
####################################################################################################################

In [None]:
# Scrape for pitchings stats
#years = [1901,1902,1903,1904,1905,1906,1907,1908,1909,1910]
#years = [1960]

#for year in years:
    # scrape table for each season
    #scrape_pitch(year)
    
    #time.sleep(15)
    
#print("DONE!")

## RUN ABOVE CODE FROM HERE TO SCRAPE PITCHING & BATTING STATS FROM BASEBALL_REFERENCE WEB SITE

In [None]:
print("DONE!")

In [None]:
# Scrape the league summary page with scrape_teams (loop currently disabled)
#years = [1990,1991,1992,1993,1994,1995,1996,1997,1998,1999]
years = [1960]
#for year in years:
    #scrape_teams(year)
   
    #time.sleep(15)
    
print("DONE!")