# Web Scraping from Baseball Reference

In [1]:
import requests, bs4
from bs4 import BeautifulSoup as bs
from bs4 import Comment
import pandas as pd
import time
import random
import pickle

## Section 1: Web Scraping for one page

In [11]:
base_url = 'https://www.baseball-reference.com/leagues/majors/1980.shtml'
#url_list = [base_url + str(i) for i in range(1980,2022)]
response = requests.get(base_url)
status = response.status_code
if status == 200:
    page = response.text
    soup = bs(page)
    # Some content is in the comment section for some reasons
    comments = soup.find_all(string=lambda text:isinstance(text,Comment))
    
else:
  print(f"Oops! Received status code {status}")

# the pitching and fielding tables are in the comments section
for comment in comments:
    if 'teams_standard_pitching' in comment:
        table_pitching = bs(comment, "html.parser")
    if 'teams_standard_fielding' in comment:
        table_fielding = bs(comment, "html.parser")

In [14]:
# initiate an empty dictionary to store all batting statistics
batting_table = {}

# for batting statistics
for item in soup.find_all(attrs={'class': 'left', 'scope': 'row', 'data-stat': 'team_name'}):
    
    # to get rid of the last row "league average"
    if 'League Average' in item.parent.text:
        break
    
    team = item.parent.find('a').text
    #print(team)
    stat = {}
    stat['year'] = 1980
    for row in item.parent.find_all('td'):
        stat_name = 'b_' + row['data-stat']
        stat_number = row.text
        stat[stat_name] = stat_number
        #print(stat)
    batting_table[team] = stat

# convert to data frame
batting = pd.DataFrame.from_dict(batting_table, orient='index')

# initiate an empty dictionary to store all pitching statistics
pitching_table = {}

# for pitching statistics
for item in table_pitching.find_all(attrs={'class': 'left', 'scope': 'row', 'data-stat': 'team_name'}):
    
    # to get rid of the last row "league average"
    if 'League Average' in item.parent.text:
        break
    
    team = item.parent.find('a').text
    #print(team)
    stat = {}
    for row in item.parent.find_all('td'):
        stat_name = 'p_' + row['data-stat']
        stat_number = row.text
        stat[stat_name] = stat_number
        #print(stat)
    pitching_table[team] = stat

# convert to data frame
pitching = pd.DataFrame.from_dict(pitching_table, orient='index')

# initiate an empty dictionary to store all fielding statistics
fielding_table = {}

# for fielding statistics
for item in table_fielding.find_all(attrs={'class': 'left', 'scope': 'row', 'data-stat': 'team_name'}):
    
    # to get rid of the last row "league average"
    if 'League Average' in item.parent.text:
        break
       
    team = item.parent.find('a').text
    #print(team)
    stat = {}
    for row in item.parent.find_all('td'):
        stat_name = 'f_' + row['data-stat']
        stat_number = row.text
        stat[stat_name] = stat_number
        #print(stat)
    fielding_table[team] = stat

# convert to data frame
fielding = pd.DataFrame.from_dict(fielding_table, orient='index')

# Merge tables horizontally
statistics = pd.concat([batting, pitching, fielding], axis=1)

## Section 2: Web Scraping Pipeline

In [28]:
base_url = 'https://www.baseball-reference.com/leagues/majors/'
url_ending = '.shtml'

# Store yearly data frames in a list to be concatenated later
results = []

# year 1980 to 2021
for i in range(1980, 2022):
    url = base_url + str(i) + url_ending
    time.sleep(1 + 2 * random.random())
    print(url)
    
    response = requests.get(url)
    status = response.status_code
    if status == 200:
        page = response.text
        soup = bs(page)
        # Some content is in the comment section for some reasons
        comments = soup.find_all(string=lambda text:isinstance(text,Comment))
        # the pitching and fielding tables are in the comments section
        for comment in comments:
            if 'teams_standard_pitching' in comment:
                table_pitching = bs(comment, "html.parser")
            if 'teams_standard_fielding' in comment:
                table_fielding = bs(comment, "html.parser")
    else:
      print(f"Oops! Received status code {status}")

    # Start parsing for statistics    
    # initiate an empty dictionary to store all batting statistics
    batting_table = {}
    
    # Get batting statistics
    for item in soup.find_all(attrs={'class': 'left', 'scope': 'row', 'data-stat': 'team_name'}):

        # to get rid of the last row "league average"
        if 'League Average' in item.parent.text:
            break

        team = item.parent.find('a').text
        stat = {}
        # insert the year in the dictionary
        stat['year'] = i
        for row in item.parent.find_all('td'):
            stat_name = 'b_' + row['data-stat']
            stat_number = row.text
            stat[stat_name] = stat_number
        batting_table[team] = stat

    # convert to data frame
    batting = pd.DataFrame.from_dict(batting_table, orient='index')

    # initiate an empty dictionary to store all pitching statistics
    pitching_table = {}

    # Get pitching statistics
    for item in table_pitching.find_all(attrs={'class': 'left', 'scope': 'row', 'data-stat': 'team_name'}):

        # to get rid of the last row "league average"
        if 'League Average' in item.parent.text:
            break

        team = item.parent.find('a').text
        stat = {}
        for row in item.parent.find_all('td'):
            stat_name = 'p_' + row['data-stat']
            stat_number = row.text
            stat[stat_name] = stat_number
        pitching_table[team] = stat

    # convert to data frame
    pitching = pd.DataFrame.from_dict(pitching_table, orient='index')

    # initiate an empty dictionary to store all fielding statistics
    fielding_table = {}

    # Get fielding statistics
    for item in table_fielding.find_all(attrs={'class': 'left', 'scope': 'row', 'data-stat': 'team_name'}):

        # to get rid of the last row "league average"
        if 'League Average' in item.parent.text:
            break

        team = item.parent.find('a').text
        stat = {}
        for row in item.parent.find_all('td'):
            stat_name = 'f_' + row['data-stat']
            stat_number = row.text
            stat[stat_name] = stat_number
        fielding_table[team] = stat

    # convert to data frame
    fielding = pd.DataFrame.from_dict(fielding_table, orient='index')

    # Merge tables horizontally
    statistics = pd.concat([batting, pitching, fielding], axis=1)
    
    # Put one year of statistics in our results list
    results.append(statistics)

https://www.baseball-reference.com/leagues/majors/1980.shtml
https://www.baseball-reference.com/leagues/majors/1981.shtml
https://www.baseball-reference.com/leagues/majors/1982.shtml
https://www.baseball-reference.com/leagues/majors/1983.shtml
https://www.baseball-reference.com/leagues/majors/1984.shtml
https://www.baseball-reference.com/leagues/majors/1985.shtml
https://www.baseball-reference.com/leagues/majors/1986.shtml
https://www.baseball-reference.com/leagues/majors/1987.shtml
https://www.baseball-reference.com/leagues/majors/1988.shtml
https://www.baseball-reference.com/leagues/majors/1989.shtml
https://www.baseball-reference.com/leagues/majors/1990.shtml
https://www.baseball-reference.com/leagues/majors/1991.shtml
https://www.baseball-reference.com/leagues/majors/1992.shtml
https://www.baseball-reference.com/leagues/majors/1993.shtml
https://www.baseball-reference.com/leagues/majors/1994.shtml
https://www.baseball-reference.com/leagues/majors/1995.shtml
https://www.baseball-ref

## Section 3: Merge all results and export to a pickle file

In [29]:
result = pd.concat(results, ignore_index=True)

# Change types of all columns to float
result = result.astype(float, errors='ignore')

# Pickle the result
with open('data_v2.pickle', 'wb') as f:
    pickle.dump(result, f)

result.shape

(1198, 82)

In [30]:
result.dtypes

year                                float64
b_batters_used                      float64
b_age_bat                           float64
b_runs_per_game                     float64
b_G                                 float64
                                     ...   
f_tz_runs_total                     float64
f_tz_runs_total_per_season          float64
f_bis_runs_total_team                object
f_bis_runs_total_per_season_team     object
f_bis_runs_good_plays                object
Length: 82, dtype: object