In [1]:
import pandas as pd
import numpy as np
from batting_order import *
import warnings
warnings.filterwarnings('ignore')

This Jupyter notebook checks the quality of the plate appearance stats scraped from Baseball-Reference's play-by-play data.  
The validation datasets come from Baseball-Reference's season pages.  
Example: https://www.baseball-reference.com/leagues/MLB/2012.shtml

In [2]:
# Load one validation CSV and one scraped CSV per season, keyed by season year
seasons = list(range(2011, 2019))

season_val_dict = {}
for season in seasons:
    season_val_dict[season] = pd.read_csv(f'season_val/{season}.csv')

season_scraped_dict = {}
for season in seasons:
    season_scraped_dict[season] = pd.read_csv(f'season_scraped/{season}.csv')

## Step 1: Cross-validate Scraped Data vs. Baseball-Reference's Season Page

In [3]:
# Cross validate: compare each team's scraped PA total against the PA total
# from the validation dataset, one season at a time.
for season in seasons:
    scraped_totals = season_scraped_dict[season].groupby(by="Team").PA.sum().reset_index()
    published_totals = season_val_dict[season][["Tm", "PA"]].rename(
        columns={"Tm": "Team", "PA": "PA_val"}
    )
    # Left merge (instead of the default inner merge) keeps every scraped team.
    # A team without a validation row gets NaN for PA_val and diff; NaN != 0 is
    # True, so it is reported as an anomaly rather than silently dropped.
    team_merge = scraped_totals.merge(published_totals, on="Team", how="left")
    team_merge["diff"] = team_merge.PA - team_merge.PA_val
    # Check if number of teams for each season is 30
    print(f'{season}: {len(set(season_scraped_dict[season].Team.values))} Teams')
    # Check if the total difference in PA for a season is 0
    print(f'Total Difference: {team_merge["diff"].sum()}')
    # Print out teams in a year that have a non-zero (or missing) difference
    anomalies = team_merge[team_merge['diff'] != 0].values
    for anomaly in anomalies:
        print(anomaly)
    print()

2011: 30 Teams
Total Difference: 0

2012: 30 Teams
Total Difference: -1
['MIL' 6224 6225 -1]

2013: 30 Teams
Total Difference: 0

2014: 30 Teams
Total Difference: 0

2015: 30 Teams
Total Difference: 0

2016: 30 Teams
Total Difference: 0

2017: 30 Teams
Total Difference: 0

2018: 30 Teams
Total Difference: 0



## Step 2: Cross-validate Scraped Data vs. Baseball-Reference's Game Pages

According to the results above, there is one team (the 2012 Milwaukee Brewers) that has a non-zero difference when cross-validating the scraped data against Baseball-Reference's season page.  
For the next step of the data quality check, use the scraper to get the team's total plate appearances for each game and sum them up to see whether the numbers are correct.

In [4]:
# Teams to re-check game by game, keyed by the team abbreviation used in the
# team page URL. Each value is [season, team title].
anomaly_dict = {
    'MIL': ['2012', 'Milwaukee Brewers'],
}

In [5]:
# For each team in anomaly_dict, walk its game pages, add up the per-game PA
# values returned by the scraper helpers, and print the season total.
for key, (season, team_title) in anomaly_dict.items():
    team_page_url = f"https://www.baseball-reference.com/teams/{key}/{season}.shtml"
    game_pages = get_game_page(team_page_url)
    print(f'{key} {season}')
    sum_pa = 0
    count = 0  # stays 0 when no game pages are returned
    for count, game_page in enumerate(game_pages, start=1):
        _, pa_table = get_info(game_page, team_title)
        sum_pa += get_val_pa(pa_table)
        # Progress message every 30 games
        if not count % 30:
            print(f"{count} Games Done")
    print(f'{count} Games in Total')
    print(f'Total PA: {sum_pa}\n')

MIL 2012
30 Games Done
60 Games Done
90 Games Done
120 Games Done
150 Games Done
162 Games in Total
Total PA: 6224



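After scraping the actual plate appearances from every game page of the 2012 Milwaukee Brewers, the total PA is 6224, which matches the total scraped from the play-by-play tables. The one-PA difference found in Step 1 therefore comes from the season page's figure (6225), not from the scraper.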