<a href="https://colab.research.google.com/github/timseymour42/MLB-Build-a-Team/blob/main/Automated_Data_Collection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from bs4 import BeautifulSoup
import datetime
import pandas as pd
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from google.colab import files
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import KFold
from sklearn import linear_model
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import plotly
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier

1.) CSV containing games from 2015+ (1 for all stats)
2.) CSV with all player stats by season
3.) CSV with all team stats by season

# Creating CSV with all games from 2015+

Scraping team data from 2015 and later

In [22]:
def date_to_str(date):
  '''
  Args:
        date (datetime): datetime object for the day of the season

  Returns:
        str: string representation of the given date

  '''
  month = str(date.month)
  day = str(date.day)
  if date.day <= 9:
    day = str(0) + day
  if date.month <= 9:
    month = str(0) + month
  return str(date.year) + '-' + month + '-' + day

Scraping process takes ~20 minutes; CSV stored for convenience

In [1]:
def collect_team_data():
  '''
    Scrapes FanGraphs data from each day between April 1, 2015 and today's date

    Returns:
        hit (pd.DataFrame) contains hitting stats with each record representing one game for a team
        pit (pd.DataFrame) contains pitching stats with each record representing one game for a team
'''
  # beginning of sample is 2015
  first_date = datetime.datetime(year = 2015, month = 4, day = 1)
  # When date reaches last date, date resets to first_date (plus one year)
  last_date = datetime.datetime(year = 2015, month = 10, day = 3)
  date = datetime.datetime(year = 2015, month = 4, day = 1)
  # collects team hitting stats for each day
  hit = pd.DataFrame()
  # collects team pitching stats for each day
  pit = pd.DataFrame()
  # sustainable way of changing year without change in code
  while (date < datetime.datetime.now()):
      date_str = date_to_str(date)
      # scrape hitting data
      hit_df = pd.read_html(f'https://www.fangraphs.com/leaders.aspx?pos=all&stats=bat&lg=all&qual=0&type=8&season={date.year}&month=1000&season1={date.year}&ind=0&team=0%2Cts&rost=0&age=0&filter=&players=0&startdate={date_str}&enddate={date_str}')
      # getting rid of the final row with non-numeric data
      hit_df = hit_df[16][:-1]
      hit_df[('temp', 'Date')] = date_str
      hit_df.columns = hit_df.columns.droplevel(0)
      if len(hit_df['#']) > 1:
        hit = hit.append(hit_df)
      # scrape pitching data
      pit_df = pd.read_html(f'https://www.fangraphs.com/leaders.aspx?pos=all&stats=pit&lg=all&qual=0&type=8&season={date.year}&month=1000&season1={date.year}&ind=0&team=0%2Cts&rost=0&age=0&filter=&players=0&startdate={date_str}&enddate={date_str}')
      # getting rid of the final row with non-numeric data
      pit_df = pit_df[16][:-1]
      pit_df[('temp', 'Date')] = date_str
      pit_df.columns = pit_df.columns.droplevel(0)
      if len(pit_df['#']) > 1:
        pit = pit.append(pit_df)
      if (date < last_date):
        date += datetime.timedelta(days = 1)
      else:
        print(date.year)
        last_date = datetime.datetime(year = last_date.year + 1, month = last_date.month, day = last_date.day)
        first_date = datetime.datetime(year = first_date.year + 1, month = first_date.month, day = first_date.day)
        date = first_date
  return hit, pit

In [None]:
# CODE USED FOR INITIAL SCRAPING

# hit, pit = collect_team_data()

In [20]:
def collect_new_team_data(df):
  '''
    Scrapes FanGraphs data from each day the most recent record scraped and today's date

    Returns:
        hit (pd.DataFrame) contains hitting stats with each record representing one game for a team
        pit (pd.DataFrame) contains pitching stats with each record representing one game for a team
'''
  recent_record = datetime.datetime.strptime(df['Date'].max(), '%Y-%m-%d')
  # beginning of sample is the most recent day data was collected
  date = recent_record + datetime.timedelta(days = 1)
  # collects team hitting stats for each day
  hit = pd.DataFrame()
  # collects team pitching stats for each day
  pit = pd.DataFrame()
  # sustainable way of changing year without change in code
  while (date < datetime.datetime.now()):
      date_str = date_to_str(date)
      # scrape hitting data
      hit_df = pd.read_html(f'https://www.fangraphs.com/leaders.aspx?pos=all&stats=bat&lg=all&qual=0&type=8&season={date.year}&month=1000&season1={date.year}&ind=0&team=0%2Cts&rost=0&age=0&filter=&players=0&startdate={date_str}&enddate={date_str}')
      # getting rid of the final row with non-numeric data
      hit_df = hit_df[16][:-1]
      hit_df[('temp', 'Date')] = date_str
      hit_df.columns = hit_df.columns.droplevel(0)
      if len(hit_df['#']) > 1:
        hit = hit.append(hit_df)
      # scrape pitching data
      pit_df = pd.read_html(f'https://www.fangraphs.com/leaders.aspx?pos=all&stats=pit&lg=all&qual=0&type=8&season={date.year}&month=1000&season1={date.year}&ind=0&team=0%2Cts&rost=0&age=0&filter=&players=0&startdate={date_str}&enddate={date_str}')
      # getting rid of the final row with non-numeric data
      pit_df = pit_df[16][:-1]
      pit_df[('temp', 'Date')] = date_str
      pit_df.columns = pit_df.columns.droplevel(0)
      if len(pit_df['#']) > 1:
        pit = pit.append(pit_df)
      date += datetime.timedelta(days = 1)
  return hit, pit

In [None]:
#CODE USED FOR INITIAL SCRAPING

# pit.drop(columns = ['G'], inplace = True)
# # Joining hitting and pitching dataframes on team and date
# all_stats = pd.merge(hit, pit, left_on = ['Team', 'Date'], right_on = ['Team', 'Date'], how = 'inner')
# # Excludes data from days where team played a double header
# all_stats = all_stats[all_stats.GS == '1']
# all_stats.to_csv('daily_game_stats.csv') 
# files.download('daily_game_stats.csv')

In [17]:
all_stats = pd.read_csv('https://github.com/timseymour42/MLB-Build-a-Team/blob/a3774339cb04887ba2026cab07c2923b27422b60/daily_stats%20(2).csv?raw=true', header = 0, index_col = 0)

In [18]:
all_stats

Unnamed: 0,#_x,Team,G,PA,HR,R,RBI,SB,BB%,K%,...,BABIP_y,LOB%,GB%,HR/FB,vFA (pi),ERA,xERA,FIP,xFIP,WAR_y
0,1,STL,14,40,0,3,3,4,10.0%,27.5%,...,0.250,100.0%,50.0%,0.0%,92.6,0.00,,1.13,1.79,0.4
1,2,CHC,16,34,0,0,0,1,5.9%,35.3%,...,0.400,78.6%,40.0%,0.0%,93.2,3.00,,2.02,3.33,0.2
2,1,KCR,13,42,2,10,9,2,11.9%,7.1%,...,0.167,100.0%,56.0%,14.3%,95.3,1.00,,4.91,4.62,0.0
3,2,BOS,13,42,5,8,8,1,16.7%,21.4%,...,0.136,100.0%,52.4%,0.0%,93.6,0.00,,1.80,2.62,0.4
4,3,COL,15,44,2,10,10,0,2.3%,13.6%,...,0.333,100.0%,33.3%,0.0%,90.6,0.00,,1.47,2.62,0.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33354,25,HOU,12,33,0,1,1,0,3.0%,24.2%,...,0.357,63.6%,42.9%,0.0%,94.0,4.50,,1.99,3.66,0.3
33356,27,SDP,14,36,0,3,3,0,5.6%,19.4%,...,0.276,43.5%,19.4%,11.1%,94.5,6.75,,6.11,6.21,-0.1
33357,28,MIA,13,32,0,0,0,0,3.1%,15.6%,...,0.391,77.6%,29.2%,10.0%,94.3,4.50,,3.61,3.84,0.1
33358,29,NYY,14,31,0,0,0,0,9.7%,19.4%,...,0.227,85.7%,27.3%,0.0%,94.4,1.13,,2.61,4.10,0.2


In [23]:
collect_new_team_data(all_stats)

(Empty DataFrame
 Columns: []
 Index: [], Empty DataFrame
 Columns: []
 Index: [])

In [27]:
# Need to load in CSV identify the most recent date, scrape from most recent date to today, append
all_stats = pd.read_csv('https://github.com/timseymour42/MLB-Build-a-Team/blob/a3774339cb04887ba2026cab07c2923b27422b60/daily_stats%20(2).csv?raw=true', header = 0, index_col = 0)
hit, pit = collect_new_team_data(all_stats)
if len(pit) > 0:
  pit.drop(columns = ['G'], inplace = True)
  # Joining hitting and pitching dataframes on team and date
  new_stats = pd.merge(hit, pit, left_on = ['Team', 'Date'], right_on = ['Team', 'Date'], how = 'inner')
  all_stats = pd.concat(all_stats, new_stats)
  all_stats.drop_duplicates(inplace = True)
  all_stats.to_csv('https://github.com/timseymour42/MLB-Build-a-Team/blob/a3774339cb04887ba2026cab07c2923b27422b60/daily_stats%20(2).csv')

# Scraping Player Data

In [28]:
# beginning of sample is 1900
year = 1900
wrc = pd.DataFrame()
pitch = pd.DataFrame()
field = pd.DataFrame()
# sustainable way of changing year without change in code
while year < datetime.datetime.now().year + 1:
    for num in range(int(pd.read_html(f'https://www.fangraphs.com/leaders.aspx?pos=all&stats=bat&lg=all&qual=0&type=8&season={year}&month=0&season1={year}&ind=0&team=0&rost=0&age=0&filter=&players=0&startdate=&enddate=&page=1_50')[16].columns[0][0][-8:-6].strip())):
        # scrape hitting data
        if (num < 1):
            temp = pd.read_html(f'https://www.fangraphs.com/leaders.aspx?pos=all&stats=bat&lg=all&qual=0&type=8&season={year}&month=0&season1={year}&ind=0&team=0&rost=0&age=0&filter=&players=0&startdate=&enddate=&page={str(num + 1)}_50')[16][:-1]   
            temp.columns = temp.columns.droplevel(0)
            wrc_df = temp
        else:
            temp = (pd.read_html(f'https://www.fangraphs.com/leaders.aspx?pos=all&stats=bat&lg=all&qual=0&type=8&season={year}&month=0&season1={year}&ind=0&team=0&rost=0&age=0&filter=&players=0&startdate=&enddate=&page={str(num + 1)}_50')[16][:-1])
            temp.columns = temp.columns.droplevel(0)
            wrc_df = wrc_df.append(temp)
        # getting rid of the final row with non-numeric data above
    wrc_df['Season'] = year
    wrc = wrc.append(wrc_df)
    # scrape pitching data
    for num in range(int(pd.read_html(f'https://www.fangraphs.com/leaders.aspx?pos=all&stats=pit&lg=all&qual=0&type=8&season={year}&month=0&season1={year}&ind=0&team=0&rost=0&age=0&filter=&players=0&startdate={year}-01-01&enddate={year}-12-31&sort=21,d&page=1_50')[16].columns[0][0][-8:-6].strip())):
        # scrape hitting data
        if (num < 1):
            temp = pd.read_html(f'https://www.fangraphs.com/leaders.aspx?pos=all&stats=pit&lg=all&qual=0&type=8&season={year}&month=0&season1={year}&ind=0&team=0&rost=0&age=0&filter=&players=0&startdate={year}-01-01&enddate={year}-12-31&sort=21,d&page={str(num + 1)}_50')[16][:-1]   
            temp.columns = temp.columns.droplevel(0)
            pitch_df = temp
        else:
            temp = (pd.read_html(f'https://www.fangraphs.com/leaders.aspx?pos=all&stats=pit&lg=all&qual=0&type=8&season={year}&month=0&season1={year}&ind=0&team=0&rost=0&age=0&filter=&players=0&startdate={year}-01-01&enddate={year}-12-31&sort=21,d&page={str(num + 1)}_50')[16][:-1])
            temp.columns = temp.columns.droplevel(0)
            pitch_df = pitch_df.append(temp)

        # getting rid of the final row with non-numeric data above
    pitch_df['Season'] = year
    pitch = pitch.append(pitch_df)
    if year % 5 == 0:
      print(year)
    year+=1

1900
1905
1910
1915
1920
1925
1930
1935
1940
1945
1950
1955
1960
1965
1970
1975
1980
1985
1990
1995
2000
2005
2010
2015
2020


In [29]:
def string_to_num(string):
    if(type(string) == str):
        if('%' in string):
            string = string.replace('%', '')
    return float(string)

In [30]:
def clean_player_data(hit_df, pitch_df):
  '''
  function intended to make statistics numerical, manually calculate statistics, and set the indices to Name and Season

  Args:
    wrc (pd.DataFrame) contains individual player data by season
    pitch (pd.DataFrame) contains individual pitcher data by season

  Returns wrc, pitch as clean datasets for use in App'''

  # applying the function to each column to ensure all data points are numerical
  for col in hit_df.columns:
      if col not in ['Name', 'Team', 'Season', 'GB', 'Pos']:
          hit_df[col] = hit_df[col].apply(string_to_num)
  for col in pitch_df.columns:
      if col not in ['Name', 'Team', 'Season', 'GB']:
          pitch_df[col] = pitch_df[col].apply(string_to_num)
  #Determining home runs allowed for each player for easier calculation
  pitch_df['HR'] = pitch_df['HR/9'] * pitch_df['IP'] * 9
  #Determining total bases for each player for more accurate slugging percentage calculation
  # First must find at bats by subtracting walks using walk percentage
  # Calculation ignores HBP
  hit_df['AB'] = hit_df['PA'] * (1 - (hit_df['BB%'] * .01))
  # Calculation necessary for determining slugging percentage over multiple seasons
  hit_df['TB'] = hit_df['SLG'] * hit_df['AB']
  pitch_df.set_index(['Name', 'Season'], inplace = True)
  hit_df.set_index(['Name', 'Season'], inplace = True)
  return hit_df, pitch_df

In [31]:
hit_df, pitch_df = clean_player_data(wrc, pitch)

In [33]:
pitch_df

Unnamed: 0_level_0,Unnamed: 1_level_0,#,Team,W,L,SV,G,GS,IP,K/9,BB/9,...,LOB%,GB%,HR/FB,vFA (pi),ERA,xERA,FIP,xFIP,WAR,HR
Name,Season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Cy Young,1900,1.0,STL,19.0,19.0,0.0,41.0,35.0,321.1,3.22,1.01,...,63.5,,,,3.00,,2.73,,6.4,577.980
Noodles Hahn,1900,2.0,CIN,16.0,20.0,0.0,39.0,37.0,311.1,3.82,2.57,...,64.8,,,,3.27,,3.00,,5.3,335.988
Deacon Phillippe,1900,3.0,PIT,20.0,13.0,0.0,38.0,33.0,279.0,2.42,1.35,...,62.7,,,,2.84,,3.07,,4.2,577.530
Ed Scott,1900,4.0,CIN,17.0,20.0,1.0,42.0,35.0,315.0,2.49,1.86,...,59.0,,,,3.86,,3.31,,4.1,822.150
Bill Dineen,1900,5.0,BSN,20.0,14.0,0.0,40.0,37.0,320.2,3.00,2.95,...,63.8,,,,3.12,,3.59,,4.1,893.358
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Matt Swarmer,2022,774.0,CHC,2.0,3.0,0.0,11.0,5.0,34.0,9.53,5.29,...,74.6,35.1,27.3,90.6,5.03,5.59,7.35,4.68,-0.7,973.080
Sergio Romo,2022,775.0,- - -,0.0,1.0,0.0,23.0,0.0,18.0,7.00,3.00,...,59.2,17.2,20.0,85.2,7.50,5.24,7.61,5.45,-0.7,567.000
Joe Smith,2022,776.0,MIN,1.0,1.0,0.0,34.0,0.0,27.1,5.60,2.96,...,75.3,57.4,26.9,86.2,4.61,5.17,6.29,4.38,-0.8,560.970
Adam Oller,2022,777.0,OAK,1.0,5.0,0.0,13.0,8.0,43.2,5.98,4.95,...,63.3,30.3,15.0,93.9,7.63,6.60,7.14,6.29,-0.9,960.336


In [None]:
def add_new_player_data(hit_df, pit_df):
  # Setting up current CSV data to be appended to
  curr_year = hit_df['Season'].max()
  hit_df.drop(hit_df[hit_df.Season == curr_year], inplace = True)
  pit_df.drop(hit_df[pit_df.Season == curr_year], inplace = True)

  while curr_year < datetime.datetime.now().year + 1:
    for num in range(int(pd.read_html(f'https://www.fangraphs.com/leaders.aspx?pos=all&stats=bat&lg=all&qual=0&type=8&season={year}&month=0&season1={year}&ind=0&team=0&rost=0&age=0&filter=&players=0&startdate=&enddate=&page=1_50')[16].columns[0][0][-8:-6].strip())):
        # scrape hitting data
        if (num < 1):
            temp = pd.read_html(f'https://www.fangraphs.com/leaders.aspx?pos=all&stats=bat&lg=all&qual=0&type=8&season={year}&month=0&season1={year}&ind=0&team=0&rost=0&age=0&filter=&players=0&startdate=&enddate=&page={str(num + 1)}_50')[16][:-1]   
            temp.columns = temp.columns.droplevel(0)
            wrc_df = temp
        else:
            temp = (pd.read_html(f'https://www.fangraphs.com/leaders.aspx?pos=all&stats=bat&lg=all&qual=0&type=8&season={year}&month=0&season1={year}&ind=0&team=0&rost=0&age=0&filter=&players=0&startdate=&enddate=&page={str(num + 1)}_50')[16][:-1])
            temp.columns = temp.columns.droplevel(0)
            wrc_df = wrc_df.append(temp)
        # getting rid of the final row with non-numeric data above
    wrc_df['Season'] = year
    wrc = wrc.append(wrc_df)
    # scrape pitching data
    for num in range(int(pd.read_html(f'https://www.fangraphs.com/leaders.aspx?pos=all&stats=pit&lg=all&qual=0&type=8&season={year}&month=0&season1={year}&ind=0&team=0&rost=0&age=0&filter=&players=0&startdate={year}-01-01&enddate={year}-12-31&sort=21,d&page=1_50')[16].columns[0][0][-8:-6].strip())):
        # scrape hitting data
        if (num < 1):
            temp = pd.read_html(f'https://www.fangraphs.com/leaders.aspx?pos=all&stats=pit&lg=all&qual=0&type=8&season={year}&month=0&season1={year}&ind=0&team=0&rost=0&age=0&filter=&players=0&startdate={year}-01-01&enddate={year}-12-31&sort=21,d&page={str(num + 1)}_50')[16][:-1]   
            temp.columns = temp.columns.droplevel(0)
            pitch_df = temp
        else:
            temp = (pd.read_html(f'https://www.fangraphs.com/leaders.aspx?pos=all&stats=pit&lg=all&qual=0&type=8&season={year}&month=0&season1={year}&ind=0&team=0&rost=0&age=0&filter=&players=0&startdate={year}-01-01&enddate={year}-12-31&sort=21,d&page={str(num + 1)}_50')[16][:-1])
            temp.columns = temp.columns.droplevel(0)
            pitch_df = pitch_df.append(temp)
        # getting rid of the final row with non-numeric data above
    pitch_df['Season'] = year
    pitch = pitch.append(pitch_df)
    curr_year+=1
  hit_a, pitch_a = clean_player_data(wrc, pitch)
  hit_df = pd.concat(hit_df, hit_a)
  pit_df = pd.concat(pit_df, pitch_a)
  return hit_df, pit_df


In [40]:
a = hit_df.reset_index()
b = pitch_df.reset_index()
a[a.Season > 2015]

Unnamed: 0,Name,Season,#,Team,G,PA,HR,R,RBI,SB,...,SLG,wOBA,xwOBA,wRC+,BsR,Off,Def,WAR,AB,TB
78647,Mike Trout,2016,1.0,LAA,159.0,681.0,29.0,123.0,100.0,30.0,...,0.550,0.418,0.427,170.0,9.6,67.0,-6.9,8.5,565.230,310.876500
78648,Mookie Betts,2016,2.0,BOS,158.0,730.0,31.0,122.0,113.0,26.0,...,0.534,0.379,0.336,136.0,10.6,41.8,13.6,8.2,681.090,363.702060
78649,Kris Bryant,2016,3.0,CHC,155.0,699.0,39.0,121.0,102.0,8.0,...,0.554,0.396,0.386,148.0,7.1,48.7,6.3,7.9,624.207,345.810678
78650,Josh Donaldson,2016,4.0,TOR,155.0,700.0,37.0,122.0,99.0,7.0,...,0.549,0.403,0.403,157.0,0.0,47.6,-3.6,7.0,590.800,324.349200
78651,Robinson Cano,2016,5.0,SEA,161.0,715.0,39.0,107.0,103.0,0.0,...,0.533,0.370,0.348,139.0,-2.0,31.3,10.5,6.8,667.810,355.942730
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88297,Nick Castellanos,2022,1354.0,PHI,106.0,441.0,10.0,45.0,54.0,4.0,...,0.385,0.301,0.306,91.0,-3.9,-8.8,-17.4,-1.2,417.186,160.616610
88298,Robinson Chirinos,2022,1355.0,BAL,51.0,169.0,3.0,8.0,17.0,1.0,...,0.282,0.245,0.242,54.0,-0.6,-9.4,-7.7,-1.2,154.973,43.702386
88299,Yoshi Tsutsugo,2022,1356.0,PIT,50.0,193.0,2.0,11.0,19.0,0.0,...,0.229,0.221,0.255,38.0,-0.7,-14.6,-4.3,-1.3,174.086,39.865694
88300,Jonathan Villar,2022,1357.0,- - -,59.0,220.0,3.0,25.0,18.0,7.0,...,0.302,0.250,0.247,56.0,2.1,-9.0,-11.3,-1.3,205.040,61.922080


In [34]:
hit_df.to_csv('hitters_yearly.csv') 
pitch_df.to_csv('pitchers_yearly.csv')
files.download('hitters_yearly.csv')
files.download('pitchers_yearly.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Load in each CSV, clean
# Drop all records from current year, scrape all records from current year, append them
# try drop_duplicates
# Save each CSV to GitHub
hit_df = pd.read_csv('', header = 0)
pitch_df = pd.read_csv('', header = 0)
hit, pit = collect_new_team_data(all_stats)
if len(pit) > 0:
  
  all_stats.drop_duplicates(inplace = True)
  all_stats.to_csv('https://github.com/timseymour42/MLB-Build-a-Team/blob/a3774339cb04887ba2026cab07c2923b27422b60/daily_stats%20(2).csv')

# Collecting team data to compare model predictions to actual full season win totals

- key question is what model or combination of models minimizes error in predicting team success historically

In [None]:
def collect_team_data_yearly(year):

  '''
  Args:
    year (integer): year to start collecting data from
  Collecting team data to use as testing data
  '''
  
  wrc = pd.DataFrame()
  pitch = pd.DataFrame()
  field = pd.DataFrame()
  # sustainable way of changing year without change in code
  while year < datetime.datetime.now().year + 1:
      # scrape hitting data
      wrc_df = pd.read_html(f'https://www.fangraphs.com/leaders.aspx?pos=all&stats=bat&lg=all&qual=0&type=8&season={year}&month=0&season1={year}&ind=0&team=0,ts&rost=0&age=0&filter=&players=0&startdate=&enddate=')
      # getting rid of the final row with non-numeric data
      wrc_df = wrc_df[16][:-1]
      wrc_df[('temp', 'Season')] = year
      wrc_df.columns = wrc_df.columns.droplevel(0)
      wrc = pd.concat([wrc, wrc_df], axis = 0)
      # scrape pitching data
      pitch_df = pd.read_html(f'https://www.fangraphs.com/leaders.aspx?pos=all&stats=pit&lg=all&qual=0&type=8&season={year}&month=0&season1={year}&ind=0&team=0,ts&rost=0&age=0&filter=&players=0&startdate=&enddate=')
      # getting rid of the final row with non-numeric data
      pitch_df = pitch_df[16][:-1]
      pitch_df[('temp', 'Season')] = year
      pitch_df.columns = pitch_df.columns.droplevel(0)
      pitch = pd.concat([pitch, pitch_df], axis = 0)
      year += 1
  return wrc, pitch

w, p = collect_team_data_yearly(1900)

In [None]:
team_data = pd.merge(w, p, left_on = ['Season', 'Team'], right_on = ['Season', 'Team'], how = 'outer')

Ensuring seasons of more or less than 162 games are normalized

Decisions to make

- How many years should I use?
- Is correcting predictions based on historical trendlines a good idea?
- 

Changing each column to be numerical

In [None]:
# applying the function to each column to ensure all data points are numerical
for col in team_data.columns:
    if col not in ['Team', 'Season', 'GB']:
        team_data[col] = team_data[col].apply(string_to_num)

In [None]:
team_data['W'] = team_data['W'] * (162 / team_data['GS'])

In [None]:
# Sacing a copy of the scraped data 
saved_team_data = team_data.copy()

In [None]:
team_data = saved_team_data

In [None]:
# Load in each CSV
# Drop all records from current year, scrape all records from current year, append them
# try drop_duplicates
# Save each CSV to GitHub