In [8]:
#import key packages
import pandas as pd
import numpy as np
import requests
from datetime import datetime, timedelta
import random
pd.set_option('mode.chained_assignment',None)

In [9]:
# Date stamp (YYYY-MM-DD) for today; the zero-day offset is kept so a different day can be dialled in by hand.
pitcher_date = (datetime.today() - timedelta(days = 0)).strftime("%Y-%m-%d")
# FanGraphs pitching leaderboard, restricted through the `players=p<date>` query parameter.
fangraph_url = f"https://www.fangraphs.com/leaders.aspx?pos=all&stats=pit&lg=all&qual=10&type=8&season=2022&month=0&season1=2022&ind=0&team=0&rost=0&age=0&filter=&players=p{pitcher_date}&startdate=&enddate="

# Download the page and keep the first HTML table that mentions 'Name'.
probables_response = requests.get(fangraph_url)
probables_tables = pd.read_html(probables_response.content, match = 'Name')
pitchers = probables_tables[0]

# The scraped header is a MultiIndex; drop its outer level so columns are plain stat names.
pitchers.columns = pitchers.columns.droplevel()

# Preview: expected pitchers for each game together with their stats.
pitchers.head()

Unnamed: 0,#,Name,Team,W,L,SV,G,GS,IP,K/9,...,BABIP,LOB%,GB%,HR/FB,vFA (pi),ERA,xERA,FIP,xFIP,WAR
0,1,Sandy Alcantara,MIA,9,3,0,17,17,123.1,7.81,...,0.237,78.3%,56.5%,6.3%,97.8,1.82,2.54,2.85,3.38,3.4
1,2,Shohei Ohtani,LAA,8,4,0,14,14,81.0,12.33,...,0.284,83.1%,39.3%,10.5%,97.4,2.44,2.62,2.42,2.54,2.6
2,3,Nick Pivetta,BOS,8,6,0,17,17,100.1,8.52,...,0.269,74.5%,34.9%,8.3%,93.5,3.68,4.2,3.64,4.13,1.9
3,4,Jameson Taillon,NYY,9,2,0,16,16,89.1,7.46,...,0.303,76.3%,40.2%,9.7%,94.1,3.63,3.82,3.4,3.67,1.9
4,5,Taijuan Walker,NYM,7,2,0,14,14,78.2,6.98,...,0.281,76.1%,49.8%,5.9%,94.0,2.86,3.66,3.09,3.72,1.8


In [10]:
#create the dictionaries that will map each team acronym onto the same format
class MissingDict(dict):
    """A dict whose lookups fall back to the key itself when the key is absent.

    Useful as a translation table that leaves unknown team labels untouched
    instead of raising KeyError. Defined once here (it was previously
    re-declared before every dictionary with an identical body).
    """

    def __missing__(self, key):
        return key


# CBS Sports schedule labels -> canonical three-letter team codes.
cbs_acronyms = {
    'Arizona': 'ARI', 'L.A. Dodgers': 'LAD', 'N.Y. Mets': 'NYM', 'N.Y. Yankees': 'NYY', 'Tampa Bay': 'TBR',
    'Oakland': 'OAK', 'Baltimore': 'BAL', 'St. Louis': 'STL', 'Kansas City': 'KCR',
    'Milwaukee': 'MIL', 'Toronto': 'TOR', 'L.A. Angels': 'LAA', 'Boston': 'BOS', 'Seattle': 'SEA',
    'Pittsburgh': 'PIT', 'Miami': 'MIA', 'Cleveland': 'CLE', 'Texas': 'TEX', 'Atlanta': 'ATL', 'Chi. Cubs': 'CHC',
    'Chi. White Sox': 'CHW', 'Detroit': 'DET', 'Minnesota': 'MIN', 'Cincinnati': 'CIN', 'Philadelphia': 'PHI',
    'Washington': 'WSN', 'San Francisco': 'SFG', 'San Diego': 'SDP', 'Colorado': 'COL', 'Houston': 'HOU',
}
maps = MissingDict(cbs_acronyms)

# Full franchise names (as used by ESPN / Baseball-Reference tables) -> team codes.
# NOTE(review): the historical entries 'Montreal Expos' -> 'WAS' and
# 'Tampa Bay Devil Rays' -> 'TB' do not use the canonical 'WSN' / 'TBR' codes
# used everywhere else; left unchanged here -- confirm before using pre-2008 seasons.
espn_br_conversions = {
    'Arizona Diamondbacks': 'ARI', 'Los Angeles Dodgers': 'LAD', 'New York Mets': 'NYM', 'New York Yankees': 'NYY', 'Tampa Bay Rays': 'TBR',
    'Oakland Athletics': 'OAK', 'Baltimore Orioles': 'BAL', 'St. Louis Cardinals': 'STL', 'Kansas City Royals': 'KCR',
    'Milwaukee Brewers': 'MIL', 'Toronto Blue Jays': 'TOR', 'Los Angeles Angels': 'LAA', 'Boston Red Sox': 'BOS', 'Seattle Mariners': 'SEA',
    'Pittsburgh Pirates': 'PIT', 'Miami Marlins': 'MIA', 'Cleveland Indians': 'CLE', 'Cleveland Guardians': 'CLE', 'Texas Rangers': 'TEX', 'Atlanta Braves': 'ATL', 'Chicago Cubs': 'CHC',
    'Chicago White Sox': 'CHW', 'Detroit Tigers': 'DET', 'Minnesota Twins': 'MIN', 'Cincinnati Reds': 'CIN', 'Philadelphia Phillies': 'PHI',
    'Washington Nationals': 'WSN', 'San Francisco Giants': 'SFG', 'San Diego Padres': 'SDP', 'Colorado Rockies': 'COL', 'Houston Astros': 'HOU',
    'Florida Marlins': 'MIA', 'Montreal Expos': 'WAS', 'Anaheim Angels': 'LAA', 'Tampa Bay Devil Rays': 'TB',
}

# Short codes that appear in result strings -> canonical team codes.
# The original literal repeated several keys; in a dict literal the LAST
# occurrence wins, so a trailing 'TB': 'TB' silently replaced 'TB': 'TBR' and
# Tampa Bay winners never matched the 'TBR' code used for the Home/Away columns.
# Each key now appears exactly once.
winner_cons = {
    'ARI': 'ARI', 'LAD': 'LAD', 'NYM': 'NYM', 'NYY': 'NYY', 'TB': 'TBR',
    'OAK': 'OAK', 'BAL': 'BAL', 'STL': 'STL', 'KC': 'KCR',
    'MIL': 'MIL', 'TOR': 'TOR', 'LAA': 'LAA', 'BOS': 'BOS', 'SEA': 'SEA',
    'PIT': 'PIT', 'MIA': 'MIA', 'CLE': 'CLE', 'TEX': 'TEX', 'ATL': 'ATL', 'CHC': 'CHC',
    'CHW': 'CHW', 'DET': 'DET', 'MIN': 'MIN', 'CIN': 'CIN', 'PHI': 'PHI',
    'WAS': 'WSN', 'SF': 'SFG', 'SD': 'SDP', 'COL': 'COL', 'HOU': 'HOU',
}
# `mapping` used to be assigned twice (first from espn_br_conversions, then from
# winner_cons); only the second assignment was ever observable, so it is the one kept.
mapping = MissingDict(winner_cons)

In [11]:
#create the parameters used for the pitcher specific data
# Per-pitcher columns kept for each starter, in output order.
_pitcher_fields = [
    'Name', 'W', 'L', 'SV', 'G', 'GS', 'IP', 'K/9', 'BB/9', 'HR/9',
    'BABIP', 'LOB%', 'GB%', 'HR/FB', 'vFA (pi)',
    'ERA', 'xERA', 'FIP', 'xFIP', 'WAR',
]

# Columns selected from the merged home/away frame: game identifiers first,
# then the home starter's fields, the away starter's fields, and finally the score columns.
useful = (
    ['Home Away', 'Home Home', 'Home Winner', 'Home Loser']
    + ['Home ' + stat for stat in _pitcher_fields]
    + ['Away ' + stat for stat in _pitcher_fields]
    + ['Home Run Line', 'Home Winner Score', 'Home Loser Score']
)

In [12]:
import time

year = 2022

_TEAM_PAGE_TIMEOUT_S = 30   # fail instead of hanging forever on a stalled connection
_TEAM_PAGE_PAUSE_S = 0.25   # short pause after every request, as in the original cell
_N_TEAMS = 30               # only the first 30 rows of each Baseball-Reference table are kept
                            # (presumably the rows after them are league totals/averages -- TODO confirm)


def _download_html(url):
    """Return the raw body of `url`, raising on HTTP errors, then pause briefly.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status, so a
            blocked or rate-limited request is reported as such rather than as
            a confusing "No tables found" from pandas.
    """
    response = requests.get(url, timeout=_TEAM_PAGE_TIMEOUT_S)
    response.raise_for_status()
    time.sleep(_TEAM_PAGE_PAUSE_S)
    return response.content


def _espn_team_table(url):
    """Scrape an ESPN team-stats page into one frame with canonical team codes.

    The page is read as two tables (one matched on 'RK', one matched on 'GP');
    their columns are placed side by side and the 'Team' column is translated
    through `espn_br_conversions`.
    """
    html = _download_html(url)
    team_part = pd.read_html(html, match = 'RK')[0]
    stat_part = pd.read_html(html, match = 'GP')[0]
    table = pd.DataFrame()
    table[team_part.columns] = team_part
    table[stat_part.columns] = stat_part
    table['Team'] = table['Team'].map(espn_br_conversions)
    return table


def _bref_team_table(url):
    """Scrape the first table of a Baseball-Reference league page.

    Team names in 'Tm' are translated through `espn_br_conversions`, only rows
    with index below `_N_TEAMS` are kept, and 'Tm' is renamed to 'Team' so the
    frame can be joined with the ESPN tables.
    """
    table = pd.read_html(_download_html(url))[0]
    table['Tm'] = table['Tm'].map(espn_br_conversions)
    table = table[table.index < _N_TEAMS]
    return table.rename(columns = {"Tm" : "Team"})


# Team batting (ESPN).
hits = _espn_team_table(f"https://www.espn.com/mlb/stats/team/_/season/{year}/seasontype/2")

# Team standard pitching (Baseball-Reference).
pit_br = _bref_team_table(f"https://www.baseball-reference.com/leagues/majors/{year}-standard-pitching.shtml")

# Team fielding (ESPN).
field = _espn_team_table(f"https://www.espn.com/mlb/stats/team/_/view/fielding/season/{year}/seasontype/2")

# Team sabermetric batting (Baseball-Reference).
hit_saber = _bref_team_table(f"https://www.baseball-reference.com/leagues/majors/{year}-sabermetric-batting.shtml")

# Team win-probability pitching (Baseball-Reference).
pit_prob = _bref_team_table(f"https://www.baseball-reference.com/leagues/majors/{year}-win_probability-pitching.shtml")

In [15]:
# First calendar day scanned by the per-day scraping loop (7 April 2022).
season_start = datetime(year=2022, month=4, day=7)

In [13]:
_GAME_PAGE_TIMEOUT_S = 30  # fail instead of hanging forever on a stalled connection


def _parse_results(matches):
    """Return only the rows of `matches` whose 'Result' holds a final score.

    A result is expected to look like ``"<TEAM> <runs> - <TEAM> <runs>"``; the
    first team is taken as the winner and the second as the loser (same token
    positions the original parsing used -- assumes the site lists the winner first).
    Rows that do not match (e.g. ``"Postponed"`` or missing values) are dropped.

    Added columns: 'Winner', 'Loser' (strings), 'Run Line' (total runs),
    'Winner Score', 'Loser Score' (floats, matching the original dtype).
    """
    # Parse the cell value itself rather than the repr() of a one-row Series,
    # which is what produced artefacts such as 'Postponed\nName:'.
    parsed = matches['Result'].astype(str).str.extract(r'^\s*(\S+)\s+(\d+)\s*-\s*(\S+)\s+(\d+)')
    played = parsed.notna().all(axis=1)
    finished = matches.loc[played].copy()
    finished['Winner'] = parsed.loc[played, 0]
    finished['Loser'] = parsed.loc[played, 2]
    winner_runs = parsed.loc[played, 1].astype(float)
    loser_runs = parsed.loc[played, 3].astype(float)
    finished['Run Line'] = winner_runs + loser_runs
    finished['Winner Score'] = winner_runs
    finished['Loser Score'] = loser_runs
    return finished


# Season-level team statistics do not change from day to day, so the wide
# one-row-per-team table is built once instead of on every loop iteration.
team_stats = (
    hits.drop(columns = ['RK', 'GP'])
    .join(field.drop(columns = ['RK', 'GP']).set_index('Team').add_prefix('Field '), on = 'Team')
    .join(pit_br.set_index('Team').add_prefix('Pit_Br '), on = 'Team')
    .join(hit_saber.set_index('Team').add_prefix('Hit_Saber '), on = 'Team')
    .join(pit_prob.set_index('Team').add_prefix('Pit_Prob '), on = 'Team')
)
testing111 = team_stats  # legacy name, kept in case another cell still refers to it

# Keyed lookups: attaching stats by team code (not by row position) keeps every
# game aligned with the right team even if a team is missing from the stats table.
home_stats = team_stats.add_prefix('Home ').set_index('Home Team')
away_stats = team_stats.add_prefix('Away ').set_index('Away Team')

all_matches = []
skipped_dates = []  # (date, reason) for every day that contributed no rows; inspect after the run

# Every day from season_start up to and including yesterday.
last_day = datetime.today() - timedelta(days = 1)
for game_day in pd.date_range(season_start, last_day, freq = 'D'):
    date = game_day.strftime("%Y-%m-%d")
    url = f"https://www.fangraphs.com/leaders.aspx?pos=all&stats=pit&lg=all&qual=10&type=8&season=2022&month=0&season1=2022&ind=0&team=0&rost=0&age=0&filter=&players=p{date}&startdate=&enddate="
    cbs_url = f"https://www.cbssports.com/mlb/schedule/{date.replace('-','')}/"
    try:
        # Probable pitchers for that day and their stats.
        pitchers = pd.read_html(requests.get(url, timeout = _GAME_PAGE_TIMEOUT_S).content, match = 'Name')[0]
        pitchers.columns = pitchers.columns.droplevel()

        # That day's schedule with results, normalised to canonical team codes.
        matches = pd.read_html(requests.get(cbs_url, timeout = _GAME_PAGE_TIMEOUT_S).content)[0]
        matches['Away'] = matches['Away'].map(cbs_acronyms)
        matches['Home'] = matches['Home'].map(cbs_acronyms)
        matches = _parse_results(matches)

        # Attach the home and the away starter to each game, then pair both sides of the same game.
        home_pitchers = pd.merge(matches, pitchers, left_on = 'Home', right_on = 'Team').add_prefix('Home ')
        away_pitchers = pd.merge(matches, pitchers, left_on = 'Away', right_on = 'Team').add_prefix('Away ')
        game_info = pd.merge(
            home_pitchers,
            away_pitchers,
            left_on = ['Home Home', 'Home Away', 'Home Run Line', 'Home Winner Score', 'Home Loser Score'],
            right_on = ['Away Home', 'Away Away', 'Away Run Line', 'Away Winner Score', 'Away Loser Score'],
        )
        game_info = game_info[useful].rename(columns = {
            "Home Away": "Away", "Home Home": "Home",
            'Home Winner': 'Winner', 'Home Loser': 'Loser',
            'Home Run Line': 'Run Line', 'Home Winner Score': 'Winner Score', 'Home Loser Score': 'Loser Score',
        })
        # One row per home team per day (so only the first game of a doubleheader is kept).
        game_info = game_info.drop_duplicates(subset = 'Home').reset_index(drop = True)
    except (ValueError, KeyError, requests.RequestException) as err:
        # `except ValueError or KeyError` only ever caught ValueError; a tuple catches both.
        # Network failures are skipped the same way, but every skip is recorded.
        skipped_dates.append((date, repr(err)))
        continue

    if game_info.empty:
        skipped_dates.append((date, 'no completed games with probable-pitcher data'))
        continue

    quick_fixtures = game_info.join(home_stats, on = 'Home').join(away_stats, on = 'Away')
    all_matches.append(quick_fixtures)

if not all_matches:
    raise ValueError("No games could be collected for any date; see skipped_dates for the reasons")

ready_games = pd.concat(all_matches).reset_index(drop = True)
# Postponed and otherwise unfinished games were already dropped by _parse_results,
# so no sentinel filtering on the 'Winner' column is needed here.
ready_games['Winner'] = ready_games['Winner'].map(winner_cons)
ready_games['Loser'] = ready_games['Loser'].map(winner_cons)
ready_games['Target'] = ready_games['Home'] == ready_games['Winner'] #Target true when Home team wins, false otherwise

In [23]:
# Preview the first rows of the assembled table built by the per-day loop.
ready_games.head() #This contains all of the game data for the season

Unnamed: 0,Away,Home,Winner,Loser,Home Name,Home W,Home L,Home SV,Home G,Home GS,...,Away Pit_Prob acLI,Away Pit_Prob cClutch,Away Pit_Prob RE24,Away Pit_Prob REW,Away Pit_Prob boLI,Away Pit_Prob RE24/boLI,Away Pit_Prob LevHi,Away Pit_Prob LevMd,Away Pit_Prob LevLo,Target
0,MIL,CHC,CHC,MIL,Kyle Hendricks,4,6,0,16,16,...,1.26,0.6%,38.0,4.3,0.94,53.4,103,64,105,True
1,CLE,KCR,KCR,CLE,Zack Greinke,2,5,0,13,13,...,0.99,0.5%,25.8,3.1,0.93,36.1,85,44,126,True
2,PIT,STL,STL,PIT,Adam Wainwright,6,7,0,17,17,...,0.53,-0.3%,-60.9,-5.1,0.98,-49.7,82,74,110,True
3,CIN,ATL,CIN,ATL,Max Fried,9,2,0,17,17,...,0.2,-3.6%,-44.0,-3.6,0.96,-27.3,74,59,166,False
4,NYM,WSN,NYM,WSN,Patrick Corbin,4,11,0,18,18,...,1.15,0.6%,42.5,4.9,0.92,53.0,77,69,115,False


In [21]:
# Persist the assembled table as Game_Data.csv (path is relative to the working directory).
# to_csv is called with its defaults, so the row index is written as the first, unnamed column.
ready_games.to_csv('Game_Data.csv')