In [None]:
#import key packages
import pandas as pd
import numpy as np
import joblib
import requests
from datetime import datetime, timedelta
import random
import time
pd.set_option('mode.chained_assignment',None)

In [None]:
# Training table used by prob_test: it supplies the predictor columns and the 'Target' label.
# 'Unnamed: 0' is dropped right after loading (presumably a row index written by an
# earlier to_csv call -- confirm against whatever produced Game_Data.csv).
ready_games = pd.read_csv('Game_Data.csv').drop(columns = 'Unnamed: 0') # read in the training data

In [None]:
# convert a win probability into the favourite's moneyline odds (always returned as a negative-signed string)
def odds_formula(percent):
    """Turn a win probability into a negative-signed odds string.

    Parameters
    ----------
    percent : float
        Win probability strictly between 0 and 1. Exactly 0 or 1 makes one
        of the divisions below divide by zero.

    Returns
    -------
    str
        "-" followed by the odds magnitude truncated to an integer.
        Probabilities p and 1 - p give the same magnitude (up to float
        rounding), so the result is the price of whichever side is favoured.
    """
    above_even = percent > .5
    # The two expressions differ only in sign; abs() below removes it.
    val = 100 + 100 / (percent - 1) if above_even else 100 / percent - 100
    return f"-{int(abs(val))}"

In [None]:
# determine who is predicted to win
def who_won(row):
    """Return the projected winner for one row of the results table.

    `row` must provide 'Home Probability', 'Home Teams' and 'Away Teams'.
    A home probability of exactly .5 goes to the home team.
    """
    home_is_pick = row['Home Probability'] >= .5
    return row['Home Teams'] if home_is_pick else row['Away Teams']

In [None]:
#create the dictionaries that will map each team acronym onto the same format
class MissingDict(dict):
    """dict whose lookups fall back to the key itself when the key is absent.

    Defined once here; the cell previously repeated this class before every
    mapping.
    """

    def __missing__(self, key):
        return key


# Team names as spelled in the CBS schedule table -> three-letter team codes.
cbs_acronyms = {
    'Arizona' : 'ARI', 'L.A. Dodgers' : 'LAD', 'N.Y. Mets' : 'NYM', 'N.Y. Yankees' : 'NYY', 'Tampa Bay' :  'TBR',
    'Oakland' : 'OAK', 'Baltimore' : 'BAL', 'St. Louis' : 'STL', 'Kansas City' : 'KCR',
    'Milwaukee' : 'MIL', 'Toronto' : 'TOR', 'L.A. Angels' : 'LAA', 'Boston' : 'BOS', 'Seattle' : 'SEA',
    'Pittsburgh' : 'PIT', 'Miami' : 'MIA', 'Cleveland' : 'CLE', 'Texas' : 'TEX', 'Atlanta' : 'ATL', 'Chi. Cubs' : 'CHC',
    'Chi. White Sox' : 'CHW', 'Detroit' : 'DET', 'Minnesota' : 'MIN', 'Cincinnati' : 'CIN', 'Philadelphia' : 'PHI',
    'Washington' : 'WSN', 'San Francisco' : 'SFG', 'San Diego' : 'SDP', 'Colorado' : 'COL', 'Houston' : 'HOU'
}
maps = MissingDict(cbs_acronyms)

# Full franchise names (current and historical) -> team codes.
# NOTE(review): 'Montreal Expos' -> 'WAS' and 'Tampa Bay Devil Rays' -> 'TB' use
# the short codes that winner_cons below rewrites to 'WSN' / 'TBR'. They are left
# untouched here because other data may have been built with them -- confirm.
espn_br_conversions = {
    'Arizona Diamondbacks' : 'ARI', 'Los Angeles Dodgers' : 'LAD', 'New York Mets' : 'NYM', 'New York Yankees' : 'NYY', 'Tampa Bay Rays' :  'TBR',
    'Oakland Athletics' : 'OAK', 'Baltimore Orioles' : 'BAL', 'St. Louis Cardinals' : 'STL', 'Kansas City Royals' : 'KCR',
    'Milwaukee Brewers' : 'MIL', 'Toronto Blue Jays' : 'TOR', 'Los Angeles Angels' : 'LAA', 'Boston Red Sox' : 'BOS', 'Seattle Mariners' : 'SEA',
    'Pittsburgh Pirates' : 'PIT', 'Miami Marlins' : 'MIA', 'Cleveland Indians' : 'CLE', 'Cleveland Guardians' : 'CLE', 'Texas Rangers' : 'TEX', 'Atlanta Braves' : 'ATL', 'Chicago Cubs' : 'CHC',
    'Chicago White Sox' : 'CHW', 'Detroit Tigers' : 'DET', 'Minnesota Twins' : 'MIN', 'Cincinnati Reds' : 'CIN', 'Philadelphia Phillies' : 'PHI',
    'Washington Nationals' : 'WSN', 'San Francisco Giants' : 'SFG', 'San Diego Padres' : 'SDP', 'Colorado Rockies' : 'COL', 'Houston Astros' : 'HOU',
    'Florida Marlins' : 'MIA', 'Montreal Expos' : 'WAS', 'Anaheim Angels' : 'LAA', 'Tampa Bay Devil Rays' : 'TB'
}
# `mapping` used to be bound to MissingDict(**espn_br_conversions) at this point
# and was overwritten a few lines later without being read, so that dead
# assignment is gone.

# Short team codes -> the three-letter codes used by the dictionaries above.
# Each key appears once. The old literal repeated 'CLE', 'MIA', 'LAA' and 'TB',
# and its trailing 'TB' : 'TB' silently overrode 'TB' : 'TBR'.
winner_cons = {
    'ARI' : 'ARI', 'LAD' : 'LAD', 'NYM' : 'NYM', 'NYY' : 'NYY', 'TB' :  'TBR',
    'OAK' : 'OAK', 'BAL' : 'BAL', 'STL' : 'STL', 'KC' : 'KCR',
    'MIL' : 'MIL', 'TOR' : 'TOR', 'LAA' : 'LAA', 'BOS' : 'BOS', 'SEA' : 'SEA',
    'PIT' : 'PIT', 'MIA' : 'MIA', 'CLE' : 'CLE', 'TEX' : 'TEX', 'ATL' : 'ATL', 'CHC' : 'CHC',
    'CHW' : 'CHW', 'DET' : 'DET', 'MIN' : 'MIN', 'CIN' : 'CIN', 'PHI' : 'PHI',
    'WAS' : 'WSN', 'SF' : 'SFG', 'SD' : 'SDP', 'COL' : 'COL', 'HOU' : 'HOU'
}
mapping = MissingDict(winner_cons)

In [None]:
# all of the useful pitcher information
# Per-pitcher columns kept for each side; the identical list is emitted once
# with a 'Home ' prefix and once with an 'Away ' prefix.
_pitcher_stat_cols = [
    'Name', 'W', 'L', 'SV', 'G', 'GS', 'IP', 'K/9', 'BB/9', 'HR/9',
    'BABIP', 'LOB%', 'GB%', 'HR/FB', 'vFA (pi)',
    'ERA', 'xERA', 'FIP', 'xFIP', 'WAR',
]
# 'Home Away' / 'Home Home' are the away and home team columns as they are
# named after the home-side merge adds its 'Home ' prefix.
today_useful = (
    ['Home Away', 'Home Home']
    + [f'Home {stat}' for stat in _pitcher_stat_cols]
    + [f'Away {stat}' for stat in _pitcher_stat_cols]
)

In [None]:
# retrieve all of the team stats
year = 2022
REQUEST_TIMEOUT = 30  # seconds; without a timeout an unresponsive site hangs the cell forever
SCRAPE_PAUSE = 0.25   # seconds slept after each request (same delay the cell always used)
N_TEAM_ROWS = 30      # Baseball-Reference rows kept (presumably one per team, summary rows follow -- confirm)


def _fetch_html(url):
    """Download `url` and return the raw response body.

    Raises requests.HTTPError on a 4xx/5xx answer, so a blocked or
    rate-limited request is reported as such rather than surfacing later as
    a confusing table-parsing error.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    time.sleep(SCRAPE_PAUSE)
    return response.content


def _espn_team_table(url):
    """Return one ESPN team-stats page as a single frame with coded team names.

    The page is parsed twice: the first table matching 'RK' and the first
    table matching 'GP' are placed side by side, then 'Team' is translated
    through espn_br_conversions (names missing from that dict become NaN).
    """
    content = _fetch_html(url)
    rank_and_team = pd.read_html(content, match='RK')[0]
    stat_columns = pd.read_html(content, match='GP')[0]
    table = pd.DataFrame()
    table[rank_and_team.columns] = rank_and_team
    table[stat_columns.columns] = stat_columns
    table['Team'] = table['Team'].map(espn_br_conversions)
    return table


def _br_team_table(url, n_rows=N_TEAM_ROWS):
    """Return the first table of a Baseball-Reference league page.

    'Tm' is translated through espn_br_conversions, only rows whose index is
    below `n_rows` are kept, and 'Tm' is renamed to 'Team'.
    """
    table = pd.read_html(_fetch_html(url))[0]
    table['Tm'] = table['Tm'].map(espn_br_conversions)
    table = table[table.index < n_rows]
    return table.rename(columns={"Tm": "Team"})


# The request order is unchanged from the original cell.
hits = _espn_team_table(f"https://www.espn.com/mlb/stats/team/_/season/{year}/seasontype/2")
pit_br = _br_team_table(f"https://www.baseball-reference.com/leagues/majors/{year}-standard-pitching.shtml")
field = _espn_team_table(f"https://www.espn.com/mlb/stats/team/_/view/fielding/season/{year}/seasontype/2")
hit_saber = _br_team_table(f"https://www.baseball-reference.com/leagues/majors/{year}-sabermetric-batting.shtml")
pit_prob = _br_team_table(f"https://www.baseball-reference.com/leagues/majors/{year}-win_probability-pitching.shtml")

In [None]:
# combine the team stats with the pitcher information
today = datetime.today().strftime("%Y-%m-%d")

# FanGraphs pitching leaderboard filtered through `players=p<date>`; the season
# now follows `year` rather than a second hard-coded 2022.
url = f"https://www.fangraphs.com/leaders.aspx?pos=all&stats=pit&lg=all&qual=0&type=8&season={year}&month=0&season1={year}&ind=0&team=0&rost=0&age=0&filter=&players=p{today}&startdate=&enddate="
pitchers_page = requests.get(url, timeout=30)
pitchers_page.raise_for_status()  # fail here instead of inside read_html on an error page
pitchers = pd.read_html(pitchers_page.content, match = 'Name')[0]
pitchers.columns = pitchers.columns.droplevel()

# Today's schedule from CBS, with team names translated to three-letter codes.
cbs_url = f"https://www.cbssports.com/mlb/schedule/{today.replace('-','')}/"
schedule_page = requests.get(cbs_url, timeout=30)
schedule_page.raise_for_status()
matches = pd.read_html(schedule_page.content)[0]
matches['Away'] = matches['Away'].map(cbs_acronyms)
matches['Home'] = matches['Home'].map(cbs_acronyms)

# Attach pitcher rows to each scheduled game, once per side, then pair the two
# sides back up on the (home, away) team combination.
home_pitchers = pd.merge(matches,pitchers,left_on='Home',right_on = 'Team').add_prefix('Home ')
away_pitchers = pd.merge(matches,pitchers,left_on='Away',right_on = 'Team').add_prefix('Away ')
game_info = (
    pd.merge(home_pitchers, away_pitchers,
             left_on=['Home Home', 'Home Away'], right_on=['Away Home', 'Away Away'])
    [today_useful]
    .rename(columns={"Home Away": "Away", "Home Home": "Home"})
    .drop_duplicates(subset='Home')  # keeps a single row per home team
    .reset_index(drop=True)
)

# One row per team: ESPN batting plus the four other stat tables, each under its own prefix.
team_stats = (
    hits.drop(columns=['RK', 'GP'])
    .join(field.drop(columns=['RK', 'GP']).set_index('Team').add_prefix('Field '), on='Team')
    .join(pit_br.set_index('Team').add_prefix('Pit_Br '), on='Team')
    .join(hit_saber.set_index('Team').add_prefix('Hit_Saber '), on='Team')
    .join(pit_prob.set_index('Team').add_prefix('Pit_Prob '), on='Team')
)

# Attach team stats by team code. The previous version collected rows in a loop
# and joined them back by position, so a team with no stats row (or more than
# one) shifted every later game onto the wrong team's numbers. With a keyed
# left merge a team without stats simply gets NaNs in its own row.
today_fixtures = (
    game_info
    .merge(team_stats.add_prefix('Home '), how='left', left_on='Home', right_on='Home Team')
    .drop(columns='Home Team')
    .merge(team_stats.add_prefix('Away '), how='left', left_on='Away', right_on='Away Team')
    .drop(columns='Away Team')
)
assert len(today_fixtures) == len(game_info), "duplicate team rows in team_stats multiplied the fixtures"

# Placeholder label column. NaN replaces np.empty(), which exposed whatever
# bytes happened to be in the freshly allocated buffer.
today_fixtures['Target'] = np.nan

In [None]:
# Retrieve the parameters used in creating the model
# Only the header of Params.csv is used: after dropping 'Unnamed: 0' and 'Target',
# the remaining column labels (a pandas Index) are the feature names that prob_test
# selects from both ready_games and the fixtures frame.
test_predictors = pd.read_csv('Params.csv').drop(columns = ['Unnamed: 0','Target']).columns

In [None]:
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code --
# only load a .sav file you produced yourself or otherwise trust.
# prob_test calls gb.fit(...) on this same object, so the state loaded here is
# refit as soon as prob_test runs.
gb = joblib.load('finalized_GB.sav')

In [None]:
from sklearn.model_selection import train_test_split
def prob_test(iterations,fixtures):
  """Average home-win probabilities for `fixtures` over repeated refits of `gb`.

  Each iteration draws a random 25% training split of `ready_games`, refits
  the module-level model `gb` on it and predicts every row of `fixtures`.
  The class-1 probabilities are averaged over all iterations.

  Parameters
  ----------
  iterations : int
      Number of fit/predict rounds; must be at least 1.
  fixtures : pandas.DataFrame
      Games to predict. Needs every column in `test_predictors` plus 'Home'
      and 'Away'. It is modified in place: 'Home Probs', 'Away Probs' and
      'Projected Winner' are added or overwritten.

  Returns
  -------
  pandas.DataFrame
      The 'Home Probs' and 'Away Probs' columns of `fixtures`. The old code
      returned them from the global `today_fixtures` regardless of which
      frame was passed in.

  Raises
  ------
  ValueError
      If `iterations` is smaller than 1 (the average would divide by zero).
  """
  if iterations < 1:
    raise ValueError("iterations must be a positive integer")

  def winner(row):
    # Strictly greater than .5: an exact coin flip goes to the away side.
    return row['Home'] if row['Home Probs'] > .5 else row['Away']

  # Loop-invariant inputs, sliced once instead of on every iteration.
  X1 = ready_games[test_predictors]
  y1 = ready_games['Target']
  fixture_features = fixtures[test_predictors]

  game_preds = np.zeros(len(fixtures.index))
  for _ in range(iterations):
    # The seed is drawn from [0, iterations], exactly as before.
    X1_train, _X1_test, y1_train, _y1_test = train_test_split(
        X1, y1, random_state=random.randint(0,iterations), train_size=.25)
    gb.fit(X1_train,y1_train)
    game_preds += gb.predict_proba(fixture_features)[:,1]

  home_probs = game_preds / iterations
  fixtures.loc[:,'Home Probs'] = home_probs
  fixtures.loc[:,'Away Probs'] = 1 - home_probs
  fixtures['Projected Winner'] = fixtures.apply(winner,axis = 1)

  # NOTE(review): this fixed one-second pause predates the rewrite and has no
  # visible purpose; it is kept so benchmark timings stay comparable.
  time.sleep(1)
  return fixtures[['Home Probs', 'Away Probs']]

In [None]:
#Run and time the predictor function without parallelizing to obtain a benchmark time
# Wall-clock timing of 400 serial iterations. The returned probabilities are not
# kept here, but prob_test also writes 'Home Probs', 'Away Probs' and
# 'Projected Winner' onto the frame it is given (today_fixtures).
# NOTE(review): a later cell rebinds the name prob_test to a @ray.remote function
# taking one argument, so this cell only works when run before that one.
start_time = datetime.now()
prob_test(400, today_fixtures)
end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))

In [None]:
# implement the function in a way compatible with multiprocessing
# Iterations performed by each remote task; `listt` below repeats this value
# once per task.
repetitions = 50
import ray
from sklearn.model_selection import train_test_split

# Shut down before init -- presumably so that re-running this cell does not
# trip over a Ray runtime that is already started (confirm against the Ray docs).
ray.shutdown()
ray.init()
@ray.remote
def prob_test(iterations):
  """Ray task: average home-win probabilities for today_fixtures.

  NOTE(review): this rebinds the name `prob_test`, replacing the serial
  two-argument function defined in an earlier cell.

  Each iteration draws a random 25% training split of `ready_games`, refits
  `gb` on it and predicts every row of `today_fixtures`. The class-1
  probabilities are averaged over `iterations` rounds.

  Parameters
  ----------
  iterations : int
      Number of fit/predict rounds run inside this task.

  Returns
  -------
  pandas.DataFrame
      Columns 'Home Probs' and 'Away Probs', indexed like `today_fixtures`.
      The task no longer writes these columns (or a 'Projected Winner')
      onto `today_fixtures`; the returned frame has the same index and values.
  """
  # Loop-invariant inputs, sliced once instead of on every iteration.
  X1 = ready_games[test_predictors]
  y1 = ready_games['Target']
  fixture_features = today_fixtures[test_predictors]

  game_preds = np.zeros(len(today_fixtures.index))
  for _ in range(iterations):
    X1_train, _X1_test, y1_train, _y1_test = train_test_split(
        X1, y1, random_state=random.randint(0,1000), train_size=.25)
    gb.fit(X1_train,y1_train)
    game_preds += gb.predict_proba(fixture_features)[:,1]

  home_probs = game_preds / (iterations * 1.0)
  # NOTE(review): this fixed one-second pause predates the rewrite and has no
  # visible purpose; it is kept so benchmark timings stay comparable.
  time.sleep(1)
  return pd.DataFrame(
      {'Home Probs': home_probs, 'Away Probs': 1 - home_probs},
      index=today_fixtures.index,
  )

def function_y(listt):
    """Placeholder hook: accepts the task list and does nothing."""
    return None

# Eight tasks, each asked to run `repetitions` iterations (numpy integers, as before).
listt = list(np.full(8, repetitions, dtype='int'))

start_time = datetime.now()

# Submit every task first, then block until all of them have returned.
pending = [prob_test.remote(item) for item in listt]
results = ray.get(pending)

function_y(listt)
end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))

In [None]:
# create the final DataFrame to display the predictions
# Mean of the per-task probability frames (element-wise, aligned on the fixture index).
vals = sum(results) / len(listt)

# Start from the two team columns, then add the rounded probabilities.
final = today_fixtures[['Home', 'Away']].rename(
    columns={'Home': 'Home Teams', 'Away': 'Away Teams'}
)
final['Home Probability'] = np.round(vals['Home Probs'],4)
final['Away Probability'] = np.round(vals['Away Probs'],4)
final['Projected Winner'] = final.apply(who_won,axis = 1)
# odds_formula yields the same negative-signed price for p and 1 - p, so feeding
# it the home probability gives the projected winner's odds either way.
final['Winner Odds'] = final['Home Probability'].apply(odds_formula).astype('int')
final