## Import Libraries

In [1]:
# utils
import sys
import os

# dates
import datetime as dt

# data
import numpy as np
import pandas as pd

# baseball
import fcasttools as fcbb
import pybaseball as pbb

# viz
import plotly.express as px

# progress
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()

# display
pd.options.display.max_columns = 99
pd.options.display.max_rows = 999
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# make tab complete faster
%config Completer.use_jedi = False

## Download Lahman Baseball Data

In [None]:
# make sure the target folder exists before any parquet file is written
os.makedirs("./data", exist_ok=True)

# get core team season data
pbb.teams_core().to_parquet("./data/tm_core_season.pq")

# get player stats data
# NOTE(review): every table now goes through `pbb.lahman`, the same path used
# for `people()` below; `pbb.pybaseball.lahman` is not an attribute path the
# package is known to expose -- confirm against the installed pybaseball version
pbb.lahman.batting().to_parquet("./data/pl_batting.pq")
pbb.lahman.fielding().to_parquet("./data/pl_fielding.pq")
pbb.lahman.pitching().to_parquet("./data/pl_pitching.pq")

# get people data
pbb.lahman.people().to_parquet("./data/pl_people.pq")

## Import Lahman Data

In [2]:
# load the cached lahman tables, in order:
# batting, fielding, pitching, people, core team seasons
lhb, lhf, lhp, lhpp, lhtm = map(
    pd.read_parquet,
    [
        "./data/pl_batting.pq",
        "./data/pl_fielding.pq",
        "./data/pl_pitching.pq",
        "./data/pl_people.pq",
        "./data/tm_core_season.pq",
    ],
)

## Data & Model Prep

In [3]:
from pybaseball.analysis.projections.marcels import MarcelProjectionsBatting
from pybaseball.analysis.projections.marcels import MarcelProjectionsPitching
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings("ignore")

In [4]:
# build the batting data class from the raw batting / fielding / people tables
dfb = fcbb.BattingETL(
    df_batting=lhb,
    df_fielding=lhf,
    df_players=lhpp,
    dt_filter=1970,       # NOTE(review): presumably the earliest season kept -- confirm
    season_filter=None,
    pstn_filter=["P"],    # NOTE(review): presumably removes pitchers from the batting sample -- confirm
).run_etl()

In [5]:
# build the pitching data class from the raw pitching / people tables
dfp = fcbb.PitchingETL(
    df_pitching=lhp,
    df_players=lhpp,
    dt_filter=1970,          # NOTE(review): presumably the earliest season kept -- confirm
    pstn_filter=None,
    season_filter=[2020],    # NOTE(review): presumably excludes the 2020 season -- confirm
).run_etl()

In [6]:
### batting models

# marcel baseline, fed with the ETL'd batting frame
marcel_batting = MarcelProjectionsBatting(stats_df=dfb.df_primary)
# model_name appears to be the suffix of this model's forecast columns (e.g. "Rmc")
marcel_batting.model_name = "mc"

# autoregressive player forecasters: same data class, groupers, lags and lookback,
# differing only in the AR type and the per-player estimator

# random forest variant -- seeded so a re-run reproduces the same forecasts
fcast_rfr_batting = fcbb.PlayerForecastAR(
    model_name="ar_rfr",
    data_class=dfb,
    groupers=["primaryPos", "lgLast", "seasonNo"],
    ar_type="var",
    player_model=RandomForestRegressor(n_estimators=100, random_state=42),
    group_model="mean",
    nlags=3,
    lookback=5,
)

# ordinary least squares variant
fcast_ols_batting = fcbb.PlayerForecastAR(
    model_name="ar_ols",
    data_class=dfb,
    groupers=["primaryPos", "lgLast", "seasonNo"],
    ar_type="ar",
    player_model=LinearRegression(fit_intercept=True),
    group_model="mean",
    nlags=3,
    lookback=5,
)

In [7]:
### pitching models

# marcel baseline
# NOTE(review): unlike the batting baseline, no stats_df is passed here, so this
# presumably runs on the class's own default data rather than the ETL'd dfp
# frame -- confirm the asymmetry is intended
marcel_pitching = MarcelProjectionsPitching()
marcel_pitching.model_name = "mc"

# autoregressive player forecasters: same data class, groupers, lags and lookback,
# differing only in the AR type and the per-player estimator

# random forest variant -- seeded so a re-run reproduces the same forecasts
fcast_rfr_pitching = fcbb.PlayerForecastAR(
    model_name="ar_rfr",
    data_class=dfp,
    groupers=["primaryPos", "lgLast"],
    ar_type="var",
    player_model=RandomForestRegressor(n_estimators=100, random_state=42),
    group_model="mean",
    nlags=3,
    lookback=5,
)

# ordinary least squares variant
fcast_ols_pitching = fcbb.PlayerForecastAR(
    model_name="ar_ols",
    data_class=dfp,
    groupers=["primaryPos", "lgLast"],
    ar_type="ar",
    player_model=LinearRegression(fit_intercept=True),
    group_model="mean",
    nlags=3,
    lookback=5,
)

## Run Models

In [8]:
# pitching: collect every model's forecast for the chosen stats and seasons
stats = ["R"]
# 1990 through 2022, leaving out 2020
seasons = [yr for yr in np.arange(1990, 2023) if yr != 2020]
aggdf_pitching = fcbb.AggForecasts(
    models=[marcel_pitching, fcast_ols_pitching, fcast_rfr_pitching],
    seasons=seasons,
    stats=stats,
    ensemble=True,
    xvars=["ageAdj"],
).agg()

  0%|          | 0/3 [00:00<?, ?it/s]

In [9]:
# batting: collect every model's forecast for the chosen stats and seasons
stats = ["R"]
# 1990 through 2022, leaving out 2020
seasons = [yr for yr in np.arange(1990, 2023) if yr != 2020]
aggdf_batting = fcbb.AggForecasts(
    models=[marcel_batting, fcast_ols_batting, fcast_rfr_batting],
    seasons=seasons,
    stats=stats,
    ensemble=True,
    xvars=["ageAdj"],
).agg()

  0%|          | 0/3 [00:00<?, ?it/s]

## Pitcher Model Performance

In [None]:
# actuals: player columns plus the evaluated stats, keyed by player-season
player_data = dfp.df_primary[dfp.cols_player + stats].set_index(["playerID","yearID"])

# attach the forecast columns of every model
player_data = player_data.join(aggdf_pitching.df_agg).reset_index()

# keep complete rows only; copy AFTER dropna so the error columns below are
# written to an independent frame and never to a slice of player_data
player_data2 = player_data.dropna().copy()
player_data.shape
player_data2.shape

# forecast error (actual minus forecast) for each stat and model
for stat in stats:
    for model in ("ar_ols", "ar_rfr", "mc", "comb"):
        player_data2[f"{stat}_err_{model}"] = player_data2[stat] - player_data2[f"{stat}{model}"]

In [None]:
# RMSE by stat & model
def rmse(errors):
    """Return the root-mean-square of a sequence of forecast errors.

    The values are squared and averaged in numpy instead of with the builtin
    `sum`, which would walk the Series element by element. NaNs propagate to
    the result; an empty input yields NaN.
    """
    squared = np.square(np.asarray(errors, dtype=float))
    return np.sqrt(squared.mean())

# every error column created in the previous cell
err_cols = [c for c in player_data2 if "err" in c]
player_data2[err_cols].agg(rmse).round(2)

In [None]:
# RMSE per season and model; tail() shows the five most recent seasons
(
    player_data2
    .groupby(["yearID"])[err_cols]
    .agg(rmse)
    .round(2)
    .tail()
)

## Batter Model Performance

In [None]:
# actuals: player columns plus the evaluated stats, keyed by player-season
player_data = dfb.df_primary[dfb.cols_player + stats].set_index(["playerID","yearID"])

# attach the forecast columns of every model
player_data = player_data.join(aggdf_batting.df_agg).reset_index()

# keep complete rows only; copy AFTER dropna so the error columns below are
# written to an independent frame and never to a slice of player_data
player_data2 = player_data.dropna().copy()
player_data.shape
player_data2.shape

# forecast error (actual minus forecast) for each stat and model
for stat in stats:
    for model in ("ar_ols", "ar_rfr", "mc", "comb"):
        player_data2[f"{stat}_err_{model}"] = player_data2[stat] - player_data2[f"{stat}{model}"]

In [None]:
# RMSE by stat & model
# `rmse` is already defined in the pitcher performance section, so it is reused
# here instead of being declared a second time; the error-column list is rebuilt
# because player_data2 now holds the batting frame
err_cols = [c for c in player_data2 if "err" in c]
player_data2[err_cols].agg(rmse).round(2)

In [None]:
# RMSE per season and model; tail() shows the five most recent seasons
(
    player_data2
    .groupby(["yearID"])[err_cols]
    .agg(rmse)
    .round(2)
    .tail()
)

## Aggregate Forecasts to Team Level

In [10]:
# hang each aggregated forecast object on its matching data class
dfb.forecasts, dfp.forecasts = aggdf_batting, aggdf_pitching

In [11]:
# roll the player-level forecasts up to one row per team
tm_fcast = fcbb.AggByTeam(dfb=dfb, dfp=dfp).agg(
    corpus="op-day",     # NOTE(review): presumably opening-day rosters -- confirm
    forecast="comb",     # NOTE(review): presumably selects the ensemble forecast -- confirm
    gross_up=False,
)

In [12]:
# team-level error: actual R / RA next to the projected totals ("proj" suffix)
test = (
    lhtm
    .set_index(["teamID", "yearID"])[["R", "RA"]]
    .join(tm_fcast[["R", "RA"]], rsuffix="proj")
)

# RMSE of projected vs. actual R, then RA
# (team-seasons without a projection are NaN and are skipped by mean())
((test["R"] - test["Rproj"]) ** 2).mean() ** .5
((test["RA"] - test["RAproj"]) ** 2).mean() ** .5

107.44125682147151

129.80962360137602

## Instantiate Playoff Likelihood Model

In [13]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [14]:
# build the team data class from the core team-season table
dft = fcbb.CoreTeamETL(
    df_tm=lhtm,
    dt_filter=1995,    # NOTE(review): presumably the earliest season kept -- confirm
).run_etl()

# run differential: runs scored (R) minus runs allowed (RA)
dft.df_primary["PDiff"] = dft.df_primary["R"].sub(dft.df_primary["RA"])

In [15]:
# second team frame: team identifier and outcome columns joined to the team forecasts
# NOTE(review): R / RA here presumably come from tm_fcast, i.e. they are
# projected rather than actual totals -- confirm
dft2 = (
    dft.df_primary[dft.cols_team + dft.cols_y]
    .set_index(["teamID", "yearID"])
    .join(tm_fcast)
    .reset_index()
)

# run differential: R minus RA of this frame
dft2["PDiff"] = dft2["R"].sub(dft2["RA"])

In [16]:
# instantiate team prediction models
# the random forests are seeded so a re-run reproduces the same predictions

### models trained on original data and tested with original data
tm_logreg = fcbb.TeamForecast(
    model_name="tm_logreg",
    train_corpus=dft.df_primary,
    test_corpus=dft.df_primary,
    data_class=dft,
    model=LogisticRegression(fit_intercept=True),
    lookback=30,
)

tm_xgb = fcbb.TeamForecast(
    model_name="tm_xgb",
    train_corpus=dft.df_primary,
    test_corpus=dft.df_primary,
    data_class=dft,
    model=XGBClassifier(n_estimators=1000),
    lookback=30,
)

tm_rfr = fcbb.TeamForecast(
    model_name="tm_rfr",
    train_corpus=dft.df_primary,
    test_corpus=dft.df_primary,
    data_class=dft,
    model=RandomForestClassifier(n_estimators=1000, random_state=42),
    lookback=30,
)

### models trained on original data and tested with forecast data
# these reuse the model names above, so their output columns carry the same labels
tm_logreg2 = fcbb.TeamForecast(
    model_name="tm_logreg",
    train_corpus=dft.df_primary,
    test_corpus=dft2,
    data_class=dft,
    model=LogisticRegression(fit_intercept=True),
    lookback=30,
)

tm_rfr2 = fcbb.TeamForecast(
    model_name="tm_rfr",
    train_corpus=dft.df_primary,
    test_corpus=dft2,
    data_class=dft,
    model=RandomForestClassifier(n_estimators=1000, random_state=42),
    lookback=30,
)

## Playoff Forecast Model: Lahman Test Sample

In [None]:
# get class forecasts from the models tested on the original data
seasons = [x for x in np.arange(1996,2023) if x != 2020]
tm_agg = fcbb.AggForecasts(models=[tm_logreg,tm_rfr],
                           seasons=seasons,
                           stats=["PSWin"],
                           ensemble=True,
                           xvars=["PDiff"],
                           pred_type="class").agg()

# ensemble classifier: predict a playoff berth only when the combined vote is
# strictly above 0.5, so the column is always a 0/1 label; missing votes stay
# NaN and are removed by dropna() below
comb = tm_agg.df_agg["PSWincomb"]
tm_agg.df_agg["PSWincomb"] = (comb > .5).astype(float).where(comb.notna())

# merge actual outcomes with the predictions
test = dft.df_primary.set_index(["teamID","yearID"])[dft.cols_y].join(tm_agg.df_agg).dropna()
display(test.tail())

# accuracy per model on three samples: all team-seasons, actual playoff teams
# only (recall) and actual non-playoff teams only (specificity)
test2 = test.query("PSWin==1")
test3 = test.query("PSWin==0")
pred_cols = {"Logistic": "PSWintm_logreg", "RFC": "PSWintm_rfr", "Ensemble": "PSWincomb"}
samples = {"Accuracy: All Data": test, "Accuracy: Positives": test2, "Accuracy: Negatives": test3}

# print() keeps the report visible whatever the notebook's display settings are
for i, (title, sample) in enumerate(samples.items()):
    if i:
        print("")
    print(title)
    for label, col in pred_cols.items():
        print("{}: {}".format(label, round(accuracy_score(sample["PSWin"], sample[col]), 4)))

## Playoff Forecast Model: Projections Test Sample

In [None]:
# get class forecasts from the models tested on the projected team data
seasons = [x for x in np.arange(1996,2023) if x != 2020]
tm_agg = fcbb.AggForecasts(models=[tm_logreg2,tm_rfr2],
                           seasons=seasons,
                           stats=["PSWin"],
                           ensemble=True,
                           xvars=["PDiff"],
                           pred_type="class").agg()

# ensemble classifier: predict a playoff berth only when the combined vote is
# strictly above 0.5, so the column is always a 0/1 label; missing votes stay
# NaN and are removed by dropna() below
comb = tm_agg.df_agg["PSWincomb"]
tm_agg.df_agg["PSWincomb"] = (comb > .5).astype(float).where(comb.notna())

# merge actual outcomes with the predictions
test = dft.df_primary.set_index(["teamID","yearID"])[dft.cols_y].join(tm_agg.df_agg).dropna()
display(test.tail())

# accuracy per model on three samples: all team-seasons, actual playoff teams
# only (recall) and actual non-playoff teams only (specificity)
test2 = test.query("PSWin==1")
test3 = test.query("PSWin==0")
pred_cols = {"Logistic": "PSWintm_logreg", "RFC": "PSWintm_rfr", "Ensemble": "PSWincomb"}
samples = {"Accuracy: All Data": test, "Accuracy: Positives": test2, "Accuracy: Negatives": test3}

# print() keeps the report visible whatever the notebook's display settings are
for i, (title, sample) in enumerate(samples.items()):
    if i:
        print("")
    print(title)
    for label, col in pred_cols.items():
        print("{}: {}".format(label, round(accuracy_score(sample["PSWin"], sample[col]), 4)))

## Spot Check: Projected Top Playoff Contenders in Selected Years

In [22]:
# probability forecasts from the models tested on the projected team data
seasons = [yr for yr in np.arange(1996, 2023) if yr != 2020]
tm_agg = fcbb.AggForecasts(
    models=[tm_logreg2, tm_rfr2],
    seasons=seasons,
    stats=["PSWin"],
    ensemble=True,
    xvars=["PDiff"],
    pred_type="prob",
).agg()

# ensemble classifier: combined values at or below 0.5 become 0, larger ones are kept
# NOTE(review): with pred_type="prob" this hides every ensemble probability
# <= 0.5 while leaving the higher ones unchanged -- confirm that is intended
tm_agg.df_agg["PSWincomb"] = tm_agg.df_agg["PSWincomb"].mask(
    tm_agg.df_agg["PSWincomb"] <= .5, 0
)

# merge actual outcomes with the forecasts, keeping complete rows only
test = (
    dft.df_primary
    .set_index(["teamID", "yearID"])[dft.cols_y]
    .join(tm_agg.df_agg)
    .dropna()
)

  0%|          | 0/2 [00:00<?, ?it/s]

### 2022

The Houston Astros won the 2022 World Series.

In [24]:
# 2022: ten teams with the highest logistic-model playoff value
(
    test
    .reset_index()
    .query("yearID == 2022")
    .sort_values("PSWintm_logreg", ascending=False)
    .head(10)
)

Unnamed: 0,teamID,yearID,WCWin,DivWin,LgWin,WSWin,PSWin,Lg+Win,Div+Win,PSWintm_logreg,PSWintm_rfr,PSWincomb
656,SFN,2022,0,0,0,0,0,0,0,0.7345,1.0,0.86725
57,ATL,2022,0,1,0,0,1,0,1,0.5563,0.4845,0.5204
526,OAK,2022,0,0,0,0,0,0,0,0.3082,0.9272,0.6177
136,CHA,2022,0,0,0,0,0,0,0,0.1638,0.0021,0.0
630,SEA,2022,1,0,0,0,1,0,0,0.1638,0.0021,0.0
109,BOS,2022,0,0,0,0,0,0,0,0.1597,0.0011,0.0
308,HOU,2022,0,1,1,1,1,1,1,0.1263,0.0278,0.0
474,NYA,2022,0,1,0,0,1,0,1,0.1165,0.2567,0.0
552,PHI,2022,1,0,1,0,1,1,1,0.0963,0.0,0.0
411,MIL,2022,0,0,0,0,0,0,0,0.0937,0.0,0.0


### 2021

The Atlanta Braves won the 2021 World Series.

In [25]:
# 2021: ten teams with the highest logistic-model playoff value
(
    test
    .reset_index()
    .query("yearID == 2021")
    .sort_values("PSWintm_logreg", ascending=False)
    .head(10)
)

Unnamed: 0,teamID,yearID,WCWin,DivWin,LgWin,WSWin,PSWin,Lg+Win,Div+Win,PSWintm_logreg,PSWintm_rfr,PSWincomb
213,CLE,2021,0,0,0,0,0,0,0,0.9843,1.0,0.99215
473,NYA,2021,1,0,0,0,1,0,0,0.2748,0.0008,0.0
386,MIA,2021,0,0,0,0,0,0,0,0.0487,0.0,0.0
135,CHA,2021,0,1,0,0,1,0,1,0.0385,0.0,0.0
525,OAK,2021,0,0,0,0,0,0,0,0.0352,0.0,0.0
774,WAS,2021,0,0,0,0,0,0,0,0.0342,0.0546,0.0
187,CIN,2021,0,0,0,0,0,0,0,0.0261,0.0,0.0
161,CHN,2021,0,0,0,0,0,0,0,0.0253,0.0,0.0
30,ARI,2021,0,0,0,0,0,0,0,0.0171,0.0,0.0
82,BAL,2021,0,0,0,0,0,0,0,0.0161,0.0,0.0


### 2019

The Washington Nationals won the 2019 World Series.

In [26]:
# 2019: ten teams with the highest logistic-model playoff value
(
    test
    .reset_index()
    .query("yearID == 2019")
    .sort_values("PSWintm_logreg", ascending=False)
    .head(10)
)

Unnamed: 0,teamID,yearID,WCWin,DivWin,LgWin,WSWin,PSWin,Lg+Win,Div+Win,PSWintm_logreg,PSWintm_rfr,PSWincomb
306,HOU,2019,0,1,1,0,1,1,1,0.9998,1.0,0.9999
107,BOS,2019,0,0,0,0,0,0,0,0.9971,1.0,0.99855
472,NYA,2019,0,1,0,0,1,0,1,0.9839,0.5849,0.7844
680,SLN,2019,0,1,0,0,1,0,1,0.9177,1.0,0.95885
212,CLE,2019,0,0,0,0,0,0,0,0.7783,0.998,0.88815
602,SDN,2019,0,0,0,0,0,0,0,0.7723,0.966,0.86915
385,MIA,2019,0,0,0,0,0,0,0,0.7345,1.0,0.86725
524,OAK,2019,1,0,0,0,1,0,0,0.6556,0.9245,0.79005
186,CIN,2019,0,0,0,0,0,0,0,0.4572,0.6669,0.56205
550,PHI,2019,0,0,0,0,0,0,0,0.432,0.1268,0.0


### 2012

The San Francisco Giants won the 2012 World Series.

In [27]:
# 2012: ten teams with the highest logistic-model playoff value
(
    test
    .reset_index()
    .query("yearID == 2012")
    .sort_values("PSWintm_logreg", ascending=False)
    .head(10)
)

Unnamed: 0,teamID,yearID,WCWin,DivWin,LgWin,WSWin,PSWin,Lg+Win,Div+Win,PSWintm_logreg,PSWintm_rfr,PSWincomb
465,NYA,2012,0,1,0,0,1,0,1,0.9995,1.0,0.99975
543,PHI,2012,0,0,0,0,0,0,0,0.9958,1.0,0.9979
100,BOS,2012,0,0,0,0,0,0,0,0.9951,1.0,0.99755
517,OAK,2012,0,1,0,0,1,0,1,0.9925,1.0,0.99625
342,LAA,2012,0,0,0,0,0,0,0,0.9814,0.99,0.9857
697,TBA,2012,0,0,0,0,0,0,0,0.9782,0.775,0.8766
723,TEX,2012,1,0,0,0,1,0,0,0.9775,0.361,0.66925
179,CIN,2012,0,1,0,0,1,0,1,0.9424,1.0,0.9712
257,DET,2012,0,1,1,0,1,1,1,0.8601,0.8112,0.83565
74,BAL,2012,1,0,0,0,1,0,0,0.821,0.9722,0.8966
