## Import Libraries

In [2]:
# utils
import sys
import os

# dates
import datetime as dt

# data
import numpy as np
import pandas as pd

# baseball
import fcasttools as fcbb
import pybaseball as pbb

# viz
import plotly.express as px

# progress
from tqdm.notebook import tqdm_notebook
# register tqdm's pandas integration (adds the progress_* variants of apply/map)
tqdm_notebook.pandas()

# display
# show up to 99 columns / 999 rows before pandas truncates a rendered frame
pd.options.display.max_columns = 99
pd.options.display.max_rows = 999
from IPython.core.interactiveshell import InteractiveShell
# display the value of EVERY top-level expression in a cell, not only the last one;
# later cells rely on this to show several results (e.g. two `.shape` lines in a row)
InteractiveShell.ast_node_interactivity = "all"

# make tab complete faster
%config Completer.use_jedi = False

## Download Lahman Baseball Data

In [None]:
# Download the Lahman tables once and cache them as parquet under ./data.
# All tables are fetched through the same `pbb.lahman` module for consistency.

# make sure the cache folder exists, otherwise the first write fails
os.makedirs("./data", exist_ok=True)

# get core team season data
pbb.lahman.teams_core().to_parquet("./data/tm_core_season.pq")

# get player stats data
pbb.lahman.batting().to_parquet("./data/pl_batting.pq")
pbb.lahman.fielding().to_parquet("./data/pl_fielding.pq")
pbb.lahman.pitching().to_parquet("./data/pl_pitching.pq")

# get people data
pbb.lahman.people().to_parquet("./data/pl_people.pq")

## Import Lahman Data

In [3]:
# load the cached Lahman tables: batting, fielding, pitching, people, team seasons
lhb, lhf, lhp, lhpp, lhtm = map(
    pd.read_parquet,
    [
        "./data/pl_batting.pq",
        "./data/pl_fielding.pq",
        "./data/pl_pitching.pq",
        "./data/pl_people.pq",
        "./data/tm_core_season.pq",
    ],
)

## Data & Model Prep

In [4]:
# NOTE(review): these imports sit mid-notebook; consider moving them into the import cell at the top
from pybaseball.analysis.projections.marcels import MarcelProjectionsBatting
from pybaseball.analysis.projections.marcels import MarcelProjectionsPitching
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import warnings
# silences every warning from here on (no category or module filter is given)
warnings.filterwarnings("ignore")

In [5]:
# build the batting data class from the Lahman batting, fielding and people tables
# NOTE(review): the exact meaning of dt_filter / pstn_filter / season_filter lives in
# fcasttools and is not visible here — confirm before changing these values
dfb = fcbb.BattingETL(
    df_batting=lhb,
    df_fielding=lhf,
    df_players=lhpp,
    dt_filter=1970,
    pstn_filter=["P"],
    season_filter=None,
).run_etl()

In [6]:
# build the pitching data class from the Lahman pitching and people tables
# NOTE(review): season_filter is [2020] here but None for the batting ETL, while both
# forecast runs skip 2020 — confirm the asymmetry is intended
dfp = fcbb.PitchingETL(
    df_pitching=lhp,
    df_players=lhpp,
    dt_filter=1970,
    season_filter=[2020],
    pstn_filter=None,
).run_etl()

In [7]:
### batting models

# Marcel baseline, fed the ETL'd batting frame
marcel_batting = MarcelProjectionsBatting(stats_df=dfb.df_primary)
# short model tag; the evaluation cells read this model's forecasts from "<stat>mc" columns
marcel_batting.model_name = "mc"

# autoregressive forecasters: random forest ("var") and OLS ("ar")
# random_state is pinned so the random-forest forecasts (and every RMSE derived from
# them) are reproducible across kernel restarts
fcast_rfr_batting = fcbb.PlayerForecastAR(model_name="ar_rfr",
                                          data_class=dfb,
                                          groupers=["primaryPos","lgLast","seasonNo"],
                                          ar_type="var",
                                          player_model=RandomForestRegressor(n_estimators=100,
                                                                             random_state=42),
                                          group_model="mean",
                                          nlags=3,
                                          lookback=5)

fcast_ols_batting = fcbb.PlayerForecastAR(model_name="ar_ols",
                                          data_class=dfb,
                                          groupers=["primaryPos","lgLast","seasonNo"],
                                          ar_type="ar",
                                          player_model=LinearRegression(fit_intercept=True),
                                          group_model="mean",
                                          nlags=3,
                                          lookback=5)

In [8]:
### pitching models

# Marcel baseline
# NOTE(review): unlike the batting baseline, no stats_df is passed here, so this model
# is not fed dfp.df_primary — confirm that is intended
marcel_pitching = MarcelProjectionsPitching()
# short model tag; the evaluation cells read this model's forecasts from "<stat>mc" columns
marcel_pitching.model_name = "mc"

# autoregressive forecasters: random forest ("var") and OLS ("ar")
# random_state is pinned so the random-forest forecasts (and every RMSE derived from
# them) are reproducible across kernel restarts
fcast_rfr_pitching = fcbb.PlayerForecastAR(model_name="ar_rfr",
                                           data_class=dfp,
                                           groupers=["primaryPos","lgLast"],
                                           ar_type="var",
                                           player_model=RandomForestRegressor(n_estimators=100,
                                                                              random_state=42),
                                           group_model="mean",
                                           nlags=3,
                                           lookback=5)

fcast_ols_pitching = fcbb.PlayerForecastAR(model_name="ar_ols",
                                           data_class=dfp,
                                           groupers=["primaryPos","lgLast"],
                                           ar_type="ar",
                                           player_model=LinearRegression(fit_intercept=True),
                                           group_model="mean",
                                           nlags=3,
                                           lookback=5)

## Run Models

In [53]:
# run the three pitching models for "R" over 1990-2022 (2020 skipped) and combine them
stats = ["R"]
seasons = [yr for yr in np.arange(1990, 2023) if yr != 2020]
aggdf_pitching = fcbb.AggForecasts(
    models=[marcel_pitching, fcast_ols_pitching, fcast_rfr_pitching],
    seasons=seasons,
    stats=stats,
    ensemble=True,
    xvars=["ageAdj"],
).agg()

  0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
# run the three batting models for "R" over 1990-2022 (2020 skipped) and combine them
stats = ["R"]
seasons = [yr for yr in np.arange(1990, 2023) if yr != 2020]
aggdf_batting = fcbb.AggForecasts(
    models=[marcel_batting, fcast_ols_batting, fcast_rfr_batting],
    seasons=seasons,
    stats=stats,
    ensemble=True,
    xvars=["ageAdj"],
).agg()

  0%|          | 0/3 [00:00<?, ?it/s]

## Pitcher Model Performance

In [None]:
# actual pitcher stats keyed by (playerID, yearID), joined to the aggregated forecasts
player_data = (
    dfp.df_primary[dfp.cols_player + stats]
    .set_index(["playerID", "yearID"])
    .join(aggdf_pitching.df_agg)
    .reset_index()
)

# keep only fully populated rows; the two shapes show how many rows were dropped
player_data2 = player_data.copy().dropna()
player_data.shape
player_data2.shape

# forecast error = actual minus forecast, one column per stat and model
for stat in stats:
    for suffix in ("ar_ols", "ar_rfr", "mc", "comb"):
        player_data2[f"{stat}_err_{suffix}"] = player_data2[stat] - player_data2[f"{stat}{suffix}"]

# to see where the NaNs come from: player_data.isna().sum()

In [None]:
# RMSE by stat & model
def rmse(x):
    """Return the root-mean-square of the values in ``x``.

    ``x`` is any 1-D numeric sequence (here: one forecast-error column).
    NaNs propagate to the result; an empty input yields NaN instead of raising.
    """
    squared = np.square(np.asarray(x, dtype=float))
    # vectorised mean instead of the Python-level sum(...) / len(...)
    return float(np.sqrt(squared.mean())) if squared.size else float("nan")


err_cols = [c for c in player_data2 if "err" in c]
player_data2[err_cols].agg(rmse).round(2)

In [None]:
# RMSE per season for every error column; only the last five seasons are shown
(
    player_data2
    .groupby(["yearID"])[err_cols]
    .agg(rmse)
    .round(2)
    .tail()
)

## Batter Model Performance

In [None]:
# actual batter stats keyed by (playerID, yearID), joined to the aggregated forecasts
player_data = (
    dfb.df_primary[dfb.cols_player + stats]
    .set_index(["playerID", "yearID"])
    .join(aggdf_batting.df_agg)
    .reset_index()
)

# keep only fully populated rows; the two shapes show how many rows were dropped
player_data2 = player_data.copy().dropna()
player_data.shape
player_data2.shape

# forecast error = actual minus forecast, one column per stat and model
for stat in stats:
    for suffix in ("ar_ols", "ar_rfr", "mc", "comb"):
        player_data2[f"{stat}_err_{suffix}"] = player_data2[stat] - player_data2[f"{stat}{suffix}"]

# to see where the NaNs come from: player_data.isna().sum()

In [None]:
# RMSE by stat & model
def rmse(x):
    """Return the root-mean-square of the values in ``x``.

    ``x`` is any 1-D numeric sequence (here: one forecast-error column).
    NaNs propagate to the result; an empty input yields NaN instead of raising.
    """
    squared = np.square(np.asarray(x, dtype=float))
    # vectorised mean instead of the Python-level sum(...) / len(...)
    return float(np.sqrt(squared.mean())) if squared.size else float("nan")


err_cols = [c for c in player_data2 if "err" in c]
player_data2[err_cols].agg(rmse).round(2)

In [None]:
# RMSE per season for every error column; only the last five seasons are shown
(
    player_data2
    .groupby(["yearID"])[err_cols]
    .agg(rmse)
    .round(2)
    .tail()
)

## Aggregate Forecasts to Team Level

In [23]:
# attach the aggregated player forecasts to their ETL data classes
dfb.forecasts, dfp.forecasts = aggdf_batting, aggdf_pitching

In [87]:
# roll the player-level "comb" forecasts up to team level
tm_fcast = fcbb.AggByTeam(dfb=dfb, dfp=dfp).agg(
    forecast="comb",
    corpus="op-day",
    gross_up=False,
)

In [90]:
# team-level RMSE: actual R / RA from the Lahman team table against the projected
# values (joined with the "proj" suffix)
test = (
    lhtm.set_index(["teamID", "yearID"])[["R", "RA"]]
    .join(tm_fcast[["R", "RA"]], rsuffix="proj")
)
((test["R"] - test["Rproj"]) ** 2).mean() ** .5
((test["RA"] - test["RAproj"]) ** 2).mean() ** .5

107.64856527555392

129.8127015243921

## Instantiate Playoff Likelihood Model

In [63]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [64]:
# build the team-season data class from the Lahman team table
dft = fcbb.CoreTeamETL(
    df_tm=lhtm,
    dt_filter=1995,
).run_etl()

# point differential: R minus RA, stored on the ETL frame itself
dft.df_primary["PDiff"] = dft.df_primary["R"].sub(dft.df_primary["RA"])

In [72]:
# second team frame: team id / target columns from the ETL joined to the team-level
# forecasts, with the point differential computed from the joined R and RA
dft2 = (
    dft.df_primary[dft.cols_team + dft.cols_y]
    .set_index(["teamID", "yearID"])
    .join(tm_fcast)
    .reset_index()
    .assign(PDiff=lambda frame: frame["R"] - frame["RA"])
)

In [73]:
# instantiate team prediction models
# The tree-based classifiers get a fixed random_state so playoff forecasts and accuracy
# figures are reproducible across kernel restarts.
# NOTE(review): "tm_rfr" wraps a RandomForestClassifier (not a regressor); the name is
# kept because downstream cells read the "PSWintm_rfr" column.

### models trained on original data and tested with original data
tm_logreg = fcbb.TeamForecast(model_name="tm_logreg",
                              train_corpus=dft.df_primary,
                              test_corpus=dft.df_primary,
                              data_class=dft,
                              model=LogisticRegression(fit_intercept=True),
                              lookback=30)

# NOTE(review): tm_xgb is not passed to any AggForecasts call visible in this notebook
tm_xgb = fcbb.TeamForecast(model_name="tm_xgb",
                           train_corpus=dft.df_primary,
                           test_corpus=dft.df_primary,
                           data_class=dft,
                           model=XGBClassifier(n_estimators=1000, random_state=42),
                           lookback=30)

tm_rfr = fcbb.TeamForecast(model_name="tm_rfr",
                           train_corpus=dft.df_primary,
                           test_corpus=dft.df_primary,
                           data_class=dft,
                           model=RandomForestClassifier(n_estimators=1000, random_state=42),
                           lookback=30)

### models trained on original data and tested with forecast data
# these reuse the model names above, so their output columns carry the same labels
tm_logreg2 = fcbb.TeamForecast(model_name="tm_logreg",
                               train_corpus=dft.df_primary,
                               test_corpus=dft2,
                               data_class=dft,
                               model=LogisticRegression(fit_intercept=True),
                               lookback=30)

tm_rfr2 = fcbb.TeamForecast(model_name="tm_rfr",
                            train_corpus=dft.df_primary,
                            test_corpus=dft2,
                            data_class=dft,
                            model=RandomForestClassifier(n_estimators=1000, random_state=42),
                            lookback=30)

## Playoff Forecast Model: Lahman Test Sample

In [None]:
# get multiple forecasts (class predictions) from the models tested on the original data
seasons = [x for x in np.arange(1996,2023) if x != 2020]
tm_agg = fcbb.AggForecasts(models=[tm_logreg,tm_rfr],
                           seasons=seasons,
                           stats=["PSWin"],
                           ensemble=True,
                           xvars=["PDiff"],
                           pred_type="class").agg()

# ensemble classifier: label 1 only when the combined score exceeds 0.5, else 0.
# Always yields real class labels (never a raw score) so accuracy_score gets binary
# input; missing scores stay NaN and are removed by the dropna() below.
comb = tm_agg.df_agg["PSWincomb"]
tm_agg.df_agg["PSWincomb"] = (comb > .5).astype(float).mask(comb.isna())

# merge data
test = dft.df_primary.set_index(["teamID","yearID"])[dft.cols_y].join(tm_agg.df_agg).dropna()
display(test.tail())

# accuracy on all rows, on actual positives only and on actual negatives only
test2 = test.query("PSWin==1")
test3 = test.query("PSWin==0")
score_cols = {"Logistic": "PSWintm_logreg", "RFC": "PSWintm_rfr", "Ensemble": "PSWincomb"}
samples = (("All Data", test), ("Positives", test2), ("Negatives", test3))
for pos, (title, subset) in enumerate(samples):
    if pos:
        print("")
    print("Accuracy: {}".format(title))
    # printed explicitly so the results do not depend on ast_node_interactivity="all"
    for label, col in score_cols.items():
        print("{}: {}".format(label, round(accuracy_score(subset["PSWin"], subset[col]), 4)))

## Playoff Forecast Model: Projections Test Sample

In [None]:
# get multiple forecasts (class predictions) from the models tested on the projected data
seasons = [x for x in np.arange(1996,2023) if x != 2020]
tm_agg = fcbb.AggForecasts(models=[tm_logreg2,tm_rfr2],
                           seasons=seasons,
                           stats=["PSWin"],
                           ensemble=True,
                           xvars=["PDiff"],
                           pred_type="class").agg()

# ensemble classifier: label 1 only when the combined score exceeds 0.5, else 0.
# Always yields real class labels (never a raw score) so accuracy_score gets binary
# input; missing scores stay NaN and are removed by the dropna() below.
comb = tm_agg.df_agg["PSWincomb"]
tm_agg.df_agg["PSWincomb"] = (comb > .5).astype(float).mask(comb.isna())

# merge data
test = dft.df_primary.set_index(["teamID","yearID"])[dft.cols_y].join(tm_agg.df_agg).dropna()
display(test.tail())

# accuracy on all rows, on actual positives only and on actual negatives only
test2 = test.query("PSWin==1")
test3 = test.query("PSWin==0")
score_cols = {"Logistic": "PSWintm_logreg", "RFC": "PSWintm_rfr", "Ensemble": "PSWincomb"}
samples = (("All Data", test), ("Positives", test2), ("Negatives", test3))
for pos, (title, subset) in enumerate(samples):
    if pos:
        print("")
    print("Accuracy: {}".format(title))
    # printed explicitly so the results do not depend on ast_node_interactivity="all"
    for label, col in score_cols.items():
        print("{}: {}".format(label, round(accuracy_score(subset["PSWin"], subset[col]), 4)))

## Spot Check: Projected Top Playoff Contenders in Selected Years

In [74]:
# probability forecasts from the models that are tested on the projected team data
seasons = [yr for yr in np.arange(1996, 2023) if yr != 2020]
tm_agg = fcbb.AggForecasts(
    models=[tm_logreg2, tm_rfr2],
    seasons=seasons,
    stats=["PSWin"],
    ensemble=True,
    xvars=["PDiff"],
    pred_type="prob",
).agg()

# ensemble column: values at or below 0.5 are replaced by 0, larger values are kept as-is
tm_agg.df_agg["PSWincomb"] = tm_agg.df_agg["PSWincomb"].apply(lambda p: 0 if p <= .5 else p)

  0%|          | 0/2 [00:00<?, ?it/s]

### 2022

Houston Astros are the 2022 winners.

In [75]:
# ten teams with the highest logistic-regression score for 2022
(
    tm_agg.df_agg
    .reset_index()
    .query("yearID == 2022")
    .sort_values("PSWintm_logreg", ascending=False)
    .head(10)
)

Unnamed: 0,teamID,yearID,PSWintm_logreg,PSWintm_rfr,PSWincomb
770,SFN,2022,0.7562,0.364,0.5601
747,ATL,2022,0.5223,0.7427,0.6325
765,OAK,2022,0.343,0.0012,0.0
750,CHA,2022,0.2205,0.3197,0.0
769,SEA,2022,0.1662,0.3146,0.0
749,BOS,2022,0.1515,0.1611,0.0
763,NYA,2022,0.1452,0.4585,0.0
756,HOU,2022,0.1401,0.3338,0.0
766,PHI,2022,0.1194,0.3261,0.0
761,MIL,2022,0.1067,0.0011,0.0


### 2021

Atlanta Braves are the 2021 winners.

In [76]:
# ten teams with the highest logistic-regression score for 2021
(
    tm_agg.df_agg
    .reset_index()
    .query("yearID == 2021")
    .sort_values("PSWintm_logreg", ascending=False)
    .head(10)
)

Unnamed: 0,teamID,yearID,PSWintm_logreg,PSWintm_rfr,PSWincomb
723,CLE,2021,0.9838,1.0,0.9919
733,NYA,2021,0.3012,0.5433,0.0
720,CHA,2021,0.0405,0.0,0.0
730,MIA,2021,0.0389,0.0,0.0
745,WAS,2021,0.0385,0.0,0.0
735,OAK,2021,0.0383,0.0,0.0
722,CIN,2021,0.0252,0.0,0.0
721,CHN,2021,0.0246,0.0,0.0
718,BAL,2021,0.0226,0.0,0.0
716,ARI,2021,0.0172,0.0,0.0


### 2019

Washington Nationals are the 2019 winners.

In [77]:
# ten teams with the highest logistic-regression score for 2019
(
    tm_agg.df_agg
    .reset_index()
    .query("yearID == 2019")
    .sort_values("PSWintm_logreg", ascending=False)
    .head(10)
)

Unnamed: 0,teamID,yearID,PSWintm_logreg,PSWintm_rfr,PSWincomb
696,HOU,2019,0.9998,1.0,0.9999
689,BOS,2019,0.997,1.0,0.9985
703,NYA,2019,0.9847,0.584,0.78435
711,SLN,2019,0.9402,0.92,0.9301
693,CLE,2019,0.7795,1.0,0.88975
700,MIA,2019,0.7744,0.986,0.8802
708,SDN,2019,0.7441,0.869,0.80655
705,OAK,2019,0.6241,0.9705,0.7973
692,CIN,2019,0.4905,0.7495,0.62
706,PHI,2019,0.4492,0.5233,0.0


### 2012

San Francisco Giants are the 2012 winners.

In [78]:
# ten teams with the highest logistic-regression score for 2012
(
    tm_agg.df_agg
    .reset_index()
    .query("yearID == 2012")
    .sort_values("PSWintm_logreg", ascending=False)
    .head(10)
)

Unnamed: 0,teamID,yearID,PSWintm_logreg,PSWintm_rfr,PSWincomb
493,NYA,2012,0.9996,1.0,0.9998
479,BOS,2012,0.9965,1.0,0.99825
496,PHI,2012,0.9959,1.0,0.99795
495,OAK,2012,0.9894,1.0,0.9947
488,LAA,2012,0.9865,1.0,0.99325
503,TEX,2012,0.9822,1.0,0.9911
502,TBA,2012,0.9768,0.363,0.6699
482,CIN,2012,0.9376,0.99,0.9638
485,DET,2012,0.8522,0.5833,0.71775
478,BAL,2012,0.8348,0.7587,0.79675
