Example Basic Analaysis

Make a basic predictor for WAR

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pybaseball import batting_stats
%matplotlib inline


Statcast era batting data where to qualify you need at least 200 PA

In [2]:
batting = batting_stats(2015, 2022, qual=200)

In [3]:
batting.to_csv("batting.csv")

In [4]:
batting = batting.groupby("IDfg", group_keys = False).filter(lambda x: x.shape[0] >1)

In [5]:
batting

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA,L-WAR
1,15640,2022,Aaron Judge,NYY,30,157,570,696,177,87,...,118.4,246,0.609,404,0.169,0.287,,,,11.2
3,13611,2018,Mookie Betts,BOS,25,136,520,614,180,96,...,110.6,217,0.500,434,0.220,0.270,,,,10.4
4,10155,2018,Mike Trout,LAA,26,140,471,608,147,80,...,118.0,162,0.460,352,0.201,0.261,,,,9.6
0,11579,2015,Bryce Harper,WSN,22,153,521,654,172,91,...,116.0,188,0.477,394,0.118,0.226,,,,9.3
33,10155,2015,Mike Trout,LAA,23,159,575,682,172,93,...,117.7,205,0.486,422,0.207,0.282,,,,9.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2308,4613,2016,Prince Fielder,TEX,32,89,326,370,69,45,...,113.0,85,0.317,268,0.161,0.265,,,,-1.8
2487,2579,2015,Carlos Ruiz,PHI,36,86,284,320,60,44,...,104.3,43,0.176,245,0.217,0.265,,,,-1.9
2147,393,2015,Victor Martinez,DET,36,120,440,485,108,77,...,108.9,131,0.332,395,0.163,0.223,,,,-2.0
2612,3448,2019,Jeff Mathis,TEX,36,88,228,244,36,25,...,105.5,37,0.261,142,0.155,0.322,,,,-2.1


In [6]:
def next_season(player):
    player = player.sort_values("Season")
    player["Next_WAR"] = player["WAR"].shift(-1)
    return player

batting = batting.groupby("IDfg", group_keys = False).apply(next_season)

In [7]:
batting[["Name", "Season", "WAR", "Next_WAR"]]

Unnamed: 0,Name,Season,WAR,Next_WAR
2147,Victor Martinez,2015,-2.0,1.0
545,Victor Martinez,2016,1.0,-1.1
1801,Victor Martinez,2017,-1.1,-1.7
2231,Victor Martinez,2018,-1.7,
1317,Juan Uribe,2015,2.0,-0.5
...,...,...,...,...
2139,Owen Miller,2022,0.6,
1716,Andrew Vaughn,2021,-0.4,-0.5
1120,Andrew Vaughn,2022,-0.5,
2397,Ha-seong Kim,2021,0.5,3.7


In [8]:
null_count = batting.isnull().sum()

In [9]:
null_count

IDfg           0
Season         0
Name           0
Team           0
Age            0
            ... 
xBA         2388
xSLG        2388
xwOBA       2388
L-WAR          0
Next_WAR     584
Length: 321, dtype: int64

In [10]:
complete_cols = list(batting.columns[null_count ==0])

In [11]:
batting = batting[complete_cols + ["Next_WAR"]].copy()

In [12]:
batting.dtypes

IDfg          int64
Season        int64
Name         object
Team         object
Age           int64
             ...   
Events        int64
CStr%       float64
CSW%        float64
L-WAR       float64
Next_WAR    float64
Length: 244, dtype: object

In [13]:
batting.dtypes[batting.dtypes == "object"]

Name       object
Team       object
Dol        object
Age Rng    object
dtype: object

In [14]:
del batting["Dol"]

In [15]:
del batting["Age Rng"]

In [16]:
batting["team_code"] = batting["Team"].astype("category").cat.codes

In [17]:
batting_full = batting.copy()
batting = batting.dropna().copy()

In [18]:
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit

rr = Ridge(alpha = 1)

split = TimeSeriesSplit(n_splits=3)

sfs = SequentialFeatureSelector(rr, n_features_to_select=20, direction="forward", cv = split, n_jobs=4)


In [19]:
removed_columns = ["Next_WAR", "Name", "Team", "IDfg", "Season"]
selected_columns = batting.columns[~batting.columns.isin(removed_columns)]

In [20]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
batting.loc[:, selected_columns] = scaler.fit_transform(batting[selected_columns])


In [21]:
batting.describe()

Unnamed: 0,IDfg,Season,Age,G,AB,PA,H,1B,2B,3B,...,Barrel%,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,L-WAR,Next_WAR,team_code
count,1804.0,1804.0,1804.0,1804.0,1804.0,1804.0,1804.0,1804.0,1804.0,1804.0,...,1804.0,1804.0,1804.0,1804.0,1804.0,1804.0,1804.0,1804.0,1804.0,1804.0
mean,10543.62694,2017.748337,0.385665,0.623127,0.480198,0.473874,0.420729,0.353581,0.350481,0.142498,...,0.268249,0.47983,0.363767,0.526659,0.450154,0.422954,0.605853,0.37586,1.78071,0.480968
std,4894.976918,1.987167,0.155003,0.279916,0.264,0.281928,0.224139,0.18115,0.170613,0.145548,...,0.159249,0.146399,0.203233,0.146657,0.231613,0.148639,0.126736,0.134316,1.896891,0.306562
min,393.0,2015.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.6,0.0
25%,6184.0,2016.0,0.26087,0.405172,0.247601,0.223035,0.229839,0.203822,0.214286,0.066667,...,0.144531,0.378378,0.186992,0.435754,0.246055,0.326923,0.518519,0.281481,0.4,0.2
50%,10950.0,2018.0,0.391304,0.689655,0.49904,0.486289,0.413978,0.343949,0.339286,0.133333,...,0.25,0.477477,0.345528,0.53352,0.451677,0.418269,0.606481,0.348148,1.5,0.466667
75%,13853.0,2019.0,0.478261,0.87069,0.710173,0.724406,0.602151,0.484076,0.482143,0.2,...,0.363281,0.581081,0.520325,0.623836,0.641026,0.519231,0.689815,0.444444,2.8,0.733333
max,27506.0,2021.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,11.5,1.0


In [22]:
sfs.fit(batting[selected_columns], batting["Next_WAR"])

In [23]:
predictors = list(selected_columns[sfs.get_support()])

In [24]:
def backtest(data, model, predictors, start=5, step=1):
    all_predictions = []

    years = sorted(data["Season"].unique())

    for i in range(start, len(years), step):
        current_year = years[i]

        train = data[data["Season"] < current_year]
        test = data[data["Season"] == current_year]

        model.fit(train[predictors], train["Next_WAR"])

        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)
        combined = pd.concat([test["Next_WAR"], preds], axis = 1)
        combined.columns = ["actual", "prediction"]

        all_predictions.append(combined)
    return pd.concat(all_predictions, axis=0)
    

In [25]:
predictions = backtest(batting, rr, predictors)

In [26]:
predictions

Unnamed: 0,actual,prediction
1246,-0.6,-0.136526
948,1.7,0.700564
1414,-0.5,0.911765
42,2.0,-0.413262
1137,2.1,1.468268
...,...,...
603,2.4,3.709241
2089,0.9,2.181531
2577,0.6,1.644611
1716,-0.5,2.479944


In [27]:
from sklearn.metrics import mean_squared_error

mean_squared_error(predictions["actual"], predictions["prediction"])

3.188344394840154

In [28]:
batting["Next_WAR"].describe()

count    1804.000000
mean        1.780710
std         1.896891
min        -2.600000
25%         0.400000
50%         1.500000
75%         2.800000
max        11.500000
Name: Next_WAR, dtype: float64

In [29]:
3.188344394840154 ** .5

1.7855935693321014

In [30]:
def player_history(df):
    df = df.sort_values("Season")

    df["player_season"] =range(0, df.shape[0])
    df["war_corr"] = list(df[["player_season", "WAR"]].expanding().corr().loc[(slice(None), "player_season"), "WAR"])
    df["war_corr"].fillna(1, inplace=True)

    df["war_diff"] = df["WAR"] / df["WAR"].shift(1)
    df["war_diff"].fillna(1, inplace = True)

    df["war_diff"][df["war_diff"] == np.inf] = 1

    return df

batting = batting.groupby("IDfg", group_keys=False).apply(player_history)


In [31]:
def group_averages(df):
    return df["WAR"] / df["WAR"].mean()

In [32]:
batting["war_season"] = batting.groupby("Season", group_keys = False).apply(group_averages)

In [33]:
new_predictors = predictors + ["player_season", "war_corr", "war_season", "war_diff"]

In [34]:
predictions = backtest(batting, rr, new_predictors)

In [35]:
mean_squared_error(predictions["actual"], predictions["prediction"])

2.7684488598335424

In [36]:
2.7684488598335424 ** .5

1.6638656375541694

In [37]:
pd.Series(rr.coef_, index=new_predictors).sort_values()

Age               -2.853777
Hard%             -2.368224
Z-Contact% (sc)   -1.593385
HR/FB             -1.300336
BABIP             -1.058984
WAR               -0.964616
BU                -0.905703
Clutch            -0.810275
Soft%+            -0.675485
war_diff          -0.476014
PH                -0.429315
3B                -0.288282
war_corr          -0.241934
player_season      0.025376
L-WAR              0.360703
SI-Z (pi)          0.452623
Pos                0.640165
maxEV              0.657361
SI% (sc)           0.774239
Spd                1.351300
EV                 1.620146
Contact%           1.930112
war_season         2.265801
Hard%+             2.748966
dtype: float64

In [38]:
pd.Series(rr.coef_, index=new_predictors).sort_values(ascending = False, key=abs)


Age               -2.853777
Hard%+             2.748966
Hard%             -2.368224
war_season         2.265801
Contact%           1.930112
EV                 1.620146
Z-Contact% (sc)   -1.593385
Spd                1.351300
HR/FB             -1.300336
BABIP             -1.058984
WAR               -0.964616
BU                -0.905703
Clutch            -0.810275
SI% (sc)           0.774239
Soft%+            -0.675485
maxEV              0.657361
Pos                0.640165
war_diff          -0.476014
SI-Z (pi)          0.452623
PH                -0.429315
L-WAR              0.360703
3B                -0.288282
war_corr          -0.241934
player_season      0.025376
dtype: float64

Age ,Hardhit%+, HardHit%, and war season are the standout weights.
Of note is that being a 3rd baseman negatively affects your next year predicted WAR in this model.
Interestingly, EV had much a larger weight than max EV.

In [39]:
diff = predictions["actual"] - predictions["prediction"]

In [111]:
predictions

Unnamed: 0,actual,prediction
1246,-0.6,0.037046
948,1.7,1.200760
1414,-0.5,1.261897
42,2.0,0.408590
1137,2.1,1.960108
...,...,...
603,2.4,3.452680
2089,0.9,1.698151
2577,0.6,0.949018
1716,-0.5,1.766136


In [40]:
merged = predictions.merge(batting, left_index = True, right_index = True)

In [41]:
merged["diff"] = (predictions["actual"] - predictions["prediction"]).abs()

In [42]:
merged[["IDfg", "Season", "Name", "WAR", "Next_WAR", "diff"]].sort_values(["diff"])

Unnamed: 0,IDfg,Season,Name,WAR,Next_WAR,diff
1124,19599,2021,Chas McCormick,0.374046,2.0,0.000358
1668,10762,2020,Corey Dickerson,0.213740,0.9,0.001925
796,3473,2021,Anthony Rizzo,0.358779,2.4,0.002132
136,19326,2021,Bryan Reynolds,0.664122,2.8,0.002525
196,5343,2021,Brandon Crawford,0.671756,2.1,0.007070
...,...,...,...,...,...,...
830,15998,2020,Cody Bellinger,0.305344,-1.0,4.211235
452,5417,2021,Jose Altuve,0.603053,6.6,4.312653
1902,12533,2020,Marcus Semien,0.213740,6.2,4.361991
860,9777,2021,Nolan Arenado,0.511450,7.3,4.420135


Obviously, this model does not deal with injury. WAR is a cumulative stat, and injury can derail an otherwise productive season.
Additionally, my limit to the statcast era made the sample size extremely small. 
Player trend being linear was not realistic.