In [1]:
import os
import pandas as pd
import numpy as np
from pybaseball import batting_stats

In [2]:
START = 2002
END = 2022

In [3]:
if os.path.exists("batting.csv"):
    batting = pd.read_csv("batting.csv", index_col=0)
else:
    batting = batting_stats(START, END, qual=200)
    batting.to_csv("batting.csv")


In [4]:
batting = batting.groupby("IDfg", group_keys=False).filter(lambda x: x.shape[0] > 1)

In [5]:
batting

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,Barrel%,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA
0,1109,2002,Barry Bonds,SFG,37,143,403,612,149,70,...,,,,,0,0.127,0.191,,,
1,1109,2004,Barry Bonds,SFG,39,147,373,617,135,60,...,,,,,0,0.124,0.164,,,
15,13611,2018,Mookie Betts,BOS,25,136,520,614,180,96,...,0.131,110.6,217.0,0.500,434,0.220,0.270,,,
4,15640,2022,Aaron Judge,NYY,30,142,525,631,166,82,...,0.263,118.4,224.0,0.596,376,0.175,0.291,,,
2,1109,2003,Barry Bonds,SFG,38,130,390,550,133,65,...,,,,,0,0.135,0.223,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6864,1698,2010,Gerald Laird,DET,30,89,270,299,56,40,...,,,0.0,,0,0.166,0.252,,,
7026,9272,2018,Chris Davis,BAL,32,128,470,522,79,51,...,0.096,111.8,113.0,0.401,282,0.174,0.316,,,
6662,319,2011,Adam Dunn,CHW,31,122,415,496,66,39,...,,,0.0,,0,0.169,0.295,,,
6971,620,2002,Neifi Perez,KCR,29,145,554,585,131,104,...,,,,,0,0.130,0.187,,,


In [6]:
def next_season(player):
    player = player.sort_values("Season")
    player["Next_WAR"] = player["WAR"].shift(-1)
    return player 

batting = batting.groupby("IDfg", group_keys=False).apply(next_season)

In [7]:
null_count = batting.isnull().sum()

In [8]:
complete_cols = list(batting.columns[null_count == 0])
batting = batting[complete_cols + ["Next_WAR"]].copy()

In [9]:
batting

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,Pull%+,Cent%+,Oppo%+,Soft%+,Med%+,Hard%+,Events,CStr%,CSW%,Next_WAR
5551,1,2006,Alfredo Amezaga,FLA,28,132,334,378,87,72,...,86,107,113,143,109,63,0,0.188,0.256,2.0
5001,1,2007,Alfredo Amezaga,FLA,29,133,400,448,105,80,...,92,101,112,109,113,75,0,0.175,0.227,1.2
5243,1,2008,Alfredo Amezaga,FLA,30,125,311,337,82,61,...,99,101,101,123,111,64,0,0.178,0.244,
1165,2,2002,Garret Anderson,ANA,30,158,638,678,195,107,...,118,91,80,65,97,129,0,0.137,0.232,5.1
865,2,2003,Garret Anderson,ANA,31,159,638,673,201,119,...,112,101,80,90,99,109,0,0.164,0.252,0.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6030,24655,2022,Owen Miller,CLE,25,122,403,446,100,69,...,93,110,98,129,101,82,325,0.189,0.268,
4881,26197,2021,Andrew Vaughn,CHW,23,127,417,469,98,61,...,87,104,116,84,99,110,321,0.185,0.285,0.5
1998,26197,2022,Andrew Vaughn,CHW,24,122,469,511,135,89,...,87,108,109,94,98,106,392,0.203,0.286,
6610,27506,2021,Ha-seong Kim,SDP,25,117,267,298,54,32,...,126,99,59,137,96,88,201,0.216,0.303,2.8


In [10]:
batting.dtypes[batting.dtypes == "object"]

Name       object
Team       object
Dol        object
Age Rng    object
dtype: object

In [11]:
del batting["Age Rng"]
del batting["Dol"]

In [12]:
batting["team_code"] = batting["Team"].astype("category").cat.codes

In [13]:
batting_full = batting.copy()
batting = batting.dropna().copy()

In [14]:
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit

rr = Ridge(alpha=1)

split = TimeSeriesSplit(n_splits=3)

sfs = SequentialFeatureSelector(rr,n_features_to_select=20,direction="forward",cv=split,n_jobs=4)

In [15]:
removed_columns = ["Next_WAR", "Name", "Team", "IDfg", "Season"]
selected_columns = batting.columns[~batting.columns.isin(removed_columns)]

In [16]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
batting.loc[:, selected_columns] = scaler.fit_transform(batting[selected_columns])

In [17]:
batting

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,Cent%+,Oppo%+,Soft%+,Med%+,Hard%+,Events,CStr%,CSW%,Next_WAR,team_code
5551,1,2006,Alfredo Amezaga,FLA,0.346154,0.735043,0.312950,0.307958,0.245690,0.278302,...,0.539326,0.503759,0.662921,0.652174,0.210884,0.000000,0.582979,0.524229,2.0,0.352941
5001,1,2007,Alfredo Amezaga,FLA,0.384615,0.743590,0.431655,0.429066,0.323276,0.316038,...,0.471910,0.496241,0.471910,0.710145,0.292517,0.000000,0.527660,0.396476,1.2,0.352941
1165,2,2002,Garret Anderson,ANA,0.423077,0.957265,0.859712,0.826990,0.711207,0.443396,...,0.359551,0.255639,0.224719,0.478261,0.659864,0.000000,0.365957,0.418502,5.1,0.029412
865,2,2003,Garret Anderson,ANA,0.461538,0.965812,0.859712,0.818339,0.737069,0.500000,...,0.471910,0.255639,0.365169,0.507246,0.523810,0.000000,0.480851,0.506608,0.8,0.029412
2571,2,2004,Garret Anderson,ANA,0.500000,0.564103,0.507194,0.475779,0.443966,0.400943,...,0.494382,0.218045,0.297753,0.608696,0.448980,0.000000,0.531915,0.585903,-0.2,0.029412
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1913,23667,2021,Wander Franco,TBR,0.038462,0.205128,0.217626,0.186851,0.219828,0.179245,...,0.617978,0.390977,0.421348,0.608696,0.394558,0.409015,0.391489,0.352423,1.2,0.911765
5868,24618,2021,Ryan Jeffers,MIN,0.192308,0.333333,0.192446,0.160900,0.099138,0.070755,...,0.415730,0.315789,0.376404,0.347826,0.619048,0.265442,0.514894,0.788546,1.0,0.558824
7015,24655,2021,Owen Miller,CLE,0.192308,0.119658,0.055755,0.003460,0.038793,0.066038,...,0.584270,0.593985,0.331461,0.681159,0.394558,0.230384,0.548936,0.700441,0.5,0.264706
4881,26197,2021,Andrew Vaughn,CHW,0.153846,0.692308,0.462230,0.465398,0.293103,0.226415,...,0.505618,0.526316,0.331461,0.507246,0.530612,0.535893,0.570213,0.651982,0.5,0.205882


In [18]:
sfs.fit(batting[selected_columns], batting["Next_WAR"])

In [19]:
predictors = list(selected_columns[sfs.get_support()])

In [20]:
def backtest(data, model, predictors, start=5, step=1):
    all_predictions = []
    
    years = sorted(data["Season"].unique())
    
    for i in range (start, len(years), step):
        current_year = years[i]
        
        train = data[data["Season"] < current_year]
        test = data[data["Season"] == current_year]
        
        model.fit(train[predictors], train["Next_WAR"])
        
        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)
        combined = pd.concat([test["Next_WAR"], preds], axis=1)
        combined.columns = ["actual", "prediction"]
        
        all_predictions.append(combined)
    return pd.concat(all_predictions)

In [21]:
predictions = backtest(batting, rr, predictors)

In [22]:
predictions

Unnamed: 0,actual,prediction
5001,1.2,1.482739
1923,1.4,0.777661
3109,-0.1,0.561445
5785,0.6,0.902726
1104,4.8,2.262996
...,...,...
1913,1.2,2.730220
5868,1.0,1.933971
7015,0.5,1.534304
4881,0.5,1.700277


In [23]:
from sklearn.metrics import mean_squared_error

mean_squared_error(predictions["actual"], predictions["prediction"])

2.7749952533272704

In [24]:
batting["Next_WAR"].describe()

count    5567.000000
mean        1.789581
std         1.991347
min        -3.400000
25%         0.300000
50%         1.500000
75%         2.900000
max        11.900000
Name: Next_WAR, dtype: float64

In [25]:
2.7749952533272704 ** .5

1.6658317001808047

In [26]:
def player_history(df):
    df = df.sort_values("Season")
    
    df["player_season"] = range(0, df.shape[0])
    df["war_cor"] = list(df[["player_season", "WAR"]].expanding().corr().loc[(slice(None),"player_season"), "WAR"])
    df["war_cor"].fillna(1, inplace=True)
    
    df["war_diff"] = df["WAR"] / df["WAR"].shift(1)
    df["war_diff"].fillna(1, inplace=True)
    df["war_diff"][df["war_diff"] == np.inf] = 1
    
    return df 

batting = batting.groupby("IDfg", group_keys=False).apply(player_history)

In [27]:
def group_averages(df):
    return df["WAR"] / df["WAR"].mean()

In [28]:
batting["war_season"] = batting.groupby("Season", group_keys=False).apply(group_averages)

In [31]:
new_predictors = predictors + ["player_season", "war_season", "war_diff"]

In [32]:
predictions = backtest(batting, rr, new_predictors)

In [33]:
mean_squared_error(predictions["actual"], predictions["prediction"]) 

2.681607871919152

In [34]:
pd.Series(rr.coef_, index=new_predictors).sort_values()

Age             -2.629118
WAR             -2.010158
BABIP           -1.643434
Soft%+          -1.211024
SLG+            -1.177148
SwStr%          -1.106554
BU              -1.021704
Z-Contact%      -0.747314
PH              -0.736138
SO              -0.685603
war_diff        -0.612651
wGDP            -0.485367
CB%             -0.265569
Pull%+          -0.253032
OBP+            -0.192741
player_season    0.007839
IFH%             0.422227
Oppo%            0.631053
Spd              0.716638
SB               1.074867
IBB              1.962066
Hard%+           2.284398
war_season       3.451180
dtype: float64