In [43]:
import pandas as pd

## (1) Import Data

In [44]:
df = pd.read_csv("nba_data_up_to.csv", index_col=0)
df

Unnamed: 0,mp,mp.1,fg,fga,fg%,3p,3pa,3p%,ft,fta,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
0,240.0,240.0,39.0,81.0,0.481,6.0,20.0,0.300,14.0,18.0,...,22.8,29.0,178.0,111.0,DAL,95,1,2016,2015-12-09,True
1,240.0,240.0,36.0,100.0,0.360,7.0,31.0,0.226,16.0,19.0,...,50.0,32.6,152.0,111.0,ATL,98,0,2016,2015-12-09,False
2,240.0,240.0,37.0,85.0,0.435,8.0,19.0,0.421,17.0,23.0,...,20.0,30.9,148.0,116.0,SAS,107,1,2018,2017-10-18,False
3,240.0,240.0,41.0,89.0,0.461,8.0,21.0,0.381,17.0,19.0,...,28.6,30.9,138.0,118.0,MIN,99,0,2018,2017-10-18,True
4,240.0,240.0,27.0,86.0,0.314,6.0,26.0,0.231,15.0,20.0,...,16.8,30.9,157.0,90.0,MEM,92,1,2021,2021-04-30,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17767,240.0,240.0,35.0,81.0,0.432,11.0,26.0,0.423,27.0,36.0,...,34.2,33.7,160.0,118.0,OKC,92,0,2019,2018-10-19,True
17768,240.0,240.0,37.0,74.0,0.500,13.0,25.0,0.520,26.0,37.0,...,25.0,30.0,139.0,129.0,ORL,108,1,2017,2016-12-14,True
17769,240.0,240.0,42.0,89.0,0.472,14.0,33.0,0.424,10.0,20.0,...,25.6,29.9,175.0,126.0,LAC,113,0,2017,2016-12-14,False
17770,240.0,240.0,41.0,85.0,0.482,9.0,26.0,0.346,26.0,30.0,...,27.7,27.1,150.0,126.0,MIA,106,1,2020,2020-09-19,True


## (2) Data Cleaning

In [45]:
# Sort the DataFrame by date
df = df.sort_values("date")

In [46]:
# Change index to another set of values
df = df.reset_index(drop=True)
df

Unnamed: 0,mp,mp.1,fg,fga,fg%,3p,3pa,3p%,ft,fta,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
0,240.0,240.0,35.0,83.0,0.422,6.0,18.0,0.333,19.0,27.0,...,69.4,43.7,206.0,104.0,GSW,111,1,2016,2015-10-27,False
1,240.0,240.0,38.0,94.0,0.404,9.0,29.0,0.310,10.0,17.0,...,53.2,34.6,162.0,104.0,CHI,97,1,2016,2015-10-27,False
2,240.0,240.0,37.0,87.0,0.425,7.0,19.0,0.368,16.0,23.0,...,30.4,29.0,138.0,105.0,CLE,95,0,2016,2015-10-27,True
3,240.0,240.0,41.0,96.0,0.427,9.0,30.0,0.300,20.0,22.0,...,37.5,38.9,201.0,120.0,NOP,95,0,2016,2015-10-27,True
4,240.0,240.0,37.0,82.0,0.451,8.0,27.0,0.296,12.0,15.0,...,33.3,23.6,132.0,104.0,DET,106,0,2016,2015-10-27,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17767,240.0,240.0,34.0,85.0,0.400,15.0,38.0,0.395,14.0,19.0,...,42.9,36.3,133.0,112.0,GSW,107,0,2022,2022-06-10,False
17768,240.0,240.0,41.0,88.0,0.466,9.0,40.0,0.225,13.0,15.0,...,45.0,94.4,300.0,112.0,BOS,94,0,2022,2022-06-13,True
17769,240.0,240.0,31.0,75.0,0.413,11.0,32.0,0.344,21.0,31.0,...,33.3,36.2,222.0,107.0,GSW,104,1,2022,2022-06-13,False
17770,240.0,240.0,34.0,80.0,0.425,11.0,28.0,0.393,11.0,12.0,...,33.3,31.5,186.0,111.0,GSW,103,0,2022,2022-06-16,False


In [47]:
# Clean data - remove duplicate columns
del df["mp.1"]
del df["mp_opp.1"]
del df["index_opp"]

In [48]:
# Create target that tells us how the team did in their next game (win or loss)
def team_target(team):
    team["target"] = team["won"].shift(-1) # Pulls "won" column from next game and pull it back one row
    return team

# Split df into one df per team to specify it for the team_target function
df = df.groupby("team", group_keys=False).apply(team_target)

In [49]:
# Take rows from DataFrame where the team code is Golden State Warriors
df[df["team"] == "GSW"]

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
3,240.0,41.0,96.0,0.427,9.0,30.0,0.300,20.0,22.0,0.909,...,38.9,201.0,120.0,NOP,95,0,2016,2015-10-27,True,True
44,240.0,43.0,93.0,0.462,9.0,26.0,0.346,17.0,25.0,0.680,...,37.5,151.0,118.0,HOU,92,1,2016,2015-10-30,True,True
67,240.0,46.0,84.0,0.548,17.0,30.0,0.567,25.0,35.0,0.714,...,36.1,218.0,131.0,NOP,120,1,2016,2015-10-31,True,True
98,240.0,43.0,84.0,0.512,11.0,25.0,0.440,22.0,30.0,0.733,...,44.3,106.0,126.0,MEM,69,0,2016,2015-11-02,True,True
137,240.0,39.0,85.0,0.459,10.0,26.0,0.385,24.0,31.0,0.774,...,32.9,250.0,122.0,LAC,108,0,2016,2015-11-04,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17762,240.0,39.0,86.0,0.453,15.0,37.0,0.405,14.0,20.0,0.700,...,35.2,300.0,117.0,BOS,88,0,2022,2022-06-05,True,False
17764,240.0,36.0,78.0,0.462,15.0,40.0,0.375,13.0,15.0,0.867,...,28.8,175.0,117.0,BOS,116,1,2022,2022-06-08,False,True
17766,240.0,40.0,91.0,0.440,15.0,43.0,0.349,12.0,15.0,0.800,...,32.4,205.0,120.0,BOS,97,1,2022,2022-06-10,True,True
17768,240.0,41.0,88.0,0.466,9.0,40.0,0.225,13.0,15.0,0.867,...,94.4,300.0,112.0,BOS,94,0,2022,2022-06-13,True,True


In [50]:
# Next steps: use other columns to predict Target

In [51]:
# Extract null values in target column and set value to (2)
df["target"][pd.isnull(df["target"])] = 2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["target"][pd.isnull(df["target"])] = 2


In [52]:
# Rename target values: True (1), False (0)
df["target"] = df["target"].astype(int, errors="ignore")
df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
0,240.0,35.0,83.0,0.422,6.0,18.0,0.333,19.0,27.0,0.704,...,43.7,206.0,104.0,GSW,111,1,2016,2015-10-27,False,0
1,240.0,38.0,94.0,0.404,9.0,29.0,0.310,10.0,17.0,0.588,...,34.6,162.0,104.0,CHI,97,1,2016,2015-10-27,False,1
2,240.0,37.0,87.0,0.425,7.0,19.0,0.368,16.0,23.0,0.696,...,29.0,138.0,105.0,CLE,95,0,2016,2015-10-27,True,1
3,240.0,41.0,96.0,0.427,9.0,30.0,0.300,20.0,22.0,0.909,...,38.9,201.0,120.0,NOP,95,0,2016,2015-10-27,True,1
4,240.0,37.0,82.0,0.451,8.0,27.0,0.296,12.0,15.0,0.800,...,23.6,132.0,104.0,DET,106,0,2016,2015-10-27,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17767,240.0,34.0,85.0,0.400,15.0,38.0,0.395,14.0,19.0,0.737,...,36.3,133.0,112.0,GSW,107,0,2022,2022-06-10,False,0
17768,240.0,41.0,88.0,0.466,9.0,40.0,0.225,13.0,15.0,0.867,...,94.4,300.0,112.0,BOS,94,0,2022,2022-06-13,True,1
17769,240.0,31.0,75.0,0.413,11.0,32.0,0.344,21.0,31.0,0.677,...,36.2,222.0,107.0,GSW,104,1,2022,2022-06-13,False,0
17770,240.0,34.0,80.0,0.425,11.0,28.0,0.393,11.0,12.0,0.917,...,31.5,186.0,111.0,GSW,103,0,2022,2022-06-16,False,2


In [53]:
# Check to see if target columns are balanced
df["target"].value_counts()

1    8872
0    8870
2      30
Name: target, dtype: int64

In [54]:
# Check to see if won columns are balanced
df["won"].value_counts()

False    8886
True     8886
Name: won, dtype: int64

In [55]:
# Next steps: remove or replace any null values for ML training

In [56]:
# Extract all non-null columns
missing = pd.isnull(df)
missing = missing.sum()
missing = missing[missing > 0]

nonempty_columns = df.columns[~df.columns.isin(missing.index)]
nonempty_columns

Index(['mp', 'fg', 'fga', 'fg%', '3p', '3pa', '3p%', 'ft', 'fta', 'ft%',
       ...
       'usg%_max_opp', 'ortg_max_opp', 'drtg_max_opp', 'team_opp', 'total_opp',
       'home_opp', 'season', 'date', 'won', 'target'],
      dtype='object', length=142)

In [57]:
# Set exisitng DataFrame to a copy of the original without null values
df = df[nonempty_columns].copy()
df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
0,240.0,35.0,83.0,0.422,6.0,18.0,0.333,19.0,27.0,0.704,...,43.7,206.0,104.0,GSW,111,1,2016,2015-10-27,False,0
1,240.0,38.0,94.0,0.404,9.0,29.0,0.310,10.0,17.0,0.588,...,34.6,162.0,104.0,CHI,97,1,2016,2015-10-27,False,1
2,240.0,37.0,87.0,0.425,7.0,19.0,0.368,16.0,23.0,0.696,...,29.0,138.0,105.0,CLE,95,0,2016,2015-10-27,True,1
3,240.0,41.0,96.0,0.427,9.0,30.0,0.300,20.0,22.0,0.909,...,38.9,201.0,120.0,NOP,95,0,2016,2015-10-27,True,1
4,240.0,37.0,82.0,0.451,8.0,27.0,0.296,12.0,15.0,0.800,...,23.6,132.0,104.0,DET,106,0,2016,2015-10-27,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17767,240.0,34.0,85.0,0.400,15.0,38.0,0.395,14.0,19.0,0.737,...,36.3,133.0,112.0,GSW,107,0,2022,2022-06-10,False,0
17768,240.0,41.0,88.0,0.466,9.0,40.0,0.225,13.0,15.0,0.867,...,94.4,300.0,112.0,BOS,94,0,2022,2022-06-13,True,1
17769,240.0,31.0,75.0,0.413,11.0,32.0,0.344,21.0,31.0,0.677,...,36.2,222.0,107.0,GSW,104,1,2022,2022-06-13,False,0
17770,240.0,34.0,80.0,0.425,11.0,28.0,0.393,11.0,12.0,0.917,...,31.5,186.0,111.0,GSW,103,0,2022,2022-06-16,False,2


## (3) Feature Selection

In [58]:
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [59]:
# Import feature selector
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier

# Ridge regression for classification: lose or win?
ridge_regression = RidgeClassifier(alpha=1)
data_split = TimeSeriesSplit(n_splits=3)

# Train the model using different sets of features: forward 
sequential_feature_selector = SequentialFeatureSelector(ridge_regression, n_features_to_select=30, direction="forward", cv=data_split)

In [60]:
# Scale columns in data for ridge regression

In [61]:
# Extract columns in the data that we don't want to scale
removed_model_cols = ["won", "target", "team", "date", "season", "team_opp"]

In [62]:
# Columns in the data that we do want to scale
scale_columns = df.columns[~df.columns.isin(removed_model_cols)]

In [63]:
from sklearn.preprocessing import MinMaxScaler

# This scaler will rescale values to fall between 0 and 1 for better Ridge Regression
scaler = MinMaxScaler()

# Use scaler to scale selected columns
df[scale_columns] = scaler.fit_transform(df[scale_columns])
df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
0,0.0,0.363636,0.338235,0.366029,0.206897,0.212121,0.395487,0.418605,0.412698,0.654609,...,0.277279,0.554502,0.317647,GSW,0.451923,1.0,2016,2015-10-27,False,0
1,0.0,0.431818,0.500000,0.322967,0.310345,0.378788,0.368171,0.209302,0.253968,0.519253,...,0.160462,0.345972,0.317647,CHI,0.317308,1.0,2016,2015-10-27,False,1
2,0.0,0.409091,0.397059,0.373206,0.241379,0.227273,0.437055,0.348837,0.349206,0.645274,...,0.088575,0.232227,0.329412,CLE,0.298077,0.0,2016,2015-10-27,True,1
3,0.0,0.500000,0.529412,0.377990,0.310345,0.393939,0.356295,0.441860,0.333333,0.893816,...,0.215661,0.530806,0.505882,NOP,0.298077,0.0,2016,2015-10-27,True,1
4,0.0,0.409091,0.323529,0.435407,0.275862,0.348485,0.351544,0.255814,0.222222,0.766628,...,0.019255,0.203791,0.317647,DET,0.403846,0.0,2016,2015-10-27,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17767,0.0,0.340909,0.367647,0.313397,0.517241,0.515152,0.469121,0.302326,0.285714,0.693116,...,0.182285,0.208531,0.411765,GSW,0.413462,0.0,2022,2022-06-10,False,0
17768,0.0,0.500000,0.411765,0.471292,0.310345,0.545455,0.267221,0.279070,0.222222,0.844807,...,0.928113,1.000000,0.411765,BOS,0.288462,0.0,2022,2022-06-13,True,1
17769,0.0,0.272727,0.220588,0.344498,0.379310,0.424242,0.408551,0.465116,0.476190,0.623104,...,0.181001,0.630332,0.352941,GSW,0.384615,1.0,2022,2022-06-13,False,0
17770,0.0,0.340909,0.294118,0.373206,0.379310,0.363636,0.466746,0.232558,0.174603,0.903151,...,0.120668,0.459716,0.400000,GSW,0.375000,0.0,2022,2022-06-16,False,2


In [64]:
sequential_feature_selector.fit(df[scale_columns], df["target"])

SequentialFeatureSelector(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
                          estimator=RidgeClassifier(alpha=1),
                          n_features_to_select=30)

In [65]:
# Derive list of predictor columns (believed to influence the dependent variable: win/loss)
predictor_columns = list(scale_columns[sequential_feature_selector.get_support()])
predictor_columns 

['mp',
 'fg%',
 '3p%',
 'orb',
 'ts%',
 'usg%',
 '3p%_max',
 'ft_max',
 'fta_max',
 '+/-_max',
 'drb%_max',
 'trb%_max',
 'tov%_max',
 'usg%_max',
 'mp_opp',
 'fg_opp',
 '3p_opp',
 'ft%_opp',
 'blk_opp',
 'usg%_opp',
 'fga_max_opp',
 '3p_max_opp',
 'ft_max_opp',
 'ft%_max_opp',
 'blk_max_opp',
 'pf_max_opp',
 'pts_max_opp',
 'drb%_max_opp',
 'blk%_max_opp',
 'usg%_max_opp']

## (4) Training Predictor Model

In [66]:
# In this backtesting, we are splitting the data up by season and using the past seasons to predict future seasons (train/test data)
def backtest(data, model, predictors, start=2, step=1):
    prediction_dfs = [] # Each DataFrame within List[dfs] is the predictions for a singular season
    
    all_seasons = sorted(data["season"].unique())
    
    for i in range(start, len(all_seasons), step):
        curr_season = all_seasons[i]
        
        train = data[data["season"] < curr_season] # Training data is made up of all data before the current season
        test = data[data["season"] == curr_season]
        
        model.fit(train[predictors], train["target"])
        
        test_prediction = model.predict(test[predictors]) # Make predictions on test set
        test_prediction = pd.Series(test_prediction, index=test.index)
        
        # Concatenate actual values and prediction values
        combined_vals = pd.concat([test["target"], test_prediction], axis=1)
        combined_vals.columns = ["actual", "prediction"]
        
        prediction_dfs.append(combined_vals)
    return pd.concat(prediction_dfs)

In [67]:
derived_predictions = backtest(df, ridge_regression, predictor_columns)
derived_predictions

Unnamed: 0,actual,prediction
5250,1,1
5251,1,1
5252,0,0
5253,1,0
5254,0,1
...,...,...
17767,0,0
17768,1,1
17769,0,1
17770,2,1


In [68]:
from sklearn.metrics import accuracy_score

accuracy_score(derived_predictions["actual"], derived_predictions["prediction"])

0.5471969333972209

In [69]:
# Can remove (2) values for better accuracy
derived_predictions = derived_predictions[derived_predictions["actual"] != 2]
accuracy_score(derived_predictions["actual"], derived_predictions["prediction"])

0.5485110470701249

In [70]:
# Set baseline accuracy for model (what is considered good accuracy)
df.groupby("home").apply(lambda x: x[x["won"] == 1].shape[0] / x.shape[0])

# Find winning percentage for @ home and away

home
0.0    0.428314
1.0    0.571686
dtype: float64

In [71]:
df

# Note: as of now, prediction is only based on current game, which is not a proper indicator

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
0,0.0,0.363636,0.338235,0.366029,0.206897,0.212121,0.395487,0.418605,0.412698,0.654609,...,0.277279,0.554502,0.317647,GSW,0.451923,1.0,2016,2015-10-27,False,0
1,0.0,0.431818,0.500000,0.322967,0.310345,0.378788,0.368171,0.209302,0.253968,0.519253,...,0.160462,0.345972,0.317647,CHI,0.317308,1.0,2016,2015-10-27,False,1
2,0.0,0.409091,0.397059,0.373206,0.241379,0.227273,0.437055,0.348837,0.349206,0.645274,...,0.088575,0.232227,0.329412,CLE,0.298077,0.0,2016,2015-10-27,True,1
3,0.0,0.500000,0.529412,0.377990,0.310345,0.393939,0.356295,0.441860,0.333333,0.893816,...,0.215661,0.530806,0.505882,NOP,0.298077,0.0,2016,2015-10-27,True,1
4,0.0,0.409091,0.323529,0.435407,0.275862,0.348485,0.351544,0.255814,0.222222,0.766628,...,0.019255,0.203791,0.317647,DET,0.403846,0.0,2016,2015-10-27,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17767,0.0,0.340909,0.367647,0.313397,0.517241,0.515152,0.469121,0.302326,0.285714,0.693116,...,0.182285,0.208531,0.411765,GSW,0.413462,0.0,2022,2022-06-10,False,0
17768,0.0,0.500000,0.411765,0.471292,0.310345,0.545455,0.267221,0.279070,0.222222,0.844807,...,0.928113,1.000000,0.411765,BOS,0.288462,0.0,2022,2022-06-13,True,1
17769,0.0,0.272727,0.220588,0.344498,0.379310,0.424242,0.408551,0.465116,0.476190,0.623104,...,0.181001,0.630332,0.352941,GSW,0.384615,1.0,2022,2022-06-13,False,0
17770,0.0,0.340909,0.294118,0.373206,0.379310,0.363636,0.466746,0.232558,0.174603,0.903151,...,0.120668,0.459716,0.400000,GSW,0.375000,0.0,2022,2022-06-16,False,2


In [72]:
# Use average of last 8 games instead
df_avgs = df[list(scale_columns) + ["won", "team", "season"]]

In [73]:
df_avgs

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,blk%_max_opp,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,total_opp,home_opp,won,team,season
0,0.0,0.363636,0.338235,0.366029,0.206897,0.212121,0.395487,0.418605,0.412698,0.654609,...,0.079,0.679245,0.277279,0.554502,0.317647,0.451923,1.0,False,NOP,2016
1,0.0,0.431818,0.500000,0.322967,0.310345,0.378788,0.368171,0.209302,0.253968,0.519253,...,0.140,0.509434,0.160462,0.345972,0.317647,0.317308,1.0,False,CLE,2016
2,0.0,0.409091,0.397059,0.373206,0.241379,0.227273,0.437055,0.348837,0.349206,0.645274,...,0.185,0.270440,0.088575,0.232227,0.329412,0.298077,0.0,True,CHI,2016
3,0.0,0.500000,0.529412,0.377990,0.310345,0.393939,0.356295,0.441860,0.333333,0.893816,...,0.063,0.344864,0.215661,0.530806,0.505882,0.298077,0.0,True,GSW,2016
4,0.0,0.409091,0.323529,0.435407,0.275862,0.348485,0.351544,0.255814,0.222222,0.766628,...,0.047,0.300839,0.019255,0.203791,0.317647,0.403846,0.0,False,ATL,2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17767,0.0,0.340909,0.367647,0.313397,0.517241,0.515152,0.469121,0.302326,0.285714,0.693116,...,0.103,0.401468,0.182285,0.208531,0.411765,0.413462,0.0,False,BOS,2022
17768,0.0,0.500000,0.411765,0.471292,0.310345,0.545455,0.267221,0.279070,0.222222,0.844807,...,0.124,0.423480,0.928113,1.000000,0.411765,0.288462,0.0,True,GSW,2022
17769,0.0,0.272727,0.220588,0.344498,0.379310,0.424242,0.408551,0.465116,0.476190,0.623104,...,0.076,0.300839,0.181001,0.630332,0.352941,0.384615,1.0,False,BOS,2022
17770,0.0,0.340909,0.294118,0.373206,0.379310,0.363636,0.466746,0.232558,0.174603,0.903151,...,0.063,0.300839,0.120668,0.459716,0.400000,0.375000,0.0,False,BOS,2022


In [74]:
# Function to find team averages for previous 8 games
def team_avgs(team):
    rolling = team.rolling(8).mean()
    return rolling

df_avgs = df_avgs.groupby(["team", "season"], group_keys=False).apply(team_avgs)

  rolling = team.rolling(8).mean()


In [75]:
df_avgs

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,stl%_max_opp,blk%_max_opp,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,total_opp,home_opp,won,season
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17767,0.0,0.366477,0.303309,0.397727,0.448276,0.475379,0.430226,0.421512,0.365079,0.751896,...,0.054500,0.121500,0.528826,0.177471,0.438981,0.497059,0.335337,0.500,0.625,2022.0
17768,0.0,0.474432,0.375000,0.472488,0.465517,0.509470,0.425475,0.319767,0.275794,0.781359,...,0.068500,0.117250,0.383648,0.324615,0.719194,0.550000,0.399038,0.500,0.625,2022.0
17769,0.0,0.366477,0.297794,0.402512,0.461207,0.471591,0.446407,0.389535,0.351190,0.727830,...,0.055875,0.118250,0.528826,0.185494,0.499408,0.472059,0.361779,0.625,0.500,2022.0
17770,0.0,0.369318,0.314338,0.390550,0.465517,0.462121,0.459768,0.372093,0.329365,0.753792,...,0.056875,0.122000,0.441431,0.182606,0.491114,0.470588,0.389423,0.500,0.375,2022.0


In [76]:
# Remove missing values (first 5 games don't have average to calculate)
rolling_columns = [f"{col}_5" for col in df_avgs.columns]
df_avgs.columns = rolling_columns

df = pd.concat([df, df_avgs], axis=1)
df = df.dropna()
df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,stl%_max_opp_5,blk%_max_opp_5,tov%_max_opp_5,usg%_max_opp_5,ortg_max_opp_5,drtg_max_opp_5,total_opp_5,home_opp_5,won_5,season_5
175,0.0,0.545455,0.470588,0.473684,0.448276,0.439394,0.467933,0.325581,0.222222,1.000000,...,0.060500,0.074625,0.404219,0.119544,0.387441,0.451471,0.329327,0.500,0.875,2016.0
197,0.0,0.340909,0.352941,0.325359,0.241379,0.227273,0.437055,0.279070,0.285714,0.631272,...,0.065250,0.112875,0.464885,0.145539,0.434242,0.401471,0.448317,0.250,0.125,2016.0
198,0.0,0.272727,0.264706,0.306220,0.344828,0.439394,0.359857,0.255814,0.253968,0.656943,...,0.071625,0.066500,0.295466,0.143453,0.377962,0.386765,0.361779,0.625,0.375,2016.0
199,0.0,0.340909,0.250000,0.413876,0.379310,0.318182,0.522565,0.395349,0.333333,0.787631,...,0.063500,0.087875,0.546777,0.148909,0.280213,0.369118,0.306490,0.375,0.500,2016.0
200,0.0,0.568182,0.382353,0.581340,0.448276,0.409091,0.497625,0.116279,0.095238,0.833139,...,0.069375,0.079000,0.426101,0.135751,0.406398,0.467647,0.342548,0.500,0.875,2016.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17767,0.0,0.340909,0.367647,0.313397,0.517241,0.515152,0.469121,0.302326,0.285714,0.693116,...,0.054500,0.121500,0.528826,0.177471,0.438981,0.497059,0.335337,0.500,0.625,2022.0
17768,0.0,0.500000,0.411765,0.471292,0.310345,0.545455,0.267221,0.279070,0.222222,0.844807,...,0.068500,0.117250,0.383648,0.324615,0.719194,0.550000,0.399038,0.500,0.625,2022.0
17769,0.0,0.272727,0.220588,0.344498,0.379310,0.424242,0.408551,0.465116,0.476190,0.623104,...,0.055875,0.118250,0.528826,0.185494,0.499408,0.472059,0.361779,0.625,0.500,2022.0
17770,0.0,0.340909,0.294118,0.373206,0.379310,0.363636,0.466746,0.232558,0.174603,0.903151,...,0.056875,0.122000,0.441431,0.182606,0.491114,0.470588,0.389423,0.500,0.375,2022.0


In [77]:
# Provide more information to model for improved accuracy (home/away, opponent)
def shift_column(team, col_name):
    next_column = team[col_name].shift(-1)
    return next_column

def add_column(df, col_name):
    return df.groupby("team", group_keys=False).apply(lambda x: shift_column(x, col_name))

# Look at home column, find if next game will be home or away, and put into current row 
df["home_next"] = add_column(df, "home")
df["date_next"] = add_column(df, "date")
df["team_opponent_next"] = add_column(df, "team_opp")
df = df.copy()
df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp_5,ortg_max_opp_5,drtg_max_opp_5,total_opp_5,home_opp_5,won_5,season_5,home_next,date_next,team_opponent_next
175,0.0,0.545455,0.470588,0.473684,0.448276,0.439394,0.467933,0.325581,0.222222,1.000000,...,0.119544,0.387441,0.451471,0.329327,0.500,0.875,2016.0,1.0,2015-11-09,MIN
197,0.0,0.340909,0.352941,0.325359,0.241379,0.227273,0.437055,0.279070,0.285714,0.631272,...,0.145539,0.434242,0.401471,0.448317,0.250,0.125,2016.0,1.0,2015-11-11,DET
198,0.0,0.272727,0.264706,0.306220,0.344828,0.439394,0.359857,0.255814,0.253968,0.656943,...,0.143453,0.377962,0.386765,0.361779,0.625,0.375,2016.0,1.0,2015-11-11,LAL
199,0.0,0.340909,0.250000,0.413876,0.379310,0.318182,0.522565,0.395349,0.333333,0.787631,...,0.148909,0.280213,0.369118,0.306490,0.375,0.500,2016.0,0.0,2015-11-11,BOS
200,0.0,0.568182,0.382353,0.581340,0.448276,0.409091,0.497625,0.116279,0.095238,0.833139,...,0.135751,0.406398,0.467647,0.342548,0.500,0.875,2016.0,1.0,2015-11-11,NOP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17767,0.0,0.340909,0.367647,0.313397,0.517241,0.515152,0.469121,0.302326,0.285714,0.693116,...,0.177471,0.438981,0.497059,0.335337,0.500,0.625,2022.0,0.0,2022-06-13,GSW
17768,0.0,0.500000,0.411765,0.471292,0.310345,0.545455,0.267221,0.279070,0.222222,0.844807,...,0.324615,0.719194,0.550000,0.399038,0.500,0.625,2022.0,0.0,2022-06-16,BOS
17769,0.0,0.272727,0.220588,0.344498,0.379310,0.424242,0.408551,0.465116,0.476190,0.623104,...,0.185494,0.499408,0.472059,0.361779,0.625,0.500,2022.0,1.0,2022-06-16,GSW
17770,0.0,0.340909,0.294118,0.373206,0.379310,0.363636,0.466746,0.232558,0.174603,0.903151,...,0.182606,0.491114,0.470588,0.389423,0.500,0.375,2022.0,,,


In [78]:
# Pull Stats about opponent for model refining
# Inner JOIN - team we're trying to predict and their next opponent based on their next game
full = df.merge(df[rolling_columns + ["team_opponent_next", "date_next", "team"]], 
                left_on=["team", "date_next"], right_on=["team_opponent_next", "date_next"])
full[["team_x", "team_opponent_next_x", "team_y", "team_opponent_next_y", "date_next"]]

Unnamed: 0,team_x,team_opponent_next_x,team_y,team_opponent_next_y,date_next
0,CHI,CHO,CHO,CHI,2015-11-13
1,MEM,GSW,GSW,MEM,2015-11-11
2,GSW,MEM,MEM,GSW,2015-11-11
3,CLE,NYK,NYK,CLE,2015-11-13
4,OKC,PHI,PHI,OKC,2015-11-13
...,...,...,...,...,...
16197,BOS,GSW,GSW,BOS,2022-06-10
16198,GSW,BOS,BOS,GSW,2022-06-13
16199,BOS,GSW,GSW,BOS,2022-06-13
16200,GSW,BOS,BOS,GSW,2022-06-16


In [79]:
# Run through sequential feature selector to find the 30 predictors
# Remove non-numeric columns that invalidate ML model training
removed_model_cols = list(full.columns[full.dtypes == "object"]) + removed_model_cols
removed_model_cols

['team_x',
 'team_opp',
 'date',
 'date_next',
 'team_opponent_next_x',
 'team_opponent_next_y',
 'team_y',
 'won',
 'target',
 'team',
 'date',
 'season',
 'team_opp']

In [80]:
scale_columns = full.columns[~full.columns.isin(removed_model_cols)]
sequential_feature_selector.fit(full[scale_columns], full["target"])

SequentialFeatureSelector(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
                          estimator=RidgeClassifier(alpha=1),
                          n_features_to_select=30)

In [81]:
predictors = list(scale_columns[sequential_feature_selector.get_support()])
predictors

['ft%',
 'trb%',
 'usg%',
 'fg_max',
 '+/-_max',
 'trb%_max',
 '3p%_opp',
 'trb%_opp',
 'usg%_opp',
 'ortg_max_opp',
 'ts%_5_x',
 'efg%_5_x',
 'usg%_5_x',
 'pts_max_5_x',
 '+/-_max_5_x',
 'ft%_opp_5_x',
 'ast%_opp_5_x',
 'usg%_opp_5_x',
 'trb_max_opp_5_x',
 'blk%_max_opp_5_x',
 'home_next',
 'orb%_5_y',
 'usg%_5_y',
 'ast_max_5_y',
 '+/-_max_5_y',
 'efg%_max_5_y',
 'fta_opp_5_y',
 'drb%_opp_5_y',
 'usg%_opp_5_y',
 'ortg_max_opp_5_y']

In [82]:
predictions = backtest(full, ridge_regression, predictors)

In [83]:
accuracy_score(predictions["actual"], predictions["prediction"])

0.6299219503639393

In [84]:
# Next Steps:
# - Improve Accuracy: more powerful than ridge_regression: xgboost, random forest classifier
    # different feature numbers, backwards feature selector (usually better)