In [None]:
# Import pandas library for data manipulation
import pandas as pd

In [None]:
# Load NBA games dataset from CSV file
df = pd.read_csv("nba_games.csv", index_col=0)

In [102]:
df

Unnamed: 0,mp,mp.1,fg,fga,fg%,3p,3pa,3p%,ft,fta,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
0,240.0,240.0,38.0,72.0,0.528,16.0,37.0,0.432,18.0,21.0,...,26.2,26.8,155.0,123.0,MIA,107,1,2023,2022-11-07,True
1,240.0,240.0,40.0,84.0,0.476,14.0,39.0,0.359,13.0,15.0,...,41.0,37.3,160.0,121.0,POR,110,0,2023,2022-11-07,False
2,240.0,240.0,41.0,78.0,0.526,8.0,24.0,0.333,15.0,19.0,...,28.6,41.1,250.0,125.0,DAL,90,1,2023,2022-12-14,True
3,240.0,240.0,29.0,74.0,0.392,13.0,38.0,0.342,19.0,26.0,...,12.6,33.0,183.0,110.0,CLE,105,0,2023,2022-12-14,False
4,240.0,240.0,37.0,87.0,0.425,7.0,33.0,0.212,32.0,35.0,...,20.0,32.3,226.0,117.0,TOR,126,1,2023,2022-12-07,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20437,240.0,240.0,33.0,85.0,0.388,12.0,44.0,0.273,28.0,34.0,...,51.5,36.2,141.0,114.0,BOS,117,0,2020,2020-09-19,False
20438,240.0,240.0,43.0,91.0,0.473,17.0,45.0,0.378,23.0,32.0,...,53.2,31.1,225.0,130.0,UTA,124,1,2025,2024-10-23,True
20439,240.0,240.0,40.0,85.0,0.471,10.0,26.0,0.385,34.0,45.0,...,22.5,34.2,258.0,125.0,MEM,126,0,2025,2024-10-23,False
20440,240.0,240.0,42.0,82.0,0.512,12.0,27.0,0.444,12.0,19.0,...,53.2,28.0,146.0,126.0,TOR,121,1,2024,2023-11-24,False


In [None]:
# Sort games by date to ensure chronological order for time series analysis
df = df.sort_values("date")

In [None]:
# Reset index after sorting to have clean sequential indices
df = df.reset_index(drop=True)

In [None]:
# Drop the redundant columns that are not needed for the analysis
for redundant_col in ("mp.1", "mp_opp.1", "index_opp"):
    del df[redundant_col]

In [None]:
# Create the target variable: whether the team won its NEXT game.
# The "won" column is shifted by -1 within each team, so every row is labelled
# with the outcome of that team's following game (NaN when there is none).
def add_target(group):
    """Return a copy of one team's games with a ``target`` column added.

    Args:
        group: DataFrame holding a single team's games in chronological
            order; must contain a ``won`` column.

    Returns:
        A copy of ``group`` in which ``target`` holds the ``won`` value of the
        following row (missing on the last row).
    """
    group = group.copy()  # do not mutate the frame handed in by the caller
    group["target"] = group["won"].shift(-1)
    return group

# Same per-team shift as add_target, done column-wise on the groupby instead
# of groupby.apply over the whole frame: row order is untouched and the
# "team" grouping column is guaranteed to stay in df.
df["target"] = df.groupby("team")["won"].shift(-1)

In [107]:
df[df["team"] == "WAS"]

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
17,240.0,42.0,97.0,0.433,6.0,22.0,0.273,30.0,38.0,0.789,...,31.2,148.0,125.0,PHI,115,0,2018,2017-10-18,True,True
43,240.0,40.0,75.0,0.533,6.0,17.0,0.353,29.0,35.0,0.829,...,43.6,215.0,122.0,DET,111,0,2018,2017-10-20,True,True
83,240.0,37.0,84.0,0.440,9.0,30.0,0.300,26.0,28.0,0.929,...,35.1,272.0,115.0,DEN,104,1,2018,2017-10-23,True,False
127,265.0,39.0,95.0,0.411,6.0,26.0,0.231,15.0,23.0,0.652,...,32.9,129.0,98.0,LAL,102,1,2018,2017-10-25,False,False
139,240.0,43.0,97.0,0.443,15.0,33.0,0.455,16.0,22.0,0.727,...,30.9,210.0,122.0,GSW,120,1,2018,2017-10-27,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20139,240.0,35.0,86.0,0.407,12.0,45.0,0.267,8.0,10.0,0.800,...,26.9,208.0,112.0,BOS,124,1,2025,2025-04-06,False,False
20161,240.0,38.0,91.0,0.418,11.0,39.0,0.282,11.0,15.0,0.733,...,26.6,145.0,112.0,IND,104,1,2025,2025-04-08,False,False
20180,240.0,34.0,87.0,0.391,10.0,33.0,0.303,25.0,30.0,0.833,...,31.5,165.0,109.0,PHI,122,0,2025,2025-04-09,False,False
20227,240.0,35.0,91.0,0.385,10.0,41.0,0.244,9.0,12.0,0.750,...,31.2,261.0,97.0,CHI,119,1,2025,2025-04-11,False,True


In [None]:
# Handle missing target values: the last game of each team in the dataset has
# no following game (the shift above is grouped by team only, not by season).
# Mark those rows with the placeholder class 2, then convert target to int.
df.loc[df["target"].isnull(), "target"] = 2
# No errors="ignore" here: if the conversion fails it should raise instead of
# silently leaving the column un-converted.
df["target"] = df["target"].astype(int)

In [109]:
df["won"].value_counts()

won
True     10221
False    10221
Name: count, dtype: int64

In [110]:
df["target"].value_counts()

target
1    10206
0    10206
2       30
Name: count, dtype: int64

In [None]:
# Find columns with null/missing values
nulls = pd.isnull(df).sum()

In [None]:
# Keep only columns that have missing values
nulls = nulls[nulls > 0]

In [None]:
# Get list of valid columns (those without any null values)
valid_columns = df.columns[~df.columns.isin(nulls.index)]

In [114]:
valid_columns

Index(['mp', 'fg', 'fga', 'fg%', '3p', '3pa', '3p%', 'ft', 'fta', 'orb',
       ...
       'usg%_max_opp', 'ortg_max_opp', 'drtg_max_opp', 'team_opp', 'total_opp',
       'home_opp', 'season', 'date', 'won', 'target'],
      dtype='object', length=138)

In [None]:
# Keep only valid columns (remove columns with missing values)
df = df[valid_columns].copy()

In [116]:
df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,orb,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
0,240.0,38.0,83.0,0.458,5.0,22.0,0.227,21.0,25.0,9.0,...,29.9,129.0,112.0,BOS,99,0,2018,2017-10-17,True,1
1,240.0,47.0,97.0,0.485,15.0,41.0,0.366,13.0,19.0,10.0,...,32.6,250.0,127.0,GSW,121,1,2018,2017-10-17,True,1
2,240.0,36.0,88.0,0.409,8.0,32.0,0.250,19.0,25.0,9.0,...,27.3,138.0,107.0,CLE,102,1,2018,2017-10-17,False,0
3,240.0,43.0,80.0,0.538,16.0,30.0,0.533,19.0,21.0,6.0,...,31.2,152.0,126.0,HOU,122,0,2018,2017-10-17,False,1
4,240.0,38.0,86.0,0.442,17.0,45.0,0.378,18.0,21.0,9.0,...,35.6,279.0,124.0,ATL,117,0,2018,2017-10-18,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20437,240.0,40.0,94.0,0.426,14.0,32.0,0.438,26.0,32.0,19.0,...,32.2,230.0,136.0,IND,109,0,2025,2025-06-16,True,0
20438,240.0,31.0,74.0,0.419,8.0,30.0,0.267,21.0,26.0,4.0,...,28.4,177.0,102.0,IND,108,1,2025,2025-06-19,False,1
20439,240.0,38.0,92.0,0.413,15.0,42.0,0.357,17.0,25.0,11.0,...,44.1,186.0,114.0,OKC,91,0,2025,2025-06-19,True,0
20440,240.0,35.0,87.0,0.402,11.0,40.0,0.275,22.0,31.0,13.0,...,42.5,134.0,123.0,IND,91,0,2025,2025-06-22,True,2


In [None]:
# Import machine learning tools for feature selection and model training
from sklearn.linear_model import RidgeClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit

# Ridge classifier with regularization strength alpha=1
rr = RidgeClassifier(alpha=1)

# Cross-validation splitter for time-ordered data, 3 splits
split = TimeSeriesSplit(n_splits=3)

# Forward sequential feature selection: start with no features and add them
# one at a time until 30 are selected, evaluating with the splitter above
sfs = SequentialFeatureSelector(
    rr,
    n_features_to_select=30,
    direction="forward",
    cv=split,
    n_jobs=1,
)

In [None]:
# Columns that must not be fed to the model: identifiers, dates and the
# outcome/label columns
removed_columns = ["season", "date", "won", "target", "team", "team_opp"]
selected_columns = df.columns[[col not in removed_columns for col in df.columns]]

In [None]:
# Rescale every feature column to the 0-1 range with MinMaxScaler so that
# features measured on different scales are comparable.
# NOTE(review): the scaler is fitted on all rows of df, including the seasons
# that backtest() later uses as test data - consider fitting it on training
# seasons only.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(df[selected_columns])
df[selected_columns] = scaler.transform(df[selected_columns])

In [120]:
df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,orb,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
0,0.0,0.386364,0.327869,0.450000,0.142857,0.200000,0.217557,0.477273,0.390625,0.310345,...,0.101282,0.149254,0.376344,BOS,0.293578,0.0,2018,2017-10-17,True,1
1,0.0,0.590909,0.557377,0.514286,0.500000,0.516667,0.394402,0.295455,0.296875,0.344828,...,0.135897,0.751244,0.537634,GSW,0.495413,1.0,2018,2017-10-17,True,1
2,0.0,0.340909,0.409836,0.333333,0.250000,0.366667,0.246819,0.431818,0.390625,0.310345,...,0.067949,0.194030,0.322581,CLE,0.321101,1.0,2018,2017-10-17,False,0
3,0.0,0.500000,0.278689,0.640476,0.535714,0.333333,0.606870,0.431818,0.328125,0.206897,...,0.117949,0.263682,0.526882,HOU,0.504587,0.0,2018,2017-10-17,False,1
4,0.0,0.386364,0.377049,0.411905,0.571429,0.583333,0.409669,0.409091,0.328125,0.310345,...,0.174359,0.895522,0.505376,ATL,0.458716,0.0,2018,2017-10-18,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20437,0.0,0.431818,0.508197,0.373810,0.464286,0.366667,0.486005,0.590909,0.500000,0.655172,...,0.130769,0.651741,0.634409,IND,0.385321,0.0,2025,2025-06-16,True,0
20438,0.0,0.227273,0.180328,0.357143,0.250000,0.333333,0.268448,0.477273,0.406250,0.137931,...,0.082051,0.388060,0.268817,IND,0.376147,1.0,2025,2025-06-19,False,1
20439,0.0,0.386364,0.475410,0.342857,0.500000,0.533333,0.382952,0.386364,0.390625,0.379310,...,0.283333,0.432836,0.397849,OKC,0.220183,0.0,2025,2025-06-19,True,0
20440,0.0,0.318182,0.393443,0.316667,0.357143,0.500000,0.278626,0.500000,0.484375,0.448276,...,0.262821,0.174129,0.494624,IND,0.220183,0.0,2025,2025-06-22,True,2


In [None]:
# Run feature selection to identify the 30 most predictive features
sfs.fit(df[selected_columns], df["target"])

In [None]:
# Get the list of selected features (30 best predictors)
predictors = list(selected_columns[sfs.get_support()])

In [123]:
predictors

['drb',
 'stl',
 'tov',
 'orb%',
 'stl%',
 'tov%',
 'usg%',
 'fta_max',
 'trb_max',
 'gmsc_max',
 'ftr_max',
 'blk%_max',
 'drtg_max',
 'fg%_opp',
 '3p_opp',
 '3pa_opp',
 '3p%_opp',
 'tov_opp',
 'pts_opp',
 'ts%_opp',
 'drb%_opp',
 'usg%_opp',
 'ft_max_opp',
 'trb_max_opp',
 'efg%_max_opp',
 'drb%_max_opp',
 'ast%_max_opp',
 'tov%_max_opp',
 'usg%_max_opp',
 'total_opp']

In [None]:
# Backtesting function: simulates real-world predictions over time.
# Trains on past seasons and predicts the season that follows them.
def backtest(data, model, predictors, start=2, step=1):
    """Walk forward through the seasons, training on the past and predicting the present.

    Args:
        data: DataFrame with a "season" column, a "target" column and every
            column named in ``predictors``.
        model: estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
            re-fitted in place for every tested season.
        predictors: list of feature column names passed to the model.
        start: position (in the sorted list of seasons) of the first season
            to predict; earlier seasons are only ever used for training.
        step: stride between tested seasons.

    Returns:
        DataFrame indexed like the tested rows of ``data`` with the columns
        "actual" (the target) and "prediction" (the model output).

    Raises:
        ValueError: if there is no season at position ``start`` or later,
            i.e. nothing to predict.
    """
    all_predictions = []

    seasons = sorted(data["season"].unique())

    # Loop through seasons: train on strictly earlier seasons, test on this one
    for i in range(start, len(seasons), step):
        season = seasons[i]
        train = data[data["season"] < season]
        test = data[data["season"] == season]

        model.fit(train[predictors], train["target"])

        combined = pd.DataFrame(
            {
                "actual": test["target"],
                "prediction": model.predict(test[predictors]),
            },
            index=test.index,
        )
        all_predictions.append(combined)

    if not all_predictions:
        # pd.concat([]) would fail with an opaque "No objects to concatenate"
        raise ValueError(
            f"backtest found no season to predict: data has {len(seasons)} "
            f"season(s) but start={start}"
        )
    return pd.concat(all_predictions)

In [None]:
# Run backtest to get predictions for all seasons
predictions = backtest(df, rr, predictors)

In [None]:
# Calculate accuracy: share of correct predictions.
# Rows whose actual value is the placeholder class 2 (no next game exists)
# have no real outcome to compare against, so they are left out of the score.
from sklearn.metrics import accuracy_score

scored = predictions[predictions["actual"] != 2]
accuracy_score(scored["actual"], scored["prediction"])

In [None]:
# Analyze home court advantage: win rate for home vs away games.
# "won" is boolean, so its mean per group is the fraction of games won; this
# avoids running groupby.apply over the whole frame.
df.groupby("home")["won"].mean()

In [128]:
df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,orb,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
0,0.0,0.386364,0.327869,0.450000,0.142857,0.200000,0.217557,0.477273,0.390625,0.310345,...,0.101282,0.149254,0.376344,BOS,0.293578,0.0,2018,2017-10-17,True,1
1,0.0,0.590909,0.557377,0.514286,0.500000,0.516667,0.394402,0.295455,0.296875,0.344828,...,0.135897,0.751244,0.537634,GSW,0.495413,1.0,2018,2017-10-17,True,1
2,0.0,0.340909,0.409836,0.333333,0.250000,0.366667,0.246819,0.431818,0.390625,0.310345,...,0.067949,0.194030,0.322581,CLE,0.321101,1.0,2018,2017-10-17,False,0
3,0.0,0.500000,0.278689,0.640476,0.535714,0.333333,0.606870,0.431818,0.328125,0.206897,...,0.117949,0.263682,0.526882,HOU,0.504587,0.0,2018,2017-10-17,False,1
4,0.0,0.386364,0.377049,0.411905,0.571429,0.583333,0.409669,0.409091,0.328125,0.310345,...,0.174359,0.895522,0.505376,ATL,0.458716,0.0,2018,2017-10-18,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20437,0.0,0.431818,0.508197,0.373810,0.464286,0.366667,0.486005,0.590909,0.500000,0.655172,...,0.130769,0.651741,0.634409,IND,0.385321,0.0,2025,2025-06-16,True,0
20438,0.0,0.227273,0.180328,0.357143,0.250000,0.333333,0.268448,0.477273,0.406250,0.137931,...,0.082051,0.388060,0.268817,IND,0.376147,1.0,2025,2025-06-19,False,1
20439,0.0,0.386364,0.475410,0.342857,0.500000,0.533333,0.382952,0.386364,0.390625,0.379310,...,0.283333,0.432836,0.397849,OKC,0.220183,0.0,2025,2025-06-19,True,0
20440,0.0,0.318182,0.393443,0.316667,0.357143,0.500000,0.278626,0.500000,0.484375,0.448276,...,0.262821,0.174129,0.494624,IND,0.220183,0.0,2025,2025-06-22,True,2


In [None]:
# Build the frame used for rolling averages: the feature columns plus the
# result column and the team/season keys
df_rolling = df[[*selected_columns, "won", "team", "season"]]

In [None]:
# Calculate rolling averages for each team over its last 10 games of a season.
# This captures recent team performance trends.
def find_team_averages(team, window=10):
    """Rolling mean of the feature columns for one team-season group.

    Args:
        team: DataFrame with the games of a single group, in chronological
            order; must contain every column in ``selected_columns``.
        window: number of consecutive games averaged (current game included).

    Returns:
        DataFrame indexed like ``team`` holding the rolling means; rows that
        do not yet have ``window`` games behind them are NaN.
    """
    return team[selected_columns].rolling(window).mean()

# Select the feature columns on the groupby itself so that apply never
# operates on the "team"/"season" grouping columns.
df_rolling = (
    df_rolling
    .groupby(["team", "season"], group_keys=False)[list(selected_columns)]
    .apply(find_team_averages)
)

In [134]:
df_rolling

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,orb,...,trb%_max_opp,ast%_max_opp,stl%_max_opp,blk%_max_opp,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,total_opp,home_opp
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20437,0.0,0.450000,0.396721,0.475952,0.378571,0.343333,0.410178,0.509091,0.418750,0.379310,...,0.113377,0.481850,0.0579,0.0841,0.405660,0.137821,0.454726,0.523656,0.395413,0.4
20438,0.0,0.427273,0.383607,0.456667,0.367857,0.358333,0.377481,0.509091,0.418750,0.362069,...,0.118969,0.439344,0.0625,0.0867,0.471174,0.119359,0.479104,0.498925,0.413761,0.5
20439,0.0,0.395455,0.322951,0.462619,0.403571,0.390000,0.390458,0.484091,0.421875,0.303448,...,0.180592,0.318852,0.0641,0.1075,0.315933,0.202821,0.541791,0.461290,0.401835,0.4
20440,0.0,0.404545,0.378689,0.433333,0.375000,0.370000,0.377735,0.515909,0.429688,0.375862,...,0.123904,0.439578,0.0641,0.0861,0.492034,0.132308,0.472637,0.493548,0.402752,0.5


In [135]:
# Tag every rolling-average column with a "_10" suffix, then attach the
# rolling frame to df column-wise (rows are matched on the index)
rolling_cols = list(map("{}_10".format, df_rolling.columns))
df_rolling.columns = rolling_cols
df = pd.concat([df, df_rolling], axis="columns")

In [136]:
df = df.dropna()

In [137]:
df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,orb,...,trb%_max_opp_10,ast%_max_opp_10,stl%_max_opp_10,blk%_max_opp_10,tov%_max_opp_10,usg%_max_opp_10,ortg_max_opp_10,drtg_max_opp_10,total_opp_10,home_opp_10
248,0.0,0.477273,0.426230,0.483333,0.535714,0.616667,0.361323,0.431818,0.359375,0.310345,...,0.168421,0.307260,0.0593,0.1114,0.327883,0.131667,0.336816,0.438710,0.326606,0.7
251,0.0,0.295455,0.409836,0.278571,0.285714,0.400000,0.265903,0.386364,0.343750,0.310345,...,0.169737,0.288407,0.0594,0.1126,0.388889,0.192436,0.381592,0.364516,0.370642,0.4
260,0.0,0.431818,0.393443,0.454762,0.178571,0.233333,0.246819,0.500000,0.421875,0.310345,...,0.225548,0.290164,0.0512,0.1056,0.350105,0.159487,0.437313,0.427957,0.360550,0.5
263,0.0,0.431818,0.262295,0.564286,0.428571,0.233333,0.618321,0.340909,0.343750,0.275862,...,0.186184,0.343794,0.0529,0.0751,0.429245,0.164231,0.322388,0.440860,0.313761,0.5
265,0.0,0.659091,0.459016,0.666667,0.607143,0.500000,0.501272,0.204545,0.265625,0.241379,...,0.194079,0.271429,0.0534,0.0627,0.293187,0.127821,0.392040,0.553763,0.406422,0.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20437,0.0,0.431818,0.508197,0.373810,0.464286,0.366667,0.486005,0.590909,0.500000,0.655172,...,0.113377,0.481850,0.0579,0.0841,0.405660,0.137821,0.454726,0.523656,0.395413,0.4
20438,0.0,0.227273,0.180328,0.357143,0.250000,0.333333,0.268448,0.477273,0.406250,0.137931,...,0.118969,0.439344,0.0625,0.0867,0.471174,0.119359,0.479104,0.498925,0.413761,0.5
20439,0.0,0.386364,0.475410,0.342857,0.500000,0.533333,0.382952,0.386364,0.390625,0.379310,...,0.180592,0.318852,0.0641,0.1075,0.315933,0.202821,0.541791,0.461290,0.401835,0.4
20440,0.0,0.318182,0.393443,0.316667,0.357143,0.500000,0.278626,0.500000,0.484375,0.448276,...,0.123904,0.439578,0.0641,0.0861,0.492034,0.132308,0.472637,0.493548,0.402752,0.5


In [138]:
def shift_col(team, col_name):
    """Return ``team[col_name]`` shifted up by one row.

    Args:
        team: DataFrame with one team's games, in chronological order.
        col_name: name of the column to shift.

    Returns:
        Series indexed like ``team`` where each row holds the value of the
        following row; the last row is missing.
    """
    return team[col_name].shift(-1)


def add_col(df, col_name):
    """For every row, fetch ``col_name`` from the same team's next row.

    Uses a column-wise ``groupby(...).shift`` instead of ``groupby.apply``:
    it always returns a Series aligned to ``df.index`` (apply returns a wide
    DataFrame when all groups share one index, e.g. a single team) and does
    not operate on the grouping column.

    Args:
        df: DataFrame with a "team" column and the column ``col_name``, with
            each team's rows in chronological order.
        col_name: name of the column to look ahead in.

    Returns:
        Series indexed like ``df``; missing on the last row of each team.
    """
    return df.groupby("team")[col_name].shift(-1)

df["home_next"] = add_col(df, "home")
df["team_opp_next"] = add_col(df, "team_opp")
df["date_next"] = add_col(df, "date")

  return df.groupby("team", group_keys=False).apply(lambda x: shift_col(x, col_name))
  return df.groupby("team", group_keys=False).apply(lambda x: shift_col(x, col_name))
  return df.groupby("team", group_keys=False).apply(lambda x: shift_col(x, col_name))


In [139]:
df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,orb,...,blk%_max_opp_10,tov%_max_opp_10,usg%_max_opp_10,ortg_max_opp_10,drtg_max_opp_10,total_opp_10,home_opp_10,home_next,team_opp_next,date_next
248,0.0,0.477273,0.426230,0.483333,0.535714,0.616667,0.361323,0.431818,0.359375,0.310345,...,0.1114,0.327883,0.131667,0.336816,0.438710,0.326606,0.7,1.0,UTA,2017-11-05
251,0.0,0.295455,0.409836,0.278571,0.285714,0.400000,0.265903,0.386364,0.343750,0.310345,...,0.1126,0.388889,0.192436,0.381592,0.364516,0.370642,0.4,0.0,MIN,2017-11-04
260,0.0,0.431818,0.393443,0.454762,0.178571,0.233333,0.246819,0.500000,0.421875,0.310345,...,0.1056,0.350105,0.159487,0.437313,0.427957,0.360550,0.5,1.0,BRK,2017-11-07
263,0.0,0.431818,0.262295,0.564286,0.428571,0.233333,0.618321,0.340909,0.343750,0.275862,...,0.0751,0.429245,0.164231,0.322388,0.440860,0.313761,0.5,1.0,IND,2017-11-08
265,0.0,0.659091,0.459016,0.666667,0.607143,0.500000,0.501272,0.204545,0.265625,0.241379,...,0.0627,0.293187,0.127821,0.392040,0.553763,0.406422,0.6,1.0,MIA,2017-11-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20437,0.0,0.431818,0.508197,0.373810,0.464286,0.366667,0.486005,0.590909,0.500000,0.655172,...,0.0841,0.405660,0.137821,0.454726,0.523656,0.395413,0.4,0.0,IND,2025-06-19
20438,0.0,0.227273,0.180328,0.357143,0.250000,0.333333,0.268448,0.477273,0.406250,0.137931,...,0.0867,0.471174,0.119359,0.479104,0.498925,0.413761,0.5,1.0,IND,2025-06-22
20439,0.0,0.386364,0.475410,0.342857,0.500000,0.533333,0.382952,0.386364,0.390625,0.379310,...,0.1075,0.315933,0.202821,0.541791,0.461290,0.401835,0.4,0.0,OKC,2025-06-22
20440,0.0,0.318182,0.393443,0.316667,0.357143,0.500000,0.278626,0.500000,0.484375,0.448276,...,0.0861,0.492034,0.132308,0.472637,0.493548,0.402752,0.5,,,


In [141]:
# Pair each row with the row of its next opponent for the same upcoming game:
# the left side's team/date_next must equal the right side's
# team_opp_next/date_next. Only the rolling columns and keys are taken from
# the right side; overlapping names receive the default _x/_y suffixes.
full = pd.merge(
    left=df,
    right=df[[*rolling_cols, "team_opp_next", "date_next", "team"]],
    left_on=["team", "date_next"],
    right_on=["team_opp_next", "date_next"],
)


In [142]:
full

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,orb,...,stl%_max_opp_10_y,blk%_max_opp_10_y,tov%_max_opp_10_y,usg%_max_opp_10_y,ortg_max_opp_10_y,drtg_max_opp_10_y,total_opp_10_y,home_opp_10_y,team_opp_next_y,team_y
0,0.00,0.431818,0.393443,0.454762,0.178571,0.233333,0.246819,0.500000,0.421875,0.310345,...,0.0522,0.0983,0.385849,0.137564,0.314925,0.407527,0.455046,0.5,DEN,BRK
1,0.00,0.431818,0.262295,0.564286,0.428571,0.233333,0.618321,0.340909,0.343750,0.275862,...,0.0602,0.1458,0.286688,0.164872,0.421891,0.437634,0.373394,0.6,DET,IND
2,0.25,0.318182,0.393443,0.316667,0.285714,0.350000,0.297710,0.386364,0.343750,0.172414,...,0.0581,0.1447,0.291090,0.160513,0.415920,0.443011,0.386239,0.6,NOP,IND
3,0.00,0.431818,0.360656,0.480952,0.285714,0.283333,0.352417,0.409091,0.359375,0.275862,...,0.0550,0.1055,0.318029,0.133590,0.311443,0.445161,0.244954,0.6,LAL,BOS
4,0.00,0.431818,0.393443,0.454762,0.357143,0.450000,0.306616,0.250000,0.218750,0.206897,...,0.0533,0.0885,0.322327,0.184744,0.303483,0.452688,0.311009,0.4,MEM,POR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18163,0.00,0.295455,0.278689,0.371429,0.357143,0.433333,0.318066,0.568182,0.515625,0.241379,...,0.0576,0.0814,0.345807,0.129615,0.431343,0.518280,0.380734,0.4,IND,OKC
18164,0.00,0.363636,0.311475,0.433333,0.357143,0.333333,0.395674,0.545455,0.468750,0.620690,...,0.0579,0.0841,0.405660,0.137821,0.454726,0.523656,0.395413,0.4,IND,OKC
18165,0.00,0.431818,0.508197,0.373810,0.464286,0.366667,0.486005,0.590909,0.500000,0.655172,...,0.0630,0.0855,0.311635,0.194103,0.526368,0.479570,0.418349,0.5,OKC,IND
18166,0.00,0.227273,0.180328,0.357143,0.250000,0.333333,0.268448,0.477273,0.406250,0.137931,...,0.0641,0.1075,0.315933,0.202821,0.541791,0.461290,0.401835,0.4,OKC,IND


In [144]:
full[["team_x", "team_opp_next_x", "team_y", "team_opp_next_y", "date_next"]]

Unnamed: 0,team_x,team_opp_next_x,team_y,team_opp_next_y,date_next
0,DEN,BRK,BRK,DEN,2017-11-07
1,DET,IND,IND,DET,2017-11-08
2,NOP,IND,IND,NOP,2017-11-07
3,LAL,BOS,BOS,LAL,2017-11-08
4,MEM,POR,POR,MEM,2017-11-07
...,...,...,...,...,...
18163,IND,OKC,OKC,IND,2025-06-16
18164,IND,OKC,OKC,IND,2025-06-19
18165,OKC,IND,IND,OKC,2025-06-19
18166,OKC,IND,IND,OKC,2025-06-22


In [145]:
# Also exclude every object-dtype column of the merged frame from the features
removed_columns = [*full.columns[full.dtypes == "object"], *removed_columns]

In [146]:
removed_columns

['team_x',
 'team_opp',
 'date',
 'team_opp_next_x',
 'date_next',
 'team_opp_next_y',
 'team_y',
 'season',
 'date',
 'won',
 'target',
 'team',
 'team_opp']

In [147]:
selected_columns = full.columns[~full.columns.isin(removed_columns)]
sfs.fit(full[selected_columns], full["target"])

In [148]:
predictors = list(selected_columns[sfs.get_support()])

In [149]:
predictors

['fg',
 'ast%',
 'usg%',
 'ortg',
 'fg_max',
 'fga_max',
 'pts_max',
 'gmsc_max',
 'fg%_opp',
 '3p%_opp',
 'usg%_opp',
 'drtg_opp',
 'pf_max_opp',
 '3p%_10_x',
 'usg%_10_x',
 'ortg_10_x',
 '3p%_max_10_x',
 'gmsc_max_10_x',
 'usg%_opp_10_x',
 'fg_max_opp_10_x',
 'ftr_max_opp_10_x',
 'usg%_10_y',
 'ortg_10_y',
 'gmsc_max_10_y',
 'fg_opp_10_y',
 '3p%_opp_10_y',
 'usg%_opp_10_y',
 'drtg_opp_10_y',
 'blk_max_opp_10_y',
 'drtg_max_opp_10_y']

In [150]:
predictions = backtest(full, rr, predictors)

In [151]:
accuracy_score(predictions["actual"], predictions["prediction"])

0.6188463819691578