In [98]:
import pandas as pd

In [99]:
# Load the game log; the first CSV column is used as the row index
df = pd.read_csv("nba_games.csv", index_col=0)

### Clean CSV

In [100]:
# Order games chronologically and renumber the rows 0..n-1
df = df.sort_values("date").reset_index(drop=True)

In [101]:
# Remove columns that are not wanted for modelling
df = df.drop(columns=["mp.1", "mp_opp.1", "index_opp"])

#### Add Target Column to Represent Team's Next Game

In [None]:
def add_target(team):
    """Return a copy of one team's games with a ``target`` column added.

    ``target`` holds the ``won`` value of the following row (the team's next
    game); the last row has no following game and gets a missing value.

    The input frame is left untouched: mutating the group handed to
    ``groupby.apply`` is discouraged by pandas and can behave unexpectedly.
    """
    next_result = team["won"].shift(-1)  # row i receives the result stored in row i + 1
    return team.assign(target=next_result)

# Next-game result per team: shift each team's `won` values up by one row.
# This yields the same column as applying add_target to every team, without
# passing the grouping column through groupby.apply (deprecated since pandas 2.2).
df["target"] = df.groupby("team")["won"].shift(-1)

# A team's last game has no next game; mark it with 2.
# .loc is used instead of chained indexing (df["target"][mask] = 2), which may
# write to a temporary copy and does nothing under Copy-on-Write.
df.loc[df["target"].isna(), "target"] = 2
df["target"] = df["target"].astype(int, errors="ignore") # turn wins into 0's and 1's


#### Get Rid of Null Columns

In [103]:
# Number of missing values in each column, restricted to columns that have any
null_counts = df.isna().sum()
nulls = null_counts[null_counts > 0]

# Keep only the columns that are completely filled in
has_no_nulls = ~df.columns.isin(nulls.index)
valid_cols = df.columns[has_no_nulls]
df = df.loc[:, valid_cols].copy()

### ML Pipeline

In [104]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier
from sklearn.preprocessing import MinMaxScaler

# Columns that identify a game or hold its outcome rather than a statistic
remove_cols = ["season", "date", "won", "target", "team", "team_opp"]
is_stat_col = ~df.columns.isin(remove_cols)
select_cols = df.columns[is_stat_col]

# Rescale every statistic column into the range 0..1
# NOTE(review): the scaler is fit on all rows, including seasons that are later
# used as test data when backtesting — confirm this is acceptable.
scaler = MinMaxScaler()
df[select_cols] = scaler.fit_transform(df[select_cols])

# Ridge classifier plus forward feature selection: start with no features and
# add the best one at a time until 30 are chosen, scored with time-ordered splits
rr = RidgeClassifier(alpha=1)
split = TimeSeriesSplit(n_splits=3)
sfs = SequentialFeatureSelector(
    rr,
    n_features_to_select=30,
    direction="forward",
    cv=split,
)


#### Backtest Function

In [105]:
def backtest(data, model, predictors, start=2, step=1):
    """Walk forward through the seasons, training on the past and predicting the next.

    Parameters
    ----------
    data : DataFrame with a ``season`` column, a ``target`` column and the
        columns named in ``predictors``.
    model : object exposing ``fit(X, y)`` and ``predict(X)``.
    predictors : list of column names used as model inputs.
    start : position (in the sorted list of seasons) of the first season to predict.
    step : stride between predicted seasons.

    Returns
    -------
    DataFrame indexed like the tested rows of ``data`` with columns
    ``actual`` and ``prediction``. Empty (same two columns) when there is no
    season at or after position ``start``.
    """
    # Seasons come from the frame that was passed in, not from a notebook-level
    # global, so the function works on any frame (e.g. the merged `full` frame).
    seasons = sorted(data["season"].unique())

    all_predictions = []  # one frame of predictions per tested season
    for i in range(start, len(seasons), step):
        season = seasons[i]
        train = data[data["season"] < season]   # every earlier season
        test = data[data["season"] == season]   # the season being predicted

        model.fit(train[predictors], train["target"])

        # model.predict returns a plain array; re-attach the test index so the
        # predictions line up with the actual targets
        predictions = pd.Series(model.predict(test[predictors]), index=test.index)

        # side-by-side frame with two columns: true target and predicted target
        combined = pd.concat([test["target"], predictions], axis=1)
        combined.columns = ["actual", "prediction"]

        all_predictions.append(combined)

    if not all_predictions:
        # pd.concat raises on an empty list; return an empty result instead
        return pd.DataFrame(columns=["actual", "prediction"])
    return pd.concat(all_predictions)

#### Get Rolling Averages For Each Team

In [108]:
def get_team_averages(team, window=10):
    """Return the rolling mean of every column over the last ``window`` rows.

    Each output row averages that row and the ``window - 1`` rows before it,
    so the first ``window - 1`` rows are NaN. ``window`` defaults to 10,
    which is the behaviour existing callers rely on.
    """
    return team.rolling(window).mean()

# `selected_features` must exist before this cell runs, but the only other place
# it is assigned is the later feature-selection cell that itself depends on the
# rolling columns built here. Select a baseline feature set from the per-game
# statistics first so the notebook runs top to bottom on a fresh kernel.
sfs.fit(df[select_cols], df["target"])
selected_features = list(select_cols[sfs.get_support()])

df_rolling = df[list(selected_features) + ["won", "team", "season"]]

# group by team and season so the rolling window never spans two teams or two seasons
df_rolling = df_rolling.groupby(["team", "season"], group_keys=False).apply(get_team_averages, include_groups=False)
rolling_cols = [f"{col}_10" for col in df_rolling.columns] # rename rolling columns
df_rolling.columns = rolling_cols # apply new col names

df = pd.concat([df, df_rolling], axis=1) # combine into main df
df = df.dropna() # drop rows with missing values (the first 9 games of each team-season have no full 10-game window)

#### Get The Next Opposing Team's Stats And Merge Into Full DF

In [109]:
# look one row ahead: each row receives the value stored in the row below it
def shift_col(team, col_name):
    """Return ``team[col_name]`` moved up by one row (the last row becomes missing)."""
    column = team[col_name]
    return column.shift(periods=-1)

# return a Series that holds, for every row, `col_name` from the same team's next row
def add_col(df, col_name):
    """Shift ``col_name`` up by one row within each team.

    Parameters
    ----------
    df : DataFrame with a ``team`` column and the column ``col_name``.
    col_name : name of the column to look ahead in.

    Returns
    -------
    Series indexed like ``df``; each team's last row is missing because it
    has no following row.
    """
    # The built-in grouped shift does the per-team look-ahead directly, without
    # a Python-level apply/lambda per group.
    return df.groupby("team")[col_name].shift(-1)

# For every game, attach three facts about the same team's following game:
# whether it is at home, who the opponent is, and when it is played
# NOTE(review): `target` was shifted before the early-season rows were dropped,
# while these columns are shifted afterwards, so around season boundaries the
# two may describe different games — confirm.
for source_col in ("home", "team_opp", "date"):
    df[f"{source_col}_next"] = add_col(df, source_col)

# Pair each team with its next opponent: a row matches when the other side's
# next opponent is this team and both sides share the same next-game date.
# After the merge, *_x columns belong to the team being predicted and *_y
# columns to the opposing team.
opp_side = df[rolling_cols + ["team_opp_next", "date_next", "team"]]
full = df.merge(
    opp_side,
    left_on=["team", "date_next"],
    right_on=["team_opp_next", "date_next"],
)


In [111]:
# Sanity check of the merge: team_x should equal team_opp_next_y and vice versa
matchup_cols = ["team_x", "team_opp_next_x", "team_y", "team_opp_next_y", "date_next"]
full[matchup_cols]

Unnamed: 0,team_x,team_opp_next_x,team_y,team_opp_next_y,date_next
0,SAC,TOR,TOR,SAC,2015-11-15
1,TOR,SAC,SAC,TOR,2015-11-15
2,DEN,NOP,NOP,DEN,2015-11-17
3,ORL,MIN,MIN,ORL,2015-11-18
4,PHI,DAL,DAL,PHI,2015-11-16
...,...,...,...,...,...
19261,UTA,NYK,NYK,UTA,2024-01-30
19262,BOS,IND,IND,BOS,2024-01-30
19263,NOP,HOU,HOU,NOP,2024-01-31
19264,DEN,OKC,OKC,DEN,2024-01-31


#### Fit Model and Select Features

In [113]:
# Object-dtype columns are excluded from the candidates, together with the
# non-statistic columns listed earlier
object_cols = [col for col, dtype in full.dtypes.items() if dtype == "object"]
remove_cols = object_cols + remove_cols
is_candidate = ~full.columns.isin(remove_cols)
select_cols = full.columns[is_candidate]

# Forward selection over the merged frame; keep the names of the chosen columns
sfs.fit(full[select_cols], full["target"])
chosen_mask = sfs.get_support()
selected_features = list(select_cols[chosen_mask])
selected_features

['mp',
 'ft%',
 'tov',
 'pf',
 'usg%',
 'trb_max',
 'gmsc_max',
 'ftr_max',
 'trb%_max',
 'tov%_max',
 'mp_opp',
 '3p_opp',
 'fta_opp',
 'stl%_opp',
 'usg%_opp',
 'pts_max_opp',
 'ftr_max_opp',
 'stl%_max_opp',
 'usg%_10_x',
 'gmsc_max_10_x',
 'ast%_opp_10_x',
 'usg%_opp_10_x',
 'ft%_max_opp_10_x',
 'won_10_x',
 'home_next',
 'usg%_10_y',
 'gmsc_max_10_y',
 'usg%_opp_10_y',
 'ft%_max_opp_10_y',
 'won_10_y']

#### Predict Games Using Backtesting

In [114]:
from sklearn.metrics import accuracy_score

# Season-by-season backtest on the merged frame using the selected features
predictions = backtest(full, rr, selected_features)

# Rows whose target is the placeholder 2 have no next game to score against
has_next_game = predictions["actual"].ne(2)
predictions = predictions.loc[has_next_game]

# Share of next-game results predicted correctly
accuracy_score(predictions["actual"], predictions["prediction"])

0.6255911178123501