<a href="https://colab.research.google.com/github/wasifsomji/NBA-Game-Predictor/blob/main/NBA_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Location of the raw game log CSV in the project's GitHub repository.
nba_data_url = 'https://raw.githubusercontent.com/wasifsomji/NBA-Game-Predictor/main/nba_games.csv'

data = pd.read_csv(nba_data_url)

# Discard the leftover "Unnamed: 0" column when the CSV carries one.
if 'Unnamed: 0' in data.columns:
    data = data.drop(columns='Unnamed: 0')

# Order the games by date and renumber the index from zero.
ds = data.sort_values("date").reset_index(drop=True)

# Remove the extra minutes-played columns (team and opponent) and the
# opponent index column; a missing column still raises KeyError.
for unwanted in ("mp.1", "mp_opp.1", "index_opp"):
    del ds[unwanted]

def add_target(team):
  """Return ``team`` with a ``target`` column holding the next row's ``won``.

  The function is meant to be applied to one team's rows at a time (in
  chronological order), so that ``target`` on a given row is the result of
  that team's following game. The last row has no following game and gets a
  missing value.

  Args:
    team: DataFrame with a "won" column.

  Returns:
    A new DataFrame equal to ``team`` plus the "target" column. The input
    frame is left untouched: mutating the group object that
    ``groupby(...).apply`` hands in is discouraged by pandas and can give
    surprising results.
  """
  # shift(-1) pulls each value up one row, i.e. looks one game into the future.
  return team.assign(target=team["won"].shift(-1))

# Label every game with the value add_target derives for it, one team at a time.
ds = ds.groupby("team", group_keys = False).apply(add_target)

#Focusing on Phoenix Suns
ds[ds["team"] == "PHO"]

# Rows without a target are marked with the placeholder value 2.
# A single .loc assignment is used on purpose: the chained form
# ds["target"][mask] = 2 assigns through an intermediate object, triggers
# SettingWithCopyWarning, and under pandas copy-on-write leaves ds unchanged.
ds.loc[pd.isnull(ds["target"]), "target"] = 2

# Best-effort conversion to integers; on failure the column is left as is.
ds["target"] = ds["target"].astype(int, errors="ignore")

# Remove every column that contains at least one missing value.
nulls = pd.isnull(ds)

# Per-column count of missing values ...
nulls = nulls.sum()

# ... keeping only the columns that actually have some.
nulls = nulls[nulls > 0]

# The cleaned DataFrame `df` is built here from the fully populated columns.
nonnull_columns = ds.columns[~ds.columns.isin(nulls.index)]

df = ds[nonnull_columns].copy()

#done cleaning data

# Machine Learning

# feature selection (various columns)

from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier

# Ridge classifier; it is the estimator inside the feature selector below and
# is also the model passed to backtest() later in the notebook.
rr = RidgeClassifier(alpha=1)
# Cross-validation splitter with 3 splits, used by the selector.
split = TimeSeriesSplit(n_splits=3)

# Forward sequential selection that keeps 30 features, scored with `rr`
# under the `split` cross-validation scheme.
sfs = SequentialFeatureSelector(rr, n_features_to_select=30, direction="forward", cv=split)

# Scale the candidate feature columns with MinMaxScaler (0-1 range by default).

# Columns that are never offered to the model as features.
removed_col = ["season", "date", "won", "target", "team", "team_opp"]

# Every remaining column of df is a candidate feature.
selected_columns = df.columns[~df.columns.isin(removed_col)]

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
# Overwrites the candidate columns of df in place with their scaled values.
# NOTE(review): the scaler is fitted on all rows of df, including seasons that
# backtest() later uses as test data - confirm this is intended.
df[selected_columns] = scaler.fit_transform(df[selected_columns])

# NOTE(review): the selector is fitted on every row of df, which still
# includes rows whose target is the placeholder 2 - confirm this is intended.
sfs.fit(df[selected_columns], df["target"])

# Names of the columns the selector kept.
predictors = list(selected_columns[sfs.get_support()])

def backtest(data, model, predictors, start=2, step=1):
  """Evaluate ``model`` season by season, training only on earlier seasons.

  Seasons are read from ``data["season"]`` and processed in sorted order.
  For each evaluated season the model is refit on all rows from strictly
  earlier seasons and then predicts that season's rows.

  Args:
    data: DataFrame with a "season" column, a "target" column and the
      columns named in ``predictors``.
    model: Object exposing ``fit(X, y)`` and ``predict(X)``; it is refit
      (and therefore modified) on every iteration.
    predictors: Column names used as model inputs.
    start: Position in the sorted season list of the first season to
      predict; earlier seasons are only ever used for training.
    step: Stride between evaluated seasons (1 evaluates every season from
      ``start`` onwards).

  Returns:
    DataFrame with columns "actual" and "prediction", indexed like the
    evaluated rows of ``data``. When no season qualifies (fewer than
    ``start + 1`` distinct seasons) an empty frame with those two columns
    is returned instead of letting ``pd.concat`` fail on an empty list.
  """
  seasons = sorted(data["season"].unique())
  all_predictions = []

  # Honour `step`: the parameter was previously accepted but ignored.
  for i in range(start, len(seasons), step):
    season = seasons[i]

    # Train strictly on the past, test on the current season.
    train = data[data["season"] < season]
    test = data[data["season"] == season]

    model.fit(train[predictors], train["target"])

    # Keep the test rows' index so predictions line up with the actual values.
    preds = pd.Series(model.predict(test[predictors]), index=test.index)

    combined = pd.concat([test["target"], preds], axis=1)
    combined.columns = ["actual", "prediction"]
    all_predictions.append(combined)

  if not all_predictions:
    return pd.DataFrame(columns=["actual", "prediction"])
  return pd.concat(all_predictions)

# Season-by-season evaluation of the ridge model on the selected predictors.
predictions = backtest(df, rr, predictors)

from sklearn.metrics import accuracy_score

# Rows carrying the placeholder target 2 are left out of the scoring.
has_outcome = predictions["actual"] != 2
predictions = predictions[has_outcome]

# Share of predictions that match the actual target.
accuracy_score(y_true=predictions["actual"], y_pred=predictions["prediction"])

# ~54.85% accuracy

# Team (Home) wins > Team(Away) wins
# use this to further train model

# Fraction of rows with won == 1 for each value of "home".
df.groupby("home").apply(lambda games: (games["won"] == 1).sum() / len(games))

# Build extra columns from the existing DataFrame using rolling averages.
# The inputs are the candidate feature columns plus the result column and the
# two columns the rolling computation is later grouped by.
rolling_inputs = [*selected_columns, "won", "team", "season"]
df_rolling = df[rolling_inputs]

def find_team_averages(team, window=10):
  """Return rolling means of the numeric columns of ``team``.

  Each output row is the mean of that row and the ``window - 1`` rows above
  it; the first ``window - 1`` rows are therefore missing.

  Args:
    team: DataFrame of games, one row per game, in the order the window
      should slide over.
    window: Number of rows in each rolling window (10 by default).

  Returns:
    DataFrame with the same index as ``team`` containing only its numeric
    and boolean columns, each replaced by its rolling mean.
  """
  # Text columns such as the team name cannot be averaged; recent pandas
  # raises instead of silently dropping them, so select explicitly.
  numeric = team.select_dtypes(include=["number", "bool"])
  return numeric.rolling(window).mean()

# Rolling averages are computed separately for every (team, season) pair.
grouped_games = df_rolling.groupby(["team", "season"], group_keys=False)
df_rolling = grouped_games.apply(find_team_averages)

# Rename the rolling columns so they do not clash with the regular ones.
# NOTE(review): the suffix is "_12" while find_team_averages uses a 10-row
# window - confirm which one is intended.
rolling_cols = [f"{col}_12" for col in df_rolling.columns]
df_rolling.columns = rolling_cols

# Put the rolling columns next to the regular ones, then drop every row that
# has a missing value.
df = pd.concat([df, df_rolling], axis=1).dropna()

# give the model information about the next game in advance so it can make better judgments
def shift_col(team, col_name):
  """Return column ``col_name`` of ``team`` moved up by one row.

  Every position receives the value from the row below it; the final
  position has no row below and becomes missing.
  """
  return team[col_name].shift(periods=-1)

def add_col(df, col_name):
  """Return, for every row, the value ``col_name`` takes in the same team's next row.

  Rows are grouped by the "team" column and the column is shifted up by one
  position inside each group, so the last row of every team gets a missing
  value.

  Args:
    df: DataFrame with a "team" column and a column named ``col_name``.
    col_name: Name of the column to look ahead in.

  Returns:
    Series aligned with ``df``'s index.
  """
  # The built-in grouped shift always yields a Series aligned with df.
  # groupby(...).apply(...) with a Series-returning function collapses into a
  # wide DataFrame when all groups share one index (e.g. a single team).
  return df.groupby("team")[col_name].shift(-1)

# For every row, record details of the same team's following game.
df["home_next"] = add_col(df, "home")
df["team_opp_next"] = add_col(df, "team_opp")
df["date_next"] = add_col(df, "date")

# Fresh copy of the frame after the column insertions above.
df = df.copy()

# Self-merge: pair each team's row with the row of its upcoming opponent
# (the row whose own next opponent is this team on the same next date), which
# brings in the opponent's rolling columns with a "_y" suffix.
full = df.merge(df[rolling_cols + ["team_opp_next", "date_next", "team"]], left_on=["team", "date_next"], right_on=["team_opp_next", "date_next"])

full[["team_x", "team_opp_next_x", "team_y", "team_opp_next_y", "date_next"]]

# We can only pass numeric data types to our model. Select the columns to
# drop by "not numeric/boolean" rather than by dtype == "object", so text
# columns stored with pandas' dedicated string dtype are excluded as well.
removed_columns = list(full.select_dtypes(exclude=["number", "bool"]).columns) + removed_col

selected_columns = full.columns[~full.columns.isin(removed_columns)]

# Re-run feature selection and the backtest on the enriched frame.
sfs.fit(full[selected_columns], full["target"])

predictors = list(selected_columns[sfs.get_support()])

predictions = backtest(full, rr, predictors)

accuracy_score(predictions["actual"], predictions["prediction"])

































A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ds["target"][pd.isnull(ds["target"])] = 2
  rolling = team.rolling(10).mean() # groups row with 10 above, finds average performance through last 10 games
