# NBA Game Outcome Predictor 
### CMPE 257 Project
Authors: Kaushika Uppu, Miranda Billawala, Yun Ei Hlaing, Iris Cheung

## Imports

In [1]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import seaborn as sns

import random
from datetime import datetime, timedelta
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import mean_squared_error, accuracy_score, roc_auc_score, f1_score, r2_score, precision_score, recall_score
import itertools

## Training Data
We load the saved statistics predictions for each game produced by our two methods: a rolling window and an XGBoost model. Due to the computational cost of predicting statistics with the model, the model data only spans 2014-2025.

In [2]:
# Game-level team statistics; read below by get_val_set to pick validation dates per season.
all_stats_cleaned = pd.read_csv('all_stats_cleaned.csv')
all_stats_cleaned['GAME_DATE'] = pd.to_datetime(all_stats_cleaned['GAME_DATE'], format='ISO8601') # convert date to datetime object

all_stats_cleaned.head()

Unnamed: 0,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_DATE,HOME,OPPONENT,WIN,MIN,FGM,FGA,...,DREB,REB,AST,STL,BLK,TOV,PF,PTS,PLUS_MINUS,SEASON_YEAR
0,0,ATL,Atlanta Hawks,1986-04-12,1,IND,1,240.0,38,88,...,39,59,22,6,3,12.0,21,108,17.0,1985
1,0,ATL,Atlanta Hawks,1986-04-10,1,NJN,1,240.0,44,87,...,27,42,30,15,5,22.0,26,126,9.0,1985
2,0,ATL,Atlanta Hawks,1986-04-08,1,CHI,1,240.0,52,98,...,25,42,33,13,6,10.0,22,131,13.0,1985
3,0,ATL,Atlanta Hawks,1986-04-05,0,CHI,0,240.0,40,76,...,25,38,17,7,7,21.0,28,97,-5.0,1985
4,0,ATL,Atlanta Hawks,1986-04-04,0,WAS,0,265.0,54,100,...,28,45,24,6,7,14.0,37,129,-6.0,1985


In [3]:
# Matchup features (team ONE vs. team TWO columns) produced with the rolling-window method.
df_rolling = pd.read_csv('df_rolling.csv')
df_rolling['GAME_DATE'] = pd.to_datetime(df_rolling['GAME_DATE'])  # parse dates so they can be compared and subtracted
df_rolling.head()

Unnamed: 0,TEAM_ID_ONE,GAME_DATE,HOME_ONE,WIN_ONE,FG_PCT_ONE,FG3_PCT_ONE,FT_PCT_ONE,OREB_ONE,DREB_ONE,REB_ONE,...,BLK_TWO,TOV_TWO,PF_TWO,SEASON_YEAR,EFG%_TWO,TS%_TWO,WIN_STREAK_TWO,WIN_PERCENTAGE_TWO,ELO_TWO,WIN_LAST_TWO
0,28,1985-11-06,1,1,0.4818,0.1334,0.7716,18.6,33.0,51.6,...,7.2,18.0,30.2,1985,0.467987,0.526725,0,0.6,1502.231272,1.0
1,28,1985-11-12,1,1,0.4934,0.2,0.7922,14.6,29.4,44.0,...,5.0,19.4,20.8,1985,0.472332,0.496543,0,0.285714,1478.66721,0.0
2,28,1985-11-29,1,1,0.511,0.1,0.8744,13.6,30.8,44.4,...,6.2,18.4,31.6,1985,0.562264,0.586669,2,0.736842,1630.861942,1.0
3,28,1985-12-03,1,0,0.4938,0.1334,0.87,13.4,29.4,42.8,...,4.6,16.6,20.4,1985,0.505386,0.548772,3,0.529412,1526.169207,0.0
4,28,1985-12-05,1,1,0.4752,0.2334,0.8146,13.0,30.8,43.8,...,5.0,21.4,25.4,1985,0.514892,0.54852,0,0.5,1472.405125,0.0


In [4]:
# Matchup features (team ONE vs. team TWO columns) whose statistics were predicted by the tuned XGBoost model.
df_model = pd.read_csv('df_model_tuned.csv')
df_model['GAME_DATE'] = pd.to_datetime(df_model['GAME_DATE'])  # parse dates so they can be compared and subtracted
df_model.head()

Unnamed: 0,TEAM_ID_ONE,GAME_DATE,HOME_ONE,WIN_ONE,FG_PCT_ONE,FG3_PCT_ONE,FT_PCT_ONE,OREB_ONE,DREB_ONE,REB_ONE,...,BLK_TWO,TOV_TWO,PF_TWO,SEASON_YEAR,EFG%_TWO,TS%_TWO,WIN_STREAK_TWO,WIN_PERCENTAGE_TWO,ELO_TWO,WIN_LAST_TWO
0,10,2014-10-28,1,0,0.444152,0.35689,0.768085,7.817335,34.557854,44.871178,...,3.824581,15.420383,19.08717,2014,0.677708,0.56564,0,0.563948,1626.867122,1.0
1,22,2014-10-28,1,1,0.489515,0.384762,0.839081,9.745007,34.65654,39.966927,...,5.242719,11.993127,18.130919,2014,0.586378,0.556815,0,0.524464,1585.475243,0.0
2,3,2014-10-28,1,1,0.457504,0.34128,0.732926,13.244682,31.746922,41.1627,...,4.202436,15.582109,21.155241,2014,0.59331,0.537249,0,0.5005,1336.913853,0.0
3,19,2014-10-29,1,1,0.475021,0.381624,0.785072,10.271174,34.18862,44.21864,...,5.5549,14.85137,19.247747,2014,0.608978,0.524657,0,0.638353,1387.342422,1.0
4,1,2014-10-29,1,1,0.469322,0.372256,0.732027,12.267653,30.908613,44.791588,...,4.572797,16.491095,22.705935,2014,0.603948,0.554034,0,0.420601,1505.633151,1.0


## Test Set

In [5]:
def get_val_set (first_season, last_season, n = 1, random_state = None, stats = None) :
    """
    Sample validation game dates from the beginning, the approximate trade
    deadline and the end of every season.

    Input:
        first_season: first SEASON_YEAR to sample from (inclusive)
        last_season: SEASON_YEAR at which to stop (exclusive, as in range())
        n: number of games sampled from each of the three 4-week windows
        random_state: optional seed; when given, the sampled dates are
            reproducible. When None, pandas' default (global NumPy state) is used.
        stats: optional DataFrame with 'SEASON_YEAR' and datetime 'GAME_DATE'
            columns. Defaults to the notebook-level all_stats_cleaned.
    Output: list of sampled game dates, 3 * n per season that has data,
        ordered by season and, within a season, beginning / middle / end
    Raises: ValueError (from pandas) if a window holds fewer than n games
    """
    if stats is None :
        stats = all_stats_cleaned

    # one shared generator so the three windows do not reuse the same draws
    rng = None if random_state is None else np.random.default_rng(random_state)
    four_weeks = timedelta(weeks = 4)

    dates = []
    for season in range(first_season, last_season) :
        game_dates = stats.loc[stats['SEASON_YEAR'] == season, 'GAME_DATE']
        if game_dates.empty :
            # no games recorded for this season; nothing to sample
            continue

        start_date = game_dates.min()
        end_date = game_dates.max()

        # approximate trade deadline (after about 2/3 of the season)
        delta = round((2/3)*(end_date-start_date).days)
        approx_deadline = start_date + timedelta(days = delta)

        windows = [
            (start_date, start_date + four_weeks),            # beginning of the season
            (approx_deadline, approx_deadline + four_weeks),  # around the trade deadline
            (end_date - four_weeks, end_date),                # end of the season
        ]
        for window_start, window_end in windows :
            in_window = game_dates[game_dates.between(window_start, window_end)]
            dates.extend(in_window.sample(n, random_state = rng).tolist())

    return dates

In [6]:
# Validation seasons: start one season after the earliest one in the data and stop
# before `max - 5` (get_val_set iterates range(first_season, last_season), so
# last_season itself is excluded).
first_season = all_stats_cleaned['SEASON_YEAR'].min() + 1
last_season = all_stats_cleaned['SEASON_YEAR'].max() - 5
# NOTE(review): get_val_set samples without a seed here, so val_set changes on every run.
val_set = get_val_set(first_season, last_season)

## Model Building

In [7]:
# Number of past seasons used as training data; read by pred_by_date at call time.
time_horizon = 5

In [7]:
def get_training_set (df, date, num_seasons) :
    """
    Build the training data available before the games played on `date`.

    Input:
        df: DataFrame with 'SEASON_YEAR', datetime 'GAME_DATE' and 'WIN_ONE' columns
        date: date of the games to predict (Timestamp, datetime, numpy.datetime64 or date string)
        num_seasons: number of past seasons to include in addition to the current one
    Output: tuple (X, y) with all rows from the last num_seasons seasons and the
        current season that were played strictly before `date`, most recent first.
        X has 'WIN_ONE' and 'GAME_DATE' removed and a 'DAYS_SINCE_GAME' column added;
        y is the 'WIN_ONE' column. The input df is not modified.
    """
    # normalise so .year/.month and date arithmetic work for any date-like input
    date = pd.Timestamp(date)

    # determine season of the game (a season is labelled by the year it starts in; October is the cutoff)
    season = date.year if date.month >= 10 else date.year - 1

    # get games for training
    in_horizon = df['SEASON_YEAR'].between(season - num_seasons, season)
    data = df[in_horizon].copy()
    data['DAYS_SINCE_GAME'] = (date - data['GAME_DATE']).dt.days

    # keep only games played before the prediction date, newest first
    data = data[data['DAYS_SINCE_GAME'] > 0].sort_values(by = 'DAYS_SINCE_GAME')

    # split into X and y and only look at relevant columns
    X = data.drop(columns = ['WIN_ONE', 'GAME_DATE'])
    y = data['WIN_ONE']

    return (X,y)

def pred_by_date (df, model, date) :
    """
    Predict the outcome of all games on the given date.

    The model is (re)fitted on the rows returned by get_training_set for the
    last `time_horizon` seasons and then applied to every row of df whose
    GAME_DATE equals `date`.

    Input: feature DataFrame (with 'GAME_DATE', 'WIN_ONE', 'SEASON_YEAR'),
        an estimator exposing fit / predict / predict_proba, and the game date
    Output: tuple (predicted labels, actual 'WIN_ONE' values for that date,
        predicted probability of the positive class)
    """
    # accept any date-like value (Timestamp, numpy.datetime64, string)
    date = pd.Timestamp(date)

    # get data in relevant time frame; time_horizon = how many years in the past for training
    X, y = get_training_set(df, date, time_horizon)

    games_on_day = df[df['GAME_DATE'] == date].copy()
    # the games being predicted are "zero days ago"; keeps the columns aligned with X
    games_on_day['DAYS_SINCE_GAME'] = np.zeros(len(games_on_day))

    test = games_on_day.drop(columns = ['WIN_ONE', 'GAME_DATE'])

    model.fit(X,y)
    pred = model.predict(test)

    # y_score: predicted probability of the positive class (column 1 of predict_proba)
    pos_prob = model.predict_proba(test)[:,1]

    return pred, games_on_day['WIN_ONE'], pos_prob

def test_model(df, model, dates) :
    """
    Run pred_by_date for every date and stack the results.

    Input: feature DataFrame, estimator, iterable of game dates
    Output: tuple (total_pred, total_act, total_pos_prob) where total_pred and
        total_pos_prob are 1-D float arrays and total_act is a one-column
        DataFrame (column label 0) holding the actual 'WIN_ONE' values, all in
        the order the dates were given.
    """
    pred_chunks, act_chunks, prob_chunks = [], [], []

    for d in dates:
        pred, act, pos_prob = pred_by_date(df, model, d)
        pred_chunks.append(pred)
        act_chunks.append(act)
        prob_chunks.append(pos_prob)

    # Concatenate once at the end instead of growing the results inside the loop.
    # The leading empty float array keeps the float dtype of the previous implementation
    # and makes the no-dates case return empty arrays.
    total_pred = np.concatenate([np.empty(0)] + pred_chunks)
    total_pos_prob = np.concatenate([np.empty(0)] + prob_chunks)
    total_act = pd.concat(act_chunks, axis=0).to_frame(name=0) if act_chunks else pd.DataFrame()

    return total_pred, total_act, total_pos_prob

def print_class_metrics(pred, act, pos_prob):
    """
    Print the classification metrics for a set of predictions.

    Input: predicted labels, actual labels, predicted probability of the positive class
    Output: None; prints accuracy, precision, recall, F1 (from the labels) and
        ROC AUC (from the probabilities), one per line.
    """
    # metrics that compare the actual labels with the hard predictions, in print order
    label_metrics = (
        ("Accuracy:", accuracy_score),
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1:", f1_score),
    )
    for heading, metric in label_metrics:
        print(heading, metric(act, pred))

    # ROC AUC is the only metric that uses the probabilities
    print("ROC AUC:", roc_auc_score(act, pos_prob))

### Random Forests (baseline)

In [11]:
# Baseline: default random forest evaluated on the validation dates with the rolling-window features.
model = RandomForestClassifier(random_state=33)
pred, act, pos_prob = test_model(df_rolling, model, val_set)
print_class_metrics(pred, act, pos_prob)

# Metrics recorded from a previous run (val_set is sampled, so a re-run may differ):
#Accuracy: 0.6720368239355581
#Precision: 0.6744457409568262
#Recall: 0.6651323360184119
#F1: 0.6697566628041715
#ROC AUC: 0.7221314395208438

Accuracy: 0.6720368239355581
Precision: 0.6744457409568262
Recall: 0.6651323360184119
F1: 0.6697566628041715
ROC AUC: 0.7221314395208438


### XGBoost

In [12]:
# Untuned XGBoost classifier evaluated on the same validation dates and features as the baseline.
model = XGBClassifier(objective='binary:logistic', base_score = 0.5, random_state = 33)
pred, act, pos_prob = test_model(df_rolling, model, val_set)
print_class_metrics(pred, act, pos_prob)

# Metrics recorded from a previous run (val_set is sampled, so a re-run may differ):
#Accuracy: 0.6547756041426928
#Precision: 0.6584216725559482
#Recall: 0.6432681242807825
#F1: 0.6507566938300349
#ROC AUC: 0.7148952342613032

Accuracy: 0.6547756041426928
Precision: 0.6584216725559482
Recall: 0.6432681242807825
F1: 0.6507566938300349
ROC AUC: 0.7148952342613032


## Feature Selection
The average feature importance score is calculated over the three sampled validation dates of each season, using XGBoost's built-in feature importance.

In [17]:
def pred_by_date_with_importance(df, model, date):
    """
    Fit the model for the given date and report its accuracy counts and feature importances.

    Input: feature DataFrame, XGBoost estimator (must expose get_booster()), game date
    Output: tuple (number of correct predictions on that date, number of games
        predicted, dict of gain-based importance scores from the fitted booster)
    """
    # accept any date-like value (Timestamp, numpy.datetime64, string)
    date = pd.Timestamp(date)

    # train on the same horizon as pred_by_date
    X, y = get_training_set(df, date, time_horizon)

    games_on_day = df[df['GAME_DATE'] == date].copy()
    # the games being predicted are "zero days ago"; keeps the columns aligned with X
    games_on_day['DAYS_SINCE_GAME'] = np.zeros(len(games_on_day))

    test = games_on_day.drop(columns = ['WIN_ONE', 'GAME_DATE'])

    model.fit(X,y)
    pred = model.predict(test)
    correct = np.sum(pred == games_on_day['WIN_ONE'])
    games = len(pred)
    importance_scores = model.get_booster().get_score(importance_type='gain')
    return correct, games, importance_scores

In [18]:
def test_model_with_importance(df, model, test) :
    """
    Outputs the average feature importance scores of game predictions

    Input: feature DataFrame, XGBoost estimator, iterable of game dates
    Output: list of (feature, average importance) tuples sorted from most to
        least important. Each average is taken over the dates for which the
        booster reported a score for that feature.
    """
    feature_scores = {}
    for t in test:
        # the accuracy counts returned alongside the scores are not needed here
        _, _, importance_scores = pred_by_date_with_importance(df, model, t)

        for feature, score in importance_scores.items():
            feature_scores.setdefault(feature, []).append(score)

    # NOTE(review): a feature missing from a fit's scores is skipped rather than counted
    # as 0 — presumably get_score omits unused features; confirm this averaging is intended.
    average_importance = {feature: sum(scores)/len(scores) for feature, scores in feature_scores.items()}
    sorted_features = sorted(average_importance.items(), key=lambda x: x[1], reverse=True)

    return sorted_features

In [19]:
# Rank all features by their average gain-based importance over the validation dates.
model = XGBClassifier(objective='binary:logistic')
importance_scores = test_model_with_importance(df_rolling, model, val_set)
print(importance_scores)

[('HOME_ONE', 30.042636596795283), ('ELO_TWO', 12.774800866541236), ('ELO_ONE', 12.72723547497181), ('WIN_LAST_TWO', 5.377202563815647), ('DAYS_SINCE_GAME', 3.8290192671496457), ('TS%_ONE', 3.81653810871972), ('WIN_PERCENTAGE_TWO', 3.7969241720257383), ('TS%_TWO', 3.7965164738472064), ('WIN_PERCENTAGE_ONE', 3.7957291892080596), ('EFG%_ONE', 3.7222740276895387), ('EFG%_TWO', 3.71307538133679), ('FG_PCT_ONE', 3.695310910542806), ('FT_PCT_ONE', 3.6802050092003564), ('FG_PCT_TWO', 3.675352485492976), ('FT_PCT_TWO', 3.6634049656415226), ('REB_TWO', 3.6245810479828804), ('STL_ONE', 3.6223480316123577), ('OREB_TWO', 3.621642545016125), ('REB_ONE', 3.618171974866077), ('FG3_PCT_TWO', 3.6128498520513976), ('STL_TWO', 3.6113566364904846), ('OREB_ONE', 3.60653886409721), ('TOV_ONE', 3.6021888785892062), ('FG3_PCT_ONE', 3.60125114701011), ('AST_ONE', 3.594588037693139), ('AST_TWO', 3.5941335287961094), ('TOV_TWO', 3.5869254331396085), ('TEAM_ID_ONE', 3.582940436372853), ('TEAM_ID_TWO', 3.548127988

Testing the model with the feature importance scores by iteratively removing the least important features and comparing the accuracy:

In [20]:
def get_training_set_with_features (df, date, num_seasons, features) :
    """
    Input: Date of games, number of seasons and feature subset to include in dataset
        (`date` may be a Timestamp, datetime, numpy.datetime64 or date string;
        `features` may contain 'DAYS_SINCE_GAME', which is computed here)
    Output: tuple (X, y) with all rows from the last num_seasons and all games in the
        current season up till the given date, most recent first. X holds only the
        requested feature columns; y is the 'WIN_ONE' column. df is not modified.
    """
    # normalise so .year/.month and date arithmetic work for any date-like input
    date = pd.Timestamp(date)

    # a season is labelled by the year it starts in; October is the cutoff
    season = date.year if date.month >= 10 else date.year - 1

    in_horizon = df['SEASON_YEAR'].between(season - num_seasons, season)
    data = df[in_horizon].copy()
    data['DAYS_SINCE_GAME'] = (date - data['GAME_DATE']).dt.days

    # keep only games played before the prediction date, newest first
    data = data[data['DAYS_SINCE_GAME'] > 0].sort_values(by = 'DAYS_SINCE_GAME')

    X = data[features]
    y = data['WIN_ONE']

    return (X,y)

def pred_by_date_with_features (df, model, date, features) :
    """
    Predict the outcome of all games on the given date using only the given features.

    Input: feature DataFrame, estimator exposing fit / predict / predict_proba,
        game date, list of feature columns (may include 'DAYS_SINCE_GAME')
    Output: tuple (predicted labels, actual 'WIN_ONE' values for that date,
        predicted probability of the positive class)
    """
    # accept any date-like value (Timestamp, numpy.datetime64, string)
    date = pd.Timestamp(date)

    # train on the same horizon as pred_by_date
    X, y = get_training_set_with_features(df, date, time_horizon, features)

    games_on_day = df[df['GAME_DATE'] == date].copy()
    # the games being predicted are "zero days ago"
    games_on_day['DAYS_SINCE_GAME'] = np.zeros(len(games_on_day))

    test = games_on_day[features]
    model.fit(X,y)
    pred = model.predict(test)

    # predicted probability of the positive class (column 1 of predict_proba)
    pos_prob = model.predict_proba(test)[:,1]
    return pred, games_on_day['WIN_ONE'], pos_prob

In [24]:
def feature_selection_with_importance(df, model, current_features, min_subset_size, top_n, test) :
    """
    Iterates through the feature importance scores and iteratively removes the least important features

    Input:
        df: feature DataFrame
        model: estimator exposing fit / predict / predict_proba
        current_features: features ordered from most to least important (not modified)
        min_subset_size: smallest number of features to evaluate
        top_n: number of best subsets to return
        test: iterable of game dates to evaluate on
    Output: up to top_n tuples (feature subset, [accuracy, ROC AUC]), best first.
        Subsets are ranked by accuracy, with ROC AUC breaking ties.
    """
    results = []
    # work on a copy so the caller's list is not shortened by the elimination loop
    remaining = list(current_features)

    while remaining and len(remaining) >= min_subset_size:
        pred_chunks, act_chunks, prob_chunks = [], [], []
        print(f"Evaluating with {len(remaining)} features...")
        for t in test:
            pred, act, pos_prob = pred_by_date_with_features(df, model, t, features = remaining)
            pred_chunks.append(pred)
            act_chunks.append(act)
            prob_chunks.append(pos_prob)

        # concatenate once per subset instead of growing the results inside the loop
        total_pred = np.concatenate(pred_chunks)
        total_act = pd.concat(act_chunks, axis=0)
        total_pos_prob = np.concatenate(prob_chunks)

        acc = accuracy_score(total_act, total_pred)
        roc_auc = roc_auc_score(total_act, total_pos_prob)
        metrics = [acc, roc_auc]

        print(remaining, ': Accuracy = ', acc, ', ROC AUC = ', roc_auc)
        results.append((remaining.copy(), metrics))

        # drop the least important remaining feature (last in the ordering)
        remaining.pop(-1)

    # metrics lists compare element-wise: accuracy first, then ROC AUC
    results.sort(key=lambda x: x[1], reverse=True)
    return results[:top_n]

In [37]:
# Backward elimination: start from all features ordered by importance and evaluate
# every prefix down to 20 features, keeping the 10 best-scoring subsets.
model = XGBClassifier(objective='binary:logistic', random_state = 33)
sorted_features = [f[0] for f in importance_scores]
print(sorted_features)
# NOTE(review): feature_selection_with_importance pops from the list it is given,
# so sorted_features is shorter after this call.
top_subsets = feature_selection_with_importance(df_rolling, model, sorted_features, min_subset_size=20, top_n=10, test=val_set)

for i, (subset, metrics) in enumerate(top_subsets, 1):
    print(f"#{i}: Features = {subset}, Accuracy = {metrics[0]:.4f}, ROC AUC = {metrics[1]:.4f}")

['HOME_ONE', 'ELO_TWO', 'ELO_ONE', 'WIN_LAST_TWO', 'DAYS_SINCE_GAME', 'TS%_ONE', 'WIN_PERCENTAGE_TWO', 'TS%_TWO', 'WIN_PERCENTAGE_ONE', 'EFG%_ONE', 'EFG%_TWO', 'FG_PCT_ONE', 'FT_PCT_ONE', 'FG_PCT_TWO', 'FT_PCT_TWO', 'REB_TWO', 'STL_ONE', 'OREB_TWO', 'REB_ONE', 'FG3_PCT_TWO', 'STL_TWO', 'OREB_ONE', 'TOV_ONE', 'FG3_PCT_ONE', 'AST_ONE', 'AST_TWO', 'TOV_TWO', 'TEAM_ID_ONE', 'TEAM_ID_TWO', 'DREB_TWO', 'BLK_TWO', 'BLK_ONE', 'PF_TWO', 'WIN_LAST_ONE', 'PF_ONE', 'DREB_ONE', 'WIN_STREAK_TWO', 'WIN_STREAK_ONE', 'SEASON_YEAR']
Evaluating with 39 features...
['HOME_ONE', 'ELO_TWO', 'ELO_ONE', 'WIN_LAST_TWO', 'DAYS_SINCE_GAME', 'TS%_ONE', 'WIN_PERCENTAGE_TWO', 'TS%_TWO', 'WIN_PERCENTAGE_ONE', 'EFG%_ONE', 'EFG%_TWO', 'FG_PCT_ONE', 'FT_PCT_ONE', 'FG_PCT_TWO', 'FT_PCT_TWO', 'REB_TWO', 'STL_ONE', 'OREB_TWO', 'REB_ONE', 'FG3_PCT_TWO', 'STL_TWO', 'OREB_ONE', 'TOV_ONE', 'FG3_PCT_ONE', 'AST_ONE', 'AST_TWO', 'TOV_TWO', 'TEAM_ID_ONE', 'TEAM_ID_TWO', 'DREB_TWO', 'BLK_TWO', 'BLK_ONE', 'PF_TWO', 'WIN_LAST_ONE', 

In [38]:
# best performing feature subset
best_feature_subset = top_subsets[0][0]
print('Best feature subset: ', best_feature_subset)

total_pred = total_pos_prob = []
total_act = pd.DataFrame()
for t in val_set:
    pred, act, pos_prob = pred_by_date_with_features(df_rolling, model, t, features = best_feature_subset)
        
    total_pred = np.concatenate((total_pred, pred))
    total_act = pd.concat([total_act, act], axis=0)
    total_pos_prob = np.concatenate((total_pos_prob, pos_prob))

print_class_metrics(total_pred, total_act, total_pos_prob)

#Accuracy: 0.6616800920598389
#Precision: 0.6624277456647398
#Recall: 0.6593785960874569
#F1: 0.6608996539792388
#ROC AUC: 0.7201656335536395

Best feature subset:  ['HOME_ONE', 'ELO_TWO', 'ELO_ONE', 'WIN_LAST_TWO', 'DAYS_SINCE_GAME', 'TS%_ONE', 'WIN_PERCENTAGE_TWO', 'TS%_TWO', 'WIN_PERCENTAGE_ONE', 'EFG%_ONE', 'EFG%_TWO', 'FG_PCT_ONE', 'FT_PCT_ONE', 'FG_PCT_TWO', 'FT_PCT_TWO', 'REB_TWO', 'STL_ONE', 'OREB_TWO', 'REB_ONE', 'FG3_PCT_TWO', 'STL_TWO', 'OREB_ONE', 'TOV_ONE', 'FG3_PCT_ONE', 'AST_ONE', 'AST_TWO', 'TOV_TWO', 'TEAM_ID_ONE', 'TEAM_ID_TWO', 'DREB_TWO', 'BLK_TWO', 'BLK_ONE']
Accuracy: 0.6616800920598389
Precision: 0.6624277456647398
Recall: 0.6593785960874569
F1: 0.6608996539792388
ROC AUC: 0.7201656335536395


In [None]:
# uncomment to avoid rerunning
#best_feature_subset = ['HOME_ONE', 'ELO_TWO', 'ELO_ONE', 'WIN_LAST_TWO', 'DAYS_SINCE_GAME', 'TS%_ONE', 'WIN_PERCENTAGE_TWO', 'TS%_TWO', 'WIN_PERCENTAGE_ONE', 'EFG%_ONE', 
#                       'EFG%_TWO', 'FG_PCT_ONE', 'FT_PCT_ONE', 'FG_PCT_TWO', 'FT_PCT_TWO', 'REB_TWO', 'STL_ONE', 'OREB_TWO', 'REB_ONE', 'FG3_PCT_TWO', 'STL_TWO', 'OREB_ONE', 
#                       'TOV_ONE', 'FG3_PCT_ONE', 'AST_ONE', 'AST_TWO', 'TOV_TWO', 'TEAM_ID_ONE', 'TEAM_ID_TWO', 'DREB_TWO', 'BLK_TWO', 'BLK_ONE']

In [13]:
base_features = ['GAME_DATE', 'WIN_ONE', 'SEASON_YEAR']

# We drop DAYS_SINCE_GAME because it will be calculated when we get the training set.
# Columns already listed in base_features are skipped so no column is selected twice.
# A new list is built (instead of list.remove) so this cell can be re-run safely and
# the subset stored in top_subsets is left untouched.
best_feature_subset = [f for f in best_feature_subset
                       if f != 'DAYS_SINCE_GAME' and f not in base_features]

df_model_subset = df_model[base_features + best_feature_subset]
df_rolling_subset = df_rolling[base_features + best_feature_subset]

## Hyperparameter Tuning

In [65]:
def pred_by_date_multiple_models (df, models_dict, date) :
    """
    Predict the outcome of all games on the given date for all models given. Used specifically to make
    cross validation more efficient: the training and test frames are built once
    and shared by every model.

    Input: feature DataFrame, dict of estimators (each exposing fit / predict /
        predict_proba), game date
    Output: tuple (preds, actual, pos_probs) where preds[i] and pos_probs[i] are
        the predicted labels / positive-class probabilities of the i-th model in
        the dict's iteration order, and actual is the list of 'WIN_ONE' values.
    """
    # accept any date-like value (Timestamp, numpy.datetime64, string)
    date = pd.Timestamp(date)

    # get data in relevant time frame (same horizon as pred_by_date)
    X, y = get_training_set(df, date, time_horizon)

    games_on_day = df[df['GAME_DATE'] == date].copy()
    # the games being predicted are "zero days ago"; keeps the columns aligned with X
    games_on_day['DAYS_SINCE_GAME'] = np.zeros(len(games_on_day))

    test = games_on_day.drop(columns = ['WIN_ONE', 'GAME_DATE'])

    preds = []
    pos_probs = []

    # results are stored by position, so the dict keys do not have to be 0..N-1
    for estimator in models_dict.values() :
        estimator.fit(X,y)
        preds.append(list(estimator.predict(test)))
        pos_probs.append(list(estimator.predict_proba(test)[:,1]))
    return preds, list(games_on_day['WIN_ONE']), pos_probs

In [96]:
# XGBoost parameters
param_grid = {
    "n_estimators": [50, 100, 200],
    "eta": [0.01, 0.05, 0.1, 0.2], # learning_rate
    "max_depth": [4, 6, 8, 10], # maximum depth of a tree
    "subsample": [0.5, 0.7, 1], # fraction of observation to be randomly sampled for each tree
    "colsample_bytree": [0.5, 0.7, 1], # fraction of columns to be random samples for each tree
    "alpha": [0.5, 1, 2, 5] # lasso regression
}

param_dict = {} # store params with key corresponding to index of score in np.array
index = 0

# Iterate over all combinations of hyperparameters
for values in itertools.product(*param_grid.values()):
    param_dict[index] = XGBClassifier(objective='binary:logistic', random_state = 33, **dict(zip(param_grid.keys(), values)))
    index += 1

total_pred = total_pos_prob = total_act = None

day = 1
total_days = len(param_test_set) 
param_test_set = [d for d in val_set if d.year >= 2014]
for t in param_test_set:
    print(f"Predicting day {day} / {total_days}")
    preds, act, pos_probs = pred_by_date_multiple_models(df_rolling_subset, param_dict, t)
    if total_pred is None :
        total_pred = preds
        total_pos_prob = pos_probs
        total_act = act
    else :
        total_pred = [total_pred[i] + preds[i] for i in range(len(preds))]
        total_pos_prob = [total_pos_prob[i] + pos_probs[i] for i in range(len(pos_probs))]
        total_act += act
    day += 1

scores = pd.DataFrame(columns = ['MODEL', 'ACCURACY', 'PRECISION', 'RECALL', 'F1', 'ROC-AUC'])
for i in range(len(total_pred)) :
    accuracy = accuracy_score(total_act, total_pred[i])
    precision = precision_score( total_act, total_pred[i])
    recall = recall_score(total_act, total_pred[i])
    f1 = f1_score(total_act, total_pred[i])
    roc_auc = roc_auc_score(total_act, total_pos_prob[i])
    scores.loc[i] = [param_dict[0], accuracy, precision, recall, f1, roc_auc]

Predicting day 1 / 17
Predicting day 2 / 17
Predicting day 3 / 17
Predicting day 4 / 17
Predicting day 5 / 17
Predicting day 6 / 17
Predicting day 7 / 17
Predicting day 8 / 17
Predicting day 9 / 17
Predicting day 10 / 17
Predicting day 11 / 17
Predicting day 12 / 17
Predicting day 13 / 17
Predicting day 14 / 17
Predicting day 15 / 17
Predicting day 16 / 17
Predicting day 17 / 17


In [98]:
# Rank the candidate models by ROC-AUC; the index labels still identify the model number.
scores = scores.sort_values(by = 'ROC-AUC', ascending = False)
scores.head(10)

Unnamed: 0,MODEL,ACCURACY,PRECISION,RECALL,F1,ROC-AUC
32,"XGBClassifier(alpha=0.5, base_score=None, boos...",0.718543,0.72,0.715232,0.717608,0.793934
33,"XGBClassifier(alpha=0.5, base_score=None, boos...",0.718543,0.72,0.715232,0.717608,0.793737
34,"XGBClassifier(alpha=0.5, base_score=None, boos...",0.708609,0.708609,0.708609,0.708609,0.793693
608,"XGBClassifier(alpha=0.5, base_score=None, boos...",0.725166,0.726667,0.721854,0.724252,0.793693
610,"XGBClassifier(alpha=0.5, base_score=None, boos...",0.725166,0.723684,0.728477,0.726073,0.792992
609,"XGBClassifier(alpha=0.5, base_score=None, boos...",0.725166,0.726667,0.721854,0.724252,0.792926
35,"XGBClassifier(alpha=0.5, base_score=None, boos...",0.705298,0.706667,0.701987,0.704319,0.792071
20,"XGBClassifier(alpha=0.5, base_score=None, boos...",0.721854,0.721854,0.721854,0.721854,0.792049
21,"XGBClassifier(alpha=0.5, base_score=None, boos...",0.718543,0.72,0.715232,0.717608,0.791917
22,"XGBClassifier(alpha=0.5, base_score=None, boos...",0.711921,0.713333,0.708609,0.710963,0.791654


Looking at these scores, we see that the model with the highest ROC-AUC score sacrifices some points in accuracy and F1-score. Row 608, however, has a very comparable ROC-AUC score while being strong in all five metrics. For this reason, we ultimately choose these parameters.

In [101]:
# Select the model by its index label (the row number shown in the table above).
# `scores` has been sorted by ROC-AUC, so positional access with .iloc would pick
# whichever model happens to sit at that rank instead of the chosen row.
best_model = scores.loc[608, 'MODEL']
best_params = best_model.get_params()
best_params # displayed so the chosen hyperparameters are recorded with the notebook

{'objective': 'binary:logistic',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': 0.5,
 'device': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'feature_weights': None,
 'gamma': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': 4,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_estimators': 50,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': 33,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': 0.5,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None,
 'eta': 0.01,
 'alpha': 0.5}

In [None]:
# uncomment so you don't have to rerun
#best_params = {'objective': 'binary:logistic', 'colsample_bytree': 0.5, 'max_depth': 4, 'n_estimators': 50, 'subsample': 0.5, 'eta': 0.01, 'alpha': 0.5, 'random_state': 33}

In [None]:
# Re-evaluate the tuned model on the full validation set with the selected feature subset.
model = XGBClassifier(**best_params)
pred, act, pos_prob = test_model(df_rolling_subset, model, val_set)
print_class_metrics(pred, act, pos_prob)

# Metrics recorded from a previous run (val_set is sampled, so a re-run may differ):
#Accuracy: 0.6846950517836594
#Precision: 0.6825938566552902
#Recall: 0.6904487917146145
#F1: 0.6864988558352403
#ROC AUC: 0.7442015676127343

Accuracy: 0.6846950517836594
Precision: 0.6825938566552902
Recall: 0.6904487917146145
F1: 0.6864988558352403
ROC AUC: 0.7442015676127343


## Test Models
We want to test the model trained on rolling averages and the predicted statistics from the second model. We will predict every game in the last 4 seasons. This means we need to predict all the statistics for the games in the last 9 seasons using the second model. We have these predictions stored in the csv files imported at the beginning.

In [9]:
# get all dates in the test set
time_horizon = 5 # can change here
first_test_season = df_rolling['SEASON_YEAR'].max() - 5
test_set = df_rolling[df_rolling['SEASON_YEAR'] >= first_test_season]['GAME_DATE'].sort_values().unique()

In [10]:
# The two estimators compared on the test set: the random-forest baseline and the tuned XGBoost model.
rf = RandomForestClassifier(random_state = 33)
final_model = XGBClassifier(**best_params)

### Rolling Window 

In [None]:
# Random-forest baseline on the rolling-window features, refit for every test date.
pred, act, pos_prob = test_model(df_rolling_subset, rf, test_set)
print_class_metrics(pred, act, pos_prob)

# Metrics recorded from a previous run:
#Accuracy: 0.6314856818826198
#Precision: 0.6353816961027587
#Recall: 0.6170966827332011
#F1: 0.6261057173678533
#ROC AUC: 0.6804085724053957

Accuracy: 0.6314856818826198
Precision: 0.6353816961027587
Recall: 0.6170966827332011
F1: 0.6261057173678533
ROC AUC: 0.6804085724053957


In [14]:
# Tuned XGBoost model on the rolling-window features, refit for every test date.
pred, act, pos_prob = test_model(df_rolling_subset, final_model, test_set)
print_class_metrics(pred, act, pos_prob)

# Metrics recorded from a previous run:
#Accuracy: 0.6424014743407995
#Precision: 0.6420591146938198
#Recall: 0.6436064644173518
#F1: 0.6428318584070797
#ROC AUC: 0.6956831816834594

Accuracy: 0.6424014743407995
Precision: 0.6420591146938198
Recall: 0.6436064644173518
F1: 0.6428318584070797
ROC AUC: 0.6956831816834594


### ML Model Predictions

In [None]:
# Random-forest baseline on the model-predicted statistics, refit for every test date.
pred, act, pos_prob = test_model(df_model_subset, rf, test_set)
print_class_metrics(pred, act, pos_prob)

# Metrics recorded from a previous run:
#Accuracy: 0.6270910121916643
#Precision: 0.6303241750254397
#Recall: 0.6146867025800964
#F1: 0.6224072346228379
#ROC AUC: 0.675179901427113

Accuracy: 0.6270910121916643
Precision: 0.6303241750254397
Recall: 0.6146867025800964
F1: 0.6224072346228379
ROC AUC: 0.675179901427113


In [15]:
# Tuned XGBoost model on the model-predicted statistics, refit for every test date.
pred, act, pos_prob = test_model(df_model_subset, final_model, test_set)
print_class_metrics(pred, act, pos_prob)

# Metrics recorded from a previous run:
#Accuracy: 0.6376523958037993
#Precision: 0.6389127324749643
#Recall: 0.6331159625744258
#F1: 0.6360011392765594
#ROC AUC: 0.6919508053591843

Accuracy: 0.6376523958037993
Precision: 0.6389127324749643
Recall: 0.6331159625744258
F1: 0.6360011392765594
ROC AUC: 0.6919508053591843


## Playoff Prediction
First Round:

Eastern
1. Cleveland (C) vs. Miami (H): 4/20 C, 4/23 C, 4/26 H, 4/28 H, 4/30 C, 5/2 H, 5/4 C
2. Boston (C) vs. Orlando (M): 4/20 C, 4/23 C, 4/25 M, 4/27 M, 4/29 C, 5/1 M, 5/3 C
3. New York (K) vs. Detroit (P): 4/19 K, 4/21 K, 4/24 P, 4/27 P, 4/29 K, 5/1 P, 5/3 K
4. Indiana (P) vs. Milwaukee (B): 4/19 P, 4/22 P, 4/25 B, 4/27 B, 4/29 P, 5/2 B, 5/4 P

Western
1. Oklahoma City (T) vs. Memphis (G): 4/20 T, 4/22 T, 4/24 G, 4/26 G, 4/28 T, 5/1 G, 5/3 T
2. Houston (R) vs Golden State (W): 4/20 R, 4/23 R, 4/26 W, 4/28 W, 4/30 R, 5/2 W, 5/4 R
3. LA Lakers (L) vs. Minnesota (T): 4/19 L, 4/22 L, 4/25 T, 4/27 T, 4/30 L, 5/2 T, 5/4 L
4. Denver (N) vs. LA Clippers (C): 4/19 N, 4/21 N, 4/24 C, 4/26 C, 4/29 N, 5/1 C, 5/3 N

Semifinals begin May 5-6

Conference Finals begin May 20-21

Western: 5/20, 5/22, 5/24, 5/26, 5/28, 5/30, 6/1

Eastern: 5/21, 5/23, 5/25, 5/27, 5/29, 5/31, 6/2

Finals begin June 5: 6/5, 6/8, 6/11, 6/13, 6/16, 6/19, 6/22


In [172]:
# Reload the model-predicted features from disk so the playoff rounds below start
# from a frame without previously appended playoff rows.
df_model = pd.read_csv('df_model_tuned.csv')
df_model['GAME_DATE'] = pd.to_datetime(df_model['GAME_DATE'])

In [119]:
# Columns passed to pred_by_date by predict_outcomes: bookkeeping columns plus the selected features.
features = base_features + best_feature_subset

In [122]:
# Fresh, unfitted copy of the tuned model; predict_outcomes refits it for every playoff date.
final_model = XGBClassifier(**best_params)

In [150]:
def predict_outcomes(file_path, df, model=None, feature_cols=None, predict_fn=None):
    """Predict the games listed in a schedule CSV and append them to ``df``.

    The schedule is processed one GAME_DATE at a time. For each date, that
    date's rows are appended to ``df``, the feature columns are passed to
    ``predict_fn`` together with the model and the date, and the returned
    predictions are stored as the OUTCOME of those games (1 means
    TEAM_ID_ONE is the predicted winner, anything else means TEAM_ID_TWO).

    Parameters
    ----------
    file_path : str or file-like
        CSV with at least GAME_DATE, TEAM_ID_ONE, TEAM_ID_TWO and the
        feature columns; passed straight to ``pd.read_csv``.
    df : pandas.DataFrame
        Games seen so far. It is not modified.
    model : optional
        Model handed to ``predict_fn``. Defaults to the notebook-level
        ``final_model``.
    feature_cols : list, optional
        Columns selected before calling ``predict_fn``. Defaults to the
        notebook-level ``features``.
    predict_fn : callable, optional
        Called as ``predict_fn(frame, model, date)`` and expected to return
        ``(pred, act, pos_prob)`` with one prediction per row of ``frame``
        dated ``date``, in row order. Defaults to the notebook-level
        ``pred_by_date``.

    Returns
    -------
    pandas.DataFrame
        ``df`` followed by the schedule rows, with WIN_ONE on the schedule
        rows set to 1 where TEAM_ID_ONE is the predicted winner of that
        specific game and 0 otherwise.
    """
    # Fall back to the notebook-level objects so existing two-argument calls behave as before.
    model = final_model if model is None else model
    feature_cols = features if feature_cols is None else feature_cols
    predict_fn = pred_by_date if predict_fn is None else predict_fn

    game = pd.read_csv(file_path)
    game['GAME_DATE'] = pd.to_datetime(game['GAME_DATE'])

    # Nothing scheduled: hand back the history unchanged instead of failing further down.
    if game.empty:
        return pd.concat([df, game], ignore_index=True)

    per_date_outcomes = []
    win_one = np.zeros(len(game), dtype=int)
    for t in game['GAME_DATE'].unique():
        is_date = (game['GAME_DATE'] == t).to_numpy()
        df_upd = pd.concat([df, game[is_date]], ignore_index=True)
        pred, act, pos_prob = predict_fn(df_upd[feature_cols], model, t)

        # .copy() so adding OUTCOME never writes into a view of df_upd.
        game_outcomes = df_upd.loc[df_upd['GAME_DATE'] == t, ['TEAM_ID_ONE', 'TEAM_ID_TWO']].copy()
        game_outcomes['OUTCOME'] = pred
        per_date_outcomes.append(game_outcomes)

        # The schedule rows were appended after df, so they are the trailing rows for this
        # date. Label each game from its own prediction: looking a team up in the overall
        # winners list mislabels teams that appear in more than one game of the file.
        n_new = int(is_date.sum())
        new_outcomes = game_outcomes['OUTCOME'].to_numpy()[len(game_outcomes) - n_new:]
        win_one[is_date] = (new_outcomes == 1).astype(int)

    # A single date keeps its df_upd index; several dates are renumbered from 0.
    if len(per_date_outcomes) == 1:
        outcomes = per_date_outcomes[0]
    else:
        outcomes = pd.concat(per_date_outcomes, ignore_index=True)

    winners = np.where(outcomes['OUTCOME'] == 1, outcomes['TEAM_ID_ONE'], outcomes['TEAM_ID_TWO'])
    game.loc[:, 'WIN_ONE'] = win_one
    print("Outcomes: ", outcomes)
    print("Winners:", winners)

    return pd.concat([df, game], ignore_index=True)

### Round 1 Conference Quarterfinals
#### Game 1

In [173]:
# Round 1, Game 1: rebind df_model to the frame returned by predict_outcomes (df_model plus
# the rows from this CSV). Not idempotent - re-running the cell appends those rows again.
df_model = predict_outcomes('playoffs_round_one_one.csv', df_model)

Outcomes:     TEAM_ID_ONE  TEAM_ID_TWO  OUTCOME
0            2           11        1
1            1           16        1
2           23           26        1
3            8            7        1
4           15           28        1
5           17           12        0
6           10           13        0
7            6            9        1
Winners: [ 2  1 23  8 15 12 13  6]


#### Game 2

In [174]:
# Round 1, Game 2: rebind df_model to the frame returned by predict_outcomes (df_model plus
# the rows from this CSV). Not idempotent - re-running the cell appends those rows again.
df_model = predict_outcomes('playoffs_round_one_two.csv', df_model)

Outcomes:     TEAM_ID_ONE  TEAM_ID_TWO  OUTCOME
0            2           11        0
1            1           16        1
2            8            7        1
3           15           28        0
4            6            9        1
5           17           12        0
6           23           26        1
7           10           13        0
Winners: [11  1  8 28  6 12 23 13]


#### Game 3

In [175]:
# Round 1, Game 3: rebind df_model to the frame returned by predict_outcomes (df_model plus
# the rows from this CSV). Not idempotent - re-running the cell appends those rows again.
df_model = predict_outcomes('playoffs_round_one_three.csv', df_model)

Outcomes:     TEAM_ID_ONE  TEAM_ID_TWO  OUTCOME
0            2           11        0
1            8            7        0
2            1           16        0
3           17           12        0
4           10           13        0
5           15           28        0
6           23           26        1
7            6            9        0
Winners: [11  7 16 12 13 28 23  9]


#### Game 4

In [176]:
# Round 1, Game 4: rebind df_model to the frame returned by predict_outcomes (df_model plus
# the rows from this CSV). Not idempotent - re-running the cell appends those rows again.
df_model = predict_outcomes('playoffs_round_one_four.csv', df_model)

Outcomes:     TEAM_ID_ONE  TEAM_ID_TWO  OUTCOME
0            2           11        0
1            8            7        0
2            1           16        0
3           15           28        0
4           17           12        0
5           10           13        0
6            2           11        0
7            8            7        0
8           23           26        1
9            6            9        0
Winners: [11  7 16 28 12 13 11  7 23  9]


#### Game 5

In [177]:
# Round 1, Game 5: rebind df_model to the frame returned by predict_outcomes (df_model plus
# the rows from this CSV). Not idempotent - re-running the cell appends those rows again.
df_model = predict_outcomes('playoffs_round_one_five.csv', df_model)

Outcomes:     TEAM_ID_ONE  TEAM_ID_TWO  OUTCOME
0            2           11        0
1            8            7        0
2            1           16        0
3           15           28        1
4            6            9        1
Winners: [11  7 16 15  6]


#### Game 6

In [178]:
# Round 1, Game 6: rebind df_model to the frame returned by predict_outcomes (df_model plus
# the rows from this CSV). Not idempotent - re-running the cell appends those rows again.
df_model = predict_outcomes('playoffs_round_one_six.csv', df_model)

Outcomes:     TEAM_ID_ONE  TEAM_ID_TWO  OUTCOME
0            1           16        0
1            6            9        0
2            8            7        0
Winners: [16  9  7]


#### Game 7 (no games)

### Round 2 : Conference Semifinals
#### Game 1

In [180]:
# Round 2, Game 1: rebind df_model to the frame returned by predict_outcomes (df_model plus
# the rows from this CSV). Not idempotent - re-running the cell appends those rows again.
df_model = predict_outcomes('playoffs_round_two_one.csv', df_model)

Outcomes:         TEAM_ID_ONE  TEAM_ID_TWO  OUTCOME
26448           23            9        1
26449           13            7        1
26450           12           11        0
26451           28           16        1
Winners: [23 13 11 28]


#### Game 2

In [181]:
# Round 2, Game 2: rebind df_model to the frame returned by predict_outcomes (df_model plus
# the rows from this CSV). Not idempotent - re-running the cell appends those rows again.
df_model = predict_outcomes('playoffs_round_two_two.csv', df_model)

Outcomes:         TEAM_ID_ONE  TEAM_ID_TWO  OUTCOME
26452           23            9        1
26453           13            7        1
26454           12           11        1
26455           28           16        1
Winners: [23 13 12 28]


#### Game 3

In [182]:
# Round 2, Game 3: rebind df_model to the frame returned by predict_outcomes (df_model plus
# the rows from this CSV). Not idempotent - re-running the cell appends those rows again.
df_model = predict_outcomes('playoffs_round_two_three.csv', df_model)

Outcomes:         TEAM_ID_ONE  TEAM_ID_TWO  OUTCOME
26456           23            9        0
26457           13            7        0
26458           12           11        0
26459           28           16        0
Winners: [ 9  7 11 16]


#### Game 4

In [183]:
# Round 2, Game 4: rebind df_model to the frame returned by predict_outcomes (df_model plus
# the rows from this CSV). Not idempotent - re-running the cell appends those rows again.
df_model = predict_outcomes('playoffs_round_two_four.csv', df_model)

Outcomes:         TEAM_ID_ONE  TEAM_ID_TWO  OUTCOME
26460           23            9        0
26461           13            7        1
26462           12           11        0
26463           28           16        0
Winners: [ 9 13 11 16]


#### Game 5

In [184]:
# Round 2, Game 5: rebind df_model to the frame returned by predict_outcomes (df_model plus
# the rows from this CSV). Not idempotent - re-running the cell appends those rows again.
df_model = predict_outcomes('playoffs_round_two_five.csv', df_model)

Outcomes:         TEAM_ID_ONE  TEAM_ID_TWO  OUTCOME
26464           23            9        1
26465           13            7        1
26466           12           11        0
26467           28           16        1
Winners: [23 13 11 28]


#### Game 6

In [185]:
# Round 2, Game 6: rebind df_model to the frame returned by predict_outcomes (df_model plus
# the rows from this CSV). Not idempotent - re-running the cell appends those rows again.
df_model = predict_outcomes('playoffs_round_two_six.csv', df_model)

Outcomes:         TEAM_ID_ONE  TEAM_ID_TWO  OUTCOME
26468           23            9        0
26469           28           16        0
Winners: [ 9 16]


#### Game 7

In [186]:
# Round 2, Game 7: rebind df_model to the frame returned by predict_outcomes (df_model plus
# the rows from this CSV). Not idempotent - re-running the cell appends those rows again.
df_model = predict_outcomes('playoffs_round_two_seven.csv', df_model)

Outcomes:         TEAM_ID_ONE  TEAM_ID_TWO  OUTCOME
26470           23            9        1
26471           28           16        0
Winners: [23 16]


### Round 3: Conference Finals
#### Game 1

In [187]:
# Round 3, Game 1: rebind df_model to the frame returned by predict_outcomes (df_model plus
# the rows from this CSV). Not idempotent - re-running the cell appends those rows again.
df_model = predict_outcomes('playoffs_round_three_one.csv', df_model)

Outcomes:     TEAM_ID_ONE  TEAM_ID_TWO  OUTCOME
0           23           13        0
1           11           16        1
Winners: [13 11]


#### Game 2

In [188]:
# Round 3, Game 2: rebind df_model to the frame returned by predict_outcomes (df_model plus
# the rows from this CSV). Not idempotent - re-running the cell appends those rows again.
df_model = predict_outcomes('playoffs_round_three_two.csv', df_model)

Outcomes:     TEAM_ID_ONE  TEAM_ID_TWO  OUTCOME
0           23           13        0
1           11           16        1
Winners: [13 11]


#### Game 3

In [189]:
# Round 3, Game 3: rebind df_model to the frame returned by predict_outcomes (df_model plus
# the rows from this CSV). Not idempotent - re-running the cell appends those rows again.
df_model = predict_outcomes('playoffs_round_three_three.csv', df_model)

Outcomes:     TEAM_ID_ONE  TEAM_ID_TWO  OUTCOME
0           23           13        0
1           11           16        0
Winners: [13 16]


#### Game 4

In [190]:
# Round 3, Game 4: rebind df_model to the frame returned by predict_outcomes (df_model plus
# the rows from this CSV). Not idempotent - re-running the cell appends those rows again.
df_model = predict_outcomes('playoffs_round_three_four.csv', df_model)

Outcomes:     TEAM_ID_ONE  TEAM_ID_TWO  OUTCOME
0           23           13        0
1           11           16        0
Winners: [13 16]


#### Game 5

In [191]:
# Round 3, Game 5: rebind df_model to the frame returned by predict_outcomes (df_model plus
# the rows from this CSV). Not idempotent - re-running the cell appends those rows again.
df_model = predict_outcomes('playoffs_round_three_five.csv', df_model)

Outcomes:         TEAM_ID_ONE  TEAM_ID_TWO  OUTCOME
26480           11           16        1
Winners: [11]


#### Game 6

In [192]:
# Round 3, Game 6: rebind df_model to the frame returned by predict_outcomes (df_model plus
# the rows from this CSV). Not idempotent - re-running the cell appends those rows again.
df_model = predict_outcomes('playoffs_round_three_six.csv', df_model)

Outcomes:         TEAM_ID_ONE  TEAM_ID_TWO  OUTCOME
26481           11           16        1
Winners: [11]


#### Game 7 (no games)

### Round 4 : Finals

#### Game 1

In [193]:
# Finals, Game 1: rebind df_model to the frame returned by predict_outcomes (df_model plus
# the rows from this CSV). Not idempotent - re-running the cell appends those rows again.
df_model = predict_outcomes('playoffs_round_four_one.csv', df_model)

Outcomes:         TEAM_ID_ONE  TEAM_ID_TWO  OUTCOME
26482           13           11        1
Winners: [13]


#### Game 2

In [194]:
# Finals, Game 2: rebind df_model to the frame returned by predict_outcomes (df_model plus
# the rows from this CSV). Not idempotent - re-running the cell appends those rows again.
df_model = predict_outcomes('playoffs_round_four_two.csv', df_model)

Outcomes:         TEAM_ID_ONE  TEAM_ID_TWO  OUTCOME
26483           13           11        1
Winners: [13]


#### Game 3

In [195]:
# Finals, Game 3: rebind df_model to the frame returned by predict_outcomes (df_model plus
# the rows from this CSV). Not idempotent - re-running the cell appends those rows again.
df_model = predict_outcomes('playoffs_round_four_three.csv', df_model)

Outcomes:         TEAM_ID_ONE  TEAM_ID_TWO  OUTCOME
26484           13           11        0
Winners: [11]


#### Game 4

In [196]:
# Finals, Game 4: rebind df_model to the frame returned by predict_outcomes (df_model plus
# the rows from this CSV). Not idempotent - re-running the cell appends those rows again.
df_model = predict_outcomes('playoffs_round_four_four.csv', df_model)

Outcomes:         TEAM_ID_ONE  TEAM_ID_TWO  OUTCOME
26485           13           11        0
Winners: [11]


#### Game 5

In [197]:
# Finals, Game 5: rebind df_model to the frame returned by predict_outcomes (df_model plus
# the rows from this CSV). Not idempotent - re-running the cell appends those rows again.
df_model = predict_outcomes('playoffs_round_four_five.csv', df_model)

Outcomes:         TEAM_ID_ONE  TEAM_ID_TWO  OUTCOME
26486           13           11        1
Winners: [13]


#### Game 6

In [198]:
# Finals, Game 6: rebind df_model to the frame returned by predict_outcomes (df_model plus
# the rows from this CSV). Not idempotent - re-running the cell appends those rows again.
df_model = predict_outcomes('playoffs_round_four_six.csv', df_model)

Outcomes:         TEAM_ID_ONE  TEAM_ID_TWO  OUTCOME
26487           13           11        0
Winners: [11]


#### Game 7

In [199]:
# Finals, Game 7: rebind df_model to the frame returned by predict_outcomes (df_model plus
# the rows from this CSV). Not idempotent - re-running the cell appends those rows again.
df_model = predict_outcomes('playoffs_round_four_seven.csv', df_model)

Outcomes:         TEAM_ID_ONE  TEAM_ID_TWO  OUTCOME
26488           13           11        1
Winners: [13]
