# Modeling

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix

import matplotlib.pyplot as plt

In [3]:
def npv_score(y_test, y_pred):
    true_negatives = np.unique(np.array(list(zip(y_test.values, y_pred))).sum(axis=1), return_counts=True)[1][0]
    predicted_negatives = np.unique(y_pred, return_counts=True)[1][0]
    npv = true_negatives / predicted_negatives
    return npv

In [4]:
df = pd.read_csv("/Users/tativalentine/Documents/GitHub/games_with_features.csv", index_col="id")

In [5]:
df.head()

Unnamed: 0_level_0,date,home_team_score,period,postseason,season,status,visitor_team_score,home_team.id,home_team.abbreviation,home_team.conference,...,visitor_team.full_name,winner,home_team_avg_score_historical,visitor_team_avg_score_historical,home_team_id_year,visitor_team_id_year,home_team_avg_score,visitor_team_avg_score,home_avg_score_diff,visitor_avg_score_diff
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
47179,2019-01-30,126,4,False,2018,Final,94,2,BOS,East,...,Charlotte Hornets,1,105.5,98.0,2 2018,4 2018,113.3,108.4,4.026829,-3.791892
48751,2019-02-09,112,4,False,2018,Final,123,2,BOS,East,...,LA Clippers,0,105.5,100.3,2 2018,13 2018,113.3,112.7,4.026829,0.223457
48739,2019-02-08,117,4,False,2018,Final,110,23,PHI,East,...,Denver Nuggets,1,102.9,104.0,23 2018,8 2018,117.9,107.7,8.839759,-4.969412
48740,2019-02-08,119,4,False,2018,Final,106,30,WAS,East,...,Cleveland Cavaliers,1,103.1,98.2,30 2018,6 2018,116.4,103.6,7.187013,-8.797368
48746,2019-02-08,102,4,False,2018,Final,96,26,SAC,West,...,Miami Heat,1,105.3,95.6,26 2018,16 2018,114.8,105.3,4.853247,-6.616216


# Ultra Baseline

In [6]:
# predict winner only using home_team_avg_score when playing at home
# vs visitor_team_avg_score when playing as visitor
home_should_win = df[df["home_team_avg_score"].gt(df["visitor_team_avg_score"])]
home_should_win["winner"].value_counts(normalize=True)

1    0.665687
0    0.334313
Name: winner, dtype: float64

In [7]:
s2018 = df[df["season"].eq(2018)]
# predict winner only using home_team_avg_score when playing at home
# vs visitor_team_avg_score when playing as visitor
home_should_win = s2018[s2018["home_team_avg_score"].gt(s2018["visitor_team_avg_score"])]
home_should_win["winner"].value_counts(normalize=True)

1    0.678982
0    0.321018
Name: winner, dtype: float64

In [8]:
s2020 = df[df["season"].eq(2020)]
# predict winner only using home_team_avg_score when playing at home
# vs visitor_team_avg_score when playing as visitor
home_should_win = s2018[s2018["home_team_avg_score"].gt(s2018["visitor_team_avg_score"])]
home_should_win["winner"].value_counts(normalize=True)

1    0.678982
0    0.321018
Name: winner, dtype: float64

##### Home team wins 66% of the time when their avg score is higher there is a flaw with this baseline in that it uses the average score from all games
##### that occured that season and correlates it with the winner of games that occured before that average score was known

In [9]:
df[(df["home_team_avg_score"] - 5).gt(df["visitor_team_avg_score"])]["winner"].value_counts(normalize=True)

1    0.717619
0    0.282381
Name: winner, dtype: float64

##### Same experiment as above but only using teams that are heavy favourites (10 pt average more). Win percantage seems to increse 1% per 1pt advantage

In [10]:

s2020 = df[df["season"].eq(2020)]
s2019 = df[df["season"].eq(2019)]
s2018 = df[df["season"].eq(2018)]
s2017 = df[df["season"].eq(2017)]

# Baseline

In [11]:
lr = LogisticRegression(max_iter=1000)
nb = GaussianNB()
knn = KNeighborsClassifier()
rf = RandomForestClassifier()
xgb = XGBClassifier(eval_metric="logloss", use_label_encoder=False)

models = [lr, nb, knn, rf, xgb]



In [12]:
def train_model(model, train_data, test_data):
    X_train = train_data.drop("winner", axis=1)
    y_train = train_data["winner"]
    X_test = test_data.drop("winner", axis=1)
    y_test = test_data["winner"]                     

    model = model
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(accuracy_score(y_test, y_pred))
    #print(cross_val_score(model, feats, target).mean())

In [38]:
train_data = df[df["season"].isin([2018])][["home_team_avg_score", "visitor_team_avg_score", "winner"]]
test_data = df[df["season"].isin([2019])][["home_team_avg_score", "visitor_team_avg_score", "winner"]]
for model in models:
    train_model(model, train_data, test_data)

0.5893169877408057
0.6024518388791593
0.5376532399299475
0.5359019264448336


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


0.542031523642732


#### Using score diff feat

In [40]:
features = ["winner", "home_team_avg_score", "visitor_team_avg_score", "home_avg_score_diff", "visitor_avg_score_diff"]
train_data = df[df["season"].isin(range(2018,2019))][features]
test_data = df[df["season"].isin([2018])][features]
for model in models:
    train_model(model, train_data, test_data)

0.6454018227009114
0.6607290803645401
0.7891466445733223


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


0.8744821872410936
0.8566694283347142


#### xgboost and random forest predict around 80% correctly when using data from the same season. Unfortunately when trying to predict results of future seasons the result is horrible.

### The Big problem with this so far is that averages for the season are used to predict outcomes of games that contributed to those averages

# Using stats from 2017 - 2020


In [41]:
lr = LogisticRegression(max_iter=1000)
nb = GaussianNB()
knn = KNeighborsClassifier()
rf = RandomForestClassifier(n_estimators=100)
xgb = XGBClassifier(n_estimators=100, eval_metric="logloss", use_label_encoder=False)

models = [lr, nb, knn, rf, xgb]



In [42]:
stats = pd.read_csv("/Users/tativalentine/Documents/GitHub/stats_feats.csv", index_col="game.id")

In [43]:
stats

Unnamed: 0_level_0,game.date,game.season,winner,home_ast,home_blk,home_dreb,home_fg3_pct,home_fg3a,home_fg3m,home_fg_pct,...,diff_away_fgm,diff_away_ft_pct,diff_away_fta,diff_away_ftm,diff_away_oreb,diff_away_pf,diff_away_pts,diff_away_reb,diff_away_stl,diff_away_turnover
game.id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2018-10-16,2018,1,22.15,3.50,30.55,0.186827,15.80,5.75,0.408467,...,0.35,0.026595,1.45,1.70,1.65,5.20,4.20,-0.20,2.50,-0.35
2,2018-10-16,2018,1,24.65,4.35,28.95,0.128660,11.30,3.45,0.414999,...,0.55,-0.014083,4.70,2.80,3.10,3.05,1.80,1.60,2.00,0.60
3,2018-10-17,2018,0,25.70,6.40,31.20,0.105784,11.30,3.30,0.439629,...,-2.85,-0.007727,3.95,2.20,-1.90,-7.30,-7.30,-0.20,-0.20,1.45
4,2018-10-17,2018,1,19.15,5.55,31.05,0.133770,12.75,4.90,0.452937,...,0.35,-0.060419,3.45,2.10,2.00,2.45,3.90,4.15,-0.95,3.85
5,2018-10-17,2018,1,20.90,6.10,31.70,0.175094,15.35,5.60,0.393937,...,0.75,0.067940,4.50,3.95,-0.25,1.70,7.40,2.60,0.15,-3.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46912,2019-01-21,2018,0,21.20,5.65,31.15,0.107874,7.35,2.35,0.416625,...,0.90,0.135110,2.10,2.75,0.25,-1.05,2.45,4.50,-1.00,-0.05
46913,2019-01-21,2018,1,26.55,5.45,33.30,0.256833,31.20,11.05,0.455795,...,7.65,0.113787,-1.45,-0.25,-2.85,-2.85,21.45,0.70,2.45,-0.35
46914,2019-01-21,2018,1,26.60,5.70,36.55,0.233078,30.05,10.85,0.432333,...,6.00,-0.058437,-1.80,-2.75,3.80,-1.05,13.75,11.45,-1.00,1.75
46915,2019-01-21,2018,0,15.85,4.75,31.10,0.119687,9.80,3.45,0.391709,...,-1.95,0.017012,3.60,3.25,1.25,1.00,-6.60,0.90,-0.40,3.55


In [44]:
X = stats.drop(["winner", "game.date", "game.season"], axis=1)
y = stats["winner"]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75)

for model in models:
    model = model
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_pred = model.predict(X_test)
    
    print(model)
    print(accuracy_score(y_train, y_train_pred))
    print(accuracy_score(y_test, y_pred))
    print(cross_val_score(model, X_test, y_test).mean())

LogisticRegression(max_iter=1000)
0.6554315156719622
0.6361880231809401
0.6381160973639654
GaussianNB()
0.6134607127522542
0.583386992916935
0.5933682319254806


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


KNeighborsClassifier()
0.7473164448261056
0.6027044430135222
0.5792068595927116
RandomForestClassifier()
1.0
0.6377978106889891
0.620096204130875
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, gamma=None, gpu_id=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, n_estimators=100, n_jobs=None,
              num_parallel_tree=None, predictor=None, random_state=None, ...)
0.980463718334049
0.6281390856406954




0.5962688537831834


In [48]:
train = stats[stats["game.season"].isin([2018,2019])]
test = stats[stats["game.season"].eq(2020)]

In [49]:
X_train = train.drop(["winner", "game.date", "game.season"], axis=1)
y_train = train["winner"]
X_test = test.drop(["winner", "game.date", "game.season"], axis=1)
y_test = test["winner"]

In [50]:
for model in models:
    model = model
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_pred = model.predict(X_test)
    
    print(model)
    print(accuracy_score(y_train, y_train_pred))
    print(accuracy_score(y_test, y_pred))
    print(cross_val_score(model, X_test, y_test).mean())

ValueError: Found array with 0 sample(s) (shape=(0, 54)) while a minimum of 1 is required.

# Using all stats

In [27]:
lr = LogisticRegression(max_iter=5000)
nb = GaussianNB()
knn = KNeighborsClassifier()
rf = RandomForestClassifier(n_estimators=100)
xgb = XGBClassifier(n_estimators=100, eval_metric="logloss", use_label_encoder=False)

models = [lr, nb, knn, rf, xgb]



In [28]:
stats = pd.read_csv("/Users/tativalentine/Documents/GitHub/stats_feats.csv", index_col="game.id")

In [29]:
X = stats.drop(["winner", "game.date", "game.season"], axis=1)
y = stats["winner"]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75)

for model in models:
    model = model
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_pred = model.predict(X_test)
    
    print(model)
    print(accuracy_score(y_train, y_train_pred))
    print(accuracy_score(y_test, y_pred))
    print(cross_val_score(model, X_test, y_test).mean())

LogisticRegression(max_iter=5000)
0.657471017604122
0.6342562781712814
0.6361956392293313
GaussianNB()
0.6110991841992272
0.5975531229877656
0.5956231780501318


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


KNeighborsClassifier()
0.7511807642765135
0.5898261429491307
0.5759779631441871
RandomForestClassifier()
1.0
0.6352221506761108
0.6107522873075788
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, gamma=None, gpu_id=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, n_estimators=100, n_jobs=None,
              num_parallel_tree=None, predictor=None, random_state=None, ...)
0.9791756118505797
0.6204121056020605




0.6001232324173748


# Using stats from 2020 season

In [51]:
train = stats[stats["game.season"].lt(2020)]
test = stats[stats["game.season"].eq(2020)]

In [52]:
X_train = train.drop(["winner", "game.date", "game.season"], axis=1)
y_train = train["winner"]
X_test = test.drop(["winner", "game.date", "game.season"], axis=1)
y_test = test["winner"]

In [53]:
# using only diff
X_train = X_train.loc[:,"diff_away_ast":]
X_test = X_test.loc[:,"diff_away_ast":]

In [54]:
for model in models:
    model = model
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_pred = model.predict(X_test)
    
    print(model)
    print(accuracy_score(y_train, y_train_pred))
    print(accuracy_score(y_test, y_pred))
    print(precision_score(y_test, y_pred))
    print(recall_score(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

ValueError: Found array with 0 sample(s) (shape=(0, 18)) while a minimum of 1 is required.

##### More data slightly improved random forest and XGboost algos. Some models overfit so possibly hyper parameter tuning would help? But I think we need to be more selective of features / engineer better featrures

# Best_model

In [35]:
lr.fit(X_train, y_train)
y_train_pred = lr.predict(X_train)
y_pred = lr.predict(X_test)

print(accuracy_score(y_train, y_train_pred))
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(npv_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

ValueError: Found array with 0 sample(s) (shape=(0, 18)) while a minimum of 1 is required.

In [36]:
plt.figure()
plot_confusion_matrix(lr, X_test, y_test)
plt.ylabel("True Outcome")
plt.yticks(ticks=[0,1], labels=["Away Win", "Home Win"])
plt.xticks(ticks=[0,1], labels=["Away Win", "Home Win"])
plt.xlabel("Predicted Outcome")
plt.show()



ValueError: Found array with 0 sample(s) (shape=(0, 18)) while a minimum of 1 is required.

<Figure size 640x480 with 0 Axes>

In [37]:
176/311

0.5659163987138264