# NBA Shot Log EDA 
### Virginia Baskin and Harry Golen

Edited version of EDA. Only essential changes to data were kept, so visualizations/other explorations are in 'nba baseline[s]' notebook. 

Table of contents: 
-Feature Cleaning/Engineering

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
from scipy import stats #for Box-Cox

from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

In [3]:
#import dataset
df = pd.read_csv('shot_logs.csv')
df.head(3)

Unnamed: 0,GAME_ID,MATCHUP,LOCATION,W,FINAL_MARGIN,SHOT_NUMBER,PERIOD,GAME_CLOCK,SHOT_CLOCK,DRIBBLES,...,SHOT_DIST,PTS_TYPE,SHOT_RESULT,CLOSEST_DEFENDER,CLOSEST_DEFENDER_PLAYER_ID,CLOSE_DEF_DIST,FGM,PTS,player_name,player_id
0,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,1,1,1:09,10.8,2,...,7.7,2,made,"Anderson, Alan",101187,1.3,1,2,brian roberts,203148
1,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,2,1,0:14,3.4,0,...,28.2,3,missed,"Bogdanovic, Bojan",202711,6.1,0,0,brian roberts,203148
2,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,3,1,0:00,,3,...,10.1,2,missed,"Bogdanovic, Bojan",202711,0.9,0,0,brian roberts,203148


## Feature Cleaning/Engineering

Some quick necessary feature engineering to make the dataset easier to work with. (House keeping) (Virginia)

In [4]:
# Make all column names lowercase (easier to type)
df.columns = df.columns.str.lower()

In [5]:
# Touch times should not be negative, remove anamolies
zero_outliers = df[(df.touch_time < 0)]
df2 = df.drop(zero_outliers.index)

In [6]:
#convert binary variables into numerical binary indicators (0,1)
df2['location'] = np.where(df2['location'] == 'H', 1, 0)
df2['w'] = np.where(df2['w'] == 'W', 1, 0)

In [7]:
#convert game_clock from min:secs to just seconds (to match shot_clock)
df2.game_clock = df2.game_clock.apply(lambda x: int(x.split(":")[0])*60 + int(x.split(":")[1]))

In [7]:
#df2 is cleaned dset
df2.head(3)

Unnamed: 0,game_id,matchup,location,w,final_margin,shot_number,period,game_clock,shot_clock,dribbles,...,shot_dist,pts_type,shot_result,closest_defender,closest_defender_player_id,close_def_dist,fgm,pts,player_name,player_id
0,21400899,"MAR 04, 2015 - CHA @ BKN",0,1,24,1,1,69,10.8,2,...,7.7,2,made,"Anderson, Alan",101187,1.3,1,2,brian roberts,203148
1,21400899,"MAR 04, 2015 - CHA @ BKN",0,1,24,2,1,14,3.4,0,...,28.2,3,missed,"Bogdanovic, Bojan",202711,6.1,0,0,brian roberts,203148
2,21400899,"MAR 04, 2015 - CHA @ BKN",0,1,24,3,1,0,,3,...,10.1,2,missed,"Bogdanovic, Bojan",202711,0.9,0,0,brian roberts,203148


### Adjusting skewness: 


In [8]:
# df2b will be skew-adjusted dset
df2b = df2.copy()

In [9]:
#transforming skewness

### dribbles:
print('skewness of raw dribbles:', df2b['dribbles'].skew())

# log transformation
df2b['log_dribbles'] = np.log(df2b.dribbles+0.5)
print('skewness of log-tranformed dribbles:', df2b['log_dribbles'].skew())

# boxcox transformation 
df2b['cox_dribbles'], _ = stats.boxcox(df2b.dribbles+0.5)
print('skewness of cox-tranformed dribbles:', df2b['cox_dribbles'].skew())

print('\n', '=================================', '\n')

### close_def_dist
print('skewness of raw close_def_dist:', df2b['close_def_dist'].skew())

# log transformation
df2b['log_close_def_dist'] = np.log(df2b.close_def_dist+0.5)
print('skewness of log-tranformed close_def_dist:', df2b['log_close_def_dist'].skew())

# boxcox transformation
df2b['cox_close_def_dist'], _ = stats.boxcox(df2b.close_def_dist+0.5)
print('skewness of cox-tranformed close_def_dist:', df2b['cox_close_def_dist'].skew())

print('\n','=================================', '\n')

### touch_time
print('skewness of raw touch_time:', df2b['touch_time'].skew())

# log transformation
df2b['log_touch_time'] = np.log(df2b.touch_time+0.5)
print('skewness of log-tranformed touch_time:', df2b['log_touch_time'].skew())

# boxcox transformation
df2b['cox_touch_time'], _ = stats.boxcox(df2b.touch_time+0.5)
print('skewness of cox-tranformed touch_time:', df2b['cox_touch_time'].skew())

print('\n','=================================', '\n')

### shot_number
print('skewness of raw shot_number:', df2b['shot_number'].skew())

# log transformation
df2b['log_shot_number'] = np.log(df2b.shot_number+0.5)
print('skewness of log-tranformed shot_number:', df2b['log_shot_number'].skew())

# boxcox transformation
df2b['cox_shot_number'], _ = stats.boxcox(df2b.shot_number+0.5)
print('skewness of cox-tranformed shot_number:', df2b['cox_shot_number'].skew())


#skew adjusted dset
df2b.head(5)

skewness of raw dribbles: 2.777119650393458
skewness of log-tranformed dribbles: 0.6627391354599313
skewness of cox-tranformed dribbles: 0.26900748917728257


skewness of raw close_def_dist: 2.4791904173734816
skewness of log-tranformed close_def_dist: -0.5865318510812597
skewness of cox-tranformed close_def_dist: 0.03754678443842185


skewness of raw touch_time: 2.3990504499978655
skewness of log-tranformed touch_time: 0.4177412301213816
skewness of cox-tranformed touch_time: 0.02393201232593657


skewness of raw shot_number: 1.1388870086190162
skewness of log-tranformed shot_number: -0.2497821073645716
skewness of cox-tranformed shot_number: -0.03311189130948012


Unnamed: 0,game_id,matchup,location,w,final_margin,shot_number,period,game_clock,shot_clock,dribbles,...,player_name,player_id,log_dribbles,cox_dribbles,log_close_def_dist,cox_close_def_dist,log_touch_time,cox_touch_time,log_shot_number,cox_shot_number
0,21400899,"MAR 04, 2015 - CHA @ BKN",0,1,24,1,1,69,10.8,2,...,brian roberts,203148,0.916291,0.750047,0.587787,0.633374,0.875469,0.793966,0.405465,0.419786
1,21400899,"MAR 04, 2015 - CHA @ BKN",0,1,24,2,1,14,3.4,0,...,brian roberts,203148,-0.693147,-0.814192,1.88707,2.413943,0.262364,0.254704,0.916291,0.991621
2,21400899,"MAR 04, 2015 - CHA @ BKN",0,1,24,3,1,0,,3,...,brian roberts,203148,1.252763,0.956203,0.336472,0.351094,1.163151,1.022264,1.252763,1.396375
3,21400899,"MAR 04, 2015 - CHA @ BKN",0,1,24,4,2,707,10.3,2,...,brian roberts,203148,0.916291,0.750047,1.360977,1.622419,0.875469,0.793966,1.504077,1.714181
4,21400899,"MAR 04, 2015 - CHA @ BKN",0,1,24,5,2,634,10.9,2,...,brian roberts,203148,0.916291,0.750047,0.470004,0.498859,1.163151,1.022264,1.704748,1.97789


Previous EDA found that fixing skewness improves correlation of the features to fgm, and Box-Cox is consistently the best transformation. 

## Feature Engineering

Engineer features using domain knowledge of basketball. (Harry)

In [10]:
# fill circumstances where shot clock is off to game clock
df2.shot_clock.fillna(df2.game_clock, inplace = True)

# make column for seconds left in game
df2['full_game_clock'] = df2.game_clock + ((4 - df2.period) * 720)

# dummy variable for clutch shots
df2['clutch_shot'] = np.where((df2['final_margin'] >= -5) & (df2['final_margin'] <= 5) & (df2['full_game_clock'] <= 120), 1, 0)

# dummy variable for buzzer beaters
df2['buzzer_beater'] = np.where((df2['shot_clock'] < 1), 1, 0)

# add player shooting % column
df2['player_shooting'] = df2.groupby('player_id')['fgm'].transform(lambda x: x.sum()/x.count())

# add defense shooting % column
df2['defender_quality'] = df2.groupby('closest_defender_player_id')['fgm'].transform(lambda x: x.sum()/x.count())

# add dummy for assumed putback
df2['putback'] = np.where((df2['shot_clock'] > 23) & (df2['shot_dist'] < 5), 1, 0)

# add dummy for overtime
df2['overtime'] = np.where((df2['period'] > 4), 1, 0)

In [11]:
#Make sure these features are also in the skew-adjusted dset
# (will add adjusted skewness 0to a combined final dset  for model building)
df2b['full_game_clock'] = df2['full_game_clock']
df2b['clutch_shot'] = df2['clutch_shot']
df2b['buzzer_beater'] = df2['buzzer_beater']
df2b['player_shooting'] = df2['player_shooting']
df2b['putback'] = df2['putback']
df2b['overtime'] = df2['overtime']
df2b['defender_quality'] = df2['defender_quality']
df2b.shot_clock.fillna(df2b.game_clock, inplace = True)
df2b.head()

Unnamed: 0,game_id,matchup,location,w,final_margin,shot_number,period,game_clock,shot_clock,dribbles,...,cox_touch_time,log_shot_number,cox_shot_number,full_game_clock,clutch_shot,buzzer_beater,player_shooting,putback,overtime,defender_quality
0,21400899,"MAR 04, 2015 - CHA @ BKN",0,1,24,1,1,69,10.8,2,...,0.793966,0.405465,0.419786,2229,0,0,0.400538,0,0,0.45288
1,21400899,"MAR 04, 2015 - CHA @ BKN",0,1,24,2,1,14,3.4,0,...,0.254704,0.916291,0.991621,2174,0,0,0.400538,0,0,0.407524
2,21400899,"MAR 04, 2015 - CHA @ BKN",0,1,24,3,1,0,0.0,3,...,1.022264,1.252763,1.396375,2160,0,1,0.400538,0,0,0.407524
3,21400899,"MAR 04, 2015 - CHA @ BKN",0,1,24,4,2,707,10.3,2,...,0.793966,1.504077,1.714181,2147,0,0,0.400538,0,0,0.46
4,21400899,"MAR 04, 2015 - CHA @ BKN",0,1,24,5,2,634,10.9,2,...,1.022264,1.704748,1.97789,2074,0,0,0.400538,0,0,0.482109


- **clutch shots** - final margin <= 5 pts, <120 secs left in game 

- **buzzer beaters** - dummy variable indicating whether there's <1 seconds left on shot clock 

- **putbacks** - dummy variable indicating if shot clock > 23 seconds and shot distance < 5ft (usually a rebound)

- **player_shooting** - % of shots made for that player in this season 

- **defender_quality** - % of shots made for shots defended by that player in this season 

- **overtime** - dummy variable indicating if game is in overtime or not

- **full_game_clock** - number of seconds left in game, accounting for game_clock and period (is negative for overtime)

# Put it all together 

## Dataset to use from here on out (for modeling)
Has adjusted skewness, fixed anomalies, imputed na's, and added features 

In [15]:
df2b.columns

Index(['game_id', 'matchup', 'location', 'w', 'final_margin', 'shot_number',
       'period', 'game_clock', 'shot_clock', 'dribbles', 'touch_time',
       'shot_dist', 'pts_type', 'shot_result', 'closest_defender',
       'closest_defender_player_id', 'close_def_dist', 'fgm', 'pts',
       'player_name', 'player_id', 'log_dribbles', 'cox_dribbles',
       'log_close_def_dist', 'cox_close_def_dist', 'log_touch_time',
       'cox_touch_time', 'log_shot_number', 'cox_shot_number',
       'full_game_clock', 'clutch_shot', 'buzzer_beater', 'player_shooting',
       'putback', 'overtime', 'defender_quality'],
      dtype='object')

In [12]:
nba = df2b[['game_id', 'matchup', 'location', 'w', 'final_margin', 'cox_shot_number',
            'period', 'game_clock', 'shot_clock', 'full_game_clock', 'overtime', 'buzzer_beater', 
            'putback', 'clutch_shot', 'cox_dribbles', 'cox_touch_time', 'shot_dist', 'pts_type',  
            'closest_defender', 'closest_defender_player_id', 'defender_quality', 'cox_close_def_dist', 'player_name', 
            'player_id', 'player_shooting', 'fgm']]

#dataset for model building–has all the necessary adjustments and new features
nba.head()

Unnamed: 0,game_id,matchup,location,w,final_margin,cox_shot_number,period,game_clock,shot_clock,full_game_clock,...,shot_dist,pts_type,closest_defender,closest_defender_player_id,defender_quality,cox_close_def_dist,player_name,player_id,player_shooting,fgm
0,21400899,"MAR 04, 2015 - CHA @ BKN",0,1,24,0.419786,1,69,10.8,2229,...,7.7,2,"Anderson, Alan",101187,0.45288,0.633374,brian roberts,203148,0.400538,1
1,21400899,"MAR 04, 2015 - CHA @ BKN",0,1,24,0.991621,1,14,3.4,2174,...,28.2,3,"Bogdanovic, Bojan",202711,0.407524,2.413943,brian roberts,203148,0.400538,0
2,21400899,"MAR 04, 2015 - CHA @ BKN",0,1,24,1.396375,1,0,0.0,2160,...,10.1,2,"Bogdanovic, Bojan",202711,0.407524,0.351094,brian roberts,203148,0.400538,0
3,21400899,"MAR 04, 2015 - CHA @ BKN",0,1,24,1.714181,2,707,10.3,2147,...,17.2,2,"Brown, Markel",203900,0.46,1.622419,brian roberts,203148,0.400538,0
4,21400899,"MAR 04, 2015 - CHA @ BKN",0,1,24,1.97789,2,634,10.9,2074,...,3.7,2,"Young, Thaddeus",201152,0.482109,0.498859,brian roberts,203148,0.400538,0


### Feature Selection

(Harry)

The first method we use is to select features via intuition and domain knowledge. We also look at a sorted list of the highest correlated features. This leads us to remove features such as player and defender ID in favor of player_shooting and defender_quality. 

In addition, we use sequential backward selection (SBS) to select the strongest set of 12 features from the dataset with 26 columns. We then compare the columns selected. 

In [19]:
nba_corr = nba.corr(numeric_only=True)['fgm']
best_features = abs(nba_corr[abs(nba_corr) > 0.01]).sort_values(ascending=False)
best_features

fgm                 1.000000
shot_dist           0.192518
pts_type            0.121745
player_shooting     0.105316
defender_quality    0.077712
cox_touch_time      0.062989
final_margin        0.058861
putback             0.058657
w                   0.050329
buzzer_beater       0.048777
cox_dribbles        0.043376
full_game_clock     0.016397
period              0.014091
clutch_shot         0.012559
game_clock          0.011591
overtime            0.010588
Name: fgm, dtype: float64

In [20]:
nba.columns

Index(['game_id', 'matchup', 'location', 'w', 'final_margin',
       'cox_shot_number', 'period', 'game_clock', 'shot_clock',
       'full_game_clock', 'overtime', 'buzzer_beater', 'putback',
       'clutch_shot', 'cox_dribbles', 'cox_touch_time', 'shot_dist',
       'pts_type', 'closest_defender', 'closest_defender_player_id',
       'defender_quality', 'cox_close_def_dist', 'player_name', 'player_id',
       'player_shooting', 'fgm'],
      dtype='object')

In [13]:
#data for feature selection 
X = nba.drop(labels = ['fgm', 'matchup', 'player_name', 'closest_defender'], axis = 1)
y = nba[['fgm']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0) #get training and testing data
y_train = np.squeeze(y_train)

from sklearn.preprocessing import StandardScaler
X_train = StandardScaler().fit_transform(X_train) #standardize X matrix so features don't get too unequal weight


In [23]:
features = SequentialFeatureSelector(LogisticRegression(), n_features_to_select = 12, direction="backward").fit(X_train, y_train)
#wrapper method
#SBS

#FOR FUTURE - drop game_id and closest_defended_player_id

features = features.get_support() # keep only the subset of 12 features that performed the best

In [24]:
X_trains = pd.DataFrame(X_train).loc[:,features]
X_tests = pd.DataFrame(X_test).loc[:,features]
print(X_trains.head())

cols = list(X.columns)

print("selected features: ")
for i in range(len(features)):
    if features[i]:
        print(cols[i])

         0         3         4         8         10        13        14  \
0  1.163910  1.195393 -0.341098 -0.290069 -0.107611  1.033358  0.945554   
1  0.364778 -0.771047 -1.096771  1.452025 -0.107611 -0.963427 -0.256913   
2  1.322961  0.212173 -0.065304  1.061373 -0.107611  0.372686  0.178672   
3  1.586752 -1.829900 -0.673468  0.443135 -0.107611  0.372686  0.178672   
4  0.667362 -0.771047  0.735420 -0.383919 -0.107611 -0.963427 -0.711869   

         15        16        17        18        19  
0 -0.178004 -0.601384 -1.985648 -0.936308 -0.317875  
1  0.890432 -0.601384  0.554951  0.278003  0.082611  
2 -0.324210 -0.601384  0.570112  1.796320  0.595513  
3  1.351546  1.662832  0.554812 -0.945749 -0.043247  
4  1.306559  1.662832  0.554863 -0.515237  0.162907  
selected features: 
game_id
final_margin
cox_shot_number
full_game_clock
buzzer_beater
cox_dribbles
cox_touch_time
shot_dist
pts_type
closest_defender_player_id
defender_quality
cox_close_def_dist


Correlation analysis: 


- shot_dist    

- pts_type  

- player_shooting ~  

- defender_quality 

- cox_touch_time  

- final_margin  

- putback ~    

- w ~         

- buzzer_beater  

- cox_dribbles  

- full_game_clock 

- period ~        

- clutch_shot ~   

- game_clock ~  

- overtime ~   



SBS picks: 


- game_id ?

- final_margin #

- cox_shot_number 

- full_game_clock #

- buzzer_beater #

- cox_dribbles #

- cox_touch_time #

- shot_dist #

- pts_type #

- closest_defender_player_id ?

- cox_close_def_dist

- defender_quality #


\# = in both sets

? = unique keys (interesting, needs further investigation) 

\~ = only in correlation

# Looking ahead:

Potential algorithms to use for classification: 

- logistic regression 
- K nearest classification 
- random forest classification 
- SVM
- ANNs

# Baseline models

In [18]:
baseline = df2.copy(deep = True)
baseline.drop(columns = ["game_id", "matchup", "shot_result", "closest_defender", "closest_defender_player_id", 
                         "fgm", "player_name", "player_id", "full_game_clock", "clutch_shot", "buzzer_beater", 
                         "player_shooting", "defender_quality", "putback", "overtime", "pts"], inplace = True)
baseline.head()

Unnamed: 0,location,w,final_margin,shot_number,period,game_clock,shot_clock,dribbles,touch_time,shot_dist,pts_type,close_def_dist
0,0,1,24,1,1,69,10.8,2,1.9,7.7,2,1.3
1,0,1,24,2,1,14,3.4,0,0.8,28.2,3,6.1
2,0,1,24,3,1,0,0.0,3,2.7,10.1,2,0.9
3,0,1,24,4,2,707,10.3,2,1.9,17.2,2,3.4
4,0,1,24,5,2,634,10.9,2,2.7,3.7,2,1.1


In [19]:
# Prepare baseline data for modeling (no engineered features)
X = baseline
y = df2[['fgm']]

#create training, testing, and validation sets 60-20-20
X_all, X_test, y_all, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0) 
X_train, X_valid, y_train, y_valid = train_test_split(X_all, y_all, test_size = 0.25, random_state = 0) 
y_train = np.squeeze(y_train)
y_valid = np.squeeze(y_valid)

# Standardize feartures so they are not weighted unequally
X_train = StandardScaler().fit_transform(X_train) 
X_valid = StandardScaler().fit_transform(X_valid) 
X_test = StandardScaler().fit_transform(X_test)  

In [20]:
#BASELINE MODEL
logit = LogisticRegression().fit(X_train, y_train)

preds = logit.predict(X_valid) 

#model evaluations metric 
print("Baseline Logistic Regression Model:")
print("Accuracy =", accuracy_score(preds, y_valid))
print("ROC AUC Score =", roc_auc_score(y_valid, preds))

Baseline Logistic Regression Model:
Accuracy = 0.6132592360676268
ROC AUC Score = 0.6005909850436838


In [43]:
#create a function that will run the top models given a dataframe
#then, all we have to do is call the function on the unengineered versus featured engineered dataframes for immediate comparison 

def all_models(X_train, y_train, X_valid, y_valid):
    """
    Given training matrix, training labels, test matrix, and test labels, 
    fit Logistic Regression, SVM, KNN, Random Forrest, and ANN 
    
    Note: This does not optimally tune models, just compares the difference of default models given a data set.
    Ideal for comparing the effects of data cleaning and/or feature engineering. 
    """
    accuracies = []
    f1scores = []
    rocscore = []
    
    # Logistic Regression 
    log = LogisticRegression().fit(X_train, y_train)
    log_preds = log.predict(X_valid) 
    
    
    #model evaluations metric 
    log_acc = accuracy_score(y_valid, log_preds)
    log_f1 = f1_score(y_valid, log_preds)
    log_roc = roc_auc_score(y_valid, log_preds)
    
    accuracies.append(("Logistic Regression", log_acc))
    f1scores.append(("Logistic Regression", log_f1))
    rocscore.append(('Logistic Regression', log_roc))
    
    #print it out
    print("Logistic Regression: \n")
    print("Accuracy = {}".format(log_acc))
    print("F1 score = ", log_f1)
    print("ROC AUC Score =", log_roc)
    print("+++++++++++++++++", "\n")
    
    
    # SVM 
    svc = SVC().fit(X_train, y_train)
    svc_preds = svc.predict(X_valid) 
    
    #model evaluations metric 
    svc_acc = accuracy_score(y_valid, svc_preds)
    svc_f1 = f1_score(y_valid, svc_preds)
    svc_roc = roc_auc_score(y_valid, svc_preds)
    
    accuracies.append(("SVM", svc_acc))
    f1scores.append(("SMV", svc_f1))
    rocscore.append(("SVM", svc_roc))
    
    #print it out
    print("SVM: \n")
    print("Accuracy = {}".format(svc_acc))
    print("F1 score = ", svc_f1)
    print("ROC AUC Score =", svc_roc)
    print("+++++++++++++++++", "\n")
    
    
    # KNN
    KNNc = KNeighborsClassifier().fit(X_train, y_train)
    knn_preds = KNNc.predict(X_valid)
    
    #model evaluations metric 
    knn_acc = accuracy_score(y_valid, knn_preds)
    knn_f1 = f1_score(y_valid, knn_preds)
    knn_roc = roc_auc_score(y_valid, knn_preds)
    
    accuracies.append(("KNN", knn_acc))
    f1scores.append(("KNN", knn_f1))
    rocscore.append(("KNN", knn_roc))
    
    #print it out
    print("KNN: \n")
    print("Accuracy = {}".format(knn_acc))
    print("F1 score = ", knn_f1)
    print("ROC AUC Score =", knn_roc)
    print("+++++++++++++++++", "\n")   
    
    
    # Random Forrest 
    rfc = RandomForestClassifier().fit(X_train, y_train)
    rfc_preds = rfc.predict(X_valid)
    
    #model evaluations metric 
    rfc_acc = accuracy_score(y_valid, rfc_preds)
    rfc_f1 = f1_score(y_valid, rfc_preds)
    rfc_roc = roc_auc_score(y_valid, rfc_preds)
    
    accuracies.append(("Random Forest", rfc_acc))
    f1scores.append(("Random Forest", rfc_f1))
    rocscore.append(("Random Forest", rfc_roc))
    
    #print it out
    print("Random Forrest: \n")
    print("Accuracy = {}".format(rfc_acc))
    print("F1 score = ", rfc_f1)
    print("ROC AUC Score =", rfc_roc)
    print("+++++++++++++++++", "\n")
    
    
    # ANN 
    ann = MLPClassifier().fit(X_train, y_train)
    ann_preds = ann.predict(X_valid)
    
    #model evaluations metric 
    ann_acc = accuracy_score(y_valid, ann_preds)
    ann_f1 = f1_score(y_valid, ann_preds)
    ann_roc = roc_auc_score(y_valid, ann_preds)
    
    accuracies.append(("ANN", ann_acc))
    f1scores.append(("ANN", ann_f1))
    rocscore.append(("ANN", ann_roc))
    
    #print it out
    print("Neural Networks: \n")
    print("Accuracy = {}".format(ann_acc))
    print("F1 score = ", ann_f1)
    print("ROC AUC Score =", ann_roc)
    print("+++++++++++++++++", "\n")  
    
    return accuracies, f1scores, rocscore
    

In [46]:
all_models(X_train, y_train, X_valid, y_valid)

Logistic Regression: 

Accuracy = 0.6132592360676268
F1 score =  0.5250865051903115
ROC AUC Score = 0.6005909850436838
+++++++++++++++++ 

SVM: 

Accuracy = 0.6211646837820914
F1 score =  0.445081403347856
ROC AUC Score = 0.5950533548369424
+++++++++++++++++ 

KNN: 

Accuracy = 0.565552598622417
F1 score =  0.4971689994111519
ROC AUC Score = 0.5574882903501033
+++++++++++++++++ 

Random Forrest: 

Accuracy = 0.6071149029430182
F1 score =  0.48003314859895374
ROC AUC Score = 0.5883473890401856
+++++++++++++++++ 

Neural Networks: 

Accuracy = 0.6193252974326863
F1 score =  0.4564403464654932
ROC AUC Score = 0.5950080932581048
+++++++++++++++++ 



([('Logistic Regression', 0.6132592360676268),
  ('SVM', 0.6211646837820914),
  ('KNN', 0.565552598622417),
  ('Random Forest', 0.6071149029430182),
  ('ANN', 0.6193252974326863)],
 [('Logistic Regression', 0.5250865051903115),
  ('SMV', 0.445081403347856),
  ('KNN', 0.4971689994111519),
  ('Random Forest', 0.48003314859895374),
  ('ANN', 0.4564403464654932)],
 [('Logistic Regression', 0.6005909850436838),
  ('SVM', 0.5950533548369424),
  ('KNN', 0.5574882903501033),
  ('Random Forest', 0.5883473890401856),
  ('ANN', 0.5950080932581048)])

# Model Build with Engineered Features

In [20]:
nba.columns

Index(['game_id', 'matchup', 'location', 'w', 'final_margin',
       'cox_shot_number', 'period', 'game_clock', 'shot_clock',
       'full_game_clock', 'overtime', 'buzzer_beater', 'putback',
       'clutch_shot', 'cox_dribbles', 'cox_touch_time', 'shot_dist',
       'pts_type', 'closest_defender', 'closest_defender_player_id',
       'defender_quality', 'cox_close_def_dist', 'player_name', 'player_id',
       'player_shooting', 'fgm'],
      dtype='object')

previous SBS picks:

game_id ?
final_margin #
cox_shot_number
full_game_clock #
buzzer_beater #
cox_dribbles #
cox_touch_time #
shot_dist #
pts_type #
closest_defender_player_id ?
cox_close_def_dist
defender_quality #

In [56]:
#Sequential Feature Selection for best features (including engineered features)
X = nba.drop(labels = ['fgm', 'matchup', 'player_name', 'closest_defender', 'game_id', 
                       'player_id', 'closest_defender_player_id'], axis = 1)
y = nba[['fgm']]

y_stand = np.squeeze(y)
X_stand = StandardScaler().fit_transform(X) #standardize X matrix so features don't get too unequal weight


features2 = SequentialFeatureSelector(LogisticRegression(), n_features_to_select = 10, 
                                     direction="backward").fit(X_stand, y_stand)
#wrapper method
#SBS


features2 = features2.get_support() # keep only the subset of 10 features that performed the best

cols = list(X.columns)
print("selected features: ")
for i in range(len(features2)):
    if features2[i]:
        print(cols[i])

selected features: 
w
final_margin
cox_shot_number
full_game_clock
buzzer_beater
cox_touch_time
shot_dist
pts_type
defender_quality
cox_close_def_dist


# Candidate Models 


In [13]:
#Dataset with backwards selection dataset
selected_X = nba[["w", "final_margin", "cox_shot_number", "full_game_clock", "buzzer_beater", "cox_touch_time", 
                  "shot_dist", "pts_type", "defender_quality", "cox_close_def_dist"]]
selected_y = nba[['fgm']]

#create training, testing, and validation sets 60-20-20
sX_all, sX_test, sy_all, sy_test = train_test_split(selected_X, selected_y, test_size = 0.2, random_state = 0) 
sX_train, sX_valid, sy_train, sy_valid = train_test_split(sX_all, sy_all, test_size = 0.25, random_state = 0) 
sy_train = np.squeeze(sy_train)
sy_valid = np.squeeze(sy_valid)

# Standardize feartures so they are not weighted unequally
sX_train = StandardScaler().fit_transform(sX_train) 
sX_valid = StandardScaler().fit_transform(sX_valid) 
sX_test = StandardScaler().fit_transform(sX_test)  


In [13]:
# Dataset using domain knowledge 
# WINNER (compared to backwards selection)
nba_selected = nba[['location', 'w', 'final_margin', 'cox_shot_number',
            'period', 'game_clock', 'shot_clock', 'overtime', 'buzzer_beater', 
            'putback', 'clutch_shot', 'cox_dribbles', 'cox_touch_time', 'shot_dist', 'pts_type',  
            'defender_quality', 'cox_close_def_dist', 'player_shooting', 'fgm']]
sX2 = nba_selected.drop(columns=["fgm"])
sy2 = nba[['fgm']]
#selected x (round 2) = sX2

#create training, testing, and validation sets 60-20-20
sX2_all, sX2_test, sy2_all, sy2_test = train_test_split(sX2, sy2, test_size = 0.2, random_state = 0) #get training and testing data
sX2_train, sX2_valid, sy2_train, sy2_valid = train_test_split(sX2_all, sy2_all, test_size = 0.25, random_state = 0) #get training and testing data
sy2_train = np.squeeze(sy2_train)
sy2_valid = np.squeeze(sy2_valid)

# Standardize feartures so they are not weighted unequally
sX2_train = StandardScaler().fit_transform(sX2_train) 
sX2_valid = StandardScaler().fit_transform(sX2_valid) 
sX2_test = StandardScaler().fit_transform(sX2_test) 

In [91]:
all_models(sX_train, sy_train, sX_valid, sy_valid)
#backwards selection choice

Logistic Regression: 

Accuracy = 0.615920475892298
F1 score =  0.5154058858384356
ROC AUC Score = 0.6010445108302945
+++++++++++++++++ 

SVM: 

Accuracy = 0.6214386349405134
F1 score =  0.44231767079850104
ROC AUC Score = 0.5949321176912478
+++++++++++++++++ 

KNN: 

Accuracy = 0.5686834690043833
F1 score =  0.4979272014942372
ROC AUC Score = 0.5601452561467054
+++++++++++++++++ 

Random Forrest: 

Accuracy = 0.6030839073262367
F1 score =  0.4792030399507035
ROC AUC Score = 0.5849457444505197
+++++++++++++++++ 

Neural Networks: 

Accuracy = 0.6234737006887915
F1 score =  0.46988814810733376
ROC AUC Score = 0.6002210893328862
+++++++++++++++++ 



([('Logistic Regression', 0.615920475892298),
  ('SVM', 0.6214386349405134),
  ('KNN', 0.5686834690043833),
  ('Random Forest', 0.6030839073262367),
  ('ANN', 0.6234737006887915)],
 [('Logistic Regression', 0.5154058858384356),
  ('SMV', 0.44231767079850104),
  ('KNN', 0.4979272014942372),
  ('Random Forest', 0.4792030399507035),
  ('ANN', 0.46988814810733376)],
 [('Logistic Regression', 0.6010445108302945),
  ('SVM', 0.5949321176912478),
  ('KNN', 0.5601452561467054),
  ('Random Forest', 0.5849457444505197),
  ('ANN', 0.6002210893328862)])

In [None]:
# Before: 
([('Logistic Regression', 0.6132592360676268),
  ('SVM', 0.6211646837820914),
  ('KNN', 0.565552598622417),
  ('Random Forest', 0.6071149029430182),
  ('ANN', 0.6193252974326863)],
 [('Logistic Regression', 0.5250865051903115),
  ('SMV', 0.445081403347856),
  ('KNN', 0.4971689994111519),
  ('Random Forest', 0.48003314859895374),
  ('ANN', 0.4564403464654932)],
 [('Logistic Regression', 0.6005909850436838),
  ('SVM', 0.5950533548369424),
  ('KNN', 0.5574882903501033),
  ('Random Forest', 0.5883473890401856),
  ('ANN', 0.5950080932581048)])

# After (domain dset)
([('Logistic Regression', 0.6159987476518473),
  ('SVM', 0.6228083907326236),
  ('KNN', 0.5625),
  ('Random Forest', 0.6125547902316844),
  ('ANN', 0.6140419536631183)],
 [('Logistic Regression', 0.5173635022134775),
  ('SMV', 0.4479954180985109),
  ('KNN', 0.49234821306934295),
  ('Random Forest', 0.4772415249762383),
  ('ANN', 0.5116854822737177)],
 [('Logistic Regression', 0.6014372153063561),
  ('SVM', 0.5967785629177886),
  ('KNN', 0.5541786453438191),
  ('Random Forest', 0.592360459586251),
  ('ANN', 0.5989212063790875)])

In [92]:
all_models(sX2_train, sy2_train, sX2_valid, sy2_valid)
#harry domain knowledge

Logistic Regression: 

Accuracy = 0.6159987476518473
F1 score =  0.5173635022134775
ROC AUC Score = 0.6014372153063561
+++++++++++++++++ 

SVM: 

Accuracy = 0.6228083907326236
F1 score =  0.4479954180985109
ROC AUC Score = 0.5967785629177886
+++++++++++++++++ 

KNN: 

Accuracy = 0.5625
F1 score =  0.49234821306934295
ROC AUC Score = 0.5541786453438191
+++++++++++++++++ 

Random Forrest: 

Accuracy = 0.6125547902316844
F1 score =  0.4772415249762383
ROC AUC Score = 0.592360459586251
+++++++++++++++++ 

Neural Networks: 

Accuracy = 0.6140419536631183
F1 score =  0.5116854822737177
ROC AUC Score = 0.5989212063790875
+++++++++++++++++ 



([('Logistic Regression', 0.6159987476518473),
  ('SVM', 0.6228083907326236),
  ('KNN', 0.5625),
  ('Random Forest', 0.6125547902316844),
  ('ANN', 0.6140419536631183)],
 [('Logistic Regression', 0.5173635022134775),
  ('SMV', 0.4479954180985109),
  ('KNN', 0.49234821306934295),
  ('Random Forest', 0.4772415249762383),
  ('ANN', 0.5116854822737177)],
 [('Logistic Regression', 0.6014372153063561),
  ('SVM', 0.5967785629177886),
  ('KNN', 0.5541786453438191),
  ('Random Forest', 0.592360459586251),
  ('ANN', 0.5989212063790875)])

## Build models

In [23]:
#Random Forest

rfc = RandomForestClassifier().fit(sX2_train, sy2_train)
rfc_preds = rfc.predict(sX2_valid)
    
#model evaluations metric 
rfc_acc = accuracy_score(sy2_valid, rfc_preds)
rfc_roc = roc_auc_score(sy2_valid, rfc_preds)
    

#print it out
print("Random Forrest: \n")
print("Accuracy = {}".format(rfc_acc))
print("ROC AUC Score =", rfc_roc)
print("+++++++++++++++++", "\n")
    

Random Forrest: 

Accuracy = 0.6152551659361303
ROC AUC Score = 0.5955675069344806
+++++++++++++++++ 



In [95]:
# Tune rfc 
parameters = {'n_estimators': [5, 18, 25], 'max_depth': [3, 8, 12], 
             'max_features': [3, 6, 9], 'min_samples_split': [3, 7, 10]}

#max_depth=8, max_features=6, min_samples_leaf=1, min_samples_split=7, n_estimators=18, oob_score=True
rf = RandomForestClassifier(random_state=0)
rf_cv = GridSearchCV(rf, parameters)
#rf_cv.fit_transform(sX2_train, sy2_train)
#rf_cv.best_params_


{'max_depth': 8,
 'max_features': 6,
 'min_samples_split': 10,
 'n_estimators': 25}

In [97]:
best_rf = rf_cv.fit(sX2_train, sy2_train)
rf_cv.best_params_

{'max_depth': 8,
 'max_features': 6,
 'min_samples_split': 10,
 'n_estimators': 25}

In [24]:
#get model evaluations for tuned models 
best_rf = RandomForestClassifier(max_depth=8, max_features=6, min_samples_split=10, n_estimators=25).fit(sX2_train, sy2_train)
rf_cv_preds = best_rf.predict(sX2_valid)

#model evaluations metric 
rfcv_acc = accuracy_score(sy2_valid, rf_cv_preds)
rfcv_roc = roc_auc_score(sy2_valid, rf_cv_preds)

#print it out
print("Tuned Random Forrest: \n")
print("Accuracy = {}".format(rfcv_acc))
print("ROC AUC Score =", rfcv_roc)
print("+++++++++++++++++", "\n")

Tuned Random Forrest: 

Accuracy = 0.6221430807764559
ROC AUC Score = 0.5954107210482437
+++++++++++++++++ 



In [None]:
# TODO: create a plot of accuracy versus number of trees

In [15]:
# SVM 
svc = SVC().fit(sX2_train, sy2_train)
svc_preds = svc.predict(sX2_valid) 
    
#model evaluations metric 
svc_acc = accuracy_score(sy2_valid, svc_preds)
svc_roc = roc_auc_score(sy2_valid, svc_preds)

    
#print it out
print("SVM: \n")
print("Accuracy = {}".format(svc_acc))
print("ROC AUC Score =", svc_roc)
print("+++++++++++++++++", "\n")

SVM: 

Accuracy = 0.6228866624921728
ROC AUC Score = 0.5967209482676837
+++++++++++++++++ 



In [None]:
# Tune SVM

#Skipping SVM because its takes way too long to run--not good time complexity for the problem at hand

#doing this on a ec2 instance also!!! 
#started at 7:28, gave up at 9:38, 130 mins later. 
#will try again with less parameter options

parameters = {'C': [0.001, 1, 10], 'kernel': ["linear", "poly", "rbf"], 
             'degree': [3, 9], 'gamma': ["scale", .1, .5]}

svc = SVC(random_state=0)
svc_cv = GridSearchCV(svc, parameters)
best_svm = svc_cv.fit(sX2_train, sy2_train)
svc_cv.best_params_

#final result: TBD, will run again on thursday 

In [None]:
#model evaluations for tuned model

svm_cv_preds = best_svm.predict(sX2_valid)

#model evaluations metric 
svmcv_acc = accuracy_score(sy2_valid, svm_cv_preds)
svmcv_roc = roc_auc_score(sy2_valid, svm_cv_preds)

#print it out
print("Tuned SVM: \n")
print("Accuracy = {}".format(svmcv_acc))
print("ROC AUC Score =", svmcv_roc)
print("+++++++++++++++++", "\n")

In [38]:
#ANN 
ann = MLPClassifier().fit(sX2_train, sy2_train)
ann_preds = ann.predict(sX2_valid)
    
#model evaluations metric 
ann_acc = accuracy_score(sy2_valid, ann_preds)
ann_roc = roc_auc_score(sy2_valid, ann_preds)
    
#print it out
print("Neural Networks: \n")
print("Accuracy = {}".format(ann_acc))
print("ROC AUC Score =", ann_roc)
print("+++++++++++++++++", "\n")  
    

Neural Networks: 

Accuracy = 0.6185425798371947
ROC AUC Score = 0.5973935576767869
+++++++++++++++++ 



In [None]:
# Tune ANN
# switched to aws after 322 mins 

#ran on ec2 instance 
#c5ad.8xlarge (instance type)
#with 32 vCPUs 
# and 64 gb of memory 
#used 97% of cpu for 22 mins 
#started at 7:03pm, ended at 7:25pm 
#31 out of 32 cores
 
parameters = {'hidden_layer_sizes': [(10, 10, 10), (100,), (10, 100, 10), (50, 100, 50)], 
              'activation': ['tanh', 'relu'],
              'solver': ['sgd', 'adam'],
              'alpha': [0.0001, 0.05],
              'learning_rate': ['constant', 'adaptive']}

ann = MLPClassifier(max_iter=800, random_state=0)
ann_cv = GridSearchCV(ann, parameters)
best_ann = ann_cv.fit(sX2_train, sy2_train)
ann_cv.best_params_


In [16]:
# model evaluations for tuned model

#output from ec2
#{'activation': 'tanh', 'alpha': 0.05, 'hidden_layer_sizes': (50, 100, 50), 
# 'learning_rate': 'constant', 'solver': 'adam'}

best_ann = MLPClassifier(activation = 'tanh', alpha = 0.05, hidden_layer_sizes=(50, 100, 50), 
                        learning_rate='constant', solver='adam').fit(sX2_train, sy2_train)
ann_cv_preds = best_ann.predict(sX2_valid)

#model evaluations metric 
anncv_acc = accuracy_score(sy2_valid, ann_cv_preds)
anncv_roc = roc_auc_score(sy2_valid, ann_cv_preds)

#print it out
print("Tuned ANN: \n")
print("Accuracy = {}".format(anncv_acc))
print("ROC AUC Score =", anncv_roc)
print("+++++++++++++++++", "\n")

Tuned ANN: 

Accuracy = 0.6237085159674389
ROC AUC Score = 0.5984801698539561
+++++++++++++++++ 



In [17]:
#GMM 

#GMM might not be great because assumes strong independence between factors 
#when that is not true (by the nature of how we engineered our features)

#build model
#no tuning necessary!
nb = GaussianNB().fit(sX2_train, sy2_train)
nb_preds = nb.predict(sX2_valid)

#model evaluations metric 
nb_acc = accuracy_score(sy2_valid, nb_preds)
nb_roc = roc_auc_score(sy2_valid, nb_preds)
    
#print it out
print("Gaussian Mixed Model: \n")
print("Accuracy = {}".format(nb_acc))
print("ROC AUC Score =", nb_roc)
print("+++++++++++++++++", "\n")  


Gaussian Mixed Model: 

Accuracy = 0.5581950532247965
ROC AUC Score = 0.5715164207478979
+++++++++++++++++ 



In [25]:
# KNN
KNNc = KNeighborsClassifier().fit(sX2_train, sy2_train)
knn_preds = KNNc.predict(sX2_valid)
    
#model evaluations metric 
knn_acc = accuracy_score(sy2_valid, knn_preds)
knn_roc = roc_auc_score(sy2_valid, knn_preds)

#print it out
print("KNN: \n")
print("Accuracy = {}".format(knn_acc))
print("ROC AUC Score =", knn_roc)
print("+++++++++++++++++", "\n")   

KNN: 

Accuracy = 0.5620695053224797
ROC AUC Score = 0.5537155088615721
+++++++++++++++++ 



In [36]:
#tune KNN 
parameters = {'n_neighbors': [ 79, 80, 81, 82, 83, 84, 85]}
#iterative tuned (started with 1, 25, 50, 100) and ended up closing in around 82!
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(knn, parameters)
best_knn = knn_cv.fit(sX2_train, sy2_train)
knn_cv.best_params_

{'n_neighbors': 82}

In [37]:
# model evaluations for tuned model

knn_cv_preds = best_knn.predict(sX2_valid)

#model evaluations metric 
knncv_acc = accuracy_score(sy2_valid, knn_cv_preds)
knncv_roc = roc_auc_score(sy2_valid, knn_cv_preds)

#print it out
print("Tuned KNN: \n")
print("Accuracy = {}".format(knncv_acc))
print("ROC AUC Score =", knncv_roc)
print("+++++++++++++++++", "\n")

Tuned KNN: 

Accuracy = 0.6095804633688165
ROC AUC Score = 0.587682449616199
+++++++++++++++++ 

