In [33]:
#Import all needed libaries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

In [2]:
#Review Data
df = pd.read_csv('./data/df.csv', index_col=None, header=0)
df.head()

Unnamed: 0,PLAYER,TEAM,AGE,GP,W,L,MIN,PTS,FGM,FGA,...,TOV,STL,BLK,PF,FP,DD2,TD3,+/-,All-Star,YEAR
0,AJ Hammons_2017,DAL,24.0,22.0,4.0,18.0,163.0,48.0,17.0,42.0,...,10.0,1.0,13.0,21.0,129.2,0.0,0.0,-5.0,0.0,2017
1,Aaron Brooks_2017,IND,32.0,65.0,36.0,29.0,894.0,322.0,121.0,300.0,...,66.0,25.0,9.0,93.0,628.3,0.0,0.0,-32.0,0.0,2017
2,Aaron Gordon_2017,ORL,21.0,80.0,29.0,51.0,2298.0,1019.0,393.0,865.0,...,89.0,65.0,40.0,172.0,1956.0,7.0,0.0,-163.0,0.0,2017
3,Aaron Harrison_2017,CHA,22.0,5.0,2.0,3.0,17.0,1.0,0.0,4.0,...,0.0,0.0,0.0,2.0,9.1,0.0,0.0,-5.0,0.0,2017
4,Adreian Payne_2017,MIN,26.0,18.0,5.0,13.0,135.0,63.0,23.0,54.0,...,8.0,8.0,7.0,32.0,150.1,0.0,0.0,8.0,0.0,2017


$$Data Dictionary$$

| Abbreviation | Definition |
|---|---|
|GP |Games Played 
|W |Wins 
|L |Losses 
|MIN |Minutes Played 
|FGM |Field Goals Made 
|FGA |Field Goals Attempted 
|FG% |Field Goal Percentage 
|3PM |3 Point Field Goals Made 
|3PA |3 Point Field Goals Attempted 
|3P% |3 Point Field Goals Percentage 
|FTM |Free Throws Made 
|FTA |Free Throws Attempted 
|FT% |Free Throw Percentage 
|OREB |Offensive Rebounds 
|DREB |Defensive Rebounds 
|REB |Rebounds AST Assists 
|TOV |Turnovers 
|STL |Steals 
|BLK |Blocks 
|PF |Personal Fouls 
|FP |Fantasy Points 
|DD2 |Double doubles 
|TD3 |Triple doubles 
|PTS |Points 
|+/- |Plus Minus Ratio

In [3]:
#drop null values is any
df = df.dropna()

In [4]:
#Set up X and y
X = df.drop(['All-Star','PLAYER','TEAM'], axis = 1)
y = df['All-Star']

In [11]:
#Set up Train and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,stratify=y)

In [12]:
#Run Logistical Regression & print Confusion Matrix and Accuracy Score
lreg = LogisticRegression(max_iter=10000)
lreg.fit(X_train,y_train)
preds = lreg.predict(X_test)
print(confusion_matrix(y_test,preds))
print(f'Accuracy Score: {accuracy_score(y_test,preds)}')

[[2095   19]
 [  28   94]]
Accuracy Score: 0.9789803220035778


In [13]:
#Print out Coefficients
coefficients = pd.DataFrame({'Features': X.columns,
                             'Coefficients': np.round(lreg.coef_[0],4)})\
.sort_values('Coefficients', ascending=False)
coefficients.style.hide_index()

Features,Coefficients
AGE,0.161
FT%,0.0188
FG%,0.017
3PM,0.0152
BLK,0.0085
TOV,0.0079
FTA,0.0063
3P%,0.0059
TD3,0.0047
PTS,0.0027


In [27]:
#print out Type I Errors
fp = X_test[(y_test == 0) & (preds == 1)].sort_index()
indices = df.index.intersection(fp.index)
df.loc[indices][['PLAYER','FTA','All-Star']]

Unnamed: 0,PLAYER,FTA,All-Star
168,Hassan Whiteside_2017,358.0,0.0
1204,Michael Finley_2002,251.0,0.0
1449,Damian Lillard_2016,464.0,0.0
1460,DeAndre Jordan_2016,619.0,0.0
2270,Tim Duncan_2014,316.0,0.0
2418,DeAndre Jordan_2015,471.0,0.0
2828,Antoine Walker_2001,348.0,0.0
4557,Stephen Jackson_2010,434.0,0.0
4849,Latrell Sprewell_2004,295.0,0.0
6263,Marcus Camby_2007,221.0,0.0


In [28]:
#print out Type II Errors
fn = X_test[(y_test == 1) & (preds == 0)].sort_index()
indices = df.index.intersection(fn.index)
df.loc[indices][['PLAYER','FTA','All-Star']]

Unnamed: 0,PLAYER,FTA,All-Star
372,Paul Millsap_2017,405.0,1.0
913,Zydrunas Ilgauskas_2003,512.0,1.0
1035,Dikembe Mutombo_2002,364.0,1.0
2060,Joe Johnson_2014,195.0,1.0
2278,Tony Parker_2014,243.0,1.0
2432,Dirk Nowitzki_2015,289.0,1.0
3187,Stephon Marbury_2001,458.0,1.0
3200,Theo Ratliff_2001,217.0,1.0
3269,Antawn Jamison_2005,296.0,1.0
3605,Rashard Lewis_2005,283.0,1.0


In [29]:
#Set up new Train and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,stratify=y)

In [30]:
#Run Random Forest & print Confusion Matrix and Accuracy Score
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
rmc = RandomForestClassifier(n_estimators=100)
rmc.fit(X_train,y_train)
preds = rmc.predict(X_test)
print(confusion_matrix(y_test,preds))
print(f'Accuracy Score: {accuracy_score(y_test,preds)}')

[[2101   13]
 [  53   69]]
Accuracy Score: 0.9704830053667263


In [31]:
#Set up Train and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,stratify=y)

In [32]:
#Run SVC & print Confusion Matrix and Accuracy Score
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
svc = SVC()
svc.fit(X_train,y_train)
oreds = svc.predict(X_test)
print(confusion_matrix(y_test,preds))
print(f'Accuracy Score: {accuracy_score(y_test, preds)}')

[[2036   78]
 [ 118    4]]
Accuracy Score: 0.9123434704830053
