In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

In [39]:
df = pd.read_csv('./data/df.csv', index_col=None, header=0)
df.head()

Unnamed: 0,PLAYER,TEAM,AGE,GP,W,L,MIN,PTS,FGM,FGA,...,AST,TOV,STL,BLK,PF,FP,DD2,TD3,+/-,All-Star
0,AJ Hammons_2017,DAL,24.0,22.0,4.0,18.0,163.0,48.0,17.0,42.0,...,4.0,10.0,1.0,13.0,21.0,129.2,0.0,0.0,-5.0,0.0
1,Aaron Brooks_2017,IND,32.0,65.0,36.0,29.0,894.0,322.0,121.0,300.0,...,125.0,66.0,25.0,9.0,93.0,628.3,0.0,0.0,-32.0,0.0
2,Aaron Gordon_2017,ORL,21.0,80.0,29.0,51.0,2298.0,1019.0,393.0,865.0,...,150.0,89.0,65.0,40.0,172.0,1956.0,7.0,0.0,-163.0,0.0
3,Aaron Harrison_2017,CHA,22.0,5.0,2.0,3.0,17.0,1.0,0.0,4.0,...,3.0,0.0,0.0,0.0,2.0,9.1,0.0,0.0,-5.0,0.0
4,Adreian Payne_2017,MIN,26.0,18.0,5.0,13.0,135.0,63.0,23.0,54.0,...,7.0,8.0,8.0,7.0,32.0,150.1,0.0,0.0,8.0,0.0


$$Data Dictionary$$

| Abbreviation | Definition |
|---|---|
|GP |Games Played 
|W |Wins 
|L |Losses 
|MIN |Minutes Played 
|FGM |Field Goals Made 
|FGA |Field Goals Attempted 
|FG% |Field Goal Percentage 
|3PM |3 Point Field Goals Made 
|3PA |3 Point Field Goals Attempted 
|3P% |3 Point Field Goals Percentage 
|FTM |Free Throws Made 
|FTA |Free Throws Attempted 
|FT% |Free Throw Percentage 
|OREB |Offensive Rebounds 
|DREB |Defensive Rebounds 
|REB |Rebounds AST Assists 
|TOV |Turnovers 
|STL |Steals 
|BLK |Blocks 
|PF |Personal Fouls 
|FP |Fantasy Points 
|DD2 |Double doubles 
|TD3 |Triple doubles 
|PTS |Points 
|+/- |Plus Minus Ratio

In [40]:
df = df.dropna()

In [41]:
X = df.drop(['All-Star','PLAYER','TEAM'], axis = 1)
y = df['All-Star']

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,stratify=y)

In [43]:
lreg = LogisticRegression(max_iter=10000)
lreg.fit(X_train,y_train)
preds = lreg.predict(X_test)
print(confusion_matrix(y_test,preds))
print(f'Accuracy Score: {accuracy_score(y_test,preds)}')

[[2088   26]
 [  29   93]]
Accuracy Score: 0.975402504472272


In [44]:
list(zip(X.columns, np.round(lreg.coef_[0],4)))

[('AGE', 0.1657),
 ('GP', -0.117),
 ('W', -0.0294),
 ('L', -0.0875),
 ('MIN', 0.0012),
 ('PTS', 0.0047),
 ('FGM', -0.0097),
 ('FGA', 0.0042),
 ('FG%', 0.0155),
 ('3PM', 0.0315),
 ('3PA', -0.0132),
 ('3P%', -0.0062),
 ('FTM', -0.0073),
 ('FTA', 0.0061),
 ('FT%', 0.0176),
 ('OREB', 0.0026),
 ('DREB', -0.0001),
 ('REB', 0.0026),
 ('AST', 0.0024),
 ('TOV', 0.0098),
 ('STL', -0.0109),
 ('BLK', 0.0109),
 ('PF', -0.0133),
 ('FP', 0.0019),
 ('DD2', -0.0155),
 ('TD3', -0.0131),
 ('+/-', 0.0031)]

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,stratify=y)

In [46]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
rmc = RandomForestClassifier(n_estimators=100)
rmc.fit(X_train,y_train)
preds = rmc.predict(X_test)
print(confusion_matrix(y_test,preds))
print(f'Accuracy Score: {accuracy_score(y_test,preds)}')

[[2102   12]
 [  42   80]]
Accuracy Score: 0.9758497316636852


In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,stratify=y)

In [48]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
svc = SVC()
svc.fit(X_train,y_train)
oreds = svc.predict(X_test)
print(confusion_matrix(y_test,preds))
print(f'Accuracy Score: {accuracy_score(y_test, preds)}')

[[2027   87]
 [ 117    5]]
Accuracy Score: 0.9087656529516994
