In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

In [4]:
df = pd.read_csv('./data/df.csv', index_col=None, header=0)
df.head()

Unnamed: 0,PLAYER,TEAM,AGE,GP,W,L,MIN,PTS,FGM,FGA,...,AST,TOV,STL,BLK,PF,FP,DD2,TD3,+/-,All-Star
0,AJ Hammons_2017,DAL,24.0,22.0,4.0,18.0,163.0,48.0,17.0,42.0,...,4.0,10.0,1.0,13.0,21.0,129.2,0.0,0.0,-5.0,0.0
1,Aaron Brooks_2017,IND,32.0,65.0,36.0,29.0,894.0,322.0,121.0,300.0,...,125.0,66.0,25.0,9.0,93.0,628.3,0.0,0.0,-32.0,0.0
2,Aaron Gordon_2017,ORL,21.0,80.0,29.0,51.0,2298.0,1019.0,393.0,865.0,...,150.0,89.0,65.0,40.0,172.0,1956.0,7.0,0.0,-163.0,0.0
3,Aaron Harrison_2017,CHA,22.0,5.0,2.0,3.0,17.0,1.0,0.0,4.0,...,3.0,0.0,0.0,0.0,2.0,9.1,0.0,0.0,-5.0,0.0
4,Adreian Payne_2017,MIN,26.0,18.0,5.0,13.0,135.0,63.0,23.0,54.0,...,7.0,8.0,8.0,7.0,32.0,150.1,0.0,0.0,8.0,0.0


$$Data Dictionary$$

| Abbreviation | Definition |
|---|---|
|GP |Games Played 
|W |Wins 
|L |Losses 
|MIN |Minutes Played 
|FGM |Field Goals Made 
|FGA |Field Goals Attempted 
|FG% |Field Goal Percentage 
|3PM |3 Point Field Goals Made 
|3PA |3 Point Field Goals Attempted 
|3P% |3 Point Field Goals Percentage 
|FTM |Free Throws Made 
|FTA |Free Throws Attempted 
|FT% |Free Throw Percentage 
|OREB |Offensive Rebounds 
|DREB |Defensive Rebounds 
|REB |Rebounds AST Assists 
|TOV |Turnovers 
|STL |Steals 
|BLK |Blocks 
|PF |Personal Fouls 
|FP |Fantasy Points 
|DD2 |Double doubles 
|TD3 |Triple doubles 
|PTS |Points 
|+/- |Plus Minus Ratio

In [5]:
df = df.dropna()

In [6]:
X = df.drop(['All-Star','PLAYER','TEAM'], axis = 1)
y = df['All-Star']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,stratify=y)

In [8]:
lreg = LogisticRegression(max_iter=10000)
lreg.fit(X_train,y_train)
preds = lreg.predict(X_test)
print(confusion_matrix(y_test,preds))
print(f'Accuracy Score: {accuracy_score(y_test,preds)}')

[[2095   19]
 [  38   84]]
Accuracy Score: 0.9745080500894454


In [9]:
coefficients = pd.DataFrame({'Features': X.columns,
                             'Coefficients': np.round(lreg.coef_[0],4)})\
.sort_values('Coefficients', ascending=False)
coefficients.style.hide_index()

Features,Coefficients
AGE,0.1392
3PM,0.0251
FG%,0.0218
FT%,0.0143
BLK,0.0089
TOV,0.0078
3P%,0.0072
FTA,0.0055
PTS,0.0037
FGA,0.0033


In [13]:
import statsmodels.api as sm
logit_model=sm.Logit(y,X)
result=logit_model.fit(maxiter=100000)
print(result.summary())

         Current function value: 0.066382
         Iterations: 100000
                           Logit Regression Results                           
Dep. Variable:               All-Star   No. Observations:                 8944
Model:                          Logit   Df Residuals:                     8920
Method:                           MLE   Df Model:                           23
Date:                Sat, 17 Oct 2020   Pseudo R-squ.:                  0.6865
Time:                        10:07:29   Log-Likelihood:                -593.72
converged:                      False   LL-Null:                       -1893.7
Covariance Type:            nonrobust   LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
AGE           -0.0680      0.010     -6.662      0.000      -0.088      -0.048
GP         -3.241e+08   3.63e+04  -8923.720      0.000   -3.2



In [11]:
fp = X_test[(y_test == 0) & (preds == 1)].sort_index()
indices = df.index.intersection(fp.index)
df.loc[indices]['Player']

Unnamed: 0,PLAYER,TEAM,AGE,GP,W,L,MIN,PTS,FGM,FGA,...,AST,TOV,STL,BLK,PF,FP,DD2,TD3,+/-,All-Star
258,Karl-Anthony Towns_2017,MIN,21.0,82.0,31.0,51.0,3030.0,2061.0,802.0,1480.0,...,220.0,212.0,56.0,103.0,241.0,3864.4,62.0,1.0,-26.0,0.0
402,Rudy Gobert_2017,UTA,25.0,81.0,51.0,30.0,2744.0,1137.0,413.0,625.0,...,97.0,147.0,49.0,214.0,246.0,3166.5,58.0,0.0,435.0,0.0
1204,Michael Finley_2002,DAL,29.0,69.0,45.0,24.0,2755.0,1424.0,569.0,1228.0,...,230.0,117.0,65.0,25.0,144.0,2354.0,5.0,0.0,221.0,0.0
1449,Damian Lillard_2016,POR,25.0,75.0,40.0,35.0,2676.0,1879.0,618.0,1474.0,...,512.0,242.0,65.0,28.0,165.0,3046.4,15.0,0.0,94.0,0.0
2418,DeAndre Jordan_2015,LAC,26.0,82.0,56.0,26.0,2820.0,946.0,379.0,534.0,...,61.0,109.0,81.0,183.0,245.0,3191.7,47.0,0.0,622.0,0.0
2828,Antoine Walker_2001,BOS,24.0,81.0,35.0,46.0,3393.0,1892.0,711.0,1720.0,...,445.0,301.0,138.0,49.0,251.0,3682.3,38.0,5.0,-118.0,0.0
3001,Jamal Mashburn_2001,CHH,28.0,76.0,42.0,34.0,2992.0,1528.0,573.0,1388.0,...,411.0,211.0,85.0,13.0,185.0,2918.7,21.0,1.0,160.0,0.0
4740,Elton Brand_2004,LAC,25.0,69.0,22.0,47.0,2675.0,1379.0,484.0,982.0,...,227.0,193.0,64.0,154.0,229.0,3037.3,40.0,0.0,-222.0,0.0
5545,Andrei Kirilenko_2006,UTA,25.0,69.0,38.0,31.0,2604.0,1054.0,336.0,730.0,...,299.0,203.0,102.0,220.0,162.0,2927.9,22.0,2.0,39.0,0.0
5838,Michael Redd_2006,MIL,26.0,80.0,39.0,41.0,3130.0,2028.0,682.0,1516.0,...,229.0,170.0,95.0,5.0,157.0,2911.9,0.0,0.0,83.0,0.0


In [12]:
fn = X_test[(y_test == 1) & (preds == 0)].sort_index()
indices = df.index.intersection(fn.index)
df.loc[indices]

Unnamed: 0,PLAYER,TEAM,AGE,GP,W,L,MIN,PTS,FGM,FGA,...,AST,TOV,STL,BLK,PF,FP,DD2,TD3,+/-,All-Star
105,DeAndre Jordan_2017,LAC,28.0,81.0,51.0,30.0,2570.0,1029.0,412.0,577.0,...,96.0,115.0,51.0,134.0,212.0,2949.8,41.0,0.0,455.0,1.0
133,Draymond Green_2017,GSW,27.0,76.0,62.0,14.0,2471.0,776.0,272.0,650.0,...,533.0,184.0,154.0,106.0,217.0,2890.3,17.0,5.0,820.0,1.0
529,Brad Miller_2003,IND,27.0,73.0,42.0,31.0,2271.0,955.0,329.0,667.0,...,193.0,118.0,65.0,43.0,203.0,2174.1,27.0,0.0,280.0,1.0
923,Alonzo Mourning_2002,MIA,32.0,75.0,34.0,41.0,2453.0,1178.0,447.0,866.0,...,87.0,182.0,27.0,186.0,258.0,2523.9,26.0,0.0,51.0,1.0
1569,Jimmy Butler_2016,CHI,26.0,67.0,37.0,30.0,2474.0,1399.0,470.0,1035.0,...,321.0,132.0,110.0,43.0,124.0,2637.1,10.0,2.0,-12.0,1.0
1861,Anthony Davis_2014,NOP,21.0,67.0,29.0,38.0,2358.0,1394.0,522.0,1005.0,...,105.0,109.0,89.0,189.0,200.0,3084.1,36.0,0.0,-113.0,1.0
1965,Dwyane Wade_2014,MIA,32.0,54.0,36.0,18.0,1775.0,1028.0,415.0,761.0,...,252.0,161.0,79.0,29.0,106.0,1858.2,4.0,0.0,141.0,1.0
2067,John Wall_2014,WAS,23.0,82.0,44.0,38.0,2980.0,1583.0,579.0,1337.0,...,721.0,295.0,149.0,40.0,219.0,3336.1,29.0,2.0,196.0,1.0
2378,Carmelo Anthony_2015,NYK,31.0,40.0,10.0,30.0,1428.0,966.0,358.0,806.0,...,122.0,89.0,40.0,17.0,87.0,1547.8,5.0,0.0,-108.0,1.0
2587,Kevin Durant_2015,OKC,26.0,27.0,18.0,9.0,913.0,686.0,238.0,467.0,...,110.0,74.0,24.0,25.0,40.0,1137.6,5.0,0.0,168.0,1.0


In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,stratify=y)

In [46]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
rmc = RandomForestClassifier(n_estimators=100)
rmc.fit(X_train,y_train)
preds = rmc.predict(X_test)
print(confusion_matrix(y_test,preds))
print(f'Accuracy Score: {accuracy_score(y_test,preds)}')

[[2102   12]
 [  42   80]]
Accuracy Score: 0.9758497316636852


In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,stratify=y)

In [48]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
svc = SVC()
svc.fit(X_train,y_train)
oreds = svc.predict(X_test)
print(confusion_matrix(y_test,preds))
print(f'Accuracy Score: {accuracy_score(y_test, preds)}')

[[2027   87]
 [ 117    5]]
Accuracy Score: 0.9087656529516994
