In [107]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm, tree
from sklearn import neighbors
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
import xgboost

In [108]:
dataset = pd.read_csv('nba_player_data_through_jan.csv')
dataset.set_index('Player')
dataset = dataset.drop(columns = 'Unnamed: 0')
dataset.head()

Unnamed: 0,All-Star,Position,Player,Age,Conference,PPG,RPG,APG,BPG,SPG,GP,W%
0,True,G,Bradley Beal 2018-19,25,East,24.7255,5.0588,5.098,0.7843,1.3725,51.0,0.4314
1,True,G,Stephen Curry 2018-19,30,West,29.55,5.1,5.4,0.35,1.175,40.0,0.7059
2,True,G,James Harden 2018-19,29,West,36.3404,6.6596,8.1277,0.7234,2.0851,47.0,0.58
3,True,G,Kyrie Irving 2018-19,26,East,23.6512,4.7907,6.9302,0.4651,1.7209,43.0,0.6275
4,True,G,Damian Lillard 2018-19,28,West,26.4118,4.5686,6.2549,0.4902,1.0784,51.0,0.6154


### Data Preprocessing

In [109]:
std_scaler = StandardScaler()
x = dataset[['PPG', 'RPG', 'APG', 'BPG', 'SPG', 'GP', 'W%']]
y = dataset['All-Star']
x_train, x_test, y_train, y_test = train_test_split(x,y,train_size=0.8)
x_train_std = std_scaler.fit_transform(x_train)
x_test_std = std_scaler.transform(x_test)

### Classification Algorithm Performance Function

In [112]:
def check_classification_algorithms(algos, class_names, x_train, x_test, y_train, y_test): 
    for classifier in algos: 
        model = classifier.fit(x_train, y_train)
        preds = model.predict(x_test)
        print(classifier)
        print(classification_report(y_test, preds, target_names=class_names))

In [114]:
classifiers = []
classifiers.append(LogisticRegression())
classifiers.append(tree.DecisionTreeClassifier())
classifiers.append(RandomForestClassifier())
classifiers.append(svm.SVC())
classifiers.append(xgboost.XGBClassifier())
classifiers.append(GaussianNB())
check_classification_algorithms(classifiers, ['Non-All-Star', 'All-Star'], x_train_std, x_test_std, y_train, y_test)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')
              precision    recall  f1-score   support

Non-All-Star       0.99      0.99      0.99      3004
    All-Star       0.86      0.82      0.84       208

    accuracy                           0.98      3212
   macro avg       0.93      0.91      0.92      3212
weighted avg       0.98      0.98      0.98      3212

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impuri

### K Nearest Neighbors

In [90]:
NEIGHBORS = 15
nca = neighbors.NeighborhoodComponentsAnalysis(random_state = 42)
knn = neighbors.KNeighborsClassifier(NEIGHBORS)
nca_pipe = Pipeline([('nca', nca), ('knn', knn)])
nca_pipe.fit(x_train_std, y_train)
knn_preds = nca_pipe.predict(x_test_std)
target_names = ['Non-All-Star', 'All-Star']
print(classification_report(y_test, knn_preds, target_names=target_names))

              precision    recall  f1-score   support

Non-All-Star       0.98      0.99      0.99      2993
    All-Star       0.90      0.71      0.79       219

    accuracy                           0.97      3212
   macro avg       0.94      0.85      0.89      3212
weighted avg       0.97      0.97      0.97      3212



In [None]:
x_test['Prediction Probability'] = probs
x_test['Prediction'] = preds
x_test['All-Star'] = dataset.iloc[list(x_test.index),[0]]
x_test['Player'] = dataset.iloc[list(x_test.index),[2]]
x_test.to_csv('log-reg-predictions.csv')