In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import svm, tree
from sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import make_pipeline
import xgboost

In [2]:
# Load the per-player season stats. The default integer index is kept on
# purpose: the original `dataset.set_index('Player')` call discarded its result
# (set_index returns a new frame), and later cells look rows up through the
# integer index produced by the train/test split.
dataset = pd.read_csv('nba_player_data_through_jan.csv')
# 'Unnamed: 0' is the row counter written out with the CSV; it is not a feature.
dataset = dataset.drop(columns='Unnamed: 0')
dataset.head()

Unnamed: 0,All-Star,Position,Player,Age,Conference,PPG,RPG,APG,BPG,SPG,GP,W%
0,True,G,Bradley Beal 2018-19,25,East,24.7255,5.0588,5.098,0.7843,1.3725,51.0,0.4314
1,True,G,Stephen Curry 2018-19,30,West,29.55,5.1,5.4,0.35,1.175,40.0,0.7059
2,True,G,James Harden 2018-19,29,West,36.3404,6.6596,8.1277,0.7234,2.0851,47.0,0.58
3,True,G,Kyrie Irving 2018-19,26,East,23.6512,4.7907,6.9302,0.4651,1.7209,43.0,0.6275
4,True,G,Damian Lillard 2018-19,28,West,26.4118,4.5686,6.2549,0.4902,1.0784,51.0,0.6154


### Data Preprocessing

In [3]:
# Numeric per-game features used by every model; the target is the All-Star flag.
FEATURE_COLUMNS = ['PPG', 'RPG', 'APG', 'BPG', 'SPG', 'GP', 'W%']

std_scaler = StandardScaler()
x = dataset[FEATURE_COLUMNS]
y = dataset['All-Star']
# Seed the split so Restart & Run All reproduces the same hold-out set, and
# stratify on the target because All-Stars are a small minority of the rows;
# an unstratified split can leave the test set with very few positives.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, random_state=42, stratify=y
)
# Fit the scaler on the training rows only, then apply it to the test rows,
# so no test-set statistics leak into the model inputs.
x_train_std = std_scaler.fit_transform(x_train)
x_test_std = std_scaler.transform(x_test)

### Classification Algorithm Performance Functions

In [4]:
def check_classification_algorithms(algos, class_names, x_train, x_test, y_train, y_test):
    """Fit each estimator on one train/test split and print its hold-out scores.

    Parameters
    ----------
    algos : iterable
        Estimators exposing ``fit`` and ``predict``. Each one is fitted in place.
    class_names : list of str
        Display names forwarded to ``classification_report`` as ``target_names``.
    x_train, y_train
        Features and labels used for fitting.
    x_test, y_test
        Features and labels used for evaluation.

    Returns
    -------
    None
        For every estimator, its repr, the confusion matrix and the
        classification report are printed in that order.
    """
    for estimator in algos:
        # `fit` hands back the fitted estimator, so prediction is chained on it.
        predicted = estimator.fit(x_train, y_train).predict(x_test)
        print(estimator)
        print(confusion_matrix(y_test, predicted))
        report = classification_report(y_test, predicted, target_names=class_names)
        print(report)
def check_classification_k_fold_cross_validation(kfoldtype, algos, class_names, x_data, y_data):
    """Cross-validate each estimator and print metrics pooled over all folds.

    Every estimator is wrapped in a ``StandardScaler`` pipeline, so scaling is
    re-fitted on the training part of each fold and never sees the held-out
    rows. Out-of-fold predictions from all folds are pooled and scored once.

    Parameters
    ----------
    kfoldtype
        Splitter exposing ``split(X, y)`` that yields (train indices, test
        indices) pairs, e.g. ``KFold`` or ``StratifiedKFold``.
    algos : iterable
        Estimators exposing ``fit`` and ``predict``.
    class_names : list of str
        Display names forwarded to ``classification_report`` as ``target_names``.
    x_data, y_data
        Array-likes of features and labels; converted to NumPy arrays so they
        can be indexed with the positional indices the splitter yields.

    Returns
    -------
    None
        For every estimator, its repr, the pooled confusion matrix and the
        pooled classification report are printed.
    """
    X, y = np.asarray(x_data), np.asarray(y_data)
    for classifier in algos:
        # Collect per-fold arrays and join them once at the end. Repeated
        # np.append onto a Python list recopies the accumulator on every fold
        # and coerces the labels to float64; concatenate keeps their dtype.
        fold_real = []
        fold_preds = []
        std_pipeline = make_pipeline(StandardScaler(), classifier)
        for train_ind, test_ind in kfoldtype.split(X, y):
            std_pipeline.fit(X[train_ind], y[train_ind])
            fold_preds.append(std_pipeline.predict(X[test_ind]))
            fold_real.append(y[test_ind])
        cv_total_real = np.concatenate(fold_real)
        cv_total_preds = np.concatenate(fold_preds)
        print(classifier)
        print(confusion_matrix(cv_total_real, cv_total_preds))
        print(classification_report(cv_total_real, cv_total_preds, target_names=class_names))

In [5]:
# One seed for the fold assignment and every stochastic model, so a fresh
# kernel reproduces the printed reports.
RANDOM_STATE = 42

# Candidate models compared under identical cross-validation folds.
classifiers = [
    LogisticRegression(),
    KNeighborsClassifier(n_neighbors=15),
    tree.DecisionTreeClassifier(random_state=RANDOM_STATE),
    RandomForestClassifier(random_state=RANDOM_STATE),
    svm.SVC(),
    xgboost.XGBClassifier(random_state=RANDOM_STATE),
    GaussianNB(),
]

# `shuffle` and `random_state` are keyword-only in current scikit-learn;
# passing them positionally, as in StratifiedKFold(10, True, 42), raises TypeError.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=RANDOM_STATE)
check_classification_k_fold_cross_validation(kfold, classifiers, ['Non-All-Star', 'All-Star'], x, y)
# Single hold-out alternative to the cross-validated run above:
# check_classification_algorithms(classifiers, ['Non-All-Star', 'All-Star'], x_train_std, x_test_std, y_train, y_test)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
[[14944   136]
 [  283   694]]
              precision    recall  f1-score   support

Non-All-Star       0.98      0.99      0.99     15080
    All-Star       0.84      0.71      0.77       977

    accuracy                           0.97     16057
   macro avg       0.91      0.85      0.88     16057
weighted avg       0.97      0.97      0.97     16057

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=15, p=2,
                     weights='uniform')
[[15009    71]
 [  315   662]]
              precision    recall  f1-score   support

Non-All-Star       0.98      1.00      0.9

### K-Nearest Neighbors with Neighborhood Components Analysis (NCA)

In [None]:
# KNN classifier applied after a Neighborhood Components Analysis transform,
# fitted on the standardized training split and scored on the standardized test split.
NEIGHBORS = 15
target_names = ['Non-All-Star', 'All-Star']

nca = NeighborhoodComponentsAnalysis(random_state=42)
knn = KNeighborsClassifier(n_neighbors=NEIGHBORS)
nca_pipe = Pipeline(steps=[('nca', nca), ('knn', knn)])

# Pipeline.fit returns the fitted pipeline, so prediction is chained directly.
knn_preds = nca_pipe.fit(x_train_std, y_train).predict(x_test_std)
print(classification_report(y_test, knn_preds, target_names=target_names))

In [None]:
# `probs` and `preds` were never defined at notebook scope, so this cell raised
# NameError on a fresh kernel. They are produced here by a logistic regression
# fitted on the standardized training split.
# NOTE(review): logistic regression is inferred from the output filename
# 'log-reg-predictions.csv' - confirm this is the intended model.
log_reg = LogisticRegression()
log_reg.fit(x_train_std, y_train)
# classes_ is sorted, so the last predict_proba column is the probability of
# the larger label (True, assuming a boolean All-Star target).
probs = log_reg.predict_proba(x_test_std)[:, -1]
preds = log_reg.predict(x_test_std)

# Build the export on a copy so the feature frame `x_test` keeps only feature
# columns and the cell can be re-run without side effects.
log_reg_predictions = x_test.copy()
log_reg_predictions['Prediction Probability'] = probs
log_reg_predictions['Prediction'] = preds
# y_test shares x_test's index, so the true labels align row for row.
log_reg_predictions['All-Star'] = y_test
# Look players up by index label and column name, not by column position.
log_reg_predictions['Player'] = dataset.loc[x_test.index, 'Player']
log_reg_predictions.to_csv('log-reg-predictions.csv')