In [44]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import svm, tree
from sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.pipeline import make_pipeline
from sklearn.dummy import DummyClassifier
import xgboost

In [11]:
dataset = pd.read_csv('nba_player_data_through_jan.csv')
dataset.set_index('Player')
dataset = dataset.drop(columns = 'Unnamed: 0')
dataset.head()

Unnamed: 0,All-Star,Position,Player,Age,Conference,PPG,RPG,APG,BPG,SPG,GP,W%
0,True,G,Bradley Beal 2018-19,25,East,24.7255,5.0588,5.098,0.7843,1.3725,51.0,0.4314
1,True,G,Stephen Curry 2018-19,30,West,29.55,5.1,5.4,0.35,1.175,40.0,0.7059
2,True,G,James Harden 2018-19,29,West,36.3404,6.6596,8.1277,0.7234,2.0851,47.0,0.58
3,True,G,Kyrie Irving 2018-19,26,East,23.6512,4.7907,6.9302,0.4651,1.7209,43.0,0.6275
4,True,G,Damian Lillard 2018-19,28,West,26.4118,4.5686,6.2549,0.4902,1.0784,51.0,0.6154


### Data Preprocessing

In [31]:
std_scaler = StandardScaler()
x = dataset[['PPG', 'RPG', 'APG', 'BPG', 'SPG', 'GP', 'W%']]
y = dataset['All-Star']
x_train, x_test, y_train, y_test = train_test_split(x,y,train_size=0.8)
x_train_std = std_scaler.fit_transform(x_train)
x_test_std = std_scaler.transform(x_test)

### Classification Algorithm Performance Functions

In [50]:
def check_classification_algorithms(algos, class_names, x_train, x_test, y_train, y_test): 
    for classifier in algos: 
        model = classifier.fit(x_train, y_train)
        preds = model.predict(x_test)
        print(classifier)
        print(confusion_matrix(y_test, preds))
        print(classification_report(y_test, preds, target_names=class_names))
def check_classification_k_fold_cross_validation(kfoldtype, algos, class_names, x_data, y_data):
    X, Y = np.array(x_data), np.array(y_data)
    for classifier in algos:
        cv_total_preds = []
        cv_total_real = []
        std_pipeline = classifier#make_pipeline(StandardScaler(), classifier)
        for train_ind, test_ind in kfoldtype.split(X, y): 
            x_tr, x_te = X[train_ind], X[test_ind]
            y_tr, y_te = Y[train_ind], y[test_ind]
            std_pipeline.fit(x_tr, y_tr)
            preds = std_pipeline.predict(x_te)
            cv_total_real = np.append(cv_total_real,y_te)
            cv_total_preds = np.append(cv_total_preds, preds)
        print(classifier)
        print(confusion_matrix(cv_total_real, cv_total_preds))
        print(classification_report(cv_total_real, cv_total_preds, target_names=class_names))

In [39]:
classifiers = []
classifiers.append(LogisticRegression())
classifiers.append(KNeighborsClassifier(15))
classifiers.append(tree.DecisionTreeClassifier())
classifiers.append(RandomForestClassifier())
classifiers.append(AdaBoostClassifier())
classifiers.append(svm.SVC())
classifiers.append(xgboost.XGBClassifier())
classifiers.append(GaussianNB())
kfold = StratifiedKFold(10, True, 42)
check_classification_k_fold_cross_validation(kfold, classifiers, ['Non-All-Star', 'All-Star'], x ,y)
#Random Forest Seems to have the best performance

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
[[14944   136]
 [  283   694]]
              precision    recall  f1-score   support

Non-All-Star       0.98      0.99      0.99     15080
    All-Star       0.84      0.71      0.77       977

    accuracy                           0.97     16057
   macro avg       0.91      0.85      0.88     16057
weighted avg       0.97      0.97      0.97     16057

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=15, p=2,
                     weights='uniform')
[[15009    71]
 [  315   662]]
              precision    recall  f1-score   support

Non-All-Star       0.98      1.00      0.9

### K Nearest Neighbors w/ NCA

In [None]:
NEIGHBORS = 15
nca = NeighborhoodComponentsAnalysis(random_state = 42)
knn = KNeighborsClassifier(NEIGHBORS)
nca_pipe = Pipeline([('nca', nca), ('knn', knn)])
nca_pipe.fit(x_train_std, y_train)
knn_preds = nca_pipe.predict(x_test_std)
target_names = ['Non-All-Star', 'All-Star']
print(classification_report(y_test, knn_preds, target_names=target_names))

### Random Search CV For Random Forest
Used as reference: https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

In [35]:
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 200, num = 1)]
max_depth.append(None)
min_samples_split = [2, 5, 10, 20]
min_samples_leaf = [1, 2, 4, 10]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [36]:
r_forest = RandomForestClassifier()
tuned_r_forest = RandomizedSearchCV(estimator = r_forest, 
                               param_distributions = random_grid, scoring='f1', n_iter = 100, cv = 10, 
                               verbose=2, random_state=42, n_jobs = -1)
tuned_r_forest.fit(x,y)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 333 tasks      | elapsed: 10.6min
[Parallel(n_jobs=-1)]: Done 616 tasks      | elapsed: 16.8min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 29.6min finished


RandomizedSearchCV(cv=10, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
              

In [37]:
tuned_r_forest.best_params_
check_classification_k_fold_cross_validation(kfold, [tuned_r_forest.best_estimator_], ['Non-All-Star', 'All-Star'], x ,y)

RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=1336,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
[[15044    36]
 [  209   768]]
              precision    recall  f1-score   support

Non-All-Star       0.99      1.00      0.99     15080
    All-Star       0.96      0.79      0.86       977

    accuracy                           0.98     16057
   macro avg       0.97      0.89      0.93     16057
weighted avg       0.98      0.98      0.98     16057



In [51]:
rforestcomp = []
#Using a old set of best parameters that appears to work well
best_param_test = RandomForestClassifier(n_estimators = 100, min_samples_split=5, min_samples_leaf=1,
                                        max_features='sqrt', max_depth=None, bootstrap=False)
rforestcomp.append(DummyClassifier(strategy='constant', constant=0))
rforestcomp.append(DummyClassifier('stratified'))
rforestcomp.append(DummyClassifier('prior'))
rforestcomp.append(DummyClassifier('uniform'))
rforestcomp.append(best_param_test)                               
check_classification_k_fold_cross_validation(kfold, rforestcomp, ['Non-All-Star', 'All-Star'], x ,y)

DummyClassifier(constant=0, random_state=None, strategy='constant')
[[15080     0]
 [  977     0]]
              precision    recall  f1-score   support

Non-All-Star       0.94      1.00      0.97     15080
    All-Star       0.00      0.00      0.00       977

    accuracy                           0.94     16057
   macro avg       0.47      0.50      0.48     16057
weighted avg       0.88      0.94      0.91     16057

DummyClassifier(constant=None, random_state=None, strategy='stratified')
[[14168   912]
 [  922    55]]
              precision    recall  f1-score   support

Non-All-Star       0.94      0.94      0.94     15080
    All-Star       0.06      0.06      0.06       977

    accuracy                           0.89     16057
   macro avg       0.50      0.50      0.50     16057
weighted avg       0.89      0.89      0.89     16057

DummyClassifier(constant=None, random_state=None, strategy='prior')
[[15080     0]
 [  977     0]]
              precision    recall  f1-score 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
[[15040    40]
 [  195   782]]
              precision    recall  f1-score   support

Non-All-Star       0.99      1.00      0.99     15080
    All-Star       0.95      0.80      0.87       977

    accuracy                           0.99     16057
   macro avg       0.97      0.90      0.93     16057
weighted avg       0.99      0.99      0.98     16057



## Predictions Using Tuned Random Forest Classifier