In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import svm, tree
from sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.pipeline import make_pipeline
from sklearn.dummy import DummyClassifier
import xgboost
import copy

### Load Dataset

In [66]:
dataset = pd.read_csv('nba_player_data_through_jan.csv')
dataset.set_index('Player')
dataset = dataset.drop(columns = 'Unnamed: 0')
dataset.head()

Unnamed: 0,All-Star,Position,Player,Age,Conference,PPG,RPG,APG,BPG,SPG,GP,W%
0,True,G,Bradley Beal 2018-19,25,East,24.7255,5.0588,5.098,0.7843,1.3725,51.0,0.4314
1,True,G,Stephen Curry 2018-19,30,West,29.55,5.1,5.4,0.35,1.175,40.0,0.7059
2,True,G,James Harden 2018-19,29,West,36.3404,6.6596,8.1277,0.7234,2.0851,47.0,0.58
3,True,G,Kyrie Irving 2018-19,26,East,23.6512,4.7907,6.9302,0.4651,1.7209,43.0,0.6275
4,True,G,Damian Lillard 2018-19,28,West,26.4118,4.5686,6.2549,0.4902,1.0784,51.0,0.6154


### Data Preprocessing

In [84]:
std_scaler = StandardScaler()
dataset_guards = dataset[dataset['Position'] == 'G']
dataset_bigs = dataset[dataset['Position'] == 'F/C']
x = dataset[['PPG', 'RPG', 'APG', 'BPG', 'SPG', 'GP', 'W%']]
y = dataset['All-Star']
x_guards = dataset_guards[['PPG', 'RPG', 'APG', 'BPG', 'SPG', 'GP', 'W%']]
y_guards = dataset_guards['All-Star']
x_bigs = dataset_bigs[['PPG', 'RPG', 'APG', 'BPG', 'SPG', 'GP', 'W%']]
y_bigs = dataset_bigs['All-Star']
#Years in which the average league ppg is >= 105
year_ranges = [str(year) for year in range(2016,2019)] + [str(year) for year in range(1980,1993)]
#year_ranges = [str(year) for year in range(2000,2019)]
dataset_guards_restricted = dataset_guards[dataset_guards['Player'].str.split(' ', expand=True)[2].str.split('-',expand=True)[0].isin(year_ranges)]
dataset_bigs_restricted = dataset_bigs[dataset_bigs['Player'].str.split(' ', expand=True)[2].str.split('-',expand=True)[0].isin(year_ranges)]
x_guards_restricted = dataset_guards_restricted[['PPG', 'RPG', 'APG', 'BPG', 'SPG', 'GP', 'W%']]
y_guards_restricted = dataset_guards_restricted['All-Star']
x_bigs_restricted = dataset_bigs_restricted[['PPG', 'RPG', 'APG', 'BPG', 'SPG', 'GP', 'W%']]
y_bigs_restricted = dataset_bigs_restricted['All-Star']

### Classification Algorithm Performance Functions

In [11]:
def check_classification_algorithms(algos, class_names, x_train, x_test, y_train, y_test): 
    for classifier in algos: 
        model = classifier.fit(x_train, y_train)
        preds = model.predict(x_test)
        print(classifier)
        print(confusion_matrix(y_test, preds))
        print(classification_report(y_test, preds, target_names=class_names))
def check_classification_k_fold_cross_validation(kfoldtype, algos, class_names, x_data, y_data):
    X, Y = np.array(x_data), np.array(y_data)
    for classifier in algos:
        cv_total_preds = []
        cv_total_real = []
        std_pipeline = make_pipeline(StandardScaler(), classifier)
        for train_ind, test_ind in kfoldtype.split(X, Y): 
            x_tr, x_te = X[train_ind], X[test_ind]
            y_tr, y_te = Y[train_ind], Y[test_ind]
            std_pipeline.fit(x_tr, y_tr)
            preds = std_pipeline.predict(x_te)
            cv_total_real = np.append(cv_total_real,y_te)
            cv_total_preds = np.append(cv_total_preds, preds)
        print(classifier)
        print(confusion_matrix(cv_total_real, cv_total_preds))
        print(classification_report(cv_total_real, cv_total_preds, target_names=class_names))

In [12]:
classifiers = []
classifiers.append(LogisticRegression())
classifiers.append(KNeighborsClassifier(15))
classifiers.append(tree.DecisionTreeClassifier())
classifiers.append(RandomForestClassifier())
classifiers.append(AdaBoostClassifier())
classifiers.append(svm.SVC())
classifiers.append(xgboost.XGBClassifier())
classifiers.append(GaussianNB())
kfold = StratifiedKFold(10, True, 42)

### One model for all players

In [95]:
check_classification_k_fold_cross_validation(kfold, classifiers, ['Non-All-Star', 'All-Star'], x ,y)
#Random Forest Seems to have the best performance

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
[[14944   136]
 [  283   694]]
              precision    recall  f1-score   support

Non-All-Star       0.98      0.99      0.99     15080
    All-Star       0.84      0.71      0.77       977

    accuracy                           0.97     16057
   macro avg       0.91      0.85      0.88     16057
weighted avg       0.97      0.97      0.97     16057

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=15, p=2,
                     weights='uniform')
[[15009    71]
 [  315   662]]
              precision    recall  f1-score   support

Non-All-Star       0.98      1.00      0.9

### Different Models for Guards and Bigs

In [101]:
check_classification_k_fold_cross_validation(kfold, classifiers, ['Non-All-Star', 'All-Star'], x_guards ,y_guards)
#SVC seems to be really good for guards 
check_classification_k_fold_cross_validation(kfold, classifiers, ['Non-All-Star', 'All-Star'], x_bigs ,y_bigs)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
[[7391   26]
 [  36  357]]
              precision    recall  f1-score   support

Non-All-Star       1.00      1.00      1.00      7417
    All-Star       0.93      0.91      0.92       393

    accuracy                           0.99      7810
   macro avg       0.96      0.95      0.96      7810
weighted avg       0.99      0.99      0.99      7810

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=15, p=2,
                     weights='uniform')
[[7394   23]
 [  40  353]]
              precision    recall  f1-score   support

Non-All-Star       0.99      1.00      1.00      7

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
[[7586   77]
 [ 180  404]]
              precision    recall  f1-score   support

Non-All-Star       0.98      0.99      0.98      7663
    All-Star       0.84      0.69      0.76       584

    accuracy                           0.97      8247
   macro avg       0.91      0.84      0.87      8247
weighted avg       0.97      0.97      0.97      8247

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_

### Restricting Dataset Based on League Scoring - May More Accurately Reflect Modern Scoring Trends
Dataset only contains years where average PPG >= 105 in the 3 point era (https://www.basketball-reference.com/leagues/NBA_stats_per_game.html) - similar to 2019-2020 (110.6 PPG)

In [106]:
check_classification_k_fold_cross_validation(kfold, classifiers, ['Non-All-Star', 'All-Star'], x_guards_restricted ,y_guards_restricted)
check_classification_k_fold_cross_validation(kfold, classifiers, ['Non-All-Star', 'All-Star'], x_bigs_restricted ,y_bigs_restricted)
guards_model = svm.SVC(C=1, gamma='scale', kernel='rbf')
bigs_model = svm.SVC(C=100, gamma=0.01, kernel='rbf')
check_classification_k_fold_cross_validation(kfold, [guards_model], ['Non-All-Star', 'All-Star'], x_guards_restricted ,y_guards_restricted)
check_classification_k_fold_cross_validation(kfold, [bigs_model], ['Non-All-Star', 'All-Star'], x_bigs_restricted ,y_bigs_restricted)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
[[2697   11]
 [  21  135]]
              precision    recall  f1-score   support

Non-All-Star       0.99      1.00      0.99      2708
    All-Star       0.92      0.87      0.89       156

    accuracy                           0.99      2864
   macro avg       0.96      0.93      0.94      2864
weighted avg       0.99      0.99      0.99      2864

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=15, p=2,
                     weights='uniform')
[[2697   11]
 [  19  137]]
              precision    recall  f1-score   support

Non-All-Star       0.99      1.00      0.99      2

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)
[[2673   35]
 [  67  173]]
              precision    recall  f1-score   support

Non-All-Star       0.98      0.99      0.98      2708
    All-Star       0.83      0.72      0.77       240

    accuracy                           0.97      2948
   macro avg       0.90      0.85      0.88      2948
weighted avg       0.96      0.97      0.96      2948

GaussianNB(priors=None, var_smoothing=1e-09)
[[2509  199]
 [  21  219]]
              precision    recall  f1-score   support

Non-All-Star       0.99      0.93      0.96      

### Random Search CV To Tune Random Forest Classifier For General Model
Used as reference: https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

In [4]:
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 200, num = 1)]
max_depth.append(None)
min_samples_split = [2, 5, 10, 20]
min_samples_leaf = [1, 2, 4, 10]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [5]:
r_forest = RandomForestClassifier()
tuned_r_forest = RandomizedSearchCV(estimator = r_forest, 
                               param_distributions = random_grid, scoring='f1', n_iter = 100, cv = 10, 
                               verbose=2, random_state=42, n_jobs = -1)
tuner_r_forest.fit(x,y)
print(tuned_r_forest.best_params_)

### Grid Search CV for seperate guard/big models - Support Vector Classifier

In [15]:
grid = {'C': [0.1, 1, 10, 100, 1000],  
              'gamma': [2, 1, 0.1, 0.01, 0.001, 0.0001], 
              'kernel': ['rbf', 'linear']} 
best_svc_guards = GridSearchCV(estimator=svm.SVC(), param_grid=grid, scoring='f1', cv=10, verbose = 3, n_jobs=-1) 
best_svc_bigs = GridSearchCV(estimator=svm.SVC(), param_grid=grid, scoring='f1', cv=10, verbose = 3, n_jobs=-1) 
best_svc_guards.fit(x_guards, y_guards)
best_svc_bigs.fit(x_bigs, y_bigs)

Fitting 10 folds for each of 60 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done 256 tasks      | elapsed:   13.7s
[Parallel(n_jobs=-1)]: Done 480 tasks      | elapsed:   45.5s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:  2.3min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


Fitting 10 folds for each of 60 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done 256 tasks      | elapsed:   21.1s
[Parallel(n_jobs=-1)]: Done 480 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed: 11.2min finished


GridSearchCV(cv=10, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [2, 1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf', 'linear']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1', verbose=3)

In [18]:
print(best_svc_guards.best_estimator_)
print(best_svc_bigs.best_estimator_)
check_classification_k_fold_cross_validation(kfold, [best_svc_guards.best_estimator_], ['Non-All-Star', 'All-Star'], x_guards ,y_guards)
check_classification_k_fold_cross_validation(kfold, [best_svc_bigs.best_estimator_], ['Non-All-Star', 'All-Star'], x_bigs ,y_bigs)

SVC(C=1000, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
SVC(C=1000, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=2, kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
SVC(C=1000, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
[[7386   31]
 [  34  359]]
              precision    recall  f1-score   support

Non-All-Star       1.00      1.00      1.00      7417
    All-Star       0.92      0.91      0.92       393

    accuracy                           0.99      7810
   macro 

### Comparing Tuned Random Forest General Model to Dummy Classifiers

In [57]:
rforestcomp = []
rforestcomp.append(DummyClassifier(strategy='constant', constant=0))
rforestcomp.append(DummyClassifier('stratified'))
rforestcomp.append(DummyClassifier('prior'))
rforestcomp.append(DummyClassifier('uniform'))                     
check_classification_k_fold_cross_validation(kfold, rforestcomp, ['Non-All-Star', 'All-Star'], x ,y)

DummyClassifier(constant=0, random_state=None, strategy='constant')
[[15080     0]
 [  977     0]]
              precision    recall  f1-score   support

Non-All-Star       0.94      1.00      0.97     15080
    All-Star       0.00      0.00      0.00       977

    accuracy                           0.94     16057
   macro avg       0.47      0.50      0.48     16057
weighted avg       0.88      0.94      0.91     16057

DummyClassifier(constant=None, random_state=None, strategy='stratified')
[[14198   882]
 [  926    51]]
              precision    recall  f1-score   support

Non-All-Star       0.94      0.94      0.94     15080
    All-Star       0.05      0.05      0.05       977

    accuracy                           0.89     16057
   macro avg       0.50      0.50      0.50     16057
weighted avg       0.88      0.89      0.89     16057

DummyClassifier(constant=None, random_state=None, strategy='prior')
[[15080     0]
 [  977     0]]
              precision    recall  f1-score 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Generate Predictions - Ultimately Using Support Vector Classifier Fitted on Restricted Dataset

In [104]:
dataset_current = pd.read_csv('nba_player_data_through_jan_2019_2020.csv')
dataset_current.set_index('Player')
dataset_current_guards = dataset_current[dataset_current['Position'] == 'G']
dataset_current_bigs = dataset_current[dataset_current['Position'] == 'F/C']
x_current_guards = dataset_current_guards[['PPG', 'RPG', 'APG', 'BPG', 'SPG', 'GP', 'W%']]
x_current_bigs = dataset_current_bigs[['PPG', 'RPG', 'APG', 'BPG', 'SPG', 'GP', 'W%']]
guards_model = svm.SVC(C=1, gamma='scale', kernel='rbf')
bigs_model = svm.SVC(C=100, gamma=0.01, kernel='rbf')
guards_model.fit(x_guards_restricted, y_guards_restricted)
bigs_model.fit(x_bigs_restricted, y_bigs_restricted)
dataset_current_guards['All-Star'] = guards_model.predict(x_current_guards)
dataset_current_guards = dataset_current_guards[dataset_current_guards['All-Star']]
dataset_current_guards.to_csv('nba_all_star_predictions_guards_2019_2020.csv')
dataset_current_bigs['All-Star'] = bigs_model.predict(x_current_bigs)
dataset_current_bigs = dataset_current_bigs[dataset_current_bigs['All-Star']]
dataset_current_bigs.to_csv('nba_all_star_predictions_bigs_2019_2020.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
