In [69]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import svm, tree
from sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.pipeline import make_pipeline
from sklearn.dummy import DummyClassifier
import xgboost
import copy

### Load Dataset

In [61]:
dataset = pd.read_csv('nba_player_data_through_jan.csv')
dataset.set_index('Player')
dataset = dataset.drop(columns = 'Unnamed: 0')
dataset.head()

Unnamed: 0,All-Star,Position,Player,Age,Conference,PPG,RPG,APG,BPG,SPG,GP,W%
0,True,G,Bradley Beal 2018-19,25,East,24.7255,5.0588,5.098,0.7843,1.3725,51.0,0.4314
1,True,G,Stephen Curry 2018-19,30,West,29.55,5.1,5.4,0.35,1.175,40.0,0.7059
2,True,G,James Harden 2018-19,29,West,36.3404,6.6596,8.1277,0.7234,2.0851,47.0,0.58
3,True,G,Kyrie Irving 2018-19,26,East,23.6512,4.7907,6.9302,0.4651,1.7209,43.0,0.6275
4,True,G,Damian Lillard 2018-19,28,West,26.4118,4.5686,6.2549,0.4902,1.0784,51.0,0.6154


### Data Preprocessing

In [73]:
std_scaler = StandardScaler()
dataset_guards = dataset[dataset['Position'] == 'G']
dataset_bigs = dataset[dataset['Position'] == 'F/C']
x = dataset[['PPG', 'RPG', 'APG', 'BPG', 'SPG', 'GP', 'W%']]
y = dataset['All-Star']
x_guards = dataset_guards[['PPG', 'RPG', 'APG', 'BPG', 'SPG', 'GP', 'W%']]
y_guards = dataset_guards['All-Star']
x_bigs = dataset_bigs[['PPG', 'RPG', 'APG', 'BPG', 'SPG', 'GP', 'W%']]
y_bigs = dataset_bigs['All-Star']
x_train, x_test, y_train, y_test = train_test_split(x,y,train_size=0.8)
x_train_std = std_scaler.fit_transform(x_train)
x_test_std = std_scaler.transform(x_test)

### Classification Algorithm Performance Functions

In [77]:
def check_classification_algorithms(algos, class_names, x_train, x_test, y_train, y_test): 
    for classifier in algos: 
        model = classifier.fit(x_train, y_train)
        preds = model.predict(x_test)
        print(classifier)
        print(confusion_matrix(y_test, preds))
        print(classification_report(y_test, preds, target_names=class_names))
def check_classification_k_fold_cross_validation(kfoldtype, algos, class_names, x_data, y_data):
    X, Y = np.array(x_data), np.array(y_data)
    for classifier in algos:
        cv_total_preds = []
        cv_total_real = []
        std_pipeline = classifier#make_pipeline(StandardScaler(), classifier)
        for train_ind, test_ind in kfoldtype.split(X, Y): 
            x_tr, x_te = X[train_ind], X[test_ind]
            y_tr, y_te = Y[train_ind], y[test_ind]
            std_pipeline.fit(x_tr, y_tr)
            preds = std_pipeline.predict(x_te)
            cv_total_real = np.append(cv_total_real,y_te)
            cv_total_preds = np.append(cv_total_preds, preds)
        print(classifier)
        print(confusion_matrix(cv_total_real, cv_total_preds))
        print(classification_report(cv_total_real, cv_total_preds, target_names=class_names))

In [78]:
classifiers = []
classifiers.append(LogisticRegression())
classifiers.append(KNeighborsClassifier(15))
classifiers.append(tree.DecisionTreeClassifier())
classifiers.append(RandomForestClassifier())
classifiers.append(AdaBoostClassifier())
classifiers.append(svm.SVC())
classifiers.append(xgboost.XGBClassifier())
classifiers.append(GaussianNB())
kfold = StratifiedKFold(10, True, 42)
check_classification_k_fold_cross_validation(kfold, classifiers, ['Non-All-Star', 'All-Star'], x_guards ,y_guards)
check_classification_k_fold_cross_validation(kfold, classifiers, ['Non-All-Star', 'All-Star'], x_bigs ,y_bigs)
#Random Forest Seems to have the best performance

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
[[6807   26]
 [ 623  354]]
              precision    recall  f1-score   support

Non-All-Star       0.92      1.00      0.95      6833
    All-Star       0.93      0.36      0.52       977

    accuracy                           0.92      7810
   macro avg       0.92      0.68      0.74      7810
weighted avg       0.92      0.92      0.90      7810

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=15, p=2,
                     weights='uniform')
[[6803   30]
 [ 659  318]]
              precision    recall  f1-score   support

Non-All-Star       0.91      1.00      0.95      6

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
[[7205   65]
 [ 551  426]]
              precision    recall  f1-score   support

Non-All-Star       0.93      0.99      0.96      7270
    All-Star       0.87      0.44      0.58       977

    accuracy                           0.93      8247
   macro avg       0.90      0.71      0.77      8247
weighted avg       0.92      0.93      0.91      8247

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=15, p=2,
                     weights='uniform')
[[7191   79]
 [ 646  331]]
              precision    recall  f1-score   support

Non-All-Star       0.92      0.99      0.95      7

### K Nearest Neighbors w/ NCA

In [None]:
NEIGHBORS = 15
nca = NeighborhoodComponentsAnalysis(random_state = 42)
knn = KNeighborsClassifier(NEIGHBORS)
nca_pipe = Pipeline([('nca', nca), ('knn', knn)])
nca_pipe.fit(x_train_std, y_train)
knn_preds = nca_pipe.predict(x_test_std)
target_names = ['Non-All-Star', 'All-Star']
print(classification_report(y_test, knn_preds, target_names=target_names))

### Random Search CV To Tune Random Forest Classifier
Used as reference: https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

In [52]:
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 200, num = 1)]
max_depth.append(None)
min_samples_split = [2, 5, 10, 20]
min_samples_leaf = [1, 2, 4, 10]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [53]:
r_forest = RandomForestClassifier()
tuned_r_forest = RandomizedSearchCV(estimator = r_forest, 
                               param_distributions = random_grid, scoring='f1', n_iter = 100, cv = 10, 
                               verbose=2, random_state=42, n_jobs = -1)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 333 tasks      | elapsed: 11.5min
[Parallel(n_jobs=-1)]: Done 616 tasks      | elapsed: 20.6min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 37.8min finished


RandomizedSearchCV(cv=10, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
              

In [54]:
print(tuned_r_forest.best_params_)
check_classification_k_fold_cross_validation(kfold, [tuned_r_forest.best_estimator_], ['Non-All-Star', 'All-Star'], x ,y)

{'n_estimators': 452, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': False}
RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=452,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
[[15047    33]
 [  207   770]]
              precision    recall  f1-score   support

Non-All-Star       0.99      1.00      0.99     15080
    All-Star       0.96      0.79      0.87       977

    accuracy                           0.99     16057
   macro avg       0.97      0.89      0.93     16057
weighted avg   

### Comparing Tuned Random Forest to Dummy Classifiers

In [57]:
rforestcomp = []
rforestcomp.append(DummyClassifier(strategy='constant', constant=0))
rforestcomp.append(DummyClassifier('stratified'))
rforestcomp.append(DummyClassifier('prior'))
rforestcomp.append(DummyClassifier('uniform'))                     
check_classification_k_fold_cross_validation(kfold, rforestcomp, ['Non-All-Star', 'All-Star'], x ,y)

DummyClassifier(constant=0, random_state=None, strategy='constant')
[[15080     0]
 [  977     0]]
              precision    recall  f1-score   support

Non-All-Star       0.94      1.00      0.97     15080
    All-Star       0.00      0.00      0.00       977

    accuracy                           0.94     16057
   macro avg       0.47      0.50      0.48     16057
weighted avg       0.88      0.94      0.91     16057

DummyClassifier(constant=None, random_state=None, strategy='stratified')
[[14198   882]
 [  926    51]]
              precision    recall  f1-score   support

Non-All-Star       0.94      0.94      0.94     15080
    All-Star       0.05      0.05      0.05       977

    accuracy                           0.89     16057
   macro avg       0.50      0.50      0.50     16057
weighted avg       0.88      0.89      0.89     16057

DummyClassifier(constant=None, random_state=None, strategy='prior')
[[15080     0]
 [  977     0]]
              precision    recall  f1-score 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Generate Predictions Using Tuned Random Forest Classifier

In [71]:
dataset_current = pd.read_csv('nba_player_data_through_jan_2019_2020.csv')
dataset_current.set_index('Player')
dataset_current_guards = dataset_current[dataset_current['Position'] == 'G']
dataset_current_bigs = dataset_current[dataset_current['Position'] == 'F/C']
x_current_guards = dataset_current_guards[['PPG', 'RPG', 'APG', 'BPG', 'SPG', 'GP', 'W%']]
x_current_bigs = dataset_current_bigs[['PPG', 'RPG', 'APG', 'BPG', 'SPG', 'GP', 'W%']]
guards_model = copy.copy(tuned_r_forest.best_estimator_)
bigs_model = copy.copy(tuned_r_forest.best_estimator_)
guards_model.fit(x_guards, y_guards)
bigs_model.fit(x_bigs, y_bigs)
dataset_current_guards['All-Star'] = guards_model.predict(x_current_guards)
dataset_current_guards = dataset_current_guards[dataset_current_guards['All-Star']]
dataset_current_guards.to_csv('nba_all_star_predictions_guards_2019_2020.csv')
dataset_current_bigs['All-Star'] = bigs_model.predict(x_current_bigs)
dataset_current_bigs = dataset_current_bigs[dataset_current_bigs['All-Star']]
dataset_current_bigs.to_csv('nba_all_star_predictions_bigs_2019_2020.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
