In [1]:
import pandas as pd
from pathlib import Path
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.combine import SMOTEENN
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler
import pickle
import numpy as np

## Import data from Amazon S3

In [2]:
# Locations of the CSV exports in the project's S3 bucket.
data_2007_2019 = 'https://s3.amazonaws.com/parkerhiggins-nba-draft-bucket/07-19_MBB_StatsAndDraft.csv'
data_2007_2020 = 'https://s3.amazonaws.com/parkerhiggins-nba-draft-bucket/07-20_MBB_StatsAndDraft.csv'
cleaned_data_url = 'https://s3.amazonaws.com/parkerhiggins-nba-draft-bucket/MBB_StatsAndDraft.csv'

# This cell loads only the 2007-2019 export; the other URLs are kept for reference.
raw_df = pd.read_csv(filepath_or_buffer=data_2007_2019)

## Preprocessing: Remove non-numerical columns

In [3]:
# Names of every column whose dtype is not `object` (i.e. drop the text columns).
numerical_cols = raw_df.dtypes[raw_df.dtypes != 'object'].index.tolist()

# Take an explicit copy: this frame is mutated below (new index here, new
# column in a later cell), and mutating a slice of `raw_df` is what raises
# pandas' SettingWithCopyWarning.
numerical_df = raw_df[numerical_cols].copy()
# Index rows by the '#' column; the column itself is kept as well.
numerical_df.index = raw_df['#']

In [4]:
# Inspect the rows whose draft_year is 2019.
drafted_in_2019 = numerical_df['draft_year'].eq(2019)
numerical_df.loc[drafted_in_2019]

Unnamed: 0_level_0,#,gp,mpg,fgm,fga,FG%,3PM,3PA,3P%,ftm,...,orb,drb,rpg,apg,spg,bpg,ppg,season_year,pk,draft_year
#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
16,16,33,30.0,9.0,13.2,0.680,0.7,2.2,0.338,3.9,...,3.5,5.4,8.9,2.1,2.1,1.8,22.6,2019,1.0,2019.0
7,7,33,36.6,8.0,16.1,0.499,1.7,4.8,0.363,6.7,...,1.2,4.5,5.7,10.0,1.8,0.8,24.5,2019,2.0,2019.0
576,576,32,34.0,4.2,9.1,0.459,0.8,2.8,0.307,3.5,...,1.7,4.8,6.5,6.3,0.9,0.4,12.7,2018,2.0,2019.0
1177,1177,33,19.9,3.2,6.5,0.488,0.6,1.7,0.382,2.2,...,1.2,2.4,3.5,1.1,0.6,0.4,9.2,2018,4.0,2019.0
264,264,38,32.5,5.4,10.4,0.520,1.2,2.8,0.438,3.2,...,1.4,3.7,5.1,2.0,0.6,0.6,15.2,2019,4.0,2019.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
309,309,29,32.8,5.4,13.3,0.405,2.0,6.3,0.310,2.4,...,0.9,5.1,6.0,3.6,0.9,0.7,15.1,2018,58.0,2019.0
555,555,29,31.2,4.4,9.9,0.443,2.1,5.3,0.396,2.0,...,0.9,5.4,6.3,2.7,0.7,1.1,12.9,2017,58.0,2019.0
135,135,29,31.0,5.8,13.1,0.441,1.9,5.2,0.371,3.7,...,0.7,5.6,6.3,3.6,0.9,1.3,17.1,2019,58.0,2019.0
766,766,32,25.8,4.6,8.0,0.576,0.0,0.1,0.000,2.2,...,2.0,4.6,6.7,0.4,0.5,1.0,11.4,2018,59.0,2019.0


In [5]:
# Label: True for a row whose season_year equals its draft_year, False otherwise
# (rows with a missing draft_year compare unequal and therefore become False).
# `.assign` builds a new frame instead of writing into a possible slice of
# `raw_df`, which avoids the SettingWithCopyWarning and keeps the cell safe to re-run.
numerical_df = numerical_df.assign(
    draft_status=(numerical_df['season_year'] == numerical_df['draft_year']).astype(bool)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


## Preprocessing: Select common individual basketball statistics to use as features in the model

In [6]:
# Show the available column names before choosing model features.
numerical_df.columns

Index(['#', 'gp', 'mpg', 'fgm', 'fga', 'FG%', '3PM', '3PA', '3P%', 'ftm',
       'fta', 'FT%', 'tov', 'pf', 'orb', 'drb', 'rpg', 'apg', 'spg', 'bpg',
       'ppg', 'season_year', 'pk', 'draft_year', 'draft_status'],
      dtype='object')

In [7]:
# Earlier feature set that was tried:
# selected_features = ['ppg','mpg','rpg','apg','spg','gp','tov','draft_status']
# Current set: per-game box-score stats, shooting percentages, plus the label column.
selected_features = [
    'ppg', 'rpg', 'apg', 'spg', 'tov',
    'draft_status',
    'FG%', '3P%', 'FT%',
]
data_df = numerical_df[selected_features]

# Target is the draft_status label; features are every other selected column.
# Alternative that was tried (all numeric columns minus identifiers/label):
# X = numerical_df.drop(columns=['draft_status', 'season_year','draft_year','pk','#','pf'])
y = data_df['draft_status']
X = data_df.drop(columns=['draft_status'])

## Preprocessing: Use SMOTEENN to oversample drafted players and undersample undrafted players

In [8]:
smoteenn = SMOTEENN(random_state=1)
X_resampled, y_resampled = smoteenn.fit_resample(X,y)

## Preprocessing: Test Random Undersampling

In [9]:
# ros = RandomUnderSampler(random_state=1)
# X_resampled, y_resampled = ros.fit_resample(X,y)

## Preprocessing: Split into training and test datasets

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled,random_state=1,test_size=0.05)

## Preprocessing: Scale data

In [11]:
# Number of rows in the resampled feature matrix.
len(X_resampled)

46666

In [12]:
# scaler = StandardScaler()
# X_scaler = scaler.fit(X_train)

# X_train = X_scaler.transform(X_train)
# X_test = X_scaler.transform(X_test)

# Random Forest Model

In [13]:
# Fit a 100-tree random forest on the training split and evaluate it on the test split.
rf = RandomForestClassifier(n_estimators=100, random_state=1)
rf.fit(X_train, y_train)

rf_pred = rf.predict(X_test)

rf_acc_score = accuracy_score(y_test, rf_pred)
rf_matrix = confusion_matrix(y_test, rf_pred)
# Predicted vs. actual labels side by side, re-indexed from 0.
rf_results = pd.DataFrame({"Prediction": rf_pred, "Actual": y_test}).reset_index(drop=True)
# zero_division=1 (previously spelled `True`): report 1 for a metric whose denominator is 0.
rf_report = classification_report(y_test, rf_pred, zero_division=1)

print(rf_acc_score,'\n')
print(rf_matrix,'\n')
print(rf_report,'\n')

# Persist the fitted model; the context manager guarantees the file handle is
# flushed and closed (the bare `open(...)` previously leaked it).
model_filename = 'rf_2007_2019.sav'
with open(model_filename, 'wb') as model_file:
    pickle.dump(rf, model_file)

0.9884318766066839 

[[1037   20]
 [   7 1270]] 

              precision    recall  f1-score   support

       False       0.99      0.98      0.99      1057
        True       0.98      0.99      0.99      1277

    accuracy                           0.99      2334
   macro avg       0.99      0.99      0.99      2334
weighted avg       0.99      0.99      0.99      2334
 



# Logistic Regression Model

In [14]:
# Fit a logistic regression (lbfgs solver, up to 1000 iterations) on the
# training split and evaluate it on the test split.
lr = LogisticRegression(solver='lbfgs', random_state=1, max_iter=1000)
lr.fit(X_train, y_train)

lr_pred = lr.predict(X_test)

lr_acc_score = accuracy_score(y_test, lr_pred)
lr_matrix = confusion_matrix(y_test, lr_pred)
# Predicted vs. actual labels side by side, re-indexed from 0.
lr_results = pd.DataFrame({"Prediction": lr_pred, "Actual": y_test}).reset_index(drop=True)
# zero_division=1 (previously spelled `True`): report 1 for a metric whose denominator is 0.
lr_report = classification_report(y_test, lr_pred, zero_division=1)

print(lr_acc_score,'\n')
print(lr_matrix,'\n')
print(lr_report,'\n')

# Persist the fitted model; the context manager guarantees the file handle is
# flushed and closed (the bare `open(...)` previously leaked it).
model_filename = 'lr_2007_2019.sav'
with open(model_filename, 'wb') as model_file:
    pickle.dump(lr, model_file)

0.8564695801199658 

[[ 883  174]
 [ 161 1116]] 

              precision    recall  f1-score   support

       False       0.85      0.84      0.84      1057
        True       0.87      0.87      0.87      1277

    accuracy                           0.86      2334
   macro avg       0.86      0.85      0.86      2334
weighted avg       0.86      0.86      0.86      2334
 



# Support Vector Machines (SVM) Model

In [15]:
# Fit a linear-kernel SVM on the training split and evaluate it on the test split.
# probability=True enables predict_proba; its internal calibration is randomised,
# so random_state is pinned (as for the other models) to keep re-runs reproducible.
svm_model = svm.SVC(kernel='linear', probability=True, random_state=1)
svm_model.fit(X_train, y_train)

svm_pred = svm_model.predict(X_test)

svm_acc_score = accuracy_score(y_test, svm_pred)
svm_matrix = confusion_matrix(y_test, svm_pred)
# Predicted vs. actual labels side by side, re-indexed from 0.
svm_results = pd.DataFrame({"Prediction": svm_pred, "Actual": y_test}).reset_index(drop=True)
# zero_division=1 (previously spelled `True`): report 1 for a metric whose denominator is 0.
svm_report = classification_report(y_test, svm_pred, zero_division=1)

print(svm_acc_score,'\n')
print(svm_matrix,'\n')
print(svm_report,'\n')

# Persist the fitted model; the context manager guarantees the file handle is
# flushed and closed (the bare `open(...)` previously leaked it).
model_filename = 'svm_2007_2019.sav'
with open(model_filename, 'wb') as model_file:
    pickle.dump(svm_model, model_file)

0.8586118251928021 

[[ 883  174]
 [ 156 1121]] 

              precision    recall  f1-score   support

       False       0.85      0.84      0.84      1057
        True       0.87      0.88      0.87      1277

    accuracy                           0.86      2334
   macro avg       0.86      0.86      0.86      2334
weighted avg       0.86      0.86      0.86      2334
 



# Gradient Boosting Model

In [16]:
# Candidate learning rates for a 100-estimator gradient boosting classifier.
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]

# Fit one model per rate and report its mean accuracy on both splits.
for rate in learning_rates:
    classifier = GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=rate,
        random_state=1,
    ).fit(X_train, y_train)

    train_accuracy = classifier.score(X_train, y_train)
    test_accuracy = classifier.score(X_test, y_test)

    print(f'learning_rate={rate}; training: {train_accuracy}')
    print(f'learning_rate={rate}; validation: {test_accuracy}')

learning_rate=0.05; training: 0.8872372101416584
learning_rate=0.05; validation: 0.8851756640959726
learning_rate=0.1; training: 0.9167644139673373
learning_rate=0.1; validation: 0.9155955441302485
learning_rate=0.25; training: 0.9723224758639357
learning_rate=0.25; validation: 0.9708654670094259
learning_rate=0.5; training: 0.9870522421726969
learning_rate=0.5; validation: 0.9841473864610112
learning_rate=0.75; training: 0.9894432915275647
learning_rate=0.75; validation: 0.9862896315338475
learning_rate=1; training: 0.9912929712171795
learning_rate=1; validation: 0.987146529562982


In [17]:
# Fit the final gradient boosting model (1000 estimators, learning_rate=1) on
# the training split and evaluate it on the test split.
# NOTE(review): the learning-rate sweep in the previous cell used
# n_estimators=100, so this exact configuration was not part of that sweep.
gb = GradientBoostingClassifier(n_estimators=1000, learning_rate=1, random_state=1)
gb.fit(X_train, y_train)

gb_pred = gb.predict(X_test)

gb_acc_score = accuracy_score(y_test, gb_pred)
gb_matrix = confusion_matrix(y_test, gb_pred)
# Predicted vs. actual labels side by side, re-indexed from 0.
gb_results = pd.DataFrame({"Prediction": gb_pred, "Actual": y_test}).reset_index(drop=True)
# zero_division=1 (previously spelled `True`): report 1 for a metric whose denominator is 0.
gb_report = classification_report(y_test, gb_pred, zero_division=1)

print(gb_acc_score,'\n')
print(gb_matrix,'\n')
print(gb_report,'\n')

# Persist the fitted model; the context manager guarantees the file handle is
# flushed and closed (the bare `open(...)` previously leaked it).
model_filename = 'gb_2007_2019.sav'
with open(model_filename, 'wb') as model_file:
    pickle.dump(gb, model_file)

0.9880034275921166 

[[1048    9]
 [  19 1258]] 

              precision    recall  f1-score   support

       False       0.98      0.99      0.99      1057
        True       0.99      0.99      0.99      1277

    accuracy                           0.99      2334
   macro avg       0.99      0.99      0.99      2334
weighted avg       0.99      0.99      0.99      2334
 

