In [8]:
import pandas as pd
from pathlib import Path
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.combine import SMOTEENN
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import pickle
import numpy as np

## Import data from s3.amazonaws

In [2]:
cleaned_data_url = 'https://s3.amazonaws.com/parkerhiggins-nba-draft-bucket/MBB_StatsAndDraft.csv'
data_2007_2020 = 'https://s3.amazonaws.com/parkerhiggins-nba-draft-bucket/07-20_MBB_StatsAndDraft.csv'
data_2007_2019 = 'https://s3.amazonaws.com/parkerhiggins-nba-draft-bucket/07-19_MBB_StatsAndDraft.csv'

raw_df = pd.read_csv(data_2007_2019)

## Preprocessing: Remove non-numerical columns

In [3]:
numerical_cols = raw_df.dtypes[raw_df.dtypes != 'object'].index.tolist()

numerical_df = raw_df[numerical_cols]
numerical_df.index = raw_df['#']

In [21]:
numerical_df.head(2)

Unnamed: 0_level_0,#,gp,mpg,fgm,fga,FG%,3PM,3PA,3P%,ftm,...,drb,rpg,apg,spg,bpg,ppg,season_year,pk,draft_year,draft_status
#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
240,240,33,28.4,5.7,9.9,0.573,0.0,0.1,0.0,3.4,...,5.9,9.2,1.8,1.0,0.8,14.8,2008,1.0,2009.0,False
250,250,40,32.0,5.2,8.4,0.623,0.1,0.5,0.15,3.6,...,7.4,10.4,1.2,1.4,4.7,14.2,2012,1.0,2012.0,True


In [17]:
numerical_df['draft_status'] = (numerical_df['season_year']==numerical_df['draft_year']).astype(bool)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


## Preprocessing: Select common individual basketball statistics to use as features in the model

In [18]:
selected_features = ['ppg','mpg','rpg','apg','spg','gp','tov','draft_status']
data_df = numerical_df[selected_features]


X = data_df.drop(columns=['draft_status'])
y = data_df['draft_status']

## Preprocessing: Use SMOTEENN to oversample drafted players and undersample undrafted players

In [23]:
smoteenn = SMOTEENN(random_state=1)
X_resampled, y_resampled = smoteenn.fit_resample(X,y)

## Preprocessing: Split into training and test datasets

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled,random_state=1)

## Preprocessing: Scale data

In [25]:
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train = X_scaler.transform(X_train)
X_test = X_scaler.transform(X_test)

# Random Forest Model

In [26]:
rf = RandomForestClassifier(n_estimators=120,random_state=1)
rf.fit(X_train,y_train)

rf_pred = rf.predict(X_test)

rf_acc_score = accuracy_score(y_test,rf_pred)
rf_matrix = confusion_matrix(y_test,rf_pred)
rf_results = pd.DataFrame({"Prediction": rf_pred, "Actual": y_test}).reset_index(drop=True)
rf_report = classification_report(y_test,rf_pred,zero_division=True)

print(rf_acc_score,'\n')
print(rf_matrix,'\n')
print(rf_report,'\n')

0.9859403940057249 

[[5534  127]
 [  40 6177]] 

              precision    recall  f1-score   support

       False       0.99      0.98      0.99      5661
        True       0.98      0.99      0.99      6217

    accuracy                           0.99     11878
   macro avg       0.99      0.99      0.99     11878
weighted avg       0.99      0.99      0.99     11878
 



In [27]:
model_filename = 'rf_2007_2019.sav'
pickle.dump(rf, open(model_filename, 'wb'))

# Code below this cell is not used in the 2007 - 2019 Random Forest Model

# Logistic Regression Model

In [9]:
classifier = LogisticRegression(solver='lbfgs',random_state=1, max_iter=1000)
classifier.fit(X_train,y_train)

lr_pred = classifier.predict(X_test)

lr_acc_score = accuracy_score(y_test,lr_pred)
lr_matrix = confusion_matrix(y_test,lr_pred)
lr_results = pd.DataFrame({"Prediction": lr_pred, "Actual": y_test}).reset_index(drop=True)
lr_report = classification_report(y_test,lr_pred,zero_division=True)

print(lr_acc_score,'\n')
print(lr_matrix,'\n')
print(lr_report,'\n')

0.8026440037771483 

[[5372 1163]
 [1136 3978]] 

              precision    recall  f1-score   support

           1       0.83      0.82      0.82      6535
           2       0.77      0.78      0.78      5114

    accuracy                           0.80     11649
   macro avg       0.80      0.80      0.80     11649
weighted avg       0.80      0.80      0.80     11649
 



# Support Vector Machines (SVM) Model

In [10]:
svm_model = svm.SVC(kernel='linear')
svm_model.fit(X_train,y_train)

svm_pred = svm_model.predict(X_test)

svm_acc_score = accuracy_score(y_test,svm_pred)
svm_matrix = confusion_matrix(y_test,svm_pred)
svm_results = pd.DataFrame({"Prediction": svm_pred, "Actual": y_test}).reset_index(drop=True)
svm_report = classification_report(y_test,svm_pred,zero_division=True)

print(svm_acc_score,'\n')
print(svm_matrix,'\n')
print(svm_report,'\n')

0.8026440037771483 

[[5331 1204]
 [1095 4019]] 

              precision    recall  f1-score   support

           1       0.83      0.82      0.82      6535
           2       0.77      0.79      0.78      5114

    accuracy                           0.80     11649
   macro avg       0.80      0.80      0.80     11649
weighted avg       0.80      0.80      0.80     11649
 



# Gradient Boosting Model

In [11]:
learning_rates = [0.05,0.1,0.25,0.5,0.75,1]

for rate in learning_rates:
    classifier = GradientBoostingClassifier(n_estimators=20,learning_rate=rate,random_state=1)
    classifier.fit(X_train,y_train)

    print(f'learning_rate={rate}; training: {classifier.score(X_train,y_train)}')
    print(f'learning_rate={rate}; validation: {classifier.score(X_test,y_test)}')

learning_rate=0.05; training: 0.8057343978023864
learning_rate=0.05; validation: 0.8091681689415401
learning_rate=0.1; training: 0.8150341946375941
learning_rate=0.1; validation: 0.8174092196755086
learning_rate=0.25; training: 0.85034480785189
learning_rate=0.25; validation: 0.8483990042063696
learning_rate=0.5; training: 0.8874581509142415
learning_rate=0.5; validation: 0.881878272813117
learning_rate=0.75; training: 0.9330128480270123
learning_rate=0.75; validation: 0.924113657824706
learning_rate=1; training: 0.9415686611154033
learning_rate=1; validation: 0.9341574384067302


In [12]:
classifier = GradientBoostingClassifier(n_estimators=20,learning_rate=1,random_state=1)
classifier.fit(X_train,y_train)

gb_pred = classifier.predict(X_test)

gb_acc_score = accuracy_score(y_test,gb_pred)
gb_matrix = confusion_matrix(y_test,gb_pred)
gb_results = pd.DataFrame({"Prediction": gb_pred, "Actual": y_test}).reset_index(drop=True)
gb_report = classification_report(y_test,gb_pred,zero_division=True)

print(gb_acc_score,'\n')
print(gb_matrix,'\n')
print(gb_report,'\n')

0.9341574384067302 

[[6129  406]
 [ 361 4753]] 

              precision    recall  f1-score   support

           1       0.94      0.94      0.94      6535
           2       0.92      0.93      0.93      5114

    accuracy                           0.93     11649
   macro avg       0.93      0.93      0.93     11649
weighted avg       0.93      0.93      0.93     11649
 



# Save and export the random forest model

In [18]:
model_filename = 'nba_rf_model.sav'
pickle.dump(rf, open(model_filename, 'wb'))

In [19]:
loaded_model = pickle.load(open(model_filename, 'rb'))
result = loaded_model.score(X_test, y_test)
print(result)

0.9714138552665464
