In [28]:
import pandas as pd
from pathlib import Path
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.combine import SMOTEENN
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

## Import data from s3.amazonaws

In [7]:
cleaned_data_url = 'https://s3.amazonaws.com/parkerhiggins-nba-draft-bucket/MBB_StatsAndDraft.csv'

raw_df = pd.read_csv(cleaned_data_url)

## Preprocessing: Remove non-numerical columns

In [37]:
numerical_cols = raw_df.dtypes[raw_df.dtypes != 'object'].index.tolist()

numerical_df = raw_df[numerical_cols]
numerical_df.index = raw_df['#']

## Preprocessing: Select common individual basketball statistics to use as features in the model

In [38]:
selected_features = ['PPG','MPG','RPG','APG','SPG','GP','TOV','Pk']
data_df = numerical_df[selected_features]

X = data_df.drop(columns=['Pk'])
y = data_df['Pk'].apply(lambda x: 1 if x <= 60 else 2)

## Preprocessing: Use SMOTEENN to oversample drafted players and undersample undrafted players

In [39]:
smoteenn = SMOTEENN(random_state=1)
X_resampled, y_resampled = smoteenn.fit_resample(X,y)

## Preprocessing: Split into training and test datasets

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled,random_state=1)

## Preprocessing: Scale data

In [41]:
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train = X_scaler.transform(X_train)
X_test = X_scaler.transform(X_test)

# Random Forest Model

In [42]:
rf = RandomForestClassifier(n_estimators=120,random_state=1)
rf.fit(X_train,y_train)

rf_pred = rf.predict(X_test)

rf_acc_score = accuracy_score(y_test,rf_pred)
rf_matrix = confusion_matrix(y_test,rf_pred)
rf_results = pd.DataFrame({"Prediction": rf_pred, "Actual": y_test}).reset_index(drop=True)
rf_report = classification_report(y_test,rf_pred,zero_division=True)

print(rf_acc_score,'\n')
print(rf_matrix,'\n')
print(rf_report,'\n')

0.9708654670094259 

[[6403   92]
 [ 248 4927]] 

              precision    recall  f1-score   support

           1       0.96      0.99      0.97      6495
           2       0.98      0.95      0.97      5175

    accuracy                           0.97     11670
   macro avg       0.97      0.97      0.97     11670
weighted avg       0.97      0.97      0.97     11670
 



# Logistic Regression Model

In [43]:
classifier = LogisticRegression(solver='lbfgs',random_state=1, max_iter=1000)
classifier.fit(X_train,y_train)

lr_pred = classifier.predict(X_test)

lr_acc_score = accuracy_score(y_test,lr_pred)
lr_matrix = confusion_matrix(y_test,lr_pred)
lr_results = pd.DataFrame({"Prediction": lr_pred, "Actual": y_test}).reset_index(drop=True)
lr_report = classification_report(y_test,lr_pred,zero_division=True)

print(lr_acc_score,'\n')
print(lr_matrix,'\n')
print(lr_report,'\n')

0.7974293059125964 

[[5341 1154]
 [1210 3965]] 

              precision    recall  f1-score   support

           1       0.82      0.82      0.82      6495
           2       0.77      0.77      0.77      5175

    accuracy                           0.80     11670
   macro avg       0.79      0.79      0.79     11670
weighted avg       0.80      0.80      0.80     11670
 



# Support Vector Machines (SVM) Model

In [44]:
svm_model = svm.SVC(kernel='linear')
svm_model.fit(X_train,y_train)

svm_pred = svm_model.predict(X_test)

svm_acc_score = accuracy_score(y_test,svm_pred)
svm_matrix = confusion_matrix(y_test,svm_pred)
svm_results = pd.DataFrame({"Prediction": svm_pred, "Actual": y_test}).reset_index(drop=True)
svm_report = classification_report(y_test,svm_pred,zero_division=True)

print(svm_acc_score,'\n')
print(svm_matrix,'\n')
print(svm_report,'\n')

0.7949443016281063 

[[5286 1209]
 [1184 3991]] 

              precision    recall  f1-score   support

           1       0.82      0.81      0.82      6495
           2       0.77      0.77      0.77      5175

    accuracy                           0.79     11670
   macro avg       0.79      0.79      0.79     11670
weighted avg       0.80      0.79      0.79     11670
 



# Gradient Boosting Model

In [45]:
learning_rates = [0.05,0.1,0.25,0.5,0.75,1]

for rate in learning_rates:
    classifier = GradientBoostingClassifier(n_estimators=20,learning_rate=rate,random_state=1)
    classifier.fit(X_train,y_train)

    print(f'learning_rate={rate}; training: {classifier.score(X_train,y_train)}')
    print(f'learning_rate={rate}; validation: {classifier.score(X_test,y_test)}')

learning_rate=0.05; training: 0.809809466678093
learning_rate=0.05; validation: 0.8059982862039418
learning_rate=0.1; training: 0.8160939240723284
learning_rate=0.1; validation: 0.8139674378748929
learning_rate=0.25; training: 0.8419744622504071
learning_rate=0.25; validation: 0.8424164524421593
learning_rate=0.5; training: 0.913445882252121
learning_rate=0.5; validation: 0.9107969151670952
learning_rate=0.75; training: 0.936098494586797
learning_rate=0.75; validation: 0.9298200514138818
learning_rate=1; training: 0.9385551461136344
learning_rate=1; validation: 0.9352185089974293


In [46]:
classifier = GradientBoostingClassifier(n_estimators=20,learning_rate=1,random_state=1)
classifier.fit(X_train,y_train)

gb_pred = classifier.predict(X_test)

gb_acc_score = accuracy_score(y_test,gb_pred)
gb_matrix = confusion_matrix(y_test,gb_pred)
gb_results = pd.DataFrame({"Prediction": gb_pred, "Actual": y_test}).reset_index(drop=True)
gb_report = classification_report(y_test,gb_pred,zero_division=True)

print(gb_acc_score,'\n')
print(gb_matrix,'\n')
print(gb_report,'\n')

0.9352185089974293 

[[6068  427]
 [ 329 4846]] 

              precision    recall  f1-score   support

           1       0.95      0.93      0.94      6495
           2       0.92      0.94      0.93      5175

    accuracy                           0.94     11670
   macro avg       0.93      0.94      0.93     11670
weighted avg       0.94      0.94      0.94     11670
 

