In [1]:
import pandas as pd
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.combine import SMOTEENN
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

In [2]:
cleaned_data_url = '../Resources/2018MBB_StatsAndDraft_Cleaned.csv'

raw_df = pd.read_csv(cleaned_data_url)

numerical_cols = raw_df.dtypes[raw_df.dtypes != 'object'].index.tolist()

numerical_df = raw_df[numerical_cols]
numerical_df.index = raw_df['Player']

# X = numerical_df.drop(columns=['Pk'])
# y = numerical_df['Pk'].apply(lambda x: 1 if x <= 60 else 2)

# smoteenn = SMOTEENN(random_state=1)
# X_resampled, y_resampled = smoteenn.fit_resample(X,y)

# X_train, X_test, y_train, y_test = train_test_split(X_resampled,y_resampled,random_state=1)

In [3]:
rf = RandomForestClassifier(n_estimators=120,random_state=1)
rf.fit(X_train,y_train)

rf_pred = rf.predict(X_test)

rf_acc_score = accuracy_score(y_test,rf_pred)
rf_matrix = confusion_matrix(y_test,rf_pred)
rf_results = pd.DataFrame({"Prediction": rf_pred, "Actual": y_test}).reset_index(drop=True)
rf_report = classification_report(y_test,rf_pred,zero_division=True)

In [4]:
importances = rf.feature_importances_
features = X.columns
feature_importances = sorted(zip(importances,features),reverse=True)

In [5]:
feature_importances

[(0.13749463168119358, 'FGM'),
 (0.09928689413143799, 'PPG'),
 (0.08891432818296253, 'MPG'),
 (0.08070328207504095, 'DRB'),
 (0.07823615531428209, 'FGA'),
 (0.05658736532147858, 'GP'),
 (0.05012630644733565, 'FG%'),
 (0.04522882567846539, 'FTA'),
 (0.043954980807591214, 'BPG'),
 (0.043482714195805905, 'FTM'),
 (0.04118152452677147, 'PF'),
 (0.03824726589884613, 'APG'),
 (0.035845437519958195, 'RPG'),
 (0.030566178491589786, 'ORB'),
 (0.02578444987542877, '3PA'),
 (0.02431641902209148, '3PM'),
 (0.023827140474352363, 'FT%'),
 (0.022168483028746844, '3P%'),
 (0.018540181867432944, 'SPG'),
 (0.015507435459188274, 'TOV')]

In [3]:
redundant_features = ['FGM','FGA','FTA','FTM','3PA','3PM','DRB','ORB']

In [7]:
num_features_desired = 5
important_features = []

for i in range(0,num_features_desired):
    important_features.append(feature_importances[i][1])

In [8]:
important_features

['FGM', 'PPG', 'MPG', 'DRB', 'FGA']

In [4]:
non_redundant_df = numerical_df.drop(columns=redundant_features)
non_redundant_df.head()

Unnamed: 0_level_0,GP,MPG,FG%,3P%,FT%,TOV,PF,RPG,APG,SPG,BPG,PPG,Pk
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Deandre Ayton,35,33.5,0.612,0.343,0.733,2.0,2.3,11.6,1.6,0.6,1.9,20.1,1.0
Marvin Bagley III,33,33.9,0.614,0.397,0.627,2.3,1.8,11.1,1.5,0.8,0.9,21.0,2.0
Trae Young,32,35.4,0.422,0.36,0.861,5.2,1.8,3.9,8.7,1.7,0.2,27.4,5.0
Mo Bamba,30,30.2,0.541,0.275,0.681,1.5,2.5,10.5,0.5,0.8,3.7,12.9,6.0
Collin Sexton,33,29.9,0.447,0.336,0.778,2.8,2.5,3.8,3.6,0.8,0.1,19.2,8.0


In [6]:
X_non_redundant = non_redundant_df.drop(columns=['Pk'])
y_non_redundant = non_redundant_df['Pk'].apply(lambda x: 1 if x <= 60 else 2)

smoteenn = SMOTEENN(random_state=1)
X_non_r_resampled, y_non_r_resampled = smoteenn.fit_resample(X_non_redundant,y_non_redundant)

X_non_r_train, X_non_r_test, y_non_r_train, y_non_r_test = train_test_split(X_non_r_resampled, y_non_r_resampled,random_state=1)

In [9]:
rf_non_r = RandomForestClassifier(n_estimators=120,random_state=1)
rf_non_r.fit(X_non_r_train,y_non_r_train)

rf_non_r_pred = rf_non_r.predict(X_non_r_test)

rf_non_r_acc_score = accuracy_score(y_non_r_test,rf_non_r_pred)
rf_non_r_matrix = confusion_matrix(y_non_r_test,rf_non_r_pred)
rf_non_r_results = pd.DataFrame({"Prediction": rf_non_r_pred, "Actual": y_non_r_test}).reset_index(drop=True)
rf_non_r_report = classification_report(y_non_r_test,rf_non_r_pred,zero_division=True)

In [10]:
print(rf_non_r_acc_score)
print(rf_non_r_matrix)
print(rf_non_r_report)

0.9898876404494382
[[481   3]
 [  6 400]]
              precision    recall  f1-score   support

           1       0.99      0.99      0.99       484
           2       0.99      0.99      0.99       406

    accuracy                           0.99       890
   macro avg       0.99      0.99      0.99       890
weighted avg       0.99      0.99      0.99       890



In [11]:
non_r_importances = rf_non_r.feature_importances_
non_r_features = X_non_redundant.columns
non_r_feature_importances = sorted(zip(non_r_importances,non_r_features),reverse=True)

In [12]:
non_r_feature_importances

[(0.21739713761595422, 'PPG'),
 (0.14749448400825696, 'MPG'),
 (0.12049196632339824, 'RPG'),
 (0.09091624976742692, 'BPG'),
 (0.07369602602477157, 'APG'),
 (0.07027117369869237, 'GP'),
 (0.060494505785005495, 'FG%'),
 (0.053289504576064475, 'PF'),
 (0.049606607836634994, 'FT%'),
 (0.04895767358842511, '3P%'),
 (0.03776388217214145, 'TOV'),
 (0.029620788603228237, 'SPG')]

In [18]:
num_features_desired = 6
important_features = []

for i in range(0,num_features_desired):
    important_features.append(non_r_feature_importances[i][1])
    
important_features.append('Pk')

In [19]:
important_df = non_redundant_df[important_features]
important_df.head()

Unnamed: 0_level_0,PPG,MPG,RPG,BPG,APG,GP,Pk
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Deandre Ayton,20.1,33.5,11.6,1.9,1.6,35,1.0
Marvin Bagley III,21.0,33.9,11.1,0.9,1.5,33,2.0
Trae Young,27.4,35.4,3.9,0.2,8.7,32,5.0
Mo Bamba,12.9,30.2,10.5,3.7,0.5,30,6.0
Collin Sexton,19.2,29.9,3.8,0.1,3.6,33,8.0


In [20]:
X_important = important_df.drop(columns=['Pk'])
y_important = important_df['Pk'].apply(lambda x: 1 if x <= 60 else 2)

smoteenn = SMOTEENN(random_state=1)
X_important_resampled, y_important_resampled = smoteenn.fit_resample(X_important,y_important)

X_important_train, X_important_test, y_important_train, y_important_test = train_test_split(X_important_resampled, y_important_resampled,random_state=1)

In [21]:
rf_important = RandomForestClassifier(n_estimators=120,random_state=1)
rf_important.fit(X_important_train,y_important_train)

rf_important_pred = rf_important.predict(X_important_test)

rf_important_acc_score = accuracy_score(y_important_test,rf_important_pred)
rf_important_matrix = confusion_matrix(y_important_test,rf_important_pred)
rf_important_results = pd.DataFrame({"Prediction": rf_important_pred, "Actual": y_important_test}).reset_index(drop=True)
rf_important_report = classification_report(y_important_test,rf_important_pred,zero_division=True)

In [22]:
print(rf_important_acc_score)
print(rf_important_matrix)
print(rf_important_report)

0.9887766554433222
[[479   1]
 [  9 402]]
              precision    recall  f1-score   support

           1       0.98      1.00      0.99       480
           2       1.00      0.98      0.99       411

    accuracy                           0.99       891
   macro avg       0.99      0.99      0.99       891
weighted avg       0.99      0.99      0.99       891



In [23]:
important_importances = rf_important.feature_importances_
important_features = X_important.columns
important_feature_importances = sorted(zip(important_importances,important_features),reverse=True)

In [24]:
important_feature_importances

[(0.2931423133640146, 'PPG'),
 (0.18694986821879278, 'MPG'),
 (0.15979886219634157, 'RPG'),
 (0.1402717236195784, 'BPG'),
 (0.11860179750170616, 'APG'),
 (0.10123543509956646, 'GP')]