In [2]:
import pandas as pd
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.combine import SMOTEENN
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

In [5]:
cleaned_data_url = '../sample_data/MBB_StatsAndDraft.csv'

raw_df = pd.read_csv(cleaned_data_url)

numerical_cols = raw_df.dtypes[raw_df.dtypes != 'object'].index.tolist()

numerical_df = raw_df[numerical_cols]
numerical_df.index = raw_df['Player']

X = numerical_df.drop(columns=['Pk','#'])
y = numerical_df['Pk'].apply(lambda x: 1 if x <= 60 else 2)

smoteenn = SMOTEENN(random_state=1)
X_resampled, y_resampled = smoteenn.fit_resample(X,y)

X_train, X_test, y_train, y_test = train_test_split(X_resampled,y_resampled,random_state=1)

In [6]:
X_train.head()

Unnamed: 0,GP,MPG,FGM,FGA,FG%,3PM,3PA,3P%,FTM,FTA,FT%,TOV,PF,ORB,DRB,RPG,APG,SPG,BPG,PPG
35452,31,28.5,3.4,6.5,0.512,0.0,0.0,0.0,1.6,2.3,0.729,1.5,1.5,2.4,4.3,6.6,2.2,0.9,2.1,8.4
20690,31,37.235739,7.0,15.301312,0.458636,2.264917,6.131146,0.373531,3.499344,4.5,0.779531,2.7,2.066885,0.766885,3.533115,4.233115,4.63049,1.366229,0.0,19.831146
38776,31,31.0,3.3,6.9,0.481,0.0,0.4,0.0,1.3,2.4,0.52,1.6,2.0,2.5,2.7,5.2,2.4,1.1,0.6,7.9
32204,30,28.5,4.6,9.4,0.488,0.0,0.3,0.1,1.9,2.9,0.674,2.4,3.2,2.1,2.9,5.0,1.1,0.6,0.8,11.1
14121,34,21.271227,3.066745,7.885848,0.384861,1.242924,3.785848,0.336814,1.633255,2.204717,0.715885,1.476179,1.604717,0.619104,2.471462,3.090566,1.5,0.647641,0.628538,8.93349


In [7]:
rf = RandomForestClassifier(n_estimators=120,random_state=1)
rf.fit(X_train,y_train)

rf_pred = rf.predict(X_test)

rf_acc_score = accuracy_score(y_test,rf_pred)
rf_matrix = confusion_matrix(y_test,rf_pred)
rf_results = pd.DataFrame({"Prediction": rf_pred, "Actual": y_test}).reset_index(drop=True)
rf_report = classification_report(y_test,rf_pred,zero_division=True)

In [10]:
print(rf_acc_score)
print(rf_matrix)
print(rf_report)

0.9761639372374175
[[6575   51]
 [ 227 4810]]
              precision    recall  f1-score   support

           1       0.97      0.99      0.98      6626
           2       0.99      0.95      0.97      5037

    accuracy                           0.98     11663
   macro avg       0.98      0.97      0.98     11663
weighted avg       0.98      0.98      0.98     11663



In [8]:
importances = rf.feature_importances_
features = X.columns
feature_importances = sorted(zip(importances,features),reverse=True)

In [9]:
feature_importances

[(0.13694578402737617, 'GP'),
 (0.12728251059490786, 'BPG'),
 (0.08990333715318416, 'FGM'),
 (0.07797534603770345, 'PPG'),
 (0.05186175682781996, 'FG%'),
 (0.05070109593354251, 'FTM'),
 (0.04509907872224088, 'DRB'),
 (0.04410947545845876, 'PF'),
 (0.04190343998496972, 'FTA'),
 (0.0409661112455823, 'RPG'),
 (0.03702738939697323, 'FGA'),
 (0.03644396857532796, 'MPG'),
 (0.03205763966955923, 'APG'),
 (0.032001270207207395, 'SPG'),
 (0.03160952400892236, 'ORB'),
 (0.025581819108962085, '3PA'),
 (0.02549574850601654, 'FT%'),
 (0.024652858609055243, 'TOV'),
 (0.0244640151037693, '3P%'),
 (0.02391783082842114, '3PM')]

In [11]:
redundant_features = ['FGM','FGA','FTA','FTM','3PA','3PM','DRB','ORB']

In [12]:
non_redundant_df = numerical_df.drop(columns=redundant_features)
non_redundant_df.head()

Unnamed: 0_level_0,#,GP,MPG,FG%,3P%,FT%,TOV,PF,RPG,APG,SPG,BPG,PPG,Pk
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Anthony Bennett,124,35,27.1,0.533,0.375,0.701,1.9,2.3,8.1,1.0,0.7,1.2,16.1,1.0
Andrew Wiggins,135,35,32.8,0.448,0.341,0.775,2.3,2.7,5.9,1.5,1.2,1.0,17.1,1.0
John Wall,122,37,34.8,0.461,0.325,0.754,4.0,1.9,4.3,6.4,1.8,0.5,16.6,1.0
Blake Griffin,12,35,33.3,0.655,0.333,0.588,3.3,2.6,14.4,2.3,1.1,1.2,22.7,1.0
Blake Griffin,240,33,28.4,0.573,0.0,0.595,2.3,2.5,9.2,1.8,1.0,0.8,14.8,1.0


In [14]:
X_non_redundant = non_redundant_df.drop(columns=['Pk','#'])
y_non_redundant = non_redundant_df['Pk'].apply(lambda x: 1 if x <= 60 else 2)

smoteenn = SMOTEENN(random_state=1)
X_non_r_resampled, y_non_r_resampled = smoteenn.fit_resample(X_non_redundant,y_non_redundant)

X_non_r_train, X_non_r_test, y_non_r_train, y_non_r_test = train_test_split(X_non_r_resampled, y_non_r_resampled,random_state=1)

In [15]:
rf_non_r = RandomForestClassifier(n_estimators=120,random_state=1)
rf_non_r.fit(X_non_r_train,y_non_r_train)

rf_non_r_pred = rf_non_r.predict(X_non_r_test)

rf_non_r_acc_score = accuracy_score(y_non_r_test,rf_non_r_pred)
rf_non_r_matrix = confusion_matrix(y_non_r_test,rf_non_r_pred)
rf_non_r_results = pd.DataFrame({"Prediction": rf_non_r_pred, "Actual": y_non_r_test}).reset_index(drop=True)
rf_non_r_report = classification_report(y_non_r_test,rf_non_r_pred,zero_division=True)

In [16]:
print(rf_non_r_acc_score)
print(rf_non_r_matrix)
print(rf_non_r_report)

0.9742694044474738
[[6538   79]
 [ 223 4897]]
              precision    recall  f1-score   support

           1       0.97      0.99      0.98      6617
           2       0.98      0.96      0.97      5120

    accuracy                           0.97     11737
   macro avg       0.98      0.97      0.97     11737
weighted avg       0.97      0.97      0.97     11737



In [17]:
non_r_importances = rf_non_r.feature_importances_
non_r_features = X_non_redundant.columns
non_r_feature_importances = sorted(zip(non_r_importances,non_r_features),reverse=True)

In [18]:
non_r_feature_importances

[(0.1643679351592678, 'BPG'),
 (0.15420740117322598, 'PPG'),
 (0.143551853866313, 'GP'),
 (0.08496468563285693, 'FG%'),
 (0.08280571838967381, 'RPG'),
 (0.06788294739699462, 'MPG'),
 (0.06450361352345983, 'SPG'),
 (0.060410857480198706, 'PF'),
 (0.04944445505162307, 'APG'),
 (0.04593996660978706, 'TOV'),
 (0.041298367818869974, '3P%'),
 (0.04062219789772928, 'FT%')]

In [19]:
num_features_desired = 6
important_features = []

for i in range(0,num_features_desired):
    important_features.append(non_r_feature_importances[i][1])
    
important_features.append('Pk')

In [20]:
important_df = non_redundant_df[important_features]
important_df.head()

Unnamed: 0_level_0,BPG,PPG,GP,FG%,RPG,MPG,Pk
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Anthony Bennett,1.2,16.1,35,0.533,8.1,27.1,1.0
Andrew Wiggins,1.0,17.1,35,0.448,5.9,32.8,1.0
John Wall,0.5,16.6,37,0.461,4.3,34.8,1.0
Blake Griffin,1.2,22.7,35,0.655,14.4,33.3,1.0
Blake Griffin,0.8,14.8,33,0.573,9.2,28.4,1.0


In [21]:
X_important = important_df.drop(columns=['Pk'])
y_important = important_df['Pk'].apply(lambda x: 1 if x <= 60 else 2)

smoteenn = SMOTEENN(random_state=1)
X_important_resampled, y_important_resampled = smoteenn.fit_resample(X_important,y_important)

X_important_train, X_important_test, y_important_train, y_important_test = train_test_split(X_important_resampled, y_important_resampled,random_state=1)

In [22]:
rf_important = RandomForestClassifier(n_estimators=120,random_state=1)
rf_important.fit(X_important_train,y_important_train)

rf_important_pred = rf_important.predict(X_important_test)

rf_important_acc_score = accuracy_score(y_important_test,rf_important_pred)
rf_important_matrix = confusion_matrix(y_important_test,rf_important_pred)
rf_important_results = pd.DataFrame({"Prediction": rf_important_pred, "Actual": y_important_test}).reset_index(drop=True)
rf_important_report = classification_report(y_important_test,rf_important_pred,zero_division=True)

In [23]:
print(rf_important_acc_score)
print(rf_important_matrix)
print(rf_important_report)

0.9716940548911641
[[6266   92]
 [ 237 5028]]
              precision    recall  f1-score   support

           1       0.96      0.99      0.97      6358
           2       0.98      0.95      0.97      5265

    accuracy                           0.97     11623
   macro avg       0.97      0.97      0.97     11623
weighted avg       0.97      0.97      0.97     11623



In [24]:
important_importances = rf_important.feature_importances_
important_features = X_important.columns
important_feature_importances = sorted(zip(important_importances,important_features),reverse=True)

In [25]:
important_feature_importances

[(0.23769831163545901, 'BPG'),
 (0.22624464574570374, 'PPG'),
 (0.18417074975713235, 'GP'),
 (0.12009105926048, 'RPG'),
 (0.11921314417138132, 'MPG'),
 (0.11258208942984364, 'FG%')]

In [27]:
selected_features = ['PPG','MPG','RPG','APG','SPG','GP','TOV','Pk']
selected_df = non_redundant_df[selected_features]

X_selected = selected_df.drop(columns=['Pk'])
y_selected = selected_df['Pk'].apply(lambda x: 1 if x <= 60 else 2)

smoteenn = SMOTEENN(random_state=1)
X_selected_resampled, y_selected_resampled = smoteenn.fit_resample(X_selected,y_selected)

X_selected_train, X_selected_test, y_selected_train, y_selected_test = train_test_split(X_selected_resampled, y_selected_resampled,random_state=1)

In [28]:
rf_selected = RandomForestClassifier(n_estimators=120,random_state=1)
rf_selected.fit(X_selected_train,y_selected_train)

rf_selected_pred = rf_selected.predict(X_selected_test)

rf_selected_acc_score = accuracy_score(y_selected_test,rf_selected_pred)
rf_selected_matrix = confusion_matrix(y_selected_test,rf_selected_pred)
rf_selected_results = pd.DataFrame({"Prediction": rf_selected_pred, "Actual": y_selected_test}).reset_index(drop=True)
rf_selected_report = classification_report(y_selected_test,rf_selected_pred,zero_division=True)

In [29]:
print(rf_selected_acc_score)
print(rf_selected_matrix)
print(rf_selected_report)

0.9709511568123393
[[6403   92]
 [ 247 4928]]
              precision    recall  f1-score   support

           1       0.96      0.99      0.97      6495
           2       0.98      0.95      0.97      5175

    accuracy                           0.97     11670
   macro avg       0.97      0.97      0.97     11670
weighted avg       0.97      0.97      0.97     11670



In [30]:
selected_importances = rf_selected.feature_importances_
selected_features = X_selected.columns
selected_feature_importances = sorted(zip(selected_importances,selected_features),reverse=True)

In [31]:
selected_feature_importances

[(0.21204171234219257, 'PPG'),
 (0.2038338120947399, 'GP'),
 (0.17300283843533654, 'RPG'),
 (0.11863168662266475, 'MPG'),
 (0.10985075702599387, 'SPG'),
 (0.1002331066601409, 'TOV'),
 (0.08240608681893141, 'APG')]

In [32]:
total = 0
for feature in selected_feature_importances:
    total += feature[0]

total

1.0