In [48]:
import pandas as pd
from pathlib import Path
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.combine import SMOTEENN
import pickle

## Import data from s3.amazonaws

In [49]:
data_2007_2020 = 'https://s3.amazonaws.com/parkerhiggins-nba-draft-bucket/07-20_MBB_StatsAndDraft.csv'

raw_df = pd.read_csv(data_2007_2020)

## Preprocessing: Remove non-numerical columns

In [50]:
numerical_cols = raw_df.dtypes[raw_df.dtypes != 'object'].index.tolist()

numerical_df = raw_df[numerical_cols]
numerical_df.index = raw_df['#']

In [51]:
numerical_df['draft_status'] = (numerical_df['season_year']==numerical_df['draft_year']).astype(bool)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


## Preprocessing: Select common individual basketball statistics to use as features in the model

In [52]:
numerical_df.columns

Index(['#', 'gp', 'mpg', 'fgm', 'fga', 'FG%', '3PM', '3PA', '3P%', 'ftm',
       'fta', 'FT%', 'tov', 'pf', 'orb', 'drb', 'rpg', 'apg', 'spg', 'bpg',
       'ppg', 'season_year', 'pk', 'draft_year', 'draft_status'],
      dtype='object')

In [53]:
selected_features = ['ppg','rpg','apg','spg','tov','draft_status','FG%','3P%','FT%']
data_df = numerical_df[selected_features]

X = data_df.drop(columns=['draft_status'])
y = data_df['draft_status']

## Preprocessing: Use SMOTEENN to oversample drafted players and undersample undrafted players

In [54]:
smoteenn = SMOTEENN(random_state=1)
X_resampled, y_resampled = smoteenn.fit_resample(X,y)

## Preprocessing: Split into training and test datasets

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled,random_state=1,test_size=0.05)

# Logistic Regression Model

In [56]:
lr = LogisticRegression(solver='lbfgs',random_state=1, max_iter=1000)
lr.fit(X_train,y_train)

lr_pred = lr.predict(X_test)

lr_acc_score = accuracy_score(y_test,lr_pred)
lr_matrix = confusion_matrix(y_test,lr_pred)
lr_results = pd.DataFrame({"Prediction": lr_pred, "Actual": y_test}).reset_index(drop=True)
lr_report = classification_report(y_test,lr_pred,zero_division=True)

print(lr_acc_score,'\n')
print(lr_matrix,'\n')
print(lr_report,'\n')

model_filename = 'nba_LogReg_model.sav'
pickle.dump(lr, open(model_filename, 'wb'))

0.8697378872120731 

[[ 992  186]
 [ 142 1198]] 

              precision    recall  f1-score   support

       False       0.87      0.84      0.86      1178
        True       0.87      0.89      0.88      1340

    accuracy                           0.87      2518
   macro avg       0.87      0.87      0.87      2518
weighted avg       0.87      0.87      0.87      2518
 



# Predict 2021 Draft Picks

In [57]:
nba_2021_url = 'https://parkerhiggins-nba-draft-bucket.s3.amazonaws.com/season_stats_data_2020-2021.csv'

raw_2021_data = pd.read_csv(nba_2021_url)

In [58]:
raw_2021_data.columns

Index(['#', 'Player', 'Team', 'GP', 'MPG', 'FGM', 'FGA', 'FG%', '3PM', '3PA',
       '3P%', 'FTM', 'FTA', 'FT%', 'TOV', 'PF', 'ORB', 'DRB', 'RPG', 'APG',
       'SPG', 'BPG', 'PPG', 'Season_Year'],
      dtype='object')

In [59]:
raw_2021_data.head()

Unnamed: 0,#,Player,Team,GP,MPG,FGM,FGA,FG%,3PM,3PA,...,TOV,PF,ORB,DRB,RPG,APG,SPG,BPG,PPG,Season_Year
0,1,Max Abmas,ORU,28,37.0,8.0,16.8,0.478,3.6,8.3,...,2.3,2.2,0.4,2.8,3.2,3.9,1.5,0.2,24.6,2021
1,2,Luka Garza,UI,31,31.6,9.1,16.4,0.553,1.4,3.2,...,1.5,2.3,3.0,5.7,8.7,1.7,0.7,1.6,24.1,2021
2,3,Antoine Davis,UDM,22,38.5,8.1,19.1,0.424,3.8,10.1,...,3.3,1.6,0.2,2.7,2.9,4.8,1.5,0.0,24.0,2021
3,4,Cam Thomas,LSU,29,33.9,7.0,17.2,0.406,2.3,7.2,...,1.7,1.4,0.6,2.8,3.4,1.4,0.9,0.2,23.0,2021
4,5,A.J. Green,UNI,3,36.3,8.7,18.7,0.464,3.7,9.0,...,3.7,2.0,1.0,4.7,5.7,2.7,1.3,0.7,22.3,2021


In [60]:
numerical_cols_2021 = raw_2021_data.dtypes[raw_2021_data.dtypes != 'object'].index.tolist()

numerical_2021 = raw_2021_data[numerical_cols_2021]
numerical_2021.index = raw_2021_data['#']

selected_features_2021 = ['PPG','RPG','APG','SPG','TOV','FG%','3P%','FT%']

X_2021 = numerical_2021[selected_features_2021]

In [61]:
pred_2021 = lr.predict(X_2021)

In [62]:
draft_prob = lr.predict_proba(X_2021)

draft_prob_true = []
for prob in draft_prob:
    draft_prob_true.append(prob[1])

In [63]:
prediction_display_features = ['Player','Team']
for feature in selected_features_2021:
    prediction_display_features.append(feature)

draft_predictions_2021 = raw_2021_data[prediction_display_features]
draft_predictions_2021['draft_prediction'] = pred_2021
draft_predictions_2021['draft_prob'] = draft_prob_true

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [68]:
draft_picks_2021 = draft_predictions_2021.sort_values(by='draft_prob',ascending=False).head(60).drop('draft_prob',axis=1)

In [69]:
predictions_filepath = './2021_predictions.csv'
draft_picks_2021.to_csv(predictions_filepath)