# Build Sample Logistic Model to Get Baseline Check of Raw Data

## This script will import CSV files directly
sql_connection.ipynb will test the connection to the postgres database before both are combined into a .py file

In [13]:
import pandas as pd
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [14]:
cleaned_data_url = '../../Resources/2018MBB_StatsAndDraft_Cleaned.csv'

raw_df = pd.read_csv(cleaned_data_url)

In [15]:
raw_df.head()

Unnamed: 0,Player,Team,GP,MPG,FGM,FGA,FG%,3PM,3PA,3P%,...,PF,ORB,DRB,RPG,APG,SPG,BPG,PPG,Pk,Player1
0,Deandre Ayton,U of A,35,33.5,7.9,12.9,0.612,0.3,1.0,0.343,...,2.3,3.4,8.2,11.6,1.6,0.6,1.9,20.1,1.0,Deandre Ayton
1,Marvin Bagley III,DUKE,33,33.9,8.2,13.3,0.614,0.7,1.8,0.397,...,1.8,4.0,7.1,11.1,1.5,0.8,0.9,21.0,2.0,Marvin Bagley III
2,Trae Young,OU,32,35.4,8.2,19.3,0.422,3.7,10.2,0.36,...,1.8,0.4,3.5,3.9,8.7,1.7,0.2,27.4,5.0,Trae Young
3,Mo Bamba,UT,30,30.2,4.9,9.0,0.541,0.5,1.7,0.275,...,2.5,3.2,7.3,10.5,0.5,0.8,3.7,12.9,6.0,Mo Bamba
4,Collin Sexton,UA,33,29.9,5.9,13.3,0.447,1.3,4.0,0.336,...,2.5,1.0,2.7,3.8,3.6,0.8,0.1,19.2,8.0,Collin Sexton


In [16]:
raw_df[(raw_df['PPG'] < 10) & (raw_df['Pk'] > 0)]

Unnamed: 0,Player,Team,GP,MPG,FGM,FGA,FG%,3PM,3PA,3P%,...,PF,ORB,DRB,RPG,APG,SPG,BPG,PPG,Pk,Player1
36,Kostas Antetokounmpo,DAY,29,15.1,2.0,3.5,0.574,0.1,0.5,0.133,...,2.3,0.8,2.0,2.9,0.4,0.2,1.1,5.2,60.0,Kostas Antetokounmpo


In [17]:
raw_df[raw_df['Pk'] > 0].sort_values('Pk')

Unnamed: 0,Player,Team,GP,MPG,FGM,FGA,FG%,3PM,3PA,3P%,...,PF,ORB,DRB,RPG,APG,SPG,BPG,PPG,Pk,Player1
0,Deandre Ayton,U of A,35,33.5,7.9,12.9,0.612,0.3,1.0,0.343,...,2.3,3.4,8.2,11.6,1.6,0.6,1.9,20.1,1.0,Deandre Ayton
1,Marvin Bagley III,DUKE,33,33.9,8.2,13.3,0.614,0.7,1.8,0.397,...,1.8,4.0,7.1,11.1,1.5,0.8,0.9,21.0,2.0,Marvin Bagley III
2,Trae Young,OU,32,35.4,8.2,19.3,0.422,3.7,10.2,0.36,...,1.8,0.4,3.5,3.9,8.7,1.7,0.2,27.4,5.0,Trae Young
3,Mo Bamba,UT,30,30.2,4.9,9.0,0.541,0.5,1.7,0.275,...,2.5,3.2,7.3,10.5,0.5,0.8,3.7,12.9,6.0,Mo Bamba
4,Collin Sexton,UA,33,29.9,5.9,13.3,0.447,1.3,4.0,0.336,...,2.5,1.0,2.7,3.8,3.6,0.8,0.1,19.2,8.0,Collin Sexton
5,Kevin Knox,UK,37,32.4,5.3,11.9,0.445,1.5,4.5,0.341,...,2.2,0.9,4.5,5.4,1.4,0.8,0.3,15.6,9.0,Kevin Knox
6,Mikal Bridges,VILL,40,32.1,6.1,11.9,0.514,2.6,6.0,0.435,...,2.1,1.4,4.0,5.3,1.9,1.5,1.1,17.6,10.0,Mikal Bridges
7,Shai Gilgeous-Alexander,UK,37,33.7,4.9,10.2,0.485,0.6,1.5,0.404,...,1.7,0.9,3.2,4.1,5.1,1.6,0.5,14.4,11.0,Shai Gilgeous-Alexander
8,Miles Bridges,MSU,34,31.4,6.1,13.4,0.457,2.1,5.7,0.364,...,1.6,1.3,5.7,7.0,2.7,0.6,0.8,17.1,12.0,Miles Bridges
9,Jerome Robinson,BC,35,36.0,7.1,14.7,0.485,2.3,5.7,0.409,...,2.5,0.5,3.1,3.6,3.3,0.9,0.1,20.7,13.0,Jerome Robinson


## Preprocessing
Limited preprocessing done at this time
* No scaling or encoding  
* All numerical data used as-is  
* All categorical & text data dropped

In [18]:
numerical_cols = raw_df.dtypes[raw_df.dtypes != 'object'].index.tolist()

In [19]:
numerical_df = raw_df[numerical_cols]
numerical_df.index = raw_df['Player']
numerical_df.head()

Unnamed: 0_level_0,GP,MPG,FGM,FGA,FG%,3PM,3PA,3P%,FTM,FTA,...,TOV,PF,ORB,DRB,RPG,APG,SPG,BPG,PPG,Pk
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Deandre Ayton,35,33.5,7.9,12.9,0.612,0.3,1.0,0.343,4.0,5.5,...,2.0,2.3,3.4,8.2,11.6,1.6,0.6,1.9,20.1,1.0
Marvin Bagley III,33,33.9,8.2,13.3,0.614,0.7,1.8,0.397,4.0,6.3,...,2.3,1.8,4.0,7.1,11.1,1.5,0.8,0.9,21.0,2.0
Trae Young,32,35.4,8.2,19.3,0.422,3.7,10.2,0.36,7.4,8.6,...,5.2,1.8,0.4,3.5,3.9,8.7,1.7,0.2,27.4,5.0
Mo Bamba,30,30.2,4.9,9.0,0.541,0.5,1.7,0.275,2.7,4.0,...,1.5,2.5,3.2,7.3,10.5,0.5,0.8,3.7,12.9,6.0
Collin Sexton,33,29.9,5.9,13.3,0.447,1.3,4.0,0.336,5.9,7.6,...,2.8,2.5,1.0,2.7,3.8,3.6,0.8,0.1,19.2,8.0


In [20]:
y.value_counts()

NameError: name 'y' is not defined

In [21]:
X = numerical_df.drop(columns=['Pk'])
y = numerical_df['Pk'].apply(lambda x: 1 if x <= 60 else 0)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=1)

## Set up sample Logistic Regression Model

In [23]:
classifier = LogisticRegression(solver='lbfgs',random_state=1, max_iter=1000)
classifier.fit(X_train,y_train)

LogisticRegression(max_iter=1000, random_state=1)

In [24]:
predictions = classifier.predict(X_test)

In [25]:
acc_score = accuracy_score(y_test,predictions)
acc_score

0.988

In [26]:
matrix = confusion_matrix(y_test,predictions)
print(matrix)

[[493   1]
 [  5   1]]


In [27]:
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)

In [28]:
results[results['Prediction'] != 0]

Unnamed: 0,Prediction,Actual
93,1,0
400,1,1


In [29]:
report = classification_report(y_test,predictions,zero_division=True)

In [30]:
print(report)

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       494
           1       0.50      0.17      0.25         6

    accuracy                           0.99       500
   macro avg       0.74      0.58      0.62       500
weighted avg       0.98      0.99      0.99       500



## Next Steps:
1. Obtain more data for draft picks, either through additional seasons or by oversampling the drafted players' data
    CORRECTION: Undersample the majority class (not drafted)
2. Test additional ML models, including RandomForest & SVM
3. Use PCA to find the most important features
4. Retest the various ML models