In [57]:
# Set the seed value for the notebook so the results are reproducible

from numpy.random import seed
seed(1)

In [58]:
# import necessary libraries

import warnings
warnings.simplefilter('ignore')

import numpy as np

import pandas as pd

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.datasets import make_classification

In [59]:
# load pandas DataFrame
ncaaDF = pd.read_csv("NCAA_data.csv")
ncaaDF.head()

Unnamed: 0,player_id,name,class,college,assists,blocks,effective_field_goal_percentage,field_goal_percentage,free_throw_attempt_rate,free_throw_percentage,...,turnover_percentage,turnovers,win_shares_per_40_minutes,assists_per_40,blocks_per_40,points_per_40,steals_per_40,three_pointers_per_40,total_rebounds_per_40,turnovers_per_40
0,kenyon-martin-1,Kenyon Martin,Above Avg,cincinnati,142,292,0.587,0.586,0.494,0.581,...,14.3,139.0,0.346,2.781587,5.719882,25.053869,2.428991,0.039177,17.022527,2.722821
1,aj-guyton-1,A.J. Guyton,Below Avg,indiana,403,52,0.541,0.455,0.249,0.79,...,13.6,218.0,0.145,4.944785,0.638037,25.766871,1.582822,3.472393,5.263804,2.674847
2,jake-voskuhl-1,Jake Voskuhl,Below Avg,uconn,124,193,0.542,0.542,0.555,0.656,...,23.0,160.0,0.237,2.271062,3.534799,15.805861,1.575092,0.0,16.117216,2.930403
3,khalid-el-amin-1,Khalid El-Amin,Bust,uconn,479,10,0.486,0.416,0.257,0.822,...,15.6,188.0,0.158,8.915775,0.186133,30.711959,3.462075,3.629595,5.937645,3.499302
4,mike-smith-1,Mike Smith,Bust,louisiana monroe,147,42,0.507,0.428,0.357,0.769,...,20.0,241.0,0.165,3.034056,0.866873,21.919505,1.857585,2.683179,9.267286,4.9742


In [60]:
# Split the x variables (WITHOUT Player, Team, and College Columns) from the y variable

X = ncaaDF.drop(["class","player_id","name", "college"], axis=1)
y = ncaaDF["class"]
print(f"Number of Rows and Independent(X) Variable: {X.shape}, \n Number of Rows (One Dependent(Y) variable): {y.shape}")

Number of Rows and Independent(X) Variable: (853, 25), 
 Number of Rows (One Dependent(Y) variable): (853,)


In [76]:
# Label encoding for college column independent variable

college_X = ncaaDF["college"]

label_encoder = LabelEncoder()
label_encoder.fit(college_X)
encoded_college = label_encoder.transform(college_X)

for label, original_class in zip(college_X, encoded_college):
    print('Original Class: ' + str(original_class))
    print('Encoded Label: ' + str(label))
    print('-' * 12)

Original Class: 23
Encoded Label: cincinnati
------------
Original Class: 51
Encoded Label: indiana
------------
Original Class: 131
Encoded Label: uconn
------------
Original Class: 131
Encoded Label: uconn
------------
Original Class: 62
Encoded Label: louisiana monroe
------------
Original Class: 3
Encoded Label: arizona state
------------
Original Class: 92
Encoded Label: oklahoma
------------
Original Class: 116
Encoded Label: st johns ny
------------
Original Class: 136
Encoded Label: utah
------------
Original Class: 34
Encoded Label: duke
------------
Original Class: 91
Encoded Label: ohio state
------------
Original Class: 138
Encoded Label: vanderbilt
------------
Original Class: 65
Encoded Label: lsu
------------
Original Class: 119
Encoded Label: temple
------------
Original Class: 118
Encoded Label: syracuse
------------
Original Class: 49
Encoded Label: idaho
------------
Original Class: 87
Encoded Label: northern arizona
------------
Original Class: 6
Encoded Label: aubu

------------
Original Class: 80
Encoded Label: murray state
------------
Original Class: 3
Encoded Label: arizona state
------------
Original Class: 13
Encoded Label: bucknell
------------
Original Class: 113
Encoded Label: south dakota state
------------
Original Class: 81
Encoded Label: nc state
------------
Original Class: 34
Encoded Label: duke
------------
Original Class: 37
Encoded Label: florida
------------
Original Class: 61
Encoded Label: long beach state
------------
Original Class: 92
Encoded Label: oklahoma
------------
Original Class: 28
Encoded Label: colorado state
------------
Original Class: 95
Encoded Label: oregon
------------
Original Class: 64
Encoded Label: louisville
------------
Original Class: 77
Encoded Label: missouri
------------
Original Class: 91
Encoded Label: ohio state
------------
Original Class: 17
Encoded Label: california
------------
Original Class: 142
Encoded Label: virginia tech
------------
Original Class: 57
Encoded Label: kentucky
----------

In [82]:
# One-hot encoding for college column independent variable

onehot_college = to_categorical(encoded_college)
onehot_college = pd.DataFrame(onehot_college)

onehot_college.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,145,146,147,148,149,150,151,152,153,154
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [84]:
# Append the one-hot encoded indepedent variable columns with the X dataframe
X = pd.concat([X, pd.DataFrame(onehot_college)])
X = X.fillna(0)

X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,three_point_percentage,three_pointers,three_pointers_per_40,total_rebounds,total_rebounds_per_40,true_shooting_percentage,turnover_percentage,turnovers,turnovers_per_40,win_shares_per_40_minutes
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.222,2.0,0.039177,869.0,17.022527,0.592,14.3,139.0,2.722821,0.346
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.414,283.0,3.472393,429.0,5.263804,0.572,13.6,218.0,2.674847,0.145
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.329163,0.0,0.0,880.0,16.117216,0.573,23.0,160.0,2.930403,0.237
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.355,195.0,3.629595,319.0,5.937645,0.527,15.6,188.0,3.499302,0.158
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.318,130.0,2.683179,449.0,9.267286,0.551,20.0,241.0,4.9742,0.165


In [85]:
# Label encoding for Dependent(Y) Variable

label_encoder = LabelEncoder()
label_encoder.fit(y)
encoded_y = label_encoder.transform(y)

for label, original_class in zip(encoded_y, y):
    print('Original Class: ' + str(original_class))
    print('Encoded Label: ' + str(label))
    print('-' * 12)

Original Class: Above Avg
Encoded Label: 0
------------
Original Class: Below Avg
Encoded Label: 1
------------
Original Class: Below Avg
Encoded Label: 1
------------
Original Class: Bust
Encoded Label: 2
------------
Original Class: Bust
Encoded Label: 2
------------
Original Class: Above Avg
Encoded Label: 0
------------
Original Class: Above Avg
Encoded Label: 0
------------
Original Class: Below Avg
Encoded Label: 1
------------
Original Class: Bust
Encoded Label: 2
------------
Original Class: Bust
Encoded Label: 2
------------
Original Class: Star
Encoded Label: 3
------------
Original Class: Below Avg
Encoded Label: 1
------------
Original Class: Below Avg
Encoded Label: 1
------------
Original Class: Bust
Encoded Label: 2
------------
Original Class: Above Avg
Encoded Label: 0
------------
Original Class: Below Avg
Encoded Label: 1
------------
Original Class: Bust
Encoded Label: 2
------------
Original Class: Bust
Encoded Label: 2
------------
Original Class: Bust
Encoded Lab

Original Class: Above Avg
Encoded Label: 0
------------
Original Class: Bust
Encoded Label: 2
------------
Original Class: Above Avg
Encoded Label: 0
------------
Original Class: Above Avg
Encoded Label: 0
------------
Original Class: Bust
Encoded Label: 2
------------
Original Class: Below Avg
Encoded Label: 1
------------
Original Class: Below Avg
Encoded Label: 1
------------
Original Class: Bust
Encoded Label: 2
------------
Original Class: Star
Encoded Label: 3
------------
Original Class: Above Avg
Encoded Label: 0
------------
Original Class: Bust
Encoded Label: 2
------------
Original Class: Bust
Encoded Label: 2
------------
Original Class: Bust
Encoded Label: 2
------------
Original Class: Bust
Encoded Label: 2
------------
Original Class: Bust
Encoded Label: 2
------------
Original Class: Bust
Encoded Label: 2
------------
Original Class: Bust
Encoded Label: 2
------------
Original Class: Bust
Encoded Label: 2
------------
Original Class: Below Avg
Encoded Label: 1
---------

In [86]:
# Use train_test_split to create training and testing data

X_train, X_test, y_train, y_test = train_test_split(X, encoded_y, test_size=float(0.2), random_state=1)

ValueError: Found input variables with inconsistent numbers of samples: [4265, 853]

In [72]:
X_scaler = StandardScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

X_train_scaled.shape

(682, 25)

In [67]:
#### Create a normal neural network with 12 inputs (12 independent variables), 6 hidden nodes, and 4 outputs
# input_dim refers to number of independent(X) variables
# units is output shape i.e. number of categories in the Dependent Variable Column
# In this example, final units=4 because Four Rankings: Star, Above Average, Below Average, Bust

model = Sequential()
model.add(Dense(units=30, activation='relu', input_dim=X_train.shape[1]))
model.add(Dense(units=25, activation='relu'))
model.add(Dense(units=15, activation='relu'))
model.add(Dense(units=len(y.unique()), activation='softmax'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 30)                780       
_________________________________________________________________
dense_5 (Dense)              (None, 25)                775       
_________________________________________________________________
dense_6 (Dense)              (None, 15)                390       
_________________________________________________________________
dense_7 (Dense)              (None, 4)                 64        
Total params: 2,009
Trainable params: 2,009
Non-trainable params: 0
_________________________________________________________________


In [68]:
# Compile the model

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])



In [69]:
# Fit the model

model.fit(
    X_train_scaled,
    y_train,
    epochs=100,
    batch_size=10,
    shuffle=True,
    verbose=2
)

Epoch 1/100
 - 0s - loss: 1.4385 - acc: 0.2302
Epoch 2/100
 - 0s - loss: 1.3044 - acc: 0.3959
Epoch 3/100
 - 0s - loss: 1.2479 - acc: 0.4062
Epoch 4/100
 - 0s - loss: 1.2129 - acc: 0.4370
Epoch 5/100
 - 0s - loss: 1.1918 - acc: 0.4795
Epoch 6/100
 - 0s - loss: 1.1734 - acc: 0.4941
Epoch 7/100
 - 0s - loss: 1.1554 - acc: 0.4941
Epoch 8/100
 - 0s - loss: 1.1433 - acc: 0.5088
Epoch 9/100
 - 0s - loss: 1.1294 - acc: 0.5132
Epoch 10/100
 - 0s - loss: 1.1158 - acc: 0.5308
Epoch 11/100
 - 0s - loss: 1.1005 - acc: 0.5323
Epoch 12/100
 - 0s - loss: 1.0839 - acc: 0.5484
Epoch 13/100
 - 0s - loss: 1.0729 - acc: 0.5367
Epoch 14/100
 - 0s - loss: 1.0564 - acc: 0.5381
Epoch 15/100
 - 0s - loss: 1.0374 - acc: 0.5572
Epoch 16/100
 - 0s - loss: 1.0308 - acc: 0.5704
Epoch 17/100
 - 0s - loss: 1.0157 - acc: 0.5616
Epoch 18/100
 - 0s - loss: 1.0046 - acc: 0.5821
Epoch 19/100
 - 0s - loss: 0.9858 - acc: 0.5850
Epoch 20/100
 - 0s - loss: 0.9733 - acc: 0.6012
Epoch 21/100
 - 0s - loss: 0.9552 - acc: 0.6085
E

<tensorflow.python.keras.callbacks.History at 0x1c5ef1d62e8>

In [70]:
# Save the model

model.save("Draft_Machine_Model.h5")

In [71]:
# Evaluate the model using the testing data

model = load_model("Draft_Machine_Model.h5")
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

 - 0s - loss: 3.6695 - acc: 0.3860
Loss: 3.6695196586742735, Accuracy: 0.38596490025520325
