In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

#### Read CSV containing player stats

In [2]:
TOI_df = pd.read_csv('Final-DataFrames/NHLTotTime.csv')
TOI_df.head()

Unnamed: 0.1,Unnamed: 0,PLAYER,TEAM,GP,G,A,PTS,+/-,TOI/G,SHFT,SHFT/G,PROD,POS,hr,min,sec,TIMES
0,0,Suter Ryan,MIN,82.0,7,40,47,-8,26:42:00,2222,27.1,46:34:00,D,26.0,42.0,0.0,2132.0
1,1,Doughty Drew,LA,82.0,8,37,45,-34,26:36:00,2393,29.2,48:27:00,D,26.0,36.0,0.0,2132.0
2,2,Letang Kris,PIT,65.0,16,40,56,13,25:58:00,1923,29.6,30:07:00,D,25.0,58.0,0.0,1625.0
3,3,Jones Seth,CBJ,75.0,9,37,46,1,25:49:00,2214,29.5,42:06:00,D,25.0,49.0,0.0,1875.0
4,4,Josi Roman,NSH,82.0,15,41,56,9,25:10:00,2359,28.8,36:51:00,D,25.0,10.0,0.0,2050.0


#### Convert "Time On Ice per Game" column to numeric

In [3]:
# Create a list of the times in the "TOI/G" column
timelist = list(TOI_df['TOI/G'])

for i in range(len(TOI_df['TOI/G'])):
    timesplit = timelist[i].split(':')
    
    
    whole = int(timesplit[0])
    decimal = int(timesplit[1])/60
    
    minutes = whole + decimal
    
    TOI_df['TOI/G'][i] = minutes
    
TOI_df.head(2)

Unnamed: 0.1,Unnamed: 0,PLAYER,TEAM,GP,G,A,PTS,+/-,TOI/G,SHFT,SHFT/G,PROD,POS,hr,min,sec,TIMES
0,0,Suter Ryan,MIN,82.0,7,40,47,-8,26.7,2222,27.1,46:34:00,D,26.0,42.0,0.0,2132.0
1,1,Doughty Drew,LA,82.0,8,37,45,-34,26.6,2393,29.2,48:27:00,D,26.0,36.0,0.0,2132.0


#### Create new DataFrame with important columns

In [4]:
NHL_stats = TOI_df[['PLAYER', 'GP', 'TOI/G']]

# Preparing Data

#### Read our CSV containing Bio data

In [5]:
pd.read_csv('Final-DataFrames/NHLModel.csv').head(4)

Unnamed: 0.1,Unnamed: 0,POSITION,HEIGHT,WEIGHT,PLAYER,AGE,Injury Status
0,0,LW,74.0,214,Abdelkader Justin,32,Injured
1,1,LW,71.0,196,Aberg Pontus,25,Injured
2,2,RW,69.0,171,Abramov Vitaly,21,Not Injured
3,3,C,70.0,205,Acciari Noel,27,Not Injured


In [6]:
NHL_1 = pd.read_csv('Final-DataFrames/NHLModel.csv')


# Select columns of interest
NHL_select = NHL_1[['POSITION', 'HEIGHT', 'WEIGHT', 'PLAYER', 'AGE', 'Injury Status']]

#Merge Playerr Bio data with the player stat data 
NHL = NHL_select.merge(NHL_stats, how='right', on='PLAYER')

#Drop NA rows
NHL = NHL.dropna()

#### Create BMI column

In [7]:
NHL['BMI'] = (NHL['WEIGHT'] / (NHL['HEIGHT'] ** 2)) * 703
NHL.head()

Unnamed: 0,POSITION,HEIGHT,WEIGHT,PLAYER,AGE,Injury Status,GP,TOI/G,BMI
0,LW,74.0,214.0,Abdelkader Justin,32.0,Injured,71.0,15.4,27.472973
1,LW,71.0,196.0,Aberg Pontus,25.0,Injured,59.0,14.6,27.333466
2,RW,69.0,171.0,Abramov Vitaly,21.0,Not Injured,1.0,13.8667,25.249527
3,C,70.0,205.0,Acciari Noel,27.0,Not Injured,72.0,12.9833,29.411224
4,LW,72.0,199.0,Agostino Kenny,27.0,Not Injured,63.0,12.9167,26.986304


#### Hot Encode The NHL dataframe

In [8]:
# 1 = injured
# 0 = not injured
Encoded_df = pd.get_dummies(NHL, columns=['Injury Status'])[['POSITION', 'HEIGHT', 'WEIGHT', 'PLAYER', 'AGE', 'GP', 'TOI/G','BMI', 'Injury Status_Injured']]
Encoded_df.head()

Unnamed: 0,POSITION,HEIGHT,WEIGHT,PLAYER,AGE,GP,TOI/G,BMI,Injury Status_Injured
0,LW,74.0,214.0,Abdelkader Justin,32.0,71.0,15.4,27.472973,1
1,LW,71.0,196.0,Aberg Pontus,25.0,59.0,14.6,27.333466,1
2,RW,69.0,171.0,Abramov Vitaly,21.0,1.0,13.8667,25.249527,0
3,C,70.0,205.0,Acciari Noel,27.0,72.0,12.9833,29.411224,0
4,LW,72.0,199.0,Agostino Kenny,27.0,63.0,12.9167,26.986304,0


#### Set y values equal to our Injury Status_Injured column values

In [9]:
import numpy as np
y = np.array(Encoded_df['Injury Status_Injured'])

# Train Test Split

In [10]:
from sklearn.model_selection import train_test_split

# X = pd.get_dummies(Encoded_df[['POSITION', 'HEIGHT', 'WEIGHT', 'AGE', 'GP', 'TOI/G','BMI']], columns=['POSITION'])
X = Encoded_df[['HEIGHT', 'WEIGHT', 'AGE', 'GP', 'TOI/G','BMI']]

New_X = np.array(X)

X_train, X_test, y_train, y_test = train_test_split(New_X, y, train_size=0.75)


# Logistic Regression

In [11]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

#### Fit Data

In [12]:
classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

#### Example Prediction:

In [13]:
classifier.predict([X_test[12]])

array([1], dtype=uint8)

#### All Predictions:

In [14]:
classifier.predict(X_test)

array([1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0],
      dtype=uint8)

#### Scores:

In [15]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.5920852359208524
Testing Data Score: 0.6027397260273972


# K Nearest Neighbor

#### Create a StandardScater model and fit it to the training data

In [16]:
from sklearn.preprocessing import StandardScaler

X_scaler = StandardScaler().fit(X_train.reshape(-1, 1))
# X_scaler = StandardScaler().fit_transform(X_train)

##### Transform the training and testing data using the X_scaler and y_scaler models

In [17]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [18]:
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt

# Loop through different k values to see which has the highest accuracy
# Note: We only use odd numbers because we don't want any ties
train_scores = []
test_scores = []
for k in range(1, 100, 2):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")
    
    
plt.plot(range(1, 100, 2), train_scores, marker='o')
plt.plot(range(1, 100, 2), test_scores, marker="x")
plt.xlabel("k neighbors")
plt.ylabel("Testing accuracy Score")
plt.show()

k: 1, Train/Test Score: 1.000/0.703
k: 3, Train/Test Score: 0.854/0.753
k: 5, Train/Test Score: 0.817/0.753
k: 7, Train/Test Score: 0.808/0.763
k: 9, Train/Test Score: 0.795/0.772
k: 11, Train/Test Score: 0.785/0.781
k: 13, Train/Test Score: 0.787/0.767
k: 15, Train/Test Score: 0.776/0.776
k: 17, Train/Test Score: 0.770/0.767
k: 19, Train/Test Score: 0.776/0.753
k: 21, Train/Test Score: 0.773/0.763
k: 23, Train/Test Score: 0.770/0.749
k: 25, Train/Test Score: 0.766/0.763
k: 27, Train/Test Score: 0.767/0.758
k: 29, Train/Test Score: 0.767/0.758
k: 31, Train/Test Score: 0.772/0.753
k: 33, Train/Test Score: 0.756/0.763
k: 35, Train/Test Score: 0.753/0.772
k: 37, Train/Test Score: 0.747/0.758
k: 39, Train/Test Score: 0.741/0.753
k: 41, Train/Test Score: 0.746/0.753
k: 43, Train/Test Score: 0.743/0.749
k: 45, Train/Test Score: 0.741/0.744
k: 47, Train/Test Score: 0.743/0.744
k: 49, Train/Test Score: 0.746/0.735
k: 51, Train/Test Score: 0.741/0.740
k: 53, Train/Test Score: 0.741/0.731
k: 55,

<Figure size 640x480 with 1 Axes>

In [58]:
import math

# Note: We only use odd numbers because we don't want any ties
train_scores = []
test_scores = []

k = int( math.sqrt(len(X_train)))

knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train_scaled, y_train)
train_score = knn.score(X_train_scaled, y_train)
test_score = knn.score(X_test_scaled, y_test)
train_scores.append(train_score)
test_scores.append(test_score)
print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

k: 25, Train/Test Score: 0.766/0.763


Predictions Based On X_Test values

In [59]:
y_pred = knn.predict(X_test)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=uint8)

#### Confusion Maxtrix

In [60]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)
cm

array([[113,   0],
       [106,   0]], dtype=int64)

#### F1 Score

In [61]:
from sklearn.metrics import f1_score
f1_score(y_test, y_pred)

0.0

#### Accuracy Score

In [62]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.5159817351598174

### Choosing Best K

In [63]:
accuracy_dict = {}
accuracy_list = []

for k in range(1, 400, 2):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    y_pred = knn.predict(X_test)
    
    accuracy_dict[accuracy_score(y_test, y_pred)] = k
    
    accuracy_list.append(accuracy_score(y_test, y_pred))
    
    best_k = accuracy_dict[max(accuracy_list)]
    
print(f' The K value of {accuracy_dict[max(accuracy_list)]} has the best accuracy score of : {max(accuracy_list)}')

 The K value of 37 has the best accuracy score of : 0.7031963470319634


## Most Accurate K Value

In [64]:
knn = KNeighborsClassifier(n_neighbors= best_k)
knn.fit(X_train_scaled, y_train)

y_pred = knn.predict(X_test)
        
cm = confusion_matrix(y_test, y_pred)
cm

array([[95, 18],
       [47, 59]], dtype=int64)

# Random Forest

In [25]:
import time
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.ensemble import RandomForestClassifier

  from numpy.core.umath_tests import inner1d


#### Make random forest object

In [26]:
clf = RandomForestClassifier(n_jobs=2, random_state=0)
clf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

#### Train the model

In [27]:
clf.fit(X_train, y_train.ravel())

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

#### Predict injury or no injury using clf model

In [28]:
#Here we run on the test set
preds=clf.predict(X_test)
print(len(preds))

219


#### Print summary information from running prediction on X_test set

In [29]:
newdf = pd.DataFrame(X_test)
newdf['predicted']=preds
#print(newdf['predicted'].value_counts)
#newdf.index
#newdf.head()
odf=Encoded_df.loc[newdf.index]
odf['predicted']=preds
print("Predicted as injured:")
print(newdf.loc[newdf.predicted==1].shape)
print("Predicted as not injured")
print(newdf.loc[newdf.predicted==0].shape)

Predicted as injured:
(103, 7)
Predicted as not injured
(116, 7)


In [30]:
print(f"Training Data Score: {clf.score(X_train, y_train)}")
print(f"Testing Data Score: {clf.score(X_test, y_test)}")

Training Data Score: 0.9863013698630136
Testing Data Score: 0.7853881278538812


In [31]:
# Assess quality of preditor
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(odf['Injury Status_Injured'], odf['predicted']) 
print("MSE: %.4f"%mse)

MSE: 0.5251


## Import Our Model Functions From Our Python File

In [32]:
from function import train_test, logistic_model, knn_model

## Create Train and Test Variables Based On The DataFrame and Columns of Interest

In [33]:
columns = ['HEIGHT', 'WEIGHT', 'AGE', 'GP', 'TOI/G','BMI']
X_tn, X_tt, y_tn, y_tt = train_test(Encoded_df, columns)

In [34]:
trn_scores = []
tst_scores = []

for i in range(1, 10):
    X_tn, X_tt, y_tn, y_tt = train_test(Encoded_df, columns)
    trn, tst = logistic_model(X_tn, X_tt, y_tn, y_tt)
    trn_scores.append(trn)
    tst_scores.append(tst)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
---------------------------------
X_test Prediction: 
---------------------------------
[1 1 0 0 0 1 0 1 0 1 1 1 1 0 0 1 1 0 0 1 1 0 1 0 0 0 0 1 1 1 0 0 0 1 0 0 0
 0 0 0 0 0 0 0 1 0 1 0 0 0 1 1 1 1 1 0 0 0 0 1 0 0 1 1 0 0 1 0 1 1 0 0 0 0
 1 1 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 1 1 1 0 0 1 0 1 0 1 1
 1 1 0 0 0 1 0 0 1 1 0 0 0 0 0 1 0 1 0 1 0 0 0 1 0 0 1 1 0 0 1 0 1 0 1 0 0
 0 1 0 0 0 1 0 0 1 0 1 0 0 1 0 0 1 0 0 0 0 1 1 0 0 0 1 0 1 0 1 1 0 0 1 1 0
 0 0 0 1 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 1 0 0 0 1 1 1]
---------------------------------
Training Data Score: 0.6453576864535768
Testing Data Score: 0.5570776255707762
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          in

In [35]:
import statistics
print(f'The Mean of our list of test scores is {statistics.mean(tst_scores)}')
print(f'The Standard Deviation of our list of test scores is {statistics.stdev(tst_scores)}')

The Mean of our list of test scores is 0.5839675291730086
The Standard Deviation of our list of test scores is 0.03335556621128024
