In [145]:
import numpy as np
import pandas as pd


In [146]:
# Load the raw player table; the path is relative to the notebook's working directory.
NHL_1 = pd.read_csv('NHLModel.csv')

# Keep only the columns used further down: the candidate features, the player
# name (handy when reading results), and the injury label.
model_columns = ['POSITION', 'HEIGHT', 'WEIGHT', 'PLAYER', 'AGE', 'Injury Status']
NHL_2 = NHL_1.loc[:, model_columns]

# Discard every row that has a missing value in any of the kept columns.
NHL = NHL_2.dropna(axis=0, how='any')
NHL.head(n=5)

Unnamed: 0,POSITION,HEIGHT,WEIGHT,PLAYER,AGE,Injury Status
0,LW,74.0,214,Abdelkader Justin,32,Injured
1,LW,71.0,196,Aberg Pontus,25,Injured
2,RW,69.0,171,Abramov Vitaly,21,Not Injured
3,C,70.0,205,Acciari Noel,27,Not Injured
4,LW,72.0,199,Agostino Kenny,27,Not Injured


In [147]:
# Encode the target as an integer indicator column:
#   1 = injured
#   0 = not injured
# dtype=int is passed explicitly because the dtype get_dummies gives the new
# indicator columns by default depends on the pandas version (bool from
# pandas 2.0 on), which would make the label True/False instead of the 1/0
# documented here. Only the generated dummy columns are affected by dtype.
Encoded_df = pd.get_dummies(NHL, columns=['Injury Status'], dtype=int)[
    ['POSITION', 'HEIGHT', 'WEIGHT', 'PLAYER', 'AGE', 'Injury Status_Injured']
]
Encoded_df.head()

Unnamed: 0,POSITION,HEIGHT,WEIGHT,PLAYER,AGE,Injury Status_Injured
0,LW,74.0,214,Abdelkader Justin,32,1
1,LW,71.0,196,Aberg Pontus,25,1
2,RW,69.0,171,Abramov Vitaly,21,0
3,C,70.0,205,Acciari Noel,27,0
4,LW,72.0,199,Agostino Kenny,27,0


In [148]:
# Target labels as a column vector of shape (n_samples, 1).
# The training cell flattens it again with .ravel() before calling fit().
y = Encoded_df['Injury Status_Injured'].values[:, np.newaxis]

In [149]:
# Peek at the first five rows of the encoded frame.
Encoded_df.head(n=5)

Unnamed: 0,POSITION,HEIGHT,WEIGHT,PLAYER,AGE,Injury Status_Injured
0,LW,74.0,214,Abdelkader Justin,32,1
1,LW,71.0,196,Aberg Pontus,25,1
2,RW,69.0,171,Abramov Vitaly,21,0
3,C,70.0,205,Acciari Noel,27,0
4,LW,72.0,199,Agostino Kenny,27,0


In [195]:
from sklearn.model_selection import train_test_split

# Feature matrix: the numeric columns as they are, plus one indicator column
# per playing position. PLAYER and the injury label are deliberately left out.
feature_frame = Encoded_df[['POSITION', 'HEIGHT', 'WEIGHT', 'AGE']]
X = pd.get_dummies(feature_frame, columns=['POSITION'])

# Stratify on the label so both splits keep the same injured / not-injured mix;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=350
)

X_train.head(n=5)



Unnamed: 0,HEIGHT,WEIGHT,AGE,POSITION_C,POSITION_D,POSITION_G,POSITION_LW,POSITION_RW
1300,72.0,197,23,0,0,0,1,0
1202,76.0,219,27,0,0,0,0,1
1055,73.0,221,25,0,1,0,0,0
1305,69.0,186,25,1,0,0,0,0
217,74.0,205,29,0,1,0,0,0


In [196]:
# First five rows of the held-out feature matrix.
X_test.head(n=5)

Unnamed: 0,HEIGHT,WEIGHT,AGE,POSITION_C,POSITION_D,POSITION_G,POSITION_LW,POSITION_RW
1353,74.0,185,34,1,0,0,0,0
890,73.0,209,31,0,0,1,0,0
769,74.0,207,30,0,0,0,1,0
235,73.0,202,42,1,0,0,0,0
10,75.0,219,30,0,1,0,0,0


In [151]:
import time
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.ensemble import RandomForestClassifier


In [197]:
# Create the random forest classifier.
# n_estimators is pinned to 10, the value the recorded fit in this notebook
# used. The library default is version dependent (it moved from 10 to 100 in
# scikit-learn 0.22), so leaving it implicit would change the model and its
# scores after an upgrade, and triggers a FutureWarning on older releases.
# random_state fixes the forest's randomness so reruns give the same trees.
clf = RandomForestClassifier(n_estimators=10, n_jobs=2, random_state=0)
clf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=2,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [198]:
# Train the forest on the training split. The labels are stored as a column
# vector, so they are flattened to 1-D before being passed to fit().
clf.fit(X_train, np.ravel(y_train))




RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [199]:
# Show the first rows of the test features the model will be evaluated on.
X_test.head(n=5)

Unnamed: 0,HEIGHT,WEIGHT,AGE,POSITION_C,POSITION_D,POSITION_G,POSITION_LW,POSITION_RW
1353,74.0,185,34,1,0,0,0,0
890,73.0,209,31,0,0,1,0,0
769,74.0,207,30,0,0,0,1,0
235,73.0,202,42,1,0,0,0,0
10,75.0,219,30,0,1,0,0,0


In [200]:
# Run the fitted forest on the held-out features: one predicted label
# (injured / not injured) per test row.
preds = clf.predict(X_test)

# Number of predictions made, i.e. the size of the test split.
print(preds.shape[0])
X_test.head(n=5)

340


Unnamed: 0,HEIGHT,WEIGHT,AGE,POSITION_C,POSITION_D,POSITION_G,POSITION_LW,POSITION_RW
1353,74.0,185,34,1,0,0,0,0
890,73.0,209,31,0,0,1,0,0
769,74.0,207,30,0,0,0,1,0
235,73.0,202,42,1,0,0,0,0
10,75.0,219,30,0,1,0,0,0


In [203]:
# Summarise the predictions made on the X_test set.
# newdf: the test features with the predicted label appended.
newdf = X_test.assign(predicted=preds)

# odf: the matching rows of the encoded frame (player name and true label
# included), looked up by index, with the same predictions appended.
odf = Encoded_df.loc[newdf.index].assign(predicted=preds)

predicted_injured = newdf[newdf['predicted'] == 1]
predicted_healthy = newdf[newdf['predicted'] == 0]

# Each shape is (row count, column count) for that predicted class.
print("Predicted as injured:")
print(predicted_injured.shape)
print("Predicted as not injured")
print(predicted_healthy.shape)
odf.head(n=5)

Predicted as injured:
(84, 9)
Predicted as not injured
(256, 9)


Unnamed: 0,POSITION,HEIGHT,WEIGHT,PLAYER,AGE,Injury Status_Injured,predicted
1353,C,74.0,185,Zajac Travis,34,1,1
890,G,73.0,209,Neuvirth Michal,31,1,0
769,LW,74.0,207,Malone Brad,30,0,1
235,C,73.0,202,Cullen Matt,42,1,1
10,D,75.0,219,Alzner Karl,30,0,1


In [204]:
# Original rows (player name and true label) side by side with the predictions.
odf.head(n=5)

Unnamed: 0,POSITION,HEIGHT,WEIGHT,PLAYER,AGE,Injury Status_Injured,predicted
1353,C,74.0,185,Zajac Travis,34,1,1
890,G,73.0,209,Neuvirth Michal,31,1,0
769,LW,74.0,207,Malone Brad,30,0,1
235,C,73.0,202,Cullen Matt,42,1,1
10,D,75.0,219,Alzner Karl,30,0,1


In [205]:
# Class counts among the model's PREDICTIONS on the test rows
# (0 = not injured, 1 = injured).
odf.loc[:, 'predicted'].value_counts()


0    256
1     84
Name: predicted, dtype: int64

In [206]:
# Class counts among the TRUE labels of the same test rows
# (0 = not injured, 1 = injured).
odf.loc[:, 'Injury Status_Injured'].value_counts()


0    229
1    111
Name: Injury Status_Injured, dtype: int64

In [208]:
# Assess the quality of the predictor on the test rows.
# NOTE(review): with 0/1 labels every squared error is either 0 or 1, so this
# MSE is simply the share of misclassified rows (1 - accuracy), not a
# regression-style error magnitude.
from sklearn.metrics import mean_squared_error

actual_labels = odf['Injury Status_Injured']
predicted_labels = odf['predicted']
mse = mean_squared_error(y_true=actual_labels, y_pred=predicted_labels)
print("MSE: %.4f" % mse)


MSE: 0.3676


In [210]:
# Mean accuracy on the data the forest was trained on versus the held-out
# split; a large gap between the two indicates overfitting.
# NOTE(review): in the recorded run the test rows hold 229 not-injured labels
# out of 340, so always predicting "not injured" would score about 0.674.
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)
print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

Training Data Score: 0.9568627450980393
Testing Data Score: 0.6323529411764706


In [214]:
# Model accuracy on the held-out split, rounded to three decimals.
test_accuracy = clf.score(X_test, y_test)
print('Test Acc: %.3f' % test_accuracy)

Test Acc: 0.632
