# Predicting on the holdout data
___

In [34]:
# import the libraries necessary for cleaning and EDA
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
# Plot styling: apply seaborn's "white" theme, then matplotlib's bundled seaborn style sheet.
sns.set(style='white')
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*' and 3.8
# removed the old names, so prefer the new name and fall back to the old one on
# older Matplotlib releases where only 'seaborn' exists.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

pd.set_option('display.max_columns', 100)  # display up to 100 columns before truncating
pd.set_option('display.max_rows', 100)  # display up to 100 rows before truncating
pd.set_option('display.float_format', lambda x: '%.7f' % x)  # avoid outputting scientific notation

In [2]:
# Read the holdout (test) data from a CSV file in the working directory.
holdout = pd.read_csv("cardiac_test.csv")
print(holdout.shape)  # (number of rows, number of columns)
holdout.head()  # preview the first five rows

(12717, 13)


Unnamed: 0,age,gender,height,weight,bp_high,bp_low,cholesterol,gluc,smoke,alco,active,cardio,bmi
0,62.31,female,1.54,70.0,130.0,60.0,above normal,normal,0,0,1,1,29.52
1,49.92,female,1.76,92.0,160.0,100.0,well above normal,above normal,0,0,1,0,29.7
2,55.56,male,1.68,69.0,140.0,90.0,above normal,normal,0,0,0,1,24.45
3,41.84,female,1.65,73.0,110.0,70.0,normal,normal,0,0,1,0,26.81
4,54.55,female,1.54,71.0,140.0,90.0,above normal,above normal,0,0,0,1,29.94


### Transforming the holdout
* We need to transform the holdout set the same way we did with the train set.

In [3]:
# Bucket BMI into labelled, left-closed bands:
# [0, 18.5) underweight, [18.5, 25) normal, [25, 30) overweight, [30, 36) obese.
# NOTE(review): a BMI outside [0, 36) becomes NaN here — presumably the same
# bins were used on the train set; confirm.
bmi_edges = [0, 18.5, 25, 30, 36]
bmi_names = ['underweight', 'normal', 'overweight', 'obese']
holdout['bmi_cat'] = pd.cut(holdout['bmi'], bins=bmi_edges, labels=bmi_names, right=False)

# Count rows per band; dropna=False also reports rows that fell outside every bin.
holdout['bmi_cat'].value_counts(dropna=False)

normal         5121
overweight     5000
obese          2460
underweight     136
Name: bmi_cat, dtype: int64

In [5]:
# One-hot encode the BMI band and gender into a new frame.
# The empty prefix/separator makes each dummy column carry the bare category name,
# and drop_first=True omits the first category of each encoded feature.
dummy_holdout = pd.get_dummies(
    holdout,
    columns=['bmi_cat', 'gender'],
    prefix='',
    prefix_sep='',
    drop_first=True,
)

In [6]:
# Cholesterol and glucose are categorical too: one-hot encode them with
# 'chol_' / 'gluc_' prefixes, keeping every category (no drop_first here).
dummy_holdout = pd.get_dummies(
    dummy_holdout,
    columns=['cholesterol', 'gluc'],
    prefix=['chol', 'gluc'],
)

In [7]:
def _bp_level(reading):
    """Map one bp_high reading to its level label.

    Thresholds are checked from highest to lowest; anything that satisfies none
    of them (values below 90, and NaN, whose comparisons are all False) is
    labelled 'lowest'.
    """
    if reading >= 180:
        return 'crisis'
    if reading >= 140:
        return 'hyper_2'
    if reading >= 130:
        return 'hyper_1'
    if reading >= 90:
        return 'normal'
    return 'lowest'


# Derive a categorical blood-pressure level from the bp_high column.
dummy_holdout['bp_lvl'] = dummy_holdout['bp_high'].apply(_bp_level)

In [8]:
# One-hot encode the freshly derived blood-pressure level with a 'bp_' prefix.
dummy_holdout = pd.get_dummies(
    dummy_holdout,
    columns=['bp_lvl'],
    prefix=['bp'],
)

### Normalize the data

In [9]:
# Standardise every column: normalized = (value - mean) / standard deviation.
# The target is standardised along with everything else, so the final step
# restores its original values.
# NOTE(review): the mean and std come from the holdout itself — confirm this
# matches how the train set was scaled before the model was fit.
norm_holdout = (
    dummy_holdout
    .sub(dummy_holdout.mean())
    .div(dummy_holdout.std())
    .assign(cardio=dummy_holdout['cardio'])
)

In [10]:
# Preview the first five rows of the standardised frame.
norm_holdout.head()

Unnamed: 0,age,height,weight,bp_high,bp_low,smoke,alco,active,cardio,bmi,normal,overweight,obese,male,chol_above normal,chol_normal,chol_well above normal,gluc_above normal,gluc_normal,gluc_well above normal,bp_crisis,bp_hyper_1,bp_hyper_2,bp_lowest,bp_normal
0,1.35,-1.4,-0.15,0.24,-2.24,-0.32,-0.24,0.49,1,0.84,-0.82,1.24,-0.49,-0.77,2.61,-1.81,-0.34,-0.27,0.41,-0.28,-0.11,2.59,-0.56,-0.03,-1.27
1,-0.48,1.42,1.77,2.07,2.01,-0.32,-0.24,0.49,0,0.89,-0.82,1.24,-0.49,-0.77,-0.38,-1.81,2.91,3.65,-2.45,-0.28,-0.11,-0.39,1.78,-0.03,-1.27
2,0.35,0.4,-0.24,0.85,0.95,-0.32,-0.24,-2.05,1,-0.51,1.22,-0.8,-0.49,1.3,2.61,-1.81,-0.34,-0.27,0.41,-0.28,-0.11,-0.39,1.78,-0.03,-1.27
3,-1.67,0.01,0.11,-0.98,-1.18,-0.32,-0.24,0.49,0,0.12,-0.82,1.24,-0.49,-0.77,-0.38,0.55,-0.34,-0.27,0.41,-0.28,-0.11,-0.39,-0.56,-0.03,0.79
4,0.2,-1.4,-0.06,0.85,0.95,-0.32,-0.24,-2.05,1,0.95,-0.82,1.24,-0.49,-0.77,2.61,-1.81,-0.34,3.65,-2.45,-0.28,-0.11,-0.39,1.78,-0.03,-1.27


In [14]:
# Separate the target from the inputs for scoring.
# The target is the 'cardio' column we are trying to predict.
target = norm_holdout['cardio']

# The feature matrix is every remaining column, i.e. everything except 'cardio'.
features_holdout = norm_holdout.drop(columns='cardio')

In [16]:
# Sanity check: the column count should equal the number of features the
# train set had before modelling (24).
features_holdout.shape

(12717, 24)

### Predict the target 
___

In [17]:
import pickle

# Load the previously fitted model from disk. The `with` block guarantees the
# file handle is closed even if unpickling raises.
# WARNING: pickle.load can execute arbitrary code — only load pickle files
# that come from a trusted source.
with open('DecisionTree.pickle', 'rb') as pickle_in:
    final_model = pickle.load(pickle_in)

In [18]:
# Inspect the importance scores stored on the loaded model (one value per input feature).
final_model.feature_importances_

array([1.42183144e-01, 7.03642778e-04, 0.00000000e+00, 7.81001049e-01,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       3.70083922e-03, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 7.14045503e-02,
       0.00000000e+00, 1.00677476e-03, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00])

In [19]:
# Predict the target for every holdout row using the loaded model.
predictions = final_model.predict(features_holdout)

In [21]:
# Number of predictions produced; should equal the number of holdout rows.
predictions.size

12717

In [32]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

# Decision threshold applied to the predicted probability.
THRESHOLD = 0.5

# Take the second probability column and label a row 1 when it exceeds the threshold.
# NOTE(review): assumes column 1 of predict_proba is the positive (cardio = 1) class — confirm via final_model.classes_.
positive_proba = final_model.predict_proba(features_holdout)[:, 1]
predictions = (positive_proba > THRESHOLD).astype(int)

# Score the thresholded predictions against the true holdout labels.
holdout_recall = recall_score(target, predictions)
holdout_accuracy = accuracy_score(target, predictions)
holdout_precision = precision_score(target, predictions)

print("Test Recall: ", holdout_recall, '\n')
print("Test Accuracy: ", holdout_accuracy, '\n')
print('Test Precision: ', holdout_precision)

Test Recall:  0.7116938809170378 

Test Accuracy:  0.7271369033577102 

Test Precision:  0.7147589862514494


In [44]:
# Tabulate the model's feature importances, labelled by feature name.
# The column Index is passed directly: wrapping it in a list (`index=[...]`)
# makes pandas build a one-level MultiIndex, so every row label becomes a
# tuple such as ('age',) instead of the plain string 'age'.
# NOTE(review): assumes features_holdout's column order matches the order the
# model was fit on — confirm against the training notebook.
importance = pd.DataFrame(data=final_model.feature_importances_, index=features_holdout.columns)
importance

Unnamed: 0,0
age,0.1421831
height,0.0007036
weight,0.0
bp_high,0.781001
bp_low,0.0
smoke,0.0
alco,0.0
active,0.0
bmi,0.0037008
normal,0.0
