In [1]:
%matplotlib inline

from pathlib import Path

#for data wrangling
import numpy as np
import pandas as pd

#for graphing
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
# for training our model
X = pd.read_csv('./data/train_values.csv', index_col='patient_id')
y = pd.read_csv('./data/train_labels.csv', index_col='patient_id').heart_disease_present

test = pd.read_csv('./data/test_values.csv', index_col='patient_id')

In [3]:
len(X.thal)

180

In [4]:
X.head()

Unnamed: 0_level_0,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0z64un,1,normal,128,2,0,0,2,308,0.0,1,45,170,0
ryoo3j,2,normal,110,3,0,0,0,214,1.6,0,54,158,0
yt1s1x,1,normal,125,4,3,0,2,304,0.0,1,77,162,1
l2xjde,1,reversible_defect,152,4,0,0,0,223,0.0,1,40,181,0
oyt4ek,3,reversible_defect,178,1,0,0,2,270,4.2,1,59,145,0


In [5]:
pd.isnull(X).any()
X.dtypes

slope_of_peak_exercise_st_segment         int64
thal                                     object
resting_blood_pressure                    int64
chest_pain_type                           int64
num_major_vessels                         int64
fasting_blood_sugar_gt_120_mg_per_dl      int64
resting_ekg_results                       int64
serum_cholesterol_mg_per_dl               int64
oldpeak_eq_st_depression                float64
sex                                       int64
age                                       int64
max_heart_rate_achieved                   int64
exercise_induced_angina                   int64
dtype: object

In [6]:
pd.isnull(y).any()

False

In [7]:
# Change data type of 'thal' column
X['thal'] = X['thal'].apply(lambda thal: 0 if thal == 'normal' else 1 if thal == 'reversible_defect' else 2)
test['thal'] = test['thal'].apply(lambda thal: 0 if thal == 'normal' else 1 if thal == 'reversible_defect' else 2)


In [8]:
X.head()
test

Unnamed: 0_level_0,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
olalu7,2,1,170,1,0,0,2,288,0.2,1,59,159,0
z9n6mx,1,0,138,4,0,0,0,183,1.4,0,35,182,0
5k4413,2,1,120,4,0,0,2,177,2.5,1,43,120,1
mrg7q5,1,0,102,3,1,0,0,318,0.0,0,60,160,0
uki4do,2,0,138,4,1,0,2,166,3.6,1,61,125,1
kev1sk,2,0,122,3,0,0,0,213,0.2,0,43,165,0
9n6let,3,0,150,1,0,0,0,226,2.6,0,66,114,0
jxmtyg,2,1,140,3,3,0,2,254,2.0,1,69,146,0
51s2ff,1,0,138,4,0,0,2,271,0.0,1,59,182,0
wi9mcs,1,0,138,3,0,0,2,257,0.0,1,47,156,0


In [9]:
from sklearn.preprocessing import MinMaxScaler       # scaling data
from sklearn.model_selection import train_test_split # splitting data
from sklearn.model_selection import GridSearchCV     # for grid search
from sklearn.pipeline import make_pipeline           # for making pipelines

scaler = MinMaxScaler()
test_values = scaler.fit_transform(test)
X_values= scaler.fit_transform(X)


#splitting data
train_features, test_features, train_outcome, test_outcome = train_test_split(
    X,
    y,
    test_size = 0.3,
    random_state = 11
)


In [10]:
##clf.get_params()

In [11]:
from sklearn.neural_network import MLPRegressor

clf = MLPRegressor(solver='lbfgs')

pipe = make_pipeline(scaler, clf)

params = {'learning_rate': ["constant", "invscaling", "adaptive"] }

gs = GridSearchCV(clf,param_grid=params,n_jobs=-1,verbose=2,cv=10)

gs.fit(train_features, train_outcome)

Fitting 10 folds for each of 3 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    4.0s finished


GridSearchCV(cv=10, error_score='raise',
       estimator=MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='lbfgs', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'learning_rate': ['constant', 'invscaling', 'adaptive']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [13]:
gs.score(test_features, test_outcome)

0.23281595743342276

In [14]:
pred = gs.predict(test_features).astype(np.float64)

In [15]:
pred2 = gs.predict(X).astype(np.float64)
len(pred2)

180

In [16]:
len(pred)

54

In [17]:
predictions = gs.predict(test).astype(np.float64)

In [18]:
len(predictions)

90

In [152]:
submission_format = pd.read_csv('data/submission_format.csv')
submission_format = submission_format.drop(['heart_disease_present'], axis = 1)
submission_format['heart_disease_present'] = pd.Series(predictions)
submission_format.to_csv('mlfregressor.csv',index=False)

In [19]:
len(y)
len(predictions)


90

# Graphs

In [20]:
a = pd.read_csv('./data/train_values.csv')
compare = pd.read_csv('./data/train_labels.csv')
len(X.age)
a.age

0      45
1      54
2      77
3      40
4      59
5      42
6      60
7      57
8      59
9      50
10     66
11     42
12     64
13     45
14     38
15     50
16     45
17     60
18     29
19     58
20     71
21     52
22     67
23     66
24     70
25     68
26     57
27     52
28     60
29     51
       ..
150    54
151    54
152    35
153    44
154    62
155    59
156    69
157    48
158    44
159    60
160    49
161    60
162    62
163    45
164    53
165    67
166    57
167    46
168    63
169    64
170    58
171    66
172    39
173    51
174    54
175    67
176    55
177    64
178    48
179    54
Name: age, Length: 180, dtype: int64

In [21]:
list(X)

['slope_of_peak_exercise_st_segment',
 'thal',
 'resting_blood_pressure',
 'chest_pain_type',
 'num_major_vessels',
 'fasting_blood_sugar_gt_120_mg_per_dl',
 'resting_ekg_results',
 'serum_cholesterol_mg_per_dl',
 'oldpeak_eq_st_depression',
 'sex',
 'age',
 'max_heart_rate_achieved',
 'exercise_induced_angina']

In [22]:
compare['predictions'] = pred2
compare['age'] = a.age

In [23]:
compare['sex'] = a.sex
compare.head()

Unnamed: 0,patient_id,heart_disease_present,predictions,age,sex
0,0z64un,0,0.162129,45,1
1,ryoo3j,0,0.231954,54,0
2,yt1s1x,1,0.690964,77,1
3,l2xjde,1,0.485726,40,1
4,oyt4ek,0,0.449283,59,1


In [24]:
present = compare[compare.heart_disease_present == 1]
notPresent = compare[compare.heart_disease_present == 0]


In [49]:
import plotly
import plotly.plotly as py
import plotly.graph_objs as go

plotly.tools.set_credentials_file(username='tango222', api_key='BlQjq4nsuvthlCZY9BPV')

import plotly.plotly as py
import plotly.graph_objs as go

# Create a trace
heart_disease_present = go.Scatter(
    x = present.age,
    y = present.predictions,
    mode = 'markers',
    name = "1",
    marker = dict( size = 10,
            color = '#4da6ff',
            line = dict(width = 1)
    )
)

heart_disease_not_present = go.Scatter(
    x = notPresent.age,
    y = notPresent.predictions,
    mode = 'markers',
    name = "0",
    marker = dict( size = 10,
            color = '#ff8080',
            line = dict(width = 1)
)
)
data1 = [heart_disease_present,heart_disease_not_present]
    
layout1 = dict(title= 'Age v Predicted Probability of Heart Disease',
    hovermode= 'closest',
    xaxis= dict(
        title= 'Age',
    ),
    yaxis=dict(
        title= 'Predicted Probability of Heart Disease',
    )
)

py.iplot(data1, layout = layout1)

In [51]:
# Plot for age v probability of heart disease
heart_disease_present2 = go.Scatter(
    x = present.sex,
    y = present.predictions,
    mode = 'markers', 
    name = "1",
    marker = dict(size= 14,
            color = '#4da6ff',
            line = dict(width = 1),
            opacity = 0.2
    )
)

heart_disease_not_present2 = go.Scatter(
    x = notPresent.sex,
    y = notPresent.predictions,
    mode = 'markers',
    name = '0',
    marker = dict(size= 14,
            color = '#ff8080',
            line = dict(width = 1),
            opacity = 0.2      
)
)
data2 = [heart_disease_present2,heart_disease_not_present2]
    
layout2 = dict(title= 'Sex v Predicted Probability of Heart Disease',
    hovermode= 'closest',
    xaxis= dict(
        title= 'Sex',
    ),
    yaxis=dict(
        title= ' Predicted Probability of Heart Disease',
    )
)

py.iplot(data2, layout = layout2)

In [None]:
# prediction v actual


y0 = present.predictions
y1 = notPresent.predictions

trace1 = go.Box(
    y=y0, 
    name = '1'
)
trace2 = go.Box(
    y=y1,
    name = "0"
)
data4 = [trace1, trace2]
py.iplot(data4)
    
layout3 = dict(title= 'Predicted Probability of Heart Disease Distribution',
    hovermode= 'closest',
)

py.iplot(data4, layout = layout3)    
