In [2]:
%matplotlib inline

from pathlib import Path

#for data wrangling
import numpy as np
import pandas as pd

#for graphing
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
from warnings import filterwarnings
filterwarnings('ignore')

In [3]:
# for training our model
X = pd.read_csv('./data/train_values.csv', index_col='patient_id')
y = pd.read_csv('./data/train_labels.csv', index_col='patient_id').heart_disease_present

test = pd.read_csv('./data/test_values.csv', index_col='patient_id')

In [4]:
pd.isnull(X).any()
X.dtypes

slope_of_peak_exercise_st_segment         int64
thal                                     object
resting_blood_pressure                    int64
chest_pain_type                           int64
num_major_vessels                         int64
fasting_blood_sugar_gt_120_mg_per_dl      int64
resting_ekg_results                       int64
serum_cholesterol_mg_per_dl               int64
oldpeak_eq_st_depression                float64
sex                                       int64
age                                       int64
max_heart_rate_achieved                   int64
exercise_induced_angina                   int64
dtype: object

In [5]:
# Change data type of 'thal' column
X['thal'] = X['thal'].apply(lambda thal: 0 if thal == 'normal' else 1 if thal == 'reversible_defect' else 2)
test['thal'] = test['thal'].apply(lambda thal: 0 if thal == 'normal' else 1 if thal == 'reversible_defect' else 2)


# MLP Regressor


In [6]:
from sklearn.preprocessing import MinMaxScaler       # scaling data
from sklearn.model_selection import train_test_split # splitting data
from sklearn.model_selection import GridSearchCV     # for grid search
from sklearn.pipeline import make_pipeline           # for making pipelines

scaler = MinMaxScaler()
test_values = scaler.fit_transform(test)
X_values= scaler.fit_transform(X)


#splitting data
train_features, test_features, train_outcome, test_outcome = train_test_split(
    X,
    y,
    test_size = 0.3,
    random_state = 11
)


In [7]:
from sklearn.neural_network import MLPRegressor

clf = MLPRegressor(solver='lbfgs')

pipe = make_pipeline(scaler, clf)

params = {'learning_rate': ["constant", "invscaling", "adaptive"] }

gs = GridSearchCV(clf,param_grid=params,n_jobs=-1,verbose=2,cv=10)

gs.fit(train_features, train_outcome)

Fitting 10 folds for each of 3 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    3.9s finished


GridSearchCV(cv=10, error_score='raise',
       estimator=MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='lbfgs', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'learning_rate': ['constant', 'invscaling', 'adaptive']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [8]:
gs.score(test_features, test_outcome)

-0.1249593931216979

In [9]:
pred = gs.predict(test_features).astype(np.float64)

In [11]:
pred2 = gs.predict(X).astype(np.float64)
len(pred2)

180

In [12]:
predictions = gs.predict(test).astype(np.float64)

In [13]:
submission_format = pd.read_csv('data/submission_format.csv')
submission_format = submission_format.drop(['heart_disease_present'], axis = 1)
submission_format['heart_disease_present'] = pd.Series(predictions)
submission_format.to_csv('mlpregressor.csv',index=False)

# Graphs


In [15]:
a = pd.read_csv('./data/train_values.csv')
compare = pd.read_csv('./data/train_labels.csv')
len(X.age)


180

In [16]:
compare['predictions'] = pred2
compare['age'] = a.age

In [17]:
compare['sex'] = a.sex

In [18]:
present = compare[compare.heart_disease_present == 1]
notPresent = compare[compare.heart_disease_present == 0]

In [20]:
import plotly
import plotly.plotly as py
import plotly.graph_objs as go

plotly.tools.set_credentials_file(username='tango222', api_key='BlQjq4nsuvthlCZY9BPV')

import plotly.plotly as py
import plotly.graph_objs as go

# Create a trace
heart_disease_present = go.Scatter(
    x = present.age,
    y = present.predictions,
    mode = 'markers',
    name = "1",
    marker = dict( size = 10,
            color = '#4da6ff',
            line = dict(width = 1)
    )
)

heart_disease_not_present = go.Scatter(
    x = notPresent.age,
    y = notPresent.predictions,
    mode = 'markers',
    name = "0",
    marker = dict( size = 10,
            color = '#ff8080',
            line = dict(width = 1)
)
)
data1 = [heart_disease_present,heart_disease_not_present]
    
layout1 = dict(title= 'Age v Predicted Probability of Heart Disease',
    hovermode= 'closest',
    xaxis= dict(
        title= 'Age',
    ),
    yaxis=dict(
        title= 'Predicted Probability of Heart Disease',
    )
)

py.iplot(data1, layout = layout1)

In [21]:
# Plot for age v probability of heart disease
heart_disease_present2 = go.Scatter(
    x = present.sex,
    y = present.predictions,
    mode = 'markers', 
    name = "1",
    marker = dict(size= 14,
            color = '#4da6ff',
            line = dict(width = 1),
            opacity = 0.2
    )
)

heart_disease_not_present2 = go.Scatter(
    x = notPresent.sex,
    y = notPresent.predictions,
    mode = 'markers',
    name = '0',
    marker = dict(size= 14,
            color = '#ff8080',
            line = dict(width = 1),
            opacity = 0.2      
)
)
data2 = [heart_disease_present2,heart_disease_not_present2]
    
layout2 = dict(title= 'Sex v Predicted Probability of Heart Disease',
    hovermode= 'closest',
    xaxis= dict(
        title= 'Sex',
    ),
    yaxis=dict(
        title= ' Predicted Probability of Heart Disease',
    )
)


py.iplot(data2, layout = layout2)

In [22]:
# prediction v actual


y0 = present.predictions
y1 = notPresent.predictions

trace1 = go.Box(
    y=y0, 
    name = '1'
)
trace2 = go.Box(
    y=y1,
    name = "0"
)
data4 = [trace1, trace2]
py.iplot(data4)
    
layout3 = dict(title= 'Predicted Probability of Heart Disease Distribution',
    hovermode= 'closest',
)

py.iplot(data4, layout = layout3) 