## Cardiovascular Disease Prediction

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Import, clean and tidy data

In [44]:
# Load the data file, discard incomplete rows and turn the diagnosis code
# into a boolean target, all in a single chain.
patients = (
    pd.read_csv(
        './data/processed.cleveland.data',
        header=None,  # no row is used as a header; labels come from `names`
        names=[
            "age",
            "sex",      # female (0), male (1)
            "c_pain",   # chest pain type:
                        #   1 = typical angina
                        #   2 = atypical angina
                        #   3 = non-anginal pain
                        #   4 = asymptomatic
            "b_press",  # resting blood pressure upon admission [mmHg]
            "choles",   # serum cholesterol [mg/dl]
            "b_sugar",  # fasting blood sugar:
                        #   0 = <= 120 mg/dl
                        #   1 = >  120 mg/dl
            "ecg",      # resting electrocardiograph results:
                        #   0 = normal
                        #   1 = ST-T wave abnormality
                        #   2 = left ventricular hypertrophy
            "h_rate",   # maximum heart rate
            "angina",   # exercise induced angina: false (0), true (1)
            "s_dep",    # ST depression induced by exercise relative to rest
            "s_slope",  # peak exercise ST element slope: up (1), flat (2), down (3)
            "b_vess",   # number of major vessels (0-3) colored by fluoroscope
            "thal",     # normal (3), fixed defect (6), reversible defect (7)
            "diag",     # diagnosis of heart disease (angiographic disease status)
        ],
        na_values=["?"],  # "?" entries are read as NaN
    )
    # discard every observation that has at least one missing value
    .dropna()
    # boolean target: any non-zero diagnosis code becomes True
    .assign(diag=lambda frame: frame.diag.astype(bool))
)

patients

Unnamed: 0,age,sex,c_pain,b_press,choles,b_sugar,ecg,h_rate,angina,s_dep,s_slope,b_vess,thal,diag
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,False
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,True
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,True
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,False
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,57.0,0.0,4.0,140.0,241.0,0.0,0.0,123.0,1.0,0.2,2.0,0.0,7.0,True
298,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,True
299,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,True
300,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,True


### Correlation matrix

In [45]:
# Pairwise correlations among the predictor columns only (target `diag` left out).
patients.drop('diag', axis = 'columns').corr()

Unnamed: 0,age,sex,c_pain,b_press,choles,b_sugar,ecg,h_rate,angina,s_dep,s_slope,b_vess,thal
age,1.0,-0.092399,0.110471,0.290476,0.202644,0.132062,0.149917,-0.394563,0.096489,0.197123,0.159405,0.36221,0.126586
sex,-0.092399,1.0,0.008908,-0.06634,-0.198089,0.03885,0.033897,-0.060496,0.143581,0.106567,0.033345,0.091925,0.383652
c_pain,0.110471,0.008908,1.0,-0.03698,0.072088,-0.057663,0.063905,-0.339308,0.377525,0.203244,0.151079,0.235644,0.2685
b_press,0.290476,-0.06634,-0.03698,1.0,0.131536,0.18086,0.149242,-0.049108,0.066691,0.191243,0.121172,0.097954,0.138183
choles,0.202644,-0.198089,0.072088,0.131536,1.0,0.012708,0.165046,-7.5e-05,0.059339,0.038596,-0.009215,0.115945,0.010859
b_sugar,0.132062,0.03885,-0.057663,0.18086,0.012708,1.0,0.068831,-0.007842,-0.000893,0.008311,0.047819,0.152086,0.062209
ecg,0.149917,0.033897,0.063905,0.149242,0.165046,0.068831,1.0,-0.07229,0.081874,0.113726,0.135141,0.129021,0.018795
h_rate,-0.394563,-0.060496,-0.339308,-0.049108,-7.5e-05,-0.007842,-0.07229,1.0,-0.384368,-0.34764,-0.389307,-0.268727,-0.274831
angina,0.096489,0.143581,0.377525,0.066691,0.059339,-0.000893,0.081874,-0.384368,1.0,0.28931,0.250572,0.148232,0.326927
s_dep,0.197123,0.106567,0.203244,0.191243,0.038596,0.008311,0.113726,-0.34764,0.28931,1.0,0.579037,0.294452,0.344976


In [46]:
# Share of observations whose diagnosis is non-zero (i.e. the positive class).
patients['diag'].ne(0).mean()

0.4612794612794613

### Pre-processing

In [47]:
from sklearn.model_selection import train_test_split

In [48]:
# Hold out 20% of the observations for testing; the fixed seed makes the
# shuffle (and therefore the split) repeatable across runs.
train_data, test_data = train_test_split(
    patients,
    test_size = 0.2,
    random_state = 618,
)

In [51]:
# Peek at the first five rows of the training partition.
train_data.head(n = 5)

Unnamed: 0,age,sex,c_pain,b_press,choles,b_sugar,ecg,h_rate,angina,s_dep,s_slope,b_vess,thal,diag
37,57.0,1.0,4.0,150.0,276.0,0.0,2.0,112.0,1.0,0.6,2.0,1.0,6.0,True
152,67.0,0.0,3.0,115.0,564.0,0.0,2.0,160.0,0.0,1.6,2.0,0.0,7.0,False
11,56.0,0.0,2.0,140.0,294.0,0.0,2.0,153.0,0.0,1.3,2.0,0.0,3.0,False
33,59.0,1.0,4.0,135.0,234.0,0.0,0.0,161.0,0.0,0.5,2.0,0.0,7.0,False
141,59.0,1.0,1.0,170.0,288.0,0.0,2.0,159.0,0.0,0.2,2.0,0.0,7.0,True


In [52]:
# Peek at the first five rows of the held-out test partition.
test_data.head(n = 5)

Unnamed: 0,age,sex,c_pain,b_press,choles,b_sugar,ecg,h_rate,angina,s_dep,s_slope,b_vess,thal,diag
118,63.0,1.0,4.0,130.0,330.0,1.0,2.0,132.0,1.0,1.8,1.0,3.0,7.0,True
109,39.0,1.0,4.0,118.0,219.0,0.0,0.0,140.0,0.0,1.2,2.0,0.0,7.0,True
291,55.0,0.0,2.0,132.0,342.0,0.0,0.0,166.0,0.0,1.2,1.0,0.0,3.0,False
214,52.0,1.0,4.0,112.0,230.0,0.0,0.0,160.0,0.0,0.0,1.0,1.0,3.0,True
171,53.0,1.0,4.0,142.0,226.0,0.0,2.0,111.0,1.0,0.0,1.0,0.0,7.0,False
