In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

##### Load datasets

Attributes:
1. age: age in years
2. sex: sex (1 = male; 0 = female)
3. cp: chest pain type
        -- Value 1: typical angina
        -- Value 2: atypical angina
        -- Value 3: non-anginal pain
        -- Value 4: asymptomatic
4. trestbps: resting blood pressure (in mm Hg on admission to the hospital)
5. chol: serum cholestoral in mg/dl
6. fbs: (fasting blood sugar > 120 mg/dl)  (1 = true; 0 = false)
7. restecg: resting electrocardiographic results
        -- Value 0: normal
        -- Value 1: having ST-T wave abnormality (T wave inversions and/or ST 
                    elevation or depression of > 0.05 mV)
        -- Value 2: showing probable or definite left ventricular hypertrophy
                    by Estes' criteria
8. thalach: maximum heart rate achieved
9. exang: exercise induced angina (1 = yes; 0 = no)
10. oldpeak: ST depression induced by exercise relative to rest
11. slope: the slope of the peak exercise ST segment
        -- Value 1: upsloping
        -- Value 2: flat
        -- Value 3: downsloping
12. ca: number of major vessels (0-3) colored by flourosopy
13. thal: 3 = normal; 6 = fixed defect; 7 = reversable defect
14. num: the predicted attribute: heart disease presence (values 1,2,3,4) and absence (value 0).  

In [17]:
cleveland = pd.read_csv('processed.cleveland.data', 
            names=['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope',
                  'ca', 'thal', 'num'])

In [18]:
hungarian = pd.read_csv('processed.hungarian.data', 
            names=['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope',
                  'ca', 'thal', 'num'])

In [19]:
switzerland = pd.read_csv('processed.switzerland.data', 
            names=['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope',
                  'ca', 'thal', 'num'])

In [20]:
va = pd.read_csv('processed.va.data', 
            names=['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope',
                  'ca', 'thal', 'num'])

In [167]:
data = pd.concat([cleveland, hungarian, switzerland, va])

In [168]:
data.shape

(920, 14)

In [169]:
data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,54.0,0.0,4.0,127,333,1,1,154,0,0,?,?,?,1
196,62.0,1.0,1.0,?,139,0,1,?,?,?,?,?,?,0
197,55.0,1.0,4.0,122,223,1,1,100,0,0,?,?,6,2
198,58.0,1.0,4.0,?,385,1,2,?,?,?,?,?,?,0


#### Preprocess

In [170]:
from scipy import stats

In [171]:
data.trestbps = data.trestbps.map(lambda x: x if x!="?" else np.nan)
data.trestbps = data.trestbps.astype('float64')
data.trestbps.fillna(data.trestbps.mean(), inplace=True)

In [173]:
data.chol = data.chol.map(lambda x: x if x!="?" else np.nan)
data.chol = data.chol.astype('float64')
data.chol.fillna(data.chol.mean(), inplace=True)

In [175]:
data.fbs = data.fbs.map(lambda x: x if x!="?" else np.nan)
data.fbs = data.fbs.astype('float64')
data.fbs.fillna(data.fbs.mode().values[0], inplace=True)

In [177]:
data.restecg = data.restecg.map(lambda x: x if x!="?" else np.nan)
data.restecg = data.restecg.astype('float64') 
data.restecg.fillna(data.restecg.mode().values[0], inplace=True)

In [179]:
data.thalach = data.thalach.map(lambda x: x if x!="?" else np.nan)
data.thalach = data.thalach.astype('float64')
data.thalach.fillna(data.thalach.median(), inplace=True)

In [182]:
data.exang = data.exang.map(lambda x: x if x!="?" else np.nan)
data.exang = data.exang.astype('float64')
data.exang.fillna(data.exang.mode().values[0], inplace=True)

In [183]:
data.oldpeak = data.oldpeak.map(lambda x: x if x!="?" else np.nan)
data.oldpeak = data.oldpeak.astype('float64')
data.oldpeak.fillna(data.oldpeak.mode().values[0], inplace=True)


In [187]:
data.slope = data.slope.map(lambda x: x if x!="?" else np.nan)
data.slope = data.slope.astype('float64')
data.slope.fillna(data.slope.mode().values[0], inplace=True)

In [192]:
data.ca = data.ca.map(lambda x: x if x!="?" else np.nan)
data.ca = data.ca.astype('float64')
data.ca.fillna(data.ca.mode().values[0], inplace=True)

In [199]:
data.thal = data.thal.map(lambda x: x if x!="?" else np.nan)
data.thal = data.thal.astype('float64')
data.thal.fillna(data.thal.mode().values[0], inplace=True)

In [206]:
data.age = data.age.astype('int32')
data.sex = data.sex.astype('int32') 
data.cp = data.cp.astype('int32')
data.restecg = data.restecg.astype('int32') 
data.thalach = data.thalach.astype('int32') 
data.exang = data.exang.astype('int32')
data.fps = data.fbs.astype('int32')
data.slope = data.slope.astype('int32')
data.ca = data.ca.astype('int32')
data.thal = data.thal.astype('int32')
data.num = data.num.astype('int32') 

In [209]:
data.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
count,920.0,920.0,920.0,920.0,920.0,920.0,920.0,920.0,920.0,920.0,920.0,920.0,920.0,920.0
mean,53.51087,0.78913,3.25,132.132404,199.130337,0.15,0.603261,137.692391,0.366304,0.819565,1.847826,0.227174,3.984783,0.995652
std,9.424685,0.408148,0.930969,18.443895,108.957634,0.357266,0.805443,25.145235,0.482056,1.076582,0.516007,0.628936,1.68,1.142693
min,28.0,0.0,1.0,0.0,0.0,0.0,0.0,60.0,0.0,-2.6,1.0,0.0,3.0,0.0
25%,47.0,1.0,3.0,120.0,177.75,0.0,0.0,120.0,0.0,0.0,2.0,0.0,3.0,0.0
50%,54.0,1.0,4.0,130.0,221.0,0.0,0.0,140.0,0.0,0.2,2.0,0.0,3.0,1.0
75%,60.0,1.0,4.0,140.0,267.0,0.0,1.0,156.0,1.0,1.5,2.0,0.0,6.0,2.0
max,77.0,1.0,4.0,200.0,603.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0,7.0,4.0
