In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
# Relative path to the CSV file that is read into `df` in the next cell.
data_path = '../data/heart.csv'

In [3]:
# Load the raw dataset from `data_path` into a DataFrame.
df = pd.read_csv(data_path)

In [4]:
# Preview the first five rows to sanity-check the load.
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [5]:
# (rows, columns) of the raw frame.
df.shape

(918, 12)

In [6]:
# Summary statistics for the numeric columns.
# NOTE(review): the summary reports a minimum of 0 for RestingBP and
# Cholesterol -- possibly placeholders for missing measurements; confirm
# before modelling.
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [7]:
# Per-column dtypes: shows which columns are numeric and which are `object`.
df.dtypes

Age                 int64
Sex                object
ChestPainType      object
RestingBP           int64
Cholesterol         int64
FastingBS           int64
RestingECG         object
MaxHR               int64
ExerciseAngina     object
Oldpeak           float64
ST_Slope           object
HeartDisease        int64
dtype: object

### Separating the target from the independent variables

In [6]:
# Feature frame: every column except the label.
data = df.drop(columns=['HeartDisease'])
# Label vector as a plain NumPy array.
target = df['HeartDisease'].to_numpy()

In [7]:
# Independent copy of the feature frame (DataFrame.copy() is deep by default).
d_copy = data.copy()

In [10]:
def continuous_categorical(data):
    """Split a DataFrame into its numeric and non-numeric columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to partition. It is not modified.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(numeric, non_numeric)``. Every column of ``data`` ends up in
        exactly one of the two frames, and the original column order is
        preserved within each.
    """
    # select_dtypes('number') matches every integer/float width (int32,
    # float32, uint8, ...) rather than only the exact dtypes int64/float64,
    # so narrower numeric columns are no longer misfiled as categorical.
    # Boolean and object columns stay on the non-numeric side, as before.
    numeric = data.select_dtypes(include='number')
    non_numeric = data.select_dtypes(exclude='number')
    return numeric, non_numeric

In [11]:
# Partition the features: df_int holds the int64/float64 columns, df_str the rest.
df_int, df_str = continuous_categorical(d_copy)

In [12]:
# Inspect the numeric feature columns.
df_int

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak
0,40,140,289,0,172,0.0
1,49,160,180,0,156,1.0
2,37,130,283,0,98,0.0
3,48,138,214,0,108,1.5
4,54,150,195,0,122,0.0
...,...,...,...,...,...,...
913,45,110,264,0,132,1.2
914,68,144,193,1,141,3.4
915,57,130,131,0,115,1.2
916,57,130,236,0,174,0.0


In [13]:
# Inspect the non-numeric (string-valued) feature columns.
df_str

Unnamed: 0,Sex,ChestPainType,RestingECG,ExerciseAngina,ST_Slope
0,M,ATA,Normal,N,Up
1,F,NAP,Normal,N,Flat
2,M,ATA,ST,N,Up
3,F,ASY,Normal,Y,Flat
4,M,NAP,Normal,N,Up
...,...,...,...,...,...
913,M,TA,Normal,N,Flat
914,M,ASY,Normal,N,Flat
915,M,ASY,Normal,Y,Flat
916,F,ATA,LVH,N,Flat


In [27]:
# Report how many distinct values each non-numeric column holds.
for c in df_str.columns:
    n_levels = df_str[c].nunique(dropna=False)  # same count as len(unique())
    print('{}---> {}'.format(c, n_levels))

Sex---> 2
ChestPainType---> 4
RestingECG---> 3
ExerciseAngina---> 2
ST_Slope---> 3


## Based on the unique-value counts in `df_str`, the columns with exactly 2 unique values can each be converted into a single binary (0/1) column, and the remaining multi-category columns can be one-hot encoded with `OneHotEncoder`

In [39]:
# Partition the non-numeric columns by cardinality: exactly two distinct
# values -> bin_col (and echo those values), anything else -> mult_catcol.
bin_col, mult_catcol = [], []
for c in df_str.columns:
    levels = df_str[c].unique()
    if len(levels) != 2:
        mult_catcol.append(c)
        continue
    print('{}---> {}'.format(c, levels))
    bin_col.append(c)

Sex---> ['M' 'F']
ExerciseAngina---> ['N' 'Y']


In [40]:
# Non-numeric columns whose number of distinct values is not exactly two.
mult_catcol

['ChestPainType', 'RestingECG', 'ST_Slope']

In [31]:
# Fresh copy of the feature frame; the binary columns are recoded on this copy.
dd = data.copy()

In [35]:
# Encode Sex as 1 (M) / 0 (F). The mapping reads from the untouched `data`
# frame rather than from `dd` itself, so re-running this cell is idempotent:
# Series.map turns values missing from the dict into NaN, and a second
# dd -> dd pass would therefore wipe the column.
dd['Sex'] = data['Sex'].map({'M': 1, 'F': 0})

In [37]:
# Encode ExerciseAngina as 1 (Y) / 0 (N). The mapping reads from the
# untouched `data` frame rather than from `dd` itself, so re-running this cell
# is idempotent: Series.map turns values missing from the dict into NaN, and a
# second dd -> dd pass would therefore wipe the column.
dd['ExerciseAngina'] = data['ExerciseAngina'].map({'Y': 1, 'N': 0})

In [38]:
# Check the recoding: Sex and ExerciseAngina should now hold 0/1.
dd.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,40,1,ATA,140,289,0,Normal,172,0,0.0,Up
1,49,0,NAP,160,180,0,Normal,156,0,1.0,Flat
2,37,1,ATA,130,283,0,ST,98,0,0.0,Up
3,48,0,ASY,138,214,0,Normal,108,1,1.5,Flat
4,54,1,NAP,150,195,0,Normal,122,0,0.0,Up


In [42]:
from sklearn.preprocessing import OneHotEncoder

In [56]:
# Fit a one-hot encoder on all non-numeric columns (the two-valued ones
# included); drop='first' omits the first category of each feature.
# NOTE(review): in the visible cells the encoded matrix is only displayed,
# never used further, and a later cell builds dummies with pd.get_dummies --
# consider keeping just one of the two approaches.
oh = OneHotEncoder(drop='first').fit(df_str)

In [57]:
# Encode df_str with the fitted encoder.
oh_str = oh.transform(df_str)

In [59]:
# Convert the encoder output to a dense array for display.
oh_str.toarray()

array([[1., 1., 0., ..., 0., 0., 1.],
       [0., 0., 1., ..., 0., 1., 0.],
       [1., 1., 0., ..., 0., 0., 1.],
       ...,
       [1., 0., 0., ..., 1., 1., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       [1., 0., 1., ..., 0., 0., 1.]])

In [61]:
# One-hot encode every non-numeric column of the feature frame, dropping the
# first level of each; numeric columns pass through unchanged, so despite its
# name `df_strenc` is the full encoded feature matrix.
# dtype=int pins the indicator columns to 0/1 integers: the default indicator
# dtype depends on the pandas version (bool from pandas 2.0), which would
# otherwise yield True/False instead of the 0/1 values recorded below.
df_strenc = pd.get_dummies(d_copy, drop_first=True, dtype=int)

In [62]:
# Inspect the encoded frame: original numeric columns plus the dummy columns.
df_strenc

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,1,1,0,0,1,0,0,0,1
1,49,160,180,0,156,1.0,0,0,1,0,1,0,0,1,0
2,37,130,283,0,98,0.0,1,1,0,0,0,1,0,0,1
3,48,138,214,0,108,1.5,0,0,0,0,1,0,1,1,0
4,54,150,195,0,122,0.0,1,0,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,110,264,0,132,1.2,1,0,0,1,1,0,0,1,0
914,68,144,193,1,141,3.4,1,0,0,0,1,0,0,1,0
915,57,130,131,0,115,1.2,1,0,0,0,1,0,1,1,0
916,57,130,236,0,174,0.0,0,1,0,0,0,0,0,1,0


In [64]:
# Distinct-value count for each numeric column, printed in column order.
for c in df_int.columns:
    n_distinct = df_int[c].nunique(dropna=False)  # same count as len(unique())
    print(n_distinct)

50
67
222
2
119
53
