In [1]:
# import packages
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
# packages for building the model
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense
# packages for training model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy
# misc package
import category_encoders as category_encoder

In [2]:
data = pd.read_csv('heart.csv')

In [3]:
data.columns

Index(['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
       'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope',
       'HeartDisease'],
      dtype='object')

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [5]:
# 918 samples, 12 features
data.shape

(918, 12)

In [6]:
# check for missing values - there is no missing data
data.isnull().sum(axis=0)

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [7]:
# check for duplicate values - there is no duplicated data
data.duplicated().sum(axis=0)

0

In [8]:
category_columns = []

# viewing the unique values, number of dimensions, and shape of each column in the data frame
for col in data.columns:
    print(f'{col}')
    print(f'Values: {data[col].unique()}')
    print(f'N-dim: {data[col].ndim}')
    print(f'Shape: {data[col].shape}')
    print('\n')
    
    if data[col].dtype == 'object':
        category_columns.append(col)
category_columns

Age
Values: [40 49 37 48 54 39 45 58 42 38 43 60 36 44 53 52 51 56 41 32 65 35 59 50
 47 31 46 57 55 63 66 34 33 61 29 62 28 30 74 68 72 64 69 67 73 70 77 75
 76 71]
N-dim: 1
Shape: (918,)


Sex
Values: ['M' 'F']
N-dim: 1
Shape: (918,)


ChestPainType
Values: ['ATA' 'NAP' 'ASY' 'TA']
N-dim: 1
Shape: (918,)


RestingBP
Values: [140 160 130 138 150 120 110 136 115 100 124 113 125 145 112 132 118 170
 142 190 135 180 108 155 128 106  92 200 122  98 105 133  95  80 137 185
 165 126 152 116   0 144 154 134 104 139 131 141 178 146 158 123 102  96
 143 172 156 114 127 101 174  94 148 117 192 129 164]
N-dim: 1
Shape: (918,)


Cholesterol
Values: [289 180 283 214 195 339 237 208 207 284 211 164 204 234 273 196 201 248
 267 223 184 288 215 209 260 468 188 518 167 224 172 186 254 306 250 177
 227 230 294 264 259 175 318 216 340 233 205 245 194 270 213 365 342 253
 277 202 297 225 246 412 265 182 218 268 163 529 100 206 238 139 263 291
 229 307 210 329 147  85 269 275 179 392 466 129 241 255 276 2

['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

In [9]:
# labels - 0 does not have heart disease, 1 does have heart disease
data['HeartDisease'].unique()

array([0, 1])

In [10]:
# Convert the nomimnal categorical label values to numerical values

# get_dummies method
mod_data = data.copy()
mod_data = pd.get_dummies(mod_data, columns=category_columns)
mod_data.info(), mod_data.head()

# encoder = category_encoder.BinaryEncoder(cols = category_columns)
# data_mod = data.copy()
# df_category_encorder = encoder.fit_transform(data_mod)
# df_category_encorder

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Age                918 non-null    int64  
 1   RestingBP          918 non-null    int64  
 2   Cholesterol        918 non-null    int64  
 3   FastingBS          918 non-null    int64  
 4   MaxHR              918 non-null    int64  
 5   Oldpeak            918 non-null    float64
 6   HeartDisease       918 non-null    int64  
 7   Sex_F              918 non-null    uint8  
 8   Sex_M              918 non-null    uint8  
 9   ChestPainType_ASY  918 non-null    uint8  
 10  ChestPainType_ATA  918 non-null    uint8  
 11  ChestPainType_NAP  918 non-null    uint8  
 12  ChestPainType_TA   918 non-null    uint8  
 13  RestingECG_LVH     918 non-null    uint8  
 14  RestingECG_Normal  918 non-null    uint8  
 15  RestingECG_ST      918 non-null    uint8  
 16  ExerciseAngina_N   918 non

(None,
    Age  RestingBP  Cholesterol  FastingBS  MaxHR  Oldpeak  HeartDisease  \
 0   40        140          289          0    172      0.0             0   
 1   49        160          180          0    156      1.0             1   
 2   37        130          283          0     98      0.0             0   
 3   48        138          214          0    108      1.5             1   
 4   54        150          195          0    122      0.0             0   
 
    Sex_F  Sex_M  ChestPainType_ASY  ...  ChestPainType_NAP  ChestPainType_TA  \
 0      0      1                  0  ...                  0                 0   
 1      1      0                  0  ...                  1                 0   
 2      0      1                  0  ...                  0                 0   
 3      1      0                  1  ...                  0                 0   
 4      0      1                  0  ...                  1                 0   
 
    RestingECG_LVH  RestingECG_Normal  RestingE

In [11]:
mod_data.columns

Index(['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak',
       'HeartDisease', 'Sex_F', 'Sex_M', 'ChestPainType_ASY',
       'ChestPainType_ATA', 'ChestPainType_NAP', 'ChestPainType_TA',
       'RestingECG_LVH', 'RestingECG_Normal', 'RestingECG_ST',
       'ExerciseAngina_N', 'ExerciseAngina_Y', 'ST_Slope_Down',
       'ST_Slope_Flat', 'ST_Slope_Up'],
      dtype='object')

In [12]:
features = ['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR',
        'Oldpeak', 'Sex_F', 'Sex_M', 'ChestPainType_ASY',
       'ChestPainType_ATA', 'ChestPainType_NAP', 'ChestPainType_TA',
       'RestingECG_LVH', 'RestingECG_Normal', 'RestingECG_ST',
       'ExerciseAngina_N', 'ExerciseAngina_Y', 'ST_Slope_Down',
       'ST_Slope_Flat', 'ST_Slope_Up']
output = ['HeartDisease']

X = np.array(mod_data[features])
y = np.array(mod_data[output])

len(features), X.shape, y.shape

(20, (918, 20), (918, 1))

In [13]:
X.dtype, y.dtype

(dtype('float64'), dtype('int64'))

In [14]:
# split dataframe into train samples/labels and test samples/labels
train_samples, test_samples, train_labels, test_labels = train_test_split(X, y, test_size=0.20)
# convert the dataframes to numpy arrays (tensors)
train_labels = np.array(train_labels)
train_samples = np.array(train_samples)
test_labels = np.array(test_labels)
test_samples = np.array(test_samples)

train_samples.shape, train_labels.shape, len(features)

((734, 20), (734, 1), 20)

In [15]:
# building a Sequential model - linear stack of layers
# init the model
model = Sequential()
# add Dense layers to the models;
# units = number of nodes, input_shape = tensor shape the input layer expect (inits weights); activation - activation function
model.add(Dense(units=16, activation='relu', input_shape=(20, )))
model.add(Dense(units=32, activation='relu'))
# use sigmoid in last layer because it is a binary classification problem
model.add(Dense(units=1, activation='sigmoid'))

In [16]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 16)                336       
_________________________________________________________________
dense_1 (Dense)              (None, 32)                544       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 913
Trainable params: 913
Non-trainable params: 0
_________________________________________________________________


In [17]:
# compilation step - 1) loss function 2) optimizer 3) Metrics to monitor during training and testing
opt = Adam(learning_rate=0.01) # defining the optimizer
model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])

In [18]:
# training the model by fitting the normalized training data
model.fit(x=train_samples, y=train_labels, batch_size=20, epochs=30, verbose=2)

Epoch 1/30
37/37 - 0s - loss: 1.6396 - accuracy: 0.6131
Epoch 2/30
37/37 - 0s - loss: 0.5603 - accuracy: 0.7439
Epoch 3/30
37/37 - 0s - loss: 0.6339 - accuracy: 0.7003
Epoch 4/30
37/37 - 0s - loss: 0.5791 - accuracy: 0.7384
Epoch 5/30
37/37 - 0s - loss: 0.4592 - accuracy: 0.8215
Epoch 6/30
37/37 - 0s - loss: 0.4246 - accuracy: 0.8243
Epoch 7/30
37/37 - 0s - loss: 0.4560 - accuracy: 0.7997
Epoch 8/30
37/37 - 0s - loss: 0.4909 - accuracy: 0.7888
Epoch 9/30
37/37 - 0s - loss: 0.4780 - accuracy: 0.8011
Epoch 10/30
37/37 - 0s - loss: 0.3996 - accuracy: 0.8297
Epoch 11/30
37/37 - 0s - loss: 0.3885 - accuracy: 0.8283
Epoch 12/30
37/37 - 0s - loss: 0.3692 - accuracy: 0.8542
Epoch 13/30
37/37 - 0s - loss: 0.3820 - accuracy: 0.8556
Epoch 14/30
37/37 - 0s - loss: 0.3862 - accuracy: 0.8460
Epoch 15/30
37/37 - 0s - loss: 0.3694 - accuracy: 0.8515
Epoch 16/30
37/37 - 0s - loss: 0.3845 - accuracy: 0.8297
Epoch 17/30
37/37 - 0s - loss: 0.4254 - accuracy: 0.8283
Epoch 18/30
37/37 - 0s - loss: 0.3533 - 

<keras.callbacks.History at 0x7fb75999a160>

In [19]:
# # .iloc[:, 0:11] --> get all rows and only the first 11 columns (the feature columns)
# X = data.iloc[:, 0:11]
# # .iloc[:, -1] --> get all rows and only the last column
# y = data.iloc[:, -1]

# X.shape, y.shape