In [1]:
!pip install scikeras



In [2]:
import pandas as pd

# Location of the cleaned heart-disease table, relative to this notebook.
cleaned_data_path = '../../data/processed/cleaned_heart_data.csv'
data = pd.read_csv(cleaned_data_path)

In [3]:
from sklearn.model_selection import train_test_split

# Fixed seed so the train/test partition is identical on every run.
seed_value = 42

# 'HeartDisease' is the label; every other column is used as a feature.
target_column = 'HeartDisease'
y = data[target_column]
X = data.drop(columns=target_column)

# The data is already pre-processed, so the only remaining step is to hold out
# 20% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=seed_value,
)

In [4]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Input, BatchNormalization
from keras.regularizers import l2

# MLP model creation
def create_model(neurons=64, activation='relu', optimizer='adam', input_shape=None):
    """Build and compile a small MLP for binary classification.

    Architecture: Input -> Dense(neurons) -> BatchNormalization ->
    Dense(neurons // 2) -> Dropout(0.3) -> Dense(1, sigmoid).

    Args:
        neurons: Units in the first hidden layer; the second hidden layer
            gets ``neurons // 2``.
        activation: Activation applied in both hidden layers.
        optimizer: Optimizer passed to ``model.compile``.
        input_shape: Shape of a single input sample. When ``None`` (the
            default) it is derived from the notebook-level ``X_train`` at
            call time, so the default follows ``X_train`` if it is rebuilt
            instead of being frozen when this function was defined.

    Returns:
        The compiled ``Sequential`` model (binary cross-entropy loss,
        accuracy metric).
    """
    if input_shape is None:
        # Resolved lazily: a default expression in the signature would be
        # evaluated only once, at definition time.
        input_shape = (X_train.shape[1],)

    model = Sequential([
        Input(shape=input_shape),
        Dense(neurons, activation=activation),
        BatchNormalization(),
        Dense(neurons // 2, activation=activation),  # Half neurons in the second layer
        Dropout(0.3),
        Dense(1, activation='sigmoid'),  # Output layer for binary classification
    ])

    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [5]:
from sklearn.model_selection import GridSearchCV
from scikeras.wrappers import KerasClassifier


# Wrap the model factory in a KerasClassifier so it can be handed to GridSearchCV.
model = KerasClassifier(model=create_model, verbose=0)

# Hyperparameter grid explored by the search.
# NOTE(review): create_model compiles the network itself, so it is worth
# confirming that the un-prefixed `optimizer` entry actually reaches it
# (SciKeras may need `model__optimizer` for that) - TODO verify against the
# SciKeras parameter-routing docs before trusting the optimizer comparison.
param_grid = dict(
    model__neurons=[64, 128],
    model__activation=['relu', 'leaky_relu', 'tanh'],
    optimizer=['adam', 'rmsprop'],
    batch_size=[16, 32],
    epochs=[50, 20],
)

# 3-fold cross-validated grid search scored on accuracy; error_score='raise'
# makes a failing fit raise instead of being recorded as a score.
grid = GridSearchCV(
    model,
    param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    error_score='raise',
)

# Run the search on the training split only.
grid_result = grid.fit(X_train, y_train)

# Report the winning configuration and its cross-validated score.
print(f"Best Parameters: {grid_result.best_params_}")
print(f"Best Accuracy: {grid_result.best_score_:.4f}")



Best Parameters: {'batch_size': 32, 'epochs': 50, 'model__activation': 'leaky_relu', 'model__neurons': 128, 'optimizer': 'adam'}
Best Accuracy: 0.8440


In [6]:

# Take the fitted Keras network out of the best estimator found by the search,
# show its architecture, and score it on the held-out test split.
best_model = grid_result.best_estimator_.model_
best_model.summary()

test_metrics = best_model.evaluate(X_test, y_test)
loss, accuracy = test_metrics

print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 940us/step - accuracy: 0.9051 - loss: 0.2587
Test Loss: 0.2634
Test Accuracy: 0.9067


In [7]:
# Feature groups:
#   baseline    - 'Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS', 'RestingECG'
#   stress test - 'MaxHR', 'Oldpeak', 'ExerciseAngina', 'ST_Slope'

# Columns belonging to the stress-test group; dropping them from both splits
# leaves only the baseline features.
stress_test_columns = ['MaxHR', 'Oldpeak', 'ExerciseAngina_Y', 'ST_Slope_Flat', 'ST_Slope_Up']
X_train_reduced = X_train.drop(columns=stress_test_columns)
X_test_reduced = X_test.drop(columns=stress_test_columns)


In [8]:
# Preview the first five rows of the reduced training features.
X_train_reduced.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST
70,57,140,265,0,True,True,False,False,False,True
164,52,140,225,0,False,True,False,False,True,False
710,56,140,294,0,False,True,False,False,False,False
265,54,160,305,0,True,True,False,False,True,False
250,44,135,491,0,True,False,False,False,True,False


In [9]:
# Preview the first five rows of the reduced test features.
X_test_reduced.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST
208,28,130,132,0,True,True,False,False,False,False
259,55,122,320,0,False,True,False,False,True,False
97,39,160,147,1,True,False,True,False,True,False
148,50,120,168,0,True,True,False,False,True,False
395,71,130,221,0,True,False,False,False,False,True


In [10]:
# Two experiments follow.
#
# Experiment 1: re-train the best configuration on the reduced data only.
# A fresh model is built from the hyperparameters chosen by the grid search;
# the only difference is the input width, which matches the reduced features.
best_params = grid_result.best_params_
best_model_reduced = create_model(
    input_shape=(X_train_reduced.shape[1],),
    neurons=best_params.get('model__neurons'),
    activation=best_params.get('model__activation'),
    optimizer=best_params.get('optimizer'),
)

best_model_reduced.summary()

In [11]:
import numpy as np

# Cast both reduced splits to float32 before they are passed to Keras directly.
X_train_reduced, X_test_reduced = (
    frame.astype(np.float32) for frame in (X_train_reduced, X_test_reduced)
)

In [12]:
# Train with the epoch count and batch size selected by the grid search.
# The test split is passed as validation data, so the per-epoch val_* numbers
# in the log are measured on the test split.
fit_settings = {
    'epochs': grid_result.best_params_.get('epochs'),
    'batch_size': grid_result.best_params_.get('batch_size'),
    'validation_data': (X_test_reduced, y_test),
    'verbose': 1,
}
best_model_reduced.fit(X_train_reduced, y_train, **fit_settings)

Epoch 1/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.4958 - loss: 0.7864 - val_accuracy: 0.5267 - val_loss: 6.5170
Epoch 2/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6253 - loss: 0.6914 - val_accuracy: 0.5267 - val_loss: 4.4529
Epoch 3/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6223 - loss: 0.7467 - val_accuracy: 0.5267 - val_loss: 1.5802
Epoch 4/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6338 - loss: 0.6523 - val_accuracy: 0.5267 - val_loss: 0.9714
Epoch 5/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6310 - loss: 0.6706 - val_accuracy: 0.5200 - val_loss: 0.8898
Epoch 6/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6225 - loss: 0.6887 - val_accuracy: 0.5267 - val_loss: 1.1304
Epoch 7/50
[1m19/19[0m [32m━━━━━━━━━━

<keras.src.callbacks.history.History at 0x30e221e80>

In [14]:
# Score the re-trained reduced-feature model on the reduced test split.
reduced_metrics = best_model_reduced.evaluate(X_test_reduced, y_test)
loss, accuracy = reduced_metrics

print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5118 - loss: 1.4746 
Test Loss: 1.5273
Test Accuracy: 0.5067


In [18]:
# Experiment 2: repeat the hyperparameter search on the reduced features.
# The wrapper is given the reduced input width so the model factory does not
# fall back to its default input shape.
reduced_input_shape = (X_train_reduced.shape[1],)
model_reduced = KerasClassifier(model=create_model, verbose=0, input_shape=reduced_input_shape)

# Same grid and search settings as the full-feature search.
grid_reduced = GridSearchCV(
    model_reduced,
    param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    error_score='raise',
)

# Run the search on the reduced training split.
grid_reduced_result = grid_reduced.fit(X_train_reduced, y_train)

# Report the winning configuration and its cross-validated score.
print(f"Best Parameters: {grid_reduced_result.best_params_}")
print(f"Best Accuracy: {grid_reduced_result.best_score_:.4f}")



Best Parameters: {'batch_size': 32, 'epochs': 50, 'model__activation': 'relu', 'model__neurons': 128, 'optimizer': 'rmsprop'}
Best Accuracy: 0.7248


In [20]:
# Take the fitted Keras network out of the best reduced-feature estimator,
# show its architecture, and score it on the reduced test split.
best_model_reduced_grid = grid_reduced_result.best_estimator_.model_
best_model_reduced_grid.summary()

reduced_grid_metrics = best_model_reduced_grid.evaluate(X_test_reduced, y_test)
loss, accuracy = reduced_grid_metrics

print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 966us/step - accuracy: 0.6324 - loss: 0.7160
Test Loss: 0.7658
Test Accuracy: 0.6200
