<a href="https://colab.research.google.com/github/wh2353/datasciencecamp/blob/main/SpringBoard_DS_track/DataScienceCapstone2/Cardiovascular_modeling_multilayer_perceptron.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cardiovascular Disease Prediction- Multilayer Perceptron network

In [1]:
try:
  %tensorflow_version 2.x # enable TF 2.x in Colab
except Exception:
  pass

`%tensorflow_version` only switches the major version: 1.x or 2.x.
You set: `2.x # enable TF 2.x in Colab`. This will be interpreted as: `2.x`.


TensorFlow 2.x selected.


In [2]:
import tensorflow as tf
print(tf.__version__)

2.7.0


<b>Import modules</b>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
#!pip install keras_tuner
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping 
from tensorflow.keras.optimizers import Adam 
from keras_tuner.tuners import RandomSearch 
import warnings
import sklearn.metrics as metrics
from tensorflow.keras.layers import Dropout
from sklearn.metrics import roc_auc_score, RocCurveDisplay
from sklearn.model_selection import train_test_split
import pickle
warnings.filterwarnings('ignore') # To ignore warnings.


<b>Load train and test data sets</b>

In [3]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
!ls drive/MyDrive/*pkl

Mounted at /content/drive
drive/MyDrive/train_test_data_from_featuretools.pkl
drive/MyDrive/train_test_data_from_preprocessing.pkl


In [4]:
#split train and test data from trimmed dataset


with open("drive/MyDrive/train_test_data_from_preprocessing.pkl", "rb") as f:
#with open("drive/MyDrive/train_test_data_from_featuretools.pkl", "rb") as f:
#with open("drive/MyDrive/train_test_data_from_proprocessing_plus_PCA.pkl", "rb") as f:

  train_data, test_data, train_labels, test_labels = pickle.load(f)
f.close()


print(f"Training data sizes are: {train_data.shape}")
print(f"Training label sizes are: {train_labels.shape}")


print(f"Test data sizes are: {test_data.shape}")
print(f"Test label sizes are: {test_labels.shape}")

Training data sizes are: (54870, 10)
Training label sizes are: (54870,)
Test data sizes are: (13718, 10)
Test label sizes are: (13718,)


<b> Separate validation set from training dataset</b>

In [18]:
valid_sets = np.random.choice(train_data.shape[0], int(train_data.shape[0]*0.2), replace=False)

rest_train = list(set(range(train_data.shape[0])).difference(set(valid_sets)))

rest_train_data = np.array(pd.DataFrame(train_data).iloc[rest_train])
rest_train_labels = np.array(pd.DataFrame(train_labels).iloc[rest_train])

valid_data = np.array(pd.DataFrame(train_data).iloc[valid_sets])
valid_labels = np.array(pd.DataFrame(train_labels).iloc[valid_sets])


print(f"The remaining training set shape is {rest_train_data.shape}")
print(f"The validation set shape is {valid_data.shape}")


The remaining training set shape is (43896, 10)
The validation set shape is (10974, 10)


<b>Define function to build sequenital models, apply random search for hyperparameter tuning</b>

In [22]:


def build_model(hp):
    model = Sequential()
    for i in range(hp.Int('layers', 2, 10)):
        model.add(Dense(units=hp.Int('units_' + str(i),
                                            min_value=100,
                                            max_value=10000,
                                            step=32),
                        input_dim=train_data.shape[1],
                               activation='relu'))
        
    model.add(Dense(1, activation='sigmoid'))

    
    
    
    model.compile(
        optimizer=Adam(
            hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])),
        loss='binary_crossentropy',
        metrics=['accuracy'])
    
    return model





In [23]:
tuner = RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=3,
    executions_per_trial=2,
    overwrite=False,
    directory='drive/MyDrive/',
    project_name='20211206_catboost_Cardiovascular_Prediction_3_trials')
 
tuner.search(rest_train_data,rest_train_labels,batch_size=128,epochs=200,validation_data=(valid_data,valid_labels))



Trial 3 Complete [00h 09m 47s]
val_accuracy: 0.7365591526031494

Best val_accuracy So Far: 0.7365591526031494
Total elapsed time: 01h 25m 14s
INFO:tensorflow:Oracle triggered exit


<b>Obtain the best model based on random search results from tuner</b>

In [24]:
best_model = tuner.get_best_models(num_models=1)[0]

best_model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 5540)              60940     
                                                                 
 dense_1 (Dense)             (None, 804)               4454964   
                                                                 
 dense_2 (Dense)             (None, 1)                 805       
                                                                 
Total params: 4,516,709
Trainable params: 4,516,709
Non-trainable params: 0
_________________________________________________________________


<b>Make prediction on test data with best model, obtain confusion matrix, test accuracy and AUC scores</b>

In [25]:
best_model.evaluate(test_data, test_labels)



[0.5417953729629517, 0.7362589240074158]

In [27]:
#make predictions

result = best_model.predict(test_data) # Prediction using model


#create new labels

new_label = [int(x>=0.5) for x in result]


#Calculate confusion matrix
cnf_matrix= confusion_matrix(test_labels, new_label)
print(f"With Multilayer perceptron network, the confusion matrix is:\n{cnf_matrix}")


#Calculate test accuracy
accuracy = np.sum(new_label - test_labels == 0) / len(result)
print(f"The test accuracy is {accuracy}")


#Calculate auc score
auc_score = roc_auc_score(test_labels, new_label)
print(f"The auc score is {auc_score}")

With Multilayer perceptron network, the confusion matrix is:
[[5517 1479]
 [2139 4583]]
The test accuracy is 0.7362589298731593
The auc score is 0.7351923077904509
