<a href="https://colab.research.google.com/github/supertime1/BP_PPG/blob/master/BP_PPG_Class.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Introduction

This notebook trains a simple PPG DNN using labeled PPG data from the Afib_Data_Clean notebook.
The loaded data consists of 30 s segmented PPG signals sampled at 125 Hz.

# 2. Setup Environment



In [1]:
from IPython.display import display
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
%load_ext tensorboard
import numpy as np
import os
import shutil
import glob
import wfdb
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Model
from tensorflow.keras.models import load_model 
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint
from tensorflow.keras.layers import Conv1D, BatchNormalization,\
MaxPooling1D,Dropout,Flatten,TimeDistributed,Bidirectional,Dense,LSTM 
import tensorflow_datasets as tfds
import multiprocessing
from datetime import datetime
import sklearn.metrics
import itertools
import io
import pickle
print(tf.__version__)

2.1.0


# 3. Data Pipeline

In [0]:
#load the data filename
# glob.glob returns paths in arbitrary (filesystem-dependent) order, so both lists
# are sorted to keep data file i paired with label file i.
train_data_dir = r"C:\Users\57lzhang.US04WW4008\Desktop\Blood pressure\BP data\train25\data*"
train_data_fn = sorted(glob.glob(train_data_dir))
train_label_dir = r"C:\Users\57lzhang.US04WW4008\Desktop\Blood pressure\BP data\train25\label*"
train_label_fn = sorted(glob.glob(train_label_dir))

In [0]:
#run assert to make sure the data and label are in the same order
# Every data file must have exactly one label file at the same list position.
assert len(train_data_fn) == len(train_label_fn), \
    "number of training data files and label files differ"
for data_path, label_path in zip(train_data_fn, train_label_fn):
  # Compare everything after the "data"/"label" filename prefix (the prefixes used
  # by the glob patterns), not just the last character, so e.g. "data10" is never
  # accepted as a match for "label20".
  data_id = os.path.basename(data_path)[len("data"):]
  label_id = os.path.basename(label_path)[len("label"):]
  assert data_id == label_id, \
      "data/label mismatch: %s vs %s" % (data_path, label_path)

In [0]:
# Validation file lists. glob.glob returns paths in arbitrary order, so both lists
# are sorted to keep data file i paired with label file i.
val_data_dir = r"C:\Users\57lzhang.US04WW4008\Desktop\Blood pressure\BP data\validation25\data*"
val_data_fn = sorted(glob.glob(val_data_dir))
val_label_dir = r"C:\Users\57lzhang.US04WW4008\Desktop\Blood pressure\BP data\validation25\label*"
val_label_fn = sorted(glob.glob(val_label_dir))

In [0]:
# Make sure validation data and label files are paired one-to-one and in the same order.
assert len(val_data_fn) == len(val_label_fn), \
    "number of validation data files and label files differ"
for data_path, label_path in zip(val_data_fn, val_label_fn):
  # Compare everything after the "data"/"label" filename prefix (the prefixes used
  # by the glob patterns) rather than only the last character.
  data_id = os.path.basename(data_path)[len("data"):]
  label_id = os.path.basename(label_path)[len("label"):]
  assert data_id == label_id, \
      "data/label mismatch: %s vs %s" % (data_path, label_path)

In [0]:
#use generator to input data, since the data size(>160GB) is larger than memory size (64GB)
def train_data_generator():
  """Yield the unpickled contents of each training data file, one file per step.

  Files are read in the order of the module-level list `train_data_fn`, so only
  one file's contents has to be held in memory at a time.

  Yields:
    Whatever object was pickled in each file.
  """
  for path in train_data_fn:
    # The context manager closes the file handle as soon as the payload is read;
    # a bare open() would leak one handle per file.
    # NOTE: pickle.load can execute arbitrary code - only load trusted files.
    with open(path, 'rb') as data_file:
      data = pickle.load(data_file)
    yield data

In [0]:
def train_label_generator():
  """Yield the unpickled contents of each training label file, one file per step.

  Files are read in the order of the module-level list `train_label_fn`.

  Yields:
    Whatever object was pickled in each file.
  """
  for path in train_label_fn:
    # Close the handle right after reading instead of leaking one per file.
    # NOTE: pickle.load can execute arbitrary code - only load trusted files.
    with open(path, 'rb') as label_file:
      label = pickle.load(label_file)
    yield label

In [0]:
def val_data_generator():
  """Yield the unpickled contents of each validation data file, one file per step.

  Files are read in the order of the module-level list `val_data_fn`.

  Yields:
    Whatever object was pickled in each file.
  """
  for path in val_data_fn:
    # Close the handle right after reading instead of leaking one per file.
    # NOTE: pickle.load can execute arbitrary code - only load trusted files.
    with open(path, 'rb') as data_file:
      data = pickle.load(data_file)
    yield data

In [0]:
def val_label_generator():
  """Yield the unpickled contents of each validation label file, one file per step.

  Files are read in the order of the module-level list `val_label_fn`.

  Yields:
    Whatever object was pickled in each file.
  """
  for path in val_label_fn:
    # Close the handle right after reading instead of leaking one per file.
    # NOTE: pickle.load can execute arbitrary code - only load trusted files.
    with open(path, 'rb') as label_file:
      label = pickle.load(label_file)
    yield label

In [40]:
#calculate number of elements in training for later use in shuffle and model.fit
number_of_element = 0
for label_path in train_label_fn:
  # Each label file holds one entry per sample, so its length is the sample count.
  # The context manager closes each file instead of leaking the handle.
  with open(label_path, 'rb') as label_file:
    number_of_element += len(pickle.load(label_file))
print("There are in total", number_of_element, "in training dataset")

There are in total 2494543 in training dataset


In [41]:
#calculate number of elements in validation
number_of_val_element = 0
for label_path in val_label_fn:
  # Each label file holds one entry per sample, so its length is the sample count.
  # The context manager closes each file instead of leaking the handle.
  with open(label_path, 'rb') as label_file:
    number_of_val_element += len(pickle.load(label_file))
print("There are in total", number_of_val_element, "in validation dataset")

There are in total 277340 in validation dataset


In [0]:
# Stream the training files through the generators: every generator step yields one
# whole file (leading dimension = number of samples in that file). flat_map then
# slices along that leading dimension so the datasets emit one sample at a time.
train_data = tf.data.Dataset.from_generator(
    train_data_generator,
    output_types=tf.float32,
    output_shapes=[None, 10, 150, 1])
train_label = tf.data.Dataset.from_generator(
    train_label_generator,
    output_types=tf.float32,
    output_shapes=[None, 2])
train_ds = train_data.flat_map(
    lambda file_batch: tf.data.Dataset.from_tensor_slices(file_batch))
train_lb = train_label.flat_map(
    lambda file_batch: tf.data.Dataset.from_tensor_slices(file_batch))
# Pair each sample with its label; this relies on both generators visiting their
# files in matching order.
train = tf.data.Dataset.zip((train_ds, train_lb))

In [0]:
# Validation pipeline: every generator step yields one whole file, and flat_map
# slices along the leading (sample) dimension to emit one sample at a time.
val_data = tf.data.Dataset.from_generator(
    val_data_generator,
    output_types=tf.float32,
    output_shapes=[None, 10, 150, 1])
val_label = tf.data.Dataset.from_generator(
    val_label_generator,
    output_types=tf.float32,
    output_shapes=[None, 2])
val_ds = val_data.flat_map(
    lambda file_batch: tf.data.Dataset.from_tensor_slices(file_batch))
val_lb = val_label.flat_map(
    lambda file_batch: tf.data.Dataset.from_tensor_slices(file_batch))
# Pair each sample with its label; relies on matching file order in both generators.
validation = tf.data.Dataset.zip((val_ds, val_lb))

In [0]:
batch_size = 256
# Training input: cache the flattened samples on disk, shuffle with a buffer of
# 1% of the training set, repeat indefinitely (epoch length is set via
# steps_per_epoch in model.fit), emit full batches only, and prefetch.
train_dataset = (
    train
    .cache(filename=r"C:\Users\57lzhang.US04WW4008\Desktop\Blood pressure\BP data")
    .shuffle(number_of_element//100)
    .repeat()
    .batch(batch_size, drop_remainder=True)
    .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
)
# Validation input: no caching or shuffling, just repeated full batches.
val_dataset = (
    validation
    .repeat()
    .batch(batch_size, drop_remainder=True)
)

# 4. Train Model

## 4.1 CNN + LSTM

In [0]:
class Simple_CNN(tf.keras.layers.Layer):
  """Per-time-step convolutional feature extractor.

  Every sub-layer is wrapped in TimeDistributed, so the same Conv1D ->
  BatchNormalization -> Flatten stack is applied independently to each step
  along the first non-batch axis of the input.

  Args:
    input_shape: per-sample input shape, forwarded to the first TimeDistributed
      wrapper.
  """

  def __init__(self,input_shape):
    super().__init__()
    # 16 filters with kernel size 1 and stride 1: a point-wise convolution.
    pointwise_conv = Conv1D(16, 1, strides=1, activation='relu')
    self.conv1E = TimeDistributed(pointwise_conv, input_shape=input_shape)
    self.batch_normE = TimeDistributed(BatchNormalization())
    self.flatE = TimeDistributed(Flatten())

  def call(self, inputs):
    """Apply conv -> batch norm -> flatten to `inputs` and return the result."""
    return self.flatE(self.batch_normE(self.conv1E(inputs)))

In [0]:
class CNN_LSTM(Model):
  """Simple_CNN front end followed by two bidirectional LSTMs and a 2-unit head.

  Args:
    input_shape: per-sample input shape, forwarded to Simple_CNN.
  """

  def __init__(self, input_shape):
    super().__init__()
    self.cnn = Simple_CNN(input_shape=input_shape)
    # The first recurrent layer returns the full sequence so the second one can
    # consume it; the second returns only its final output.
    self.bi_lstmA = Bidirectional(LSTM(32, return_sequences = True))
    self.bi_lstmB = Bidirectional(LSTM(16))
    # Two linear outputs per sample (no activation).
    self.dense = Dense(2)

  def call(self,inputs):
    """Run `inputs` through CNN -> BiLSTM -> BiLSTM -> Dense and return the output."""
    features = self.cnn(inputs)
    sequence = self.bi_lstmA(features)
    summary = self.bi_lstmB(sequence)
    return self.dense(summary)

In [0]:
# Per-sample input shape (10, 150, 1) matches the shape declared for the data
# pipeline (output_shapes=[None,10,150,1], where None is the sample dimension).
model = CNN_LSTM(input_shape=(10, 150, 1))

## 4.2 Define callbacks

### 4.2.1 Learning rate scheduler

In [0]:
def decay(epoch):
  """Return the learning rate to use for the given epoch index.

  Piecewise-constant schedule:
    epoch < 20         -> 1e-4
    20 <= epoch < 100  -> 1e-5
    epoch >= 100       -> 1e-6
  """
  if epoch >= 100:
    return 1e-6
  if epoch >= 20:
    return 1e-5
  return 1e-4

In [0]:
#callback: set the learning rate each epoch from the step-wise decay() schedule
lr_schedule = tf.keras.callbacks.LearningRateScheduler(schedule=decay)

### 4.2.2 Tensorboard

In [0]:
#callback: tensorboard
# Each run logs into its own directory, named by the cell's execution time
# followed by an experiment tag.
run_stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = r"C:\Users\57lzhang.US04WW4008\Desktop\Blood pressure\BP data\logs\fitt\\" + run_stamp + "CNN+LSTM+all+25Hz"
# histogram_freq=1: histograms are computed every epoch.
tensorboard_callback = TensorBoard(
    log_dir=log_dir,
    histogram_freq=1)

### 4.2.4 Checkpoint

In [0]:
#callback: checkpoint
# The file name embeds the epoch number and the training loss at save time.
filepath = r"C:\Users\57lzhang.US04WW4008\Desktop\Blood pressure\BP data\models\CNN+LSTM+all+25Hz-{epoch:02d}-{loss:.4f}.hdf5"
# CNN_LSTM is a subclassed keras Model. Keras cannot serialise a subclassed model
# as a whole into HDF5 (model.save raises NotImplementedError for .h5/.hdf5
# paths), so only the weights are checkpointed. Restore them with
# model.load_weights(path) on a CNN_LSTM instance that has already been built
# (i.e. called once on data of the right shape).
# monitor='loss' with save_best_only=True keeps a checkpoint only when the
# *training* loss improves.
checkpoint = ModelCheckpoint(filepath,
                             monitor='loss',
                             verbose=1,
                             save_best_only=True,
                             save_weights_only=True,
                             mode='auto')

## 4.3 Train the model

### 4.3.1 Start Training

In [0]:
#clear history if necessary
tf.keras.backend.clear_session()
# NOTE(review): `model` was instantiated in an earlier cell, i.e. before this
# clear_session() call - confirm that this ordering is intended.

# Multi-GPU option (currently disabled): wrap the compile/fit below in
#   strategy = tf.distribute.MirroredStrategy(cross_device_ops=tf.distribute.HierarchicalCopyAllReduce())
#   with strategy.scope():
# HierarchicalCopyAllReduce is there to overwrite NCCL cross device communication,
# as this is running in Windows.

# To resume from saved weights, load them before compiling, e.g.:
#model.load_weights(r'C:\Users\57lzhang.US04WW4008\Desktop\Blood pressure\BP data\models\CNN+LSTM+all-30-205.0911.hdf5')

# Regression set-up: mean-squared-error loss, mean-absolute-error reported as metric.
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss='mse',
    metrics=['mae'])

callbacks_list = [tensorboard_callback, checkpoint, lr_schedule]

# Both datasets repeat forever, so the length of an epoch / validation pass is
# given explicitly as the number of full batches available.
train_steps = number_of_element//batch_size
val_steps = number_of_val_element//batch_size

#start training
model.fit(
    train_dataset,
    epochs=300,
    steps_per_epoch=train_steps,
    verbose=1,
    validation_data=val_dataset,
    validation_steps=val_steps,
    callbacks=callbacks_list)

# 5.Model Evaluation

In [0]:
#os.chdir(r"C:\Users\57lzhang.US04WW4008\Desktop\Blood pressure\BP data\models")
#model = tf.keras.models.load_model('CNN+LSTM-285-10.2110.hdf5')

## 5.2 MAE

In [0]:
# Test file lists. glob.glob returns paths in arbitrary order, so both lists are
# sorted to keep data file i paired with label file i.
test_data_dir = "D:/WFDB/matched/BP/Cleaned Data/test/data*"
test_data_fn = sorted(glob.glob(test_data_dir))
test_label_dir = "D:/WFDB/matched/BP/Cleaned Data/test/label*"
test_label_fn = sorted(glob.glob(test_label_dir))

In [0]:
def test_data_generator():
  """Yield the unpickled contents of each test data file, one file per step.

  Files are read in the order of the module-level list `test_data_fn`.

  Yields:
    Whatever object was pickled in each file.
  """
  for path in test_data_fn:
    # Close the handle right after reading instead of leaking one per file.
    # NOTE: pickle.load can execute arbitrary code - only load trusted files.
    with open(path, 'rb') as data_file:
      data = pickle.load(data_file)
    yield data

In [0]:
# Run inference over every test sample, in file order.
# The original call passed the generator *function* (not a generator object) and a
# fractional `steps` computed as files / batch_size, although each generator step
# yields a whole file. Instead, the files are flattened into single samples and
# re-batched, mirroring the training pipeline, so predictions come back one row
# per sample. No drop_remainder here: every test sample must get a prediction.
# NOTE(review): assumes the test pickles use the same [N, 10, 150, 1] layout as
# the training data - confirm.
test_data = tf.data.Dataset.from_generator(
    test_data_generator, tf.float32, output_shapes=[None, 10, 150, 1])
test_dataset = test_data.flat_map(
    lambda file_batch: tf.data.Dataset.from_tensor_slices(file_batch)).batch(batch_size)
bp_estimate = model.predict(test_dataset)

In [0]:
# Load the ground-truth labels; `test_labels` was previously never defined, so the
# MAE computation raised NameError. Label files are concatenated in the order of
# test_label_fn, which must match the order of test_data_fn used for prediction.
# NOTE: pickle.load can execute arbitrary code - only load trusted files.
test_label_parts = []
for label_path in test_label_fn:
  with open(label_path, 'rb') as label_file:
    test_label_parts.append(np.asarray(pickle.load(label_file)))
test_labels = np.concatenate(test_label_parts, axis=0)

# Every prediction row needs exactly one label row.
assert len(test_labels) == len(bp_estimate), \
    "number of test labels and predictions differ"

# Column 0 is reported as systolic and column 1 as diastolic.
# sklearn.metrics is already imported in the notebook's import cell.
sys_mae = sklearn.metrics.mean_absolute_error(test_labels[:,0], bp_estimate[:,0])
dia_mae = sklearn.metrics.mean_absolute_error(test_labels[:,1], bp_estimate[:,1])
print("Systolic MAE in test dataset:", round(sys_mae,1))
print("Diastolic MAE in test dataset:", round(dia_mae,1))