<a href="https://colab.research.google.com/github/supertime1/Afib_PPG/blob/master/Afib_ECG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Introduction

This notebook trains an ECG DNN on labeled ECG data from "The PhysioNet Computing in Cardiology Challenge 2017" (https://physionet.org/content/challenge-2017/1.0.0/). The trained model will then be used to label the ECG data in the MIMIC-III waveform dataset, and through it the concurrently recorded PPG data.

The ECG data used in training and validation has the following important attributes:
*   sampling frequency: 300Hz
*   4 labels: Normal (N), AF (A), Other rhythm (O), Noisy (~)
*   length: 9 - 60s with 30s mean.
*   preprocessed: data has been band-pass filtered by the AliveCor device


Only records at least 30 s long are used in training, since Afib detection from PPG data usually requires 30 s of signal.

# 2. Setup Environment

In [0]:
!pip install tensorflow-gpu

In [0]:
!pip install wfdb

In [0]:
!pip install tensorflow_datasets

In [0]:
from IPython.display import display
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
%load_ext tensorboard
import numpy as np
import os
import shutil
import glob
import wfdb
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import load_model 
from tensorflow.keras.callbacks import TensorBoard
import tensorflow_datasets as tfds
import multiprocessing
from datetime import datetime

print(tf.__version__)

In [0]:
#run this cell if multiple GPUs are used
#enables TensorFlow's device-placement logging, so the log shows which device each op is assigned to
tf.debugging.set_log_device_placement(True)

In [0]:
#print the devices this TensorFlow installation reports as locally available
#NOTE(review): device_lib is imported from the tensorflow.python.* namespace rather than
#the public tf.* API - confirm it is still available in the TensorFlow version in use
from tensorflow.python.client import device_lib 
print(device_lib.list_local_devices())

In [0]:
#the cell output shows whether this TensorFlow build reports CUDA support
tf.test.is_built_with_cuda()

# 3. Data Pipeline

## 3.1 Load data

In [0]:
# Directory that holds the challenge records (*.hea header files).
# NOTE: machine-specific path; on Colab use '/content/drive/My Drive/training2017'.
DATA_DIR = "C:/Users/57lzhang.US04WW4008/Desktop/Afib/Afib_ECG data/training2017"

# Record paths without the '.hea' extension, to comply with the wfdb.rdrecord format.
# os.path.splitext removes only the trailing extension (str.index would cut at the first
# '.hea' found anywhere in the path), and sorting makes the record order reproducible
# because glob returns files in arbitrary order.
hd_names = sorted(
    os.path.splitext(header_path)[0]
    for header_path in glob.glob(DATA_DIR + "/*.hea")
)
print('There are total', len(hd_names), 'records')

In [0]:
# Keep only the records whose header reports at least 9000 samples,
# i.e. 30 s of ECG at the 300 Hz sampling frequency.
qualified_names = [
    record_path
    for record_path in hd_names
    if wfdb.rdheader(record_path).sig_len >= 9000
]
print('There are total', len(qualified_names), 'qualified (>30s) records')

In [0]:
# Load the reference labels (no header row): column 0 is matched against the record
# file name, column 1 holds the rhythm label.
# NOTE: machine-specific path; on Colab use '/content/drive/My Drive/training2017/REFERENCE.csv'.
REFERENCE_CSV = r'C:\Users\57lzhang.US04WW4008\Desktop\Afib\Afib_ECG data\training2017\REFERENCE.csv'
df = pd.read_csv(REFERENCE_CSV, sep=',', header=None)

# One dictionary lookup per record instead of filtering the whole DataFrame each time.
label_by_record = dict(zip(df[0], df[1]))

init_labels = []  # label of every retained record
new_names = []    # matching record paths, in the same order as init_labels
for name in qualified_names:
  # basename instead of name[-6:], so this does not rely on 6-character record names
  record_id = os.path.basename(name)
  label = label_by_record.get(record_id)
  # Skip noisy ('~') records and records that are absent from REFERENCE.csv. Comparing
  # a scalar label avoids depending on the truth value of a NumPy array.
  if label is None or label == '~':
    continue
  init_labels.append(label)
  new_names.append(name)
# Shape (n_records, 1) with object dtype: one single-element label row per record.
init_labels = np.array(init_labels, dtype=object).reshape(-1, 1)
print('There are total', len(init_labels),'none-nosiy labels')
print('There are total', len(new_names), 'none-nosiy records')

In [0]:
# Collapse the rhythm classes into a binary target, still stored as strings:
# 'A' becomes '1'; 'N' and 'O' both become '0'.
for rhythm_code, binary_code in (('N', '0'), ('A', '1'), ('O', '0')):
  init_labels[init_labels == rhythm_code] = binary_code

In [0]:
# Read every retained record and collect its p_signal array, in new_names order.
ECG_signals = [wfdb.rdrecord(record_path).p_signal for record_path in new_names]

print('ECG signals len:', len(ECG_signals))

In [0]:
#A function to split the raw signal data into fixed-length segments and keep the label
##source is the raw signal (e.g. ECG_signals) and seg_len = 30s * 300Hz = 9000
def generate_segment_data(source,init_labels,seg_len):
  """Cut every record into non-overlapping segments of `seg_len` samples.

  Each segment inherits the label of the record it was cut from. Samples left
  over after the last full segment of a record are discarded.

  Args:
    source: sequence of per-record signals; each signal is sliced along its
      first axis and every slice must hold exactly `seg_len` values
      (e.g. arrays of shape (n_samples, 1)).
    init_labels: one label per record, in the same order as `source`.
    seg_len: segment length in samples (positive integer).

  Returns:
    (signals, labels): `signals` has shape (n_segments, seg_len) and `labels`
    has shape (n_segments, 1).

  Raises:
    ValueError: if `seg_len` is not positive or the number of labels differs
      from the number of records.
  """
  if seg_len < 1:
    raise ValueError('seg_len must be a positive integer')
  if len(source) != len(init_labels):
    raise ValueError('source and init_labels must have the same length')

  signals = []
  labels = []
  # Pair each record with its own label. (A manually advanced record counter that is
  # not incremented inside this loop would label every segment with init_labels[0].)
  for signal, label in zip(source, init_labels):
    for i in range(len(signal) // seg_len):
      seg = signal[seg_len*i:seg_len*(i+1)]
      # flatten (seg_len, 1) -> (seg_len,), using seg_len rather than a fixed 9000
      signals.append(np.reshape(seg, seg_len))
      labels.append(np.reshape(label, 1))

  # The explicit reshape keeps the result 2-D even when no segment was produced.
  signals = np.asarray(signals).reshape(-1, seg_len)
  labels = np.asarray(labels).reshape(-1, 1)

  return signals,labels

In [0]:
# Cut the records into 9000-sample (30 s at 300 Hz) segments together with their labels.
# Records long enough to hold several segments contribute more than one, so the
# number of segments can exceed the number of records.
signals, labels = generate_segment_data(ECG_signals, init_labels, 9000)
print('signals dim:',signals.shape)
print('labels dim:',labels.shape)
# Insert singleton axes at positions 1 and 3, matching the
# input_shape=(1,9000,1) declared by the first Conv2D layer of the CNN.
signals = signals[:, np.newaxis, :, np.newaxis]
print('signals dim after resize:',signals.shape)
print('labels dim after resize:',labels.shape)

## 3.2 Extract, Transform and Load data

### 3.2.1 Parallelize Extraction (how?)

In [0]:
# Parse the string labels into numbers. The result gets its own name so that `labels`
# is left untouched and this cell can be re-run (to_number only accepts string input).
label_values = tf.strings.to_number(labels)
# One dataset element per segment: (signal, label).
dataset = tf.data.Dataset.from_tensor_slices((signals, label_values))
dataset

In [0]:
#print the signal shape and the label of the first two dataset elements
for signal, label in tfds.as_numpy(dataset.take(2)):
  print(signal.shape, label)

In [0]:
#optional: data preprocessing use dataset.map(function)
#possible functions:
#1.Spectrogram
#2.Hampel filter
#3.Flat line and peak removal
#4.Normalization


### 3.2.2 Parallelize Transformation


In [0]:
#number of CPUs reported by multiprocessing; meant to be passed as num_parallel_calls
#to dataset.map once a preprocessing function exists (see the commented-out call below)
cores = multiprocessing.cpu_count()
print(cores)
#dataset = dataset.map(function, num_parallel_calls = cores)

In [0]:
###ratio value is between 0 and 1, and train,validation and test ratios should sum up to 1.0
def slice_dataset(dataset,train_ratio,val_ratio,test_ratio=None):
  """Split `dataset` into consecutive train / validation / test subsets.

  Args:
    dataset: iterable dataset that provides `take(n)` and `skip(n)`
      (e.g. a tf.data.Dataset in eager mode, TF >= 2.0).
    train_ratio: fraction of the elements used for training (>= 0).
    val_ratio: fraction of the elements used for validation (>= 0).
    test_ratio: optional fraction for testing. When omitted, the test subset
      is whatever remains after train and validation. When given, the three
      ratios must sum to 1.0.

  Returns:
    (train_dataset, val_dataset, test_dataset). The test subset receives every
    element left after the first two, so rounding leftovers end up there.

  Raises:
    ValueError: if a ratio is negative or the ratios do not fit into 1.0.
  """
  tolerance = 1e-6  # absorbs floating-point error such as 0.8 + 0.1 + 0.1 != 1.0
  if train_ratio < 0 or val_ratio < 0:
    raise ValueError('train_ratio and val_ratio must be non-negative')
  if test_ratio is None:
    # The default used to crash on `None * size`; the remainder is the test subset.
    if train_ratio + val_ratio > 1 + tolerance:
      raise ValueError('train_ratio + val_ratio must not exceed 1.0')
  elif test_ratio < 0 or abs(train_ratio + val_ratio + test_ratio - 1) > tolerance:
    raise ValueError('train, validation and test ratios must sum up to 1.0')

  # Count by iterating, without holding every element in memory at once.
  dataset_size = sum(1 for _ in dataset)
  train_size = int(train_ratio * dataset_size)
  val_size = int(val_ratio * dataset_size)

  train_dataset = dataset.take(train_size)
  remain_dataset = dataset.skip(train_size)
  val_dataset = remain_dataset.take(val_size)
  test_dataset = remain_dataset.skip(val_size)

  return train_dataset,val_dataset,test_dataset

In [0]:
# 80 % / 10 % / 10 % split into training, validation and test subsets
train_dataset, val_dataset, test_dataset = slice_dataset(
    dataset, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1)

# Count the elements of each subset by iterating over it (eager mode).
n_train = len(list(train_dataset))
n_val = len(list(val_dataset))
n_test = len(list(test_dataset))
print('size of train_dataset:',n_train)
print('size of val_dataset:',n_val)
print('size of test_dataset:',n_test)

### 3.2.3 Parallelize Loading

In [0]:
batch_size = 32

# Training stream: cache -> shuffle (buffer of 1024) -> repeat -> full batches -> prefetch.
# NOTE: this cell rebinds train_dataset / val_dataset, so running it twice stacks the
# transformations on top of each other; re-run the split cell first.
train_dataset = train_dataset.cache()
train_dataset = train_dataset.shuffle(1024)
train_dataset = train_dataset.repeat()
train_dataset = train_dataset.batch(batch_size, drop_remainder=True)
train_dataset = train_dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

# Validation stream: repeated and cut into full batches only.
val_dataset = val_dataset.repeat()
val_dataset = val_dataset.batch(batch_size, drop_remainder=True)

# 4. Create Model

## 4.1 Build the model and find the optimal learning rate

In [0]:
#clear history if necessary
tf.keras.backend.clear_session()

#train model
learning_rate = 0.0001  # starting rate; the scheduler below raises it every epoch
# HierarchicalCopyAllReduce overrides the NCCL cross-device communication, as this is running in Windows
strategy = tf.distribute.MirroredStrategy(cross_device_ops=tf.distribute.HierarchicalCopyAllReduce())
with strategy.scope():

  model = tf.keras.Sequential([
      #1st Conv2D
      tf.keras.layers.Conv2D(8, (1, 1), strides=(1, 1),
                            activation='relu', input_shape=(1,9000,1)),
      tf.keras.layers.BatchNormalization(),
      tf.keras.layers.MaxPooling2D(pool_size=(1, 2),strides=(1, 2)),
      tf.keras.layers.Dropout(0.2),
      #2nd Conv2D
      tf.keras.layers.Conv2D(16, (1, 3), strides=(1, 1),
                            activation='relu'),
      tf.keras.layers.BatchNormalization(),
      tf.keras.layers.MaxPooling2D(pool_size=(1, 2),strides=(1, 2)),
      tf.keras.layers.Dropout(0.2),
      #3rd Conv2D
      tf.keras.layers.Conv2D(32, (1, 3), strides=(1, 1),
                            activation='relu'),
      tf.keras.layers.BatchNormalization(),
      tf.keras.layers.MaxPooling2D(pool_size=(1, 2),strides=(1, 2)),
      tf.keras.layers.Dropout(0.2),
      #4th Conv2D
      tf.keras.layers.Conv2D(64, (1, 3), strides=(1, 1),
                            activation='relu'),
      tf.keras.layers.BatchNormalization(),
      tf.keras.layers.MaxPooling2D(pool_size=(1, 2),strides=(1, 2)),
      tf.keras.layers.Dropout(0.2),
      #5th Conv2D
      tf.keras.layers.Conv2D(16, (1, 1), strides=(1, 1),
                            activation='relu'),
      tf.keras.layers.BatchNormalization(),
      #Full connection layer
      tf.keras.layers.Flatten(),
      tf.keras.layers.Dense(64, activation='relu'),
      tf.keras.layers.BatchNormalization(),
      tf.keras.layers.Dropout(0.2),
      # Sigmoid output: binary_crossentropy (from_logits=False by default) and the
      # 'accuracy' metric both expect a probability, not a raw logit.
      tf.keras.layers.Dense(1, activation='sigmoid')
  ])

  # Compile inside the strategy scope so the optimizer and metric variables are
  # created under the distribution strategy as well.
  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                loss=tf.keras.losses.binary_crossentropy,
                metrics=['accuracy'])

model.summary()

#callback: schedule a learning rate incline iteration (x10 every 20 epochs)
# NOTE(review): with epochs=1 below the sweep yields a single (lr, loss) point;
# raise `epochs` (e.g. 40 spans 1e-4..1e-2) to obtain a usable curve.
lr_schedule = tf.keras.callbacks.LearningRateScheduler(
    lambda epoch: 1e-4 * 10**(epoch / 20))

#callback: tensorboard
# Start from an empty log root; shutil/os keep this portable (no shell `rm`).
shutil.rmtree(os.path.join('logs', 'fit'), ignore_errors=True)
log_dir = os.path.join('logs', 'fit', datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = TensorBoard(log_dir=log_dir, update_freq='batch', histogram_freq=1)

# Both streams repeat forever, so Keras needs explicit integer step counts.
# Count the dataset once; 0.8 / 0.1 are the train / validation shares of the split.
dataset_size = sum(1 for _ in dataset)
train_steps = max(1, int(dataset_size * 0.8) // batch_size)
val_steps = max(1, int(dataset_size * 0.1) // batch_size)

#start training
history = model.fit(train_dataset,
                    epochs=1,
                    steps_per_epoch=train_steps,
                    verbose=1,
                    validation_data=val_dataset,
                    validation_steps=val_steps,
                    callbacks=[lr_schedule,tensorboard_callback]
                    )

In [0]:
#Visualize training loss vs learning rate (learning rate on a log axis)
# The scheduler records the rate in the history as "lr"; fall back to "learning_rate",
# the key that (to our knowledge) newer Keras releases use instead.
lr_values = history.history.get("lr", history.history.get("learning_rate"))
fig, ax = plt.subplots()
ax.semilogx(lr_values, history.history["loss"])
ax.axis([1e-4, 1e-2,0,5])
ax.set(xlabel="learning rate", ylabel="training loss",
       title="Learning-rate sweep");

In [0]:
!rm -rf logs\\fit
log_dir="logs\\fit\\" + datetime.now().strftime("%Y%m%d-%H%M%S")
%tensorboard --logdir log_dir

## 4.2 Train the model with the optimal learning rate

In [0]:
#clear history if necessary
tf.keras.backend.clear_session()

#train model
learning_rate = 0.001 #choose the optimal learning rate
# HierarchicalCopyAllReduce overrides the NCCL cross-device communication, as this is running in Windows
strategy = tf.distribute.MirroredStrategy(cross_device_ops=tf.distribute.HierarchicalCopyAllReduce())
with strategy.scope():

  model = tf.keras.Sequential([
      #1st Conv2D
      tf.keras.layers.Conv2D(8, (1, 1), strides=(1, 1),
                            activation='relu', input_shape=(1,9000,1)),
      tf.keras.layers.BatchNormalization(),
      tf.keras.layers.MaxPooling2D(pool_size=(1, 2),strides=(1, 2)),
      tf.keras.layers.Dropout(0.2),
      #2nd Conv2D
      tf.keras.layers.Conv2D(16, (1, 3), strides=(1, 1),
                            activation='relu'),
      tf.keras.layers.BatchNormalization(),
      tf.keras.layers.MaxPooling2D(pool_size=(1, 2),strides=(1, 2)),
      tf.keras.layers.Dropout(0.2),
      #3rd Conv2D
      tf.keras.layers.Conv2D(32, (1, 3), strides=(1, 1),
                            activation='relu'),
      tf.keras.layers.BatchNormalization(),
      tf.keras.layers.MaxPooling2D(pool_size=(1, 2),strides=(1, 2)),
      tf.keras.layers.Dropout(0.2),
      #4th Conv2D
      tf.keras.layers.Conv2D(64, (1, 3), strides=(1, 1),
                            activation='relu'),
      tf.keras.layers.BatchNormalization(),
      tf.keras.layers.MaxPooling2D(pool_size=(1, 2),strides=(1, 2)),
      tf.keras.layers.Dropout(0.2),
      #5th Conv2D
      tf.keras.layers.Conv2D(16, (1, 1), strides=(1, 1),
                            activation='relu'),
      tf.keras.layers.BatchNormalization(),
      #Full connection layer
      tf.keras.layers.Flatten(),
      tf.keras.layers.Dense(64, activation='relu'),
      tf.keras.layers.BatchNormalization(),
      tf.keras.layers.Dropout(0.2),
      # Sigmoid output: binary_crossentropy (from_logits=False by default) and the
      # 'accuracy' metric both expect a probability, not a raw logit.
      tf.keras.layers.Dense(1, activation='sigmoid')
  ])

  # Compile inside the strategy scope so the optimizer and metric variables are
  # created under the distribution strategy as well.
  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                loss=tf.keras.losses.binary_crossentropy,
                metrics=['accuracy'])

model.summary()

#callback: tensorboard
# Start from an empty log root; shutil/os keep this portable (no shell `rm`).
shutil.rmtree(os.path.join('logs', 'fit'), ignore_errors=True)
log_dir = os.path.join('logs', 'fit', datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = TensorBoard(log_dir=log_dir, update_freq='batch', histogram_freq=1)

# Both streams repeat forever, so Keras needs explicit integer step counts.
# Count the dataset once; 0.8 / 0.1 are the train / validation shares of the split.
dataset_size = sum(1 for _ in dataset)
train_steps = max(1, int(dataset_size * 0.8) // batch_size)
val_steps = max(1, int(dataset_size * 0.1) // batch_size)

#start training
history = model.fit(train_dataset,
                    epochs=1,
                    steps_per_epoch=train_steps,
                    verbose=1,
                    validation_data=val_dataset,
                    validation_steps=val_steps,
                    callbacks=[tensorboard_callback]
                    )