<a href="https://colab.research.google.com/github/supertime1/BP_PPG/blob/master/Simplified_BP_Data_Clean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Run all cells to generate cleaned data

In [0]:
%matplotlib inline
from IPython.display import display
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import glob
import wfdb
import sklearn
from sklearn import preprocessing
import io
import pickle
import numba
from numba import jit
import tensorflow as tf
from scipy.signal import find_peaks

In [0]:
@jit(nopython=True)
def flat_line(signals,threshold = 0, percent = .15):
  """Separate signals into kept ones and the indices of flat-lined ones.

  A signal is considered flat-lined when the fraction of consecutive-sample
  differences whose magnitude is <= ``threshold`` is at least ``percent``.

  Args:
    signals: indexable collection of numeric arrays; each element is passed
      to ``np.diff``.
    threshold: largest absolute step that still counts as "flat".
    percent: fraction of flat steps (0..1) at or above which a signal is
      rejected.

  Returns:
    (clean_signals, rm_list): the list of signals that were kept, and the
    list of positions in ``signals`` that were rejected. ``rm_list`` lets the
    caller drop the matching entries of a paired recording.
  """
  # Only constructs supported by numba's nopython mode are used here, because
  # the function is compiled with @jit(nopython=True).
  clean_signals = []
  rm_list = []
  for i in range(len(signals)):
    # NOTE(review): np.diff works along the last axis. A column-shaped
    # (length, 1) signal therefore produces no differences at all and is
    # always kept -- confirm the stored signals are 1-D.
    signal_diff = np.diff(signals[i])
    n_diff = len(signal_diff)

    # Fewer than two samples: no step exists, so the flat ratio is undefined.
    # Reject the signal instead of dividing by zero (numba raises
    # ZeroDivisionError there).
    if n_diff == 0:
      rm_list.append(i)
      continue

    # Count the flat steps directly; this equals the number of zeros the
    # former in-place 0/1 relabelling of the differences produced.
    flat_count = np.sum(np.abs(signal_diff) <= threshold)
    if flat_count / n_diff < percent:
      clean_signals.append(signals[i])
    else:
      rm_list.append(i)

  return clean_signals,rm_list

In [0]:
def generate_segment_data(source,seg_len):
  """Cut every signal in ``source`` into consecutive segments of ``seg_len`` samples.

  Trailing samples that do not fill a whole segment are discarded.

  Args:
    source: iterable of signals; each signal is sliced along its first axis.
      Each slice must contain exactly ``seg_len`` values in total, e.g. a
      signal of shape (length,) or (length, 1).
    seg_len: number of samples per segment.

  Returns:
    numpy array of shape (number of segments, seg_len). When no signal is
    long enough to yield a segment the shape is (0, seg_len).
  """
  segments = []
  for signal in source:
    for k in range(len(signal) // seg_len):
      seg = signal[seg_len*k:seg_len*(k+1)]
      # Flatten (seg_len, 1) slices to (seg_len,). The target length must be
      # seg_len itself; a fixed 7500 only works for that one segment size.
      segments.append(np.reshape(seg, seg_len))

  if not segments:
    return np.empty((0, seg_len))

  return np.asarray(segments)

In [0]:
def peak_segmentation(signal,distance = 40):
  """Split a signal into pieces that each run from one valley to the next.

  Valleys are located by running ``find_peaks`` on the negated signal with
  the given minimum ``distance`` between them.

  Args:
    signal: 1-D signal to split.
    distance: minimum number of samples between neighbouring valleys.

  Returns:
    List of slices ``signal[valley_k:valley_k+1]``; samples before the first
    valley and from the last valley onwards are not part of any slice. The
    list is empty when fewer than two valleys are found.
  """
  valley_idx = find_peaks(-1 * signal, distance=distance)[0]
  return [signal[start:stop]
          for start, stop in zip(valley_idx[:-1], valley_idx[1:])]

In [0]:
#signals: a list of lists (i.e. a list of 1-min signals, each holding its cyclic segments: ABP_ps_signals)
#cyc_ratio: threshold for flat-line removal within a single cycle
#seg_ratio: threshold for removing the whole 1-min signal
def flat_peak_remove(signals,cyc_ratio = 0.05, seg_ratio = 0.1):
  """Drop signals in which too many cycles are flat-lined.

  Args:
    signals: sequence of signals, each given as a sequence of cycles.
    cyc_ratio: flat-step fraction at or above which ``flat_line`` rejects a
      single cycle.
    seg_ratio: fraction of rejected cycles at or above which the whole signal
      is dropped.

  Returns:
    (clean_segments, remove_index): for every kept signal the list of its
    cycles that passed ``flat_line``, and the positions in ``signals`` that
    were dropped (signals without cycles are dropped as well).
  """
  clean_segments = []
  remove_index = []
  for i, cycles in enumerate(signals):
    # A signal without cycles cannot be rated. Test the length rather than
    # comparing with [], so empty tuples and arrays are caught as well and
    # never reach the ratio below as 0/0.
    if len(cycles) == 0:
      remove_index.append(i)
      continue

    # clean_sig: cycles below cyc_ratio; rm_sig: indices of rejected cycles
    clean_sig, rm_sig = flat_line(cycles,0,cyc_ratio)

    if len(rm_sig)/(len(clean_sig) + len(rm_sig)) >= seg_ratio:
      remove_index.append(i)
      continue

    clean_segments.append(clean_sig)

  return clean_segments, remove_index

In [0]:
#Returns the average systolic and diastolic values of each 1-min signal
#Input signals: ABP_fpr_signals
def bp_ground_truth(signals):
  """Compute the mean systolic and diastolic value for every signal.

  Args:
    signals: sequence of signals, each given as a sequence of cycles.

  Returns:
    List with one ``[mean systolic, mean diastolic]`` pair per signal. The
    systolic value of a cycle is its maximum; the diastolic value is its
    first sample (presumably the valley the cycle starts at when the cycles
    come from peak_segmentation -- verify against the caller).
  """
  gt_ls = []
  for cycles in signals:
    systolic = [max(cycle) for cycle in cycles]
    diastolic = [cycle[0] for cycle in cycles]
    pair = [np.average(np.asarray(systolic)),
            np.average(np.asarray(diastolic))]
    gt_ls.append(pair)

  return gt_ls

In [0]:
from scipy.signal import butter, lfilter
def butter_bandpass(lowcut, highcut, fs, order=4):
    """Design a Butterworth band-pass filter.

    Args:
        lowcut: lower cut-off frequency, in the same unit as ``fs``.
        highcut: upper cut-off frequency, in the same unit as ``fs``.
        fs: sampling frequency of the signal to be filtered.
        order: order passed to ``scipy.signal.butter``.

    Returns:
        The ``(b, a)`` coefficient pair returned by ``scipy.signal.butter``.
    """
    # butter expects the band edges as fractions of the Nyquist frequency
    nyquist = 0.5 * fs
    band = [lowcut / nyquist, highcut / nyquist]
    return butter(order, band, btype='band')


def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
    """Band-pass filter ``data`` with a Butterworth filter.

    The coefficients come from ``butter_bandpass`` and are applied with
    ``scipy.signal.lfilter``.

    Args:
        data: samples to filter.
        lowcut: lower cut-off frequency, in the same unit as ``fs``.
        highcut: upper cut-off frequency, in the same unit as ``fs``.
        fs: sampling frequency of ``data``.
        order: filter order handed to ``butter_bandpass``.

    Returns:
        The filtered samples as returned by ``lfilter``.
    """
    coeffs = butter_bandpass(lowcut, highcut, fs, order=order)
    return lfilter(coeffs[0], coeffs[1], data)

In [0]:
#use numba to improve the speed of for loop
@jit(nopython=True)
def hampel_filter_forloop_numba(input_series, window_size, n_sigmas=3):
    """Replace outliers by the local median (Hampel filter).

    For every sample that has ``window_size`` neighbours on both sides, the
    median and the scaled median absolute deviation of the surrounding
    ``2 * window_size + 1`` samples are computed. A sample further than
    ``n_sigmas`` scaled deviations from the median is replaced by the median.

    Args:
        input_series: 1-D float array; it is not modified.
        window_size: number of neighbours considered on each side.
        n_sigmas: rejection threshold in scaled deviations.

    Returns:
        A copy of ``input_series`` with the detected outliers replaced. The
        first and last ``window_size`` samples are copied unchanged.
    """
    n = len(input_series)
    new_series = input_series.copy()
    k = 1.4826 # scale factor for Gaussian distribution

    for i in range(window_size, n - window_size):
        # The slice end is exclusive, so "+ 1" is required to include the
        # neighbour at i + window_size and keep the window centred on i.
        window = input_series[(i - window_size):(i + window_size + 1)]
        x0 = np.nanmedian(window)
        S0 = k * np.nanmedian(np.abs(window - x0))
        if np.abs(input_series[i] - x0) > n_sigmas * S0:
            new_series[i] = x0

    return new_series

In [0]:
def process_data(directory):
  """Clean every ABP/PPG recording pair in ``directory`` and save the result.

  For each pair of ``ABP*.pkl`` / ``PPG*.pkl`` files (paired by sorted file
  name) the function
    1. drops recordings that are flat-lined in either signal,
    2. cuts both signals into 7500-sample segments,
    3. splits each ABP segment into cycles, drops segments with too many
       flat cycles (together with the matching PPG segment) and derives the
       mean systolic/diastolic label per segment,
    4. scales, band-pass filters and Hampel-filters the remaining PPG
       segments,
    5. pickles the PPG segments to ``BP_data_<k>`` and the labels to
       ``BP_label_<k>`` inside ``directory``.

  Args:
    directory: folder that holds the input pickles and receives the outputs.

  Returns:
    None.

  Raises:
    AssertionError: if the number of ABP and PPG files differs.
  """
  #samples per segment; the notebook treats one segment as one minute
  SEG_LEN = 7500
  #sampling rate implied by that convention: 7500 samples / 60 s = 125 Hz.
  #The filter used to be called with 300 Hz, which shifted the real pass band
  #away from the requested 0.5-8 Hz.
  FS = SEG_LEN / 60.0

  #load data
  #glob returns names in arbitrary order; sort both lists so that the k-th ABP
  #file is paired with the k-th PPG file
  ABP_names = sorted(glob.glob(os.path.join(directory, "ABP*.pkl")))
  PPG_names = sorted(glob.glob(os.path.join(directory, "PPG*.pkl")))
  assert len(ABP_names) == len(PPG_names), "number of ABP and PPG files differs"

  for file_idx, (ABP_name, PPG_name) in enumerate(zip(ABP_names, PPG_names)):

    print("processing", file_idx, "th data")

    #NOTE: pickle.load executes code embedded in the file; only use it on
    #files produced by a trusted source
    with open(ABP_name, "rb") as fp:
      ABP_raw_signals = pickle.load(fp)

    with open(PPG_name, "rb") as fp:
      PPG_raw_signals = pickle.load(fp)

    #remove flatlines: a recording is dropped from BOTH signals as soon as
    #either of them is flat-lined, so the pairs stay aligned
    _, PPG_rm_list = flat_line(PPG_raw_signals,0,percent=0.10)
    _, ABP_rm_list = flat_line(ABP_raw_signals,0,percent=0.10)
    removal_idx = sorted(set(ABP_rm_list) | set(PPG_rm_list))

    ABP_cl_signals = np.delete(ABP_raw_signals,removal_idx,0)
    PPG_cl_signals = np.delete(PPG_raw_signals,removal_idx,0)

    #segment into 1min data
    ABP_seg_signals = generate_segment_data(ABP_cl_signals, SEG_LEN)
    PPG_seg_signals = generate_segment_data(PPG_cl_signals, SEG_LEN)

    #PROCESS ABP SIGNAL
    #1.peak segmentation
    ABP_ps_signals = [peak_segmentation(seg) for seg in ABP_seg_signals]
    #2.flat peak removal
    ABP_fpr_signals, remove_index = flat_peak_remove(ABP_ps_signals,0.05,0.1)
    #3.remove corresponding PPG signal
    PPG_fpr_signals = np.delete(PPG_seg_signals,remove_index,0)
    #4.generate ground truth ABP
    gt_ls = bp_ground_truth(ABP_fpr_signals)

    #PROCESS PPG SIGNAL
    #1.standardize PPG signal
    PPG_norm_signals = [sklearn.preprocessing.robust_scale(seg) for seg in PPG_fpr_signals]
    #2.band pass filter on PPG signal
    PPG_bf_signals = [butter_bandpass_filter(seg,0.5,8,FS,order=4) for seg in PPG_norm_signals]
    #3. hampel filter
    PPG_hf_signals = [hampel_filter_forloop_numba(seg, 6) for seg in PPG_bf_signals]
    #4. resample signal
    ##PLACEHOLDER for resampling signal to a lower frequency, if needed
    with open(os.path.join(directory, "BP_data" + "_" + str(file_idx)), "wb") as fp:
      pickle.dump(PPG_hf_signals,fp)

    with open(os.path.join(directory, "BP_label" + "_" + str(file_idx)), "wb") as fp:
      pickle.dump(gt_ls,fp)


  return None

In [0]:
# Folder that holds the ABP*.pkl / PPG*.pkl inputs; process_data writes its
# output files into the same folder. Set the BP_DATA_DIR environment variable
# to run on another machine without editing this cell.
directory = os.environ.get("BP_DATA_DIR", 'D:/WFDB//matched/BP/')
process_data(directory)