## Data Processing

In [1]:
import numpy as np
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

import wfdb
from utils import qrs_detect, comp_cosEn, save_dict

#pd.set_option("display.max_rows", None, "display.max_columns", None)
%matplotlib inline

In [2]:
def load_record(sample_path):
    
    '''  returns signal, global label, local labels ''' 
    
    sig, fields = wfdb.rdsamp(sample_path)
    ann_ref = wfdb.rdann(sample_path, 'atr')
    
    #print(wfdb.rdsamp(sample_path))
    #print("\n\n", wfdb.rdann(sample_path, 'atr').aux_note)
    
    label = fields['comments'][0]
    fs = fields['fs']
    sig = sig[:, 1]
    length = len(sig)
    
    #print("Signal: ", sig)
    #print("\nLabel: ", label)
    
    beat_loc = np.array(ann_ref.sample) # r-peak locations
    ann_note = np.array(ann_ref.aux_note) # rhythm change flag
    
    return sig, length, fs, label, ann_note, beat_loc


In [7]:
def build_input():
    
    ''' Builds input DF '''

    DATA_PATH = "/Users/Hasan/Desktop/Workspace/cpsc2021-AFIB/afib_data"
    RESULT_PATH = "/Users/Hasan/Desktop/Workspace/cpsc2021-AFIB/afib_data/output"
    if not os.path.exists(RESULT_PATH):
        os.makedirs(RESULT_PATH)
        
    test_set = open(os.path.join(DATA_PATH, 'RECORDS'), 'r').read().splitlines()
    
    input_df = pd.DataFrame(columns=["Signal", "Signal Length", "Label"])
  
    for i, sample in enumerate(test_set):
        
        #print("\n\n\n", sample)
        sample_path = os.path.join(DATA_PATH, sample)
        sig, sig_len, fs, label, label_arr, beat_loc  = load_record(sample_path)
        
        input_df.at[i, 'Signal'] = sig
        input_df.at[i, 'Signal Length'] = sig_len
        input_df.at[i, 'Label'] = label
    
        #input_df.append(build_seq_input(sample_path))
        #pred_dict = challenge_entry(sample_path)
        
    return input_df
    
df = build_input()
df.to_csv("/Users/Hasan/Desktop/Workspace/cpsc2021-AFIB/afib_data/df.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1425 entries, 0 to 1424
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Signal         1425 non-null   object
 1   Signal Length  1425 non-null   object
 2   Label          1425 non-null   object
dtypes: object(3)
memory usage: 76.8+ KB


In [28]:
def build_chunked_input():
    
    ''' Builds chunked DF input  '''

    DATA_PATH = "/Users/Hasan/Desktop/Workspace/cpsc2021-AFIB/afib_data"
    RESULT_PATH = "/Users/Hasan/Desktop/Workspace/cpsc2021-AFIB/afib_data/output"
    if not os.path.exists(RESULT_PATH):
        os.makedirs(RESULT_PATH)
        
    test_set = open(os.path.join(DATA_PATH, 'RECORDS'), 'r').read().splitlines()[0:10]
    seconds = 30
    chunksize = seconds*200
    
    input_df = pd.DataFrame(columns=["Signal", "Granular Labels", "Sequence Label", "Chunk Label", "Signal Length"])
  
    for i, sample in enumerate(test_set):
        
        #print("\n\n\n", sample)
        sample_path = os.path.join(DATA_PATH, sample)
        sig, sig_len, fs, label, label_arr, beat_loc = load_record(sample_path)
        loc_labels = ['non atrial fibrillation']*sig_len
        r_peaks = beat_loc #qrs_detect(sig, fs)
        
        printlabelarr = False 
        af_ranges = []
        
        ## Calculate exact AF ranges in sequence
        ''' Label arr acts as an index for which peaks in r_peaks are afib; '''
        for l in label_arr:
            if l == "(AFIB":
                #print("AFIB detected")
                printlabelarr = True
                start = int(r_peaks[np.where(label_arr == l)[0][0]])
                #print(np.where(label_arr == l)[0])
                #loc_labels[ r_peaks[ label_arr.index(mini_label) ]] = 'AFIB'
            if l == "(N":
                stop = int(r_peaks[np.where(label_arr == l)[0][0]])
                af_ranges.append((start,stop))
        
        ## Label AF for AF sections of the signal
        for rng in af_ranges:
            start = rng[0]
            stop = rng[1]
            #print("Signal AF start/stop ranges: ", start, stop)
            #print(r_peaks[start], r_peaks[stop])
            loc_labels[ start : stop ] = ['AFIB'] * (stop-start) 
        
        '''  
        print("Signal: ", sig)
        print("Signal len: ", sig_len)
        print("QRS Peak locations: ", r_peaks)
        print(len(r_peaks))
        
        
        if printlabelarr: 
            print("Label Arr: ", label_arr)
            print(len(label_arr))
            print("AF ranges: ", af_ranges)
            print("Granular Labels: ", loc_labels)
            print(len(loc_labels))
        print("Sequence Label: ", label)
        '''
         
        chunked_sig = np.array_split(sig, chunksize)
        chunked_label = np.array_split(loc_labels, chunksize)

        input_df.at[i, 'Signal'] = chunked_sig
        input_df.at[i, 'Granular Labels'] = chunked_label
        input_df.at[i, 'Signal Length'] = sig_len
        input_df.at[i, 'Sequence Label'] = label
    
        #input_df.append(build_seq_input(sample_path))
        #pred_dict = challenge_entry(sample_path)
        
    input_df = input_df.explode(["Signal", "Granular Labels"])
    
    
    input_df["Chunk Label"] = input_df["Granular Labels"].apply(lambda x: Counter(x).most_common(1)[0][0])
    input_df = input_df.drop(['Granular Labels'], axis=1)
    return input_df
    
chunk_df = build_chunked_input()
chunk_df.to_csv("/Users/Hasan/Desktop/Workspace/cpsc2021-AFIB/afib_data/chunk_df.csv")
chunk_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60000 entries, 0 to 9
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Signal           60000 non-null  object
 1   Granular Labels  60000 non-null  object
 2   Sequence Label   60000 non-null  object
 3   Chunk Label      60000 non-null  object
 4   Signal Length    60000 non-null  object
dtypes: object(5)
memory usage: 2.7+ MB


In [26]:
chunk_df.head()

Unnamed: 0,Signal,Granular Labels,Sequence Label,Chunk Label,Signal Length
0,"[0.06019023062191457, 0.07659801947788845, 0.0...","[non atrial fibrillation, non atrial fibrillat...",non atrial fibrillation,non atrial fibrillation,520088
0,"[0.0381714880638978, 0.022783872608295345, 0.0...","[non atrial fibrillation, non atrial fibrillat...",non atrial fibrillation,non atrial fibrillation,520088
0,"[-0.30613703456146096, -0.2464568906397321, -0...","[non atrial fibrillation, non atrial fibrillat...",non atrial fibrillation,non atrial fibrillation,520088
0,"[0.7537381139744274, 0.8215796450991277, 1.004...","[non atrial fibrillation, non atrial fibrillat...",non atrial fibrillation,non atrial fibrillation,520088
0,"[-0.1860116166677247, -0.16875368331144128, -0...","[non atrial fibrillation, non atrial fibrillat...",non atrial fibrillation,non atrial fibrillation,520088


In [None]:
plt.plot(df["Signal"][0][:(30*200)])