In [75]:
import numpy as np
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

import wfdb
from utils import qrs_detect, comp_cosEn, save_dict
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings('ignore')


pd.set_option("display.max_rows", None, "display.max_columns", None)
%matplotlib inline

## Data Preprocessing

In [76]:
def load_record(sample_path):
    
    '''  returns signal, global label, local labels ''' 
    
    sig, fields = wfdb.rdsamp(sample_path)
    ann_ref = wfdb.rdann(sample_path, 'atr')
    
    #print(wfdb.rdsamp(sample_path))
    #print("\n\n", wfdb.rdann(sample_path, 'atr').aux_note)
    
    label = fields['comments'][0]
    fs = fields['fs']
    sig = sig[:, 1]
    length = len(sig)
    
    #print("Signal: ", sig)
    #print("\nLabel: ", label)
    
    beat_loc = np.array(ann_ref.sample) # r-peak locations
    ann_note = np.array(ann_ref.aux_note) # rhythm change flag
    
    return sig, length, fs, label, ann_note, beat_loc


In [77]:
def normalize(signal):

    values = signal
    values = values.reshape((len(values), 1))
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler = scaler.fit(values)
    normalized = scaler.transform(values)
    normalized = [item for sublist in normalized for item in sublist]
    
    #print(normalized)

    return normalized

In [None]:
def build_input():
    
    ''' Builds input DF '''

    DATA_PATH = "/Users/Hasan/Desktop/Workspace/cpsc2021-AFIB/afib_data"
    RESULT_PATH = "/Users/Hasan/Desktop/Workspace/cpsc2021-AFIB/afib_data/output"
    if not os.path.exists(RESULT_PATH):
        os.makedirs(RESULT_PATH)
        
    test_set = open(os.path.join(DATA_PATH, 'RECORDS'), 'r').read().splitlines()[0:10]
    
    input_df = pd.DataFrame(columns=["Signal", "Signal Length", "Label"])
  
    for i, sample in enumerate(test_set):
        
        #print("\n\n\n", sample)
        sample_path = os.path.join(DATA_PATH, sample)
        sig, sig_len, fs, label, label_arr, beat_loc  = load_record(sample_path)
        
        #sig = normalize_signal(sig)
        
        input_df.at[i, 'Signal'] = sig
        input_df.at[i, 'Signal Length'] = sig_len
        input_df.at[i, 'Label'] = label
    
        #input_df.append(build_seq_input(sample_path))
        #pred_dict = challenge_entry(sample_path)
        
    return input_df
    
df = build_input()
df.to_pickle("/Users/Hasan/Desktop/Workspace/cpsc2021-AFIB/afib_data/df.pkl")
df.info()

In [78]:
def chunks(lst, n):
    return [lst[i:i + n] for i in range(0, len(lst), n)]

In [384]:
def build_chunked_input(start, stop):
        
    ''' Builds chunked DF input  '''

    DATA_PATH = "/Users/Hasan/Desktop/Workspace/cpsc2021-AFIB/afib_data"
    RESULT_PATH = "/Users/Hasan/Desktop/Workspace/cpsc2021-AFIB/afib_data/output"
    if not os.path.exists(RESULT_PATH):
        os.makedirs(RESULT_PATH)
        
    test_set = open(os.path.join(DATA_PATH, 'RECORDS'), 'r').read().splitlines()[start:stop]
    seconds = 30
    chunksize = seconds*200
    
    input_df = pd.DataFrame(columns=["Sequence Number", "Sequence Name", "Signal", "Granular Labels", "Sequence Label", "Chunk Label", "Signal Length", "AF Burden"])
  
    #test_set=["Training_set_II/data_86_9"]
    
    for i, sample in enumerate(test_set):
        
        #print("\n\n\n", sample)
        #print(i, end='\r')
        sample_path = os.path.join(DATA_PATH, sample)
        sig, sig_len, fs, label, label_arr, beat_loc = load_record(sample_path)
        loc_labels = [0]*sig_len
        r_peaks = beat_loc #qrs_detect(sig, fs)
        
        #print(sig)
        sig = normalize(sig)
        
        printlabelarr = False 
        af_ranges = []
        af_range = []
        
        ## Calculate exact AF ranges in sequence
        ''' Label arr acts as an index for which peaks in r_peaks are afib; '''
        
        #print('Label arr: ', label_arr)
        #print("\nQRS Peak locations: ", r_peaks)
        
        for li, l in enumerate(label_arr):
            if l == "(AFIB" or l == "(AFL":
                #print("AFIB detected")
                printlabelarr = True
                start = r_peaks[li]
                af_range.append(start)
                #print(np.where(label_arr == l)[0])
                #loc_labels[ r_peaks[ label_arr.index(mini_label) ]] = 'AFIB'
            if l == "(N":
                stop = r_peaks[li]
                af_range.append(stop)
                af_ranges.append(af_range)
                af_range = []
        
        #print("\n", af_ranges)
        ## Label AF for AF sections of the signal
        for rng in af_ranges:
            start = rng[0]
            stop = rng[1]
            #print("Signal AF start/stop ranges: ", start, stop)
            #print(r_peaks[start], r_peaks[stop])
            loc_labels[ start : stop ] = [1] * (stop-start) 
        
       
        #print("Signal: ", sig)
        #print("Signal len: ", sig_len)
        
        #print(len(r_peaks))
        
        
        #if printlabelarr: 
            #print("Label Arr: ", label_arr)
            #print(len(label_arr))
            #print("AF ranges: ", af_ranges)
            #print("Granular Labels: ", loc_labels[0:10])
            #print(len(loc_labels))
        #print("Sequence Label: ", label)
       
        #chunked_sig   = np.array_split(sig, chunksize)
        #chunked_label = np.array_split(loc_labels, chunksize)
        
        chunked_sig = chunks(sig, chunksize)[:-1]
        chunked_label = chunks(loc_labels, chunksize)[:-1]
        
        burden=0
        for rng in af_ranges:
            burden+=rng[1]-rng[0] 
        burden = burden/sig_len
            
        #print('Burden: ', burden)

        input_df.at[i, 'Sequence Number'] = i
        input_df.at[i, 'Sequence Name'] = sample
        input_df.at[i, 'Signal'] = chunked_sig
        input_df.at[i, 'Granular Labels'] = chunked_label
        input_df.at[i, 'Signal Length'] = sig_len
        input_df.at[i, 'Sequence Label'] = label
        input_df.at[i, 'AF Burden'] = burden
        
    
        #input_df.append(build_seq_input(sample_path))
        #pred_dict = challenge_entry(sample_path)
        
    input_df = input_df.explode(["Signal", "Granular Labels"])
    
    #print("\n", input_df["Granular Labels"])
    
    #input_df["Chunk Label"] = input_df["Granular Labels"].apply(lambda x: Counter(x).most_common(1)[0][0])
    #input_df["Chunk Label"] = input_df["Granular Labels"].apply(lambda x: Counter(x).most_common(1)[0][0])
    input_df["Chunk Label"] = input_df["Granular Labels"].str[0]
     
    input_df = input_df.drop(['Granular Labels'], axis=1)
        
    return input_df

dfs = []

for i in range(0,1):
    df = build_chunked_input(0+(600*i), 600+(600*i))
    print(df.shape)
    dfs.append(df)   
    
chunk_df = pd.concat(dfs)
chunk_df.dropna(inplace=True)
chunk_df.info(null_counts=True)

(31783, 7)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 31779 entries, 0 to 599
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Sequence Number  31779 non-null  object 
 1   Sequence Name    31779 non-null  object 
 2   Signal           31779 non-null  object 
 3   Sequence Label   31779 non-null  object 
 4   Chunk Label      31779 non-null  float64
 5   Signal Length    31779 non-null  object 
 6   AF Burden        31779 non-null  object 
dtypes: float64(1), object(6)
memory usage: 1.9+ MB


In [183]:
chunk_df.to_pickle("/Users/Hasan/Desktop/Workspace/cpsc2021-AFIB/afib_data/chunk_df.pkl")

In [374]:
chunk_df['Chunk Label'].value_counts()

0.0    23996
1.0     7783
Name: Chunk Label, dtype: int64

## Model

In [85]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Embedding, MaxPooling1D
from keras.layers import Conv1D, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

from sklearn.preprocessing import LabelEncoder
import time
from keras import metrics

In [234]:
from sklearn.model_selection import train_test_split

X = pd.DataFrame(chunk_df['Signal'].tolist())
y = pd.DataFrame(chunk_df['Chunk Label'].tolist())

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

((8445, 6000), (3620, 6000), (8445, 1), (3620, 1))

In [387]:
chunk_df.tail()

Unnamed: 0,Sequence Number,Sequence Name,Signal,Sequence Label,Chunk Label,Signal Length,AF Burden
599,599,Training_set_I/data_41_11,"[0.4543817618604368, 0.45355774953077077, 0.45...",non atrial fibrillation,0.0,586960,0.0
599,599,Training_set_I/data_41_11,"[0.1168876749118764, 0.10947156394488272, 0.10...",non atrial fibrillation,0.0,586960,0.0
599,599,Training_set_I/data_41_11,"[0.45461065417423285, 0.46408679596539154, 0.4...",non atrial fibrillation,0.0,586960,0.0
599,599,Training_set_I/data_41_11,"[0.5242549555185937, 0.5244075503944579, 0.524...",non atrial fibrillation,0.0,586960,0.0
599,599,Training_set_I/data_41_11,"[0.4555262234294173, 0.4564875711473609, 0.426...",non atrial fibrillation,0.0,586960,0.0


In [390]:
chunk_df.sort_values(by=['Sequence Number'], inplace=True)

In [392]:
train_df = chunk_df[chunk_df['Sequence Number'] < 400]
test_df = chunk_df[chunk_df['Sequence Number'] >= 400]

In [393]:
#train_df = chunk_df.sample(frac = 0.7)
#test_df = chunk_df.sample(frac = 0.3)

X_train = pd.DataFrame(train_df['Signal'].tolist())
X_test = pd.DataFrame(test_df['Signal'].tolist())

y_train = pd.DataFrame(train_df['Chunk Label'].tolist())
y_test = pd.DataFrame(test_df['Chunk Label'].tolist())

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

KeyboardInterrupt: 

In [376]:
batch = 16
epochs = 1
shape = np.size(X_train, 1)


model = Sequential()
model.add(Dense(100, activation='relu', input_shape = (shape,1)))
model.add(Conv1D(100, 10, activation='relu'))
model.add(Conv1D(100, 10, activation='relu'))
model.add(MaxPooling1D(3))
model.add(Conv1D(160, 10, activation='relu'))
model.add(Conv1D(160, 10, activation='relu'))
model.add(GlobalAveragePooling1D())
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid')) 
model.summary()
model.compile(loss='binary_crossentropy',
        optimizer='rmsprop',
        metrics=['accuracy'])

X_train = np.expand_dims(X_train, 2)
X_test = np.expand_dims(X_test, 2)


model.fit(X_train,y_train, batch_size = batch, epochs = epochs)
score = model.evaluate(X_test, y_test, batch_size = batch)
score

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_22 (Dense)            (None, 6000, 100)         200       
                                                                 
 conv1d_36 (Conv1D)          (None, 5991, 100)         100100    
                                                                 
 conv1d_37 (Conv1D)          (None, 5982, 100)         100100    
                                                                 
 max_pooling1d_11 (MaxPoolin  (None, 1994, 100)        0         
 g1D)                                                            
                                                                 
 conv1d_38 (Conv1D)          (None, 1985, 160)         160160    
                                                                 
 conv1d_39 (Conv1D)          (None, 1976, 160)         256160    
                                                     

OverflowError: cannot convert float infinity to integer

In [377]:
score = model.evaluate(X_test, y_test, batch_size = batch)
score



OverflowError: cannot convert float infinity to integer

In [380]:
X_test

array([], shape=(0, 0, 1), dtype=float64)

## Evaluation

In [335]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

threshold = 0.5

y_pred = model.predict(X_test, batch_size = batch)
y_test['y_pred'] = y_pred[:,0] > threshold
if "index" in y_test.columns: y_test.drop('index', inplace=True)
y_test.columns=['y_true', 'y_pred']
y_test['y_pred'] = y_test['y_pred'].astype(float)

cm = confusion_matrix(y_test['y_true'], y_test['y_pred'])

print("\n Accuracy: ", score[1])
print("\n Loss: ", score[0])
print("\n F1 score: ", f1_score(y_test['y_true'], y_test['y_pred'], average='macro'))
print("\n Confusion Matrix: \n", cm)


 Accuracy:  0.7046818733215332

 Loss:  0.6038762927055359

 F1 score:  0.613558219299176

 Confusion Matrix: 
 [[1983  135]
 [ 849  365]]


In [336]:
test_df.reset_index(inplace=True)
y_test.reset_index(inplace=True)

final = pd.concat([test_df, y_test], axis=1)

In [337]:
train_df[train_df["Sequence Number"]==2]

Unnamed: 0,Sequence Number,Sequence Name,Signal,Sequence Label,Chunk Label,Signal Length,AF Burden
2,2,Training_set_I/data_31_1,"[0.2241466131567302, 0.20606412036683808, 0.20...",paroxysmal atrial fibrillation,0.0,30483,0.142079
2,2,Training_set_I/data_31_1,"[0.2067202783330536, 0.214350022126257, 0.2071...",paroxysmal atrial fibrillation,0.0,30483,0.142079
2,2,Training_set_I/data_31_1,"[0.2526513359681382, 0.2470968824866861, 0.233...",paroxysmal atrial fibrillation,0.0,30483,0.142079
2,2,Training_set_I/data_31_1,"[0.2170051729662918, 0.21953824790563534, 0.21...",paroxysmal atrial fibrillation,0.0,30483,0.142079
2,2,Training_set_I/data_31_1,"[0.4459127462499809, 0.4473318785955167, 0.454...",paroxysmal atrial fibrillation,0.0,30483,0.142079


In [338]:
test_df[test_df["Sequence Number"]==2]

Unnamed: 0,index,Sequence Number,Sequence Name,Signal,Sequence Label,Chunk Label,Signal Length,AF Burden


In [370]:
final.head()

Unnamed: 0,index,Sequence Number,Sequence Name,Signal,Sequence Label,Chunk Label,Signal Length,AF Burden,index.1,y_true,y_pred,y_pred_2
0,150,150,Training_set_I/data_24_25,"[0.5300840797766009, 0.5392092533532726, 0.548...",persistent atrial fibrillation,1.0,21969,0.999954,0,1.0,0.0,0.0
1,150,150,Training_set_I/data_24_25,"[0.4437764179878836, 0.4480338150244916, 0.463...",persistent atrial fibrillation,1.0,21969,0.999954,1,1.0,0.0,0.0
2,150,150,Training_set_I/data_24_25,"[0.49362916393267486, 0.5075915950742376, 0.50...",persistent atrial fibrillation,1.0,21969,0.999954,2,1.0,0.0,0.0
3,151,151,Training_set_I/data_53_6,"[0.2733737201104787, 0.31974730288556913, 0.32...",non atrial fibrillation,0.0,53095,0.0,3,0.0,0.0,0.0
4,151,151,Training_set_I/data_53_6,"[0.28548975325408577, 0.2827125265133597, 0.24...",non atrial fibrillation,0.0,53095,0.0,4,0.0,0.0,0.0


In [339]:
final['y_pred_2'] = final['y_pred']

In [371]:
final2 = (final.drop(columns=['Signal'])
      .groupby(['Sequence Number', 'Sequence Label', 'AF Burden', 'Signal Length'])
      .agg({'Chunk Label': lambda x: x.tolist() , 'y_pred': lambda x: x.tolist(), 'y_pred_2':'sum', })
      .rename({'y_pred_2' : 'AF Episodes'},axis=1)
      .reset_index())
final2['Predicted AF Burden'] = (final2['AF Episodes']*30*200) / final2['Signal Length']

In [372]:
final2[final2['AF Episodes']!=0]

Unnamed: 0,Sequence Number,Sequence Label,AF Burden,Signal Length,Chunk Label,y_pred,AF Episodes,Predicted AF Burden
9,159,persistent atrial fibrillation,1.0,4742450,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",365.0,0.461787
25,175,non atrial fibrillation,0.0,401824,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",45.0,0.671936
28,178,non atrial fibrillation,0.0,414798,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, ...",63.0,0.911287
37,187,non atrial fibrillation,0.0,869143,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",6.0,0.04142
44,194,non atrial fibrillation,0.0,420518,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",20.0,0.285362
47,197,paroxysmal atrial fibrillation,0.052267,99183,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,0.060494


In [369]:
365*30*200

2190000