In [1]:
from tqdm import tqdm
import numpy as np
import pandas as pd
#from utils import extract_basic_features

#import wfdb
import os
#import wfdb.processing as wp
import matplotlib.pyplot as plt
from scipy import signal
#from utils import find_noise_features, extract_basic_features
import shutil
import gc
import time
import random as rn
#from lightgbm import LGBMClassifier
from scipy import sparse
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold,StratifiedKFold
#from xgboost import XGBClassifier

import warnings
import scipy.io as sio

#from resnet_ecg.utils import one_hot,get_batches
from resnet_ecg.ecg_preprocess import ecg_preprocessing


from keras.utils import to_categorical
from keras.optimizers import SGD,Adam
from keras.callbacks import ModelCheckpoint, LearningRateScheduler,EarlyStopping,ReduceLROnPlateau
import tensorflow as tf
import keras.backend.tensorflow_backend as KTF


# Root directory of the dataset; the "Train"/"Val" folders and the CSV files
# read further down are all located relative to it.
path = '/media/jdcloud/'

# Silence every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# Single-threaded TensorFlow ops and a cap of 80% of GPU memory for this
# process; the session is installed as the Keras backend session.
config = tf.ConfigProto(intra_op_parallelism_threads=1,inter_op_parallelism_threads=1)
config.gpu_options.per_process_gpu_memory_fraction = 0.8
session = tf.Session(config=config)
KTF.set_session(session )

# Fixed seeds for Python's `random`, NumPy and TensorFlow.
# NOTE(review): hash randomization is configured when the interpreter starts,
# so assigning PYTHONHASHSEED here presumably only affects child processes --
# confirm whether it should be exported before launching Python instead.
os.environ['PYTHONHASHSEED'] = '0'
np.random.seed(42)
rn.seed(12345)
tf.set_random_seed(1234)

class Config(object):
    """Container of hyper-parameter values for the model and its training."""

    def __init__(self):
        # Convolution stack settings.
        self.conv_num_filters_start = 12
        self.conv_filter_length = 32
        self.conv_subsample_lengths = [1, 2] * 4
        self.conv_increase_channels_at = 2
        self.conv_num_skip = 2
        self.conv_init = "he_normal"
        self.conv_activation = "relu"
        self.conv_dropout = 0.5

        # Data / training settings (128 and [1280, 1] were earlier values).
        self.input_shape = [2560, 12]
        self.num_categories = 2
        self.batch_size = 32

    @staticmethod
    def lr_schedule(epoch):
        """Return (and print) the step-wise learning rate for ``epoch``.

        0.1 for epochs below 10, 0.01 for epochs 10-19, 0.001 from epoch 20.
        """
        if epoch >= 20:
            rate = 0.001
        elif epoch >= 10:
            rate = 0.01
        else:
            rate = 0.1
        print('Learning rate: ', rate)
        return rate


import keras.backend as K


def precision(y_true, y_pred):
    """Keras metric: precision over the tensors passed in.

    Both the element-wise product ``y_true * y_pred`` and ``y_pred`` are
    clipped to [0, 1] and rounded before being summed, so the counts are
    taken over 0/1 values. ``K.epsilon()`` keeps the division defined when
    nothing is predicted positive.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(flagged) + K.epsilon())


def recall(y_true, y_pred):
    """Keras metric: recall over the tensors passed in.

    Both the element-wise product ``y_true * y_pred`` and ``y_true`` are
    clipped to [0, 1] and rounded before being summed, so the counts are
    taken over 0/1 values. ``K.epsilon()`` keeps the division defined when
    there are no positive targets.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    relevant = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(relevant) + K.epsilon())


def fbeta_score(y_true, y_pred, beta=1):
    """Keras metric: F-beta score, the weighted harmonic mean of precision and recall.

    Args:
        y_true: target tensor.
        y_pred: prediction tensor.
        beta: non-negative weight of recall relative to precision
            (``beta=1`` gives the F1 score).

    Returns:
        A backend tensor holding the score.

    Raises:
        ValueError: if ``beta`` is negative.
    """
    if beta < 0:
        raise ValueError('The lowest choosable beta is zero (only precision).')

    # The former "no true positives -> return 0" guard compared a backend
    # tensor with the Python int 0 inside an ``if``. On a symbolic tensor that
    # test is evaluated once while the graph is built (not per batch), so the
    # branch never did its job. It is also unnecessary: with no positive
    # targets both precision and recall are 0, the numerator below is 0 and
    # the epsilon keeps the denominator non-zero, so the score is already 0.
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    bb = beta ** 2
    return (1 + bb) * (p * r) / (bb * p + r + K.epsilon())


def fmeasure(y_true, y_pred):
    """Keras metric: F1 score, i.e. ``fbeta_score`` with ``beta`` fixed to 1."""
    f1 = fbeta_score(y_true, y_pred, beta=1)
    return f1



def read_data_seg(data_path, split="Train", preprocess=False, fs=500, newFs=256, winSecond=10, winNum=10, n_index=0):
    """Load one resampled window from every recording in ``data_path/split``.

    Every file of the split directory is read with ``scipy.io.loadmat``. A
    window of ``winSecond * fs`` samples starting at ``n_index * slide_steps``
    is cut from each of the twelve leads, Fourier-resampled to
    ``winSecond * newFs`` samples and, when ``preprocess`` is true, passed
    through ``ecg_preprocessing``.

    Args:
        data_path: directory that contains the split sub-directory.
        split: name of the sub-directory to read.
        preprocess: run ``ecg_preprocessing`` on every resampled lead.
        fs: sampling rate of the stored recordings.
        newFs: sampling rate of the returned windows.
        winSecond: window length in seconds.
        winNum: number of windows a recording is considered to span; samples
            beyond ``fs * winNum * winSecond`` are ignored.
        n_index: which window to extract (0 is the start of the recording).

    Returns:
        float32 array of shape ``(n_files, winSecond * newFs, 12)``. Rows
        follow the sorted file names, columns follow ``channel_name`` below.
        Rows of recordings shorter than 4500 samples are left as zeros.

    Notes:
        * A too-short recording used to ``break`` out of the loop, silently
          leaving every later file as zeros; it is now skipped on its own.
        * The window stride is clamped at 0. Before, recordings between 4500
          and ``winSize`` samples produced a negative stride, so windows with
          ``n_index > 0`` were taken from the tail of the signal and held
          only a handful of samples.
        * NOTE(review): the stride divides by ``winSecond`` rather than
          ``winNum``; the two coincide at the defaults -- confirm the intent.
        * Assumes each lead is stored as a ``(1, n_samples)`` array (the
          length is read from ``ecg["I"].shape[1]``) -- TODO confirm.
    """
    n_channels = 12
    min_length = 4500  # recordings shorter than this are skipped
    winSize = winSecond * fs
    new_winSize = winSecond * newFs
    max_length = fs * winNum * winSecond

    path_signals = os.path.join(data_path, split)
    channel_files = sorted(os.listdir(path_signals))

    # Lead order of the last axis of the result.
    channel_name = ['V6', 'aVF', 'I', 'V4', 'V2', 'aVL', 'V1', 'II', 'aVR', 'V3', 'III', 'V5']

    # Allocate directly as float32 instead of float64 followed by a copy.
    X = np.zeros((len(channel_files), new_winSize, n_channels), dtype='float32')

    for i_ch, fil_ch in enumerate(channel_files):

        if i_ch % 2000 == 0:
            print(i_ch)

        ecg = sio.loadmat(os.path.join(path_signals, fil_ch))
        ecg_length = ecg["I"].shape[1]

        if ecg_length > max_length:
            print(" too long !!!", ecg_length)
            ecg_length = max_length
        if ecg_length < min_length:
            print(" too short !!!", ecg_length)
            continue  # leave this row as zeros, keep loading the others

        slide_steps = max(int((ecg_length - winSize) / winSecond), 0)
        start = n_index * slide_steps

        for i_n, ch_name in enumerate(channel_name):
            window = ecg[ch_name][:, start:start + winSize]
            lead = signal.resample(window, new_winSize, axis=1)
            lead = lead.astype('float32').reshape(new_winSize)
            if preprocess:
                lead = ecg_preprocessing(lead.reshape(1, new_winSize), 'sym8', 8, 3, newFs)[0]
            X[i_ch, :, i_n] = lead

    return X

def read_train_data(path):
    """Load the ten sliding windows of the default ("Train") split.

    Element ``k`` of the returned list is ``read_data_seg(path, n_index=k)``
    for ``k`` in 0..9. A garbage-collection pass is run before returning.
    """
    X = [read_data_seg(path, n_index=window_no) for window_no in range(10)]

    gc.collect()

    return X

def read_test_data(path):
    """Load the ten sliding windows of the "Val" split.

    Element ``k`` of the returned list is
    ``read_data_seg(path, split='Val', n_index=k)`` for ``k`` in 0..9.
    A garbage-collection pass is run before returning.
    """
    test_x = [read_data_seg(path, split='Val', n_index=window_no)
              for window_no in range(10)]

    gc.collect()

    return test_x

def preprocess_y(labels,y,num_class=9):
    """Build a multi-hot label matrix for selected rows of ``labels``.

    Args:
        labels: DataFrame whose first column is skipped and whose remaining
            non-missing cells in a row are class ids (presumably the record
            name followed by label columns -- verify against the CSV).
        y: sequence of index labels of ``labels`` to encode, in output order.
        num_class: number of columns of the result.

    Returns:
        int8 array of shape ``(len(y), num_class)`` with a 1 at
        ``[row, class_id]`` for every class id listed for that record.
    """
    bin_label = np.zeros((len(y), num_class), dtype='int8')
    for row, record_id in enumerate(y):
        present = labels.loc[record_id].dropna()
        # ``.iloc`` makes the positional access explicit; plain ``series[j]``
        # with an integer on a label-indexed Series is a deprecated fallback.
        for class_id in present.iloc[1:]:
            bin_label[row, int(class_id)] = 1
    return bin_label


def add_compile(model, config):
    """Compile ``model`` in place with binary cross-entropy and Adam.

    The metrics reported are accuracy, ``fmeasure`` and ``precision``.

    Args:
        model: Keras model to compile.
        config: unused; kept so existing callers keep working. The function
            used to build an SGD optimizer from ``config.lr_schedule(0)``
            that was never handed to ``compile`` -- its only visible effect
            was printing a learning rate that training did not use -- so
            that dead construction has been removed.
    """
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy', fmeasure, precision])

if __name__ == '__main__':
    # Script entry point: reads the label tables, builds the oversampled
    # training index and loads the ten sliding windows of both splits. The
    # names defined here (labels, bin_label, train_index, labels_en,
    # ecg12_seg0..9, test_x, val_files, en_amount, ...) are used by the
    # notebook cells that follow.

    train_dataset_path = path + "/Train/"
    val_dataset_path = path + "/Val/"

    train_files = sorted(os.listdir(train_dataset_path))
    val_files = sorted(os.listdir(val_dataset_path))

    labels = pd.read_csv(path+"reference.csv")

    # Multi-hot ground truth: row i gets a 1 in every class column listed for
    # record i. The first non-missing cell of a row is skipped. ``.iloc`` keeps
    # the access positional without relying on the deprecated integer
    # fallback of ``Series.__getitem__``.
    bin_label = np.zeros((6500,9))
    for i in range(labels.shape[0]):
        label_nona = labels.loc[i].dropna()
        for class_id in label_nona.iloc[1:]:
            bin_label[i, int(class_id)] = 1

    cv_pred_all = 0
    en_amount = 1

    labels_en = pd.read_csv(path + "kfold_labels_en.csv")

    data_info = pd.read_csv(path + "data_info.csv")

    train_index = np.arange(6500).astype('int16')

    label2_list = data_info[data_info.labels_num == 2].index.tolist()
    label3_list = data_info[data_info.labels_num == 3].index.tolist()
    label4_list = data_info[data_info.labels_num == 4].index.tolist()
    label5_list = data_info[data_info.labels_num == 5].index.tolist()
    label6_list = data_info[data_info.labels_num == 6].index.tolist()

    # Oversampling: a record whose ``labels_num`` is k (presumably its number
    # of labels) is duplicated k - 1 times in the training index.
    oversampling_plan = (
        (label2_list, 1),
        (label3_list, 2),
        (label4_list, 3),
        (label5_list, 4),
        (label6_list, 5),
    )
    for rows, extra_copies in oversampling_plan:
        for _ in range(extra_copies):
            train_index = np.insert(train_index, rows, rows)

    train_index = train_index.astype(np.int16)

    train_index.sort()

    print("train_index shape :",train_index.shape)

    # Ten sliding windows of the training split, kept under individual names
    # because later cells index them one by one.
    (ecg12_seg0, ecg12_seg1, ecg12_seg2, ecg12_seg3, ecg12_seg4,
     ecg12_seg5, ecg12_seg6, ecg12_seg7, ecg12_seg8, ecg12_seg9) = [
        read_data_seg(path, n_index=window_no) for window_no in range(10)
    ]

    # Ten sliding windows of the "Val" split, in window order.
    test_x = [read_data_seg(path, split='Val', n_index=window_no)
              for window_no in range(10)]

    gc.collect()

Using TensorFlow backend.


train_index shape : (7703,)
0
2000
4000
6000
0
2000
4000
6000
0
2000
4000
6000
0
2000
4000
6000
0
2000
4000
6000
0
2000
4000
6000
0
2000
4000
6000
0
2000
4000
6000
0
2000
4000
6000
0
2000
4000
6000
0
0
0
0
0
0
0
0
0
0


In [2]:
from resnet_ecg import attentionmodel  

from keras.layers import Input
from keras.models import Model,load_model

'''   '''
# One (2560, 12) float32 input per sliding window; the list order is the order
# in which the window arrays are passed to the model.
inputs_list = [Input(shape=(2560,12),dtype="float32") for _ in range(10)]
(inputs0, inputs1, inputs2, inputs3, inputs4,
 inputs5, inputs6, inputs7, inputs8, inputs9) = inputs_list

outputs = attentionmodel.build_network(inputs_list,0.5,num_classes=9,block_size=4,relu=False)

model = Model(inputs =inputs_list,outputs=outputs)

#print(model.summary())

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [4]:
#print("train_x shape :", train_x.shape)
       
model_path = './attention_model/'#'./model/'

# The records scored below are the first 6500 training records for every seed
# and fold, so the inputs and their label matrix are prepared once. Slices are
# views: the previous per-fold fancy indexing copied all ten window arrays on
# every fold only to delete the copies again.
index_vld = np.arange(6500).astype('int16')#train_index[index_valid]

X_vld = [ecg12_seg0[:6500], ecg12_seg1[:6500], ecg12_seg2[:6500], ecg12_seg3[:6500],
         ecg12_seg4[:6500], ecg12_seg5[:6500], ecg12_seg6[:6500], ecg12_seg7[:6500],
         ecg12_seg8[:6500], ecg12_seg9[:6500],
        ]

# Not consumed in this cell; kept so the name stays available to other cells.
y_vld = preprocess_y(labels,index_vld)

for seed in range(en_amount):
    print("************************")
    n_fold = 3
    n_classes = 9

    # The split is only used to enumerate the folds (the weight files are
    # named by seed and fold number); its index arrays are not needed here.
    kfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
    kf = kfold.split(train_index, labels_en['label1'])

    # NOTE: re-created for every seed, so only the last seed's predictions
    # survive the loop.
    blend_train = np.zeros((6500, n_fold, n_classes), dtype='float32') #len(train_x)
    blend_test = np.zeros((500, n_fold, n_classes), dtype='float32') #len(test_x)

    count = 0

    for i, _ in enumerate(kf):
        print('fold: ', i+1, ' training')

        # Score both sets with the best checkpoint of this seed/fold.
        model.load_weights(model_path+'attention_weights-best_k{}_r{}.hdf5'.format(seed, i))

        test_y = model.predict(test_x)
        val_y = model.predict(X_vld)

        blend_train[:,i, :] = val_y
        blend_test[:, i, :] = test_y

        gc.collect()

        count += 1


************************
fold:  1  training
fold:  2  training
fold:  3  training


In [5]:
blend_train.shape

(6500, 3, 9)

In [6]:
np.argmax(blend_train,axis=1)[-20:]

array([0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [20]:
np.argmax(blend_train[-1])

0

In [21]:
# Weighted blend of the per-fold predictions on the training records
# (fold 0: 0.0, fold 1: 0.2, fold 2: 0.8); the per-class thresholds are tuned
# on this blend.
x_tr_y =  0.*blend_train[:, 0, :] +0.2 * blend_train[:, 1, :] +0.8 * blend_train[:, 2, :]

In [24]:
# NOTE(review): `weight` is not referenced by any other cell visible in this
# export; the blends spell their weights out literally -- confirm it is unused.
weight = 0.2

In [22]:
#x_tr_y = blend_train

# Per-class threshold search: for every class column, try each candidate cut
# and keep the first one that maximises the macro F1 against the ground truth.
threshold = np.arange(0.1,0.9,0.1)

out = x_tr_y
y_test = bin_label#y_tr

# Compare in float64, which is what an element-by-element comparison of the
# scores with the float64 cuts does.
scores64 = np.asarray(out, dtype=np.float64)

accuracies = []
best_threshold = np.zeros(out.shape[1])
for col in range(out.shape[1]):
    col_scores = np.array([
        f1_score(y_test[:, col], (scores64[:, col] >= cut).astype(int), average='macro')
        for cut in threshold
    ])
    best = int(np.argmax(col_scores))  # first occurrence of the maximum
    accuracies.append(col_scores[best])
    best_threshold[col] = threshold[best]

print("best_threshold: ",best_threshold)

# Binarise every class column with its own selected threshold.
y_pred = (scores64 >= best_threshold).astype(int)

y_pred 

y_test

#best_threshold:  [0.7 0.4 0.5 0.4 0.3 0.2 0.3 0.4 0.4]
#0.022393162393162393

#best_threshold:  [0.7 0.4 0.5 0.4 0.4 0.2 0.4 0.4 0.5]
#0.022615384615384617

#hamming_loss(y_test,y_pred)

best_threshold:  [0.7 0.4 0.5 0.7 0.3 0.3 0.2 0.1 0.4]


array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [23]:
f1_score(y_test,y_pred,average='macro')

0.8970381260536389

In [63]:
''' for i in range(y_pred.shape[0]):
    if (y_pred[i][0] == 1) & (np.argmax(blend_train[i])==0) &((y_pred[i][7] != 1) ):
        y_pred[i] = [1,0,0,0,0,0,0,0,0]
    elif (y_pred[i][0] == 1) & (np.argmax(blend_train[i])==0) &((y_pred[i][7] == 1) ):
        y_pred[i] = [0,0,0,0,0,0,0,1,0]
f1_score(y_test,y_pred,average='macro')
'''
pass

In [24]:
# Macro F1 of every class column, printed one per line and accumulated so the
# mean can be taken afterwards.
sum_f1 = 0
for i in range(9):
    class_f1 = f1_score(y_test[:,i],y_pred[:,i],average='macro')
    sum_f1 += class_f1
    print("f1 score of ab {} is {}".format(i, class_f1))

f1 score of ab 0 is 0.9501539138876158
f1 score of ab 1 is 0.987301922195947
f1 score of ab 2 is 0.9439078350142455
f1 score of ab 3 is 0.9934088334120804
f1 score of ab 4 is 0.894263811789585
f1 score of ab 5 is 0.9620662242557618
f1 score of ab 6 is 0.9197889234153733
f1 score of ab 7 is 0.889570764856806
f1 score of ab 8 is 0.9334864126711857


In [11]:
sum_f1/9

0.9410313587270029

In [12]:
# Alternative blend of the test predictions (fold 0: 0.3, fold 1: 0.7).
# NOTE(review): `b` is not read by the submission cell visible in this export,
# which uses `test_yy` -- confirm whether this is leftover experimentation.
b = 0.3*blend_test[:,0,:]+0.7*blend_test[:,1,:]

In [10]:
blend_test[:,1,:]

array([[9.5819497e-01, 0.0000000e+00, 5.1856041e-06, ..., 1.7875433e-04,
        3.7002474e-02, 2.1824539e-03],
       [1.1602640e-03, 3.2782555e-07, 1.7881393e-07, ..., 1.1679530e-03,
        5.6624413e-07, 3.1960011e-04],
       [5.0628185e-04, 9.2651951e-01, 9.1991425e-03, ..., 9.7807944e-03,
        2.9823184e-04, 9.2111492e-01],
       ...,
       [1.2218952e-06, 9.5695257e-05, 6.9725513e-04, ..., 4.2690039e-03,
        7.3015690e-06, 1.7085671e-04],
       [3.8334012e-02, 1.2814999e-06, 6.7942220e-01, ..., 1.4413595e-03,
        3.7550926e-06, 6.1189079e-01],
       [9.9801010e-01, 8.9406967e-07, 2.3245811e-06, ..., 1.1432171e-04,
        4.4703484e-07, 5.9485435e-05]], dtype=float32)

In [25]:
# Blend the test predictions with the same per-fold weights as `x_tr_y`
# (fold 0: 0.0, fold 1: 0.2, fold 2: 0.8). `best_threshold` was tuned on that
# blend, so it is only meaningful if the test scores are combined identically.
# The previous expression used folds 0, 0 and 1, i.e. it ignored fold 2.
test_yy = 0.*blend_test[:,0,:]+0.2*blend_test[:,1,:]+0.8*blend_test[:,2,:] # blend_test.mean(axis=1)

In [26]:
import csv
classes = [0,1,2,3,4,5,6,7,8]

test_y = test_yy

# Binarise the blended test scores with the per-class thresholds.
y_pred = [[1 if test_y[i,j]>=best_threshold[j] else 0 for j in range(test_y.shape[1])]
          for i in range(len(test_y))]

# Disabled post-processing trick, kept for reference:
# for i in range(len(y_pred)):
#     if (y_pred[i][0] == 1) & (np.argmax(test_yy[i])==0):
#         y_pred[i] = [1,0,0,0,0,0,0,0,0]

# Class ids predicted for every record, in ascending class order.
pred = [[classes[k] for k in range(9) if flags[k] == 1] for flags in y_pred]

# newline='' is what the csv module expects for files it writes to; without it
# some platforms emit an extra blank line after every row.
with open('answers_attention_0610.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['File_name', 'label1', 'label2',
                    'label3', 'label4', 'label5', 'label6', 'label7', 'label8'])
    # Prediction rows follow the sorted listing of the Val directory, one row
    # per directory entry, so each file is looked up by its position in
    # `val_files` rather than by a counter of the .mat files seen so far.
    for row_no, file_name in enumerate(val_files):
        if not file_name.endswith('.mat'):
            continue

        # Remove the extension only. str.strip('.mat') strips the characters
        # '.', 'm', 'a', 't' from both ends and can eat part of the name.
        record_name = file_name[:-len('.mat')]

        result = pred[row_no]

        # Pad with empty cells up to the eight label columns.
        answer = [record_name] + result + [''] * (8 - len(result))
        writer.writerow(answer)