In [1]:
import os
import shutil
import gc
import time
import random as rn
import numpy as np
import pandas as pd
import warnings
import csv

import scipy.io as sio
from scipy import signal
from tqdm import tqdm
import matplotlib.pyplot as plt

from scipy import sparse
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold, StratifiedKFold


''' '''
# from resnet_ecg.utils import one_hot,get_batches
from resnet_ecg.ecg_preprocess import ecg_preprocessing
from resnet_ecg import attentionmodel
from keras.utils import to_categorical
from keras.optimizers import SGD, Adam
from keras.callbacks import ModelCheckpoint, LearningRateScheduler, EarlyStopping, ReduceLROnPlateau
import tensorflow as tf
import keras.backend.tensorflow_backend as KTF
import keras.backend as K
from keras.layers import Input
from keras.models import Model, load_model
import keras
import pywt


# Silence every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

# Disabled TF session configuration, kept as a bare string literal so it is never executed:
# single-threaded ops and a cap of 80% of GPU memory per process.
'''
config = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
config.gpu_options.per_process_gpu_memory_fraction = 0.8
session = tf.Session(config=config)
KTF.set_session(session)
'''
# Seed each random number generator used by the notebook (NumPy, `random`, TensorFlow).
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so assigning it here
# probably only affects child processes - confirm.
os.environ['PYTHONHASHSEED'] = '0'
np.random.seed(42)
rn.seed(12345)
tf.set_random_seed(1234)


# Root directory of the data set; later cells append "/Train/", "/Val/" and "REFERENCE.csv" to it.
path = '/media/jdcloud/'



Using TensorFlow backend.


In [2]:
class Config(object):
    """Plain container for the network and training hyper-parameters."""

    def __init__(self):
        # Convolution stack
        self.conv_subsample_lengths = [1, 2, 1, 2, 1, 2, 1, 2]
        self.conv_filter_length = 32
        self.conv_num_filters_start = 12
        self.conv_init = "he_normal"
        self.conv_activation = "relu"
        self.conv_dropout = 0.5
        self.conv_num_skip = 2
        self.conv_increase_channels_at = 2
        # Batching and tensor shapes
        self.batch_size = 32  # 128
        self.input_shape = [2560, 12]  # [1280, 1]
        self.num_categories = 2

    @staticmethod
    def lr_schedule(epoch):
        """Return the step-wise learning rate for ``epoch``.

        0.1 for epochs 0-9, 0.01 for epochs 10-19 and 0.001 from epoch 20 on.
        """
        if epoch >= 20:
            return 0.001
        if epoch >= 10:
            return 0.01
        return 0.1

def wavelet(ecg, wavefunc, lv, m, n):
    """Denoise a 1-D signal by soft-thresholding a range of wavelet coefficient bands.

    param ecg: 1-D array-like signal
    param wavefunc: wavelet name passed to ``pywt.wavedec`` / ``pywt.waverec`` (e.g. 'db4')
    param lv: decomposition level
    param m, n: inclusive range of indices into the coefficient list returned by
        ``pywt.wavedec`` whose bands are thresholded; other bands are left untouched
    return: the signal reconstructed from the thresholded coefficients
    """
    coeff = pywt.wavedec(ecg, wavefunc, mode='sym', level=lv)

    for i in range(m, n + 1):
        cD = np.asarray(coeff[i])
        # The threshold sqrt(2 * ln(N)) depends only on the band length, so compute it once
        # per band. A Python float keeps the band's dtype unchanged in the arithmetic below.
        Tr = float(np.sqrt(2 * np.log(len(cD))))
        # Soft threshold: shrink every coefficient towards zero by Tr and zero the ones whose
        # magnitude is below Tr. The previous element-wise loop stored sign(x) - Tr instead of
        # sign(x) * (|x| - Tr) and discarded every negative coefficient.
        coeff[i] = np.sign(cD) * np.maximum(np.abs(cD) - Tr, 0)

    denoised_ecg = pywt.waverec(coeff, wavefunc)
    return denoised_ecg


def wavelet_db6(sig):
    """
    Band-limit a signal with a level-9 'db6' wavelet decomposition.

    The first entry of the coefficient list and its last two entries are replaced by
    zeros before the signal is reconstructed.

    R J, Acharya U R, Min L C. ECG beat classification using PCA, LDA, ICA and discrete
     wavelet transform[J].Biomedical Signal Processing and Control, 2013, 8(5): 437-448.
    param sig: 1-D numpy Array
    return: 1-D numpy Array
    """
    bands = pywt.wavedec(sig, 'db6', level=9)
    for idx in (-1, -2, 0):
        bands[idx] = np.zeros(len(bands[idx]))
    return pywt.waverec(bands, 'db6')


def precision(y_true, y_pred):
    """Precision over the whole batch: rounded true positives / rounded predicted positives."""
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    # K.epsilon() keeps the ratio finite when nothing is predicted positive.
    return K.sum(hits) / (K.sum(flagged) + K.epsilon())


def recall(y_true, y_pred):
    """Recall over the whole batch: rounded true positives / rounded actual positives."""
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    # K.epsilon() keeps the ratio finite when the batch has no positive labels.
    return K.sum(hits) / (K.sum(actual) + K.epsilon())


def fbeta_score(y_true, y_pred, beta=1):
    """F-beta score: the weighted harmonic mean of ``precision`` and ``recall``.

    param y_true: ground-truth tensor
    param y_pred: prediction tensor
    param beta: non-negative weight of recall relative to precision (0 means precision only)
    return: the F-beta value, computed with backend ops
    raises ValueError: if ``beta`` is negative
    """
    if beta < 0:
        raise ValueError('The lowest choosable beta is zero (only precision).')

    # No Python-level "no positive labels" shortcut: a backend tensor cannot be branched on
    # with a plain `if`, and the case needs no special handling anyway. With no true
    # positives both p and r are 0, and the K.epsilon() in the denominator makes the score 0.
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    bb = beta ** 2
    return (1 + bb) * (p * r) / (bb * p + r + K.epsilon())


def fmeasure(y_true, y_pred):
    """F1 score, i.e. ``fbeta_score`` with ``beta`` fixed to 1."""
    f1 = fbeta_score(y_true, y_pred, beta=1)
    return f1


def read_data_seg(data_path, split="Train", preprocess=False, fs=500, newFs=256, winSecond=10, winNum=10, n_index=0,pre_type="sym"):
    """Read one fixed-length, resampled window per record from a directory of ``.mat`` files.

    Every file in ``data_path/split`` (sorted by name) is loaded with ``scipy.io.loadmat``.
    From each of the 12 leads a window of ``winSecond * fs`` samples is cut, starting at
    ``n_index * slide_steps``, and resampled to ``winSecond * newFs`` samples.

    param data_path: root directory of the data set
    param split: sub-directory of ``data_path`` to read
    param preprocess: apply the denoiser selected by ``pre_type`` to every lead
    param fs: sampling rate of the stored recordings
    param newFs: sampling rate after resampling
    param winSecond: window length in seconds
    param winNum: recordings longer than ``fs * winNum * winSecond`` samples are treated as
        having exactly that length
    param n_index: index of the sliding window to extract
    param pre_type: "sym", "db4" or "db6"; any other value leaves the resampled signal as is
    return: float32 array of shape (n_files, winSecond * newFs, 12). Rows of recordings
        shorter than 4500 samples stay all-zero.
    """
    min_length = 4500  # recordings with fewer samples are skipped
    winSize = winSecond * fs
    new_winSize = winSecond * newFs
    path_signals = os.path.join(data_path, split)

    channel_files = os.listdir(path_signals)
    channel_files.sort()

    # Order of the leads along the last axis of the returned array.
    channel_name = ['V6', 'aVF', 'I', 'V4', 'V2', 'aVL', 'V1', 'II', 'aVR', 'V3', 'III', 'V5']
    n_channels = len(channel_name)

    X = np.zeros((len(channel_files), new_winSize, n_channels)).astype('float32')

    if not preprocess:
        # Reported once per call instead of once per lead of every record.
        print(" no preprocess !!! ")

    for i_ch, fil_ch in enumerate(channel_files):

        if i_ch % 1000 == 0:
            print(i_ch)

        ecg = sio.loadmat(os.path.join(path_signals, fil_ch))
        ecg_length = ecg["I"].shape[1]

        if ecg_length > fs * winNum * winSecond:
            print(" too long !!!", ecg_length)
            ecg_length = fs * winNum * winSecond
        if ecg_length < min_length:
            print(" too short !!!", ecg_length)
            # Skip only this record. A `break` here would leave every later record zeroed.
            continue

        # Recordings shorter than one window cannot slide. Clamping at 0 keeps the window
        # start non-negative for every n_index.
        # NOTE(review): the divisor is winSecond, which equals winNum at the defaults;
        # confirm which of the two is intended.
        slide_steps = max(0, int((ecg_length - winSize) / winSecond))
        start = n_index * slide_steps

        ecg_channels = np.zeros((new_winSize, n_channels)).astype('float32')

        for i_n, ch_name in enumerate(channel_name):

            window = ecg[ch_name][:, start:start + winSize]
            ecg_channels[:, i_n] = signal.resample(window.T, new_winSize)[:, 0]

            if preprocess:
                if pre_type == "sym":
                    ecg_channels[:, i_n] = ecg_preprocessing(ecg_channels[:, i_n].reshape(1, new_winSize), 'sym8', 8, 3,
                                                             newFs, removebaseline=False, normalize=False)[0]
                elif pre_type == "db4":
                    ecg_channels[:, i_n] = wavelet(ecg_channels[:, i_n], 'db4', 4, 2, 4)
                elif pre_type == "db6":
                    ecg_channels[:, i_n] = wavelet_db6(ecg_channels[:, i_n])

        X[i_ch, :, :] = ecg_channels

    return X


def preprocess_y(labels, y, num_class=10):
    """Turn the label columns of selected rows into a multi-hot matrix.

    param labels: DataFrame whose first column is the record name and whose remaining
        columns hold class ids, with NaN in unused label slots
    param y: sequence of index labels of ``labels`` (looked up with ``.loc``)
    param num_class: width of the output matrix
    return: int8 array of shape (len(y), num_class) with a 1 for every class id of a row
    """
    bin_label = np.zeros((len(y), num_class)).astype('int8')
    for i in range(len(y)):
        label_nona = labels.loc[y[i]].dropna()
        # Skip the record name by position. `.iloc` is used because integer `[]` lookups on
        # a string-labelled Series rely on a deprecated positional fallback.
        for class_id in label_nona.iloc[1:]:
            bin_label[i, int(class_id)] = 1
    return bin_label


class DataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` that serves batches of pre-computed tensors stored under ``training_data/``.

    Each sample ID names a ``training_data/<ID>.npy`` file. Its labels are looked up in
    ``labels`` through the part of the ID before the first underscore.
    """

    def __init__(self, list_IDs, labels, batch_size=32, dim=(32,32,32), n_channels=1,
                 n_classes=10, shuffle=True):
        """Store the configuration and build the first epoch's sample order."""
        self.list_IDs = list_IDs
        self.labels = labels
        self.batch_size = batch_size
        self.dim = dim
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        """Number of full batches per epoch; a trailing partial batch is dropped."""
        return len(self.list_IDs) // self.batch_size

    def __getitem__(self, index):
        """Return batch number ``index`` as ``(list of input arrays, label matrix)``."""
        first = index * self.batch_size
        batch_positions = self.indexes[first:first + self.batch_size]
        batch_ids = [self.list_IDs[pos] for pos in batch_positions]
        return self.__data_generation(batch_ids)

    def on_epoch_end(self):
        """Rebuild the sample order, shuffling it when ``shuffle`` is enabled."""
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        """Load the tensors and labels for the given sample IDs.

        Returns a list holding the first 10 slices along axis 1 of the batch tensor
        (one array per network input) and the (batch_size, n_classes) label matrix.
        """
        X = np.empty((self.batch_size,  *self.dim, self.n_channels))
        y = np.empty((self.batch_size, self.n_classes), dtype=int)

        for row, sample_id in enumerate(list_IDs_temp):
            X[row,] = np.load("training_data/" + sample_id+".npy")
            # The label row is found via the record name, i.e. the ID without its suffix.
            record_name = sample_id.split("_")[0]
            matching_rows = self.labels[self.labels["File_name"] == record_name].index
            y[row,:] = preprocess_y(self.labels, matching_rows)

        X_list = [X[:, branch] for branch in range(10)]
        return X_list, y

def add_compile(model, config):
    """Compile ``model`` in place for multi-label training.

    Uses binary cross-entropy, the 'adam' optimizer and the metrics accuracy,
    ``fmeasure`` and ``precision``.

    param model: Keras model to compile
    param config: unused; kept so existing callers keep working. An SGD optimizer used to
        be built from ``config.lr_schedule(0)`` here but was never passed to ``compile``.
    """
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy', fmeasure, precision])

In [4]:
# ---- Data locations and label tables -------------------------------------------------
train_dataset_path = path + "/Train/"
val_dataset_path = path + "/Val/"

train_files = os.listdir(train_dataset_path)
train_files.sort()
val_files = os.listdir(val_dataset_path)
val_files.sort()

labels = pd.read_csv(path + "REFERENCE.csv")
labels_en = pd.read_csv(path + "kfold_labels_en.csv")
#data_info = pd.read_csv(path + "data_info.csv")

# ---- Model: 10 parallel inputs of shape (2560, 12) feeding the attention network ------
input_size = (2560, 12)
net_num = 10
inputs_list = [Input(shape=input_size) for _ in range(net_num)]
outputs = attentionmodel.build_network(inputs_list, 0.5, num_classes=10, block_size=4, relu=False)
model = Model(inputs=inputs_list, outputs=outputs)
# print(model.summary())

# Snapshot of the freshly initialised weights. It is restored at the start of every fold so
# that no fold starts from weights already trained on its own validation records.
initial_weights = model.get_weights()

# ID lists and label vectors derived from labels_en; not used by the loop below.
raw_IDs = labels_en["File_name"].values.tolist()
extend_db4_IDs = [i + "_db4" for i in raw_IDs]
extend_db6_IDs = [i + "_db6" for i in raw_IDs]
extend_ori_IDs = [i + "_ori" for i in raw_IDs]
all_IDs = raw_IDs + extend_db4_IDs + extend_db6_IDs+extend_ori_IDs

train_labels = labels_en["label1"].values
all_train_labels = np.hstack((train_labels, train_labels, train_labels))

# Parameters
params = {'dim': (10, 2560),
          'batch_size': 64,
          'n_classes': 10,
          'n_channels': 12,
          'shuffle': True}

en_amount = 1
model_path = './official_attention_model/'

# (label1 value, number of extra copies) used to oversample records inside each training fold.
oversample_plan = ((4, 4), (7, 2), (9, 1))
# Suffixes of the stored variants of every record ("" is the un-suffixed file).
id_suffixes = ("", "_db4", "_db6", "_ori")

for seed in range(en_amount):
    print("************************")
    n_fold = 3
    n_classes = 10

    kfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=1234)
    kf = kfold.split(labels["File_name"].values.tolist(), labels["label1"].values)

    for i, (index_train, index_valid) in enumerate(kf):
        print('fold: ', i + 1, ' training')

        tr_names = labels["File_name"].values[index_train].tolist()
        val_names = labels["File_name"].values[index_valid].tolist()

        # Oversample: append extra copies of the fold's records for the listed classes.
        # A set makes the membership test O(1) instead of scanning the growing list.
        fold_members = set(tr_names)
        for class_id, extra_copies in oversample_plan:
            class_names = labels[labels.label1 == class_id]["File_name"]
            in_fold = [name for name in class_names if name in fold_members]
            tr_names.extend(in_fold * extra_copies)

        # Every record is available in four stored variants; use all of them.
        tr_IDs = [name + suffix for suffix in id_suffixes for name in tr_names]
        val_IDs = [name + suffix for suffix in id_suffixes for name in val_names]
        print("tr_IDs : ",len(tr_IDs))
        print("val_IDs : ",len(val_IDs))

        # Generators
        training_generator = DataGenerator(tr_IDs, labels, **params)
        validation_generator = DataGenerator(val_IDs, labels, **params)

        checkpointer = ModelCheckpoint(filepath=model_path + 'attention_extend_weights-best_k{}_r{}_0809_30.hdf5'.format(seed, i),
                                       monitor='val_fmeasure', verbose=1, save_best_only=True,
                                       save_weights_only=True,
                                       mode='max')  # val_fmeasure
        reduce = ReduceLROnPlateau(monitor='val_fmeasure', factor=0.5, patience=2, verbose=1, min_delta=1e-5,
                                   mode='max')

        # Built but not added to the callback list, so training always runs all epochs.
        earlystop = EarlyStopping(monitor='val_fmeasure', patience=10)

        # Start every fold from the same untrained weights, then recompile (fresh optimizer).
        model.set_weights(initial_weights)
        config = Config()
        add_compile(model, config)

        callback_lists = [checkpointer, reduce]

        history = model.fit_generator(generator=training_generator,
                                      validation_data=validation_generator,
                                      use_multiprocessing=False,
                                      epochs=30, # 50
                                      verbose=1,
                                      callbacks=callback_lists)

************************
fold:  1  training
tr_IDs :  21048
val_IDs :  8932
Epoch 1/30

Epoch 00001: val_fmeasure improved from -inf to 0.31972, saving model to ./official_attention_model/attention_extend_weights-best_k0_r0_0809_30.hdf5
Epoch 2/30

Epoch 00002: val_fmeasure improved from 0.31972 to 0.69353, saving model to ./official_attention_model/attention_extend_weights-best_k0_r0_0809_30.hdf5
Epoch 3/30

Epoch 00003: val_fmeasure did not improve from 0.69353
Epoch 4/30

Epoch 00004: val_fmeasure improved from 0.69353 to 0.71919, saving model to ./official_attention_model/attention_extend_weights-best_k0_r0_0809_30.hdf5
Epoch 5/30

Epoch 00005: val_fmeasure improved from 0.71919 to 0.73474, saving model to ./official_attention_model/attention_extend_weights-best_k0_r0_0809_30.hdf5
Epoch 6/30

Epoch 00006: val_fmeasure improved from 0.73474 to 0.77051, saving model to ./official_attention_model/attention_extend_weights-best_k0_r0_0809_30.hdf5
Epoch 7/30

Epoch 00007: val_fmeasure im

In [9]:
def predcit_net_kfolds(pre_type = "sym"):
    """Score train and validation data with every fold checkpoint, stack them, and write the results.

    Steps:
      1. Build the 10-input attention network and read 10 windows per validation record
         with ``read_data_seg``.
      2. For each of the 3 fold checkpoints, predict on ALL training tensors
         (``training_data/<ID>.npy``) and on the validation windows.
      3. Fit a one-vs-rest logistic regression on the concatenated fold predictions of the
         training set, print its training F1 scores, and predict the validation labels.
      4. Write the validation answers to ``jupyter_answers_attention_<pre_type>_0809.csv``
         and the raw per-fold predictions to ``./ensemble_csv/`` and ``./test_csv/``.

    param pre_type: feature variant: "sym" (un-suffixed IDs), "db4", "db6" or "ori"
    return: None

    NOTE(review): the answer rows are matched to predictions by counting the ``.mat`` files
    in the validation directory, while ``read_data_seg`` produces one row per directory
    entry. This assumes the directory contains only ``.mat`` files - confirm.
    """
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.linear_model import LogisticRegression

    path = "/media/jdcloud/"
    labels = pd.read_csv(path + "REFERENCE.csv")
    raw_IDs = labels["File_name"].values.tolist()

    IDs = {}
    IDs["sym"] = raw_IDs
    IDs["db4"] = [i + "_db4" for i in raw_IDs]
    IDs["db6"] = [i + "_db6" for i in raw_IDs]
    IDs["ori"] = [i + "_ori" for i in raw_IDs]

    input_size = (2560, 12)
    net_num = 10
    inputs_list = [Input(shape=input_size) for _ in range(net_num)]
    outputs = attentionmodel.build_network(inputs_list, 0.5, num_classes=10, block_size=4, relu=False)
    model = Model(inputs=inputs_list, outputs=outputs)

    test_x = [read_data_seg(path, split='Val', preprocess=True, n_index=i, pre_type=pre_type) for i in range(net_num)]

    model_path = './official_attention_model/'
    n_fold = 3
    n_classes = 10
    # Sizes are taken from the data instead of being hard-coded.
    n_train = len(IDs[pre_type])
    n_test = test_x[0].shape[0]

    # The training tensors are the same for every fold, so load them once.
    X = np.empty((n_train, net_num, 2560, 12))
    for j, ID in enumerate(IDs[pre_type]):
        X[j, ] = np.load("training_data/" + ID + ".npy")
    X_tr = [X[:, k] for k in range(net_num)]
    del X

    en_amount = 1
    for seed in range(en_amount):
        print("************************")

        blend_train = np.zeros((n_train, n_fold, n_classes)).astype('float32')
        blend_test = np.zeros((n_test, n_fold, n_classes)).astype('float32')

        # The fold index only selects the checkpoint; each model scores the full training set.
        for i in range(n_fold):
            print('fold: ', i + 1, ' training')
            print((n_train,))

            # Evaluate best trained model
            model.load_weights(model_path + 'attention_extend_weights-best_k{}_r{}_0809_30.hdf5'.format(seed, i))

            blend_train[:, i, :] = model.predict(X_tr)
            blend_test[:, i, :] = model.predict(test_x)
            gc.collect()

    del X_tr
    gc.collect()

    # ---- Stacking: logistic regression on the concatenated fold predictions --------------
    LR = LogisticRegression(penalty="l2",C=1.0)

    y_train = preprocess_y(labels, np.arange(n_train))
    x_train = np.hstack([blend_train[:, k, :] for k in range(n_fold)])

    clf = OneVsRestClassifier(LR)
    clf.fit(x_train,y_train)

    y_pred = clf.predict(x_train)

    print(" train data f1_score  :", f1_score(y_train, y_pred, average='macro'))
    for i in range(10):
        print("f1 score of ab {} is {}".format(i, f1_score(y_train[:, i], y_pred[:, i], average='macro')))

    # ---- Validation predictions ------------------------------------------------------------
    out = np.hstack([blend_test[:, k, :] for k in range(n_fold)])
    y_pred = clf.predict(out)

    # Per record: the list of class ids whose indicator is 1.
    pred = [[class_id for class_id in range(10) if row[class_id] == 1] for row in y_pred]

    val_dataset_path = path + "/Val/"
    val_files = os.listdir(val_dataset_path)
    val_files.sort()

    # newline='' is what the csv module expects for files it writes.
    with open('jupyter_answers_attention_{}_0809.csv'.format(pre_type), 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['File_name', 'label1', 'label2',
                         'label3', 'label4', 'label5', 'label6', 'label7', 'label8', 'label9', 'label10'])
        count = 0
        for file_name in val_files:
            if file_name.endswith('.mat'):
                # splitext removes only the extension. str.strip('.mat') would also eat any
                # leading or trailing '.', 'm', 'a' and 't' characters of the record name.
                record_name = os.path.splitext(file_name)[0]
                result = pred[count]
                # Pad with empty cells up to the 10 label columns.
                writer.writerow([record_name] + result + [''] * (10 - len(result)))
                count += 1

    # ---- Persist the raw per-fold predictions for later ensembling -------------------------
    csv_path = "./ensemble_csv/"
    for k in range(n_fold):
        pd.DataFrame(blend_train[:, k, :]).to_csv(
            csv_path + "attention_10net_{}_addori_fold{}.csv".format(pre_type, k), index=None)

    csv_path = "./test_csv/"
    for k in range(n_fold):
        pd.DataFrame(blend_test[:, k, :]).to_csv(
            csv_path + "attention_10net_{}_addori_fold{}.csv".format(pre_type, k), index=None)

In [8]:
# Run the k-fold stacking pipeline defined above on the "ori" feature variant.
# NOTE(review): "ori" matches none of the denoising branches in read_data_seg, so the
# validation windows are resampled only - confirm this is intended.
predcit_net_kfolds(pre_type = "ori")

0
0
0
0
0
0
0
0
0
0
************************
fold:  1  training
(6689,)
fold:  2  training
(6689,)
fold:  3  training
(6689,)
 train data f1_score  : 0.9364256418580391
f1 score of ab 0 is 0.9658348169213595
f1 score of ab 1 is 0.9898416333572271
f1 score of ab 2 is 0.9655428489767162
f1 score of ab 3 is 0.9825511727369634
f1 score of ab 4 is 0.9620437044981247
f1 score of ab 5 is 0.9677196412449129
f1 score of ab 6 is 0.9396350192316612
f1 score of ab 7 is 0.953039546753347
f1 score of ab 8 is 0.9482459357393056
f1 score of ab 9 is 0.9634379884906863


In [55]:
path = './'

# Per-fold network predictions on the training set, one array per CSV.
# Only folds 1 and 2 of the "ori" variant are active. Other sets tried (files named
# attention_10net_<variant>_addori_fold<k>.csv under ensemble_csv/):
#   sym folds 0-2  -> 3folds f0.817
#   db6 folds 0-2  -> 3folds f0.818
#   ori folds 0-2  -> 3folds f0.817
train = [
    pd.read_csv(path+"ensemble_csv/"+csv_name).values
    for csv_name in (
        "attention_10net_ori_addori_fold1.csv",
        "attention_10net_ori_addori_fold2.csv",
    )
]

In [56]:
# Per-fold network predictions on the validation set, one array per CSV.
# Only folds 1 and 2 of the "ori" variant are active. The "sym", "db6" and "ori" fold-0..2
# files under test_csv/ (attention_10net_<variant>_addori_fold<k>.csv) are the alternatives.
test = [
    pd.read_csv(path+"test_csv/"+csv_name).values
    for csv_name in (
        "attention_10net_ori_addori_fold1.csv",
        "attention_10net_ori_addori_fold2.csv",
    )
]

In [14]:
# Quick shape check of the multi-hot label matrix.
# NOTE(review): at notebook scope y_train is only assigned in cells further down, so this
# cell relies on out-of-order execution.
y_train.shape

(6689, 10)

In [57]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, ElasticNet, Lasso, RidgeCV
from sklearn.ensemble import GradientBoostingClassifier
from mlknn import MLkNN

# Candidate meta-learners. Only the MLkNN classifier assigned to `clf` below is fitted in
# this cell; the others are constructed but not used here.
gbm = GradientBoostingClassifier(learning_rate=0.005,n_estimators=100,max_depth=5,min_samples_leaf=20,
                                min_samples_split=600,subsample=0.7,random_state=2019)
LR = LogisticRegression(penalty="l2",C=1.0)
Eln = ElasticNet()
Las = Lasso(alpha=0.2)
LRR = RidgeCV()

# NOTE(review): pre_type is not used in this cell; a later cell uses it to name the answer
# CSV. The features loaded into `train` come from the "ori" files - confirm the name.
pre_type = "db6"#"db6"#"sym"
labels = pd.read_csv("/media/jdcloud/" + "REFERENCE.csv")

# Multi-hot targets for the first 6689 rows of the reference table.
# NOTE(review): 6689 is hard-coded and must equal the row count of the stacked features.
index = np.arange(6689)
y_train = preprocess_y(labels, index)

# Concatenate the per-fold prediction arrays column-wise into one feature matrix.
x_train = np.hstack(train[:])
# x_train = np.column_stack([x_train,train[-1]])
# x_train = np.column_stack([x_train,train[-2]])
#train_x, valid_x, train_y, valid_y = train_test_split(x, y, test_size=0.333, random_state=0)
#train = lgb.Dataset(x_train,label=y_train)
#valid = lgb.Dataset(valid_x, label=valid_y
#gbm = lgb.train(params,train,num_boost_round=1000,#valid_sets=valid,early_stopping_rounds=5)

# Multi-label k-nearest-neighbour meta-classifier with k=8 (from the local `mlknn` module).
clf = MLkNN(k=8)

#clf = OneVsRestClassifier(LR)
clf.fit(x_train,y_train)

MLkNN(ignore_first_neighbours=0, k=8, s=1.0)

In [18]:
from sklearn.metrics import f1_score,hamming_loss
from sklearn.metrics import precision_recall_fscore_support as prf
import warnings
warnings.filterwarnings("ignore")

In [58]:
# Training-set predictions of the fitted meta-classifier (sparse output -> dense array).
y_pred = clf.predict(x_train).toarray()
# Scores must come from predict_proba(); predict() would only repeat the hard 0/1 labels.
y_pred_proba_train = clf.predict_proba(x_train).toarray()

print(" train data f1_score  :", f1_score(y_train, y_pred, average='macro'))
for i in range(10):
    print("f1 score of ab {} is {}".format(i, f1_score(y_train[:, i], y_pred[:, i], average='macro')))

print(" train data hamming_loss  :", hamming_loss(y_train, y_pred))
print(" train data precision recall f1  :", prf(y_train, y_pred,average="samples"))# 'micro', 'weighted

 train data f1_score  : 0.9459682850306548
f1 score of ab 0 is 0.9698879144657897
f1 score of ab 1 is 0.9921295035618354
f1 score of ab 2 is 0.968707806818373
f1 score of ab 3 is 0.9864826677400564
f1 score of ab 4 is 0.9631518930859532
f1 score of ab 5 is 0.9762565047862113
f1 score of ab 6 is 0.9515934021319987
f1 score of ab 7 is 0.9552460119452224
f1 score of ab 8 is 0.9550018867888549
f1 score of ab 9 is 0.9737361997784897
 train data hamming_loss  : 0.01190013454925998
 train data precision recall f1  : (0.9521104300593013, 0.9487741067424128, 0.9467209847012509, None)


In [59]:
# Concatenate the per-fold validation predictions into the meta-classifier's feature matrix.
out = np.hstack(test[:])
#out = np.column_stack([out,test[-1]])
#out = np.column_stack([out,test[-2]])
# LR_clf = joblib.load("LR_ensemble.pkl")
# MLkNN_clf = joblib.load("MLkNN_ensemble.pkl")

# y_pred_LR = LR_clf.predict(out)
# y_pred_proba_LR = LR_clf.predict_proba(out)

# y_pred_MLkNN = MLkNN_clf.predict(out).toarray()
# y_pred_proba_MLkNN = MLkNN_clf.predict_proba(out).toarray()

# y_pred_MLkNN[:,7] = y_pred_LR[:,7]
# y_pred_proba_MLkNN[:,7] = y_pred_proba_LR[:,7]
# y_pred = y_pred_MLkNN
# y_pred_proba = y_pred_proba_MLkNN

y_pred = clf.predict(out).toarray()
y_pred_proba = clf.predict_proba(out).toarray()

classes = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

# Per record: the list of class ids whose indicator is 1.
pred = []
for j in range(y_pred.shape[0]):
    pred.append([classes[i] for i in range(10) if y_pred[j][i] == 1])

# Fallback for records without any predicted label: use the most probable class when its
# score reaches 0.3.
for i, val in enumerate(pred):
    if val == []:
        #for i_p, val_p in enumerate(y_pred_proba[i]):
        #    if val_p >= 0.4:
        #        pred[i].append(i_p)    # f1 == 0.832
        best_class = np.argmax(y_pred_proba[i])
        if y_pred_proba[i][best_class] >= 0.3:
            pred[i] = [best_class]     # f1 == 0.833  0.4

val_dataset_path = '/media/jdcloud' + "/Val/"
val_files = os.listdir(val_dataset_path)
val_files.sort()

# newline='' is what the csv module expects for files it writes. The `with` block closes
# the file, so no explicit close() is needed.
with open('jupyter_answers_densenet_{}_0809_ensemble.csv'.format(pre_type), 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['File_name', 'label1', 'label2',
                     'label3', 'label4', 'label5', 'label6', 'label7', 'label8', 'label9', 'label10'])
    count = 0
    for file_name in val_files:
        if file_name.endswith('.mat'):
            # splitext removes only the extension. str.strip('.mat') would also eat any
            # leading or trailing '.', 'm', 'a' and 't' characters of the record name.
            record_name = os.path.splitext(file_name)[0]
            result = pred[count]
            # Pad with empty cells up to the 10 label columns.
            writer.writerow([record_name] + list(result) + [''] * (10 - len(result)))
            count += 1

# predict quarter final data
## attention 10nets 
### model : attention_extend_weights-best_k{}_r{}_0809_30.hdf5

In [3]:
pre_type = "db6"#"db6"

#labels = pd.read_csv(path + "REFERENCE.csv")
labels = pd.read_csv("/media/uuser/data/final_codes/final_run_semi/reference.csv")
raw_IDs = labels["File_name"].values.tolist()

# Sample IDs per stored variant ("sym" uses the un-suffixed record names).
IDs = {}
IDs["sym"] = raw_IDs
IDs["db4"] = [i + "_db4" for i in raw_IDs]
IDs["db6"] = [i + "_db6" for i in raw_IDs]

# Size the buffer from the ID list. With a fixed row count, a shorter reference file would
# leave uninitialised np.empty rows in the model input and a longer one would overflow.
X = np.empty((len(IDs[pre_type]), 10, 2560, 12))
for i, ID in enumerate(IDs[pre_type]):
    #print(ID)
    X[i,] = np.load("/media/uuser/data/ysecgtest/training_data/" + ID + ".npy")
#train_x = [(X[:, i]-np.mean(X[:, i]))/np.std(X[:, i]) for i in range(10)]
# One array per network input: slice k along axis 1.
train_x = [X[:, k] for k in range(10)]

def preprocess_y(labels, y, num_class=10):
    """Turn the label columns of selected rows into a multi-hot matrix.

    param labels: DataFrame whose first column is the record name and whose remaining
        columns hold class ids, with NaN in unused label slots
    param y: sequence of index labels of ``labels`` (looked up with ``.loc``)
    param num_class: width of the output matrix
    return: int8 array of shape (len(y), num_class) with a 1 for every class id of a row
    """
    bin_label = np.zeros((len(y), num_class)).astype('int8')
    for i in range(len(y)):
        label_nona = labels.loc[y[i]].dropna()
        # Skip the record name by position. `.iloc` is used because integer `[]` lookups on
        # a string-labelled Series rely on a deprecated positional fallback.
        for class_id in label_nona.iloc[1:]:
            bin_label[i, int(class_id)] = 1
    return bin_label

In [4]:
from resnet_ecg import attentionmodel

#pre_type = "sym"

#labels = pd.read_csv(path + "REFERENCE.csv")
#labels = pd.read_csv("/media/uuser/data/final_run/reference.csv")
raw_IDs = labels["File_name"].values.tolist()

# Sample IDs per stored variant ("sym" uses the un-suffixed record names).
IDs = {}
IDs["sym"] = raw_IDs
IDs["db4"] = [i + "_db4" for i in raw_IDs]
IDs["db6"] = [i + "_db6" for i in raw_IDs]

# Row count taken from the reference table instead of a hard-coded 6500.
n_records = len(raw_IDs)
index = np.arange(n_records)
y_train = preprocess_y(labels, index)

# 10 parallel inputs of shape (2560, 12) feeding the attention network.
input_size = (2560, 12)
net_num = 10
inputs_list = [Input(shape=input_size) for _ in range(net_num)]
outputs = attentionmodel.build_network(inputs_list, 0.5, num_classes=10, block_size=4, relu=False)
model = Model(inputs=inputs_list, outputs=outputs)

# print(model.summary())
n_classes = 10
n_fold = 3
model_path = './official_attention_model/'
#'densenet_extend_weights-best_one_fold_0607.hdf5'
# blend_train[:, i, :] holds the predictions of fold checkpoint i for every record.
blend_train = np.zeros((n_records, n_fold, n_classes)).astype('float32')
en_amount = 1
for seed in range(en_amount):
    for i in range(n_fold):
        model_name = "attention_extend_weights-best_k{}_r{}_0809_30.hdf5".format(seed, i)
        model.load_weights(model_path + model_name)
        blend_train[:,i,:] = model.predict(train_x)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [5]:
# Dump the per-fold predictions, one CSV per fold checkpoint.
csv_path = "/media/uuser/data/final_codes/final_run_final/quarter_final/"
train_pd0, train_pd1, train_pd2 = (pd.DataFrame(blend_train[:, fold, :]) for fold in range(3))
for fold_frame, csv_name in (
    (train_pd0, "attention_10net_db6_addori_fold0.csv"),
    (train_pd1, "attention_10net_db6_addori_fold1.csv"),
    (train_pd2, "attention_10net_db6_addori_fold2.csv"),
):
    fold_frame.to_csv(csv_path+csv_name,index=None)