In [1]:
import os
import numpy as np
import pandas as pd
from train import *
from preprocess import *
from utils import *

import tensorflow as tf
from keras.optimizers import RMSprop

## Data Preprocessing

In [None]:
# Run feature extraction for every recording phase of every patient in one
# split/label sub-folder of the raw data directory.
data_path = '../train_data/'   # change data path accordingly
features_path = '../feature/'
# Patient folders are the directory entries whose name starts with a letter
# (this skips hidden files and similar entries).
patients = [user + '/' for user in sorted(os.listdir(data_path)) if user[0].isalpha()]
splits = ['train/', 'val/', 'test/']
labels = ['relapse/', 'non-relapse/', '']
file_name = '/data.csv'

# Sub-folder processed by this run: the last split with the empty label.
# NOTE(review): presumably the test split has no relapse/non-relapse
# sub-folders, hence the empty label -- confirm against the data layout.
split = splits[-1]
label = labels[-1]

for patient in patients:
    phase_dir = data_path + patient + split + label
    # Every phase of every patient is processed. The previous version sliced
    # the listing with a counter that kept growing across patients, so each
    # patient after the first skipped as many phases as had already been
    # processed in total.
    for phase in sorted(os.listdir(phase_dir)):
        # '5Min' is presumably the resampling frequency -- confirm against
        # extract_user_features.
        extract_user_features(data_path, patient, split, label, phase, file_name, '5Min')

## Autoencoder Training

In [1]:
# Train one convolutional autoencoder (CAE) per input file and save it.
train_mode = 'sleep'  # specify CAE training data: sleep, awake, or both
directory = '../input/'
# NOTE: this changes the kernel's working directory. Every relative path used
# below (including the model save path) is resolved from `directory`, and the
# change persists for any cell executed afterwards.
os.chdir(directory)
# Sorted so the files are processed in a deterministic order.
input_list = sorted(f for f in os.listdir() if 'v2' in f)
model_name = f'latent_k11_{train_mode}_'  # latent_k{kernel_size}_{status of sleep}
norm_cols = ['lin_acc_norm', 'ang_acc_norm',
             'heartRate_mean', 'heartRate_max', 'heartRate_min',
             'rRInterval_mean', 'rRInterval_rmssd',
             'rRInterval_sdnn', 'rRInterval_sd1', 'rRInterval_sd2',
             'rRInterval_lombscargle_power_high', 'rRInterval_lombscargle_power_low']
window = 48  # window size handed to both load_data and Trainer

# NOTE(review): no random seed is set in this cell, so results may differ
# between runs unless Trainer/load_data seed internally -- confirm.
for file_name in input_list:
    # NOTE(review): X_val_n / X_val_r are presumably the non-relapse and
    # relapse validation windows -- confirm against load_data.
    X_train, X_val_n, X_val_r = load_data(file_name, norm_cols, window, train_mode)

    autoencoder = Trainer(X_train, window)
    opt = RMSprop(learning_rate=1e-4)
    autoencoder.compile(optimizer=opt, loss=tf.losses.MeanSquaredError())

    # The model is fitted on the training and both validation sets together.
    # No validation_data is passed, so early stopping monitors training loss.
    X = np.concatenate([X_train, X_val_n, X_val_r])
    callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)
    history = autoencoder.fit(X, X,
                              epochs=1000,
                              batch_size=1,
                              shuffle=True,
                              callbacks=[callback],
                              )  # author's note -- best: 1000, batch = 4, rmsprop validation_data=[X_val_1, X_val_1]

    # file_name[-6:-4] takes the two characters right before the 4-character
    # extension and uses them as the suffix of the saved model's file name.
    autoencoder.save(f"../save_model/{model_name}{file_name[-6:-4]}.keras")

## Clustering and Evaluation

In [None]:
# Evaluate the saved autoencoder of every user and collect the metrics
# returned by generate_evaluation_result into a single CSV file.
data_dir = '../input/'
model_dir = '../save_model/'
test_dir = '../test/'
train_mode, eval_mode = 'sleep', 'both' # e.g., "sleep","sleep"; "awake","both"

# Output file. Defined before the loop so it exists even when no input file is
# found (it used to be assigned only inside the loop body).
save_dir = f'./result_6415_concat_k11_{train_mode}_{eval_mode}.csv'

# User ids are taken from the input files that are actually present, instead
# of counting directory entries. The previous code passed the directory
# listing itself to range(), which raises TypeError.
input_prefix, input_suffix = 'input_v2_user', '.csv'
user_ids = sorted(
    f[len(input_prefix):-len(input_suffix)]
    for f in os.listdir(data_dir)
    if f.startswith(input_prefix) and f.endswith(input_suffix)
)

auprc_list, auroc_list, auprc_base_list, f1_list, pred_list, true_list = [], [], [], [], [], []
for user_id in user_ids:
    # Same naming scheme as before ('0{i}' for a single-digit i), but it also
    # works for ids that are not a single digit.
    data_file = data_dir + f'{input_prefix}{user_id}{input_suffix}'
    model_path = model_dir + f'latent_k11_{train_mode}_{user_id}.keras'
    test_file = test_dir + f'test_v2_user{user_id}.csv'
    auprc, auroc, auprc_base, f1, y_pred, y_true = generate_evaluation_result(
        data_dir, model_dir, test_dir, data_file, model_path, test_file, train_mode, eval_mode)
    auprc_list.append(auprc)
    auroc_list.append(auroc)
    auprc_base_list.append(auprc_base)
    f1_list.append(f1)
    pred_list.append(y_pred)
    true_list.append(y_true)

# One row per user, in the order of user_ids.
result_df = pd.DataFrame(data={'auprc': auprc_list, 'auroc': auroc_list, 'auprc_base': auprc_base_list,
                               'f1': f1_list, 'pred': pred_list, 'y_true': true_list})
result_df.to_csv(save_dir)