In [None]:
# first we define relevant directories
# This cell derives every path from the current working directory, so the
# notebook has to be started from a direct sub-folder of the project root.
import sys
import os
import pickle
# project directory: the parent of the current working directory
project_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
# src directory: put first on sys.path so modules under <project>/src are importable
# (presumably where `explore_hcupdata` and `utilities` live -- confirm)
src_dir = os.path.join(project_dir, 'src')
sys.path.insert(0, src_dir)
# dataset directory: <project>/dataset, from which the pickled inputs are read below
dataset_dir = os.path.join(project_dir, "dataset")
print("dataset_dir: ", dataset_dir)
print("project_dir: ", project_dir)

In [None]:
from explore_hcupdata import *
from utilities import create_directory, ReaderWriter

In [None]:
# read relevant data
# Every object below is loaded with ReaderWriter.read_data from a file in dataset_dir.
# CONT_COLS: handed to apply_normalization in generate_datasetfolds
# (presumably the names of the continuous feature columns -- confirm)
CONT_COLS = ReaderWriter.read_data(os.path.join(dataset_dir, 'continuous_features.pkl'))
# COL_FEATURES: the columns of fsample copied into the sequence tensor
COL_FEATURES = ReaderWriter.read_data(os.path.join(dataset_dir, 'col_features.pkl'))
# feat_label / code_feat: loaded here but not referenced by the cells shown in this notebook
feat_label = ReaderWriter.read_data(os.path.join(dataset_dir, 'feat_label.pkl'))
code_feat = ReaderWriter.read_data(os.path.join(dataset_dir, 'code_feat.pkl'))
# datasplit: input to get_datasubset / get_datafolds when the folds are built
datasplit = ReaderWriter.read_data(os.path.join(dataset_dir, 'datasplit.pkl'))
# traj_info: table indexed by 'nrd_visitlink' values; its 'seq_len' column and the
# readmission count columns are used below
traj_info = ReaderWriter.read_data(os.path.join(dataset_dir, 'traj_info.pkl'))
# fsample: table with an 'nrd_visitlink' column from which each split is selected
fsample = ReaderWriter.read_data(os.path.join(dataset_dir, 'fsample.pkl'))

### Neural network dataset preparation

In [None]:
# Fill value for the label and index-event tensors: every time step that is not
# overwritten with a patient's own rows keeps this value, marking it as padding.
PADDSYMB_INDX = 1000

class PatientDataTensorMemmap:
    """Handle to a patient dataset whose tensors are stored as raw ``.dat`` files.

    The instance itself only keeps lightweight metadata (id mapping, dataset
    type, padding symbol and the ``(dtype, shape)`` of every array), so it is
    cheap to pickle. The arrays are attached on demand by `read_fromdisk`.

    Attributes:
        idx_mapper: dict mapping a patient identifier to its row index.
        idx_mapper_inverse: dict mapping a row index back to the patient identifier.
        dsettype: label passed as ``dtype`` to the constructor (e.g. ``'train'``).
        paddsymbol: value used to mark padded positions.
        num_samples / input_dim: first and last dimension of the sequence tensor
            (set by `memmap_arrays`).
    """

    # File stem of each persisted array, in the order (X, Y, E, T) ...
    _ARRAY_NAMES = ('seq_tensor', 'label_tensor', 'indexevent_tensor', 'seqlen_tensor')
    # ... and the attribute that stores the matching (dtype, shape) tuple.
    _INFO_ATTRS = ('seqtensor_info', 'labeltensor_info',
                   'indexeventtensor_info', 'seqlentensor_info')

    def __init__(self, idx_mapper, dtype, paddsymbol):
        self.idx_mapper = idx_mapper
        self.idx_mapper_inverse = {numcode: pid for pid, numcode in self.idx_mapper.items()}
        self.dsettype = dtype
        self.paddsymbol = paddsymbol

    def memmap_arrays(self, X_tensor, Y_tensor, E_tensor, T_tensor, fpath):
        """Write the four arrays to ``<fpath>/<name>.dat`` and record their dtype/shape.

        Args:
            X_tensor: sequence tensor; its first/last dimensions become
                ``num_samples`` / ``input_dim``.
            Y_tensor: label tensor.
            E_tensor: index-event tensor.
            T_tensor: sequence-length tensor.
            fpath: existing directory that receives the ``.dat`` files.
        """
        arrays = (X_tensor, Y_tensor, E_tensor, T_tensor)
        for arr, arr_name, info_attr in zip(arrays, self._ARRAY_NAMES, self._INFO_ATTRS):
            tmparr = np.memmap(os.path.join(fpath, arr_name + '.dat'),
                               dtype=arr.dtype, mode='w+', shape=arr.shape)
            tmparr[:] = arr[:]
            # push the data to disk now instead of relying on garbage collection,
            # then release the mapping so the file handle is not kept open
            tmparr.flush()
            del tmparr
            setattr(self, info_attr, (arr.dtype, arr.shape))
        self.num_samples = X_tensor.shape[0]
        self.input_dim = X_tensor.shape[-1]

    def read_fromdisk(self, fpath, memmap=True):
        """Attach the four arrays stored under `fpath` as attributes of this object.

        Args:
            fpath: directory that contains the ``.dat`` files written by `memmap_arrays`.
            memmap: if True the attributes are read-only ``np.memmap`` objects; if
                False each array is copied into memory and detached from the file.
        """
        # to refactor this function/process
        # due to issues with multiprocessing and size of seq_tensor
        # loading seq_tensor is delayed until it is called within the spawned child processes
        for arr_name, info_attr in zip(self._ARRAY_NAMES, self._INFO_ATTRS):
            arr_dtype, arr_shape = getattr(self, info_attr)
            arr = np.memmap(os.path.join(fpath, arr_name + '.dat'),
                            dtype=arr_dtype, mode='r', shape=arr_shape)
            if not memmap:
                # np.asarray would only return a view that is still backed by the
                # file mapping; np.array makes the real in-memory copy
                arr = np.array(arr)
            setattr(self, arr_name, arr)
        if hasattr(self, 'fpath'):
            del self.fpath

def build_numpy_tensor(gdf, tensor, colnames, pid_mapper):
    """Copy one patient's rows into that patient's slot of `tensor` (in place).

    Args:
        gdf: DataFrame holding the rows of a single patient; must contain the
            'nrd_visitlink' column and every column in `colnames`.
        tensor: 3-D numpy array indexed as [patient, time step, feature]; only
            the first ``len(gdf)`` time steps of the patient's slot are written.
        colnames: list of columns of `gdf` to copy, matching the last dimension of `tensor`.
        pid_mapper: dict mapping an 'nrd_visitlink' value to its index on the first axis.
    """
    # Read the id from its own column: materialising the whole first row
    # (gdf.iloc[0]) would coerce all columns to a common dtype and can turn an
    # integer id into a float before the dictionary lookup.
    patient_id = pid_mapper[gdf['nrd_visitlink'].iloc[0]]
    tensor[patient_id, :len(gdf), :] = gdf[colnames].values
    
def generate_patientdataset_tensor(sample, traj_info, colfeatures, sample_type, fpath):
    """Build the padded tensors for the patients in `sample` and store them as memmaps.

    Args:
        sample: DataFrame with one row per visit; needs 'nrd_visitlink',
            'allcause_readmit', 'index_event' and every column in `colfeatures`.
        traj_info: DataFrame with one 'seq_len' value per 'nrd_visitlink'.
        colfeatures: list of feature columns copied into the sequence tensor.
        sample_type: label stored on the returned object (e.g. 'train').
        fpath: existing directory that receives the ``.dat`` files.

    Returns:
        PatientDataTensorMemmap describing the arrays written to `fpath`.
    """
    y_colname = 'allcause_readmit'
    idx = sample['nrd_visitlink'].unique()
    target_idx = traj_info['nrd_visitlink'].isin(idx)
    # the maximum is taken over ALL trajectories (not only this sample's)
    # to keep all datasets with uniform Time length
    max_seqlen = traj_info['seq_len'].max()
    N = len(idx)  # total number of samples (i.e patients)
    input_dim = len(colfeatures)
    pid_mapper = {vstlink: int(code) for code, vstlink in enumerate(list(idx))}

    # sequence tensor is zero padded; label and index-event tensors are padded
    # with PADDSYMB_INDX so padded steps can be told apart from real 0/1 values
    X_tensor = np.zeros((N, max_seqlen, input_dim))
    Y_tensor = np.zeros((N, max_seqlen, 1)) + PADDSYMB_INDX
    E_tensor = np.zeros((N, max_seqlen, 1)) + PADDSYMB_INDX
    # One explicit pass over the patient groups fills all three tensors. Iterating
    # (instead of groupby.apply used for its side effects) always hands the helper
    # a frame that still contains the 'nrd_visitlink' column.
    for _, gdf in sample.groupby('nrd_visitlink'):
        build_numpy_tensor(gdf, X_tensor, colfeatures, pid_mapper)
        build_numpy_tensor(gdf, Y_tensor, [y_colname], pid_mapper)
        build_numpy_tensor(gdf, E_tensor, ['index_event'], pid_mapper)

    # build sequence length tensor
    seqlen_info = traj_info.loc[target_idx, ['nrd_visitlink', 'seq_len']]
    # -1 marks "no length recorded". A signed dtype is requested explicitly:
    # uint8 * -1 only worked through implicit promotion to int16 on older NumPy
    # and raises OverflowError under NumPy 2.
    T_tensor = np.full(N, -1, dtype='int16')
    # plain lookup (also valid for an empty selection, unlike np.vectorize)
    T_idx = np.asarray([pid_mapper[vstlink] for vstlink in seqlen_info['nrd_visitlink']],
                       dtype=np.intp)
    T_tensor[T_idx] = seqlen_info['seq_len'].values

    patient_data = PatientDataTensorMemmap(pid_mapper, sample_type, PADDSYMB_INDX)
    patient_data.memmap_arrays(X_tensor, Y_tensor, E_tensor, T_tensor, fpath)
    return patient_data

def generate_classweights(fold_name, train_idx, option):
    """Compute balanced class weights for the training patients of one fold.

    Reads the module-level `traj_info` table, keeps the rows whose
    'nrd_visitlink' is in `train_idx` and derives
    ``w_class = nsamples / (2 * n_class)`` for the two classes.

    Args:
        fold_name: fold identifier, only used in the printed report.
        train_idx: collection of 'nrd_visitlink' values of the training patients.
        option: which events are counted:
            'all_indx'  -- sum of 'run_num_indxevents' / sum of 'count_allcausereadmit'
            'last_indx' -- one sample per patient / sum of 'allcause_readmit'
            'all'       -- sum of 'run_num_events' / sum of 'count_allcausereadmit'

    Returns:
        Tuple ``(w_zero, w_one)``.

    Raises:
        ValueError: if `option` is not one of the three values above.
    """
    valid_options = ('all_indx', 'last_indx', 'all')
    if option not in valid_options:
        # previously an unknown option surfaced later as an UnboundLocalError
        raise ValueError("unknown class weight option {!r}; expected one of {}".format(option, valid_options))
    train_sample = traj_info.loc[traj_info['nrd_visitlink'].isin(train_idx)]
    if option == 'all_indx':  # use all index events in the computation
        nsamples = train_sample['run_num_indxevents'].sum()
        n_one = train_sample['count_allcausereadmit'].sum()
    elif option == 'last_indx':  # use last index event in the computation
        nsamples = train_sample.shape[0]
        n_one = train_sample['allcause_readmit'].sum()
    else:  # 'all': use all events
        nsamples = train_sample['run_num_events'].sum()
        n_one = train_sample['count_allcausereadmit'].sum()
    n_zero = nsamples - n_one
    print("number of samples: ", nsamples)
    print("number of zeros: ", n_zero)
    print("number of ones: ", n_one)
    print("readmit rate: ", n_one/nsamples)
    w_zero = nsamples/(2*n_zero)
    w_one = nsamples/(2*n_one)
    print("fold: ", fold_name)
    print("w_zero:{}, w_one:{}".format(w_zero, w_one))
    print("-"*10)
    return (w_zero, w_one)

def generate_datasetfolds(datafolds, fsample, dataset_dir, norm_option):
    """Materialise every fold in `datafolds` as memmapped tensors on disk.

    For each fold a directory ``<fold_name>_<norm_option>`` is created under
    `dataset_dir`. Each split of the fold gets a ``<split>_pdtm`` sub-directory
    holding its tensors and a pickled PatientDataTensorMemmap; the train split
    additionally gets its class weights. A dict mapping split name to the
    pickled object's path is finally written to ``dataset_tuple.pkl``.

    Args:
        datafolds: dict fold name -> tuple of 'nrd_visitlink' collections,
            ordered as (train, validation, test); fewer entries are allowed.
        fsample: DataFrame from which the rows of each split are selected.
        dataset_dir: parent directory for the fold directories.
        norm_option: name of the normaliser; 'none' skips normalisation,
            otherwise ``<norm_option>_info.pkl`` is read from the fold directory.
    """
    dsettypes = ('train', 'validation', 'test')
    for fold_name, fold_splits in datafolds.items():
        print("fold ", fold_name)
        fold_dir = create_directory("{}_{}".format(fold_name, norm_option), dataset_dir)
        pdtm_paths = {}
        # fold_splits is a tuple of (train, validation, test) indices/visitlinks
        for pos, split_ids in enumerate(fold_splits):
            in_split = fsample['nrd_visitlink'].isin(split_ids)
            dset = fsample.loc[in_split].copy()
            split_name = dsettypes[pos]
            print(split_name + " dataset")
            print(len(dset))
            print()
            if norm_option != 'none':
                normalizer_file = os.path.join(fold_dir, "{}_info.pkl".format(norm_option))
                # return value is ignored -- presumably dset is modified in place (confirm)
                apply_normalization(dset, CONT_COLS, ReaderWriter.read_data(normalizer_file))
            pdt_path = create_directory(split_name + "_pdtm", fold_dir)
            pdt = generate_patientdataset_tensor(dset, traj_info, COL_FEATURES, split_name, pdt_path)
            # persist the lightweight handle next to its tensors
            pickle_path = os.path.join(pdt_path, 'pdtm_object.pkl')
            ReaderWriter.dump_data(pdt, pickle_path)
            pdtm_paths[pdt.dsettype] = pickle_path
            if split_name != 'train':
                continue
            # class weights are derived from the training patients only
            for classweight_option in ('last_indx',):
                class_weights = generate_classweights(fold_name, split_ids, classweight_option)
                weights_file = os.path.join(pdt_path, "classweights_" + classweight_option + ".pkl")
                ReaderWriter.dump_data(class_weights, weights_file)
        # store the split -> pickle path lookup for this fold
        ReaderWriter.dump_data(pdtm_paths, os.path.join(fold_dir, 'dataset_tuple.pkl'))

### Generate subset fold for hyperparameter tuning

In [None]:
# normalizing option with dataset directory
# only the first entry of norm_options is used below; 'none' makes
# generate_datasetfolds skip the normalisation step
norm_options = ('none', )
dataset_neural_dir = create_directory('dataset_neural', project_dir)

# build the subset dataset for hyperparameter optimization
# NOTE(review): 0.3 is presumably the fraction of the data kept in the subset -- confirm in get_datasubset
train_idx, val_idx = get_datasubset(datasplit, traj_info, 0.3)
# a single fold with only train and validation splits (no test split)
subsetfolds = {'subsetfold':(train_idx,  val_idx)}
# generate normalizer
# (disabled while the option is 'none'; generate_datasetfolds reads "<norm_option>_info.pkl"
#  from the fold directory for any other option, so that file must exist first)
# generate_normalizers(subsetfolds, fsample, dataset_neural_dir, CONT_COLS, normalize_options = norm_options)
generate_datasetfolds(subsetfolds, fsample, dataset_neural_dir, norm_options[0])

### Generate five folds

In [None]:
# datafolds -> fold_id:(train_idx, val_idx, test_idx)
# NOTE(review): 0.2 is presumably the validation fraction -- confirm in get_datafolds
datafolds = get_datafolds(datasplit, traj_info, 0.2)
# generate normalizer
# (disabled while the option is 'none'; generate_datasetfolds reads "<norm_option>_info.pkl"
#  from the fold directory for any other option, so that file must exist first)
# generate_normalizers(datafolds, fsample, dataset_neural_dir, CONT_COLS, normalize_options = norm_options)
# writes one "<fold_id>_<norm_option>" directory per fold under dataset_neural_dir
generate_datasetfolds(datafolds, fsample, dataset_neural_dir, norm_options[0])