In [1]:
import pandas as pd
import numpy as np
import sys
import os

sys.path.append(os.path.abspath(os.path.join('../utils')))
sys.path.append(os.path.abspath(os.path.join('../', 'utils', 'GRU-D')))
from dataset_loader import TidySequentialDataCSVLoader
from utils_preproc import parse_id_cols, parse_output_cols, parse_feature_cols, parse_id_cols, parse_time_cols, load_data_dict_json, get_fenceposts

In [2]:
# Directory containing the x/y train-valid-test CSVs and the x data dictionary.
CLF_TRAIN_TEST_SPLIT_PATH = 'data/classifier_train_test_split_dir/'

# Feature tables, one per split.
x_train_csv = os.path.join(CLF_TRAIN_TEST_SPLIT_PATH, 'x_trainCustomTimes_10_6_vitals_only.csv.gz')
x_valid_csv = os.path.join(CLF_TRAIN_TEST_SPLIT_PATH, 'x_validCustomTimes_10_6_vitals_only.csv.gz')
x_test_csv = os.path.join(CLF_TRAIN_TEST_SPLIT_PATH, 'x_testCustomTimes_10_6_vitals_only.csv.gz')

# Outcome tables, one per split.
y_train_csv = os.path.join(CLF_TRAIN_TEST_SPLIT_PATH, 'y_trainCustomTimes_10_6_vitals_only.csv.gz')
y_valid_csv = os.path.join(CLF_TRAIN_TEST_SPLIT_PATH, 'y_validCustomTimes_10_6_vitals_only.csv.gz')
y_test_csv = os.path.join(CLF_TRAIN_TEST_SPLIT_PATH, 'y_testCustomTimes_10_6_vitals_only.csv.gz')

# Data dictionary whose 'schema' entry is used below to pick feature / id columns.
x_dict_json = os.path.join(CLF_TRAIN_TEST_SPLIT_PATH, 'x_dictCustomTimes_10_6_vitals_only.json')
x_data_dict = load_data_dict_json(x_dict_json)

# limit the EHR to first 48 hours
max_t = 48


def _read_up_to(csv_path, max_stop):
    """Read `csv_path` and keep only rows with `stop <= max_stop`, re-indexed from 0."""
    frame = pd.read_csv(csv_path)
    within_window = frame['stop'] <= max_stop
    return frame.loc[within_window].reset_index(drop=True)


def _last_row_per_stay(y_frame):
    """Keep the last remaining row of every `stay_id`, re-indexed from 0."""
    deduped = y_frame.drop_duplicates(subset='stay_id', keep='last')
    return deduped.reset_index(drop=True)


x_train_df = _read_up_to(x_train_csv, max_t)
x_valid_df = _read_up_to(x_valid_csv, max_t)
x_test_df = _read_up_to(x_test_csv, max_t)

# Outcomes are truncated to the same window first, then collapsed to one row
# per stay, so each stay ends up with the label of its last row inside the window.
y_train_df = _last_row_per_stay(_read_up_to(y_train_csv, max_t))
y_valid_df = _last_row_per_stay(_read_up_to(y_valid_csv, max_t))
y_test_df = _last_row_per_stay(_read_up_to(y_test_csv, max_t))

feature_cols = parse_feature_cols(x_data_dict['schema'])
id_cols = parse_id_cols(x_data_dict['schema'])

In [3]:
# Report, for every split, how many label rows and distinct stays there are and
# what fraction of each carries a positive `in_icu_mortality` outcome.
labels_by_split = {'train': y_train_df, 'valid': y_valid_df, 'test': y_test_df}

for split_name, labels_df in labels_by_split.items():
    print('===================%s==========================='%split_name)

    outcome = labels_df['in_icu_mortality']
    stay_ids = labels_df['stay_id']

    n_rows = len(labels_df)
    # dropna=False keeps the count equal to len(stay_ids.unique()).
    n_stays = stay_ids.nunique(dropna=False)
    n_stays_positive = stay_ids[outcome == 1].nunique(dropna=False)

    print('Number of slices in %s : %s'%(split_name, n_rows))
    print('Frac positive slices in %s : %.4f'%(split_name, outcome.sum()/n_rows))
    print('Number of admissions in %s : %s'%(split_name, n_stays))
    print('Frac positive admissions in %s : %.4f'%(split_name, n_stays_positive/n_stays))
    



Number of slices in train : 42836
Frac positive slices in train : 0.0136
Number of admissions in train : 42836
Frac positive admissions in train : 0.0136
Number of slices in valid : 15443
Frac positive slices in valid : 0.0131
Number of admissions in valid : 15443
Frac positive admissions in valid : 0.0131
Number of slices in test : 15802
Frac positive slices in test : 0.0116
Number of admissions in test : 15802
Frac positive admissions in test : 0.0116


In [5]:
# Keyword arguments shared by the three loaders: which columns are features,
# which identify a sequence, and which column holds the per-sequence label.
# NOTE(review): the loaders receive in-memory DataFrames through their
# `*_csv_path` arguments; presumably TidySequentialDataCSVLoader accepts either
# a path or a frame -- confirm against dataset_loader.
shared_loader_kwargs = dict(
    x_col_names=feature_cols,
    idx_col_names=id_cols,
    y_col_name="in_icu_mortality",
    y_label_type='per_sequence',
)

train_vitals = TidySequentialDataCSVLoader(
    x_csv_path=x_train_df, y_csv_path=y_train_df, **shared_loader_kwargs)
valid_vitals = TidySequentialDataCSVLoader(
    x_csv_path=x_valid_df, y_csv_path=y_valid_df, **shared_loader_kwargs)
test_vitals = TidySequentialDataCSVLoader(
    x_csv_path=x_test_df, y_csv_path=y_test_df, **shared_loader_kwargs)

# NOTE(review): leftover from an earlier revision -> num_true_feats = int(F/3)

# Pull batch 0 of every split as a (features, labels) pair.
train_x_NTD, y_train = train_vitals.get_batch_data(batch_id=0)
valid_x_NTD, y_valid = valid_vitals.get_batch_data(batch_id=0)
test_x_NTD, y_test = test_vitals.get_batch_data(batch_id=0)

# Number of entries along the first axis of each feature batch.
N_tr, N_va, N_te = (len(batch) for batch in (train_x_NTD, valid_x_NTD, test_x_NTD))

In [26]:
# Sanity check: display the shape of the training labels returned by
# train_vitals.get_batch_data(batch_id=0).
# NOTE(review): this cell carries an out-of-order execution count (In [26]);
# re-run the notebook top-to-bottom before relying on the shown output.
y_train.shape

(42836,)

In [6]:
# Seed reused for every split and every labelling percentage, so the set of
# hidden sequences is reproducible.
state_id = 41
data_save_dir = 'data/classifier_train_test_split_dir/'


def _mask_labels(y, n_sequences, perc_labelled, seed):
    """Hide a random subset of labels so that `perc_labelled` percent stay visible.

    Parameters
    ----------
    y : label array; it is not modified.
    n_sequences : int, number of sequences to draw the hidden subset from.
    perc_labelled : float, percentage (0-100) of sequences that keep a label.
    seed : int, seed of the fresh RandomState used to pick hidden sequences.

    Returns
    -------
    y_ss : float32 copy of `y` with the hidden entries set to NaN.
    unlabelled_inds : indices of the hidden entries, in the order drawn.
    """
    n_unlabelled = int((1 - perc_labelled / 100) * n_sequences)
    unlabelled_inds = np.random.RandomState(seed).permutation(n_sequences)[:n_unlabelled]
    # astype already returns a new array, so no separate .copy() is needed;
    # a float dtype is required to be able to store NaN.
    y_ss = y.astype(np.float32)
    y_ss[unlabelled_inds] = np.nan
    return y_ss, unlabelled_inds


def _print_excluded_preview(split, unlabelled_inds):
    """Print the first three and last three hidden indices of `split`.

    Nothing is printed when fewer than three indices were hidden (this
    includes the fully labelled case), which avoids indexing past the end
    of a short or empty index array.
    """
    if len(unlabelled_inds) < 3:
        return
    preview = tuple(unlabelled_inds[:3]) + tuple(unlabelled_inds[-3:])
    print('Excluded inds %s: %d, %d, %d ... %d, %d, %d' % ((split,) + preview))


rule = '---------------------------------------------------------------------------'

for perc_labelled in [1.2, 3.7, 11.1, 33.3, 100]:
    # NOTE(review): not used further inside this cell. The spelling
    # "sequnces" is part of the on-disk path, so it is kept as is.
    curr_save_dir = os.path.join(data_save_dir, 'percentage_labelled_sequnces=%s'%perc_labelled)

    print(rule)
    print('CREATING TRAIN/VALID/TEST SPLITS FOR %.3f PERCENT OF SEQUENCES LABELLED'%perc_labelled)
    print(rule)

    y_train_ss, unlabelled_inds_tr = _mask_labels(y_train, N_tr, perc_labelled, state_id)
    _print_excluded_preview('train', unlabelled_inds_tr)

    y_valid_ss, unlabelled_inds_va = _mask_labels(y_valid, N_va, perc_labelled, state_id)
    _print_excluded_preview('valid', unlabelled_inds_va)

    y_test_ss, unlabelled_inds_te = _mask_labels(y_test, N_te, perc_labelled, state_id)
    _print_excluded_preview('test', unlabelled_inds_te)

    print(rule)
    for split, y in [('train', y_train_ss),
                     ('valid', y_valid_ss),
                     ('test', y_test_ss)]:
        # Positive rate among the entries that still carry a label (NaN = hidden).
        frac_pos_labels = np.nansum(y)/(~np.isnan(y)).sum()
        print('fraction positive labels in %s set with %.3f percent of sequences labelled : %.4f'%(split,
                                                                                                   perc_labelled,
                                                                                                   frac_pos_labels))
    print(rule)

---------------------------------------------------------------------------
CREATING TRAIN/VALID/TEST SPLITS FOR 1.200 PERCENT OF SEQUENCES LABELLED
---------------------------------------------------------------------------
Excluded inds train: 3658, 24411, 36568 ... 21793, 12641, 38467
Excluded inds valid: 14685, 4004, 13801 ... 14248, 4256, 6648
Excluded inds test: 10851, 4531, 10134 ... 6648, 14074, 5532
---------------------------------------------------------------------------
fraction positive labels in train set with 1.200 percent of sequences labelled : 0.0078
fraction positive labels in valid set with 1.200 percent of sequences labelled : 0.0054
fraction positive labels in test set with 1.200 percent of sequences labelled : 0.0053
---------------------------------------------------------------------------
---------------------------------------------------------------------------
CREATING TRAIN/VALID/TEST SPLITS FOR 3.700 PERCENT OF SEQUENCES LABELLED
------------------------