In [1]:
import pandas as pd
import numpy as np
PROJECT_REPO_DIR = "/cluster/tufts/hugheslab/prath01/projects/time_series_prediction"
import sys
import os

PROJECT_SRC_DIR = os.path.join(PROJECT_REPO_DIR, 'src')
sys.path.append(PROJECT_SRC_DIR)
from feature_transformation import (parse_id_cols, remove_col_names_from_list_if_not_in_df, parse_time_col, parse_feature_cols)
from utils import load_data_dict_json

sys.path.append(os.path.join(PROJECT_SRC_DIR, "rnn"))
from dataset_loader import TidySequentialDataCSVLoader
PROJECT_SRC_DIR = '/cluster/tufts/hugheslab/prath01/projects/time_series_prediction/src/'
sys.path.append(PROJECT_SRC_DIR)
sys.path.append(os.path.abspath("../src"))

from feature_transformation import get_fenceposts

In [2]:
CLF_TRAIN_TEST_SPLIT_PATH = '/cluster/tufts/hugheslab/prath01/projects/time_series_prediction/datasets/mimic-iv/v20220627/split-by=subject_id/sequence_features_custom_times/classifier_train_test_split_dir/'
x_train_csv = os.path.join(CLF_TRAIN_TEST_SPLIT_PATH, 'x_trainCustomTimes_10_6_vitals_only.csv.gz')
x_valid_csv = os.path.join(CLF_TRAIN_TEST_SPLIT_PATH, 'x_validCustomTimes_10_6_vitals_only.csv.gz')
x_test_csv = os.path.join(CLF_TRAIN_TEST_SPLIT_PATH, 'x_testCustomTimes_10_6_vitals_only.csv.gz')


y_train_csv = os.path.join(CLF_TRAIN_TEST_SPLIT_PATH, 'y_trainCustomTimes_10_6_vitals_only.csv.gz')
y_valid_csv = os.path.join(CLF_TRAIN_TEST_SPLIT_PATH, 'y_validCustomTimes_10_6_vitals_only.csv.gz')
y_test_csv = os.path.join(CLF_TRAIN_TEST_SPLIT_PATH, 'y_testCustomTimes_10_6_vitals_only.csv.gz')

x_dict_json=os.path.join(CLF_TRAIN_TEST_SPLIT_PATH, 'x_dictCustomTimes_10_6_vitals_only.json')
x_data_dict = load_data_dict_json(x_dict_json)

x_train_df = pd.read_csv(x_train_csv)
x_valid_df = pd.read_csv(x_valid_csv)
x_test_df = pd.read_csv(x_test_csv)

y_train_df = pd.read_csv(y_train_csv)
y_valid_df = pd.read_csv(y_valid_csv)
y_test_df = pd.read_csv(y_test_csv)

# limit the EHR to first 48 hours
max_t = 48
x_train_df = x_train_df[x_train_df.stop<=max_t].reset_index(drop=True)
y_train_df = y_train_df[y_train_df.stop<=max_t].reset_index(drop=True)

x_valid_df = x_valid_df[x_valid_df.stop<=max_t].reset_index(drop=True)
y_valid_df = y_valid_df[y_valid_df.stop<=max_t].reset_index(drop=True)

x_test_df = x_test_df[x_test_df.stop<=max_t].reset_index(drop=True)
y_test_df = y_test_df[y_test_df.stop<=max_t].reset_index(drop=True)

y_train_df = y_train_df.drop_duplicates(subset='stay_id', keep='last').reset_index(drop=True)
y_valid_df = y_valid_df.drop_duplicates(subset='stay_id', keep='last').reset_index(drop=True)
y_test_df = y_test_df.drop_duplicates(subset='stay_id', keep='last').reset_index(drop=True)

feature_cols = parse_feature_cols(x_data_dict['schema'])
id_cols = parse_id_cols(x_data_dict['schema'])

In [3]:
for split, y_df in [('train', y_train_df), 
                    ('valid', y_valid_df), 
                    ('test', y_test_df)]:
    print('===================%s==========================='%split)
    
    n_adms_pos_outcome = len(y_df[y_df['in_icu_mortality']==1]['stay_id'].unique())
    n_adms_total = len(y_df['stay_id'].unique())
    
    print('Number of slices in %s : %s'%(split, len(y_df)))
    print('Frac positive slices in %s : %.4f'%(split, y_df['in_icu_mortality'].sum()/len(y_df)))
    print('Number of admissions in %s : %s'%(split, n_adms_total))
    print('Frac positive admissions in %s : %.4f'%(split, n_adms_pos_outcome/n_adms_total))
    



Number of slices in train : 42836
Frac positive slices in train : 0.0136
Number of admissions in train : 42836
Frac positive admissions in train : 0.0136
Number of slices in valid : 15443
Frac positive slices in valid : 0.0131
Number of admissions in valid : 15443
Frac positive admissions in valid : 0.0131
Number of slices in test : 15802
Frac positive slices in test : 0.0116
Number of admissions in test : 15802
Frac positive admissions in test : 0.0116


In [5]:
train_vitals = TidySequentialDataCSVLoader(
    x_csv_path=x_train_df,
    y_csv_path=y_train_df,
    x_col_names=feature_cols,
    idx_col_names=id_cols,
    y_col_name="in_icu_mortality",
    y_label_type='per_sequence'
)

valid_vitals = TidySequentialDataCSVLoader(
    x_csv_path=x_valid_df,
    y_csv_path=y_valid_df,
    x_col_names=feature_cols,
    idx_col_names=id_cols,
    y_col_name="in_icu_mortality",
    y_label_type='per_sequence'
)

test_vitals = TidySequentialDataCSVLoader(
    x_csv_path=x_test_df,
    y_csv_path=y_test_df,
    x_col_names=feature_cols,
    idx_col_names=id_cols,
    y_col_name="in_icu_mortality",
    y_label_type='per_sequence'
)

# num_true_feats = int(F/3)
train_x_NTD, y_train = train_vitals.get_batch_data(batch_id=0)
valid_x_NTD, y_valid = valid_vitals.get_batch_data(batch_id=0)
test_x_NTD, y_test = test_vitals.get_batch_data(batch_id=0)

N_tr = len(train_x_NTD)
N_va = len(valid_x_NTD)
N_te = len(test_x_NTD)

In [26]:
y_train.shape

(42836,)

In [6]:
state_id = 41
data_save_dir = '/cluster/tufts/hugheslab/prath01/datasets/mimic4_ssl/'

for ii, perc_labelled in enumerate([1.2, 3.7, 11.1, 33.3, 100]):#3.7, 11.1, 33.3, 100
    curr_save_dir = os.path.join(data_save_dir, 'percentage_labelled_sequnces=%s'%perc_labelled)
    
    print('---------------------------------------------------------------------------')
    print('CREATING TRAIN/VALID/TEST SPLITS FOR %.3f PERCENT OF SEQUENCES LABELLED'%perc_labelled)
    print('---------------------------------------------------------------------------')
    y_train_ss = y_train.copy()
    rnd_state = np.random.RandomState(state_id)
    n_unlabelled_tr = int((1-(perc_labelled)/100)*N_tr)
    unlabelled_inds_tr = rnd_state.permutation(N_tr)[:n_unlabelled_tr]
    y_train_ss = y_train_ss.astype(np.float32)
    y_train_ss[unlabelled_inds_tr] = np.nan  
    if perc_labelled!=100:
        print('Excluded inds train: %d, %d, %d ... %d, %d, %d'%(unlabelled_inds_tr[0],
                                                              unlabelled_inds_tr[1],
                                                              unlabelled_inds_tr[2],
                                                              unlabelled_inds_tr[-3],
                                                              unlabelled_inds_tr[-2],
                                                              unlabelled_inds_tr[-1]))
    
    y_valid_ss = y_valid.copy()
    rnd_state = np.random.RandomState(state_id)
    n_unlabelled_va = int((1-(perc_labelled)/100)*N_va)
    unlabelled_inds_va = rnd_state.permutation(N_va)[:n_unlabelled_va]
    y_valid_ss = y_valid_ss.astype(np.float32)
    y_valid_ss[unlabelled_inds_va] = np.nan 
    if perc_labelled!=100:
        print('Excluded inds valid: %d, %d, %d ... %d, %d, %d'%(unlabelled_inds_va[0],
                                                          unlabelled_inds_va[1],
                                                          unlabelled_inds_va[2],
                                                          unlabelled_inds_va[-3],
                                                          unlabelled_inds_va[-2],
                                                          unlabelled_inds_va[-1]))

    y_test_ss = y_test.copy()
    rnd_state = np.random.RandomState(state_id)
    n_unlabelled_te = int((1-(perc_labelled)/100)*N_te)
    unlabelled_inds_te = rnd_state.permutation(N_te)[:n_unlabelled_te]
    y_test_ss = y_test_ss.astype(np.float32)
    y_test_ss[unlabelled_inds_te] = np.nan
    if perc_labelled!=100:
        print('Excluded inds test: %d, %d, %d ... %d, %d, %d'%(unlabelled_inds_te[0],
                                                          unlabelled_inds_te[1],
                                                          unlabelled_inds_te[2],
                                                          unlabelled_inds_te[-3],
                                                          unlabelled_inds_te[-2],
                                                          unlabelled_inds_te[-1]))
    
#     # Check whether the specified path exists or not
#     isExist = os.path.exists(curr_save_dir)

#     if not isExist:
#         # Create a new directory because it does not exist 
#         os.makedirs(curr_save_dir)
        
#     # save the data to the respective folder
#     print('Saving data to %s'%curr_save_dir)
#     np.save(os.path.join(curr_save_dir, 'X_train.npy'), train_x_NTD)
#     np.save(os.path.join(curr_save_dir, 'y_train.npy'), y_train_ss)
#     print('Done saving train..')
#     np.save(os.path.join(curr_save_dir, 'X_valid.npy'), valid_x_NTD)
#     np.save(os.path.join(curr_save_dir, 'y_valid.npy'), y_valid_ss)
#     print('Done saving valid..')
#     np.save(os.path.join(curr_save_dir, 'X_test.npy'), test_x_NTD)
#     np.save(os.path.join(curr_save_dir, 'y_test.npy'), y_test_ss)
#     print('Done saving test..')
    
    
    print('---------------------------------------------------------------------------')
    for split, y in [('train', y_train_ss),
                    ('valid', y_valid_ss),
                    ('test', y_test_ss)]:
        frac_pos_labels = np.nansum(y)/(~np.isnan(y)).sum()
        print('fraction positive labels in %s set with %.3f percent of sequences labelled : %.4f'%(split,
                                                                                                   perc_labelled,
                                                                                                   frac_pos_labels))
    print('---------------------------------------------------------------------------')

---------------------------------------------------------------------------
CREATING TRAIN/VALID/TEST SPLITS FOR 1.200 PERCENT OF SEQUENCES LABELLED
---------------------------------------------------------------------------
Excluded inds train: 3658, 24411, 36568 ... 21793, 12641, 38467
Excluded inds valid: 14685, 4004, 13801 ... 14248, 4256, 6648
Excluded inds test: 10851, 4531, 10134 ... 6648, 14074, 5532
---------------------------------------------------------------------------
fraction positive labels in train set with 1.200 percent of sequences labelled : 0.0078
fraction positive labels in valid set with 1.200 percent of sequences labelled : 0.0054
fraction positive labels in test set with 1.200 percent of sequences labelled : 0.0053
---------------------------------------------------------------------------
---------------------------------------------------------------------------
CREATING TRAIN/VALID/TEST SPLITS FOR 3.700 PERCENT OF SEQUENCES LABELLED
------------------------

In [7]:
(.7+.5+.5)/3

0.5666666666666667

## Make collapsed version of dataset for LR, RF

In [5]:
from featurize_single_time_series import collapse_std, collapse_elapsed_time_since_last_measured, collapse_count, collapse_slope, collapse_median, collapse_min, collapse_max, collapse_value_last_measured, make_summary_ops

In [6]:
# collapse features
def featurize_ts(
        time_arr_by_var,
        val_arr_by_var,
        n_features,
        percentile_slices_to_featurize=[(0., 100.)],
        summary_ops=['count', 'mean', 'std', 'slope'],
        ):
    ''' Featurize provided multivariate irregular time series into flat vector
    Args
    ----
    time_arr_by_var : dict of 1D NumPy arrays
    val_arr_by_var : dict of 1D NumPy arrays
    start_numerictime : float
        Indicates numerical time value at which current window *starts*
    stop_numerictime : float
        Indicates numerical time that current window *stops*
    Returns
    -------
    feat_vec_1F : 2D NumPy array, shape (1, F)
        One entry for each combination of {variable, summary op, subwindow slice}
    '''
    
    start_numerictime = 0
    stop_numerictime = 24
    time_range = stop_numerictime - start_numerictime

    F = len(percentile_slices_to_featurize) * n_features * len (summary_ops)
    feat_vec_1F = np.zeros((1, F))
    ff = 0

    SUMMARY_OPERATIONS = make_summary_ops()

    for rp_ind, (low, high) in enumerate(percentile_slices_to_featurize):
        cur_window_start_time = start_numerictime + float(low) / 100 * time_range
        cur_window_stop_time = start_numerictime + float(high) / 100 * time_range

        for var_id in range(n_features):
            cur_feat_arr = val_arr_by_var[:, var_id].astype('float')
            cur_numerictime_arr = time_arr_by_var

            # Keep only the entries whose times occur within current window
            start = np.searchsorted(
                cur_numerictime_arr, cur_window_start_time, side='left')
            stop = np.searchsorted(
                cur_numerictime_arr, cur_window_stop_time, side='right')
            cur_numerictime_arr = cur_numerictime_arr[start:stop]
            cur_feat_arr = cur_feat_arr[start:stop]
            cur_isfinite_arr = np.isfinite(cur_feat_arr)
            
            for op_ind, op in enumerate(summary_ops):
                summary_func, empty_val = SUMMARY_OPERATIONS[op]
                if cur_feat_arr.size < 1 or cur_isfinite_arr.sum() < 1:
                    feat_vec_1F[0,ff] = empty_val
                else:
                    feat_vec_1F[0,ff] = summary_func(
                        cur_feat_arr, cur_numerictime_arr, cur_isfinite_arr,
                        cur_window_start_time, cur_window_stop_time)
#                 feat_names.append("feature_%s_%s_%.0f-%.0f" % (var_id, op, float(low), float(high)))
                ff += 1
    return feat_vec_1F

In [26]:
N_tr = len(train_x_NTD)
N_va = len(valid_x_NTD)
N_te = len(test_x_NTD)
percentile_slices_to_featurize = [(0., 100.)]
summary_ops = ["std", "time_since_measured", "count", "slope", "median", "min", "max"]
n_features = train_x_NTD.shape[-1]
F = len(percentile_slices_to_featurize) * n_features * len (summary_ops)
fps_train = get_fenceposts(x_train_df, id_cols)
fps_valid = get_fenceposts(x_valid_df, id_cols)
fps_test = get_fenceposts(x_test_df, id_cols)


train_x_collapsed_NF = np.zeros((N_tr, F))
valid_x_collapsed_NF = np.zeros((N_va, F))
test_x_collapsed_NF = np.zeros((N_te, F))

print('Collapsing train feaures')
for nn in range(N_tr):
    if (nn%500)==0:
        print('Done with %s sequences..'%nn)
    
    T = train_x_NTD.shape[1]
    train_x_collapsed_NF[nn, :] = featurize_ts(np.arange(0, T).astype(float),
                                               train_x_NTD[nn],
                                               n_features,
                                               percentile_slices_to_featurize=percentile_slices_to_featurize,
                                               summary_ops=summary_ops)
print('Collapsing valid feaures')
for nn in range(N_va):
    if (nn%500)==0:
        print('Done with %s sequences..'%nn)
    valid_x_collapsed_NF[nn, :] = featurize_ts(np.arange(0, T).astype(float),
                                               valid_x_NTD[nn],
                                               n_features,
                                               percentile_slices_to_featurize=percentile_slices_to_featurize,
                                               summary_ops=summary_ops)

print('Collapsing test feaures')
for nn in range(N_te):
    if (nn%500)==0:
        print('Done with %s sequences..'%nn)
    test_x_collapsed_NF[nn, :] = featurize_ts(np.arange(0, T).astype(float),
                                               test_x_NTD[nn],
                                               n_features,
                                               percentile_slices_to_featurize=percentile_slices_to_featurize,
                                               summary_ops=summary_ops)

Collapsing train feaures
Done with 0 sequences..
Done with 500 sequences..
Done with 1000 sequences..
Done with 1500 sequences..
Done with 2000 sequences..
Done with 2500 sequences..
Done with 3000 sequences..
Done with 3500 sequences..
Done with 4000 sequences..
Done with 4500 sequences..
Done with 5000 sequences..
Done with 5500 sequences..
Done with 6000 sequences..
Done with 6500 sequences..
Done with 7000 sequences..
Done with 7500 sequences..
Done with 8000 sequences..
Done with 8500 sequences..
Done with 9000 sequences..
Done with 9500 sequences..
Done with 10000 sequences..
Done with 10500 sequences..
Done with 11000 sequences..
Done with 11500 sequences..
Done with 12000 sequences..
Done with 12500 sequences..
Done with 13000 sequences..
Done with 13500 sequences..
Done with 14000 sequences..
Done with 14500 sequences..
Done with 15000 sequences..
Done with 15500 sequences..
Done with 16000 sequences..
Done with 16500 sequences..
Done with 17000 sequences..
Done with 17500 seq

In [27]:
state_id = 41
data_save_dir = '/cluster/tufts/hugheslab/prath01/datasets/mimic4_ssl/'

for ii, perc_labelled in enumerate([1.2, 3.7, 11.1, 33.3, 100]):#3.7, 11.1, 33.3, 100
    curr_save_dir = os.path.join(data_save_dir, 'percentage_labelled_sequnces=%s'%perc_labelled)
    
    print('---------------------------------------------------------------------------')
    print('CREATING TRAIN/VALID/TEST SPLITS FOR %.3f PERCENT OF SEQUENCES LABELLED'%perc_labelled)
    print('---------------------------------------------------------------------------')
    y_train_ss = y_train.copy()
    rnd_state = np.random.RandomState(state_id)
    n_unlabelled_tr = int((1-(perc_labelled)/100)*N_tr)
    unlabelled_inds_tr = rnd_state.permutation(N_tr)[:n_unlabelled_tr]
    y_train_ss = y_train_ss.astype(np.float32)
    y_train_ss[unlabelled_inds_tr] = np.nan  
    if perc_labelled!=100:
        print('Excluded inds train: %d, %d, %d ... %d, %d, %d'%(unlabelled_inds_tr[0],
                                                              unlabelled_inds_tr[1],
                                                              unlabelled_inds_tr[2],
                                                              unlabelled_inds_tr[-3],
                                                              unlabelled_inds_tr[-2],
                                                              unlabelled_inds_tr[-1]))
    
    y_valid_ss = y_valid.copy()
    rnd_state = np.random.RandomState(state_id)
    n_unlabelled_va = int((1-(perc_labelled)/100)*N_va)
    unlabelled_inds_va = rnd_state.permutation(N_va)[:n_unlabelled_va]
    y_valid_ss = y_valid_ss.astype(np.float32)
    y_valid_ss[unlabelled_inds_va] = np.nan 
    if perc_labelled!=100:
        print('Excluded inds valid: %d, %d, %d ... %d, %d, %d'%(unlabelled_inds_va[0],
                                                          unlabelled_inds_va[1],
                                                          unlabelled_inds_va[2],
                                                          unlabelled_inds_va[-3],
                                                          unlabelled_inds_va[-2],
                                                          unlabelled_inds_va[-1]))

    y_test_ss = y_test.copy()
    rnd_state = np.random.RandomState(state_id)
    n_unlabelled_te = int((1-(perc_labelled)/100)*N_te)
    unlabelled_inds_te = rnd_state.permutation(N_te)[:n_unlabelled_te]
    y_test_ss = y_test_ss.astype(np.float32)
    y_test_ss[unlabelled_inds_te] = np.nan
    if perc_labelled!=100:
        print('Excluded inds test: %d, %d, %d ... %d, %d, %d'%(unlabelled_inds_te[0],
                                                          unlabelled_inds_te[1],
                                                          unlabelled_inds_te[2],
                                                          unlabelled_inds_te[-3],
                                                          unlabelled_inds_te[-2],
                                                          unlabelled_inds_te[-1]))
    
    # Check whether the specified path exists or not
    isExist = os.path.exists(curr_save_dir)

    if not isExist:
        # Create a new directory because it does not exist 
        os.makedirs(curr_save_dir)
        
    # save the data to the respective folder
    print('Saving data to %s'%curr_save_dir)
    np.save(os.path.join(curr_save_dir, 'X_train_collapsed.npy'), train_x_collapsed_NF)
    np.save(os.path.join(curr_save_dir, 'y_train_collapsed.npy'), y_train_ss)
    print('Done saving train..')
    np.save(os.path.join(curr_save_dir, 'X_valid_collapsed.npy'), valid_x_collapsed_NF)
    np.save(os.path.join(curr_save_dir, 'y_valid_collapsed.npy'), y_valid_ss)
    print('Done saving valid..')
    np.save(os.path.join(curr_save_dir, 'X_test_collapsed.npy'), test_x_collapsed_NF)
    np.save(os.path.join(curr_save_dir, 'y_test_collapsed.npy'), y_test_ss)
    print('Done saving test..')



---------------------------------------------------------------------------
CREATING TRAIN/VALID/TEST SPLITS FOR 1.200 PERCENT OF SEQUENCES LABELLED
---------------------------------------------------------------------------
Excluded inds train: 3658, 24411, 36568 ... 21793, 12641, 38467
Excluded inds valid: 14685, 4004, 13801 ... 14248, 4256, 6648
Excluded inds test: 10851, 4531, 10134 ... 6648, 14074, 5532
Saving data to /cluster/tufts/hugheslab/prath01/datasets/mimic4_ssl/percentage_labelled_sequnces=1.2
Done saving train..
Done saving valid..
Done saving test..
---------------------------------------------------------------------------
CREATING TRAIN/VALID/TEST SPLITS FOR 3.700 PERCENT OF SEQUENCES LABELLED
---------------------------------------------------------------------------
Excluded inds train: 3658, 24411, 36568 ... 12705, 41004, 20243
Excluded inds valid: 14685, 4004, 13801 ... 8849, 9201, 5668
Excluded inds test: 10851, 4531, 10134 ... 11803, 992, 8849
Saving data to /cl

In [15]:
N_tr

42836

In [18]:
x_train_df

Unnamed: 0,subject_id,hadm_id,stay_id,start,stop,blood_glucose_concentration,bmi,body_temperature,diastolic_blood_pressure,heart_rate,height,o2_sat,respiratory_rate,systolic_blood_pressure,weight,admission_timestamp,stop_time,Age,is_gender_male,is_gender_unknown
0,10001217,24597018,37067082,-24.0,-17.3,,,,,,,,,,,2157-11-20 19:18:02,2:00:00,55,0,0
1,10001217,24597018,37067082,-17.3,-9.3,,,,,,,,,,,2157-11-20 19:18:02,10:00:00,55,0,0
2,10001217,24597018,37067082,-9.3,-1.3,,,,,,,,,,,2157-11-20 19:18:02,18:00:00,55,0,0
3,10001217,24597018,37067082,-1.3,6.7,,,37.138890,90.0,86.0,,99.0,18.0,151.0,,2157-11-20 19:18:02,2:00:00,55,0,0
4,10001217,24597018,37067082,6.7,14.7,,,36.722220,72.0,89.0,,98.0,19.0,141.0,,2157-11-20 19:18:02,10:00:00,55,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
326904,19999987,23865745,36195440,3.0,11.0,,,37.222220,67.0,90.0,,100.0,20.0,118.0,,2145-11-02 22:59:00,10:00:00,57,0,0
326905,19999987,23865745,36195440,11.0,19.0,,,37.166668,68.0,103.0,,93.0,19.0,111.0,,2145-11-02 22:59:00,18:00:00,57,0,0
326906,19999987,23865745,36195440,19.0,27.0,,,37.166668,59.0,102.0,,100.0,22.0,101.0,,2145-11-02 22:59:00,2:00:00,57,0,0
326907,19999987,23865745,36195440,27.0,35.0,,,38.111110,72.0,113.0,,99.0,27.0,116.0,,2145-11-02 22:59:00,10:00:00,57,0,0
