In [1]:
import numpy as np
import pandas as pd
import os
import sys
# Make the sibling 'predictions_collapsed' and 'src' directories (one level up
# from the working directory) importable, in that order.
sys.path.extend(
    os.path.join(os.path.abspath('../'), subdir)
    for subdir in ('predictions_collapsed', 'src')
)
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
# Show full cell contents without truncation. `None` is the supported way to
# ask for "unlimited"; the old `-1` sentinel was deprecated in pandas 1.0 and
# is rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)
import glob

from config_loader import (
    D_CONFIG, DATASET_TOP_PATH,
    DATASET_SITE_PATH, PROJECT_REPO_DIR, PROJECT_CONDA_ENV_YAML,
    DATASET_COLLAPSED_FEAT_DYNAMIC_INPUT_OUTPUT_PATH
    )

# Make the project's own `src` directory importable before pulling in helpers.
sys.path.append(os.path.join(PROJECT_REPO_DIR, 'src'))
# NOTE(review): star import kept for compatibility; helpers used later in this
# notebook (e.g. parse_id_cols) presumably come from here -- prefer explicit
# names once the full set of used helpers is confirmed.
from feature_transformation import *
from utils import load_data_dict_json

# Show full cell contents without truncation. This line previously carried a
# stray leading indent (an IndentationError at top level) and used the `-1`
# sentinel, which pandas deprecated in 1.0 in favour of `None`.
pd.set_option('display.max_colwidth', None)


In [2]:
# Echo the directory (imported from config_loader) that all collapsed-feature
# and output files below are read from.
DATASET_COLLAPSED_FEAT_DYNAMIC_INPUT_OUTPUT_PATH

'/home/prash/datasets/v20210419/HUF/split-by=patient_id/collapsed_features_dynamic_input_output'

## Get collapsed feats

In [4]:
# Read the three collapsed dynamic feature tables (vitals, labs, medications)
# from the shared dataset directory, one gzipped CSV each.
(dynamic_collapsed_vitals_df,
 dynamic_collapsed_labs_df,
 dynamic_collapsed_medications_df) = [
    pd.read_csv(os.path.join(DATASET_COLLAPSED_FEAT_DYNAMIC_INPUT_OUTPUT_PATH, csv_name))
    for csv_name in (
        'CollapsedVitalsDynamic.csv.gz',
        'CollapsedLabsDynamic.csv.gz',
        'CollapsedMedicationsDynamic.csv.gz',
    )
]


## Get data dicts

In [7]:
# Load the Spec_*.json data dicts that sit next to each collapsed feature table.
vitals_dd, labs_dd, medications_dd = [
    load_data_dict_json(os.path.join(DATASET_COLLAPSED_FEAT_DYNAMIC_INPUT_OUTPUT_PATH, spec_name))
    for spec_name in (
        'Spec_CollapsedVitalsDynamic.json',
        'Spec_CollapsedLabsDynamic.json',
        'Spec_CollapsedMedicationsDynamic.json',
    )
]

## Get outputs

In [3]:
# Read the dynamic output tables that pair with the vitals, labs and
# medications feature tables.
vitals_output, labs_output, medications_output = [
    pd.read_csv(os.path.join(DATASET_COLLAPSED_FEAT_DYNAMIC_INPUT_OUTPUT_PATH, csv_name))
    for csv_name in (
        'OutputsDynamicVitals.csv.gz',
        'OutputsDynamicLabs.csv.gz',
        'OutputsDynamicMedications.csv.gz',
    )
]

In [12]:
# Print (rows, columns) of each collapsed feature table, one per line.
print(
    *(
        collapsed_df.shape
        for collapsed_df in (
            dynamic_collapsed_vitals_df,
            dynamic_collapsed_labs_df,
            dynamic_collapsed_medications_df,
        )
    ),
    sep='\n',
)

(434898, 75)
(370358, 185)
(262246, 333)


# Check if dimensions make sense

In [16]:
# Number of distinct hospital admissions present in each feature table.
# dropna=False keeps the count identical to len(Series.unique()).
print(
    *(
        collapsed_df['hospital_admission_id'].nunique(dropna=False)
        for collapsed_df in (
            dynamic_collapsed_vitals_df,
            dynamic_collapsed_labs_df,
            dynamic_collapsed_medications_df,
        )
    ),
    sep='\n',
)

58248
41622
30701


In [22]:
# Number of distinct hospital admissions present in each output table; these
# should line up with the counts from the matching feature tables.
# dropna=False keeps the count identical to len(Series.unique()).
print(
    *(
        output_df['hospital_admission_id'].nunique(dropna=False)
        for output_df in (vitals_output, labs_output, medications_output)
    ),
    sep='\n',
)

58248
41622
30701


## Merge vitals, labs and medications collapsed features

In [45]:
# Identifier columns come from the vitals data dict; every collapsed table is
# additionally keyed by the boundaries of its time window.
id_cols = parse_id_cols(vitals_dd)
merge_keys = id_cols + ['window_start', 'window_end']

# Left-join labs and then medications onto vitals, so the vitals rows define
# which (stay, window) rows are kept.
dynamic_collapsed_feats_df = (
    dynamic_collapsed_vitals_df
    .merge(dynamic_collapsed_labs_df, on=merge_keys, how='left')
    .merge(dynamic_collapsed_medications_df, on=merge_keys, how='left')
)

# since the nan values are all unobserved, set to 0
# fillna replaces the previous `df[df.isna()] = 0.0` boolean-mask assignment:
# same result, but it avoids building a full-frame mask and does not trip the
# mixed-dtype in-place-setting error that some pandas versions raise.
dynamic_collapsed_feats_df = dynamic_collapsed_feats_df.fillna(0.0)

print(dynamic_collapsed_feats_df.shape)

(434898, 583)


## Merge demographics

In [47]:
# Read the demographics table from the site-level dataset directory.
demographics_df = pd.read_csv(
    filepath_or_buffer=os.path.join(DATASET_SITE_PATH, 'demographics_before_icu.csv.gz'),
)

In [52]:
# Attach the demographic columns to every feature row via the id columns.
# NOTE: this reassigns the same name, so re-running the cell merges the
# demographics a second time; re-run the feature merge cell first.
dynamic_collapsed_feats_df = dynamic_collapsed_feats_df.merge(
    demographics_df,
    how='left',
    on=id_cols,
)

In [53]:
# Display the merged feature table (collapsed vitals + labs + medications,
# with the demographics columns joined on).
dynamic_collapsed_feats_df

Unnamed: 0,patient_id,hospital_admission_id,facility_code,blood_glucose_concentration_std_0%_to_100%,bmi_std_0%_to_100%,body_temperature_std_0%_to_100%,diastolic_blood_pressure_std_0%_to_100%,heart_rate_std_0%_to_100%,height_std_0%_to_100%,o2_sat_std_0%_to_100%,...,nervous_system_medication_max_0%_to_100%,respiratory_medication_max_0%_to_100%,sensory_organ_medication_max_0%_to_100%,systemic_hormonal_medication_max_0%_to_100%,various_other_medication_max_0%_to_100%,birth_date,admission_timestamp,age_at_admission,gender_is_male,gender_is_unknown
0,24,29973369,2,0.0,0.0,0.350000,12.000000,5.000000,0.0,0.500000,...,0.0,0.0,0.0,0.0,0.0,1964-12-06,2023-06-11 16:51:00,58.550685,0.0,0.0
1,24,29973369,2,0.0,0.0,0.495580,11.542193,7.363574,0.0,1.707825,...,0.0,0.0,0.0,0.0,0.0,1964-12-06,2023-06-11 16:51:00,58.550685,0.0,0.0
2,24,29973369,2,0.0,0.0,0.401233,11.015141,8.393119,0.0,2.000000,...,0.0,0.0,0.0,0.0,0.0,1964-12-06,2023-06-11 16:51:00,58.550685,0.0,0.0
3,24,29973369,2,0.0,0.0,0.521632,11.757976,7.484706,0.0,2.094968,...,0.0,0.0,0.0,0.0,0.0,1964-12-06,2023-06-11 16:51:00,58.550685,0.0,0.0
4,24,29973369,2,0.0,0.0,0.448182,10.949094,7.246745,0.0,2.258770,...,0.0,0.0,0.0,0.0,0.0,1964-12-06,2023-06-11 16:51:00,58.550685,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
434893,2999997,23067798,2,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,1999-08-01,2027-01-18 07:32:00,27.484932,1.0,0.0
434894,2999997,23067798,2,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,1999-08-01,2027-01-18 07:32:00,27.484932,1.0,0.0
434895,2999997,23067798,2,0.0,0.0,0.000000,14.352700,19.131126,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,1999-08-01,2027-01-18 07:32:00,27.484932,1.0,0.0
434896,2999997,23067798,2,0.0,0.0,0.372022,10.853827,17.301252,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,1999-08-01,2027-01-18 07:32:00,27.484932,1.0,0.0


## Get the dynamic outputs of the merged labs, vitals and medication features

### Same as the vitals dynamic outputs because every stay contains at least one vital

In [46]:
# Use an independent (deep) copy of the vitals outputs as the outputs for the
# merged features, leaving `vitals_output` itself untouched.
dynamic_outputs_df = vitals_output.copy(deep=True)

In [6]:
# List the column names of the vitals output table.
vitals_output.columns

Index(['patient_id', 'hospital_admission_id', 'facility_code',
       'clinical_deterioration_outcome', 'window_start', 'window_end',
       'stay_length'],
      dtype='object')

'/home/prash/datasets/v20211018/HUF/split-by=patient_id/collapsed_features_dynamic_input_output/classifier_train_test_split'

In [12]:
# Directory holding the classifier train/test split, and the vitals-only
# feature / outcome CSVs inside it.
CLF_TRAIN_TEST_SPLIT_PATH = os.path.join(
    DATASET_COLLAPSED_FEAT_DYNAMIC_INPUT_OUTPUT_PATH,
    'classifier_train_test_split',
)
features_csv, outcomes_csv = (
    os.path.join(CLF_TRAIN_TEST_SPLIT_PATH, csv_name)
    for csv_name in (
        "dynamic_featuresCustomTimes_10_6_vitals_only.csv.gz",
        "dynamic_outcomesCustomTimes_10_6_vitals_only.csv.gz",
    )
)

In [13]:
# Load the vitals-only feature table from the train/test split directory.
features_df = pd.read_csv(filepath_or_buffer=features_csv)

In [28]:
# Spot-check one patient: show the id columns, window bounds and admission
# timestamp for every feature row of patient 11568.
features_df.loc[
    features_df['patient_id'].eq(11568),
    ['patient_id', 'hospital_admission_id', 'start', 'stop', 'admission_timestamp'],
]

Unnamed: 0,patient_id,hospital_admission_id,start,stop,admission_timestamp
1745,11568,13602459,-24.0,-23.0,2021-09-05 17:48:00
1746,11568,13602459,-24.0,-7.0,2021-09-05 17:48:00
1747,11568,13602459,-24.0,1.0,2021-09-05 17:48:00
1748,11568,13602459,-24.0,17.0,2021-09-05 17:48:00
1749,11568,13602459,-24.0,25.0,2021-09-05 17:48:00
1750,11568,13602459,-24.0,41.0,2021-09-05 17:48:00
1751,11568,13602459,-24.0,49.0,2021-09-05 17:48:00
1752,11568,13602459,-24.0,65.0,2021-09-05 17:48:00
1753,11568,13602459,-24.0,73.0,2021-09-05 17:48:00
1754,11568,13602459,-24.0,89.0,2021-09-05 17:48:00


In [29]:
# Load the vitals-only outcome table and spot-check the same patient (11568):
# show the id columns and stay length for each of their rows.
outcomes_df = pd.read_csv(filepath_or_buffer=outcomes_csv)
outcomes_df.loc[
    outcomes_df['patient_id'].eq(11568),
    ['patient_id', 'hospital_admission_id', 'stay_length'],
]

Unnamed: 0,patient_id,hospital_admission_id,stay_length
1745,11568,13602459,120.75
1746,11568,13602459,120.75
1747,11568,13602459,120.75
1748,11568,13602459,120.75
1749,11568,13602459,120.75
1750,11568,13602459,120.75
1751,11568,13602459,120.75
1752,11568,13602459,120.75
1753,11568,13602459,120.75
1754,11568,13602459,120.75


In [27]:
# Peek at ten distinct patient ids (positions 50-59 in order of first appearance).
outcomes_df['patient_id'].unique()[50:60]

array([11465, 11474, 11568, 11581, 11583, 11619, 11705, 11717, 11817,
       11823])