In [1]:
 import cv2
import numpy as np
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import os

  from .autonotebook import tqdm as notebook_tqdm


In [40]:
import dask.dataframe as dd
import datetime as dt

In [38]:
from tqdm import tqdm

### Hospital

In [46]:
core_mimiciv_path = '/nfs/turbo/med-kayvan-lab/Projects/HeartFailure/Data/Raw/physionet.org/files/mimiciv/2.2/'

In [14]:
df_admissions = dd.read_csv(core_mimiciv_path + 'hosp/admissions.csv', assume_missing=True, dtype={'admission_location': 'object','deathtime': 'object','edouttime': 'object','edregtime': 'object'})
df_patients = dd.read_csv(core_mimiciv_path + 'hosp/patients.csv', assume_missing=True, dtype={'dod': 'object'})  
df_transfers = dd.read_csv(core_mimiciv_path + 'hosp/transfers.csv', assume_missing=True, dtype={'careunit': 'object'})

In [15]:
df_d_labitems = dd.read_csv(core_mimiciv_path + 'hosp/d_labitems.csv', assume_missing=True, dtype={'loinc_code': 'object'})
df_d_icd_procedures = dd.read_csv(core_mimiciv_path + 'hosp/d_icd_procedures.csv', assume_missing=True, dtype={'icd_code': 'object', 'icd_version': 'object'})
df_d_icd_diagnoses = dd.read_csv(core_mimiciv_path + 'hosp/d_icd_diagnoses.csv', assume_missing=True, dtype={'icd_code': 'object', 'icd_version': 'object'})
df_d_hcpcs = dd.read_csv(core_mimiciv_path + 'hosp/d_hcpcs.csv', assume_missing=True, dtype={'category': 'object'})
df_diagnoses_icd = dd.read_csv(core_mimiciv_path + 'hosp/diagnoses_icd.csv', assume_missing=True, dtype={'icd_code': 'object', 'icd_version': 'object'})
df_drgcodes = dd.read_csv(core_mimiciv_path + 'hosp/drgcodes.csv', assume_missing=True)
df_emar = dd.read_csv(core_mimiciv_path + 'hosp/emar.csv', assume_missing=True)
df_emar_detail = dd.read_csv(core_mimiciv_path + 'hosp/emar_detail.csv', assume_missing=True, low_memory=False, dtype={'completion_interval': 'object','dose_due': 'object','dose_given': 'object','infusion_complete': 'object','infusion_rate_adjustment': 'object','infusion_rate_unit': 'object','new_iv_bag_hung': 'object','product_description_other': 'object','reason_for_no_barcode': 'object','restart_interval': 'object','route': 'object','side': 'object','site': 'object','continued_infusion_in_other_location': 'object','infusion_rate': 'object','non_formulary_visual_verification': 'object','prior_infusion_rate': 'object','product_amount_given': 'object', 'infusion_rate_adjustment_amount': 'object'})
df_hcpcsevents = dd.read_csv(core_mimiciv_path + 'hosp/hcpcsevents.csv', assume_missing=True, dtype={'hcpcs_cd': 'object'})
df_labevents = dd.read_csv(core_mimiciv_path + 'hosp/labevents.csv', assume_missing=True, dtype={'storetime': 'object', 'value': 'object', 'valueuom': 'object', 'flag': 'object', 'priority': 'object', 'comments': 'object'})
df_microbiologyevents = dd.read_csv(core_mimiciv_path + 'hosp/microbiologyevents.csv', assume_missing=True, dtype={'comments': 'object', 'quantity': 'object'})
df_poe = dd.read_csv(core_mimiciv_path + 'hosp/poe.csv', assume_missing=True, dtype={'discontinue_of_poe_id': 'object','discontinued_by_poe_id': 'object','order_status': 'object'})
df_poe_detail = dd.read_csv(core_mimiciv_path + 'hosp/poe_detail.csv', assume_missing=True)
df_prescriptions = dd.read_csv(core_mimiciv_path + 'hosp/prescriptions.csv', assume_missing=True, dtype={'form_rx': 'object','gsn': 'object'})
df_procedures_icd = dd.read_csv(core_mimiciv_path + 'hosp/procedures_icd.csv', assume_missing=True, dtype={'icd_code': 'object', 'icd_version': 'object'})
df_services = dd.read_csv(core_mimiciv_path + 'hosp/services.csv', assume_missing=True, dtype={'prev_service': 'object'})

### ICU

In [18]:
## ICU
df_d_items = dd.read_csv(core_mimiciv_path + 'icu/d_items.csv', assume_missing=True)
df_procedureevents = dd.read_csv(core_mimiciv_path + 'icu/procedureevents.csv', assume_missing=True, dtype={'value': 'object', 'secondaryordercategoryname': 'object', 'totalamountuom': 'object'})
df_outputevents = dd.read_csv(core_mimiciv_path + 'icu/outputevents.csv', assume_missing=True, dtype={'value': 'object'})
df_inputevents = dd.read_csv(core_mimiciv_path + 'icu/inputevents.csv', assume_missing=True, dtype={'value': 'object', 'secondaryordercategoryname': 'object', 'totalamountuom': 'object'})
df_icustays = dd.read_csv(core_mimiciv_path + 'icu/icustays.csv', assume_missing=True)
df_datetimeevents = dd.read_csv(core_mimiciv_path + 'icu/datetimeevents.csv', assume_missing=True, dtype={'value': 'object'})
df_chartevents = dd.read_csv(core_mimiciv_path + 'icu/chartevents.csv', assume_missing=True, low_memory=False, dtype={'value': 'object', 'valueuom': 'object'})

## CXR

In [2]:
# data_path = 'data_clean'
# img_path='./data/mimic-cxr-jpg/'

In [48]:
core_mimiciv_imgcxr_path = '/nfs/turbo/med-kayvan-lab/Projects/HeartFailure/Data/Raw/physionet.org/files/'

In [49]:
df_mimic_cxr_split = dd.read_csv(core_mimiciv_imgcxr_path + 'mimic-cxr-jpg/2.0.0/mimic-cxr-2.0.0-split.csv', assume_missing=True)
df_mimic_cxr_chexpert = dd.read_csv(core_mimiciv_imgcxr_path + 'mimic-cxr-jpg/2.0.0/mimic-cxr-2.0.0-chexpert.csv', assume_missing=True)

In [34]:
try:
    df_mimic_cxr_metadata = dd.read_csv(core_mimiciv_path + 'mimic-cxr-jpg/2.0.0/mimic-cxr-2.0.0-metadata.csv', assume_missing=True, dtype={'dicom_id': 'object'}, blocksize=None)
except:
    df_mimic_cxr_metadata = pd.read_csv(core_mimiciv_path + 'mimic-cxr-jpg/2.0.0/mimic-cxr-2.0.0-metadata.csv', dtype={'dicom_id': 'object'})
    df_mimic_cxr_metadata = dd.from_pandas(df_mimic_cxr_metadata, npartitions=7)
df_mimic_cxr_negbio = dd.read_csv(core_mimiciv_path + 'mimic-cxr-jpg/2.0.0/mimic-cxr-2.0.0-negbio.csv', assume_missing=True)

### Fix data type issues to allow for merging

In [24]:
## CORE
df_admissions['admittime'] = dd.to_datetime(df_admissions['admittime'])
df_admissions['dischtime'] = dd.to_datetime(df_admissions['dischtime'])
df_admissions['deathtime'] = dd.to_datetime(df_admissions['deathtime'])
df_admissions['edregtime'] = dd.to_datetime(df_admissions['edregtime'])
df_admissions['edouttime'] = dd.to_datetime(df_admissions['edouttime'])

df_transfers['intime'] = dd.to_datetime(df_transfers['intime'])
df_transfers['outtime'] = dd.to_datetime(df_transfers['outtime'])


## HOSP
df_diagnoses_icd.icd_code = df_diagnoses_icd.icd_code.str.strip()
df_diagnoses_icd.icd_version = df_diagnoses_icd.icd_version.str.strip()
df_d_icd_diagnoses.icd_code = df_d_icd_diagnoses.icd_code.str.strip()
df_d_icd_diagnoses.icd_version = df_d_icd_diagnoses.icd_version.str.strip()

df_procedures_icd.icd_code = df_procedures_icd.icd_code.str.strip()
df_procedures_icd.icd_version = df_procedures_icd.icd_version.str.strip()
df_d_icd_procedures.icd_code = df_d_icd_procedures.icd_code.str.strip()
df_d_icd_procedures.icd_version = df_d_icd_procedures.icd_version.str.strip()

df_hcpcsevents.hcpcs_cd = df_hcpcsevents.hcpcs_cd.str.strip()
df_d_hcpcs.code = df_d_hcpcs.code.str.strip()

df_prescriptions['starttime'] = dd.to_datetime(df_prescriptions['starttime'])
df_prescriptions['stoptime'] = dd.to_datetime(df_prescriptions['stoptime'])

df_emar['charttime'] = dd.to_datetime(df_emar['charttime'])
df_emar['scheduletime'] = dd.to_datetime(df_emar['scheduletime'])
df_emar['storetime'] = dd.to_datetime(df_emar['storetime'])

df_labevents['charttime'] = dd.to_datetime(df_labevents['charttime'])
df_labevents['storetime'] = dd.to_datetime(df_labevents['storetime'])

df_microbiologyevents['chartdate'] = dd.to_datetime(df_microbiologyevents['chartdate'])
df_microbiologyevents['charttime'] = dd.to_datetime(df_microbiologyevents['charttime'])
df_microbiologyevents['storedate'] = dd.to_datetime(df_microbiologyevents['storedate'])
df_microbiologyevents['storetime'] = dd.to_datetime(df_microbiologyevents['storetime'])

df_poe['ordertime'] = dd.to_datetime(df_poe['ordertime'])
df_services['transfertime'] = dd.to_datetime(df_services['transfertime'])


## ICU
df_procedureevents['starttime'] = dd.to_datetime(df_procedureevents['starttime'])
df_procedureevents['endtime'] = dd.to_datetime(df_procedureevents['endtime'])
df_procedureevents['storetime'] = dd.to_datetime(df_procedureevents['storetime'])
# df_procedureevents['comments_date'] = dd.to_datetime(df_procedureevents['comments_date'])

df_outputevents['charttime'] = dd.to_datetime(df_outputevents['charttime'])
df_outputevents['storetime'] = dd.to_datetime(df_outputevents['storetime'])

df_inputevents['starttime'] = dd.to_datetime(df_inputevents['starttime'])
df_inputevents['endtime'] = dd.to_datetime(df_inputevents['endtime'])
df_inputevents['storetime'] = dd.to_datetime(df_inputevents['storetime'])

df_icustays['intime'] = dd.to_datetime(df_icustays['intime'])
df_icustays['outtime'] = dd.to_datetime(df_icustays['outtime'])

df_datetimeevents['charttime'] = dd.to_datetime(df_datetimeevents['charttime'])
df_datetimeevents['storetime'] = dd.to_datetime(df_datetimeevents['storetime'])

df_chartevents['charttime'] = dd.to_datetime(df_chartevents['charttime'])
df_chartevents['storetime'] = dd.to_datetime(df_chartevents['storetime'])

In [30]:
# BUILD DATAFRAME OF IMAGES AND NOTES FOR MIMIC-IV CXR
def build_mimic_cxr_jpg_dataframe(core_mimiciv_imgcxr_path, do_save=False):
    # Inputs:
    #   core_mimiciv_imgcxr_path -> Directory of CXR images and image notes
    #   do_save -> Flag to save dataframe
    #
    # Outputs:
    #   df_mimic_cxr_jpg -> CXR images and image notes Dataframe
    df_mimic_cxr_jpg = pd.DataFrame()
    mimic_cxr_jpg_dir = core_mimiciv_imgcxr_path
    
    #Figure out how many files we will read
    file_count = 0
    for subdir, dirs, files in os.walk(mimic_cxr_jpg_dir):
        for file in files:
            # Extract filename and extension to filter by CSV only
            filename, extension = os.path.splitext(file)
            if extension=='.txt':
                file_count = file_count + 1
                
    #Setup progress bar
    pbar = tqdm(total=file_count)
    
    #Iterate
    for subdir, dirs, files in os.walk(mimic_cxr_jpg_dir):
        for file in files:
            # Extract filename and extension to filter by CSV only
            filename, extension = os.path.splitext(file)
            if extension=='.txt':
                note = open(subdir + '/' + filename + extension, "r", errors='ignore')
                img_note_text = note.read()
                note.close()
                img_folder = subdir + '/' + filename
                
                for img_subdir, img_dirs, img_files in os.walk(img_folder):
                    for img_file in img_files:
                        # Extract filename and extension to filter by CSV only
                        img_filename, img_extension = os.path.splitext(img_file)
                        if img_extension=='.jpg':
                            df_mimic_cxr_jpg = df_mimic_cxr_jpg.append({'Note_folder': subdir.replace(core_mimiciv_imgcxr_path,''), 'Note_file': filename + extension , 'Note': img_note_text, 'Img_Folder': img_folder.replace(core_mimiciv_imgcxr_path,''), 'Img_Filename': img_filename + img_extension, 'dicom_id': img_filename}, ignore_index=True)
                            
        #Update progress bar
        pbar.update(1)
        
    #Save
    if do_save:
        try:
            df_mimic_cxr_jpg.to_csv(core_mimiciv_imgcxr_path + '/mimic-cxr-2.0.0-jpeg-txt.csv')
        except Exception as e:
            print(e)
        
    return df_mimic_cxr_jpg

In [50]:
## CXR
df_mimic_cxr_jpg = build_mimic_cxr_jpg_dataframe(core_mimiciv_imgcxr_path, do_save=True)
if (not 'cxrtime' in df_mimic_cxr_metadata.columns) or (not 'Img_Filename' in df_mimic_cxr_metadata.columns):
    # Create CXRTime variable if it does not exist already
    print("Processing CXRtime stamps")
    df_cxr = df_mimic_cxr_metadata.compute()
    df_cxr['StudyDateForm'] = pd.to_datetime(df_cxr['StudyDate'], format='%Y%m%d')
    df_cxr['StudyTimeForm'] = df_cxr.apply(lambda x : '%#010.3f' % x['StudyTime'] ,1)
    df_cxr['StudyTimeForm'] = pd.to_datetime(df_cxr['StudyTimeForm'], format='%H%M%S.%f').dt.time
    df_cxr['cxrtime'] = df_cxr.apply(lambda r : dt.datetime.combine(r['StudyDateForm'],r['StudyTimeForm']),1)

    # # Add paths and info to images in cxr
    # df_mimic_cxr_jpg = pd.read_csv(core_mimiciv_imgcxr_path + 'files/mimic-cxr-2.0.0-jpeg-txt.csv')
    df_cxr = pd.merge(df_mimic_cxr_jpg, df_cxr, on='dicom_id')
    
    # Save
    df_cxr.to_csv(core_mimiciv_path + 'mimic-cxr-jpg/2.0.0/mimic-cxr-2.0.0-metadata.csv', index=False)
    #Read back the dataframe
    try:
        df_mimic_cxr_metadata = dd.read_csv(core_mimiciv_path + 'mimic-cxr-jpg/2.0.0/mimic-cxr-2.0.0-metadata.csv', assume_missing=True, dtype={'dicom_id': 'object', 'Note': 'object'}, blocksize=None)
    except:
        df_mimic_cxr_metadata = pd.read_csv(core_mimiciv_path + 'mimic-cxr-jpg/2.0.0/mimic-cxr-2.0.0-metadata.csv', dtype={'dicom_id': 'object', 'Note': 'object'})
        df_mimic_cxr_metadata = dd.from_pandas(df_mimic_cxr_metadata, npartitions=7)
df_mimic_cxr_metadata['cxrtime'] = dd.to_datetime(df_mimic_cxr_metadata['cxrtime'])

KeyboardInterrupt: 

In [11]:
print('PROCESSING "CXR" DB...')
df_mimic_cxr_split = df_mimic_cxr_split.compute().sort_values(by=['subject_id'])
df_mimic_cxr_chexpert = df_mimic_cxr_chexpert.compute().sort_values(by=['subject_id'])
df_mimic_cxr_metadata = df_mimic_cxr_metadata.compute().sort_values(by=['subject_id'])
df_mimic_cxr_negbio = df_mimic_cxr_negbio.compute().sort_values(by=['subject_id'])

PROCESSING "CXR" DB...


In [12]:
## CXR
print('- CXR > df_mimic_cxr_split')
print('--------------------------------')
print(df_mimic_cxr_split.dtypes)
print('\n\n')

print('- CXR > df_mimic_cxr_chexpert')
print('--------------------------------')
print(df_mimic_cxr_chexpert.dtypes)
print('\n\n')

print('- CXR > df_mimic_cxr_metadata')
print('--------------------------------')
print(df_mimic_cxr_metadata.dtypes)
print('\n\n')

print('- CXR > df_mimic_cxr_negbio')
print('--------------------------------')
print(df_mimic_cxr_negbio.dtypes)
print('\n\n')

- CXR > df_mimic_cxr_split
--------------------------------
dicom_id       object
study_id      float64
subject_id    float64
split          object
dtype: object



- CXR > df_mimic_cxr_chexpert
--------------------------------
subject_id                    float64
study_id                      float64
Atelectasis                   float64
Cardiomegaly                  float64
Consolidation                 float64
Edema                         float64
Enlarged Cardiomediastinum    float64
Fracture                      float64
Lung Lesion                   float64
Lung Opacity                  float64
No Finding                    float64
Pleural Effusion              float64
Pleural Other                 float64
Pneumonia                     float64
Pneumothorax                  float64
Support Devices               float64
dtype: object



- CXR > df_mimic_cxr_metadata
--------------------------------
dicom_id                                       object
subject_id                    

In [13]:
# Get Unique Subjects with Chest Xrays
df_cxr_ids = pd.concat([pd.DataFrame(), df_mimic_cxr_chexpert[['subject_id']]], sort=True).drop_duplicates()

In [15]:
print('Unique Subjects with Chest Xrays Available: ' + str(len(df_cxr_ids)))df_mimic_cxr_negbio

Unique Subjects with Chest Xrays Available: 65379


In [23]:
df_cxr = df_mimic_cxr_chexpert.copy()
df_cxr = df_cxr.merge(df_mimic_cxr_metadata, how='left')
df_cxr = df_cxr.merge(df_mimic_cxr_negbio, how='left')

In [25]:
df_cxr.columns

Index(['subject_id', 'study_id', 'Atelectasis', 'Cardiomegaly',
       'Consolidation', 'Edema', 'Enlarged Cardiomediastinum', 'Fracture',
       'Lung Lesion', 'Lung Opacity', 'No Finding', 'Pleural Effusion',
       'Pleural Other', 'Pneumonia', 'Pneumothorax', 'Support Devices',
       'dicom_id', 'PerformedProcedureStepDescription', 'ViewPosition', 'Rows',
       'Columns', 'StudyDate', 'StudyTime',
       'ProcedureCodeSequence_CodeMeaning', 'ViewCodeSequence_CodeMeaning',
       'PatientOrientationCodeSequence_CodeMeaning'],
      dtype='object')

In [26]:
df_cxr = df_cxr.drop_duplicates()
df_cxr.to_csv('full_cxr.csv', index=False)

In [27]:
df_cxr['study_id'] = df_cxr['study_id'].astype('int64')

In [None]:
img_path = os.path.join(self.img_path, img_folder[1:], img_file)
img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
img = cv2.resize(img, (self.img_shape[0], self.img_shape[1]))

In [30]:
df_cxr.head(2)

Unnamed: 0,subject_id,study_id,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,...,dicom_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning
0,10000032.0,50414267,,,,,,,,,...,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,CHEST (PA AND LAT),PA,3056.0,2544.0,21800506.0,213014.531,CHEST (PA AND LAT),postero-anterior,Erect
1,10000032.0,50414267,,,,,,,,,...,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,CHEST (PA AND LAT),LATERAL,3056.0,2544.0,21800506.0,213014.531,CHEST (PA AND LAT),lateral,Erect


In [None]:
img_meta = self.cxr[(self.cxr['subject_id'] == subject_id) & (self.cxr['hadm_id'] == hadm_id)]