In [11]:
# Capture the kernel's current working directory and echo it as the cell output.
# NOTE(review): `os` is only imported by the imports cell further down, so on a
# fresh top-to-bottom run this cell raises NameError.
dir_path = os.getcwd()
dir_path


'/home/aflanders/CS598_DLH'

In [1]:
from __future__ import absolute_import
from __future__ import print_function

from typing import NamedTuple
import argparse
import os
import sys
import pandas as pd
from tqdm import tqdm

from mimic3benchmark.subject import read_stays, read_diagnoses, read_events, get_events_for_stay,\
    add_hours_elpased_to_events
from mimic3benchmark.subject import convert_events_to_timeseries, get_first_valid_from_timeseries
from mimic3benchmark.preprocessing import read_itemid_to_variable_map, map_itemids_to_variables, clean_events
from mimic3benchmark.preprocessing import assemble_episodic_data

class Args(NamedTuple):
    """Immutable, typed container for the run options used throughout the notebook.

    The fields mirror the options declared on the argparse parser.
    """
    # Directory containing the per-subject sub-directories.
    subjects_root_path: str
    # CSV containing the ITEMID-to-VARIABLE map.
    variable_map_file: str
    # CSV containing reference ranges for VARIABLEs.
    reference_range_file: str
    # When true, clinical notes are processed as well.
    notes: bool
    # CSV file with all MIMIC clinical notes.
    notes_csv_file: str

# Command-line interface carried over from the script form of this code. The
# parser is never run against sys.argv here (the parse call below is left
# commented out); it serves as the single home of the option defaults.
parser = argparse.ArgumentParser(description='Extract episodes from per-subject data.')
parser.add_argument('subjects_root_path', type=str, help='Directory containing subject sub-directories.')
# Both resource defaults are the repository-relative files this notebook loads.
# Previously the map default was '/resources/...', an absolute path anchored at
# the filesystem root, while the ranges default used '../resources/...'.
parser.add_argument('--variable_map_file', type=str,
                    default='mimic3benchmark/resources/itemid_to_variable_map.csv',
                    help='CSV containing ITEMID-to-VARIABLE map.')
parser.add_argument('--reference_range_file', type=str,
                    default='mimic3benchmark/resources/variable_ranges.csv',
                    help='CSV containing reference ranges for VARIABLEs.')
parser.add_argument('--notes', action='store_true', help='NOTES: Process notes')
parser.add_argument('--notes_csv_file', type=str,
                    help='CSV file with all mimic clinical notes')
#args, _ = parser.parse_known_args()

# Options for this run. Fields are passed by keyword so each value is visibly
# paired with its name, and the resource paths come from the parser defaults so
# they are defined in exactly one place.
args = Args(
    subjects_root_path='/mnt/data01/mimic-3/benchmark-small',
    variable_map_file=parser.get_default('variable_map_file'),
    reference_range_file=parser.get_default('reference_range_file'),
    notes=True,
    notes_csv_file='/mnt/data01/mimic-3/csv/NOTEEVENTS.csv',
)

# Load the ITEMID-to-VARIABLE map and collect the distinct values of its
# VARIABLE column.
var_map = read_itemid_to_variable_map(args.variable_map_file)
variables = var_map.VARIABLE.unique()


In [2]:
# Read the clinical-notes CSV only when note processing is switched on;
# CHARTTIME is converted to datetimes while the file is parsed.
if args.notes:
    all_notes = pd.read_csv(
        args.notes_csv_file,
        parse_dates=["CHARTTIME"],
        infer_datetime_format=True,
    )

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Show the dtype pandas settled on for each column of the notes table.
all_notes.dtypes

ROW_ID                  int64
SUBJECT_ID              int64
HADM_ID               float64
CHARTDATE              object
CHARTTIME      datetime64[ns]
STORETIME              object
CATEGORY               object
DESCRIPTION            object
CGID                  float64
ISERROR               float64
TEXT                   object
dtype: object

In [4]:
# Discard the notes flagged as erroneous (ISERROR == 1), then remove the flag
# column itself. Rebinding instead of mutating in place, together with the
# column guard, keeps the cell re-runnable: the in-place version raised
# AttributeError on a second execution because ISERROR was already gone.
if "ISERROR" in all_notes.columns:
    all_notes = all_notes[all_notes["ISERROR"] != 1].drop(columns="ISERROR")

In [44]:
# Summary statistics for every column, numeric and non-numeric alike.
all_notes.describe(include="all")

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,TEXT
count,2082294.0,2082294.0,1850465.0,2082294,1765728,1245518,2082294,2082294,1245518.0,2082294
unique,,,,38686,1448424,1223828,15,3840,,2022302
top,,,,2148-12-03,2153-10-31 01:04:00,2134-01-07 12:26:00,Nursing/other,Report,,ECG interpreted by ordering physician.\n[**Nam...
freq,,,,161,40,16,822497,1132519,,6832
first,,,,,2097-12-07 12:55:00,,,,,
last,,,,,2210-10-01 09:58:00,,,,,
mean,1041820.0,32415.94,149879.9,,,,,,17716.28,
std,601382.1,27550.77,28780.71,,,,,,2158.616,
min,1.0,2.0,100001.0,,,,,,14010.0,
25%,521046.2,11464.0,125195.0,,,,,,15803.0,


In [44]:
# Notes of subject 31558 that have a chart time, ordered by chart date and
# then by chart time.
rec_31558_01 = (
    all_notes
    .loc[all_notes["CHARTTIME"].notna() & (all_notes["SUBJECT_ID"] == 31558)]
    .sort_values(["CHARTDATE", "CHARTTIME"])
)

In [5]:
# for subject_dir in tqdm(os.listdir(args.subjects_root_path), desc='Iterating over subjects'):
#     dn = os.path.join(args.subjects_root_path, subject_dir)
#     try:
#         subject_id = int(subject_dir)
#         if not os.path.isdir(dn):
#             raise Exception
#     except:
#         continue
# Work on one fixed subject instead of walking every sub-directory of the
# subjects root.
subject_id = 305
subject_dir = "305"

# ICU stays recorded for the subject.
stays = read_stays(os.path.join(args.subjects_root_path, str(subject_id)))
# The subject's notes that carry a chart time, in chronological order.
notes = (
    all_notes
    .loc[all_notes["CHARTTIME"].notna() & (all_notes["SUBJECT_ID"] == subject_id)]
    .sort_values(["CHARTTIME"])
)



In [6]:
# Preview up to five of the subject's ICU stays.
stays.head(5)

Unnamed: 0,SUBJECT_ID,HADM_ID,ICUSTAY_ID,LAST_CAREUNIT,DBSOURCE,INTIME,OUTTIME,LOS,ADMITTIME,DISCHTIME,...,ETHNICITY,DIAGNOSIS,GENDER,DOB,DOD,INDATE,AGE,MORTALITY_INUNIT,MORTALITY,MORTALITY_INHOSPITAL
0,305,133059,232719,CCU,carevue,2125-04-27 18:47:43,2125-05-01 16:50:19,3.9185,2125-04-26 11:45:00,2125-05-03 18:12:00,...,WHITE,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,F,2052-10-24,2129-09-07,2125-04-27,72.553425,0,0,0
1,305,108015,297009,CSRU,carevue,2126-01-02 11:28:23,2126-01-07 10:07:29,4.9438,2125-12-31 18:40:00,2126-01-10 11:30:00,...,WHITE,NON-ST SEGMENT ELEVATION MYOCARDIAL INFARCTION...,F,2052-10-24,2129-09-07,2126-01-02,73.238356,0,0,0
2,305,194340,217232,SICU,metavision,2129-09-03 12:31:31,2129-09-05 23:00:50,2.437,2129-08-20 18:03:00,2129-09-07 17:55:00,...,WHITE,HEART FAILURE,F,2052-10-24,2129-09-07,2129-09-03,76.909589,0,1,1


In [8]:
# Preview the last five notes of the subject.
notes.tail(5)

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,TEXT
391656,388592,305,194340.0,2129-09-05,2129-09-05 17:08:00,2129-09-05 17:08:25,Nursing,Nursing Transfer Note,17222.0,76-year-old woman with complex vascular histor...
391578,388600,305,194340.0,2129-09-05,2129-09-05 17:08:00,2129-09-05 18:06:37,Nursing,Nursing Transfer Note,17222.0,76-year-old woman with complex vascular histor...
391528,388614,305,194340.0,2129-09-05,2129-09-05 17:08:00,2129-09-05 22:25:17,Nursing,Nursing Transfer Note,18861.0,76-year-old woman with complex vascular histor...
1090048,1100264,305,194340.0,2129-09-06,2129-09-06 07:49:00,,Radiology,CHEST (PORTABLE AP),,[**2129-9-6**] 7:49 AM\n CHEST (PORTABLE AP) ...
1077782,1100406,305,194340.0,2129-09-07,2129-09-07 07:48:00,,Radiology,CHEST (PORTABLE AP),,[**2129-9-7**] 7:48 AM\n CHEST (PORTABLE AP) ...


In [30]:
def get_icu_stay_from_dt_func(stays):
    """Build a lookup that maps a chart timestamp to the ICU stay containing it.

    Args:
        stays: Table with one row per ICU stay; its "INTIME", "OUTTIME" and
            "ICUSTAY_ID" columns are read.

    Returns:
        A function ``get_icu_stay(chartdate)`` returning, as an ``int``, the
        ICUSTAY_ID of the first row whose closed interval [INTIME, OUTTIME]
        contains ``chartdate``, or NaN when no row does.
    """
    def get_icu_stay(chartdate):
        # The columns are read on every call, so the lookup always reflects
        # the current contents of `stays`.
        windows = zip(stays["INTIME"], stays["OUTTIME"], stays["ICUSTAY_ID"])
        for intime, outtime, stay_id in windows:
            # Direct interval test. The former max()/min() overlap check let a
            # missing timestamp (NaT) through, because every comparison with
            # NaT is False, and so attributed such notes to the first stay.
            if intime <= chartdate <= outtime:
                return int(stay_id)
        # A plain float NaN keeps the function usable before numpy is imported.
        return float("nan")
    return get_icu_stay


In [31]:
# Spot-check the lookup on the chart time of the note stored under index
# label 391656.
get_icu_stay_from_dt_func(stays)(notes["CHARTTIME"][391656])

217232

In [32]:
import numpy as np
# Tag every note with the ICU stay whose time window contains its chart time;
# notes charted outside every stay come back as NaN. assign() returns a new
# frame instead of writing a column into a selection of all_notes.
notes = notes.assign(ICUSTAY_ID=notes['CHARTTIME'].apply(get_icu_stay_from_dt_func(stays)))
# Drop the unmatched notes *before* casting: NaN cannot be converted to an
# integer, so the former cast-then-filter order raised on any unmatched note.
notes = notes[notes['ICUSTAY_ID'].notna()]
notes = notes.astype({"ICUSTAY_ID": "int"})

In [33]:
# Preview the first five notes together with their new ICUSTAY_ID column.
notes.head(5)

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,TEXT,ICUSTAY_ID
1259602,1264491,305,133059.0,2125-04-27,2125-04-27 20:51:00,2125-04-27 21:03:00,Nursing/other,Report,17079.0,Pt transfered from [**Hospital Ward Name 57**]...,232719
1259603,1264492,305,133059.0,2125-04-27,2125-04-27 22:22:00,2125-04-27 22:28:00,Nursing/other,Report,17079.0,2130-2230\nPt complained of chest pain radiati...,232719
1259856,1264493,305,133059.0,2125-04-28,2125-04-28 04:41:00,2125-04-28 04:57:00,Nursing/other,Report,17219.0,72 YR. OLD WOMAN TRANSFERRED FROM OSH S/P MI. ...,232719
1259857,1264494,305,133059.0,2125-04-28,2125-04-28 06:57:00,2125-04-28 07:02:00,Nursing/other,Report,17219.0,ADDENDUM\n0600 C/O L. ARM PAIN RADIATING TO JA...,232719
1258610,1264495,305,133059.0,2125-04-28,2125-04-28 17:58:00,2125-04-28 18:26:00,Nursing/other,Report,19525.0,ccu nursing progress note\ns: my body is no go...,232719


In [19]:
def get_hours_func(intime):
    """Return a function measuring the hours elapsed since ``intime``.

    The returned ``get_hours(chartdate)`` yields NaN when ``chartdate`` lies
    before ``intime``; otherwise it yields the gap between the two, computed
    from the difference's ``days`` and ``seconds`` fields and expressed in
    hours.
    """
    def get_hours(chartdate):
        # Charted before the reference time: no elapsed time to report.
        if intime > chartdate:
            return np.nan
        elapsed = chartdate - intime
        whole_day_hours = elapsed.days * 24
        partial_day_hours = elapsed.seconds / 3600
        return max(0, whole_day_hours + partial_day_hours)
    return get_hours

In [34]:
import dateutil.parser
#print(notes["CHARTTIME"][1259602])
#print(datetime.fromisoformat('2125-04-28 00:51:00'))
# Sanity check for get_hours_func: parse a fixed ISO timestamp, print it next
# to the first stay's INTIME, and compute the hours between the two.
chartdate = dateutil.parser.isoparse('2125-04-29T01:47:43')
print(chartdate)
print(stays["INTIME"].iloc[0])
get_hours_func(stays["INTIME"].iloc[0])(chartdate)

2125-04-29 01:47:43
2125-04-27 18:47:43


31.0

In [35]:
# Hours elapsed between each note and the INTIME of the ICU stay the note was
# assigned to. The previous version subtracted the first stay's INTIME from
# every note, which is only right for notes belonging to that first stay.
intime_by_stay = stays.set_index("ICUSTAY_ID")["INTIME"]
notes = notes.assign(
    HOURS=(notes["CHARTTIME"] - notes["ICUSTAY_ID"].map(intime_by_stay))
    .apply(lambda s: s / np.timedelta64(1, 's')) / 60. / 60
)

In [36]:
# Preview the first five notes together with their new HOURS column.
notes.head(5)

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,TEXT,ICUSTAY_ID,HOURS
1259602,1264491,305,133059.0,2125-04-27,2125-04-27 20:51:00,2125-04-27 21:03:00,Nursing/other,Report,17079.0,Pt transfered from [**Hospital Ward Name 57**]...,232719,2.054722
1259603,1264492,305,133059.0,2125-04-27,2125-04-27 22:22:00,2125-04-27 22:28:00,Nursing/other,Report,17079.0,2130-2230\nPt complained of chest pain radiati...,232719,3.571389
1259856,1264493,305,133059.0,2125-04-28,2125-04-28 04:41:00,2125-04-28 04:57:00,Nursing/other,Report,17219.0,72 YR. OLD WOMAN TRANSFERRED FROM OSH S/P MI. ...,232719,9.888056
1259857,1264494,305,133059.0,2125-04-28,2125-04-28 06:57:00,2125-04-28 07:02:00,Nursing/other,Report,17219.0,ADDENDUM\n0600 C/O L. ARM PAIN RADIATING TO JA...,232719,12.154722
1258610,1264495,305,133059.0,2125-04-28,2125-04-28 17:58:00,2125-04-28 18:26:00,Nursing/other,Report,19525.0,ccu nursing progress note\ns: my body is no go...,232719,23.171389


In [45]:
# reading tables of this subject
stays = read_stays(os.path.join(args.subjects_root_path, subject_dir))
diagnoses = read_diagnoses(os.path.join(args.subjects_root_path, subject_dir))
events = read_events(os.path.join(args.subjects_root_path, subject_dir))

if args.notes:
    # Notes of this subject that carry a chart time, in chronological order.
    notes = all_notes[(all_notes["SUBJECT_ID"] == subject_id) & (~pd.isnull(all_notes["CHARTTIME"]))] \
                .sort_values(["CHARTTIME"])
    # assign() returns a new frame, so the stay ids are never written into a
    # selection taken from all_notes.
    notes = notes.assign(ICUSTAY_ID=notes['CHARTTIME'].apply(get_icu_stay_from_dt_func(stays)))
    # Keep only the notes charted during one of the subject's ICU stays.
    notes = notes[notes['ICUSTAY_ID'].notna()]

episodic_data = assemble_episodic_data(stays, diagnoses)

# cleaning and converting to time series
events = map_itemids_to_variables(events, var_map)
events = clean_events(events)
if events.shape[0] == 0:
    # no valid events for this subject
    # NOTE(review): only a message is printed; processing carries on with the
    # empty table below.
    print("no valid events for this subject")
timeseries = convert_events_to_timeseries(events, variables=variables)

# extracting separate episodes
for i in range(stays.shape[0]):
    stay_id = stays.ICUSTAY_ID.iloc[i]
    intime = stays.INTIME.iloc[i]
    outtime = stays.OUTTIME.iloc[i]

    episode = get_events_for_stay(timeseries, stay_id, intime, outtime)
    if episode.shape[0] == 0:
        # no data for this episode; every export below (notes included) is
        # skipped for this stay
        print(f"no data for episode: {i}")
        continue

    episode = add_hours_elpased_to_events(episode, intime).set_index('HOURS').sort_index(axis=0)
    if stay_id in episodic_data.index:
        episodic_data.loc[stay_id, 'Weight'] = get_first_valid_from_timeseries(episode, 'Weight')
        episodic_data.loc[stay_id, 'Height'] = get_first_valid_from_timeseries(episode, 'Height')
    # Episode files are numbered from 1, in the row order of `stays`.
    episodic_data.loc[episodic_data.index == stay_id].to_csv(os.path.join(args.subjects_root_path, subject_dir,
                                                                            'episode{}.csv'.format(i+1)),
                                                                index_label='Icustay')
    # Columns are written in alphabetical order; a column literally named
    # "Hours" would sort first thanks to its empty sort key.
    columns = list(episode.columns)
    columns_sorted = sorted(columns, key=(lambda x: "" if x == "Hours" else x))
    episode = episode[columns_sorted]
    episode.to_csv(os.path.join(args.subjects_root_path, subject_dir, 'episode{}_timeseries.csv'.format(i+1)),
                    index_label='Hours')

    # The notes export is guarded: `notes` is only (re)built above when
    # args.notes is set, so running it unconditionally would either fail with
    # NameError or silently reuse a stale frame from an earlier cell.
    if args.notes:
        event_notes = notes[(notes["ICUSTAY_ID"] == stay_id)].copy()
        # Hours between this stay's INTIME and each note's chart time.
        event_notes['HOURS'] = (event_notes.CHARTTIME - intime).apply(lambda s: s / np.timedelta64(1, 's')) / 60./60
        event_notes = event_notes[["HOURS", "CATEGORY", "DESCRIPTION", "TEXT"]].set_index('HOURS').sort_index(axis=0)
        event_notes.to_csv(os.path.join(args.subjects_root_path, subject_dir, 'episode{}_notes.csv'.format(i+1)),
                        index_label='Hours')

no data for episode: 0
no data for episode: 1


In [4]:
# NOTE(review): a relative import cannot work from a notebook cell, which runs
# as `__main__` outside of any package (hence the ModuleNotFoundError recorded
# below). Switch to the absolute package path -- TODO confirm that path.
from .note_processing.heuristic_tokenize import sent_tokenize_rules

ModuleNotFoundError: No module named '__main__.note_processing'; '__main__' is not a package