# In [None]:
## Helper functions for preprocessing text in the data file
# This code is taken from https://github.com/kaggarwal/ClinicalNotesICU

# Headings that only count as a section title when followed by a colon.
# Order matters: it is the order in which the alternatives are tried.
_COLON_TERMINATED_TITLES = (
    'ABDOMEN AND PELVIS',
    'CLINICAL HISTORY',
    'CLINICAL INDICATION',
    'COMPARISON',
    'COMPARISON STUDY DATE',
    'EXAM',
    'EXAMINATION',
    'FINDINGS',
    'HISTORY',
    'IMPRESSION',
    'INDICATION',
    'MEDICAL CONDITION',
    'PROCEDURE',
    'REASON FOR EXAM',
    'REASON FOR STUDY',
    'REASON FOR THIS EXAMINATION',
    'TECHNIQUE',
)

# Matches "<TITLE>:" for any title above, or the bare phrase "FINAL REPORT"
# (no colon required), case-insensitively.
SECTION_TITLES = re.compile(
    '(' + '|'.join(_COLON_TERMINATED_TITLES) + '):|FINAL REPORT',
    flags=re.IGNORECASE | re.MULTILINE,
)


def getSentences(t):
    """Return every sentence produced by ``preprocess_mimic(t)`` as a list."""
    return [*preprocess_mimic(t)]

def pattern_repl(matchobj):
    """
    Build the blank string that replaces a regex match.

    The result has as many spaces as the matched text has characters, so
    substituting it keeps every later character at its original offset.
    An empty match still yields a single space.
    """
    matched = matchobj.group(0)
    return ' ' * max(len(matched), 1)

def clean_text(text):
    """
    Blank out de-identification placeholders, underscores and the report
    trailer while keeping the text length unchanged.

    Everything from the offset returned by ``find_end`` onwards is replaced
    by spaces, so character positions in the result line up with the input.
    """
    # Mask [**...**] placeholders with equally long runs of spaces.
    masked = re.sub(r'\[\*\*.*?\*\*\]', pattern_repl, text)
    # Underscores become single spaces.
    masked = masked.replace('_', ' ')

    cut = find_end(masked)
    # Pad the truncated report back to the original length.
    return masked[:cut] + ' ' * (len(masked) - cut)

def preprocess_mimic(text):
    """
    Lazily turn one MIMIC-III report into lowercase, tokenized sentences.

    Steps:
    1. ``clean_text`` masks [**Patterns**] and the signature trailer
    2. ``split_heading`` cuts the report into sections
    3. each section is split into sentences, each sentence into words
    4. the words are re-joined with single spaces and lowercased

    Yields one string per sentence.
    """
    sections = split_heading(clean_text(text))
    for section in sections:
        for sentence in sent_tokenize(section):
            tokens = word_tokenize(sentence)
            yield ' '.join(tokens).lower()

def split_heading(text):
    """Yield the report piece by piece: body text and section titles.

    For every ``SECTION_TITLES`` match, the text preceding the title is
    yielded first, then the title itself. Whatever follows the last title
    is yielded at the end. Pieces are stripped, and pieces that are empty
    after stripping are skipped.
    """
    cursor = 0
    for heading in SECTION_TITLES.finditer(text):
        title_start, title_end = heading.span()
        # First the body accumulated since the previous title, then the title.
        for lo, hi in ((cursor, title_start), (title_start, title_end)):
            chunk = text[lo:hi].strip()
            if chunk:
                yield chunk
        cursor = title_end

    # Trailing body after the final title (or the whole text if none matched).
    tail = text[cursor:].strip()
    if tail:
        yield tail

# Markers that introduce the signature / transcription trailer of a report.
# Compiled once at import time instead of on every call.
_REPORT_END_PATTERNS = (
    re.compile(r'BY ELECTRONICALLY SIGNING THIS REPORT', re.I),
    # The dot is escaped so only a literal "DR." matches; unescaped it also
    # matched indented lines such as "    Drainage ..." and cut the report.
    re.compile(r'\n {3,}DR\.', re.I),
    re.compile(r'[ ]{1,}RADLINE ', re.I),
    # Leading ".*" moves the match start to the beginning of that line.
    re.compile(r'.*electronically signed on', re.I),
    re.compile(r'M\[0KM\[0KM'),
)


def find_end(text):
    """Find the end of the report.

    Returns the smallest start offset among all end-of-report markers found
    in ``text``, or ``len(text)`` when none of them occurs.
    """
    ends = [len(text)]
    for pattern in _REPORT_END_PATTERNS:
        matchobj = pattern.search(text)
        if matchobj:
            ends.append(matchobj.start())
    return min(ends)

# In [None]:
# Locations of the input data.
# raw_data_dir: directory from which ADMISSIONS.csv, NOTEEVENTS.csv and
#   ICUSTAYS.csv are read, and into which the intermediate pickles are written.
#   (presumably a Google Drive mount inside Colab -- confirm before running elsewhere)
raw_data_dir = '/content/drive/MyDrive/CS598_Project/data'
# MIMIC_EXTRACT_DATA: HDF5 file whose 'patients' table is read by calculate_stats.
MIMIC_EXTRACT_DATA = '/content/drive/MyDrive/CS598_Project/data/all_hourly_data.h5'

def load_raw_data(raw_data_dir):
  """Read the three raw CSV tables stored in ``raw_data_dir``.

  Returns a list ``[admissions, note_events, icu_stays]`` of DataFrames read
  from ADMISSIONS.csv, NOTEEVENTS.csv and ICUSTAYS.csv, in that order.
  """
  admissions_path = os.path.join(raw_data_dir, 'ADMISSIONS.csv')
  note_events_path = os.path.join(raw_data_dir, 'NOTEEVENTS.csv')
  icu_stays_path = os.path.join(raw_data_dir, 'ICUSTAYS.csv')

  admissions = pd.read_csv(admissions_path)
  # low_memory=False: parse the whole notes file in one pass.
  note_events = pd.read_csv(note_events_path, low_memory=False)
  icu_stays = pd.read_csv(icu_stays_path)

  return [admissions, note_events, icu_stays]

# Load the three raw tables when this cell runs; the resulting list is what
# calculate_stats and process_data are called with further down.
raw_data = load_raw_data(raw_data_dir)

# calculate statistics
def calculate_stats(raw_data, time_limit_days=1):
  """Print dataset statistics and select the early clinical notes.

  Args:
    raw_data: sequence ``[admissions, note_events, icu_stays]`` of DataFrames,
      as returned by ``load_raw_data``.
    time_limit_days: keep only notes charted less than this many days after
      the ICU ``intime``. Defaults to 1 (the original fixed limit).

  Returns:
    DataFrame of notes (all categories except 'Discharge summary', with a
    CHARTTIME) joined to the MIMIC-Extract 'patients' table on SUBJECT_ID and
    restricted to the time limit.

  Side effects:
    Prints several counts, reads ``MIMIC_EXTRACT_DATA`` and writes the result
    to ``sub_notes.p`` inside the module-level ``raw_data_dir``.
  """
  admission_data = raw_data[0]
  note_events_data = raw_data[1]
  icu_stay_data = raw_data[2]
  print("There are total ", len(admission_data), " Admission Data")
  print("There are total ", len(icu_stay_data), " ICU Stay Data")

  ## Select Clinical Notes: every category except discharge summaries.
  # Rows without a category are excluded as well, matching the previous
  # groupby-based selection (groupby drops NaN keys).
  category = note_events_data.CATEGORY
  keep_category = category.notna() & (category != 'Discharge summary')
  sub_notes = note_events_data[keep_category]

  ## - Handle missing chart
  # Vectorised NaN test instead of a Python loop over every note; filtering
  # with a mask also avoids an in-place drop on a slice of the input frame.
  missing_charttime = sub_notes.CHARTTIME.isna()
  print("{} of notes missing charttime.".format(int(missing_charttime.sum())))
  sub_notes = sub_notes[~missing_charttime]
  print("sub_notes shape: ", sub_notes.shape)

  ## - Select based on Time Limit
  statistic = pd.read_hdf(MIMIC_EXTRACT_DATA, 'patients')
  print("MIMIC-EXTRACT DATA (Patients Num & Hospital Admission & ICU Admission): ", len(statistic))
  # The ids live in the index of the 'patients' table; expose them as columns
  # named like the NOTEEVENTS columns so the two frames can be merged.
  new_stats = statistic.reset_index().rename(
      columns = {"subject_id": "SUBJECT_ID", "hadm_id": "HADM_ID"})
  print("new_stats shape: ", new_stats.shape, "\nsub_notes shape: ", sub_notes.shape)

  # The join key is SUBJECT_ID only. Both sides carry HADM_ID, so the result
  # holds HADM_ID_x (from the note) and HADM_ID_y (from the patients table);
  # process_data selects 'HADM_ID_y', so the key must stay as it is.
  df_adm_notes = pd.merge(sub_notes[['ROW_ID','SUBJECT_ID','HADM_ID','CHARTTIME', 'CATEGORY', 'TEXT']],
                          new_stats[['SUBJECT_ID','HADM_ID','icustay_id','age','admittime','dischtime', 'deathtime', 'intime', 'outtime', 'los_icu', 'mort_icu', 'mort_hosp', 'hospital_expire_flag', 'hospstay_seq', 'max_hours']],
                          on = ['SUBJECT_ID'],
                          how = 'left')
  df_adm_notes['CHARTTIME'] = pd.to_datetime(df_adm_notes['CHARTTIME'])

  # Days elapsed between ICU admission and the note. Notes without a matching
  # patient have a missing intime, compare False and are therefore dropped.
  # NOTE(review): notes charted before intime (negative offset) pass this
  # filter too -- confirm that is intended.
  days_since_intime = (df_adm_notes['CHARTTIME'] - df_adm_notes['intime']).dt.total_seconds() / (24*60*60)
  df_less_n = df_adm_notes[days_since_intime < time_limit_days]
  print("df_less_n.shape: ", df_less_n.shape)
  pd.to_pickle(df_less_n, os.path.join(raw_data_dir, 'sub_notes.p'))
  return df_less_n

# Runs the note selection when this cell executes: prints the statistics,
# writes sub_notes.p and keeps the filtered frame in `state`.
state = calculate_stats(raw_data)

## Helper function to process text

# process raw data
def process_data(raw_data):
  """Tokenize the selected clinical notes and persist the result.

  Reads ``sub_notes.p`` from the module-level ``raw_data_dir`` (the file
  written by ``calculate_stats``), drops rows lacking SUBJECT_ID, CHARTTIME or
  TEXT, and adds a ``preprocessed_text`` column holding the list of sentences
  returned by ``getSentences`` for each note.

  Args:
    raw_data: unused; kept so existing callers keep working.

  Returns:
    DataFrame with columns SUBJECT_ID, HADM_ID_y, CHARTTIME, TEXT and
    preprocessed_text. The same frame is pickled to ``preprocessed_notes.p``
    in ``raw_data_dir``.
  """
  # The pickle is produced by this pipeline itself; do not point this at
  # files from an untrusted source (unpickling can execute arbitrary code).
  clinical_notes = pd.read_pickle(os.path.join(raw_data_dir, 'sub_notes.p'))

  eliminate_notes = clinical_notes.dropna(subset = ['SUBJECT_ID', 'CHARTTIME', 'TEXT'])

  # HADM_ID_y is the admission id contributed by the patients table in the
  # merge performed by calculate_stats.
  # .copy() gives an independent frame, so the column added below is not a
  # chained assignment on a slice of clinical_notes.
  eliminate_notes = eliminate_notes[['SUBJECT_ID', 'HADM_ID_y', 'CHARTTIME', 'TEXT']].copy()

  # One list of sentences per note, replacing the per-row .at assignment loop.
  eliminate_notes['preprocessed_text'] = eliminate_notes['TEXT'].map(getSentences)

  pd.to_pickle(eliminate_notes, os.path.join(raw_data_dir, 'preprocessed_notes.p'))
  print("Preprocessed Data: ", eliminate_notes.shape)
  return eliminate_notes

# Runs the text preprocessing when this cell executes; note that process_data
# reads sub_notes.p from disk rather than using the argument passed here.
processed_data = process_data(raw_data)

# The bare string below is never executed: it is a disabled template for a
# loader of already-processed data (a placeholder path and an empty stub).
'''
 you can load the processed data directly
processed_data_dir = '/content/gdrive/My Drive/Colab Notebooks/<path-to-raw-data>'
def load_processed_data(raw_data_dir):
  pass

'''