In [None]:
import time
import datetime
import random
import re
import matplotlib.pyplot as plt
import json
import os
#from transformers import AutoTokenizer
import pandas as pd
import numpy as np
from transformers import AutoTokenizer
pd.set_option('display.max_rows', 100)

In [None]:
# Load the raw note events and the previously filtered admission records.
PATH_DATA = './MIMIC'
PATH_FILTERED_DATA = './MIMIC/filterd_data'

notes_csv_path = os.path.join(PATH_DATA, 'NOTEEVENTS.csv')
admits_csv_path = os.path.join(PATH_FILTERED_DATA, 'admits_final.csv')

df_notes = pd.read_csv(notes_csv_path)
df_adm = pd.read_csv(admits_csv_path)

In [None]:
def merge_on_subject(table1, table2):
    """Inner-join ``table1`` against the ``HADM_ID`` column of ``table2``.

    Note that, despite the function name, the join key is ``HADM_ID``.
    Only the ``HADM_ID`` column of ``table2`` takes part in the merge, so
    no other column of ``table2`` appears in the result.

    Args:
        table1: DataFrame with a ``HADM_ID`` column.
        table2: DataFrame with a ``HADM_ID`` column.

    Returns:
        DataFrame containing the rows of ``table1`` whose ``HADM_ID`` also
        occurs in ``table2``.
    """
    admission_ids = table2['HADM_ID']
    return table1.merge(
        admission_ids,
        how='inner',
        left_on=['HADM_ID'],
        right_on=['HADM_ID'],
    )

def preprocess1(x):
    """Strip de-identification markers and segmenter-confusing tokens from a note.

    All patterns are lowercase literals, so ``x`` should already be
    lowercased for the ``dr.`` / ``m.d.`` / ``admission date:`` rules to fire.

    Args:
        x: Note text as a single string.

    Returns:
        The cleaned string.
    """
    # Raw strings are used for every pattern so that ``\.`` reaches the regex
    # engine as written instead of being an invalid Python string escape.
    text = re.sub(r'\[(.*?)\]', '', x)  # remove de-identified brackets
    # Remove list numbering such as "1." / "2." because the segmenter splits on it.
    # NOTE: this also strips the integer part of decimals ("3.5" -> "5").
    text = re.sub(r'[0-9]+\.', '', text)
    text = re.sub(r'dr\.', 'doctor', text)
    text = re.sub(r'm\.d\.', 'md', text)
    text = re.sub(r'admission date:', '', text)
    text = re.sub(r'discharge date:', '', text)
    text = re.sub(r'--|__|==', '', text)  # drop separator runs, two characters at a time
    return text

def preprocessing(df):
    """Normalise the ``TEXT`` column of ``df`` in place and return ``df``.

    Missing texts become a single space, newlines and carriage returns are
    replaced by spaces, surrounding whitespace is stripped, the text is
    lowercased, and finally ``preprocess1`` is applied to every note.

    Args:
        df: DataFrame with a ``TEXT`` column; it is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    text = df['TEXT'].fillna(' ')
    text = text.str.replace('\n', ' ').str.replace('\r', ' ')
    text = text.apply(str.strip).str.lower()

    df['TEXT'] = text.apply(preprocess1)
    return df


def tokens_count(tk, txt):
    """Return how many token ids ``tk`` produces for ``txt``.

    Args:
        tk: Callable tokenizer; ``tk(txt)`` must return a mapping that has
            an ``'input_ids'`` entry supporting ``len``.
        txt: Text to tokenize.

    Returns:
        Length of ``tk(txt)['input_ids']``.
    """
    return len(tk(txt)['input_ids'])

In [None]:
# Keep only notes that are attached to a hospital admission.
# .copy() makes the filtered frame independent of the original, so the column
# assignments below do not raise pandas' SettingWithCopyWarning.
df_notes = df_notes[df_notes['HADM_ID'].notnull()].copy()
df_notes['HADM_ID'] = df_notes['HADM_ID'].astype(int)

# keep only nursing/others
df_notes = df_notes[df_notes['CATEGORY'] == 'Nursing/other'].reset_index(drop=True)

# Sort the notes chronologically within each patient/admission.
# errors='coerce' turns chart times that do not match the format into NaT.
df_notes['CHARTTIME'] = pd.to_datetime(
    df_notes['CHARTTIME'], format='%Y-%m-%d %H:%M:%S', errors='coerce'
)
# The result of reset_index is assigned back; calling it without assignment
# leaves the frame untouched.
df_notes = df_notes.sort_values(by=['SUBJECT_ID', 'HADM_ID', 'CHARTTIME']).reset_index(drop=True)

# preprocess notes (preprocessing mutates TEXT in place and returns the same frame)
df_notes = preprocessing(df_notes)

# Collect the notes of every admission into one list per HADM_ID;
# the result is a new frame with columns HADM_ID and TEXT.
df_nursing_for_each_adm = df_notes.groupby('HADM_ID')['TEXT'].apply(list).reset_index()

# filter out notes whose admission is not in the admission table
df_nursing_for_each_adm = df_nursing_for_each_adm[
    df_nursing_for_each_adm['HADM_ID'].isin(df_adm['HADM_ID'])
]

# Keep only admissions with fewer than MAX_NOTES_PER_ADMISSION notes
# (an admission with exactly that many notes is dropped).
MAX_NOTES_PER_ADMISSION = 100
notes_per_admission = df_nursing_for_each_adm['TEXT'].map(len)
df_nursing_for_each_adm = df_nursing_for_each_adm[
    notes_per_admission < MAX_NOTES_PER_ADMISSION
].reset_index(drop=True)

df_nursing_for_each_adm.to_csv(os.path.join(PATH_FILTERED_DATA, 'nursing_notes_bf_length_limits.csv'))

In [None]:
# Find admissions that contain at least one note shorter than MIN_NOTE_TOKENS tokens.
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

notes = list(df_nursing_for_each_adm['TEXT'])
id_list = list(df_nursing_for_each_adm['HADM_ID'])

MIN_NOTE_TOKENS = 50
PROGRESS_EVERY = 1000  # report progress periodically instead of printing every index

# Despite their names, these lists hold the admissions flagged for a *short*
# note and the token count of the first short note found in each of them.
over_limit = []
over_token_num = []

for i, notes_list in enumerate(notes):
    if i % PROGRESS_EVERY == 0:
        print(f'{i}/{len(notes)} admissions scanned')
    for note in notes_list:
        tk_count = tokens_count(tokenizer, note)
        if tk_count < MIN_NOTE_TOKENS:
            over_token_num.append(tk_count)
            over_limit.append(id_list[i])
            # One short note is enough to flag the admission.
            break


In [None]:
# For every flagged admission, count how many of its notes are shorter than 50 tokens.
# A dict lookup replaces one full DataFrame scan per admission; HADM_ID is the
# key the notes were grouped by, so each id maps to exactly one note list.
notes_by_admission = dict(
    zip(df_nursing_for_each_adm['HADM_ID'], df_nursing_for_each_adm['TEXT'])
)

# num_of_note_shorter[k] is the short-note count for over_limit[k].
num_of_note_shorter = [
    sum(tokens_count(tokenizer, note) < 50 for note in notes_by_admission[hadm_id])
    for hadm_id in over_limit
]

In [None]:
# If short notes make up more than 20% of an admission's notes, remove the whole
# admission. Otherwise, keep the admission and delete only its short notes.
MIN_NOTE_TOKENS = 50
MAX_SHORT_NOTE_RATIO = 0.2

# Dict lookup instead of one DataFrame scan per admission.
notes_by_admission = dict(
    zip(df_nursing_for_each_adm['HADM_ID'], df_nursing_for_each_adm['TEXT'])
)
len_of_ad = [len(notes_by_admission[hadm_id]) for hadm_id in over_limit]

alot_shorter = []   # admissions to drop entirely
note_tobe_del = []  # admissions that only lose their short notes
for hadm_id, short_count, total_count in zip(over_limit, num_of_note_shorter, len_of_ad):
    if short_count / total_count > MAX_SHORT_NOTE_RATIO:
        alot_shorter.append(hadm_id)
    else:
        note_tobe_del.append(hadm_id)

# remove the admissions
df_nursing_for_each_adm = df_nursing_for_each_adm[
    ~df_nursing_for_each_adm['HADM_ID'].isin(alot_shorter)
]

# Delete the short notes from the remaining flagged admissions.
# The TEXT cells hold Python list objects, so the slice assignment below
# changes the lists stored in the DataFrame in place.
needs_trim = df_nursing_for_each_adm['HADM_ID'].isin(note_tobe_del)
for adm_notes in df_nursing_for_each_adm.loc[needs_trim, 'TEXT']:
    adm_notes[:] = [
        note for note in adm_notes
        if tokens_count(tokenizer, note) >= MIN_NOTE_TOKENS
    ]


In [None]:
# Find admissions that contain at least one note longer than MAX_NOTE_TOKENS tokens
# and remove those admissions entirely.
notes = list(df_nursing_for_each_adm['TEXT'])
id_list = list(df_nursing_for_each_adm['HADM_ID'])

MAX_NOTE_TOKENS = 800
PROGRESS_EVERY = 1000  # report progress periodically instead of printing every index

over_limit = []      # HADM_IDs with at least one over-long note
over_token_num = []  # token count of the first over-long note of each such admission

for i, notes_list in enumerate(notes):
    if i % PROGRESS_EVERY == 0:
        print(f'{i}/{len(notes)} admissions scanned')
    for note in notes_list:
        tk_count = tokens_count(tokenizer, note)
        if tk_count > MAX_NOTE_TOKENS:
            over_token_num.append(tk_count)
            over_limit.append(id_list[i])
            # One over-long note is enough to flag the admission.
            break

df_nursing_for_each_adm = df_nursing_for_each_adm[
    ~df_nursing_for_each_adm['HADM_ID'].isin(over_limit)
].reset_index(drop=True)

In [None]:
# Convert the note list of each admission into one newline-separated string.
# The column is addressed by name rather than by position, and values that are
# already strings are left alone so that re-running this cell does not insert
# a newline between every character.
df_nursing_for_each_adm['TEXT'] = df_nursing_for_each_adm['TEXT'].map(
    lambda adm_notes: adm_notes if isinstance(adm_notes, str) else '\n'.join(adm_notes)
)

df_nursing_for_each_adm.to_csv(os.path.join(PATH_FILTERED_DATA, 'nursing_notes_af_length_limits.csv'))

# Keep only the admissions that still have notes after filtering
# (original author's note: 16493 remained).
hadm_id = list(df_nursing_for_each_adm['HADM_ID'])
df_adm = df_adm[df_adm['HADM_ID'].isin(hadm_id)]

df_adm.to_csv(os.path.join(PATH_FILTERED_DATA, 'admits_final_af_note_filter.csv'))