In [1]:
import sys
sys.path.append("../") # go to parent dir

In [1]:
import pandas as pd
import sys
import spacy
import re
import time
import scispacy
import glob
import os
from tqdm import tqdm
tqdm.pandas()
from note_processing.heuristic_tokenize import sent_tokenize_rules 

ModuleNotFoundError: No module named 'note_processing'

In [3]:
# OUTPUT_DIR = '/mnt/data01/mimic-3/benchmark-small/test/345'  # would hold the tokenized notes (input dir for create_pretrain_data.sh)

# Root directories of the MIMIC note CSVs. Each one is searched for
# "<subdir>/*_notes.csv" files when the notes are loaded.
MIMIC_NOTES_PATHS = [
    '/mnt/data01/mimic-3/benchmark-small/test',
    '/mnt/data01/mimic-3/benchmark-small/train',
]

# Number of workers handed to pandarallel (nb_workers).
WORKERS = 5

In [4]:
# Gather every per-episode notes CSV below the configured root directories.
all_files = [
    note_path
    for root in MIMIC_NOTES_PATHS
    for note_path in glob.glob(root + "/*/*_notes.csv")
]
print("\nTotal note files: " + str(len(all_files)))

# Keep only files that do not yet have a "<name>_sent.csv" companion on disk.
all_files = [f for f in all_files if not os.path.exists(f[:-4] + '_sent.csv')]
print("Total unprocessed files: " + str(len(all_files)))

# Read each remaining file and tag its rows with the file they came from, so
# the results can be written back per source file later.
li = [
    pd.read_csv(note_path, index_col=None, header=0).assign(filename=note_path)
    for note_path in tqdm(all_files, desc="Load note files")
]

notes = pd.concat(li, axis=0, ignore_index=True)
notes.describe(include="all")

Load note files: 100%|██████████| 37/37 [00:00<00:00, 291.64it/s]
Total note files: 43
Total unprocessed files: 37



Unnamed: 0,Hours,CATEGORY,DESCRIPTION,TEXT,filename
count,1051.0,1051,1051,1051,1051
unique,,10,60,981,35
top,,Nursing,Nursing Progress Note,Chief Complaint:\n 24 Hour Events:\n - SBP...,/mnt/data01/mimic-3/benchmark-small/train/124/...
freq,,481,376,5,226
mean,96.352609,,,,
std,126.598239,,,,
min,0.201111,,,,
25%,14.525833,,,,
50%,36.261111,,,,
75%,121.352361,,,,


In [5]:
# Preview the first five loaded notes.
notes.head(5)

Unnamed: 0,Hours,CATEGORY,DESCRIPTION,TEXT,filename
0,0.201111,Radiology,CHEST (PORTABLE AP),[**2169-5-21**] 10:17 PM\n CHEST (PORTABLE AP)...,/mnt/data01/mimic-3/benchmark-small/test/345/e...
1,1.034444,Radiology,CHEST PORT. LINE PLACEMENT,[**2169-5-21**] 11:07 PM\n CHEST PORT. LINE PL...,/mnt/data01/mimic-3/benchmark-small/test/345/e...
2,1.967778,Radiology,CHEST PORT. LINE PLACEMENT,[**2169-5-22**] 12:03 AM\n CHEST PORT. LINE PL...,/mnt/data01/mimic-3/benchmark-small/test/345/e...
3,7.351111,Nursing/other,Report,Respiratory CAre\nPt received from ED intubate...,/mnt/data01/mimic-3/benchmark-small/test/345/e...
4,7.584444,Nursing/other,Report,0000-0700 NPN\nPt. admitted via ER from [**Hos...,/mnt/data01/mimic-3/benchmark-small/test/345/e...


In [6]:
# Count the non-null values of every column per note CATEGORY.
notes.groupby("CATEGORY").agg(['count'])

Unnamed: 0_level_0,Hours,DESCRIPTION,TEXT,filename
Unnamed: 0_level_1,count,count,count,count
CATEGORY,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
General,23,23,23,23
Nursing,481,481,481,481
Nursing/other,11,11,11,11
Nutrition,13,13,13,13
Pharmacy,2,2,2,2
Physician,329,329,329,329
Radiology,108,108,108,108
Rehab Services,8,8,8,8
Respiratory,75,75,75,75
Social Work,1,1,1,1


In [7]:
# aflanders:
# This code will split the notes into natural sentence boundaries separated by \n
# which can then be fed into sentence embedding models such as BIO-ClinicalBert or 
# BioSentVec
#
# This frame and the next are largely from format_mimic_for_BERT.py in EmilyAlsentzer/clinicalBERT
# I have updated the code to work with spacy 3.0 and made some other changes
#
# Example:
# THis is a 
# single 
# sentence. and another sentence.

# THis is a single sentence.\n
# and another sentence.\n

from spacy.language import Language

#setting sentence boundaries
@Language.component('sbd_component')
def sbd_component(doc):
    """Pipeline component that marks extra sentence starts on a spaCy doc.

    Registered under the name 'sbd_component'. For every token except the
    last two, the token that follows it is flagged as a sentence start when:

    * the current token is '.' and the following token is title-cased, or
    * the current token is '-' and the following token is not another '-'.

    Args:
        doc: the document being processed by the pipeline.

    Returns:
        The same ``doc``, with the sentence-start flags set in place.
    """
    for i, token in enumerate(doc[:-2]):
        # Always in range: the loop stops two tokens before the end of the doc.
        next_token = doc[i + 1]
        # Period followed by a title-cased token: start a new sentence there.
        # `is_sent_start` is the supported attribute; `sent_start` is only a
        # deprecated alias for it.
        if token.text == '.' and next_token.is_title:
            next_token.is_sent_start = True
        # A hyphen that is not followed by another hyphen also starts a sentence.
        if token.text == '-' and next_token.text != '-':
            next_token.is_sent_start = True
    return doc

# Convert each de-identification placeholder into one token.
# aflanders: no need to pass in the text separately, it is available as doc.text
def fix_deid_tokens(doc):
    """Merge every ``[** ... **]`` de-identification placeholder into a single token.

    Args:
        doc: a processed spaCy document (anything exposing ``text``,
            ``char_span`` and ``retokenize`` works).

    Returns:
        The same ``doc`` object; matching spans are merged in place.

    A span that cannot be merged is reported on stdout and skipped, so one bad
    placeholder does not abort processing of the rest of the document.
    """
    deid_regex  = r"\[\*\*.{0,15}.*?\*\*\]" 

    # Character offsets of every placeholder in the document text.
    indexes = [m.span() for m in re.finditer(deid_regex, doc.text, flags=re.IGNORECASE)]

    for start, end in indexes:
        # aflanders: use the retokenizer API to be compatible with the latest
        # version of spaCy (replaces the old Doc.merge call).
        try:
            span = doc.char_span(start, end)
            # Nothing to merge when the offsets do not map onto a span.
            if span is not None:
                with doc.retokenize() as retokenizer:
                    retokenizer.merge(span)
        except Exception as err:
            # The previous handler printed an undefined name (`text`), which
            # turned any merge failure into a NameError. Report the offending
            # placeholder instead and keep going.
            print(f'Error with: {doc.text[start:end]} ({err})')

    return doc

In [8]:

def process_section(section, note, processed_sections):
    """Run one note section through spaCy and collect the resulting doc.

    Args:
        section: row-like mapping whose 'sections' entry holds the section text.
        note: not used here; kept so the call signature stays the same.
        processed_sections: list that receives the processed doc (mutated).
    """
    # Parse with the module-level `nlp` pipeline, then collapse the
    # de-identification placeholders into single tokens.
    section_doc = fix_deid_tokens(nlp(section['sections']))
    processed_sections.append(section_doc)

def process_note_helper(note):
    """Split a note into sections and process each one with spaCy.

    Args:
        note: the note text, passed to ``sent_tokenize_rules`` for splitting
            into sections.

    Returns:
        list with one processed doc per section, in section order.
    """
    processed_sections = []
    # Iterate directly instead of building a one-column DataFrame and using
    # DataFrame.apply purely for its side effects.
    for section_text in sent_tokenize_rules(note):
        # process_section reads the text from the 'sections' key.
        process_section({'sections': section_text}, note, processed_sections)
    return processed_sections

def process_text(sent, note):
    """Append one sentence to ``note['TEXT']`` as a single line.

    Args:
        sent: row-like mapping whose 'sents' entry exposes the sentence via ``.text``.
        note: mapping whose 'TEXT' value is extended in place.

    Sentences with fewer than two whitespace-separated words are skipped.
    Newlines inside a kept sentence become spaces, and the sentence is
    terminated with a single newline.
    """
    sent_text = sent['sents'].text
    # Fewer than two words covers empty and whitespace-only text as well.
    if len(sent_text.split()) <= 1:
        return
    note['TEXT'] += sent_text.replace('\n', ' ') + '\n'

def get_sentences(processed_section, note):
    """Append every sentence of one processed section to ``note['TEXT']``.

    Args:
        processed_section: row-like mapping whose 'sections' entry is a
            processed doc exposing ``.sents``.
        note: mapping whose 'TEXT' value is extended in place by ``process_text``.
    """
    # Walk the sentences directly instead of wrapping them in a DataFrame and
    # using DataFrame.apply only for its side effects.
    for sent in processed_section['sections'].sents:
        # process_text reads the sentence from the 'sents' key.
        process_text({'sents': sent}, note)

def process_note(note):
    """Rewrite ``note['TEXT']`` as one sentence per line.

    Args:
        note: a notes row (mapping with a 'TEXT' entry); modified in place.

    Returns:
        The same ``note``. On success 'TEXT' holds the sentence-split text.
        If processing fails, the error is printed, the original text is put
        back, and the row is still returned. The row-wise apply that calls
        this function therefore always gets a row, never ``None``.
    """
    note_text = None
    try:
        note_text = note['TEXT']
        # Sentences are accumulated into TEXT by get_sentences/process_text.
        note['TEXT'] = ''
        for section_doc in process_note_helper(note_text):
            get_sentences({'sections': section_doc}, note)
    except Exception as e:
        print ('error processing note', e)
        # Do not leave a half-built (or emptied) TEXT behind.
        if note_text is not None:
            note['TEXT'] = note_text
    return note


In [9]:
# Note categories to process; set to None to keep every category.
# category = ["Nursing", "Nursing/other", 'General', 'Physician ']
category = ["Nursing/other"]

print('Begin reading notes')

if category is not None:
    notes = notes[notes['CATEGORY'].isin(category)]
print('Number of notes: %d' % len(notes))

# Load the scispaCy model with the tagger, NER and lemmatizer disabled, and run
# the custom sentence-boundary component ahead of the parser.
nlp = spacy.load('en_core_sci_md', disable=['tagger','ner', 'lemmatizer'])
nlp.add_pipe('sbd_component', before='parser')

Begin reading notes
Number of notes: 11


<function __main__.sbd_component>

In [10]:
# Distinct source files among the notes that are left to process.
filenames = notes["filename"].unique().tolist()
len(filenames)

1

In [11]:
from pandarallel import pandarallel
# Initialise pandarallel with WORKERS workers and a progress bar; the notes are
# processed with DataFrame.parallel_apply afterwards.
pandarallel.initialize(progress_bar=True, nb_workers=WORKERS)

INFO: Pandarallel will run on 5 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [12]:
# Apply process_note to every row (axis=1) in parallel; process_note rewrites
# the row's TEXT as one sentence per line.
formatted_notes = notes.parallel_apply(process_note, axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=3), Label(value='0 / 3'))), HBox(c…

In [49]:
# Preview the first three processed notes.
formatted_notes.head(3)

Unnamed: 0,Hours,CATEGORY,DESCRIPTION,TEXT,filename
3,7.351111,Nursing/other,Report,Respiratory CAre Pt received from ED intubated...,/mnt/data01/mimic-3/benchmark-small/test/345/e...
4,7.584444,Nursing/other,Report,0700 NPN Pt. admitted via ER\nfrom [**Hospital...,/mnt/data01/mimic-3/benchmark-small/test/345/e...
6,20.067778,Nursing/other,Report,"BS CTAB, no change with MDI's.\nSuctioned for ...",/mnt/data01/mimic-3/benchmark-small/test/345/e...


In [52]:
# Write out new note files organized by sentence: one "<name>_sent.csv" next to
# each source "<name>.csv".
filenames = formatted_notes["filename"].unique().tolist()
for filename in tqdm(filenames, desc="Writing note sentence files"):
    df = formatted_notes[formatted_notes["filename"] == filename][["Hours", "CATEGORY", "DESCRIPTION", "TEXT"]]
    df = df.set_index("Hours")
    # Build the output name exactly as the loading cell checks for it (drop the
    # trailing ".csv", append "_sent.csv"). str.replace would rewrite every
    # ".csv" occurring anywhere in the path.
    write_file = filename[:-len(".csv")] + "_sent.csv"
    # Hand the path to pandas so it opens the file with the newline handling
    # the CSV writer expects, instead of a plain text-mode handle.
    df.to_csv(write_file, index_label='Hours')

Writing note sentence files: 100%|██████████| 7/7 [00:00<00:00, 414.60it/s]
