In [1]:
import sys
sys.path.append("../") # go to parent dir

In [25]:
import pandas as pd
import sys
import spacy
import re
import time
import scispacy
import glob
import os
from tqdm import tqdm
tqdm.pandas()
from note_processing.heuristic_tokenize import sent_tokenize_rules 

In [3]:
# OUTPUT_DIR = '/mnt/data01/mimic-3/benchmark-small/test/345' #this path will contain tokenized notes. This dir will be the input dir for create_pretrain_data.sh

#this is the path to mimic data if you're reading from a csv. Else uncomment the code to read from database below
MIMIC_NOTES_PATHS = ['/mnt/data01/mimic-3/benchmark-small/test',
                     '/mnt/data01/mimic-3/benchmark-small/train']  

WORKERS = 3

In [4]:
all_files = []

for path in MIMIC_NOTES_PATHS:
    files = glob.glob(path + "/*/*_notes.csv")
    all_files += files

print("\nTotal note files: " + str(len(all_files)))
all_files = [f for f in all_files if not os.path.exists(f[:-4] + '_sent.csv')]
print("Total unprocessed files: " + str(len(all_files)))

li = []

for filename in tqdm(all_files, desc="Load note files"):
    df = pd.read_csv(filename, index_col=None, header=0)
    df["filename"] = filename
    li.append(df)

notes = pd.concat(li, axis=0, ignore_index=True)
notes.columns= notes.columns.str.lower()
notes.describe(include="all")

Load note files: 100%|██████████| 2/2 [00:00<00:00, 164.53it/s]
Total note files: 43
Total unprocessed files: 2



Unnamed: 0,hours,category,description,text,filename
count,0.0,0.0,0.0,0.0,0.0
unique,0.0,0.0,0.0,0.0,0.0
top,,,,,
freq,,,,,


In [5]:
notes.head(5)

Unnamed: 0,hours,category,description,text,filename


In [8]:
notes.groupby("category").agg(['count'])

Unnamed: 0_level_0,hours,description,text,filename
Unnamed: 0_level_1,count,count,count,count
category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
General,23,23,23,23
Nursing,502,502,502,502
Nursing/other,50,50,50,50
Nutrition,13,13,13,13
Pharmacy,2,2,2,2
Physician,333,333,333,333
Radiology,119,119,119,119
Rehab Services,8,8,8,8
Respiratory,75,75,75,75
Social Work,1,1,1,1


In [9]:
# aflanders:
# This code will split the notes into natural sentence boundaries separated by \n
# which can then be fed into sentence embedding models such as BIO-ClinicalBert or 
# BioSentVec
#
# This frame and the next are largly from format_mimic_for_BERT.py in EmilyAlsentzer/clinicalBERT
# I have updated the code to work with spacy 3.0 and made some other changes
#
# Example:
# THis is a 
# single 
# sentence. and another sentence.

# THis is a single sentence.\n
# and another sentence.\n

from spacy.language import Language

#setting sentence boundaries
@Language.component('sbd_component')
def sbd_component(doc):
    for i, token in enumerate(doc[:-2]):
        # define sentence start if period + titlecase token
        if token.text == '.' and doc[i+1].is_title:
            doc[i+1].sent_start = True
        if token.text == '-' and doc[i+1].text != '-':
            doc[i+1].sent_start = True
    return doc

#convert de-identification text into one token
# aflanders: no need to pass in the next separate, is available in processed_text
# def fix_deid_tokens(text, processed_text):
def fix_deid_tokens(doc):
    deid_regex  = r"\[\*\*.{0,15}.*?\*\*\]" 

    indexes = [m.span() for m in re.finditer(deid_regex, doc.text, flags=re.IGNORECASE)]

    for start,end in indexes:
        # processed_text.merge(start_idx=start,end_idx=end)
        # aflanders: Make compatible with latest version fo spacy
        try:
            span = doc.char_span(start, end)
            if span is not None:
                with doc.retokenize() as retokenizer:
                    # retokenizer.merge(processed_text[start:end+1])
                    retokenizer.merge(span)
        except:
            print(f'Error with: {text}')
                
    return doc

In [10]:

def process_section(section, note, processed_sections):
    # perform spacy processing on section
    processed_section = nlp(section['sections'])
    # processed_section = fix_deid_tokens(section['sections'], processed_section)
    processed_section = fix_deid_tokens(processed_section)
    processed_sections.append(processed_section)

def process_note_helper(note):
    # split note into sections
    note_sections = sent_tokenize_rules(note)
    processed_sections = []
    section_frame = pd.DataFrame({'sections':note_sections})
    section_frame.apply(process_section, args=(note,processed_sections,), axis=1)
    return(processed_sections)

def process_text(sent, note):
    sent_text = sent['sents'].text
    if len(sent_text) > 0 and sent_text.strip() != '\n':
        if '\n' in sent_text:
            sent_text = sent_text.replace('\n', ' ')
        note['text'] += sent_text + '\n'  

def get_sentences(processed_section, note):
    # get sentences from spacy processing
    sent_frame = pd.DataFrame({'sents': list(processed_section['sections'].sents)})
    sent_frame.apply(process_text, args=(note,), axis=1)

def process_note(note):
    try:
        note_text = note['text'] #unicode(note['text'])
        note['text'] = ''
        processed_sections = process_note_helper(note_text)
        ps = {'sections': processed_sections}
        ps = pd.DataFrame(ps)
        ps.apply(get_sentences, args=(note,), axis=1)
        return note 
    except Exception as e:
        # pass
        print ('error processing note', e)


In [11]:
# %time

#category = ["Nursing", "Nursing/other", 'General', 'Physician ']  # or None
category = ["Nursing/other"]  # or None

# start = time.time()
# tqdm.pandas()

print('Begin reading notes')

if category != None:
    notes = notes[notes['category'].isin(category)]
print('Number of notes: %d' %len(notes.index))
# notes['ind'] = list(range(len(notes.index)))

nlp = spacy.load('en_core_sci_md', disable=['tagger','ner', 'lemmatizer'])
nlp.add_pipe('sbd_component', before='parser')  

Begin reading notes
Number of notes: 50


<function __main__.sbd_component>

In [12]:
filenames = list(notes["filename"].unique().tolist())
len(filenames)

7

In [13]:
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True, nb_workers=WORKERS)

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [14]:
formatted_notes = notes.parallel_apply(process_note, axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=4), Label(value='0 / 4'))), HBox(c…

In [15]:
# Load the pre-trained Bio_ClinicalBERT model
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")


In [16]:
# Don't bother trying to run the pipeline without a GPU
import numpy as np
from transformers import pipeline
pipe = pipeline('feature-extraction', model=model, 
                tokenizer=tokenizer, device=-1)

In [17]:
import torch 
print(torch.__version__)
print(torch.cuda.current_device())
print(torch._C._cuda_getCompiledVersion(), 'cuda compiled version')
print(torch.version.cuda)

1.8.1+cu102


RuntimeError: No CUDA GPUs are available

In [18]:
features = pipe(['Respiratory CAre Pt received from ED intubated for airway protection.And then another sentenc',
                  'Coughing and gagging with Sx, swallowing frequently with irritation of ETT.']  ,
                pad_to_max_length=True)
features = np.squeeze(features)
features = features[:,0,:]
features

array([[-0.03993183,  0.28045335, -0.22612181, ..., -0.36056122,
         0.02098022, -0.07117227],
       [ 0.26059052,  0.28010404, -0.19884744, ..., -0.41535103,
         0.55648178, -0.36985204]])

In [19]:
import time

def get_embeddings(text, pipe):
    sents = text.split('\n')[:-1]
    #sents = list(map(lambda x: x[:50], sents))
    start_idx = 0
    while True:
        try:
            sent_features = pipe(sents[start_idx:] ,pad_to_max_length=True)
        except BaseException as e:
            start_idx += 1

            if start_idx >= len(sents):
                print("\nError in get_embeddings()")
                print('# of sentences: '+ str(len(sents)))
                sent_len = [len(x) for x in sents]
                print(sent_len)
                sent_features = None
                break

            print("Dropping sentence: " + sents[start_idx-1])
            continue
        break

    if sent_features is not None:
        try:
            sent_features = np.squeeze(sent_features)[:,0,:]
        except:
            sent_features = None
    
    return sent_features

In [22]:
# strings = formatted_notes["text"].tolist()
# doc = '\n'.join(strings)
# sents = doc.split('\n')[:-1]
# features = pipe(sents ,
#                 pad_to_max_length=True)

In [24]:
formatted_notes["embeddings"] = formatted_notes["text"].progress_apply(get_embeddings, args=(pipe,))


100%|██████████| 50/50 [00:43<00:00,  1.14it/s]


In [55]:
for i in range(10):
    print(formatted_notes["embeddings"].iloc[i].shape)

(7, 768)
(47, 768)
(5, 768)
(71, 768)
(43, 768)
(5, 768)
(51, 768)
(45, 768)
(6, 768)
(28, 768)


In [56]:
formatted_notes.describe(include="all")

Unnamed: 0,hours,category,description,text,filename,ind,embeddings
count,50.0,50,50,50,50,50.0,50
unique,,1,1,50,7,,50
top,,Nursing/other,Report,Nursing Admission/Progress Note (1335-\n1900) ...,/mnt/data01/mimic-3/benchmark-small/train/191/...,,"[[0.38307225704193115, 0.34203147888183594, 0...."
freq,,50,50,1,20,,1
mean,45.648222,,,,,24.5,
std,31.821967,,,,,14.57738,
min,4.174444,,,,,0.0,
25%,21.070764,,,,,12.25,
50%,44.101111,,,,,24.5,
75%,62.639236,,,,,36.75,


In [57]:
formatted_notes.head(1)

Unnamed: 0,hours,category,description,text,filename,ind,embeddings
3,7.351111,Nursing/other,Report,Respiratory CAre Pt received from ED intubated...,/mnt/data01/mimic-3/benchmark-small/test/345/e...,0,"[[-0.23393520712852478, -0.19446520507335663, ..."


In [59]:
type(formatted_notes["embeddings"].iloc[0])

numpy.ndarray

In [60]:
# Write out a new notes file with the embeddings
# aflanders: This is going to take too long and take up too much space
# The embeddings will be longer than the notes themselves. Each patient/episode
# can go from 500Kb to 18Mb

# filenames = [filename["filename"].value]
# formatted_notes.columns= formatted_notes.columns.str.capitalize()
# formatted_notes.rename(columns={"Embeddings":"Bert embeddings"}, inplace=True)

# np.set_printoptions(threshold=sys.maxsize)

# filenames = list(formatted_notes["Filename"].unique().tolist())
# for filename in tqdm(filenames, desc="Writing embedding files"):
#     df = formatted_notes[formatted_notes["Filename"] == filename][["Hours", "Category", "Description", "Bert embeddings"]]
#     write_file = filename.replace("_notes.csv", "_notes_bert.csv")
#     with open(write_file, "w") as f:
#         df.to_csv(f, index_label='Hours')

In [61]:
print(filenames)

['/mnt/data01/mimic-3/benchmark-small/test/345/episode1_notes.csv', '/mnt/data01/mimic-3/benchmark-small/train/124/episode2_notes.csv', '/mnt/data01/mimic-3/benchmark-small/train/191/episode2_notes.csv', '/mnt/data01/mimic-3/benchmark-small/train/222/episode3_notes.csv', '/mnt/data01/mimic-3/benchmark-small/train/222/episode4_notes.csv', '/mnt/data01/mimic-3/benchmark-small/train/109/episode8_notes.csv', '/mnt/data01/mimic-3/benchmark-small/train/109/episode7_notes.csv']
