In [1]:
import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

import matplotlib.pyplot as plt

In [2]:
print("TF version: ", tf.__version__)
print("TF HUB version: ", hub.__version__)

TF version:  2.6.2
TF HUB version:  0.12.0


In [85]:
import pandas as pd
import numpy as np

In [5]:
import psycopg2
import pandas as pd
import sys
import spacy
import re
import stanfordnlp
import time
import scispacy
from tqdm import tqdm
from heuristic_sentence_splitter import sent_tokenize_rules

In [6]:
from spacy.language import Language

In [7]:
@Language.component("sbd_component")
def sbd_component(doc):
    for i, token in enumerate(doc[:-2]):
        # define sentence start if period + titlecase token
        if token.text == '.' and doc[i+1].is_title:
            doc[i+1].sent_start = True
        if token.text == '-' and doc[i+1].text != '-':
            doc[i+1].sent_start = True
    return doc

nlp = spacy.load('en_core_sci_md', disable=['tagger','ner', "lemmatizer"])
nlp.add_pipe("sbd_component", before='parser')

<function __main__.sbd_component(doc)>

In [8]:
def process_note(note):
    try:
        note_text = note['TEXT'] #unicode(note['text'])
        note['TEXT'] = ''
        processed_sections = process_note_helper(note_text)
        ps = {'sections': processed_sections}
        ps = pd.DataFrame(ps)
        ps.apply(get_sentences, args=(note,), axis=1)
        return note 
    except Exception as e:
        pass
        #print ('error', e)

In [9]:
def process_note_helper(note):
    # split note into sections
    note_sections = sent_tokenize_rules(note)
    processed_sections = []
    section_frame = pd.DataFrame({'sections':note_sections})
    section_frame.apply(process_section, args=(note,processed_sections,), axis=1)
    return(processed_sections)

In [10]:
def process_section(section, note, processed_sections):
    # perform spacy processing on section
    processed_section = nlp(section['sections'])
    processed_section = fix_deid_tokens(section['sections'], processed_section)
    processed_sections.append(processed_section)

In [11]:
#convert de-identification text into one token
def fix_deid_tokens(text, processed_text):
    deid_regex  = r"\[\*\*.{0,15}.*?\*\*\]" 
    if text:
        indexes = [m.span() for m in re.finditer(deid_regex,text,flags=re.IGNORECASE)]
    else:
        indexes = []
    for start,end in indexes:
        head = 0
        tail = 0
        for token in processed_text:
            if start <= token.idx:
                head = token.i
                break
        for token in processed_text:
            if end <= token.idx:
                tail = token.i
                break
        if tail == 0:
            tail = token.i
        with processed_text.retokenize() as retokenizer:
            retokenizer.merge(processed_text[head:tail+1])
        # processed_text.merge(start_idx=start,end_idx=end)
    return processed_text

In [12]:
def get_sentences(processed_section, note):
    # get sentences from spacy processing
    sent_frame = pd.DataFrame({'sents': list(processed_section['sections'].sents)})
    sent_frame.apply(process_text, args=(note,), axis=1)

In [13]:
def process_text(sent, note):
    sent_text = sent['sents'].text
    if len(sent_text) > 0 and sent_text.strip() != '\n':
        if '\n' in sent_text:
            sent_text = sent_text.replace('\n', ' ')
        note['TEXT'] += sent_text + '\n'  

In [14]:
tqdm.pandas()

In [15]:
df = pd.read_csv('./NOTEEVENTS.csv')
notes = df.sample(n = 100) # random select 100 rows
notes['ind'] = list(range(len(notes.index)))
# note_text = notes['TEXT']

  interactivity=interactivity, compiler=compiler, result=result)


In [17]:
formatted_notes = notes.progress_apply(process_note, axis=1)

100%|██████████| 100/100 [00:06<00:00, 16.26it/s]


In [19]:
# # save to file
# with open('./Discharge Summary 100 subject.txt','w') as f:
#     for text in formatted_notes['TEXT']:
#         if text != None and len(text) != 0 :
#             f.write(text)
#             f.write('\n')
# print ("Done formatting notes")

Done formatting notes


In [29]:
keywords = ["Addict", 
            "addiction",
            "user", 
            "drug abuser", 
            "drug seeking", 
            "abuser", 
            "former addict", 
            "reformed addict", 
            "addicted", 
            "use drugs", 
            "drug baby", 
            "opioid abuse", 
            "opioid dependence", 
            "addiction", 
            "want drugs", 
            "problem", 
            "use problem", 
            "habit", 
            "clean", 
            "clean from drugs", 
            "clean urine test", 
            "dirty urine test", 
            "relapse", 
            "opioid substitution",
            "relapse therapy", 
            "treatment failure", 
            "being clean"]

In [40]:
sentence_list = []
label_list = []

In [41]:
for text in formatted_notes['TEXT']:
    sent = text.split('\n')
    sentence_list += sent

In [44]:
pattern = '|'.join(f"\\b{k}\\b" for k in keywords)  # Whole words only  
for sent in sentence_list:
    match = re.findall(pattern, sent)
    if match:
        label_list.append(1)
    else:
        label_list.append(0)

In [45]:
# print(len(label_list))
# print(len(sentence_list))

3151
3151


In [46]:
# BERT embedding
# load preprocessor and BERT model
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)
preprocessor = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/1")
encoder_inputs = preprocessor(text_input) # dict with keys: 'input_mask', 'input_type_ids', 'input_word_ids'
encoder = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3",
    trainable=True)
outputs = encoder(encoder_inputs)
pooled_output = outputs["pooled_output"]      # [batch_size, 768].
sequence_output = outputs["sequence_output"]  # [batch_size, seq_length, 768].

In [92]:
max_batch_size = 132
head = 0
tail = max_batch_size
L = len(sentence_list)
pooled_embedding = np.zeros((1, 768))
sequence_embedding = np.zeros((1, 128, 768))
while (tail < L):
    batch_sentence_list = sentence_list[head:tail]
    encoder_inputs = preprocessor(batch_sentence_list)
    outputs = encoder(encoder_inputs)
    pooled_output = outputs["pooled_output"]
    sequence_output = outputs["sequence_output"]
    pooled_embedding = np.append(pooled_embedding, pooled_output, axis=0)
    sequence_embedding = np.append(sequence_embedding, sequence_output, axis=0)
    head = tail
    tail = head + max_batch_size
batch_sentence_list = sentence_list[head:]
encoder_inputs = preprocessor(batch_sentence_list)
outputs = encoder(encoder_inputs)
pooled_output = outputs["pooled_output"]
sequence_output = outputs["sequence_output"]
pooled_embedding = np.append(pooled_embedding, pooled_output, axis=0)
sequence_embedding = np.append(sequence_embedding, sequence_output, axis=0)
pooled_embedding = pooled_embedding[1:]
sequence_embedding = sequence_embedding[1:]

In [99]:
# classification using SVM
from sklearn.model_selection import KFold
from sklearn import svm, metrics

In [100]:
X = pooled_embedding
y = np.array(label_list)

In [106]:
clf = svm.SVC()
kf = KFold(n_splits = 10)
accuracy_kf = []
sensitivity_kf = []
specificity_kf = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # svm
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    confusion = metrics.confusion_matrix(y_test, y_pred, labels=[0,1])
    TP = confusion[1,1]
    TN = confusion[0,0]
    FP = confusion[0,1]
    FN = confusion[1,0]
    accuracy = (TP+TN) / float(TP+TN+FN+FP)
    sensitivity = TP / float(TP+FN)
    specificity = TN / float(TN+FP)
    accuracy_kf.append(accuracy)
    sensitivity_kf.append(sensitivity)
    specificity_kf.append(specificity)
 



In [108]:
accuracy_kf = np.array(accuracy_kf)
sensitivity_kf = np.array(sensitivity_kf)
specificity_kf = np.array(specificity_kf)

In [109]:
acc = np.mean(accuracy_kf)
sens = np.mean(sensitivity_kf)
spec = np.mean(specificity_kf)
print('10-fold cross validation, acc: %.2f, sensitivity: %.2f, specificity: %.2f'%(acc, sens, spec))

10-fold cross validation, acc: 1.00, sensitivity: nan, specificity: 1.00


In [112]:
len(df)

2083180