# Create OBI de-ID prediction dataset

In [1]:
import pandas as pd
import jsonlines

In [2]:
notes = pd.read_csv("/home/vs428/project/Incarceration_Data/ed_notes_19_20.tsv", sep="\t")

In [3]:
notes.shape

(557256, 5)

In [4]:
notes.columns

Index(['PAT_ENC_CSN_ID', 'visit_occurrence_id', 'person_id', 'Type', 'TEXT'], dtype='object')

In [78]:
print("the notes are unique by encounter ID")
notes.shape, notes['PAT_ENC_CSN_ID'].nunique()

the notes are unique by encounter ID


((557256, 5), 157433)

In [5]:
notes_jsonl = []
for idx, text in notes.iterrows():
    note_dict  = {}
    note_dict['text'] = text['TEXT']
    note_dict['meta'] = {"PAT_ENC_CSN_ID":text['PAT_ENC_CSN_ID'], 
                       "person_id":text['person_id'],
                      "note_id":idx}
    note_dict['spans'] = []
    notes_jsonl.append(note_dict)

In [9]:
with jsonlines.open("/home/vs428/project/Incarceration_Data/ed_notes_19_20_obi_deid_input.jsonl", "w") as writer:
    writer.write_all(notes_jsonl)
    

# Take a small subset to test

In [25]:
import random

In [52]:
notes_jsonl[0]

{'text': '9:37 PM Pt has ready bed, able to go to floor before being infused with abx and IVIG. Transport booked. Pt resting comfortably on stretcher NAD at this time. \r\n',
 'meta': {'PAT_ENC_CSN_ID': 192405489, 'person_id': 40675941, 'note_id': 0},
 'spans': []}

In [26]:
notes_jsonl_sample = random.sample(notes_jsonl, 2000)

In [27]:
import re

In [28]:
pat = re.compile(r"\r\n")

In [29]:
# pat.sub("\n", notes_jsonl_sample[0]['text'])



In [30]:
for note in notes_jsonl_sample:
    note['text'] = pat.sub(" ", note['text'].encode("ascii", "ignore").decode())

In [31]:
# notes_jsonl_sample[2]

In [32]:
# notes_jsonl_sample[0]['text']

In [33]:
with jsonlines.open("/home/vs428/project/Incarceration_Data/notes/ed_notes_19_20_obi_deid_input_subset.jsonl", "w") as writer:
#     /home/vs428/project/Incarceration_Data/notes/ed_notes_19_20_obi_deid_input_subset.jsonl
    writer.write_all(notes_jsonl_sample)
    

In [34]:
test_recs = random.choices(notes_jsonl_sample, k=10)

In [35]:
file_out = [test_rec['text'] for test_rec in test_recs]

In [36]:
import numpy as np

In [37]:
np.save("/home/vs428/project/Incarceration_Data/other_algo_test.npy", file_out)

In [38]:
file_out = [test_rec['text'] for test_rec in notes_jsonl_sample]
np.save("/home/vs428/project/Incarceration_Data/other_algo_2000subset.npy", file_out)

# ^ Or chunk the full data

In [11]:
CHUNKSIZE = 2000

In [12]:
chunks = [notes_jsonl[i:i+CHUNKSIZE] for i in range(0, len(notes_jsonl), CHUNKSIZE)]

In [14]:
len(chunks)

279

In [21]:
for i in range(len(chunks)):
    with jsonlines.open(f"/home/vs428/project/Incarceration_Data/ed_notes_19_20_obi_deid_input_{i}.jsonl", "w") as writer:
        writer.write_all(chunks[i])


# Read in NPY files

In [39]:
import numpy as np

In [43]:
# deided = np.load("/home/vs428/project/Incarceration_Data/other_algo_test_out.npy", allow_pickle=True)
# raw = np.load("/home/vs428/project/Incarceration_Data/other_algo_test.npy", allow_pickle=True)

deided = np.load("/home/vs428/project/Incarceration_Data/other_algo_2000subset_deid.npy", allow_pickle=True)
raw = np.load("/home/vs428/project/Incarceration_Data/other_algo_2000subset.npy", allow_pickle=True)

In [51]:
notes_jsonl_sample[2]#['text']

{'text': "--------------------------------------- Resident Note---------------------------------------------- HPI:  Pt is a 74 y.o. year old female with a history of AFib, multiple myeloma, on Eliquis presenting with chief complaint of mechanical fall. Symptoms began earlier this afternoon, patient was walking up the steps, tripped and fell hitting her forehead.  There is no loss of consciousness, no peri event amnesia, patient denies any nausea or vomiting currently.  Patient denies any shortness of breath or chest pain. Physical Exam: BP (!) 162/66  | Pulse 68  | Temp 98.3 F (36.8 C) (Oral)  | Resp 18  | SpO2 97%  On initial exam well-appearing 74-year-old female, 1 centimeter abrasion to right of midline on forehead.  Secondary survey otherwise unremarkable. Patient denies any midline cervical thoracic or lumbar tenderness. Physical exam otherwise stated below  MDM/Plan: Mechanical fall, patient on Eliquis.  Will evaluate with basic labs including CBC CMP.  PT INR.  CT head and neck

In [57]:
for deid, notes_jsonl in zip(deided, notes_jsonl_sample):
    notes_jsonl['deid_text'] = deid

In [61]:
for x in notes_jsonl_sample:
    x['PAT_ENC_CSN_ID'] = x['meta']['PAT_ENC_CSN_ID']
    x['person_id'] = x['meta']['person_id']
    x['note_id'] = x['meta']['note_id']
# pd.DataFrame.from_records(notes_jsonl_sample)

In [65]:
notes.shape

(557256, 5)

In [67]:
len(notes_jsonl_sample)

# 12477

2000

In [73]:
pd.DataFrame.from_records(notes_jsonl_sample).shape

(2000, 7)

In [77]:
len(notes), notes['PAT_ENC_CSN_ID'].nunique()

(557256, 157433)

In [83]:
out = pd.DataFrame.from_records(notes_jsonl_sample)#; out.reset_index()

In [85]:
notes = notes.reset_index()

Unnamed: 0,PAT_ENC_CSN_ID,visit_occurrence_id,person_id,Type,TEXT
0,192405489,146031160,40675941,ED Notes,"9:37 PM Pt has ready bed, able to go to floor ..."
1,185618958,121245233,65365841,ED Provider Notes,\r\n\r\n\r\nHistory\r\nChief Complaint \r\nPat...
2,193465848,147453891,59780681,ED Provider Notes,\r\nHistory\r\nChief Complaint \r\nPatient pre...
3,192242419,145825331,39382730,ED Notes,7:10 AM \r\nAccepted team report with RN Hamil...
4,203621361,175809910,13249654,ED Notes,Floor Handoff \r\n\r\nAdmission Dx: SOB\r\n\r\...
...,...,...,...,...,...
557251,207843445,180785075,11011604,ED Notes,7:12 PM \r\nReport Received from Trish RN\r\nP...
557252,195204993,150345206,63163397,ED Notes,2:40 PM \r\nChief Complaint \r\nPatient presen...
557253,200298402,160096898,53072874,ED Psychiatric Eval Note,Yale New Haven Hospital-Ysc\r\nYALE NEW HAVEN ...
557254,205946884,178903143,13972664,ED Provider Notes,\r\nHistory\r\nChief Complaint \r\nPatient pre...


In [87]:
notes = notes.reset_index()

In [95]:
out.merge(notes, left_on="note_id", right_on="index", how="inner").drop(['PAT_ENC_CSN_ID_y', 
                                                                         'person_id_y', 'meta', 'spans',
                                                                         'index', 'TEXT'], axis=1).rename({"PAT_ENC_CSN_ID_x": "PAT_ENC_CSN_ID",
                                                                                                          "person_id_x":"person_id"}, axis=1).to_csv("/home/vs428/project/Incarceration_Data/notes_2000subset_deid.csv", index=False)


In [18]:
from typing import List, Any, Callable, Tuple, Union
import re 
import difflib 

Token = str
TokenList = List[Token]

In [19]:
whitespace = re.compile('\s+')
end_sentence = re.compile('[.!?]\s+')

def tokenize(s:str) -> TokenList:
    '''Split a string into tokens'''
    return whitespace.split(s)

def untokenize(ts:TokenList) -> str:
    '''Join a list of tokens into a string'''
    return ' '.join(ts)

def sentencize(s:str) -> TokenList:
    '''Split a string into a list of sentences'''
    return end_sentence.split(s)

def unsentencise(ts:TokenList) -> str:
    '''Join a list of sentences into a string'''
    return '. '.join(ts)

def html_unsentencise(ts:TokenList) -> str:
    '''Joing a list of sentences into HTML for display'''
    return ''.join(f'<p>{t}</p>' for t in ts)

In [20]:
def mark_text(text:str) -> str:
    return f'<span style="color: red;">{text}</span>'
    
def mark_span(text:TokenList) -> TokenList:
    return [mark_text(token) for token in text]

In [21]:
def mark_span(text:TokenList) -> TokenList:
    if len(text) > 0:
        text[0] = '<span style="background: #69E2FB;">' + text[0]
        text[-1] += '</span>'
    return text

In [22]:
def markup_diff(a:TokenList, b:TokenList,
                mark:Callable[TokenList, TokenList]=mark_span,
                default_mark: Callable[TokenList, TokenList] = lambda x: x,
                isjunk:Union[None, Callable[[Token], bool]]=None) -> Tuple[TokenList, TokenList]:
    """Returns a and b with any differences processed by mark

    Junk is ignored by the differ
    """
    seqmatcher = difflib.SequenceMatcher(isjunk=isjunk, a=a, b=b, autojunk=False)
    out_a, out_b = [], []
    for tag, a0, a1, b0, b1 in seqmatcher.get_opcodes():
        markup = default_mark if tag == 'equal' else mark
        out_a += markup(a[a0:a1])
        out_b += markup(b[b0:b1])
    assert len(out_a) == len(a)
    assert len(out_b) == len(b)
    return out_a, out_b

In [23]:
def align_seqs(a: TokenList, b: TokenList, fill:Token='') -> Tuple[TokenList, TokenList]:
    out_a, out_b = [], []
    seqmatcher = difflib.SequenceMatcher(a=a, b=b, autojunk=False)
    for tag, a0, a1, b0, b1 in seqmatcher.get_opcodes():
        delta = (a1 - a0) - (b1 - b0)
        out_a += a[a0:a1] + [fill] * max(-delta, 0)
        out_b += b[b0:b1] + [fill] * max(delta, 0)
    assert len(out_a) == len(out_b)
    return out_a, out_b

In [24]:
from itertools import zip_longest
def html_sidebyside(a, b):
    # Set the panel display
    out = '<div style="display: grid;grid-template-columns: 1fr 1fr;grid-gap: 20px;">'
    # There's some CSS in Jupyter notebooks that makes the first pair unalign. This is a workaround
    out += '<p></p><p></p>'
    for left, right in zip_longest(a, b, fillvalue=''):
        out += f'<p>{left}</p>'
        out += f'<p>{right}</p>'
        out += '</div>'
    return out

In [25]:
import html
def html_diffs(a, b):
    a = html.escape(a)
    b = html.escape(b)

    out_a, out_b = [], []
    for sent_a, sent_b in zip(*align_seqs(sentencize(a), sentencize(b))):
        mark_a, mark_b = markup_diff(tokenize(sent_a), tokenize(sent_b))
        out_a.append(untokenize(mark_a))
        out_b.append(untokenize(mark_b))

    return html_sidebyside(out_a, out_b)

In [26]:
from IPython.display import HTML, display
def show_diffs(a, b):
    display(HTML(html_diffs(a,b)))
    
def save_diffs(a, b, fname):
    html = HTML(html_diffs(a,b)).data
    with open(fname, 'w') as f:
        f.write(html)

In [27]:
# print(raw[0]), print(test[0])

In [28]:
# show_diffs(raw[0], test[0])

for idx, (x, y) in enumerate(zip(raw, deided)):
    save_diffs(x, y, fname=f"/home/vs428/project/Incarceration_Data/other_algo_test_diff_{idx}.html")

# Convert to Binary Classification

In [2]:
training_data = []

# with jsonlines.open("/home/vs428/project/Incarceration_Data/incarceration_status_initial.jsonl") as reader:
with jsonlines.open("/home/vs428/project/Incarceration_Data/incarceration_status_v2_v2.jsonl") as reader:
    for line in reader:
        training_data.append(line)

In [3]:
training_data[0].keys()

dict_keys(['text', '_input_hash', '_task_hash', 'options', '_view_id', 'config', 'accept', 'answer', '_timestamp', '_annotator_id', '_session_id'])

In [4]:
label = "Prior_History_Incarceration"
bin_training_data = []

for eg in training_data:
    example = eg.copy()
    accepted = example['accept']
    # dictionary of all labels – if label is in accepted list, value is
    # set to True, otherwise it's set to False
    if label in accepted:
        example['label'] = label
        example['answer'] = "accept"
    else:
        example['label'] = label
        example['answer'] = "reject"
        
    example['_view_id'] = "classification"
    example['view_id'] = "classification"    
    
    example.pop('accept', None)
    example.pop('options', None)
    example.pop('config', None)
    
    bin_training_data.append(example)
        
    # cats = {label: label in accepted for label in labels}
    # training_data.append((text, {'cats': cats}))

In [98]:
# training_data[1], bin_training_data[2]

In [5]:
training_data = []

with jsonlines.open("/home/vs428/project/Incarceration_Data/incarceration_status_v2_v2_binary.jsonl", "w") as writer:
    writer.write_all(bin_training_data)

# Done

# Check Doc Length

In [34]:
import spacy
spacy.require_gpu()
import jsonlines
from collections import Counter
import pandas as pd
import numpy as np

In [2]:
nlp = spacy.load("./incarceration_model_binary_trf_v2/model-best/", exclude="tagger,parser,attribute_ruler,lemmatizer,ner")
                                                                             
# nlp = spacy.load('/home/vs428/Documents/Moore/followup_model_v3/model-best', exclude="tagger,parser,attribute_ruler,lemmatizer,ner")
#nlp_orig = spacy.load("en_core_web_trf")
#nlp.add_pipe("parser", source=nlp_orig, after="transformer")
#nlp.add_pipe("tagger", source=nlp_orig, after="parser")
#nlp.add_pipe("attribute_ruler", source=nlp_orig, after="tagger")
#nlp.add_pipe("lemmatizer", source=nlp_orig, after="attribute_ruler")
#nlp.add_pipe("ner", source=nlp_orig, after="lemmatizer")

In [18]:
anns = []
with jsonlines.open("/home/vs428/project/Incarceration_Data/incarceration_status_v2_v2_binary.jsonl") as reader:
    for line in reader:
        anns.append(line)

In [19]:
texts = [ann['text'] for ann in anns]

In [20]:
Counter([ann['answer'] for ann in anns])

Counter({'accept': 484, 'reject': 516})

In [40]:
# taken from https://stackoverflow.com/a/44764557/1726404
'''
This works by using nlp.pipe and putting our records into tuples. We process it as tuples and get the context
In our work, the context is just the study id. 
We get the entity text, label, start and stop characters for each entity
we convert that to a json string, we then put the [context,json] together into a list
append this list to nlp_out
then turn nlp out into a df with 1 col being study id and the other being the nlp out
Finally we merge the df with our main data df. Now we have a column with the text
'''
import json
nlp_out = []
docs = []
count = 0
for doc in nlp.pipe(texts, batch_size=100, n_process=1):
    docs.append(doc)
    out_ = doc.cats
    nlp_out.append([count, json.dumps(out_, indent = 2)])
    if count % 50 == 0:
        print(count)
    count +=1
nlp_df = pd.DataFrame(nlp_out, columns=['study_id', 'NLP_OUT'])

0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950


In [48]:
Counter([len(doc) < 512 for doc in docs])

Counter({False: 682, True: 318})