In [1]:
import os
import json
import jsonlines
from collections import defaultdict

### General Evaluation

In [2]:
# load the inputs
#
# Reads every record of the General notes file, keeps the note text in `notes`
# (keyed by the record's ID as a tuple), and writes one raw-text file per note
# into the directory that MATEngine later reads via --input_dir.

mist_input_dir = 'External/MIST/Data/General/Input'
os.makedirs(mist_input_dir, exist_ok=True)  # a fresh checkout may not contain this directory yet

notes = {}
with jsonlines.open('../Data/General/Input/notes-input.jsonl', 'r') as reader:
    for line in reader:
        notes[tuple(line['ID'])] = line['note']
        # File name is the ID components joined with '_' plus '.txt'.
        note_path = os.path.join(mist_input_dir, '_'.join(map(str, line['ID'])) + '.txt')
        # Write UTF-8 explicitly: MATEngine is invoked with --input_encoding utf-8,
        # while open() would otherwise fall back to the platform's default encoding.
        with open(note_path, 'w', encoding='utf-8') as file:
            file.write(line['note'])

In [None]:
# load the model, make predictions
#
# Four shell commands run in order; each one consumes what the previous one wrote.

# Step 1: run MIST's split_AMIA_file.py (Python 2) on the training XML, passing
# External/MIST/Data/i2b2_2006 as the second path argument.
# NOTE(review): presumably this splits the single XML into per-document files
# under i2b2_2006 -- confirm against the MIST documentation.
!python2 External/MIST/Model/src/tasks/AMIA/utils/split_AMIA_file.py \
--extend_dates \
--promote_type_attr External/MIST/Data/deid_surrogate_train_all_version2.xml External/MIST/Data/i2b2_2006

# Step 2: run MATEngine over the *.xml files in i2b2_2006 (xml-inline input) with the
# "zone and align" step and write the results as mat-json files into i2b2_2006_json.
!External/MIST/Model/src/MAT/bin/MATEngine \
--task "AMIA Deidentification" \
--input_dir External/MIST/Data/i2b2_2006 \
--input_file_re ".*[.]xml" \
--input_file_type xml-inline \
--workflow "Process tagged untokenized docs" \
--steps "zone and align" \
--output_dir External/MIST/Data/i2b2_2006_json \
--output_file_type mat-json \
--output_fsuff ".json"

# Step 3: build a model from the mat-json files produced in step 2 and save it as the
# task's default model (--save_as_default_model).
!External/MIST/Model/src/MAT/bin/MATModelBuilder \
--task 'AMIA Deidentification' \
--input_files 'External/MIST/Data/i2b2_2006_json/*.json' \
--file_type mat-json \
--save_as_default_model

# Step 4: run the 'zone,tag' steps of the Demo workflow over the raw UTF-8 note files in
# General/Input and write one mat-json file per note (input name + ".json") to General/Output.
!External/MIST/Model/src/MAT/bin/MATEngine \
--task 'AMIA Deidentification' \
--workflow Demo \
--steps 'zone,tag' \
--input_dir External/MIST/Data/General/Input \
--input_file_type raw \
--input_encoding utf-8 \
--output_dir External/MIST/Data/General/Output \
--output_fsuff ".json" \
--output_file_type mat-json \
--output_encoding utf-8 \
--tagger_local

In [5]:
# load predictions
#
# Builds `outputs`: {note ID tuple: {(start, end): tagged text}} from the mat-json
# files MATEngine wrote, keeping only DOCTOR and PATIENT annotations.

prediction_dir = 'External/MIST/Data/General/Output'
# Each prediction file is named "<ID>.txt" (the input file) plus the ".json" output suffix.
prediction_suffix = '.txt.json'

# Ignore anything that is not a prediction file (e.g. hidden OS files), which would
# otherwise crash the int() parsing below. Sort because os.listdir order is arbitrary
# and the insertion order of `outputs` determines the order of the saved records.
prediction_files = sorted(
    entry for entry in os.listdir(prediction_dir) if entry.endswith(prediction_suffix)
)

outputs = defaultdict(dict)
for count, filename in enumerate(prediction_files):

    # Recover the ID tuple from the '_'-joined integers in the file name.
    ID = tuple(map(int, filename[:-len(prediction_suffix)].split('_')))
    # Context manager closes each file promptly; MATEngine wrote the files as UTF-8.
    with open(os.path.join(prediction_dir, filename), 'r', encoding='utf-8') as file:
        output = json.load(file)

    for ent_type in output['asets']:
        if ent_type['type'] in {'DOCTOR', 'PATIENT'}:
            for ent in ent_type['annots']:
                # ent[0] and ent[1] are used as start/end offsets into the document text ('signal').
                outputs[ID][(ent[0], ent[1])] = output['signal'][ent[0]:ent[1]]

    if count % 1000 == 0: print(f'Finish Processing Note {count}')

Finish Processing Note 0
Finish Processing Note 1000
Finish Processing Note 2000
Finish Processing Note 3000
Finish Processing Note 4000
Finish Processing Note 5000
Finish Processing Note 6000
Finish Processing Note 7000
Finish Processing Note 8000
Finish Processing Note 9000
Finish Processing Note 10000
Finish Processing Note 11000
Finish Processing Note 12000
Finish Processing Note 13000
Finish Processing Note 14000
Finish Processing Note 15000


In [6]:
# save the outputs
#
# Flatten the nested predictions into one record per tagged span:
# {'ID': [...], 'position': [start, end], 'name': [text]}.

records = []
for note_id, spans in outputs.items():
    for span, text in spans.items():
        records.append({'ID': list(note_id), 'position': list(span), 'name': [text]})

with jsonlines.open('../Data/General/Output/notes-MIST.jsonl', 'w') as writer:
    writer.write_all(records)

### Polysemy Evaluation

In [2]:
# load the inputs
#
# Reads every record of the Polysemy notes file, keeps the note text in `notes`
# (keyed by the record's ID as a tuple), and writes one raw-text file per note
# into the directory that MATEngine later reads via --input_dir.

mist_input_dir = 'External/MIST/Data/Polysemy/Input'
os.makedirs(mist_input_dir, exist_ok=True)  # a fresh checkout may not contain this directory yet

notes = {}
with jsonlines.open('../Data/Polysemy/Input/polysemies-input.jsonl', 'r') as reader:
    for line in reader:
        notes[tuple(line['ID'])] = line['note']
        # File name is the ID components joined with '_' plus '.txt'.
        note_path = os.path.join(mist_input_dir, '_'.join(map(str, line['ID'])) + '.txt')
        # Write UTF-8 explicitly: MATEngine is invoked with --input_encoding utf-8,
        # while open() would otherwise fall back to the platform's default encoding.
        with open(note_path, 'w', encoding='utf-8') as file:
            file.write(line['note'])

In [None]:
# load the model, make predictions
#
# Four shell commands run in order; each one consumes what the previous one wrote.
# The first three are identical to the commands in the General Evaluation section.
# NOTE(review): if the default model has already been built there, re-running
# steps 1-3 may be unnecessary -- confirm before skipping.

# Step 1: run MIST's split_AMIA_file.py (Python 2) on the training XML, passing
# External/MIST/Data/i2b2_2006 as the second path argument.
# NOTE(review): presumably this splits the single XML into per-document files
# under i2b2_2006 -- confirm against the MIST documentation.
!python2 External/MIST/Model/src/tasks/AMIA/utils/split_AMIA_file.py \
--extend_dates \
--promote_type_attr External/MIST/Data/deid_surrogate_train_all_version2.xml External/MIST/Data/i2b2_2006

# Step 2: run MATEngine over the *.xml files in i2b2_2006 (xml-inline input) with the
# "zone and align" step and write the results as mat-json files into i2b2_2006_json.
!External/MIST/Model/src/MAT/bin/MATEngine \
--task "AMIA Deidentification" \
--input_dir External/MIST/Data/i2b2_2006 \
--input_file_re ".*[.]xml" \
--input_file_type xml-inline \
--workflow "Process tagged untokenized docs" \
--steps "zone and align" \
--output_dir External/MIST/Data/i2b2_2006_json \
--output_file_type mat-json \
--output_fsuff ".json"

# Step 3: build a model from the mat-json files produced in step 2 and save it as the
# task's default model (--save_as_default_model).
!External/MIST/Model/src/MAT/bin/MATModelBuilder \
--task 'AMIA Deidentification' \
--input_files 'External/MIST/Data/i2b2_2006_json/*.json' \
--file_type mat-json \
--save_as_default_model

# Step 4: run the 'zone,tag' steps of the Demo workflow over the raw UTF-8 note files in
# Polysemy/Input and write one mat-json file per note (input name + ".json") to Polysemy/Output.
!External/MIST/Model/src/MAT/bin/MATEngine \
--task 'AMIA Deidentification' \
--workflow Demo \
--steps 'zone,tag' \
--input_dir External/MIST/Data/Polysemy/Input \
--input_file_type raw \
--input_encoding utf-8 \
--output_dir External/MIST/Data/Polysemy/Output \
--output_fsuff ".json" \
--output_file_type mat-json \
--output_encoding utf-8 \
--tagger_local

In [7]:
# load predictions
#
# Builds `outputs`: {note ID tuple: {(start, end): tagged text}} from the mat-json
# files MATEngine wrote, keeping only DOCTOR and PATIENT annotations.

prediction_dir = 'External/MIST/Data/Polysemy/Output'
# Each prediction file is named "<ID>.txt" (the input file) plus the ".json" output suffix.
prediction_suffix = '.txt.json'

# Ignore anything that is not a prediction file (e.g. hidden OS files), which would
# otherwise crash the int() parsing below. Sort because os.listdir order is arbitrary
# and the insertion order of `outputs` determines the order of the saved records.
prediction_files = sorted(
    entry for entry in os.listdir(prediction_dir) if entry.endswith(prediction_suffix)
)

outputs = defaultdict(dict)
for count, filename in enumerate(prediction_files):

    # Recover the ID tuple from the '_'-joined integers in the file name.
    ID = tuple(map(int, filename[:-len(prediction_suffix)].split('_')))
    # Context manager closes each file promptly; MATEngine wrote the files as UTF-8.
    with open(os.path.join(prediction_dir, filename), 'r', encoding='utf-8') as file:
        output = json.load(file)

    for ent_type in output['asets']:
        if ent_type['type'] in {'DOCTOR', 'PATIENT'}:
            for ent in ent_type['annots']:
                # ent[0] and ent[1] are used as start/end offsets into the document text ('signal').
                outputs[ID][(ent[0], ent[1])] = output['signal'][ent[0]:ent[1]]

    if count % 100 == 0: print(f'Finish Processing Note {count}')

Finish Processing Note 0
Finish Processing Note 100
Finish Processing Note 200


In [8]:
# save the outputs
#
# Flatten the nested predictions into one record per tagged span:
# {'ID': [...], 'position': [start, end], 'name': [text]}.

records = []
for note_id, spans in outputs.items():
    for span, text in spans.items():
        records.append({'ID': list(note_id), 'position': list(span), 'name': [text]})

with jsonlines.open('../Data/Polysemy/Output/polysemies-MIST.jsonl', 'w') as writer:
    writer.write_all(records)