In [1]:
import codecs
import random
import jsonlines
import tensorflow as tf
from pathlib import Path
from IPython.utils import io
from neuroner import neuromodel
from collections import defaultdict

# Use one fixed seed for both Python's `random` module and TensorFlow's
# (v1-compat) random seed; the two calls are independent of each other.
seed = 0
tf.compat.v1.random.set_random_seed(seed)
random.seed(seed)

### General Evaluation

In [2]:
# load the inputs

# Every note is kept in memory (keyed by its ID as a tuple) and mirrored into
# the NeuroNER deploy folder as "<ID parts joined by '_'>.txt", together with an
# empty ".ann" file of the same stem.
deploy_dir = 'External/NeuroNER/Data/General/deploy/'
# Create the folder up front, as prepare() does for the finetuning data.
Path(deploy_dir).mkdir(parents=True, exist_ok=True)

notes = {}
with jsonlines.open('../Data/General/Input/notes-input.jsonl', 'r') as reader:
    for line in reader:
        notes[tuple(line['ID'])] = line['note']
        stem = '_'.join(map(str, line['ID']))
        # Write explicit UTF-8 via codecs.open (the convention used by prepare()):
        # the built-in open() would fall back to the platform's locale encoding
        # and translate newlines on Windows.
        with codecs.open(deploy_dir + stem + '.txt', 'w', 'utf-8') as file:
            file.write(line['note'])
        # Opening in 'w' mode creates (or truncates to) an empty annotation file.
        with codecs.open(deploy_dir + stem + '.ann', 'w', 'utf-8') as file:
            pass

In [None]:
# load the model

# NOTE(review): the disabled call below presumably downloads the pretrained
# model; it is kept commented out, as in the original cell - confirm before use.
# neuromodel.fetch_model('i2b2_2014_glove_spacy_bioes')
nn = neuromodel.NeuroNER(
    # reuse the pretrained model as-is, without training
    use_pretrained_model=True,
    train_model=False,
    pretrained_model_folder='External/NeuroNER/Model/original+original/',
    parameters_filepath='External/NeuroNER/Model/original+original/parameters.ini',
    token_pretrained_embedding_filepath='External/NeuroNER/Data/Embedding/glove.6B.100d.txt',
    # the dataset folder doubles as the output folder
    dataset_text_folder='External/NeuroNER/Data/General',
    output_folder='External/NeuroNER/Data/General',
)

In [4]:
# make predictions

# outputs maps note ID -> {(start, end): text} for the DOCTOR / PATIENT
# entities returned by the model; notes without such entities get no entry.
outputs = defaultdict(dict)
for count, (ID, note) in enumerate(notes.items()):

    # capture whatever is printed while the model predicts
    with io.capture_output() as captured:
        ents = nn.predict(note)

    spans = {
        (ent['start'], ent['end']): ent['text']
        for ent in ents
        if ent['type'] in {'DOCTOR', 'PATIENT'}
    }
    if spans:
        outputs[ID].update(spans)

    # progress message every 1000 notes
    if not count % 1000:
        print(f'Finish Processing Note {count}')

Finish Processing Note 0
Finish Processing Note 1000
Finish Processing Note 2000
Finish Processing Note 3000
Finish Processing Note 4000
Finish Processing Note 5000
Finish Processing Note 6000
Finish Processing Note 7000
Finish Processing Note 8000
Finish Processing Note 9000
Finish Processing Note 10000
Finish Processing Note 11000
Finish Processing Note 12000
Finish Processing Note 13000
Finish Processing Note 14000
Finish Processing Note 15000


In [5]:
# save the outputs

# One JSON line per predicted mention: the note ID and the (start, end)
# position as lists, and the predicted text wrapped in a one-element list.
with jsonlines.open('../Data/General/Output/notes-NeuroNER.jsonl', 'w') as writer:
    records = []
    for ID, preds in outputs.items():
        for position, name in preds.items():
            records.append({'ID': list(ID), 'position': list(position), 'name': [name]})
    writer.write_all(records)

### Polysemy Evaluation

In [2]:
# load the inputs

# Every note is kept in memory (keyed by its ID as a tuple) and mirrored into
# the NeuroNER deploy folder as "<ID parts joined by '_'>.txt", together with an
# empty ".ann" file of the same stem.
deploy_dir = 'External/NeuroNER/Data/Polysemy/deploy/'
# Create the folder up front, as prepare() does for the finetuning data.
Path(deploy_dir).mkdir(parents=True, exist_ok=True)

notes = {}
with jsonlines.open('../Data/Polysemy/Input/polysemies-input.jsonl', 'r') as reader:
    for line in reader:
        notes[tuple(line['ID'])] = line['note']
        stem = '_'.join(map(str, line['ID']))
        # Write explicit UTF-8 via codecs.open (the convention used by prepare()):
        # the built-in open() would fall back to the platform's locale encoding
        # and translate newlines on Windows.
        with codecs.open(deploy_dir + stem + '.txt', 'w', 'utf-8') as file:
            file.write(line['note'])
        # Opening in 'w' mode creates (or truncates to) an empty annotation file.
        with codecs.open(deploy_dir + stem + '.ann', 'w', 'utf-8') as file:
            pass

In [None]:
# load the model

# NOTE(review): the disabled call below presumably downloads the pretrained
# model; it is kept commented out, as in the original cell - confirm before use.
# neuromodel.fetch_model('i2b2_2014_glove_spacy_bioes')
nn = neuromodel.NeuroNER(
    # reuse the pretrained model as-is, without training
    use_pretrained_model=True,
    train_model=False,
    pretrained_model_folder='External/NeuroNER/Model/original+original',
    parameters_filepath='External/NeuroNER/Model/original+original/parameters.ini',
    token_pretrained_embedding_filepath='External/NeuroNER/Data/Embedding/glove.6B.100d.txt',
    # the dataset folder doubles as the output folder
    dataset_text_folder='External/NeuroNER/Data/Polysemy',
    output_folder='External/NeuroNER/Data/Polysemy',
)

In [4]:
# make predictions

# outputs maps note ID -> {(start, end): text} for the DOCTOR / PATIENT
# entities returned by the model; notes without such entities get no entry.
outputs = defaultdict(dict)
for count, (ID, note) in enumerate(notes.items()):

    # capture whatever is printed while the model predicts
    with io.capture_output() as captured:
        ents = nn.predict(note)

    spans = {
        (ent['start'], ent['end']): ent['text']
        for ent in ents
        if ent['type'] in {'DOCTOR', 'PATIENT'}
    }
    if spans:
        outputs[ID].update(spans)

    # progress message every 100 notes
    if not count % 100:
        print(f'Finish Processing Note {count}')

Finish Processing Note 0
Finish Processing Note 100
Finish Processing Note 200


In [5]:
# save the outputs

# One JSON line per predicted mention: the note ID and the (start, end)
# position as lists, and the predicted text wrapped in a one-element list.
with jsonlines.open('../Data/Polysemy/Output/polysemies-NeuroNER.jsonl', 'w') as writer:
    records = []
    for ID, preds in outputs.items():
        for position, name in preds.items():
            records.append({'ID': list(ID), 'position': list(position), 'name': [name]})
    writer.write_all(records)

### Finetuning Evaluation

In [2]:
# prepare the inputs

def prepare(seed, type_context, type_name):
    """Write the NeuroNER dataset folders for one finetuning run.

    Creates ``train``, ``valid`` and ``deploy`` folders under
    ``External/NeuroNER/Data/Finetune/{type_context}+{type_name}/{seed}/`` and
    fills them as follows:

    * each note in ``inputs-{type_context}+{type_name}.jsonl`` is written as
      ``{ID}.txt`` into ``train`` when its split is ``'train'`` and into
      ``valid`` for any other split;
    * the mentions in ``labels-{type_context}+{type_name}.jsonl`` are grouped
      per note and written to the matching ``{ID}.ann`` file, one
      tab-separated ``T<n>`` line per mention (this looks like the brat
      standoff format - confirm). The entity label of each mention is drawn
      at random from ``PATIENT`` / ``DOCTOR``;
    * each note in ``inputs-test.jsonl`` is written into ``deploy`` together
      with an empty ``.ann`` file.

    All files are written as UTF-8 through ``codecs.open`` so the result does
    not depend on the platform's default encoding or newline translation.

    Args:
        seed: Only used here as the name of the per-run folder; this function
            does not seed any random number generator itself.
        type_context: Context type, part of the input file and folder names.
        type_name: Name type, part of the input file and folder names.

    Returns:
        dict mapping ``tuple(ID)`` to the note text for every test note.
    """
    root = f'External/NeuroNER/Data/Finetune/{type_context}+{type_name}/{seed}'

    for split in ['train', 'valid', 'deploy']:
        Path(f'{root}/{split}').mkdir(parents=True, exist_ok=True)

    # Training / validation notes.
    with jsonlines.open(f'../Data/Finetune/Input/inputs-{type_context}+{type_name}.jsonl', 'r') as reader:
        for line in reader:
            split, ID = line['ID']
            # every split other than 'train' is stored in the 'valid' folder
            split = split if split == 'train' else 'valid'
            with codecs.open(f'{root}/{split}/{ID}.txt', 'w', 'utf-8') as file:
                file.write(line['note'])

    # Group the labelled mentions per note: labels[(split, ID)][position] = name.
    labels = defaultdict(dict)
    with jsonlines.open(f'../Data/Finetune/Input/labels-{type_context}+{type_name}.jsonl', 'r') as reader:
        for line in reader:
            # relies on every record having exactly three fields, in the
            # order ID, position, name
            ID, position, name = map(tuple, line.values())
            labels[ID][position] = name
    for ID, mentions in labels.items():
        split, ID = ID
        split = split if split == 'train' else 'valid'
        with codecs.open(f'{root}/{split}/{ID}.ann', 'w', 'utf-8') as file:
            for mentionID, (position, name) in enumerate(mentions.items()):
                # the label is picked at random from the two name types
                label = random.choice(['PATIENT', 'DOCTOR'])
                file.write(f'T{mentionID}\t{label} {position[0]} {position[1]}\t{name[0]}\n')

    # Test notes: returned to the caller and mirrored into the deploy folder.
    notes = {}
    with jsonlines.open('../Data/Finetune/Input/inputs-test.jsonl', 'r') as reader:
        for line in reader:
            notes[tuple(line['ID'])] = line['note']
            stem = '_'.join(map(str, line['ID']))
            # Same explicit UTF-8 writer as for train/valid; the built-in
            # open() would use the locale's default encoding instead.
            with codecs.open(f'{root}/deploy/{stem}.txt', 'w', 'utf-8') as file:
                file.write(line['note'])
            # 'w' mode creates (or truncates to) an empty annotation file
            with codecs.open(f'{root}/deploy/{stem}.ann', 'w', 'utf-8') as file:
                pass
    return notes

In [4]:
# finetuning

def finetune(seed, type_context, type_name):
    """Finetune NeuroNER for one configuration, predict on the test notes, save the result.

    Steps:
      1. seed Python's and TensorFlow's RNGs with ``seed`` and build the
         dataset folders with ``prepare``;
      2. load the pretrained model from ``External/NeuroNER/Model/`` and train
         it (``maximum_number_of_epochs=5``, ``patience=2``) on the prepared
         folder;
      3. predict on every test note, keeping the ``DOCTOR`` / ``PATIENT``
         entities;
      4. write one JSON line per predicted mention to
         ``../Data/Finetune/Output/finetunes-{type_context}+{type_name}-NeuroNER-{seed}.jsonl``.

    Args:
        seed: Random seed; also part of the dataset folder and output file names.
        type_context: Context type, part of the input/output names.
        type_name: Name type, part of the input/output names.

    Returns:
        None. Results are written to disk and progress is printed.
    """

    # load the inputs
    random.seed(seed)
    tf.compat.v1.random.set_random_seed(seed)
    notes = prepare(seed, type_context, type_name)
    print('Finish Loading the Inputs')

    # load and finetune the model
    nn = neuromodel.NeuroNER(
        train_model=True, use_pretrained_model=True,
        maximum_number_of_epochs=5, patience=2,
        parameters_filepath='External/NeuroNER/Model/parameters.ini',
        pretrained_model_folder='External/NeuroNER/Model/',
        token_pretrained_embedding_filepath='External/NeuroNER/Data/Embedding/glove.6B.100d.txt',
        dataset_text_folder=f'External/NeuroNER/Data/Finetune/{type_context}+{type_name}/{seed}/',
        output_folder=f'External/NeuroNER/Data/Finetune/{type_context}+{type_name}/{seed}/')
    try:
        nn.fit()
        print('Finish Finetuning the Model')

        # make predictions
        outputs = defaultdict(dict)
        for count, (ID, note) in enumerate(notes.items()):
            # swallow whatever is printed while the model predicts
            with io.capture_output():
                ents = nn.predict(note)
            for ent in ents:
                if ent['type'] in {'DOCTOR', 'PATIENT'}:
                    outputs[ID][(ent['start'], ent['end'])] = ent['text']
            if count % 100 == 0: print(f'Finish Processing Note {count}')
    finally:
        # This function is called many times in one kernel; close the model
        # (NeuroNER.close() shuts down its TensorFlow session) so that every
        # run releases its resources, even when training or prediction fails.
        nn.close()

    # save the outputs
    output_path = f'../Data/Finetune/Output/finetunes-{type_context}+{type_name}-NeuroNER-{seed}.jsonl'
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    with jsonlines.open(output_path, 'w') as writer:
        writer.write_all([{'ID':list(ID), 'position':list(position), 'name':[name]} for ID, preds in outputs.items() for position, name in preds.items()])
    print('Finish Saving the Predictions')

In [5]:
# Run every combination of 5 seeds x 2 context types x 2 name types, with the
# seed varying slowest and the name type fastest.
runs = [
    (seed, type_context, type_name)
    for seed in range(5)
    for type_context in ('general', 'clinical')
    for type_name in ('popular', 'diverse')
]
for seed, type_context, type_name in runs:
    print(f'Start seed = {seed} | Context = {type_context} | Name = {type_name}')
    finetune(seed, type_context, type_name)
    print()

Start seed = 0 | Context = general | Name = popular
Finish Loading the Inputs
Finish Finetuning the Model
Finish Saving the Predictions

Start seed = 0 | Context = general | Name = diverse
Finish Loading the Inputs
Finish Finetuning the Model
Finish Saving the Predictions

Start seed = 0 | Context = clinical | Name = popular
Finish Loading the Inputs
Finish Finetuning the Model
Finish Saving the Predictions

Start seed = 0 | Context = clinical | Name = diverse
Finish Loading the Inputs
Finish Finetuning the Model
Finish Saving the Predictions

Start seed = 1 | Context = general | Name = popular
Finish Loading the Inputs
Finish Finetuning the Model
Finish Saving the Predictions

Start seed = 1 | Context = general | Name = diverse
Finish Loading the Inputs
Finish Finetuning the Model
Finish Saving the Predictions

Start seed = 1 | Context = clinical | Name = popular
Finish Loading the Inputs
Finish Finetuning the Model
Finish Saving the Predictions

Start seed = 1 | Context = clinical | 