In [1]:
import spacy
import jsonlines
from spacy.tokens import DocBin
from collections import defaultdict

### General Evaluation

In [2]:
# load the inputs

# Build {note ID as a tuple: note text}. The ID is converted to a tuple because
# the JSON value is used as a dictionary key here.
notes = {}
with jsonlines.open('../Data/General/Input/notes-input.jsonl', 'r') as reader:
    notes.update((tuple(record['ID']), record['note']) for record in reader)

In [3]:
# load the model

# Name of the spaCy pipeline used for the evaluation.
PIPELINE_NAME = 'en_core_web_trf'

# Request the GPU (if one is available) in the same cell that loads the
# pipeline, as spaCy's documentation recommends for notebooks.
spacy.prefer_gpu()
nlp = spacy.load(PIPELINE_NAME)

In [5]:
# make predictions

# outputs maps a note ID to {(start_char, end_char): entity text} for every
# entity labelled PERSON; a note without such an entity gets no entry.
outputs = defaultdict(dict)
for count, ID in enumerate(notes):
    persons = {
        (ent.start_char, ent.end_char): ent.text
        for ent in nlp(notes[ID]).ents
        if ent.label_ == 'PERSON'
    }
    if persons:
        outputs[ID].update(persons)

    # progress message every 1000 notes (count is zero-based)
    if count % 1000 == 0:
        print(f'Finish Processing Note {count}')

Finish Processing Note 0
Finish Processing Note 1000
Finish Processing Note 2000
Finish Processing Note 3000
Finish Processing Note 4000
Finish Processing Note 5000
Finish Processing Note 6000
Finish Processing Note 7000
Finish Processing Note 8000
Finish Processing Note 9000
Finish Processing Note 10000
Finish Processing Note 11000
Finish Processing Note 12000
Finish Processing Note 13000
Finish Processing Note 14000
Finish Processing Note 15000


In [6]:
# save the outputs

# Write one JSON line per predicted PERSON span: the note ID, the
# (start_char, end_char) pair as 'position', and the entity text wrapped in a
# one-element list as 'name'.
with jsonlines.open('../Data/General/Output/notes-spaCy.jsonl', 'w') as writer:
    writer.write_all([
        {'ID': list(ID), 'position': list(position), 'name': [name]}
        for ID, preds in outputs.items()
        for position, name in preds.items()
    ])

### Polysemy Evaluation

In [2]:
# load the inputs

# Build {note ID as a tuple: note text}. The ID is converted to a tuple because
# the JSON value is used as a dictionary key here.
notes = {}
with jsonlines.open('../Data/Polysemy/Input/polysemies-input.jsonl', 'r') as reader:
    notes.update((tuple(record['ID']), record['note']) for record in reader)

In [3]:
# load the model

# Name of the spaCy pipeline used for the evaluation.
PIPELINE_NAME = 'en_core_web_trf'

# Request the GPU (if one is available) in the same cell that loads the
# pipeline, as spaCy's documentation recommends for notebooks.
spacy.prefer_gpu()
nlp = spacy.load(PIPELINE_NAME)

In [4]:
# make predictions

# outputs maps a note ID to {(start_char, end_char): entity text} for every
# entity labelled PERSON; a note without such an entity gets no entry.
outputs = defaultdict(dict)
for count, ID in enumerate(notes):
    persons = {
        (ent.start_char, ent.end_char): ent.text
        for ent in nlp(notes[ID]).ents
        if ent.label_ == 'PERSON'
    }
    if persons:
        outputs[ID].update(persons)

    # progress message every 100 notes (count is zero-based)
    if count % 100 == 0:
        print(f'Finish Processing Note {count}')

Finish Processing Note 0
Finish Processing Note 100
Finish Processing Note 200


In [5]:
# save the outputs

# Write one JSON line per predicted PERSON span: the note ID, the
# (start_char, end_char) pair as 'position', and the entity text wrapped in a
# one-element list as 'name'.
with jsonlines.open('../Data/Polysemy/Output/polysemies-spaCy.jsonl', 'w') as writer:
    writer.write_all(
        {'ID': list(ID), 'position': list(position), 'name': [name]}
        for ID, preds in outputs.items()
        for position, name in preds.items()
    )

### Finetuning Evaluation

In [2]:
# convert the inputs

# Turn each inputs/labels JSONL pair into spaCy DocBin files (one per split)
# that can be passed to `spacy train`.
#
# NOTE(review): this cell writes the .spacy files to
# External/spaCy/Data/Finetune/, while the commented-out training command in
# this notebook reads them from External/spaCy/Data/Finetune/Input/ -- confirm
# which location is intended.

# The blank English pipeline is only used for tokenization via make_doc, so a
# single instance is shared by every context/name combination.
nlp = spacy.blank('en')

for type_context in ['general', 'clinical']:
    for type_name in ['popular', 'diverse']:

        # note ID (as a tuple) -> note text
        notes = {}
        with jsonlines.open(f'../Data/Finetune/Input/inputs-{type_context}+{type_name}.jsonl', 'r') as reader:
            for line in reader:
                notes[tuple(line['ID'])] = line['note']

        # note ID -> {position: name}
        labels = defaultdict(dict)
        with jsonlines.open(f'../Data/Finetune/Input/labels-{type_context}+{type_name}.jsonl', 'r') as reader:
            for line in reader:
                # Relies on every record holding exactly three values in the
                # order ID, position, name -- TODO confirm against the label files.
                ID, position, name = map(tuple, line.values())
                labels[ID][position] = name

        dbs = {split: DocBin() for split in ['train', 'dev']}
        skipped = 0
        for ID, note in notes.items():
            doc = nlp.make_doc(note)
            ents = []
            for position in labels[ID]:
                # char_span returns None when the character offsets do not
                # line up with token boundaries; such labels cannot be used.
                span = doc.char_span(position[0], position[1], label='PERSON')
                if span is None:
                    skipped += 1
                else:
                    ents.append(span)
            doc.ents = ents
            # The first component of the ID selects the DocBin, so it has to be
            # 'train' or 'dev'.
            dbs[ID[0]].add(doc)

        # Report dropped gold labels instead of discarding them silently.
        if skipped:
            print(f'Skipped {skipped} label(s) not aligned with token boundaries in {type_context}+{type_name}')

        for split, db in dbs.items():
            db.to_disk(f'External/spaCy/Data/Finetune/{split}-{type_context}+{type_name}.spacy')

In [None]:
# train the model (shell command, kept commented out); the example below is the
# run whose output directory is Model/0/general+popular/
# !python -m spacy train External/spaCy/Model/config.cfg \
#  --output External/spaCy/Model/0/general+popular/ \
#  --paths.train External/spaCy/Data/Finetune/Input/train-general+popular.spacy \
#  --paths.dev External/spaCy/Data/Finetune/Input/dev-general+popular.spacy \
#  --gpu-id 0

In [2]:
# load the test inputs

# Build {note ID as a tuple: note text}. The ID is converted to a tuple because
# the JSON value is used as a dictionary key here.
notes = {}
with jsonlines.open('../Data/Finetune/Input/inputs-test.jsonl', 'r') as reader:
    notes.update((tuple(record['ID']), record['note']) for record in reader)

In [3]:
# load the model, make predictions, save the outputs

spacy.prefer_gpu()
def test(type_context, type_name, seed):
    """Run one finetuned pipeline over the test notes and save its PERSON predictions.

    Loads the pipeline stored under
    ``External/spaCy/Model/{seed}/{type_context}+{type_name}/model-best``,
    applies it to every note in the notebook-level ``notes`` dict and writes
    one JSON line per predicted PERSON span to
    ``../Data/Finetune/Output/finetunes-{type_context}+{type_name}-spaCy-{seed}.jsonl``.

    Args:
        type_context: context part of the model directory and output file name.
        type_name: name part of the model directory and output file name.
        seed: seed part of the model directory and output file name.

    Returns:
        None. The predictions are written to disk.
    """
    # spaCy's documentation advises calling prefer_gpu() in the same notebook
    # cell as spacy.load(); this function is invoked from a different cell than
    # the one that defines it, so the request is repeated right before loading.
    spacy.prefer_gpu()
    nlp = spacy.load(f'External/spaCy/Model/{seed}/{type_context}+{type_name}/model-best')

    # note ID -> {(start_char, end_char): entity text} for PERSON entities only
    outputs = defaultdict(dict)
    for ID, note in notes.items():
        doc = nlp(note)
        for ent in doc.ents:
            if ent.label_ == 'PERSON':
                outputs[ID][(ent.start_char, ent.end_char)] = ent.text

    with jsonlines.open(f'../Data/Finetune/Output/finetunes-{type_context}+{type_name}-spaCy-{seed}.jsonl', 'w') as writer:
        writer.write_all([
            {'ID': list(ID), 'position': list(position), 'name': [name]}
            for ID, preds in outputs.items()
            for position, name in preds.items()
        ])

In [6]:
# Evaluate every (seed, context, name) combination, seeds outermost, in the
# same order as three nested loops would visit them.
runs = [
    (seed, type_context, type_name)
    for seed in [0, 1, 2, 3, 4]
    for type_context in ['general', 'clinical']
    for type_name in ['popular', 'diverse']
]
for seed, type_context, type_name in runs:
    test(type_context, type_name, seed)
    print(f'Finish Evaluating Seed = {seed}, Context = {type_context}, Name = {type_name}')

Finish Evaluating Seed = 0, Context = general, Name = popular
Finish Evaluating Seed = 0, Context = general, Name = diverse
Finish Evaluating Seed = 0, Context = clinical, Name = popular
Finish Evaluating Seed = 0, Context = clinical, Name = diverse
Finish Evaluating Seed = 1, Context = general, Name = popular
Finish Evaluating Seed = 1, Context = general, Name = diverse
Finish Evaluating Seed = 1, Context = clinical, Name = popular
Finish Evaluating Seed = 1, Context = clinical, Name = diverse
Finish Evaluating Seed = 2, Context = general, Name = popular
Finish Evaluating Seed = 2, Context = general, Name = diverse
Finish Evaluating Seed = 2, Context = clinical, Name = popular
Finish Evaluating Seed = 2, Context = clinical, Name = diverse
Finish Evaluating Seed = 3, Context = general, Name = popular
Finish Evaluating Seed = 3, Context = general, Name = diverse
Finish Evaluating Seed = 3, Context = clinical, Name = popular
Finish Evaluating Seed = 3, Context = clinical, Name = diverse
