In [1]:
import jsonlines
from collections import defaultdict
from flair.models import SequenceTagger
from flair.tokenization import SegtokSentenceSplitter

### General Evaluation

In [2]:
# Read the general-evaluation notes into a dict.
# Each JSON line supplies an 'ID' (converted to a tuple so it can be used as a
# dict key) and the 'note' value stored under that key.

notes = {}
with jsonlines.open('../Data/General/Input/notes-input.jsonl', 'r') as reader:
    notes.update((tuple(record['ID']), record['note']) for record in reader)

In [3]:
# Instantiate the pretrained Flair tagger ("flair/ner-english-large") and the
# segtok-based sentence splitter that the prediction cell relies on.

tagger = SequenceTagger.load("flair/ner-english-large")
splitter = SegtokSentenceSplitter()

2022-12-29 11:50:03,413 loading file /data/healthy-ml/scratch/yuxin102/.flair/models/ner-english-large/07301f59bb8cb113803be316267f06ddf9243cdbba92a4c8067ef92442d2c574.554244d3476d97501a766a98078421817b14654496b86f2f7bd139dc502a4f29




2022-12-29 11:50:21,456 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>


In [5]:
# Tag every note and collect the spans whose 'ner' label value is 'PER'.
# Result layout: outputs[ID][(start, end)] = (span text, label score), where
# start/end are the span positions shifted by the sentence's start_pos
# (presumably offsets into the full note -- confirm against flair's splitter).

outputs = defaultdict(dict)
for idx, (ID, text) in enumerate(notes.items()):
    # Split the note into sentences, then tag them all in one predict call.
    sentences = splitter.split(text)
    tagger.predict(sentences)

    for sentence in sentences:
        for span in sentence.get_spans('ner'):
            label = span.get_label('ner')
            if label.value != 'PER':
                continue  # only person spans are kept

            offset = sentence.start_pos
            key = (span.start_position + offset, span.end_position + offset)
            outputs[ID][key] = (span.text, label.score)

    # Progress report every 1000 notes (including the very first one).
    if idx % 1000 == 0:
        print(f'Finish Processing Note {idx}')

Finish Processing Note 0
Finish Processing Note 1000
Finish Processing Note 2000
Finish Processing Note 3000
Finish Processing Note 4000
Finish Processing Note 5000
Finish Processing Note 6000
Finish Processing Note 7000
Finish Processing Note 8000
Finish Processing Note 9000
Finish Processing Note 10000
Finish Processing Note 11000
Finish Processing Note 12000
Finish Processing Note 13000
Finish Processing Note 14000
Finish Processing Note 15000


In [6]:
# Write one JSON line per collected span: the note ID, the (start, end)
# position pair and the (text, score) pair, each converted to a list.

with jsonlines.open('../Data/General/Output/notes-flair.jsonl', 'w') as writer:
    rows = [
        {'ID': list(note_id), 'position': list(span_pos), 'name': list(text_and_score)}
        for note_id, spans in outputs.items()
        for span_pos, text_and_score in spans.items()
    ]
    writer.write_all(rows)

### Polysemy Evaluation

In [2]:
# Read the polysemy-evaluation notes into a dict.
# Each JSON line supplies an 'ID' (converted to a tuple so it can be used as a
# dict key) and the 'note' value stored under that key.

notes = {}
with jsonlines.open('../Data/Polysemy/Input/polysemies-input.jsonl', 'r') as reader:
    notes.update((tuple(record['ID']), record['note']) for record in reader)

In [3]:
# Instantiate the pretrained Flair tagger ("flair/ner-english-large") and the
# segtok-based sentence splitter that the prediction cell relies on.

tagger = SequenceTagger.load("flair/ner-english-large")
splitter = SegtokSentenceSplitter()

2022-12-29 11:50:03,413 loading file /data/healthy-ml/scratch/yuxin102/.flair/models/ner-english-large/07301f59bb8cb113803be316267f06ddf9243cdbba92a4c8067ef92442d2c574.554244d3476d97501a766a98078421817b14654496b86f2f7bd139dc502a4f29




2022-12-29 11:50:21,456 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>


In [4]:
# Tag every note and collect the spans whose 'ner' label value is 'PER'.
# Result layout: outputs[ID][(start, end)] = (span text, label score), where
# start/end are the span positions shifted by the sentence's start_pos
# (presumably offsets into the full note -- confirm against flair's splitter).

outputs = defaultdict(dict)
for idx, (ID, text) in enumerate(notes.items()):
    # Split the note into sentences, then tag them all in one predict call.
    sentences = splitter.split(text)
    tagger.predict(sentences)

    for sentence in sentences:
        for span in sentence.get_spans('ner'):
            label = span.get_label('ner')
            if label.value != 'PER':
                continue  # only person spans are kept

            offset = sentence.start_pos
            key = (span.start_position + offset, span.end_position + offset)
            outputs[ID][key] = (span.text, label.score)

    # Progress report every 100 notes (including the very first one).
    if idx % 100 == 0:
        print(f'Finish Processing Note {idx}')

Finish Processing Note 0
Finish Processing Note 100
Finish Processing Note 200


In [5]:
# Write one JSON line per collected span: the note ID, the (start, end)
# position pair and the (text, score) pair, each converted to a list.

with jsonlines.open('../Data/Polysemy/Output/polysemies-flair.jsonl', 'w') as writer:
    rows = [
        {'ID': list(note_id), 'position': list(span_pos), 'name': list(text_and_score)}
        for note_id, spans in outputs.items()
        for span_pos, text_and_score in spans.items()
    ]
    writer.write_all(rows)