In [1]:
from pathlib import Path
from evaluation_utils import get_document_text
import pandas as pd
import json
import numpy as np

# Locations of the corpus, its annotation file, and the stored prediction results
data_dir = Path('data')
annotation_file = data_dir / 'emc-dcc_ann.json'
dcc_dir = data_dir / 'EMCDutchClinicalCorpus'
result_dir = Path('results')
bilstm_result_file = result_dir / 'bilstm_predictions.csv.gz'

# Load annotated data.
# Decode explicitly as UTF-8 (the JSON interchange encoding) instead of relying on the
# platform's locale default, which can garble non-ASCII characters (e.g. on Windows).
with open(annotation_file, encoding='utf-8') as f:
    annotations = json.load(f)

In [2]:
# Read the stored predictions table; the gzip compression is inferred from the
# '.csv.gz' file extension (pandas' default behaviour, spelled out here).
bilstm_predictions = pd.read_csv(bilstm_result_file, compression='infer')
# Preview the first five rows
bilstm_predictions.head(5)

Unnamed: 0,entity_id,annotation,bilstm
0,DL1111_32_46,not negated,not negated
1,DL1111_272_280,not negated,not negated
2,DL1111_363_377,not negated,not negated
3,DL1112_22_28,negated,negated
4,DL1113_59_67,not negated,not negated


In [3]:
# Select false negatives: rows whose `annotation` is 'negated' while the
# `bilstm` column says 'not negated'.
annotated_negated = bilstm_predictions.annotation.eq('negated')
predicted_not_negated = bilstm_predictions.bilstm.eq('not negated')
false_negatives = bilstm_predictions[annotated_negated & predicted_not_negated]
false_negatives.head()

Unnamed: 0,entity_id,annotation,bilstm
121,DL1156_161_172,negated,not negated
127,DL1158_173_183,negated,not negated
142,DL1163_0_8,negated,not negated
155,DL1167_76_82,negated,not negated
266,DL1201_0_8,negated,not negated


In [4]:
# Illustrate the issue with one randomly drawn false negative.
# NOTE(review): no random_state is passed to sample(), so unless a seed is set
# elsewhere a re-run will generally pick a different record.
sampled_row = false_negatives.sample(1)
random_entity = sampled_row.entity_id.iloc[0]
# get_document_text is called with its default printing behaviour here
# (presumably it prints the document text and entity — confirm in evaluation_utils).
text, start, end = get_document_text(random_entity, dcc_dir, bilstm_predictions)

Voorgeschiedenis: strabismus operatie op kinderleeftijd en kruisband ruptuur linker knie.
Actuele gegevens: begin dit jaar voelde patiente een zwelling links in de bovenbuik, aanvankelijk pijnlijk nu meer zeurend van karakter.
Er zijn geen algemene verschijnselen in de zin van koorts, nachtzweten of gewichtsverlies.
Er is geen sprake geweest van een trauma.
In de linker bovenbuik een zwelling 15 cm onder de ribbenboog.
Geen ascites.
Geen ascites.
Het beeld zou kunnen passen bij een mucineus adenocarcinoom.

Entity: gewichtsverlies (301-316)

            entity_id annotation       bilstm
12520  SP2093_301_316    negated  not negated


In [5]:
# Tally the false negatives where the character directly after the entity span
# (position `end` in the returned text) is a "-".
count = 0
for entity_id in false_negatives.entity_id:
    text, start, end = get_document_text(entity_id, dcc_dir, bilstm_predictions, print_text=False)
    # The comparison yields a bool; adding it increments the tally by 0 or 1
    count += text[end:end+1] == '-'

# Report the tally as an absolute number and as a rounded percentage of all false negatives
summary = f'{count} of {false_negatives.shape[0]} ({round((count / false_negatives.shape[0]) * 100)}%) false negatives caused by negation described as "-"'
print(summary)

51 of 340 (15%) false negatives caused by negation described as "-"
