# Evaluate biLSTM on the complete Dutch Clinical Corpus (DCC)

In [1]:
from pathlib import Path
from medcat.meta_cat import MetaCAT
import pandas as pd
import json

# All project folders are resolved relative to the parent of the current working directory
project_dir = Path.cwd().parent
data_dir = project_dir / 'data'
annotation_file = data_dir / 'emc-dcc_ann.json'
dcc_dir = data_dir / 'EMCDutchClinicalCorpus'
model_dir = project_dir / 'models' / 'bilstm'
result_dir = project_dir / 'results'
bilstm_result_file = result_dir / 'bilstm_predictions.csv.gz'

# Load annotated data.
# Decode explicitly as UTF-8 instead of relying on the platform default encoding: the annotations
# refer to entities by character position, so the text must be decoded identically on every machine.
with open(annotation_file, encoding='utf-8') as f:
    annotations = json.load(f)

# Load biLSTM
meta_cat = MetaCAT.load(model_dir)

## Simple evaluation without retrieving the document IDs

In [3]:
# Run MetaCAT's own evaluation on the annotation file; it only yields aggregate scores,
# so individual predictions cannot be traced back to their documents here.
# NOTE(review): the reported scores are close to perfect — confirm that the model was not
# trained on (part of) this same corpus before interpreting them as held-out performance.
results = meta_cat.eval(json_path=annotation_file)

Epoch: 0 **************************************************  Eval
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1760
           1       1.00      1.00      1.00     10791

    accuracy                           1.00     12551
   macro avg       0.99      0.99      0.99     12551
weighted avg       1.00      1.00      1.00     12551



## Evaluation including document IDs
Based on the MedCATTrainer JSON format, create custom Doc and Span objects (mimicking spaCy's) that hold
the annotated entities, and use the biLSTM to predict negation for these entities.

In [2]:
# Create custom Span and Doc objects to provide data in the expected format for MetaCAT. This allows for using
# the tokens as they are annotated in the labeled data. SpaCy's Span and Doc objects create entities from tokenized
# text, which could be different from how human annotators tokenize text.
# This code is based on MedCAT's json_to_fake_spacy(), see:
# https://github.com/CogStack/MedCAT/blob/bbb2dc8aa452d0561709993078ce4f0297a63ff6/medcat/utils/meta_cat/data_utils.py#L133
class Empty:
    """Bare attribute container; instances start without attributes and accept arbitrary ones."""

class CustomSpan:
    """Minimal Span-like object describing a single annotated entity.

    Args:
        start_char: Start character offset of the entity.
        end_char: End character offset of the entity.
        id: Identifier of the entity, exposed as `_.id`.
    """

    def __init__(self, start_char, end_char, id):
        # Character offsets of the entity
        self.start_char, self.end_char = start_char, end_char

        # Extension namespace: holds the entity ID and a `meta_anns` slot that starts out as None
        extensions = Empty()
        extensions.id = id
        extensions.meta_anns = None
        self._ = extensions

class CustomDoc:
    """Minimal Doc-like object holding a text and the entities annotated in it.

    Args:
        text: Text of the document.
        id: Identifier of the document.
    """

    def __init__(self, text, id):
        self.text = text
        self.id = id

        # `ents` and `_ents` are two names for the same list, so appending to one updates both
        entities = []
        self.ents = entities
        self._ents = entities

        # Extension namespace with a `share_tokens` slot that starts out as None
        extensions = Empty()
        extensions.share_tokens = None
        self._ = extensions

In [3]:
# Collect one [entity_id, predicted 'Negation' value] row per annotated entity
result = []

# Loop over every document of every project in the export, so that no project is silently
# skipped when the file contains more than one
for project in annotations['projects']:
    for document in project['documents']:
        document_name = document['name']
        doc = CustomDoc(text=document['text'], id=document_name)

        # Add every annotated entity as a custom Span to the custom Doc object
        for annotation in document['annotations']:
            start_char = annotation['start']
            end_char = annotation['end']

            # Custom ID: document name plus character offsets of the entity
            entity_id = f'{document_name}_{start_char}_{end_char}'

            # Append in place (instead of assigning a new list) so doc.ents and doc._ents stay the same list
            doc.ents.append(CustomSpan(start_char, end_char, entity_id))

        # Run the biLSTM on the document; predictions are read from each entity's meta_anns below
        doc = meta_cat(doc)

        # Retrieve predictions
        for ent in doc.ents:
            predicted_negation = ent._.meta_anns['Negation']['value']
            result.append([ent._.id, predicted_negation])

bilstm_predictions = pd.DataFrame(result, columns=['entity_id', 'bilstm'])

In [4]:
# Save the predictions as a gzip-compressed CSV without the index column.
# Create the target folder first: to_csv does not create missing directories, so the
# notebook would otherwise fail on a fresh checkout without a results folder.
bilstm_result_file.parent.mkdir(parents=True, exist_ok=True)
bilstm_predictions.to_csv(bilstm_result_file, index=False, compression='gzip')