# MetaCAT - BiLSTM for negation detection on the Dutch Clinical Corpus

Based on https://colab.research.google.com/drive/1rxzBZCTDcqsIjRXZ3u4yRZFOkUCCuwyy#scrollTo=dukwUnN1TPCg
and https://colab.research.google.com/drive/1zzV3XzFJ9ihhCJ680DaQV2QZ5XnHa06X#scrollTo=Sj29auXV8iPZ


In [1]:
from tqdm.notebook import tqdm
from tokenizers import ByteLevelBPETokenizer
import pandas as pd
from gensim.models import Word2Vec
import json
import numpy as np
import os

from medcat.cat import CAT
from medcat.vocab import Vocab
from medcat.cdb import CDB
from medcat.config import Config
from medcat.meta_cat import MetaCAT
from medcat.preprocessing.tokenizers import TokenizerWrapperBPE

  from tqdm.autonotebook import tqdm


In [2]:
# Input: everything is read from ../data, relative to the notebook's working directory
data_dir = os.path.join('..', 'data')
cdb_file = os.path.join(data_dir, 'cdb.dat')                # loaded with CDB.load
vocab_file = os.path.join(data_dir, 'vocab.dat')            # loaded with Vocab.load
json_file_all = os.path.join(data_dir, 'emc-dcc_ann.json')  # annotations passed to MetaCAT.train
text_file = os.path.join(data_dir, 'data.txt')              # raw text for the tokenizer and Word2Vec

# Output
output_dir = 'output'
tokenizer_name = 'emc_dcc'

# Create the output directory up front so that saving the tokenizer, the
# embeddings and the model does not fail on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)

## Create tokenizer

In [3]:
# Create a byte-level BPE tokenizer and train it on the raw text file
# (it is saved in the next cell)
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files=[text_file])

In [4]:
# Save the tokenizer: writes the vocabulary and merges files into output_dir,
# prefixed with tokenizer_name. The call returns the paths of the written
# files, which are shown as the cell output.
tokenizer.save_model(output_dir, tokenizer_name)

['output/emc_dcc-vocab.json', 'output/emc_dcc-merges.txt']

## Create embeddings matrix

In [5]:
# Tokenize the text file line by line with the BPE tokenizer; every line
# becomes one "sentence" (a list of token strings) for Word2Vec.
with open(text_file, encoding='utf-8') as corpus:
    text_data = [tokenizer.encode(sentence).tokens for sentence in corpus]

# Train 300-dimensional vectors; min_count=1 keeps every token that occurs in the text.
# NOTE(review): `size` is the gensim 3.x keyword; gensim >= 4 renamed it to
# `vector_size` - confirm which version this notebook is pinned to.
w2v = Word2Vec(sentences=text_data, size=300, min_count=1)

In [6]:
# Check the trained word2vec model by listing the tokens most similar to
# 'hoesten' (Dutch for 'to cough').
# Ġ denotes start of word (a space), so 'Ġhoesten' is the token for the word
# 'hoesten' when it is preceded by a space.
w2v.wv.most_similar('Ġhoesten')

[('Ġkortademigheid', 0.9008769989013672),
 ('Ġjeuk', 0.8975282311439514),
 ('Ġniezen', 0.8891239762306213),
 ('Ġbraken', 0.8868833780288696),
 ('Ġdiarree', 0.8819828629493713),
 ('Ġirritatie', 0.87400221824646),
 ('Ġovergeven', 0.8697969913482666),
 ('Ġzweten', 0.8684862852096558),
 ('Ġmisselijkheid', 0.8676348924636841),
 ('Ġbuikpijn', 0.8667157888412476)]

In [7]:
# Create embeddings matrix: one vector per tokenizer id, in id order, so that
# row i of the matrix belongs to the token with id i.
embedding_dim = w2v.wv.vector_size    # keep fallback vectors as wide as the word2vec vectors
rng = np.random.default_rng(42)       # fixed seed so the fallback vectors are reproducible
embeddings = []
for token_id in range(tokenizer.get_vocab_size()):
    word = tokenizer.id_to_token(token_id)
    if word in w2v.wv:
        embeddings.append(w2v.wv[word])
    else:
        # The token has no word2vec vector (it did not occur in the tokenized
        # training text): assign a random vector, uniform in [0, 1).
        embeddings.append(rng.random(embedding_dim))

In [8]:
# Save the embeddings
embeddings_file = os.path.join(output_dir, "embeddings.npy")
# Pass the path instead of an open() handle: np.save then opens and closes the
# file itself, so no file handle is leaked. The path already ends in '.npy',
# so NumPy does not append another extension.
np.save(embeddings_file, np.array(embeddings))

In [9]:
# Change tokenizer to MedCAT's TokenizerWrapperBPE.
# The name `tokenizer` is rebound here, so guard against wrapping an already
# wrapped tokenizer when this cell is run more than once.
# NOTE(review): after this cell the earlier cells that call the raw
# tokenizers API on `tokenizer` should not be re-run without restarting.
if not isinstance(tokenizer, TokenizerWrapperBPE):
    tokenizer = TokenizerWrapperBPE(tokenizer)

## Train MetaCAT on all documents

In [10]:
# Train and evaluate MetaCAT on all negation annotations of the EMC DCC dataset.
# The model runs on CPU and is saved to output_dir.
# NOTE(review): pad_id is the id of the last token in the vocabulary, not a
# dedicated padding token - confirm this is intended.
mc_negation = MetaCAT(
    tokenizer=tokenizer,
    embeddings=embeddings,
    pad_id=len(embeddings) - 1,
    save_dir=output_dir,
    device='cpu',
)
# The returned result (shown as the cell output) holds f1, precision, recall
# and the classification report.
mc_negation.train(json_file_all, 'Negation', nepochs=10)

Epoch: 0 **************************************************  Train
              precision    recall  f1-score   support

           0       0.95      0.98      0.97      9758
           1       0.88      0.71      0.78      1591

    accuracy                           0.95     11349
   macro avg       0.92      0.85      0.88     11349
weighted avg       0.94      0.95      0.94     11349

Epoch: 0 **************************************************  Test
              precision    recall  f1-score   support

           0       0.96      0.99      0.97      1080
           1       0.89      0.75      0.81       182

    accuracy                           0.95      1262
   macro avg       0.93      0.87      0.89      1262
weighted avg       0.95      0.95      0.95      1262

Train Loss: 0.18905627497957206
Test Loss:  0.18934118520701304






Model saved at epoch: 0 and f1: 0.9490017641810792
[[1064   16]
 [  46  136]]



Epoch: 1 **************************************************  T

Epoch: 9 **************************************************  Train
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      9758
           1       0.92      0.82      0.86      1591

    accuracy                           0.96     11349
   macro avg       0.94      0.90      0.92     11349
weighted avg       0.96      0.96      0.96     11349

Epoch: 9 **************************************************  Test
              precision    recall  f1-score   support

           0       0.96      0.99      0.98      1080
           1       0.95      0.76      0.84       182

    accuracy                           0.96      1262
   macro avg       0.95      0.88      0.91      1262
weighted avg       0.96      0.96      0.96      1262

Train Loss: 0.12629681482786495
Test Loss:  0.14113614341476932






Model saved at epoch: 9 and f1: 0.9568720847635588
[[1072    8]
 [  44  138]]



Best/Average scores: F1: 0.9568720847635588, P: 0.958357172820

{'f1': 0.9568720847635588,
 'p': 0.9583571728201138,
 'r': 0.9587955625990491,
 'cls_report': {'0': {'precision': 0.9605734767025089,
   'recall': 0.9925925925925926,
   'f1-score': 0.9763205828779599,
   'support': 1080},
  '1': {'precision': 0.9452054794520548,
   'recall': 0.7582417582417582,
   'f1-score': 0.8414634146341463,
   'support': 182},
  'accuracy': 0.9587955625990491,
  'macro avg': {'precision': 0.9528894780772819,
   'recall': 0.8754171754171753,
   'f1-score': 0.9088919987560531,
   'support': 1262},
  'weighted avg': {'precision': 0.9583571728201138,
   'recall': 0.9587955625990491,
   'f1-score': 0.9568720847635588,
   'support': 1262}}}

## Example usage

In [11]:
# Load the concept database and the vocabulary from disk
cdb = CDB.load(cdb_file)
vocab = Vocab.load(vocab_file)

# Use a default MedCAT configuration
config = Config()

# Create the MedCAT pipeline with the trained negation model attached as a meta-annotator
cat = CAT(
    cdb=cdb,
    vocab=vocab,
    config=config,
    meta_cats=[mc_negation],
)

In [17]:
# Test on DL1114 from DCC with negation ('geen' negates 'heupdysplasie')
text = 'Echo- en rontgenonderzoek van de heup toont geen evidente heupdysplasie.'
doc = cat(text)
# Print every detected entity with its meta annotations, followed by blank lines
for entity in doc.ents:
    print(
        "Entity: " + entity.text,
        "Meta Annotations: " + str(entity._.meta_anns),
        "\n",
        sep="\n",
    )

Entity: heup
Meta Annotations: {'Negation': {'value': 'not negated', 'confidence': 0.97756976, 'name': 'Negation'}}


Entity: heupdysplasie
Meta Annotations: {'Negation': {'value': 'negated', 'confidence': 0.9964055, 'name': 'Negation'}}




In [24]:
# Test on DL1114 from DCC without negation (same sentence with 'geen' removed)
text = 'Echo- en rontgenonderzoek van de heup toont evidente heupdysplasie.'
doc = cat(text)
# Print every detected entity with its meta annotations, followed by blank lines
for entity in doc.ents:
    print(
        "Entity: " + entity.text,
        "Meta Annotations: " + str(entity._.meta_anns),
        "\n",
        sep="\n",
    )

Entity: heup
Meta Annotations: {'Negation': {'value': 'not negated', 'confidence': 0.99538463, 'name': 'Negation'}}


Entity: heupdysplasie
Meta Annotations: {'Negation': {'value': 'not negated', 'confidence': 0.990191, 'name': 'Negation'}}




## Additional testing and evaluation

More tests and evaluation can be found in the evaluation notebook.