# MetaCAT - BiLSTM for negation detection in the Dutch Clinical Corpus

Based on https://colab.research.google.com/drive/1rxzBZCTDcqsIjRXZ3u4yRZFOkUCCuwyy#scrollTo=dukwUnN1TPCg
and https://colab.research.google.com/drive/1zzV3XzFJ9ihhCJ680DaQV2QZ5XnHa06X#scrollTo=Sj29auXV8iPZ


In [1]:
from tqdm.notebook import tqdm
from tokenizers import ByteLevelBPETokenizer
import pandas as pd
from gensim.models import Word2Vec
import numpy as np
import os

from medcat.cat import CAT
from medcat.vocab import Vocab
from medcat.cdb import CDB
from medcat.config import Config
from medcat.meta_cat import MetaCAT
from medcat.preprocessing.tokenizers import TokenizerWrapperBPE

  from tqdm.autonotebook import tqdm


In [2]:
# --- Input locations -------------------------------------------------------
data_dir = os.path.join('..', 'data')


def _data_path(filename):
    """Return the path of `filename` inside the shared data directory."""
    return os.path.join(data_dir, filename)


cdb_file = _data_path('cdb.dat')
vocab_file = _data_path('vocab.dat')
json_file_all = _data_path('emc-dcc_ann.json')
text_file = _data_path('data.txt')

# --- Output location -------------------------------------------------------
output_dir = 'output'

# The tokenizer name must contain 'bbpe' (ByteLevelBPETokenizer) or 'bert' (BertTokenizerFast).
# It is stored in the model_config dict and subsequently written to vars.dat on disk.
tokenizer_name = 'bbpe_dutch-wikipedia'

## Create tokenizer

In [3]:
# Instantiate a byte-level BPE tokenizer and fit it on the raw text corpus
# (writing it to disk happens in a separate step)
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files=text_file)

In [4]:
# Save the tokenizer's vocab and merges files, prefixed with `tokenizer_name`.
# The output directory is created first so the cell also works on a fresh checkout
# where the directory does not exist yet.
os.makedirs(output_dir, exist_ok=True)
tokenizer.save_model(output_dir, tokenizer_name)

['output/bbpe_dutch-wikipedia-vocab.json',
 'output/bbpe_dutch-wikipedia-merges.txt']

## Create embeddings matrix

In [5]:
# Encode every line of the corpus into BPE tokens; each line becomes one Word2Vec sentence
with open(text_file, encoding='utf-8') as corpus:
    text_data = [tokenizer.encode(sentence).tokens for sentence in corpus]

# Fit 300-dimensional vectors, keeping every token regardless of how rarely it occurs
w2v = Word2Vec(sentences=text_data, size=300, min_count=1)

In [6]:
# Sanity check of the trained Word2Vec model: list the nearest neighbours of one token.
# The Ġ prefix marks the start of a word (it stands for the preceding space).
query_token = 'Ġhoesten'
w2v.wv.most_similar(query_token)

[('Ġniezen', 0.9076570272445679),
 ('Ġkortademigheid', 0.9037686586380005),
 ('Ġjeuk', 0.8884788751602173),
 ('Ġbraken', 0.8784409761428833),
 ('Ġdiarree', 0.8775287866592407),
 ('Ġmisselijkheid', 0.847364604473114),
 ('Ġovergeven', 0.8446534872055054),
 ('Ġschokken', 0.8427244424819946),
 ('Ġbenauwdheid', 0.8399103879928589),
 ('Ġademhalingsproblemen', 0.8391226530075073)]

In [7]:
# Create the embeddings matrix: entry i holds the vector for the token with id i,
# so the list is filled in tokenizer-id order.
embedding_dim = w2v.wv.vector_size  # read from the model instead of repeating the literal 300
fallback_rng = np.random.RandomState(42)  # fixed seed: fallback vectors are identical on every run

embeddings = []
for i in range(tokenizer.get_vocab_size()):
    word = tokenizer.id_to_token(i)
    if word in w2v.wv:
        embeddings.append(w2v.wv[word])
    else:
        # Word2Vec has no vector for this token (with min_count=1 that means it never
        # occurred in the tokenized corpus), so use a random vector drawn uniformly from [0, 1).
        embeddings.append(fallback_rng.rand(embedding_dim))

In [8]:
# Save the embeddings matrix as a .npy file in the output directory.
# np.save is given the path rather than an open file object, so NumPy opens and closes
# the file itself and no file handle is left dangling.
os.makedirs(output_dir, exist_ok=True)  # make sure the target directory exists
embeddings_file = os.path.join(output_dir, "embeddings.npy")
np.save(embeddings_file, np.array(embeddings))

In [9]:
# Wrap the trained BPE tokenizer in MedCAT's TokenizerWrapperBPE, the form handed to MetaCAT below.
# The name `tokenizer` is rebound here, so guard against wrapping an already-wrapped
# tokenizer when this cell is executed more than once.
if not isinstance(tokenizer, TokenizerWrapperBPE):
    tokenizer = TokenizerWrapperBPE(tokenizer)

## Train MetaCAT on all documents

In [10]:
# Padding uses the id of the last row of the embeddings matrix
pad_token_id = len(embeddings) - 1

# Set up the MetaCAT model on CPU; it writes its artefacts to the output directory
mc_negation = MetaCAT(
    tokenizer=tokenizer,
    embeddings=embeddings,
    pad_id=pad_token_id,
    save_dir=output_dir,
    device='cpu',
)

# Train on the annotated documents for the 'Negation' task for 10 epochs.
# The tokenizer name travels along in the model config; the returned report is the cell output.
negation_model_config = {'tokenizer_name': tokenizer_name}
mc_negation.train(
    json_file_all,
    'Negation',
    model_config=negation_model_config,
    nepochs=10,
)

Epoch: 0 **************************************************  Train
              precision    recall  f1-score   support

           0       0.87      0.69      0.77      1591
           1       0.95      0.98      0.97      9758

    accuracy                           0.94     11349
   macro avg       0.91      0.83      0.87     11349
weighted avg       0.94      0.94      0.94     11349

Epoch: 0 **************************************************  Test
              precision    recall  f1-score   support

           0       0.88      0.78      0.83       182
           1       0.96      0.98      0.97      1080

    accuracy                           0.95      1262
   macro avg       0.92      0.88      0.90      1262
weighted avg       0.95      0.95      0.95      1262

Train Loss: 0.20137218964732134
Test Loss:  0.16563032631529495






Model saved at epoch: 0 and f1: 0.9512924273465027
[[ 142   40]
 [  20 1060]]



Epoch: 1 **************************************************  T

Epoch: 9 **************************************************  Train
              precision    recall  f1-score   support

           0       0.93      0.82      0.87      1591
           1       0.97      0.99      0.98      9758

    accuracy                           0.97     11349
   macro avg       0.95      0.91      0.93     11349
weighted avg       0.97      0.97      0.97     11349

Epoch: 9 **************************************************  Test
              precision    recall  f1-score   support

           0       0.90      0.78      0.84       182
           1       0.96      0.99      0.97      1080

    accuracy                           0.96      1262
   macro avg       0.93      0.88      0.91      1262
weighted avg       0.96      0.96      0.96      1262

Train Loss: 0.12445199813260893
Test Loss:  0.14788185196812265






Best/Average scores: F1: 0.9583290261808863, P: 0.9586288944631696, R: 0.9595879556259905


{'f1': 0.9583290261808863,
 'p': 0.9586288944631696,
 'r': 0.9595879556259905,
 'cls_report': {'0': {'precision': 0.9171974522292994,
   'recall': 0.7912087912087912,
   'f1-score': 0.8495575221238937,
   'support': 182},
  '1': {'precision': 0.9656108597285068,
   'recall': 0.9879629629629629,
   'f1-score': 0.9766590389016019,
   'support': 1080},
  'accuracy': 0.9595879556259905,
  'macro avg': {'precision': 0.941404155978903,
   'recall': 0.8895858770858771,
   'f1-score': 0.9131082805127477,
   'support': 1262},
  'weighted avg': {'precision': 0.9586288944631696,
   'recall': 0.9595879556259905,
   'f1-score': 0.9583290261808863,
   'support': 1262}}}

In [11]:
# Save model config
# NOTE(review): full_save=False presumably skips re-writing the model weights, which the
# training log above reports as already saved ("Model saved at epoch ...") - confirm
# against the MedCAT MetaCAT.save documentation.
mc_negation.save(full_save=False)

## Example usage

In [12]:
# Read the concept database and the vocabulary from disk
cdb = CDB.load(cdb_file)
vocab = Vocab.load(vocab_file)

# Assemble the MedCAT pipeline with a default Config and the trained negation model attached
config = Config()
cat = CAT(
    cdb=cdb,
    vocab=vocab,
    config=config,
    meta_cats=[mc_negation],
)

In [13]:
# Example sentence from DCC document DL1114, with negation
text = 'Echo- en rontgenonderzoek van de heup toont geen evidente heupdysplasie.'
doc = cat(text)

# Print every detected entity followed by its meta-annotations
for entity in doc.ents:
    print(f"Entity: {entity.text}")
    print(f"Meta Annotations: {entity._.meta_anns}")
    print("\n")

Entity: heup
Meta Annotations: {'Negation': {'value': 'not negated', 'confidence': 0.99413735, 'name': 'Negation'}}


Entity: heupdysplasie
Meta Annotations: {'Negation': {'value': 'negated', 'confidence': 0.99721986, 'name': 'Negation'}}




In [14]:
# Example sentence from DCC document DL1114, without negation
text = 'Echo- en rontgenonderzoek van de heup toont evidente heupdysplasie.'
doc = cat(text)

# Report each detected entity and the meta-annotations attached to it
for entity in doc.ents:
    print("Entity: ", entity.text, sep="")
    print("Meta Annotations: ", entity._.meta_anns, sep="")
    print("\n")

Entity: heup
Meta Annotations: {'Negation': {'value': 'not negated', 'confidence': 0.99924016, 'name': 'Negation'}}


Entity: heupdysplasie
Meta Annotations: {'Negation': {'value': 'not negated', 'confidence': 0.9917353, 'name': 'Negation'}}




## Additional testing and evaluation

More tests and evaluation can be found in the evaluation notebook.