# MetaCAT - BiLSTM for negation detection on the Dutch Clinical Corpus

Based on https://colab.research.google.com/drive/1rxzBZCTDcqsIjRXZ3u4yRZFOkUCCuwyy#scrollTo=dukwUnN1TPCg
and https://colab.research.google.com/drive/1zzV3XzFJ9ihhCJ680DaQV2QZ5XnHa06X#scrollTo=Sj29auXV8iPZ


In [1]:
from tokenizers import ByteLevelBPETokenizer
from gensim.models import Word2Vec
import numpy as np
import os
import pandas as pd

from medcat.cat import CAT
from medcat.vocab import Vocab
from medcat.cdb import CDB
from medcat.config import Config
from medcat.meta_cat import MetaCAT
from medcat.preprocessing.tokenizers import TokenizerWrapperBPE

In [2]:
# Input locations: everything is read from the shared data directory.
data_dir = os.path.join('..', 'data')
cdb_file = os.path.join(data_dir, 'cdb.dat')
vocab_file = os.path.join(data_dir, 'vocab.dat')
json_file_all = os.path.join(data_dir, 'emc-dcc_ann.json')
text_file = os.path.join(data_dir, 'data.txt')

# Output location for the tokenizer files, the embeddings and the MetaCAT save_dir.
output_dir = 'output'
# Create the directory up front so the save steps further down do not fail
# when the notebook is run from a fresh checkout.
os.makedirs(output_dir, exist_ok=True)

# Hardware for training, 'cpu' or 'cuda'
device = 'cuda'

# Name should contain 'bbpe' for ByteLevelBPETokenizer or 'bert' for BertTokenizerFast
# This name is saved in the model_config dict and subsequently in vars.dat on disk.
tokenizer_name = 'bbpe_dutch-wikipedia'

## Create tokenizer

In [3]:
# Build a byte-level BPE tokenizer and fit it on the raw text corpus.
# (Writing the tokenizer to disk is done separately, after training.)
tokenizer = ByteLevelBPETokenizer(add_prefix_space=True)
tokenizer.train([text_file])

In [4]:
# Write the trained tokenizer's files into output_dir, prefixed with tokenizer_name.
# The returned file paths are displayed as the cell output.
saved_tokenizer_files = tokenizer.save_model(output_dir, tokenizer_name)
saved_tokenizer_files

['output\\bbpe_dutch-wikipedia-vocab.json',
 'output\\bbpe_dutch-wikipedia-merges.txt']

## Create embeddings matrix

In [5]:
# Encode every line of the corpus into BPE tokens, then train
# 300-dimensional Word2Vec vectors on those token sequences.
with open(text_file, encoding='utf-8') as corpus:
    text_data = [tokenizer.encode(line).tokens for line in corpus]

w2v = Word2Vec(text_data, size=300, min_count=1)

In [6]:
# Sanity-check the trained vectors: list the 10 nearest neighbours of one token.
# Ġ denotes start of word (a space)
w2v.wv.most_similar(positive=['Ġhoesten'], topn=10)

[('Ġkortademigheid', 0.9212014675140381),
 ('Ġdiarree', 0.9084370136260986),
 ('Ġbraken', 0.898695170879364),
 ('Ġmisselijkheid', 0.8878295421600342),
 ('Ġjeuk', 0.8811523914337158),
 ('Ġbenauwdheid', 0.8811056613922119),
 ('Ġvermoeidheid', 0.8810759782791138),
 ('Ġbuikpijn', 0.8754057884216309),
 ('Ġconstipatie', 0.872384786605835),
 ('Ġhoofdpijn', 0.8706159591674805)]

In [7]:
# Create embeddings matrix: one vector per tokenizer id, in id order.
# Take the dimension from the trained model instead of repeating the literal 300.
embedding_dim = w2v.wv.vector_size
# Seeded generator so the fallback vectors (and therefore the saved matrix)
# are identical on every Restart & Run All.
fallback_rng = np.random.RandomState(42)

embeddings = []
for i in range(tokenizer.get_vocab_size()):
    word = tokenizer.id_to_token(i)
    if word in w2v.wv:
        embeddings.append(w2v.wv[word])
    else:
        # Token has no Word2Vec vector: fall back to a uniform random vector in [0, 1).
        embeddings.append(fallback_rng.rand(embedding_dim))

In [8]:
# Save the embeddings
# Hand np.save the path rather than an open file object, so NumPy opens and
# closes the file itself and no handle is left dangling in the kernel.
embeddings_file = os.path.join(output_dir, "embeddings.npy")
np.save(embeddings_file, np.array(embeddings))

In [9]:
# Change tokenizer to MedCAT's TokenizerWrapperBPE
# The name `tokenizer` is rebound here, so guard against wrapping an already
# wrapped tokenizer when this cell is executed more than once.
if not isinstance(tokenizer, TokenizerWrapperBPE):
    tokenizer = TokenizerWrapperBPE(tokenizer)

## Train MetaCAT on all documents

In [10]:
# Set up MetaCAT with the wrapped tokenizer and the embedding vectors.
# The index of the last embedding row is used as the padding id.
padding_id = len(embeddings) - 1
mc_negation = MetaCAT(
    tokenizer=tokenizer,
    embeddings=embeddings,
    pad_id=padding_id,
    save_dir=output_dir,
    device=device,
)

# Train for the 'Negation' meta-annotation for 10 epochs; the value returned
# by train() is displayed as the cell output.
training_config = {'tokenizer_name': tokenizer_name}
mc_negation.train(json_file_all, 'Negation', model_config=training_config, nepochs=10)

Epoch: 0 **************************************************  Train
              precision    recall  f1-score   support

           0       0.85      0.69      0.76      1591
           1       0.95      0.98      0.97      9758

    accuracy                           0.94     11349
   macro avg       0.90      0.84      0.86     11349
weighted avg       0.94      0.94      0.94     11349

Epoch: 0 **************************************************  Test
              precision    recall  f1-score   support

           0       0.92      0.74      0.82       182
           1       0.96      0.99      0.97      1080

    accuracy                           0.95      1262
   macro avg       0.94      0.86      0.90      1262
weighted avg       0.95      0.95      0.95      1262

Train Loss: 0.204156841223561
Test Loss:  0.17391254243557341






Model saved at epoch: 0 and f1: 0.9509975383104231
[[ 134   48]
 [  11 1069]]



Epoch: 1 **************************************************  Tra

{'f1': 0.9615973378529737,
 'p': 0.9620149661500167,
 'r': 0.9627575277337559,
 'cls_report': {'0': {'precision': 0.9299363057324841,
   'recall': 0.8021978021978022,
   'f1-score': 0.8613569321533924,
   'support': 182},
  '1': {'precision': 0.967420814479638,
   'recall': 0.9898148148148148,
   'f1-score': 0.9784897025171624,
   'support': 1080},
  'accuracy': 0.9627575277337559,
  'macro avg': {'precision': 0.948678560106061,
   'recall': 0.8960063085063086,
   'f1-score': 0.9199233173352774,
   'support': 1262},
  'weighted avg': {'precision': 0.9620149661500167,
   'recall': 0.9627575277337559,
   'f1-score': 0.9615973378529737,
   'support': 1262}}}

In [11]:
# Save model config
# NOTE(review): full_save=False presumably stores only the config and skips
# re-saving the tokenizer and embeddings already written to output_dir —
# confirm against the MedCAT MetaCAT.save documentation.
mc_negation.save(full_save=False)

## Example usage

In [12]:
# Read the vocabulary and the concept database from disk.
vocab = Vocab.load(vocab_file)
cdb = CDB.load(cdb_file)

# Assemble the MedCAT pipeline with a default Config and the trained
# negation MetaCAT attached.
config = Config()
cat = CAT(cdb=cdb, vocab=vocab, config=config, meta_cats=[mc_negation])

In [13]:
# Test on DL1114 from DCC with negation:
# run the pipeline and print each detected entity with its meta annotations.
text = 'Echo- en rontgenonderzoek van de heup toont geen evidente heupdysplasie.'
doc = cat(text)
for entity in doc.ents:
    print(f"Entity: {entity.text}")
    print(f"Meta Annotations: {entity._.meta_anns}")
    print("\n")

Entity: heup
Meta Annotations: {'Negation': {'value': 'not negated', 'confidence': 0.9724897, 'name': 'Negation'}}


Entity: heupdysplasie
Meta Annotations: {'Negation': {'value': 'negated', 'confidence': 0.9999876, 'name': 'Negation'}}




In [14]:
# Test on DL1114 from DCC without negation:
# same sentence with the negation cue "geen" removed.
text = 'Echo- en rontgenonderzoek van de heup toont evidente heupdysplasie.'
doc = cat(text)
for entity in doc.ents:
    print(f"Entity: {entity.text}")
    print(f"Meta Annotations: {entity._.meta_anns}")
    print("\n")

Entity: heup
Meta Annotations: {'Negation': {'value': 'not negated', 'confidence': 0.99853075, 'name': 'Negation'}}


Entity: heupdysplasie
Meta Annotations: {'Negation': {'value': 'not negated', 'confidence': 0.9968718, 'name': 'Negation'}}




## Additional testing and evaluation

More tests and evaluation can be found in the evaluation notebook.