# MetaCAT - BiLSTM for negation detection on the Dutch Clinical Corpus

Based on https://colab.research.google.com/drive/1rxzBZCTDcqsIjRXZ3u4yRZFOkUCCuwyy#scrollTo=dukwUnN1TPCg
and https://colab.research.google.com/drive/1zzV3XzFJ9ihhCJ680DaQV2QZ5XnHa06X#scrollTo=Sj29auXV8iPZ


In [20]:
from tqdm.notebook import tqdm
from tokenizers import ByteLevelBPETokenizer
import pandas as pd
from gensim.models import Word2Vec
import json
import numpy as np
import os

from medcat.cat import CAT
from medcat.vocab import Vocab
from medcat.cdb import CDB
from medcat.config import Config
from medcat.meta_cat import MetaCAT
from medcat.preprocessing.tokenizers import TokenizerWrapperBPE

In [5]:
# Input: every source file lives directly inside the shared data directory
data_dir = os.path.join('..', 'data')
cdb_file, vocab_file, json_file_all, text_file = (
    os.path.join(data_dir, file_name)
    for file_name in ('cdb.dat', 'vocab.dat', 'emc-dcc_ann.json', 'data.txt')
)

# Output: directory that receives the artefacts produced by this notebook
output_dir = 'output'

## Create tokenizer

In [6]:
# Instantiate an untrained byte-level BPE tokenizer and fit it on the raw
# text file (saving happens in the next cell)
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files=text_file)

In [7]:
# Save the tokenizer files into output_dir, using 'emc_dcc' as file prefix.
# Make sure the target directory exists before writing; with exist_ok=True
# this is a no-op when the directory is already there.
os.makedirs(output_dir, exist_ok=True)
tokenizer.save_model(output_dir, 'emc_dcc')

['output/emc_dcc-vocab.json', 'output/emc_dcc-merges.txt']

## Create embeddings matrix

In [8]:
# Encode every line of the text file into its BPE tokens, then train a
# 300-dimensional Word2Vec model on the resulting token sequences
with open(text_file, encoding='utf-8') as corpus:
    text_data = [tokenizer.encode(sentence).tokens for sentence in corpus]
w2v = Word2Vec(text_data, size=300, min_count=1)

In [9]:
# Check trained word2vec model by listing the tokens most similar to a
# query token
# Ġ denotes start of word (a space)
w2v.wv.most_similar('Ġhoesten')

[('Ġdiarree', 0.9048820734024048),
 ('Ġkortademigheid', 0.9036903381347656),
 ('Ġbenauwdheid', 0.8889886140823364),
 ('Ġbraken', 0.8872215747833252),
 ('Ġjeuk', 0.8864785432815552),
 ('Ġniezen', 0.8711977005004883),
 ('Ġovergeven', 0.8703508973121643),
 ('Ġmisselijkheid', 0.8701860308647156),
 ('Ġspierpijn', 0.8695064187049866),
 ('Ġaanhoudende', 0.8685997724533081)]

In [10]:
# Build the embeddings matrix: one vector per tokenizer id, in id order.
# Tokens that have a Word2Vec vector use it; any token without one gets a
# random 300-dimensional vector instead.
embeddings = [
    w2v.wv[token] if token in w2v.wv else np.random.rand(300)
    for token in map(tokenizer.id_to_token, range(tokenizer.get_vocab_size()))
]

In [11]:
# Save the embeddings as a single NumPy array.
# The file is opened in a context manager so the handle is flushed and closed
# as soon as the array has been written.
embeddings_file = os.path.join(output_dir, "embeddings.npy")
with open(embeddings_file, 'wb') as embeddings_handle:
    np.save(embeddings_handle, np.array(embeddings))

In [12]:
# Change tokenizer to MedCAT's TokenizerWrapperBPE
# NOTE: this rebinds `tokenizer`; from here on the name refers to the wrapper,
# so re-running this cell would wrap the wrapper again.
tokenizer = TokenizerWrapperBPE(tokenizer)

## Train MetaCAT on all documents

In [13]:
# Train and evaluate a MetaCAT model on all 'Negation' annotations of the
# EMC DCC dataset. The index of the last embedding row is used as padding id
# and the model is saved to output_dir.
mc_negation = MetaCAT(
    tokenizer=tokenizer,
    embeddings=embeddings,
    pad_id=len(embeddings) - 1,
    save_dir=output_dir,
    device='cpu',
)
mc_negation.train(json_file_all, 'Negation', nepochs=10)

Epoch: 0 **************************************************  Train
              precision    recall  f1-score   support

           0       0.88      0.71      0.79      1591
           1       0.95      0.98      0.97      9758

    accuracy                           0.95     11349
   macro avg       0.92      0.85      0.88     11349
weighted avg       0.94      0.95      0.94     11349

Epoch: 0 **************************************************  Test
              precision    recall  f1-score   support

           0       0.91      0.76      0.83       182
           1       0.96      0.99      0.97      1080

    accuracy                           0.95      1262
   macro avg       0.93      0.87      0.90      1262
weighted avg       0.95      0.95      0.95      1262

Train Loss: 0.18915263238415436
Test Loss:  0.1524209578637965






Model saved at epoch: 0 and f1: 0.9522919729435904
[[ 138   44]
 [  14 1066]]



Epoch: 1 **************************************************  Tr

{'f1': 0.9593057648127071,
 'p': 0.959424479623563,
 'r': 0.9603803486529319,
 'cls_report': {'0': {'precision': 0.9125,
   'recall': 0.8021978021978022,
   'f1-score': 0.8538011695906434,
   'support': 182},
  '1': {'precision': 0.9673321234119783,
   'recall': 0.987037037037037,
   'f1-score': 0.9770852428964254,
   'support': 1080},
  'accuracy': 0.9603803486529319,
  'macro avg': {'precision': 0.9399160617059892,
   'recall': 0.8946174196174197,
   'f1-score': 0.9154432062435345,
   'support': 1262},
  'weighted avg': {'precision': 0.959424479623563,
   'recall': 0.9603803486529319,
   'f1-score': 0.9593057648127071,
   'support': 1262}}}

## Example usage

In [14]:
# Load the vocabulary and the concept database from disk
vocab = Vocab.load(vocab_file)
cdb = CDB.load(cdb_file)

# Assemble the MedCAT pipeline with a default configuration and the negation
# MetaCAT model as its only meta-annotation model
config = Config()
cat = CAT(
    cdb=cdb,
    vocab=vocab,
    config=config,
    meta_cats=[mc_negation],
)

In [15]:
# Run the pipeline on an example sentence and print, for every detected
# entity, its text and its meta-annotations
text = 'De patient heeft geen longkanker.'
doc = cat(text)
for entity in doc.ents:
    print(f"Entity: {entity.text}")
    print(f"Meta Annotations: {entity._.meta_anns}")
    print("\n")

Entity: longkanker
Meta Annotations: {'Negation': {'value': 'negated', 'confidence': 0.9952809, 'name': 'Negation'}}




## Train MetaCAT on subsets of the data
The ContextD paper reports precision, recall and F1-score on subsets of the data. In this section we calculate the same scores with the model trained above. Note that these subsets were part of that model's training data, so the resulting scores are not an independent evaluation. For proper score calculations, we will do cross validation at a later stage.

In [16]:
# Annotation files per document type, all located in the data directory
json_file_DL, json_file_GP, json_file_RD, json_file_SP = (
    os.path.join(data_dir, file_name)
    for file_name in (
        'emc-dcc_ann_DL.json',
        'emc-dcc_ann_GP.json',
        'emc-dcc_ann_RD.json',
        'emc-dcc_ann_SP.json',
    )
)

### Radiology letters

In [19]:
# Evaluate the model trained on all documents against the radiology subset.
# NOTE(review): the recorded run of this cell failed with a TypeError about an
# unexpected 'lowercase' keyword passed to prepare_from_json() - presumably a
# MedCAT version mismatch; confirm against the installed version.
mc_negation.eval(json_file_RD)

TypeError: prepare_from_json() got an unexpected keyword argument 'lowercase'

### Specialist letters

In [None]:
# Train and evaluate a separate MetaCAT model on the specialist letters only.
# NOTE(review): save_dir is a literal path rather than being derived from
# output_dir - confirm this location is intended.
mc_negation_SP = MetaCAT(
    tokenizer=tokenizer,
    embeddings=embeddings,
    pad_id=len(embeddings) - 1,
    save_dir='data/output/mc_negation_SP',
    device='cpu',
)
mc_negation_SP.train(json_file_SP, 'Negation', nepochs=10)

### Discharge letters

In [None]:
# Train and evaluate a separate MetaCAT model on the discharge letters only.
# NOTE(review): save_dir is a literal path rather than being derived from
# output_dir - confirm this location is intended.
mc_negation_DL = MetaCAT(
    tokenizer=tokenizer,
    embeddings=embeddings,
    pad_id=len(embeddings) - 1,
    save_dir='data/output/mc_negation_DL',
    device='cpu',
)
mc_negation_DL.train(json_file_DL, 'Negation', nepochs=10)

### GP entries

In [None]:
# Train and evaluate a separate MetaCAT model on the GP entries only.
# NOTE(review): save_dir is a literal path rather than being derived from
# output_dir - confirm this location is intended.
mc_negation_GP = MetaCAT(
    tokenizer=tokenizer,
    embeddings=embeddings,
    pad_id=len(embeddings) - 1,
    save_dir='data/output/mc_negation_GP',
    device='cpu',
)
mc_negation_GP.train(json_file_GP, 'Negation', nepochs=10)