# MetaCAT - BiLSTM negation detection on the Dutch Clinical Corpus (EMC DCC)

Based on https://colab.research.google.com/drive/1rxzBZCTDcqsIjRXZ3u4yRZFOkUCCuwyy#scrollTo=dukwUnN1TPCg
and https://colab.research.google.com/drive/1zzV3XzFJ9ihhCJ680DaQV2QZ5XnHa06X#scrollTo=Sj29auXV8iPZ


In [1]:
import json
import os

import numpy as np
import pandas as pd
import tqdm
from gensim.models import Word2Vec
from tokenizers import ByteLevelBPETokenizer

from medcat.cat import CAT
from medcat.cdb import CDB
from medcat.config import Config
from medcat.meta_cat import MetaCAT
from medcat.preprocessing.tokenizers import TokenizerWrapperBPE
from medcat.vocab import Vocab

  from tqdm.autonotebook import tqdm


In [2]:
# Locations of the data used throughout this notebook.
# DATA_DIR holds the concept database, the vocabulary and the raw corpus text.
DATA_DIR = 'data/'
# The annotated corpus lives outside the project folder. Its location can be
# overridden with the EMC_DCC_JSON_DIR environment variable so the notebook is
# not tied to one machine; the default is the original hard-coded path.
# os.path.join(..., "") guarantees a trailing separator, which the plain string
# concatenation used to build the JSON file names relies on.
DATA_DIR_PRIVATE = os.path.join(
    os.environ.get("EMC_DCC_JSON_DIR", "/Users/myrthehemker/Data/EMC_Corpus/jsonfiles/"),
    "",
)
cdb_file = DATA_DIR + 'cdb.dat'
vocab_file = DATA_DIR + 'vocab.dat'

In [3]:
# Annotation exports of the corpus, one JSON file per document type:
# DL = discharge letters, GP = GP entries, RD = radiology letters,
# SP = specialist letters.
json_file_DL = f"{DATA_DIR_PRIVATE}emc-dcc_ann_DL.json"
json_file_GP = f"{DATA_DIR_PRIVATE}emc-dcc_ann_GP.json"
json_file_RD = f"{DATA_DIR_PRIVATE}emc-dcc_ann_RD.json"
json_file_SP = f"{DATA_DIR_PRIVATE}emc-dcc_ann_SP.json"

In [4]:
# MedCAT configuration: select the 'nl_core_news_sm' spaCy model.
config = Config()
config.general['spacy_model'] = 'nl_core_news_sm'

# Read the concept database (CDB) and the vocabulary from disk.
cdb = CDB.load(cdb_file)
vocab = Vocab.load(vocab_file)

In [5]:
# Assemble the MedCAT pipeline from the loaded concept database, vocabulary
# and configuration.
cat = CAT(
    cdb=cdb,
    vocab=vocab,
    config=config,
)

In [6]:
# Train a byte-level BPE tokenizer on the raw corpus text and store its
# vocabulary and merge rules so they can be reloaded later.
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(DATA_DIR + "data.txt")

# Make sure the output directory exists before saving, so the notebook also
# runs on a fresh checkout where "data/output/" has not been created yet.
os.makedirs("data/output/", exist_ok=True)
tokenizer.save_model("data/output/", 'emc_dcc')

['data/output/emc_dcc-vocab.json', 'data/output/emc_dcc-merges.txt']

In [7]:
# Tokenize every line of the corpus with the BPE tokenizer and train word2vec
# on the resulting token sequences.
# Note: for a very large dataset, stream the lines through an iterator instead
# of collecting all token lists in memory.
with open(DATA_DIR + "data.txt", 'r') as corpus_file:
    # The context manager closes the file handle once all lines are read.
    data = [tokenizer.encode(line).tokens for line in corpus_file]
w2v = Word2Vec(data, size=300, min_count=1)

In [8]:
# Sanity check that word2vec has been trained: list the nearest neighbours of a
# known token. 'Ġ' is how this tokenizer marks the start of a word (a space).
# The query goes through `w2v.wv` (the KeyedVectors), like the embedding
# lookups elsewhere in this notebook; the model-level `most_similar` is
# deprecated and no longer available in gensim 4.
w2v.wv.most_similar('Ġhoesten')

  


[('Ġkortademigheid', 0.9139964580535889),
 ('Ġjeuk', 0.8930736780166626),
 ('Ġdiarree', 0.8803220987319946),
 ('Ġbraken', 0.880200982093811),
 ('Ġmisselijkheid', 0.8777583837509155),
 ('Ġniezen', 0.8665359020233154),
 ('Ġbuikpijn', 0.8609310388565063),
 ('Ġspierpijn', 0.8597644567489624),
 ('Ġvermoeidheid', 0.859038233757019),
 ('Ġademhalingsproblemen', 0.8583375215530396)]

In [9]:
# Build the embedding matrix: entry i holds the vector for the token with id i.
# Take the dimensionality from the trained model instead of repeating the
# literal used when training word2vec.
embedding_dim = w2v.wv.vector_size
# Seeded generator so the fallback vectors are identical on every run.
fallback_rng = np.random.RandomState(42)

embeddings = []
for token_id in range(tokenizer.get_vocab_size()):
    word = tokenizer.id_to_token(token_id)
    if word in w2v.wv:
        embeddings.append(w2v.wv[word])
    else:
        # The token has no word2vec vector: use a random vector, uniform in [0, 1).
        embeddings.append(fallback_rng.rand(embedding_dim))

In [10]:
# Persist the embedding matrix as a NumPy array. The context manager makes sure
# the file handle is flushed and closed after writing.
with open(DATA_DIR + "embeddings.npy", 'wb') as embeddings_file:
    np.save(embeddings_file, np.array(embeddings))

In [11]:
# Load the combined annotation export and inspect its structure.
# Note: `data` is rebound here from the token lists to the annotation dict.
json_file_all = DATA_DIR_PRIVATE + 'emc-dcc_ann.json'
with open(json_file_all) as annotations_file:
    # The context manager closes the file handle after parsing.
    data = json.load(annotations_file)
print(data.keys())
print(data['projects'][0]['documents'][0]['annotations'][0].keys())

dict_keys(['projects'])
dict_keys(['id', 'user', 'cui', 'value', 'start', 'end', 'validated', 'correct', 'deleted', 'alternative', 'killed', 'meta_anns'])


In [15]:
# Reload the BPE tokenizer from the vocabulary and merges files saved under
# "data/output/" and wrap it in MedCAT's TokenizerWrapperBPE.
# Note: this rebinds `tokenizer` from the raw tokenizer to the wrapper.
bpe_tokenizer = ByteLevelBPETokenizer(
    vocab="data/output/" + "emc_dcc-vocab.json",
    merges="data/output/" + "emc_dcc-merges.txt",
)
tokenizer = TokenizerWrapperBPE(bpe_tokenizer)

## Evaluate the different document types


Overall score

In [17]:
# Supervised training of the MedCAT pipeline on the annotated concepts of the
# complete dataset (one epoch, test_size of 0.1, statistics not printed).
cat.train_supervised(
    data_path=json_file_all,
    nepochs=1,
    test_size=0.1,
    use_filters=True,
    reset_cui_count=False,
    print_stats=False,
)

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Project', max=1.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='Document', max=4830.0, style=ProgressStyle(description_wi…

The provided entity for cui <7> was empty, nothing to train


({}, {}, {}, {}, {}, {}, {}, {})

In [18]:
# Train and evaluate the MetaCAT 'Negation' model on all annotations of the
# EMC DCC dataset (10 epochs on CPU); the model is saved under save_dir.
# NOTE(review): pad_id points at the last embedding row, which is the vector of
# a regular vocabulary token - confirm that no dedicated padding row is needed.
mc_negation = MetaCAT(
    tokenizer=tokenizer,
    embeddings=embeddings,
    pad_id=len(embeddings) - 1,
    save_dir='data/output/mc_negation_all',
    device='cpu',
)
mc_negation.train(json_file_all, 'Negation', nepochs=10)

Epoch: 0 **************************************************  Train
              precision    recall  f1-score   support

           0       0.87      0.70      0.78      1591
           1       0.95      0.98      0.97      9758

    accuracy                           0.94     11349
   macro avg       0.91      0.84      0.87     11349
weighted avg       0.94      0.94      0.94     11349

Epoch: 0 **************************************************  Test
              precision    recall  f1-score   support

           0       0.91      0.75      0.82       182
           1       0.96      0.99      0.97      1080

    accuracy                           0.95      1262
   macro avg       0.94      0.87      0.90      1262
weighted avg       0.95      0.95      0.95      1262

Train Loss: 0.19262081954274063
Test Loss:  0.169737848744262






Model saved at epoch: 0 and f1: 0.9512700672229851
[[ 136   46]
 [  13 1067]]



Epoch: 1 **************************************************  Tra

Epoch: 9 **************************************************  Train
              precision    recall  f1-score   support

           0       0.93      0.82      0.87      1591
           1       0.97      0.99      0.98      9758

    accuracy                           0.97     11349
   macro avg       0.95      0.90      0.93     11349
weighted avg       0.97      0.97      0.96     11349

Epoch: 9 **************************************************  Test
              precision    recall  f1-score   support

           0       0.94      0.75      0.83       182
           1       0.96      0.99      0.97      1080

    accuracy                           0.96      1262
   macro avg       0.95      0.87      0.90      1262
weighted avg       0.96      0.96      0.95      1262

Train Loss: 0.1206846370707712
Test Loss:  0.1555123495636508






Best/Average scores: F1: 0.957455816657798, P: 0.9578051907236043, R: 0.9587955625990491


{'f1': 0.957455816657798,
 'p': 0.9578051907236043,
 'r': 0.9587955625990491,
 'cls_report': {'0': {'precision': 0.9166666666666666,
   'recall': 0.7857142857142857,
   'f1-score': 0.8461538461538461,
   'support': 182},
  '1': {'precision': 0.9647377938517179,
   'recall': 0.9879629629629629,
   'f1-score': 0.9762122598353157,
   'support': 1080},
  'accuracy': 0.9587955625990491,
  'macro avg': {'precision': 0.9407022302591923,
   'recall': 0.8868386243386244,
   'f1-score': 0.9111830529945809,
   'support': 1262},
  'weighted avg': {'precision': 0.9578051907236043,
   'recall': 0.9587955625990491,
   'f1-score': 0.957455816657798,
   'support': 1262}}}

Radiology letters

In [20]:
# --- Radiology letters ---
# Fresh MedCAT pipeline (same cdb, vocab and config objects) with supervised
# training on the radiology annotations only.
cat_RD = CAT(cdb=cdb, vocab=vocab, config=config)
cat_RD.train_supervised(
    data_path=json_file_RD,
    nepochs=1,
    test_size=0.1,
    use_filters=True,
    reset_cui_count=False,
    print_stats=False,
)

# MetaCAT 'Negation' model trained and evaluated on the radiology letters
# (10 epochs on CPU).
mc_negation_RD = MetaCAT(
    tokenizer=tokenizer,
    embeddings=embeddings,
    pad_id=len(embeddings) - 1,
    save_dir='data/output/mc_negation_RD',
    device='cpu',
)
mc_negation_RD.train(json_file_RD, 'Negation', nepochs=10)

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Project', max=1.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='Document', max=1340.0, style=ProgressStyle(description_wi…

Epoch: 0 **************************************************  Train
              precision    recall  f1-score   support

           0       0.80      0.77      0.78       539
           1       0.96      0.96      0.96      2775

    accuracy                           0.93      3314
   macro avg       0.88      0.87      0.87      3314
weighted avg       0.93      0.93      0.93      3314

Epoch: 0 **************************************************  Test
              precision    recall  f1-score   support

           0       0.92      0.86      0.89        56
           1       0.97      0.99      0.98       313

    accuracy                           0.97       369
   macro avg       0.95      0.92      0.93       369
weighted avg       0.97      0.97      0.97       369

Train Loss: 0.19597539066684894
Test Loss:  0.27506222212687137






Model saved at epoch: 0 and f1: 0.9669806856798727
[[ 48   8]
 [  4 309]]



Epoch: 1 **************************************************  Train

{'f1': 0.9693608531661936,
 'p': 0.9699489450251646,
 'r': 0.9701897018970189,
 'cls_report': {'0': {'precision': 0.9591836734693877,
   'recall': 0.8392857142857143,
   'f1-score': 0.8952380952380952,
   'support': 56},
  '1': {'precision': 0.971875,
   'recall': 0.9936102236421726,
   'f1-score': 0.9826224328593998,
   'support': 313},
  'accuracy': 0.9701897018970189,
  'macro avg': {'precision': 0.9655293367346939,
   'recall': 0.9164479689639434,
   'f1-score': 0.9389302640487475,
   'support': 369},
  'weighted avg': {'precision': 0.9699489450251646,
   'recall': 0.9701897018970189,
   'f1-score': 0.9693608531661936,
   'support': 369}}}

Specialist letters

In [21]:
# --- Specialist letters ---
# Fresh MedCAT pipeline (same cdb, vocab and config objects) with supervised
# training on the specialist-letter annotations only. Unlike the other
# document types, this run prints the training statistics (print_stats=True).
cat_SP = CAT(cdb=cdb, vocab=vocab, config=config)
cat_SP.train_supervised(
    data_path=json_file_SP,
    nepochs=1,
    test_size=0.1,
    use_filters=True,
    reset_cui_count=False,
    print_stats=True,
)

# MetaCAT 'Negation' model trained and evaluated on the specialist letters
# (10 epochs on CPU).
mc_negation_SP = MetaCAT(
    tokenizer=tokenizer,
    embeddings=embeddings,
    pad_id=len(embeddings) - 1,
    save_dir='data/output/mc_negation_SP',
    device='cpu',
)
mc_negation_SP.train(json_file_SP, 'Negation', nepochs=10)

HBox(children=(FloatProgress(value=0.0, description='Stats project', max=1.0, style=ProgressStyle(description_…

HBox(children=(FloatProgress(value=0.0, description='Stats document', max=97.0, style=ProgressStyle(descriptio…

Epoch: 0, Prec: 0.012698412698412698, Rec: 0.029304029304029304, F1: 0.017718715393133997

Docs with false positives: SP1618; SP1648; SP1175; SP1243; SP1830; SP1180; SP1476; SP2092; SP1219; SP1746

Docs with false negatives: SP1618; SP1648; SP1175; SP1243; SP1830; SP1180; SP1476; SP2092; SP1219; SP1746



False Positives

blauw~naevus                                                           - 5                    -         28
albuminurie                                                            - 6                    -         28
misselijkheid                                                          - 7                    -         28
acute~diarree                                                          - 13                   -         20
klacht                                                                 - C0277786             -         20
zwelling                                                               - 3                    -         20
congenitale~leverfibrose          

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Project', max=1.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='Document', max=892.0, style=ProgressStyle(description_wid…

HBox(children=(FloatProgress(value=0.0, description='Stats project', max=1.0, style=ProgressStyle(description_…

HBox(children=(FloatProgress(value=0.0, description='Stats document', max=97.0, style=ProgressStyle(descriptio…

Epoch: 1, Prec: 0.01904761904761905, Rec: 0.04395604395604396, F1: 0.026578073089701

Docs with false positives: SP1618; SP1648; SP1175; SP1243; SP1830; SP1180; SP1476; SP2092; SP1219; SP1746

Docs with false negatives: SP1618; SP1648; SP1175; SP1243; SP1830; SP1180; SP1476; SP2092; SP1219; SP1746



False Positives

blauw~naevus                                                           - 5                    -         30
misselijkheid                                                          - 7                    -         26
albuminurie                                                            - 6                    -         25
acute~diarree                                                          - 13                   -         20
klacht                                                                 - C0277786             -         20
zwelling                                                               - 3                    -         19
pijn~in~de~nek                         

Epoch: 6 **************************************************  Train
              precision    recall  f1-score   support

           0       0.92      0.86      0.89       391
           1       0.97      0.99      0.98      2061

    accuracy                           0.97      2452
   macro avg       0.95      0.92      0.94      2452
weighted avg       0.97      0.97      0.97      2452

Epoch: 6 **************************************************  Test
              precision    recall  f1-score   support

           0       0.72      0.92      0.81        25
           1       0.99      0.96      0.98       248

    accuracy                           0.96       273
   macro avg       0.86      0.94      0.89       273
weighted avg       0.97      0.96      0.96       273

Train Loss: 0.11033053060377677
Test Loss:  0.16150759496460004






Epoch: 7 **************************************************  Train
              precision    recall  f1-score   support

           0       0.

{'f1': 0.9706959706959707,
 'p': 0.9706959706959707,
 'r': 0.9706959706959707,
 'cls_report': {'0': {'precision': 0.84,
   'recall': 0.84,
   'f1-score': 0.8399999999999999,
   'support': 25},
  '1': {'precision': 0.9838709677419355,
   'recall': 0.9838709677419355,
   'f1-score': 0.9838709677419355,
   'support': 248},
  'accuracy': 0.9706959706959707,
  'macro avg': {'precision': 0.9119354838709677,
   'recall': 0.9119354838709677,
   'f1-score': 0.9119354838709677,
   'support': 273},
  'weighted avg': {'precision': 0.9706959706959707,
   'recall': 0.9706959706959707,
   'f1-score': 0.9706959706959707,
   'support': 273}}}

Discharge letters

In [23]:
# --- Discharge letters ---
# Fresh MedCAT pipeline (same cdb, vocab and config objects) with supervised
# training on the discharge-letter annotations only.
cat_DL = CAT(cdb=cdb, vocab=vocab, config=config)
cat_DL.train_supervised(
    data_path=json_file_DL,
    nepochs=1,
    test_size=0.1,
    use_filters=True,
    reset_cui_count=False,
    print_stats=False,
)

# MetaCAT 'Negation' model trained and evaluated on the discharge letters
# (10 epochs on CPU).
mc_negation_DL = MetaCAT(
    tokenizer=tokenizer,
    embeddings=embeddings,
    pad_id=len(embeddings) - 1,
    save_dir='data/output/mc_negation_DL',
    device='cpu',
)
mc_negation_DL.train(json_file_DL, 'Negation', nepochs=10)

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Project', max=1.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='Document', max=900.0, style=ProgressStyle(description_wid…

Epoch: 0 **************************************************  Train
              precision    recall  f1-score   support

           0       0.81      0.68      0.74       336
           1       0.95      0.98      0.96      2180

    accuracy                           0.94      2516
   macro avg       0.88      0.83      0.85      2516
weighted avg       0.93      0.94      0.93      2516

Epoch: 0 **************************************************  Test
              precision    recall  f1-score   support

           0       0.80      0.95      0.87        43
           1       0.99      0.96      0.97       237

    accuracy                           0.96       280
   macro avg       0.90      0.96      0.92       280
weighted avg       0.96      0.96      0.96       280

Train Loss: 0.22446347778988263
Test Loss:  0.09224599160786186






Model saved at epoch: 0 and f1: 0.9585986928786673
[[ 41   2]
 [ 10 227]]



Epoch: 1 **************************************************  Train

Epoch: 9 **************************************************  Train
              precision    recall  f1-score   support

           0       0.99      0.96      0.97       336
           1       0.99      1.00      1.00      2180

    accuracy                           0.99      2516
   macro avg       0.99      0.98      0.98      2516
weighted avg       0.99      0.99      0.99      2516

Epoch: 9 **************************************************  Test
              precision    recall  f1-score   support

           0       0.97      0.86      0.91        43
           1       0.98      1.00      0.99       237

    accuracy                           0.97       280
   macro avg       0.97      0.93      0.95       280
weighted avg       0.97      0.97      0.97       280

Train Loss: 0.024546770298103284
Test Loss:  0.17368482039351615






Best/Average scores: F1: 0.9818762922309182, P: 0.9820535714285713, R: 0.9821428571428571


{'f1': 0.9818762922309182,
 'p': 0.9820535714285713,
 'r': 0.9821428571428571,
 'cls_report': {'0': {'precision': 0.975,
   'recall': 0.9069767441860465,
   'f1-score': 0.9397590361445783,
   'support': 43},
  '1': {'precision': 0.9833333333333333,
   'recall': 0.9957805907172996,
   'f1-score': 0.989517819706499,
   'support': 237},
  'accuracy': 0.9821428571428571,
  'macro avg': {'precision': 0.9791666666666666,
   'recall': 0.9513786674516731,
   'f1-score': 0.9646384279255387,
   'support': 280},
  'weighted avg': {'precision': 0.9820535714285713,
   'recall': 0.9821428571428571,
   'f1-score': 0.9818762922309182,
   'support': 280}}}

GP entries

In [25]:
# --- GP entries ---
# Fresh MedCAT pipeline (same cdb, vocab and config objects) with supervised
# training on the GP-entry annotations only.
cat_GP = CAT(cdb=cdb, vocab=vocab, config=config)
cat_GP.train_supervised(
    data_path=json_file_GP,
    nepochs=1,
    test_size=0.1,
    use_filters=True,
    reset_cui_count=False,
    print_stats=False,
)

# MetaCAT 'Negation' model trained and evaluated on the GP entries
# (10 epochs on CPU).
mc_negation_GP = MetaCAT(
    tokenizer=tokenizer,
    embeddings=embeddings,
    pad_id=len(embeddings) - 1,
    save_dir='data/output/mc_negation_GP',
    device='cpu',
)
mc_negation_GP.train(json_file_GP, 'Negation', nepochs=10)

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Project', max=1.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='Document', max=1702.0, style=ProgressStyle(description_wi…

The provided entity for cui <2> was empty, nothing to train
The provided entity for cui <13> was empty, nothing to train


Epoch: 0 **************************************************  Train
              precision    recall  f1-score   support

           0       0.73      0.60      0.66       346
           1       0.95      0.97      0.96      2720

    accuracy                           0.93      3066
   macro avg       0.84      0.79      0.81      3066
weighted avg       0.93      0.93      0.93      3066

Epoch: 0 **************************************************  Test
              precision    recall  f1-score   support

           0       0.87      0.92      0.89        37
           1       0.99      0.98      0.99       304

    accuracy                           0.98       341
   macro avg       0.93      0.95      0.94       341
weighted avg       0.98      0.98      0.98       341

Train Loss: 0.19272605841979384
Test Loss:  0.09101145052247578






Model saved at epoch: 0 and f1: 0.9768095655066356
[[ 34   3]
 [  5 299]]



Epoch: 1 **************************************************  Train

{'f1': 0.9852488519500573,
 'p': 0.9852031686510798,
 'r': 0.9853372434017595,
 'cls_report': {'0': {'precision': 0.9444444444444444,
   'recall': 0.918918918918919,
   'f1-score': 0.9315068493150684,
   'support': 37},
  '1': {'precision': 0.9901639344262295,
   'recall': 0.993421052631579,
   'f1-score': 0.9917898193760264,
   'support': 304},
  'accuracy': 0.9853372434017595,
  'macro avg': {'precision': 0.9673041894353369,
   'recall': 0.956169985775249,
   'f1-score': 0.9616483343455474,
   'support': 341},
  'weighted avg': {'precision': 0.9852031686510798,
   'recall': 0.9853372434017595,
   'f1-score': 0.9852488519500573,
   'support': 341}}}