# MetaCAT - Training biLSTM

In [1]:
import numpy as np
from tokenizers import ByteLevelBPETokenizer
from pathlib import Path
from medcat.meta_cat import MetaCAT
from medcat.preprocessing.tokenizers import TokenizerWrapperBPE

In [2]:
# Project root: one level above the current working directory.
# Both the input data and the training output live underneath it.
project_root = Path.cwd().parents[0]

# Input: annotated documents used to train the meta-annotation model.
data_dir = project_root.joinpath('data')
annotation_file = data_dir.joinpath('emc-dcc_ann.json')

# Output: directory for the biLSTM artefacts and the embeddings matrix file.
output_dir = project_root.joinpath('output', 'bilstm')
embeddings_file = output_dir.joinpath('embeddings.npy')

# Hardware used for training: either 'cpu' or 'cuda'.
device = 'cuda'

# The name should contain 'bbpe' for ByteLevelBPETokenizer or 'bert' for BertTokenizerFast.
# It is stored in the model_config dict and subsequently in vars.dat on disk.
tokenizer_name = 'bbpe_dutch-wikipedia'

## Load Tokenizer and embeddings matrix

In [3]:
# Restore the BPE tokenizer wrapper stored in `output_dir` under the configured tokenizer name.
tokenizer = TokenizerWrapperBPE.load(output_dir, tokenizer_name)
# Read the embeddings matrix from the .npy file in the output directory.
embeddings = np.load(embeddings_file)

## Train biLSTM

In [4]:
# Initialise MetaCAT with the tokenizer and embeddings loaded earlier.
# `MetaCAT` is already imported in the notebook's import cell, so it is not
# re-imported here.
mc_negation = MetaCAT(tokenizer=tokenizer,
                      embeddings=embeddings,
                      # The index of the last embedding row is used as the padding id.
                      pad_id=len(embeddings)-1,
                      save_dir=output_dir,
                      # Use the hardware selected in the configuration cell instead of
                      # a hard-coded 'cuda', so switching to 'cpu' there takes effect.
                      device=device)

# Train the model on the annotation file for the 'Negation' task
# (presumably the meta-annotation category name in the annotations — confirm).
# The tokenizer name is passed along via model_config so it is stored with the model.
results = mc_negation.train(annotation_file,
                            'Negation',
                            nepochs=10,
                            model_config={'tokenizer_name': tokenizer_name})

Epoch: 0 **************************************************  Train
              precision    recall  f1-score   support

           0       0.95      0.98      0.97      9758
           1       0.88      0.71      0.78      1591

    accuracy                           0.95     11349
   macro avg       0.92      0.85      0.88     11349
weighted avg       0.94      0.95      0.94     11349

Epoch: 0 **************************************************  Test
              precision    recall  f1-score   support

           0       0.96      0.99      0.97      1080
           1       0.90      0.77      0.83       182

    accuracy                           0.95      1262
   macro avg       0.93      0.88      0.90      1262
weighted avg       0.95      0.95      0.95      1262

Train Loss: 0.1902009417814478
Test Loss:  0.15577004005899653






Model saved at epoch: 0 and f1: 0.9534265586727555
[[1064   16]
 [  41  141]]



Epoch: 1 **************************************************  Tr

Epoch: 9 **************************************************  Train
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      9758
           1       0.93      0.84      0.88      1591

    accuracy                           0.97     11349
   macro avg       0.95      0.91      0.93     11349
weighted avg       0.97      0.97      0.97     11349

Epoch: 9 **************************************************  Test
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1080
           1       0.90      0.80      0.85       182

    accuracy                           0.96      1262
   macro avg       0.93      0.89      0.91      1262
weighted avg       0.96      0.96      0.96      1262

Train Loss: 0.10970763944279613
Test Loss:  0.16321234562201425






Best/Average scores: F1: 0.9612874872339285, P: 0.9623184031021568, R: 0.9627575277337559


## Save model

In [6]:
# Save model config
# NOTE(review): full_save=False presumably limits what is written to the save
# directory (e.g. skipping the tokenizer/embeddings) — confirm against MetaCAT.save.
mc_negation.save(full_save=False)