# MetaCAT - Training biLSTM

In [16]:
import numpy as np
from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBPE
from pathlib import Path
from medcat.meta_cat import MetaCAT
import json

In [17]:
# --- Inputs -----------------------------------------------------------------
# Every location is resolved against the parent of the current working
# directory.
project_root = Path.cwd().parents[0]

data_dir = project_root / 'data'
model_dir = project_root / 'models' / 'bilstm'

annotation_file = data_dir / 'emc-dcc_ann.json'
split_list_file = data_dir / 'split_list.json'
embeddings_file = model_dir / 'embeddings.npy'
split_model_dir = model_dir / 'splits'

# --- Outputs ----------------------------------------------------------------
annotations_split_dir = data_dir / 'annotations_split'

# Tokenizer naming convention: the name should contain 'bbpe' for
# ByteLevelBPETokenizer or 'bert' for BertTokenizerFast. The name is saved in
# the model_config dict and subsequently in vars.dat on disk.
# (Not set in this notebook; kept for reference.)
# tokenizer_name = 'bbpe_dutch-wikipedia'

# Make sure the folders that are written to exist. Parent folders are not
# created here, so `data` and `models/bilstm` must already be present.
for output_dir in (annotations_split_dir, split_model_dir):
    output_dir.mkdir(exist_ok=True)

## Load Tokenizer and embeddings matrix

In [None]:
# Load the BPE tokenizer wrapper from the model directory (models/bilstm).
tokenizer = TokenizerWrapperBPE.load(model_dir)

In [22]:
# Read the embeddings matrix from its NumPy .npy file.
# NOTE(review): `embeddings` is not referenced again in the visible cells (the
# MetaCAT constructor further down is called without it) -- confirm whether
# that is intended.
embeddings = np.load(embeddings_file)

## Split annotation file

In [19]:
# Load annotated data.
# The encoding is passed explicitly: without it, open() decodes with the
# platform's locale encoding, which is not UTF-8 on every system and would
# corrupt or reject non-ASCII characters in the JSON text.
with open(annotation_file, encoding='utf-8') as f:
    annotations = json.load(f)

# Load the list of train/test split definitions.
with open(split_list_file, encoding='utf-8') as f:
    split_lists = json.load(f)

# Only the first split definition is used in the cells that follow.
split_list = split_lists[0]


In [20]:
# Distribute the documents of the first project over a train and a test set,
# based on the document names listed in the selected split definition.
documents = annotations['projects'][0]['documents']

# Sets give constant-time membership tests; the name lists would otherwise be
# scanned from the start for every single document.
train_names = set(split_list['train'])
test_names = set(split_list['test'])

train_annotations = [doc for doc in documents if doc['name'] in train_names]
# A name that appears in both lists is treated as training data only.
test_annotations = [
    doc for doc in documents
    if doc['name'] not in train_names and doc['name'] in test_names
]
# Documents whose name is in neither list are left out of both files without
# any message.

# Create an annotation file for the split following MetaCAT's annotation format
project_train_annotations = {'projects': [{'documents': train_annotations}]}
project_test_annotations = {'projects': [{'documents': test_annotations}]}

# Write output files; the split ID is part of each file name.
train_output_file = annotations_split_dir / f'train_annotations_{split_list["split_id"]}.json'
with open(train_output_file, "w", encoding='utf-8') as fp:
    json.dump(project_train_annotations, fp)

test_output_file = annotations_split_dir / f'test_annotations_{split_list["split_id"]}.json'
with open(test_output_file, "w", encoding='utf-8') as fp:
    json.dump(project_test_annotations, fp)

## Train biLSTM from training sets

In [44]:
from medcat.config_meta_cat import ConfigMetaCAT
# Create a MetaCAT configuration object and override two of its settings.
config_metacat = ConfigMetaCAT()
# Category to train on -- presumably this has to match the meta-annotation
# task name used in the annotation files; verify against the data.
config_metacat.general['category_name'] = 'Negation'
# Number of training epochs.
config_metacat.train['nepochs'] = 2

In [45]:
# Create one model directory and one MetaCAT instance per training file.
# The files are visited in sorted order so that the values left behind in
# `train_file`, `split_dir` and `mc_negation` after the loop (which later
# cells rely on) do not depend on the order the file system returns them in.
for train_file in sorted(annotations_split_dir.rglob("train_annotations_*.json")):
    print(train_file)
    # The stem looks like 'train_annotations_<split_id>'. Splitting at most
    # twice keeps a split ID that itself contains underscores in one piece.
    split_id = train_file.stem.split('_', 2)[2]
    split_dir = split_model_dir / split_id
    split_dir.mkdir(exist_ok=True)

    # Initiate MetaCAT
    # NOTE(review): the embeddings matrix loaded earlier in the notebook is
    # not passed in here -- confirm whether that is intended.
    mc_negation = MetaCAT(tokenizer=tokenizer, config=config_metacat)

D:\Repositories\negation-detection\data\annotations_split\train_annotations_0.json


In [46]:
            # Train model
# Train on the training file and save the model into the split directory.
# NOTE(review): `mc_negation`, `train_file` and `split_dir` hold the values
# left over from the last iteration of the preceding loop, so only that one
# split is trained by this cell.
results = mc_negation.train(json_path=train_file, save_dir_path=split_dir)

Epoch: 0 **************************************************  Train
              precision    recall  f1-score   support

           0       0.95      0.98      0.97      8729
           1       0.88      0.71      0.79      1411

    accuracy                           0.95     10140
   macro avg       0.92      0.85      0.88     10140
weighted avg       0.94      0.95      0.94     10140

Epoch: 0 **************************************************  Test
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       965
           1       0.95      0.83      0.89       161

    accuracy                           0.97      1126
   macro avg       0.96      0.91      0.93      1126
weighted avg       0.97      0.97      0.97      1126


##### Model saved to D:\Repositories\negation-detection\models\bilstm\splits\0\model.dat at epoch: 0 and f1: 0.9689596191472376 #####

Epoch: 1 **************************************************  Train
          

In [63]:
# Derive the test-set path from the configured output directory and the split
# ID of the most recently prepared split, rather than hardcoding an absolute
# path that only exists on one machine.
test_file = annotations_split_dir / f'test_annotations_{split_id}.json'
print(test_file)

D:\Repositories\negation-detection\data\annotations_split\test_annotations_0.json


In [68]:
# Evaluate the MetaCAT instance on the test annotation file and keep the
# returned evaluation result for inspection.
result = mc_negation.eval(json_path=test_file)

Epoch: 0 **************************************************  Eval
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1097
           1       0.88      0.90      0.89       188

    accuracy                           0.97      1285
   macro avg       0.93      0.94      0.94      1285
weighted avg       0.97      0.97      0.97      1285



In [71]:
# Number of entries stored under examples -> 'FP' -> 'negated' in the
# evaluation result (presumably the false positives for the 'negated'
# class -- TODO confirm).
len(result['examples']['FP']['negated'])

23