# Training and testing using cross-validation
This notebook uses predefined subsets of examples to train and test models.

In [None]:
import json
import numpy as np
import pandas as pd
from utils import evaluate_per_example
from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBPE
from medcat.config_meta_cat import ConfigMetaCAT
from medcat.meta_cat import MetaCAT
from pathlib import Path

In [None]:
# Configure MetaCAT
CLASS = 'Temporality'  # meta-annotation category the models are trained for
AVG = 'micro' # binary or micro
config_metacat = ConfigMetaCAT()
config_metacat.general['category_name'] = CLASS
config_metacat.train['nepochs'] = 10
config_metacat.train['score_average'] = AVG
config_metacat.model['n_classes'] = 3

# Input (all paths are relative to the parent of the current working directory)
data_dir = Path.cwd().parents[0] / 'data'
annotation_file = data_dir / 'emc-dcc_ann_Augmented.json'
model_dir = Path.cwd().parents[0] / 'models' / 'bilstm'
embeddings_file = model_dir / 'embeddings.npy'

# Output
annotations_split_dir = data_dir / 'annotations_split'
models_split_dir = model_dir / 'model_splits' / CLASS
result_dir = Path.cwd().parents[0] / 'results'
score_result_file = result_dir / 'bilstm_scores_cv_augmented.csv.gz'
predictions_result_file = result_dir / 'bilstm_predictions_cv_augmented.csv.gz'

# Create output dirs.
# `parents=True` is required because `model_splits/<CLASS>` is nested two levels
# below `model_dir`; without it mkdir raises FileNotFoundError on a fresh checkout.
# `result_dir` is created as well, since both result CSVs are written into it.
annotations_split_dir.mkdir(parents=True, exist_ok=True)
models_split_dir.mkdir(parents=True, exist_ok=True)
result_dir.mkdir(parents=True, exist_ok=True)

# Number of cross-validation folds
n_folds = 10
# NOTE(review): `group` is not referenced by any other cell visible in this notebook.
group = 'name'


## Load tokenizer and embeddings matrix
Load a project-wide tokenizer and embeddings matrix which are created in `01_tokenizer_embeddings.ipynb`.

In [None]:
# Embeddings matrix stored as a NumPy array next to the model files
embeddings = np.load(embeddings_file)

# Project-wide BPE tokenizer saved in the model directory
tokenizer = TokenizerWrapperBPE.load(model_dir)

## Extract folds

In [None]:
# Read the full annotation file; the context manager closes the file handle
# as soon as the JSON has been parsed instead of leaving it to the garbage collector.
with open(annotation_file, 'r') as annotation_handle:
    annotations = json.load(annotation_handle)

In [None]:
# Document names of the first project, in file order
names = [document['name'] for document in annotations['projects'][0]['documents']]

# Grouping key per document: the part of the name before the first "|"
groups = [document_name.partition("|")[0] for document_name in names]

In [None]:
from sklearn.model_selection import GroupKFold

In [None]:
# Document texts, collected in the same order as `names` and `groups`
Texts = [document['text'] for document in annotations['projects'][0]['documents']]

# K-fold splitter that takes the group of each document into account
Splitter = GroupKFold(n_splits=n_folds)

# Display the number of splits the splitter will produce
Splitter.get_n_splits(Texts)

In [None]:
# Write one train and one test annotation file (MedCAT Trainer layout) per fold.
all_documents = annotations['projects'][0]['documents']

for fold_id, (train_index, test_index) in enumerate(Splitter.split(Texts, groups=groups)):
    # Collect train and test data; a fresh dict is built per fold so no state is
    # shared between iterations, and the comprehension variable no longer shadows
    # the fold counter.
    annotations_train = {'projects': [{'name': 'emc-dcc-synthAug',
                                       'documents': [all_documents[doc_idx] for doc_idx in train_index]}]}
    annotations_test = {'projects': [{'name': 'emc-dcc-synthAug',
                                      'documents': [all_documents[doc_idx] for doc_idx in test_index]}]}

    # Write train and test data to file; the context managers guarantee the
    # files are flushed and closed before the training cells read them back.
    fname_train = annotations_split_dir / f'train_annotations_{fold_id}.json'
    fname_test = annotations_split_dir / f'test_annotations_{fold_id}.json'
    with open(fname_train, 'w') as train_handle:
        json.dump(annotations_train, train_handle)
    with open(fname_test, 'w') as test_handle:
        json.dump(annotations_test, test_handle)

## Train and test on folds
Per fold, a MetaCAT model is trained and tested. Testing is done using MetaCAT's eval() function, which evaluates the model on a test set and returns a dictionary with scores and examples. It does not include the example ID, which we need to compare examples between different methods. Therefore we use a different evaluation function later in this notebook.

In [None]:
# Debug inspection of the false-negative examples for the 'hypothetical' label.
# `test_results` is only assigned by the evaluation cells further down, so on a
# fresh kernel (Restart & Run All) this cell used to raise a NameError. The guard
# makes the cell a no-op until an evaluation has been run.
test_results['examples']['FN']['hypothetical'] if 'test_results' in globals() else None

In [None]:
# List to store results of individual folds
score_result_list = []

# Label whose TP/FP/FN examples are counted. The notebook is configured for the
# 'Temporality' category and the test-only cell counts 'hypothetical', so the same
# label is used here (the previous 'negated' never matched and gave zero counts).
target_label = 'hypothetical'

# Sorted so folds are processed in a reproducible order, independent of the filesystem.
for train_file in sorted(annotations_split_dir.rglob("train_annotations_*.json")):
    print(train_file)
    # File names look like `train_annotations_<split_id>.json`
    split_id = train_file.stem.split('_')[2]
    split_id_dir = models_split_dir / split_id
    split_id_dir.mkdir(parents=True, exist_ok=True)

    # Initiate MetaCAT
    meta_cat = MetaCAT(tokenizer=tokenizer, embeddings=embeddings, config=config_metacat)

    # Train model; the trained model is saved in the fold's own directory
    train_results = meta_cat.train_from_json(json_path=str(train_file), save_dir_path=str(split_id_dir))

    # Evaluate using MetaCAT's eval function on the matching test file
    test_file = train_file.parent / train_file.name.replace('train_annotations_', 'test_annotations_')
    test_results = meta_cat.eval(json_path=str(test_file))

    # Count positives and negatives; an outcome without examples for the label
    # has no entry, which counts as zero.
    examples = test_results['examples']
    tp = len(examples['TP'].get(target_label, []))
    fp = len(examples['FP'].get(target_label, []))
    fn = len(examples['FN'].get(target_label, []))

    # Save test results
    score_result_list.append([split_id,
                              round(test_results['f1'], 2),
                              round(test_results['precision'], 2),
                              round(test_results['recall'], 2),
                              tp,
                              fp,
                              fn])

## Test on folds
Use this cell to test on the test data if the models are already created.

In [None]:
# List to store results of individual folds
score_result_list = []

# Label whose TP/FP/FN examples are counted per fold
target_label = 'hypothetical'

# Sorted so folds are processed in a reproducible order, independent of the filesystem.
for test_file in sorted(annotations_split_dir.rglob("test_annotations_*.json")):
    # File names look like `test_annotations_<split_id>.json`
    split_id = test_file.stem.split('_')[2]
    split_id_dir = models_split_dir / split_id

    # Load biLSTM
    meta_cat = MetaCAT.load(str(split_id_dir))

    # Evaluate using MetaCAT's eval function
    test_results = meta_cat.eval(json_path=str(test_file))

    # A fold may have no TP/FP/FN examples for the label; treat a missing entry
    # as zero instead of raising a KeyError (same handling as the training cell).
    examples = test_results['examples']
    tp = len(examples['TP'].get(target_label, []))
    fp = len(examples['FP'].get(target_label, []))
    fn = len(examples['FN'].get(target_label, []))

    # Save test results
    score_result_list.append([split_id,
                              round(test_results['f1'], 2),
                              round(test_results['precision'], 2),
                              round(test_results['recall'], 2),
                              tp,
                              fp,
                              fn])

## Gather scores from folds
In this section, results are gathered from the folds and saved in a single CSV.

Currently, recall and precision are not returned by MetaCAT's eval() function. A future release will add this functionality (https://github.com/CogStack/MedCAT/pull/172).

In [None]:
def calculate_recall(row):
    """Compute recall from the `tp` and `fn` counts of a score row.

    Args:
        row: Object exposing numeric `tp` and `fn` attributes
            (e.g. a row passed by `DataFrame.apply(..., axis=1)`).

    Returns:
        tp / (tp + fn) rounded to 2 decimals, or 0.0 when there are no
        positive examples (tp + fn == 0), which would otherwise divide by zero.
    """
    positives = row.tp + row.fn
    if positives == 0:
        return 0.0
    return round(row.tp / positives, 2)

def calculate_precision(row):
    """Compute precision from the `tp` and `fp` counts of a score row.

    Args:
        row: Object exposing numeric `tp` and `fp` attributes
            (e.g. a row passed by `DataFrame.apply(..., axis=1)`).

    Returns:
        tp / (tp + fp) rounded to 2 decimals, or 0.0 when nothing was
        predicted positive (tp + fp == 0), which would otherwise divide by zero.
    """
    predicted_positives = row.tp + row.fp
    if predicted_positives == 0:
        return 0.0
    return round(row.tp / predicted_positives, 2)

def calculate_f1(row):
    """Compute the F1 score from the `tp`, `fp` and `fn` counts of a score row.

    Args:
        row: Object exposing numeric `tp`, `fp` and `fn` attributes
            (e.g. a row passed by `DataFrame.apply(..., axis=1)`).

    Returns:
        2*tp / (2*tp + fp + fn) rounded to 2 decimals, or 0.0 when all three
        counts are zero, which would otherwise divide by zero.
    """
    denominator = (2 * row.tp) + row.fp + row.fn
    if denominator == 0:
        return 0.0
    return round((2 * row.tp) / denominator, 2)

# Assemble the per-fold scores into one table
score_columns = ['split_id', 'weighted_f1', 'weighted_precision', 'weighted_recall', 'tp', 'fp', 'fn']
score_results = pd.DataFrame(score_result_list, columns=score_columns)

# Add the metrics derived from the raw tp/fp/fn counts, one column per metric
manual_metrics = (
    ('manual_recall', calculate_recall),
    ('manual_precision', calculate_precision),
    ('manual_f1', calculate_f1),
)
for column_name, metric_function in manual_metrics:
    score_results[column_name] = score_results.apply(metric_function, axis=1)

# Persist the table as a gzip-compressed CSV and display it
score_results.to_csv(score_result_file, index=False, compression='gzip')
score_results

## Custom evaluation per example per fold
In this project we are interested per example whether a negation has been correctly predicted or not. MetaCAT does not have such functionality; it only returns scores, predictions and examples.

In this section we iterate through all annotations from an annotation file (MedCAT Trainer format), create an ID for every example (`exampleID = documentID_start_end`), collect the prediction per example and save all predictions in a CSV.

In [None]:
# Evaluate models on their respective test sets
predictions_on_test_list = []
# Sorted so folds are processed in a reproducible order, independent of the filesystem.
for annotation_filename in sorted(annotations_split_dir.rglob("test_annotations_*.json")):

    # Extract split ID (file names look like `test_annotations_<split_id>.json`)
    split_id = annotation_filename.stem.split('_')[2]
    split_id_dir = models_split_dir / split_id
    print(f'Evaluating test set {split_id}')

    # Load MetaCAT model
    meta_cat = MetaCAT.load(str(split_id_dir))

    # Gather the predictions on every example in the provided annotation file.
    predictions_on_test_list.append(evaluate_per_example(annotation_filename, meta_cat, 'bilstm_cv'))

# Save all predictions in a single dataframe.
# One `pd.concat` replaces the row-by-row `DataFrame.append`, which copied the
# frame on every iteration and was removed in pandas 2.0.
# NOTE(review): assumes `evaluate_per_example` returns a DataFrame per fold — confirm in utils.
if predictions_on_test_list:
    predictions_on_test_df = pd.concat(predictions_on_test_list, ignore_index=True)
else:
    # No test files found: keep the empty frame with the expected columns
    predictions_on_test_df = pd.DataFrame(columns=['entity_id', 'bilstm_cv'])

# Save predictions in a csv
# NOTE: pandas 1.5 renamed `line_terminator` to `lineterminator` and pandas 2.0
# removed the old spelling; switch the keyword when upgrading pandas.
predictions_on_test_df.to_csv(predictions_result_file, index=False, compression='gzip', line_terminator='\n')
predictions_on_test_df