# Evaluation

In [1]:
from pathlib import Path
from evaluation_utils import print_statistics, get_document_text
from sklearn.metrics import precision_recall_fscore_support
import pandas as pd
import json
import numpy as np

data_dir = Path('data')
annotation_file = data_dir / 'emc-dcc_ann.json'
dcc_dir = data_dir / 'EMCDutchClinicalCorpus'
result_dir = Path('results')
bilstm_predictions_file = result_dir / 'bilstm_predictions.csv.gz'
bilstm_predictions_cv_file = result_dir / 'bilstm_predictions_cv.csv.gz'
rule_based_predictions_file = result_dir / 'rule-based_predictions.csv.gz'
robbert_predictions_file = result_dir / 'robbert_predictions.csv.gz'
merged_result_file = result_dir / 'merged_results.csv.gz'

# Load annotated data
with open(annotation_file) as f:
    annotations = json.load(f)

In [2]:
# Load biLSTM results
bilstm_predictions = pd.read_csv(bilstm_predictions_file, sep=',')
print(bilstm_predictions.shape)
bilstm_predictions.head()

(12551, 2)


Unnamed: 0,entity_id,bilstm
0,DL1111_32_46,not negated
1,DL1111_272_280,not negated
2,DL1111_363_377,not negated
3,DL1112_22_28,negated
4,DL1113_59_67,not negated


In [3]:
bilstm_predictions_cv = pd.read_csv(bilstm_predictions_cv_file)
print(bilstm_predictions_cv.shape)
bilstm_predictions_cv.head()

(12551, 2)


Unnamed: 0,entity_id,bilstm_cv
0,DL1111_32_46,not negated
1,DL1111_272_280,not negated
2,DL1111_363_377,not negated
3,DL1116_32_41,not negated
4,DL1116_137_148,not negated


In [4]:
# Load rule based results
ruled_based_predictions = pd.read_csv(rule_based_predictions_file)
ruled_based_predictions.drop(['annotation'], axis=1, inplace=True)
print(ruled_based_predictions.shape)
ruled_based_predictions.head()

(12551, 2)


Unnamed: 0,entity_id,rule_based
0,DL1111_32_46,not negated
1,DL1111_272_280,not negated
2,DL1111_363_377,not negated
3,DL1112_22_28,negated
4,DL1113_59_67,not negated


In [5]:
# Load robbert results
robbert_predictions = pd.read_csv(robbert_predictions_file)
robbert_predictions.drop(['category', 'label', 'bilstm', 'bilstm_cv', 'rule_based'], axis=1, inplace=True)
robbert_predictions.shape
robbert_predictions.head()

Unnamed: 0,entity_id,robbert_512_2,robbert_128_2,robbert_32_2
0,DL1111_32_46,not negated,not negated,not negated
1,DL1111_272_280,not negated,not negated,
2,DL1111_363_377,not negated,not negated,
3,DL1112_22_28,negated,negated,negated
4,DL1113_59_67,not negated,not negated,not negated


In [6]:
# Load annotated data
annotated_records = []
for document in annotations['projects'][0]['documents']:
    document_name = document['name']
    text = document['text']

    for annotation in document['annotations']:

        # Extract data
        start_char = annotation['start']
        end_char = annotation['end']
        negation_value = annotation['meta_anns']['Negation']['value']

        # Create custom ID
        entity_id = f'{document_name}_{start_char}_{end_char}'
        
        # Extract category
        if 'DL' in document_name:
            category = 'DL'
        elif 'GP' in document_name:
            category = 'GP'
        elif 'RD' in document_name:
            category = 'RD'
        else:
            category = 'SP'
        
        # Create row
        annotated_records.append([entity_id, category, negation_value])

annotated_data = pd.DataFrame(annotated_records, columns=['entity_id', 'category', 'label'])
print(annotated_data.shape)
annotated_data.head()

(12551, 3)


Unnamed: 0,entity_id,category,label
0,DL1111_32_46,DL,not negated
1,DL1111_272_280,DL,not negated
2,DL1111_363_377,DL,not negated
3,DL1112_22_28,DL,negated
4,DL1113_59_67,DL,not negated


## Merge annotations from different methods

In [7]:
results = pd.merge(left=annotated_data, right = bilstm_predictions, left_on='entity_id', right_on='entity_id')
results = pd.merge(left=results, right = bilstm_predictions_cv, left_on='entity_id', right_on='entity_id')
results = pd.merge(left=results, right = ruled_based_predictions, left_on='entity_id', right_on='entity_id')
results = pd.merge(left=results, right = robbert_predictions, left_on='entity_id', right_on='entity_id')
results.to_csv(merged_result_file, index=False, compression='gzip', line_terminator='\n')
print(results.shape)
results.head(5)

(12551, 9)


Unnamed: 0,entity_id,category,label,bilstm,bilstm_cv,rule_based,robbert_512_2,robbert_128_2,robbert_32_2
0,DL1111_32_46,DL,not negated,not negated,not negated,not negated,not negated,not negated,not negated
1,DL1111_272_280,DL,not negated,not negated,not negated,not negated,not negated,not negated,
2,DL1111_363_377,DL,not negated,not negated,not negated,not negated,not negated,not negated,
3,DL1112_22_28,DL,negated,negated,negated,negated,negated,negated,negated
4,DL1113_59_67,DL,not negated,not negated,not negated,not negated,not negated,not negated,not negated


## Compare different ways of calculating scores

In [8]:
def create_score_record(prediction_method, average, pos_label=1, category=None):
    if category is not None:
        subset = results.loc[results.category == 'DL', ['label', prediction_method]].dropna()

        return [average, category, prediction_method] + (list(precision_recall_fscore_support(subset.label, subset[prediction_method], labels=['negated', 'not negated'], average = average, pos_label=pos_label))) 
    subset = results[['label', prediction_method]].dropna()
    return [average, prediction_method] + (list(precision_recall_fscore_support(subset.label, subset[prediction_method], labels=['negated', 'not negated'], average = average, pos_label=pos_label))) 

score_list = []
score_list.append(create_score_record('bilstm_cv', 'binary', pos_label='negated'))
score_list.append(create_score_record('bilstm_cv', 'micro'))
score_list.append(create_score_record('bilstm_cv', 'macro'))
score_list.append(create_score_record('bilstm_cv', 'weighted'))
scores = pd.DataFrame(score_list, columns=['average', 'prediction_method', 'precision', 'recall', 'f1', 'support'])
scores.drop(['support'], axis=1, inplace=True)
scores

Unnamed: 0,average,prediction_method,precision,recall,f1
0,binary,bilstm_cv,0.925059,0.897727,0.911188
1,micro,bilstm_cv,0.97546,0.97546,0.97546
2,macro,bilstm_cv,0.954229,0.942933,0.948476
3,weighted,bilstm_cv,0.975218,0.97546,0.975306


## Scores per category (average: binary)

In [9]:
score_list = []
score_list.append(create_score_record('rule_based', 'binary', pos_label='negated', category = 'DL'))
score_list.append(create_score_record('bilstm_cv', 'binary', pos_label='negated', category = 'DL'))
score_list.append(create_score_record('robbert_512_2', 'binary', pos_label='negated', category = 'DL'))
score_list.append(create_score_record('rule_based', 'binary', pos_label='negated', category = 'GP'))
score_list.append(create_score_record('bilstm_cv', 'binary', pos_label='negated', category = 'GP'))
score_list.append(create_score_record('robbert_512_2', 'binary', pos_label='negated', category = 'GP'))
score_list.append(create_score_record('rule_based', 'binary', pos_label='negated', category = 'RD'))
score_list.append(create_score_record('bilstm_cv', 'binary', pos_label='negated', category = 'RD'))
score_list.append(create_score_record('robbert_512_2', 'binary', pos_label='negated', category = 'RD'))
score_list.append(create_score_record('rule_based', 'binary', pos_label='negated', category = 'SP'))
score_list.append(create_score_record('bilstm_cv', 'binary', pos_label='negated', category = 'SP'))
score_list.append(create_score_record('robbert_512_2', 'binary', pos_label='negated', category = 'SP'))
scores_binary = pd.DataFrame(score_list, columns=['average', 'category', 'prediction_method', 'precision', 'recall', 'f1', 'support'])
scores_binary.drop(['support'], axis=1, inplace=True)
scores_binary

Unnamed: 0,average,category,prediction_method,precision,recall,f1
0,binary,DL,rule_based,0.892583,0.920844,0.906494
1,binary,DL,bilstm_cv,0.95664,0.931398,0.94385
2,binary,DL,robbert_512_2,0.955381,0.970667,0.962963
3,binary,GP,rule_based,0.892583,0.920844,0.906494
4,binary,GP,bilstm_cv,0.95664,0.931398,0.94385
5,binary,GP,robbert_512_2,0.955381,0.970667,0.962963
6,binary,RD,rule_based,0.892583,0.920844,0.906494
7,binary,RD,bilstm_cv,0.95664,0.931398,0.94385
8,binary,RD,robbert_512_2,0.955381,0.970667,0.962963
9,binary,SP,rule_based,0.892583,0.920844,0.906494


## Scores per category (average: micro)

In [10]:
score_list = []
score_list.append(create_score_record('rule_based', 'micro', category = 'DL'))
score_list.append(create_score_record('bilstm_cv', 'micro', category = 'DL'))
score_list.append(create_score_record('robbert_512_2', 'micro', category = 'DL'))
score_list.append(create_score_record('rule_based', 'micro', category = 'GP'))
score_list.append(create_score_record('bilstm_cv', 'micro', category = 'GP'))
score_list.append(create_score_record('robbert_512_2', 'micro', category = 'GP'))
score_list.append(create_score_record('rule_based', 'micro', category = 'RD'))
score_list.append(create_score_record('bilstm_cv', 'micro', category = 'RD'))
score_list.append(create_score_record('robbert_512_2', 'micro', category = 'RD'))
score_list.append(create_score_record('rule_based', 'micro', category = 'SP'))
score_list.append(create_score_record('bilstm_cv', 'micro', category = 'SP'))
score_list.append(create_score_record('robbert_512_2', 'micro', category = 'SP'))
scores_micro = pd.DataFrame(score_list, columns=['average', 'category', 'prediction_method', 'precision', 'recall', 'f1', 'support'])
scores_micro.drop(['support'], axis=1, inplace=True)
scores_micro

Unnamed: 0,average,category,prediction_method,precision,recall,f1
0,micro,DL,rule_based,0.974249,0.974249,0.974249
1,micro,DL,bilstm_cv,0.984979,0.984979,0.984979
2,micro,DL,robbert_512_2,0.98935,0.98935,0.98935
3,micro,GP,rule_based,0.974249,0.974249,0.974249
4,micro,GP,bilstm_cv,0.984979,0.984979,0.984979
5,micro,GP,robbert_512_2,0.98935,0.98935,0.98935
6,micro,RD,rule_based,0.974249,0.974249,0.974249
7,micro,RD,bilstm_cv,0.984979,0.984979,0.984979
8,micro,RD,robbert_512_2,0.98935,0.98935,0.98935
9,micro,SP,rule_based,0.974249,0.974249,0.974249


## Scores per category (average: weighted)

In [11]:
score_list = []
score_list.append(create_score_record('rule_based', 'weighted', category = 'DL'))
score_list.append(create_score_record('bilstm_cv', 'weighted', category = 'DL'))
score_list.append(create_score_record('robbert_512_2', 'weighted', category = 'DL'))
score_list.append(create_score_record('rule_based', 'weighted', category = 'GP'))
score_list.append(create_score_record('bilstm_cv', 'weighted', category = 'GP'))
score_list.append(create_score_record('robbert_512_2', 'weighted', category = 'GP'))
score_list.append(create_score_record('rule_based', 'weighted', category = 'RD'))
score_list.append(create_score_record('bilstm_cv', 'weighted', category = 'RD'))
score_list.append(create_score_record('robbert_512_2', 'weighted', category = 'RD'))
score_list.append(create_score_record('rule_based', 'weighted', category = 'SP'))
score_list.append(create_score_record('bilstm_cv', 'weighted', category = 'SP'))
score_list.append(create_score_record('robbert_512_2', 'weighted', category = 'SP'))
scores_micro = pd.DataFrame(score_list, columns=['average', 'category', 'prediction_method', 'precision', 'recall', 'f1', 'support'])
scores_micro.drop(['support'], axis=1, inplace=True)
scores_micro

Unnamed: 0,average,category,prediction_method,precision,recall,f1
0,weighted,DL,rule_based,0.974656,0.974249,0.974418
1,weighted,DL,bilstm_cv,0.984862,0.984979,0.984894
2,weighted,DL,robbert_512_2,0.98944,0.98935,0.989385
3,weighted,GP,rule_based,0.974656,0.974249,0.974418
4,weighted,GP,bilstm_cv,0.984862,0.984979,0.984894
5,weighted,GP,robbert_512_2,0.98944,0.98935,0.989385
6,weighted,RD,rule_based,0.974656,0.974249,0.974418
7,weighted,RD,bilstm_cv,0.984862,0.984979,0.984894
8,weighted,RD,robbert_512_2,0.98944,0.98935,0.989385
9,weighted,SP,rule_based,0.974656,0.974249,0.974418


## All scores (average: weighted)

In [12]:
score_list = []
score_list.append(create_score_record('rule_based', 'weighted'))
score_list.append(create_score_record('bilstm_cv', 'weighted'))
score_list.append(create_score_record('robbert_512_2', 'weighted'))
scores = pd.DataFrame(score_list, columns=['average', 'prediction_method', 'precision', 'recall', 'f1', 'support'])
scores.drop(['support'], axis=1, inplace=True)
scores

Unnamed: 0,average,prediction_method,precision,recall,f1
0,weighted,rule_based,0.959842,0.958011,0.9587
1,weighted,bilstm_cv,0.975218,0.97546,0.975306
2,weighted,robbert_512_2,0.982006,0.982138,0.982045
