# Evaluation

In [1]:
from pathlib import Path
from evaluation_utils import print_statistics, get_document_text
from sklearn.metrics import precision_recall_fscore_support
import pandas as pd
import json
import numpy as np

# Document categories present in the DCC dataset
DOCUMENT_CATEGORIES = ['DL', 'GP', 'RD', 'SP']

# Prediction methods compared in this project
PREDICTION_METHODS = ['rule_based', 'bilstm_cv', 'robbert_512_2']

# Averaging strategies passed to precision_recall_fscore_support, see
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html
AVERAGE_METHODS = ['binary', 'micro', 'macro', 'weighted']

# Input data: annotation export and the corpus directory
data_dir = Path('data')
annotation_file = data_dir.joinpath('emc-dcc_ann.json')
dcc_dir = data_dir.joinpath('EMCDutchClinicalCorpus')

# Per-method prediction files and the merged result file
result_dir = Path('results')
bilstm_predictions_file = result_dir.joinpath('bilstm_predictions.csv.gz')
bilstm_predictions_cv_file = result_dir.joinpath('bilstm_predictions_cv.csv.gz')
rule_based_predictions_file = result_dir.joinpath('rule-based_predictions.csv.gz')
robbert_predictions_file = result_dir.joinpath('robbert_predictions.csv.gz')
merged_result_file = result_dir.joinpath('merged_results.csv.gz')

# Load annotated data.
# The encoding is given explicitly: without it open() falls back to the
# platform locale (e.g. cp1252 on Windows), which can mis-decode non-ASCII
# characters in the annotated text. JSON interchange files are UTF-8.
with open(annotation_file, encoding='utf-8') as f:
    annotations = json.load(f)

In [2]:
# Load the biLSTM (complete model) predictions.
# A comma is read_csv's default separator, so it is not passed explicitly.
bilstm_predictions = pd.read_csv(bilstm_predictions_file)
print(bilstm_predictions.shape)
bilstm_predictions.head(5)

(12551, 2)


Unnamed: 0,entity_id,bilstm
0,DL1111_32_46,not negated
1,DL1111_272_280,not negated
2,DL1111_363_377,not negated
3,DL1112_22_28,negated
4,DL1113_59_67,not negated


In [3]:
# Load the biLSTM (cross validation) predictions
bilstm_predictions_cv = pd.read_csv(filepath_or_buffer=bilstm_predictions_cv_file)
print(bilstm_predictions_cv.shape)
bilstm_predictions_cv.head(5)

(12551, 2)


Unnamed: 0,entity_id,bilstm_cv
0,DL1111_32_46,not negated
1,DL1111_272_280,not negated
2,DL1111_363_377,not negated
3,DL1116_32_41,not negated
4,DL1116_137_148,not negated


In [4]:
# Load the rule-based predictions, keeping everything except the
# 'annotation' column that is stored in the same file.
ruled_based_predictions = (
    pd.read_csv(rule_based_predictions_file)
    .drop(columns=['annotation'])
)
print(ruled_based_predictions.shape)
ruled_based_predictions.head()

(12551, 2)


Unnamed: 0,entity_id,rule_based
0,DL1111_32_46,not negated
1,DL1111_272_280,not negated
2,DL1111_363_377,not negated
3,DL1112_22_28,negated
4,DL1113_59_67,not negated


In [5]:
# Load RobBERT predictions.
# The file also carries the label/category columns and the predictions of the
# other methods; those columns are dropped here so that only entity_id and the
# robbert_* columns remain.
robbert_predictions = (
    pd.read_csv(robbert_predictions_file)
    .drop(columns=['category', 'label', 'bilstm', 'bilstm_cv', 'rule_based'])
)
# A bare `.shape` expression is only displayed when it is the last line of a
# cell, so it has to be printed explicitly (as the other loading cells do).
print(robbert_predictions.shape)
robbert_predictions.head()

Unnamed: 0,entity_id,robbert_512_2,robbert_128_2,robbert_32_2
0,DL1111_32_46,not negated,not negated,not negated
1,DL1111_272_280,not negated,not negated,
2,DL1111_363_377,not negated,not negated,
3,DL1112_22_28,negated,negated,negated
4,DL1113_59_67,not negated,not negated,not negated


In [6]:
# Build one record per annotated entity: [entity_id, category, label].
# Only the first project in the annotation export is read.
annotated_records = []
for document in annotations['projects'][0]['documents']:
    document_name = document['name']

    # The category depends only on the document, so it is determined once per
    # document: the first category code contained in the document name wins,
    # and names without a known code fall back to 'SP'.
    category = next(
        (code for code in DOCUMENT_CATEGORIES if code in document_name),
        'SP',
    )

    for annotation in document['annotations']:

        # Extract data
        start_char = annotation['start']
        end_char = annotation['end']
        negation_value = annotation['meta_anns']['Negation']['value']

        # Create custom ID: <document name>_<start offset>_<end offset>
        entity_id = f'{document_name}_{start_char}_{end_char}'

        # Create row
        annotated_records.append([entity_id, category, negation_value])

annotated_data = pd.DataFrame(annotated_records, columns=['entity_id', 'category', 'label'])
print(annotated_data.shape)
annotated_data.head()

(12551, 3)


Unnamed: 0,entity_id,category,label
0,DL1111_32_46,DL,not negated
1,DL1111_272_280,DL,not negated
2,DL1111_363_377,DL,not negated
3,DL1112_22_28,DL,negated
4,DL1113_59_67,DL,not negated


## Merge annotations from different methods

In [7]:
# Join every prediction table onto the annotated data via the shared entity ID.
# pd.merge defaults to an inner join, so an entity that is missing from any of
# the prediction tables is dropped from `results`.
results = annotated_data
for predictions in (
    bilstm_predictions,
    bilstm_predictions_cv,
    ruled_based_predictions,
    robbert_predictions,
):
    results = pd.merge(left=results, right=predictions, on='entity_id')

# `line_terminator` was renamed to `lineterminator` in pandas 1.5 and the old
# spelling was removed in pandas 2.0 (it raises TypeError there).
# This call therefore requires pandas >= 1.5.
results.to_csv(merged_result_file, index=False, compression='gzip', lineterminator='\n')
print(results.shape)
results.head(5)

(12551, 9)


Unnamed: 0,entity_id,category,label,bilstm,bilstm_cv,rule_based,robbert_512_2,robbert_128_2,robbert_32_2
0,DL1111_32_46,DL,not negated,not negated,not negated,not negated,not negated,not negated,not negated
1,DL1111_272_280,DL,not negated,not negated,not negated,not negated,not negated,not negated,
2,DL1111_363_377,DL,not negated,not negated,not negated,not negated,not negated,not negated,
3,DL1112_22_28,DL,negated,negated,negated,negated,negated,negated,negated
4,DL1113_59_67,DL,not negated,not negated,not negated,not negated,not negated,not negated,not negated


## Function to calculate scores

In [8]:
def create_score_record(prediction_method, average_method, category='all'):
    """
    Score one prediction method against the labels in the global `results` frame.

    Thin wrapper around precision_recall_fscore_support(), see
    https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html

    Parameters:
        prediction_method: name of the column in `results` holding the predictions.
        average_method: value passed on as `average`.
        category: document category to restrict the rows to, or 'all' for every row.

    Returns:
        List [average_method, category, prediction_method, precision, recall,
        f1, support] that can be used as a row in a DataFrame.
    """
    # 'negated' is the positive class for binary averaging; for the other
    # averaging methods the value 1 is passed.
    pos_label = 'negated' if average_method == 'binary' else 1

    columns = ['label', prediction_method]
    if category == 'all':
        rows = results[columns]
    else:
        rows = results.loc[results.category == category, columns]

    # Using dropna because our robbert implementation does not work for some examples.
    subset = rows.dropna()

    scores = precision_recall_fscore_support(
        subset.label,
        subset[prediction_method],
        labels=['negated', 'not negated'],
        average=average_method,
        pos_label=pos_label,
    )
    return [average_method, category, prediction_method] + list(scores)

def create_score_dataframe(score_list):
    """
    Turn a list of score records into a DataFrame.

    Each record holds seven values: average_method, category,
    prediction_method, precision, recall, f1 and support. The 'support'
    column is removed from the returned frame.
    """
    column_names = [
        'average_method', 'category', 'prediction_method',
        'precision', 'recall', 'f1', 'support',
    ]
    return pd.DataFrame(score_list, columns=column_names).drop(columns=['support'])

## Compare different ways of calculating scores

In [9]:
# Score a single method on all categories under every averaging method
prediction_method = 'bilstm_cv'
score_list = [
    create_score_record(prediction_method, average_method, category='all')
    for average_method in AVERAGE_METHODS
]
create_score_dataframe(score_list)

Unnamed: 0,average_method,category,prediction_method,precision,recall,f1
0,binary,all,bilstm_cv,0.925059,0.897727,0.911188
1,micro,all,bilstm_cv,0.97546,0.97546,0.97546
2,macro,all,bilstm_cv,0.954229,0.942933,0.948476
3,weighted,all,bilstm_cv,0.975218,0.97546,0.975306


## Scores per category (average: binary)

In [10]:
# One row per (category, prediction method) pair, binary averaging
average_method = 'binary'
score_list = [
    create_score_record(prediction_method, average_method, category=category)
    for category in DOCUMENT_CATEGORIES
    for prediction_method in PREDICTION_METHODS
]
create_score_dataframe(score_list)

Unnamed: 0,average_method,category,prediction_method,precision,recall,f1
0,binary,DL,rule_based,0.892583,0.920844,0.906494
1,binary,DL,bilstm_cv,0.95664,0.931398,0.94385
2,binary,DL,robbert_512_2,0.955381,0.970667,0.962963
3,binary,GP,rule_based,0.671024,0.804178,0.731591
4,binary,GP,bilstm_cv,0.885117,0.885117,0.885117
5,binary,GP,robbert_512_2,0.94586,0.883929,0.913846
6,binary,RD,rule_based,0.901254,0.966387,0.932685
7,binary,RD,bilstm_cv,0.932773,0.932773,0.932773
8,binary,RD,robbert_512_2,0.963287,0.958261,0.960767
9,binary,SP,rule_based,0.808153,0.836228,0.821951


## Scores per category (average: micro)

In [11]:
# One row per (category, prediction method) pair, micro averaging
average_method = 'micro'
score_list = [
    create_score_record(prediction_method, average_method, category=category)
    for category in DOCUMENT_CATEGORIES
    for prediction_method in PREDICTION_METHODS
]
create_score_dataframe(score_list)

Unnamed: 0,average_method,category,prediction_method,precision,recall,f1
0,micro,DL,rule_based,0.974249,0.974249,0.974249
1,micro,DL,bilstm_cv,0.984979,0.984979,0.984979
2,micro,DL,robbert_512_2,0.98935,0.98935,0.98935
3,micro,GP,rule_based,0.933451,0.933451,0.933451
4,micro,GP,bilstm_cv,0.974087,0.974087,0.974087
5,micro,GP,robbert_512_2,0.979907,0.979907,0.979907
6,micro,RD,rule_based,0.977464,0.977464,0.977464
7,micro,RD,bilstm_cv,0.978279,0.978279,0.978279
8,micro,RD,robbert_512_2,0.985938,0.985938,0.985938
9,micro,SP,rule_based,0.945441,0.945441,0.945441


## Scores per category (average: macro)

In [12]:
# One row per (category, prediction method) pair, macro averaging
average_method = 'macro'
score_list = [
    create_score_record(prediction_method, average_method, category=category)
    for category in DOCUMENT_CATEGORIES
    for prediction_method in PREDICTION_METHODS
]
create_score_dataframe(score_list)

Unnamed: 0,average_method,category,prediction_method,precision,recall,f1
0,macro,DL,rule_based,0.940055,0.951734,0.945781
1,macro,DL,bilstm_cv,0.972963,0.962389,0.96759
2,macro,DL,robbert_512_2,0.975244,0.981562,0.978372
3,macro,GP,rule_based,0.822744,0.877031,0.846804
4,macro,GP,bilstm_cv,0.935257,0.935257,0.935257
5,macro,GP,robbert_512_2,0.965045,0.938496,0.951237
6,macro,RD,rule_based,0.947343,0.972993,0.959576
7,macro,RD,bilstm_cv,0.95991,0.95991,0.95991
8,macro,RD,robbert_512_2,0.977077,0.97513,0.9761
9,macro,SP,rule_based,0.889469,0.900516,0.894868


## Scores per category (average: weighted)

In [13]:
# One row per (category, prediction method) pair, weighted averaging
average_method = 'weighted'
score_list = [
    create_score_record(prediction_method, average_method, category=category)
    for category in DOCUMENT_CATEGORIES
    for prediction_method in PREDICTION_METHODS
]
create_score_dataframe(score_list)

Unnamed: 0,average_method,category,prediction_method,precision,recall,f1
0,weighted,DL,rule_based,0.974656,0.974249,0.974418
1,weighted,DL,bilstm_cv,0.984862,0.984979,0.984894
2,weighted,DL,robbert_512_2,0.98944,0.98935,0.989385
3,weighted,GP,rule_based,0.940242,0.933451,0.936029
4,weighted,GP,bilstm_cv,0.974087,0.974087,0.974087
5,weighted,GP,robbert_512_2,0.979604,0.979907,0.979612
6,weighted,RD,rule_based,0.97854,0.977464,0.977778
7,weighted,RD,bilstm_cv,0.978279,0.978279,0.978279
8,weighted,RD,robbert_512_2,0.985912,0.985938,0.985923
9,weighted,SP,rule_based,0.946292,0.945441,0.945822


## All scores (average: weighted)

In [14]:
# One row per prediction method over all categories, weighted averaging
average_method = 'weighted'
score_list = [
    create_score_record(prediction_method, average_method, category='all')
    for prediction_method in PREDICTION_METHODS
]
create_score_dataframe(score_list)

Unnamed: 0,average_method,category,prediction_method,precision,recall,f1
0,weighted,all,rule_based,0.959842,0.958011,0.9587
1,weighted,all,bilstm_cv,0.975218,0.97546,0.975306
2,weighted,all,robbert_512_2,0.982006,0.982138,0.982045
