# Evaluation
In this notebook the negation prediction methods are compared to each other. This notebook contains the following:
- Entities which have predictions from all 3 methods are joined in a single dataframe.
- The merged dataframe is saved in this repo.
- A voting ensemble method is applied.
- Precision, recall and F1-scores are calculated per category and all combined.
- Average methods for score calculations are compared.
- RobBERT methods are compared.

In [1]:
from pathlib import Path
from evaluation_utils import print_statistics, get_document_text
from sklearn.metrics import precision_recall_fscore_support
import pandas as pd
import json
import numpy as np

# Categories in DCC dataset
DOCUMENT_CATEGORIES = ['DL', 'GP', 'RD', 'SP']

# Predictions methods to compare
PREDICTION_METHODS = ['rule_based', 'bilstm_cv', 'robbert_v2_dutch_base_3_32_512', 'ensemble']
ROBBERT_PREDICTION_METHODS = ['robbert_v2_dutch_base_3_32_512','robbert_v2_dutch_base_DAPT_3_32_512','MedRoBERTa_3_32_512']

# Pretty names:
PRETTY_NAMES = {'rule_based': 'Rule-based', 
                'bilstm_cv': 'BiLSTM', 
                'robbert_v2_dutch_base_3_32_512': 'RobBERT',
                'ensemble': 'Voting ensemble',
                'robbert_v2_dutch_base_DAPT_3_32_512': 'RobBERT DAPT',
                'MedRoBERTa_3_32_512': 'MedRoBERTa.nl',
                'average_method': 'Weighting average', 
                'category': 'Letter category',
                'prediction_method': 'Prediction method',
                'precision': 'Precision', 
                'recall': 'Recall',
                'f1': 'F1',
                'GP': 'General Practitioner entries',
                'SP': 'Specialist letters',
                'RD': 'Radiology reports',
                'DL': 'Discharge letters'}

# See https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html
AVERAGE_METHODS = ['binary', 'micro', 'macro', 'weighted']

# Input and output
data_dir = Path('data')
annotation_file = data_dir / 'emc-dcc_ann.json'
dcc_dir = data_dir / 'EMCDutchClinicalCorpus'
result_dir = Path('results')
bilstm_predictions_cv_file = result_dir / 'bilstm_predictions_cv.csv.gz'
rule_based_predictions_file = result_dir / 'rule-based_predictions.csv.gz'
robbert_predictions_file = result_dir / 'robbert_v2_dutch_base_3_32_512_predictions.csv.gz'
robbertdapt_predictions_file = result_dir / 'robbert_v2_dutch_base_DAPT_3_32_512_predictions.csv.gz'
medrobertanl_predictions_file = result_dir / 'MedRoBERTa_3_32_512_predictions.csv.gz'
merged_predictions_file = result_dir / 'merged_predictions.csv.gz'

# Load annotated data
with open(annotation_file) as f:
    annotations = json.load(f)

In [2]:
# Load biLSTM (cross validation) predictions
bilstm_predictions_cv = pd.read_csv(bilstm_predictions_cv_file)
print(len(bilstm_predictions_cv))
bilstm_predictions_cv.head()

12551


Unnamed: 0,entity_id,bilstm_cv
0,DL1111_32_46,not negated
1,DL1111_272_280,not negated
2,DL1111_363_377,not negated
3,DL1116_32_41,not negated
4,DL1116_137_148,not negated


In [3]:
# Load rule based predictions
ruled_based_predictions = pd.read_csv(rule_based_predictions_file, usecols = ['entity_id','rule_based'])
print(len(ruled_based_predictions))
ruled_based_predictions.head()

12551


Unnamed: 0,entity_id,rule_based
0,DL1111_32_46,not negated
1,DL1111_272_280,not negated
2,DL1111_363_377,not negated
3,DL1112_22_28,negated
4,DL1113_59_67,not negated


In [4]:
# Load robbert predictions
robbert_predictions = pd.read_csv(robbert_predictions_file, usecols = ['entity_id', 'robbert_v2_dutch_base_3_32_512'])
print(len(robbert_predictions))
robbert_predictions.head()

12419


Unnamed: 0,entity_id,robbert_v2_dutch_base_3_32_512
0,DL1111_272_280,not negated
1,DL1111_32_46,not negated
2,DL1111_363_377,not negated
3,DL1112_22_28,negated
4,DL1113_141_155,not negated


In [5]:
# Load annotated Dutch Clinical Corpus data from MedCAT format
annotated_records = []
for document in annotations['projects'][0]['documents']:
    document_name = document['name']
    text = document['text']

    for annotation in document['annotations']:

        # Extract data
        start_char = annotation['start']
        end_char = annotation['end']
        negation_value = annotation['meta_anns']['Negation']['value']

        # Create custom ID
        entity_id = f'{document_name}_{start_char}_{end_char}'
        
        # Extract category
        if 'DL' in document_name:
            category = 'DL'
        elif 'GP' in document_name:
            category = 'GP'
        elif 'RD' in document_name:
            category = 'RD'
        else:
            category = 'SP'
        
        # Create row
        annotated_records.append([entity_id, category, negation_value])

annotated_data = pd.DataFrame(annotated_records, columns=['entity_id', 'category', 'label'])
print(annotated_data.shape)
annotated_data.head()

(12551, 3)


Unnamed: 0,entity_id,category,label
0,DL1111_32_46,DL,not negated
1,DL1111_272_280,DL,not negated
2,DL1111_363_377,DL,not negated
3,DL1112_22_28,DL,negated
4,DL1113_59_67,DL,not negated


## Merge annotations from different methods

In [6]:
predictions = pd.merge(left=annotated_data, right = bilstm_predictions_cv, left_on='entity_id', right_on='entity_id', how='left')
predictions = pd.merge(left=predictions, right = ruled_based_predictions, left_on='entity_id', right_on='entity_id', how='left')
predictions = pd.merge(left=predictions, right = robbert_predictions, left_on='entity_id', right_on='entity_id', how='left')
print(f'Total entities: {len(predictions)}')
predictions.dropna(inplace=True)
print(f'Total entities with prediction in all methods: {len(predictions)}')
predictions.to_csv(merged_predictions_file, index=False, compression='gzip', line_terminator='\n')
predictions.head(5)

Total entities: 12551
Total entities with prediction in all methods: 12419


Unnamed: 0,entity_id,category,label,bilstm_cv,rule_based,robbert_v2_dutch_base_3_32_512
0,DL1111_32_46,DL,not negated,not negated,not negated,not negated
1,DL1111_272_280,DL,not negated,not negated,not negated,not negated
2,DL1111_363_377,DL,not negated,not negated,not negated,not negated
3,DL1112_22_28,DL,negated,negated,negated,negated
4,DL1113_59_67,DL,not negated,not negated,not negated,not negated


In [7]:
def majority_vote(row):
    predicted_negated = 0
    for method in ['rule_based', 'bilstm_cv', 'robbert_v2_dutch_base_3_32_512']:
        if row[method] == 'negated':
            predicted_negated += 1
    if predicted_negated > 1:
        return 'negated'
    else:
        return 'not negated'

predictions['ensemble'] = predictions.apply(majority_vote, axis=1)

## Function to calculate scores

In [8]:
def create_score_record(prediction_df, prediction_method, average_method, category='all'):
    """
    Wrapper for precision_recall_fscore_support().

    Returns list containing method parameters and scores that can be used as row in DataFrame.
    See https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html
    """
    if average_method == 'binary':
        pos_label = 'negated'
    else:
        pos_label=1
    
    if category == 'all':
        # Calculate score for all categories. 
        subset = prediction_df[['label', prediction_method]]
    else:
        # Calculate scores for specific category
        subset = prediction_df.loc[prediction_df.category == category, ['label', prediction_method]]
    return [average_method, category, prediction_method] + (list(precision_recall_fscore_support(subset.label, subset[prediction_method], labels=['negated', 'not negated'], average = average_method, pos_label=pos_label)))

def create_score_dataframe(score_list):
    scores = pd.DataFrame(score_list, columns=['average_method', 'category', 'prediction_method', 'precision', 'recall', 'f1', 'support'])
    scores.drop(['support'], axis=1, inplace=True)
    return scores

## Compare different ways of calculating scores

In [9]:
score_list = []
for prediction_method in PREDICTION_METHODS:
    for average_method in AVERAGE_METHODS:
        score_list.append(create_score_record(predictions, prediction_method, average_method, category = 'all'))
create_score_dataframe(score_list)

Unnamed: 0,average_method,category,prediction_method,precision,recall,f1
0,binary,all,rule_based,0.824868,0.891876,0.857064
1,micro,all,rule_based,0.958129,0.958129,0.958129
2,macro,all,rule_based,0.903459,0.930429,0.916268
3,weighted,all,rule_based,0.959926,0.958129,0.958806
4,binary,all,bilstm_cv,0.925926,0.90103,0.913308
5,micro,all,bilstm_cv,0.975924,0.975924,0.975924
6,macro,all,bilstm_cv,0.954892,0.944611,0.949665
7,weighted,all,bilstm_cv,0.975705,0.975924,0.975786
8,binary,all,robbert_v2_dutch_base_3_32_512,0.950696,0.937643,0.944124
9,micro,all,robbert_v2_dutch_base_3_32_512,0.984379,0.984379,0.984379


Based on these results we dediced to use the `binary` avering methods, because it shows that largest differences between methods.

## Scores per category

In [10]:
average_method = 'binary'
score_list = []
for category in DOCUMENT_CATEGORIES:
    for prediction_method in PREDICTION_METHODS:
        score_list.append(create_score_record(predictions, prediction_method, average_method, category = category))
score_df = create_score_dataframe(score_list)
score_df

Unnamed: 0,average_method,category,prediction_method,precision,recall,f1
0,binary,DL,rule_based,0.892583,0.920844,0.906494
1,binary,DL,bilstm_cv,0.95664,0.931398,0.94385
2,binary,DL,robbert_v2_dutch_base_3_32_512,0.953488,0.973615,0.963446
3,binary,DL,ensemble,0.963158,0.965699,0.964427
4,binary,GP,rule_based,0.674107,0.801061,0.732121
5,binary,GP,bilstm_cv,0.888594,0.888594,0.888594
6,binary,GP,robbert_v2_dutch_base_3_32_512,0.950276,0.912467,0.930988
7,binary,GP,ensemble,0.930362,0.885942,0.907609
8,binary,RD,rule_based,0.900943,0.966273,0.932465
9,binary,RD,bilstm_cv,0.93266,0.934233,0.933446


In [11]:
# Create pretty table
pretty_df = score_df.round(3)
pretty_df = pretty_df[['category', 'prediction_method', 'precision', 'recall', 'f1']]
pretty_df.rename(columns=PRETTY_NAMES, inplace=True)
pretty_df.replace(PRETTY_NAMES, inplace=True)
print(pretty_df.to_latex(index=False))

\begin{tabular}{llrrr}
\toprule
             Letter category & Prediction method &  Precision &  Recall &    F1 \\
\midrule
           Discharge letters &        Rule-based &      0.893 &   0.921 & 0.906 \\
           Discharge letters &            BiLSTM &      0.957 &   0.931 & 0.944 \\
           Discharge letters &           RobBERT &      0.953 &   0.974 & 0.963 \\
           Discharge letters &   Voting ensemble &      0.963 &   0.966 & 0.964 \\
General Practitioner entries &        Rule-based &      0.674 &   0.801 & 0.732 \\
General Practitioner entries &            BiLSTM &      0.889 &   0.889 & 0.889 \\
General Practitioner entries &           RobBERT &      0.950 &   0.912 & 0.931 \\
General Practitioner entries &   Voting ensemble &      0.930 &   0.886 & 0.908 \\
           Radiology reports &        Rule-based &      0.901 &   0.966 & 0.932 \\
           Radiology reports &            BiLSTM &      0.933 &   0.934 & 0.933 \\
           Radiology reports &           RobBE

## Scores for all categories combined

In [12]:
# Calculate F1 Scores
score_list = []
for prediction_method in PREDICTION_METHODS:
    score_list.append(create_score_record(predictions, prediction_method, average_method='binary', category='all'))
print(len(predictions))
score_df = create_score_dataframe(score_list)
score_df

12419


Unnamed: 0,average_method,category,prediction_method,precision,recall,f1
0,binary,all,rule_based,0.824868,0.891876,0.857064
1,binary,all,bilstm_cv,0.925926,0.90103,0.913308
2,binary,all,robbert_v2_dutch_base_3_32_512,0.950696,0.937643,0.944124
3,binary,all,ensemble,0.947801,0.924485,0.935998


In [13]:
# Create pretty table
pretty_df = score_df.round(3)
pretty_df = pretty_df[['prediction_method', 'precision', 'recall', 'f1']]
pretty_df.rename(columns=PRETTY_NAMES, inplace=True)
pretty_df.replace(PRETTY_NAMES, inplace=True)
print(pretty_df.to_latex(index=False))

\begin{tabular}{lrrr}
\toprule
Prediction method &  Precision &  Recall &    F1 \\
\midrule
       Rule-based &      0.825 &   0.892 & 0.857 \\
           BiLSTM &      0.926 &   0.901 & 0.913 \\
          RobBERT &      0.951 &   0.938 & 0.944 \\
  Voting ensemble &      0.948 &   0.924 & 0.936 \\
\bottomrule
\end{tabular}



## Add other RobBERT models
### Load data

In [14]:
# Load robbert dapt predictions
robbertdapt_predictions = pd.read_csv(robbertdapt_predictions_file, usecols = ['entity_id', 'robbert_v2_dutch_base_DAPT_3_32_512'])
print(len(robbertdapt_predictions))
robbertdapt_predictions.head()

12419


Unnamed: 0,entity_id,robbert_v2_dutch_base_DAPT_3_32_512
0,DL1111_272_280,not negated
1,DL1111_32_46,not negated
2,DL1111_363_377,not negated
3,DL1112_22_28,negated
4,DL1113_141_155,not negated


In [15]:
# Load MedRoberta.nl predictions
medrobertanl_predictions = pd.read_csv(medrobertanl_predictions_file, usecols = ['entity_id', 'MedRoBERTa_3_32_512'])
print(len(medrobertanl_predictions))
medrobertanl_predictions.head()

12431


Unnamed: 0,entity_id,MedRoBERTa_3_32_512
0,DL1111_272_280,not negated
1,DL1111_32_46,not negated
2,DL1111_363_377,not negated
3,DL1112_22_28,negated
4,DL1113_141_155,not negated


In [16]:
multiple_robbert_preditions = annotated_data.copy()
multiple_robbert_preditions = pd.merge(left=multiple_robbert_preditions, right = robbert_predictions, left_on='entity_id', right_on='entity_id', how='left')
multiple_robbert_preditions = pd.merge(left=multiple_robbert_preditions, right = robbertdapt_predictions, left_on='entity_id', right_on='entity_id', how='left')
multiple_robbert_preditions = pd.merge(left=multiple_robbert_preditions, right = medrobertanl_predictions, left_on='entity_id', right_on='entity_id', how='left')
multiple_robbert_preditions.head()

Unnamed: 0,entity_id,category,label,robbert_v2_dutch_base_3_32_512,robbert_v2_dutch_base_DAPT_3_32_512,MedRoBERTa_3_32_512
0,DL1111_32_46,DL,not negated,not negated,not negated,not negated
1,DL1111_272_280,DL,not negated,not negated,not negated,not negated
2,DL1111_363_377,DL,not negated,not negated,not negated,not negated
3,DL1112_22_28,DL,negated,negated,negated,negated
4,DL1113_59_67,DL,not negated,not negated,not negated,not negated


In [17]:
# Check whether all entities have a prediction
print(f'Total entities: {len(robbert_predictions)}')
multiple_robbert_preditions.dropna(inplace=True)
multiple_robbert_preditions.drop_duplicates(inplace=True)
print(f'Entities for which there is a prediction for all methods: {len(robbert_predictions)}')

Total entities: 12419
Entities for which there is a prediction for all methods: 12419


### Compare RobBERT methods using all categories

In [18]:
score_list = []
for prediction_method in ROBBERT_PREDICTION_METHODS:
    score_list.append(create_score_record(multiple_robbert_preditions, prediction_method, average_method='binary', category = 'all'))
print(len(robbert_predictions))
score_df = create_score_dataframe(score_list)
score_df

12419


Unnamed: 0,average_method,category,prediction_method,precision,recall,f1
0,binary,all,robbert_v2_dutch_base_3_32_512,0.950696,0.937643,0.944124
1,binary,all,robbert_v2_dutch_base_DAPT_3_32_512,0.952134,0.944508,0.948306
2,binary,all,MedRoBERTa_3_32_512,0.954678,0.939931,0.947247


In [19]:
pretty_df = score_df.round(3)
pretty_df = pretty_df[['prediction_method', 'precision', 'recall', 'f1']]
pretty_df.rename(columns=PRETTY_NAMES, inplace=True)
pretty_df.replace(PRETTY_NAMES, inplace=True)
print(pretty_df.to_latex(index=False))

\begin{tabular}{lrrr}
\toprule
Prediction method &  Precision &  Recall &    F1 \\
\midrule
          RobBERT &      0.951 &   0.938 & 0.944 \\
     RobBERT DAPT &      0.952 &   0.945 & 0.948 \\
    MedRoBERTa.nl &      0.955 &   0.940 & 0.947 \\
\bottomrule
\end{tabular}



### Compare RobBERT methods per category

In [20]:
average_method = 'binary'
score_list = []
for category in DOCUMENT_CATEGORIES:
    for prediction_method in ROBBERT_PREDICTION_METHODS:
        score_list.append(create_score_record(multiple_robbert_preditions, prediction_method, average_method, category = category))
score_df = create_score_dataframe(score_list)
score_df

Unnamed: 0,average_method,category,prediction_method,precision,recall,f1
0,binary,DL,robbert_v2_dutch_base_3_32_512,0.953488,0.973615,0.963446
1,binary,DL,robbert_v2_dutch_base_DAPT_3_32_512,0.963824,0.984169,0.97389
2,binary,DL,MedRoBERTa_3_32_512,0.973404,0.965699,0.969536
3,binary,GP,robbert_v2_dutch_base_3_32_512,0.950276,0.912467,0.930988
4,binary,GP,robbert_v2_dutch_base_DAPT_3_32_512,0.940541,0.923077,0.931727
5,binary,GP,MedRoBERTa_3_32_512,0.943089,0.923077,0.932976
6,binary,RD,robbert_v2_dutch_base_3_32_512,0.959664,0.962901,0.961279
7,binary,RD,robbert_v2_dutch_base_DAPT_3_32_512,0.956303,0.959528,0.957912
8,binary,RD,MedRoBERTa_3_32_512,0.962457,0.951096,0.956743
9,binary,SP,robbert_v2_dutch_base_3_32_512,0.934211,0.889724,0.911425


In [21]:
pretty_df = score_df.round(3)
pretty_df = pretty_df[['category', 'prediction_method', 'precision', 'recall', 'f1']]
pretty_df.rename(columns=PRETTY_NAMES, inplace=True)
pretty_df.replace(PRETTY_NAMES, inplace=True)
print(pretty_df.to_latex(index=False))

\begin{tabular}{llrrr}
\toprule
             Letter category & Prediction method &  Precision &  Recall &    F1 \\
\midrule
           Discharge letters &           RobBERT &      0.953 &   0.974 & 0.963 \\
           Discharge letters &      RobBERT DAPT &      0.964 &   0.984 & 0.974 \\
           Discharge letters &     MedRoBERTa.nl &      0.973 &   0.966 & 0.970 \\
General Practitioner entries &           RobBERT &      0.950 &   0.912 & 0.931 \\
General Practitioner entries &      RobBERT DAPT &      0.941 &   0.923 & 0.932 \\
General Practitioner entries &     MedRoBERTa.nl &      0.943 &   0.923 & 0.933 \\
           Radiology reports &           RobBERT &      0.960 &   0.963 & 0.961 \\
           Radiology reports &      RobBERT DAPT &      0.956 &   0.960 & 0.958 \\
           Radiology reports &     MedRoBERTa.nl &      0.962 &   0.951 & 0.957 \\
          Specialist letters &           RobBERT &      0.934 &   0.890 & 0.911 \\
          Specialist letters &      RobBERT DA