# Evaluation

In [1]:
from pathlib import Path
from evaluation_utils import print_statistics, get_document_text
from sklearn.metrics import precision_recall_fscore_support
import pandas as pd
import json
import numpy as np

# Categories in DCC dataset
DOCUMENT_CATEGORIES = ['DL', 'GP', 'RD', 'SP']

# Predictions methods to compare
PREDICTION_METHODS = ['rule_based', 'bilstm_cv', 'robbert_512']
ROBBERT_PREDICTION_METHODS = ['robbert_2epochs', 'robbert_3epochs', 'robbert_dapt', 'medrobertanl']

# Pretty names:
PRETTY_NAMES = {'rule_based': 'Rule-based', 
                'bilstm_cv': 'BiLSTM', 
                'robbert_512': 'RobBERT',
                'ensemble': 'Ensemble',
                'robbert_2epochs': 'RobBERT (2 epochs)', 
                'robbert_3epochs': 'RobBERT (3 epochs)', 
                'robbert_dapt': 'RobBERT DAPT',
                'medrobertanl': 'MedRoBERTa.nl',
                'average_method': 'Weighting average', 
                'category': 'Letter category',
                'prediction_method': 'Prediction method',
                'precision': 'Precision', 
                'recall': 'Recall',
                'f1': 'F1',
                'GP': 'General Practitioner entries',
                'SP': 'Specialist letters',
                'RD': 'Radiology reports',
                'DL': 'Discharge letters'}

# See https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html
AVERAGE_METHODS = ['binary', 'micro', 'macro', 'weighted']

# Input and output
data_dir = Path('data')
annotation_file = data_dir / 'emc-dcc_ann.json'
dcc_dir = data_dir / 'EMCDutchClinicalCorpus'
result_dir = Path('results')
bilstm_predictions_file = result_dir / 'bilstm_predictions.csv.gz'
bilstm_predictions_cv_file = result_dir / 'bilstm_predictions_cv.csv.gz'
rule_based_predictions_file = result_dir / 'rule-based_predictions.csv.gz'
robbert_predictions_file = result_dir / 'robbert_predictions.csv.gz'
merged_predictions_file = result_dir / 'merged_predictions.csv.gz'

# Load annotated data
with open(annotation_file) as f:
    annotations = json.load(f)

In [2]:
# Load biLSTM (complete model) predictions
bilstm_predictions = pd.read_csv(bilstm_predictions_file, sep=',')
print(bilstm_predictions.shape)
bilstm_predictions.head()

(12551, 2)


Unnamed: 0,entity_id,bilstm
0,DL1111_32_46,not negated
1,DL1111_272_280,not negated
2,DL1111_363_377,not negated
3,DL1112_22_28,negated
4,DL1113_59_67,not negated


In [3]:
# Load biLSTM (cross validation) predictions
bilstm_predictions_cv = pd.read_csv(bilstm_predictions_cv_file)
print(bilstm_predictions_cv.shape)
bilstm_predictions_cv.head()

(12551, 2)


Unnamed: 0,entity_id,bilstm_cv
0,DL1111_32_46,not negated
1,DL1111_272_280,not negated
2,DL1111_363_377,not negated
3,DL1116_32_41,not negated
4,DL1116_137_148,not negated


In [4]:
# Load rule based predictions
ruled_based_predictions = pd.read_csv(rule_based_predictions_file)
ruled_based_predictions.drop(['annotation'], axis=1, inplace=True)
print(ruled_based_predictions.shape)
ruled_based_predictions.head()

(12551, 2)


Unnamed: 0,entity_id,rule_based
0,DL1111_32_46,not negated
1,DL1111_272_280,not negated
2,DL1111_363_377,not negated
3,DL1112_22_28,negated
4,DL1113_59_67,not negated


In [5]:
# Load robbert (2 epochs) predictions
robbert2epochs_predictions = pd.read_csv(robbert_predictions_file)
robbert2epochs_predictions.drop(['category', 'label', 'bilstm', 'bilstm_cv', 'rule_based'], axis=1, inplace=True)
robbert2epochs_predictions.rename(columns={'robbert_512_2': 'robbert_512',
                                    'robbert_128_2': 'robbert_128',
                                    'robbert_32_2': 'robbert_32'}, inplace=True)
robbert2epochs_predictions.shape
robbert2epochs_predictions.head()

Unnamed: 0,entity_id,robbert_512,robbert_128,robbert_32
0,DL1111_32_46,not negated,not negated,not negated
1,DL1111_272_280,not negated,not negated,
2,DL1111_363_377,not negated,not negated,
3,DL1112_22_28,negated,negated,negated
4,DL1113_59_67,not negated,not negated,not negated


In [6]:
# Load annotated data
annotated_records = []
for document in annotations['projects'][0]['documents']:
    document_name = document['name']
    text = document['text']

    for annotation in document['annotations']:

        # Extract data
        start_char = annotation['start']
        end_char = annotation['end']
        negation_value = annotation['meta_anns']['Negation']['value']

        # Create custom ID
        entity_id = f'{document_name}_{start_char}_{end_char}'
        
        # Extract category
        if 'DL' in document_name:
            category = 'DL'
        elif 'GP' in document_name:
            category = 'GP'
        elif 'RD' in document_name:
            category = 'RD'
        else:
            category = 'SP'
        
        # Create row
        annotated_records.append([entity_id, category, negation_value])

annotated_data = pd.DataFrame(annotated_records, columns=['entity_id', 'category', 'label'])
print(annotated_data.shape)
annotated_data.head()

(12551, 3)


Unnamed: 0,entity_id,category,label
0,DL1111_32_46,DL,not negated
1,DL1111_272_280,DL,not negated
2,DL1111_363_377,DL,not negated
3,DL1112_22_28,DL,negated
4,DL1113_59_67,DL,not negated


## Merge annotations from different methods

In [7]:
predictions = pd.merge(left=annotated_data, right = bilstm_predictions, left_on='entity_id', right_on='entity_id')
predictions = pd.merge(left=predictions, right = bilstm_predictions_cv, left_on='entity_id', right_on='entity_id')
predictions = pd.merge(left=predictions, right = ruled_based_predictions, left_on='entity_id', right_on='entity_id')
predictions = pd.merge(left=predictions, right = robbert2epochs_predictions, left_on='entity_id', right_on='entity_id')
predictions.to_csv(merged_predictions_file, index=False, compression='gzip', line_terminator='\n')
print(predictions.shape)
predictions.head(5)

(12551, 9)


Unnamed: 0,entity_id,category,label,bilstm,bilstm_cv,rule_based,robbert_512,robbert_128,robbert_32
0,DL1111_32_46,DL,not negated,not negated,not negated,not negated,not negated,not negated,not negated
1,DL1111_272_280,DL,not negated,not negated,not negated,not negated,not negated,not negated,
2,DL1111_363_377,DL,not negated,not negated,not negated,not negated,not negated,not negated,
3,DL1112_22_28,DL,negated,negated,negated,negated,negated,negated,negated
4,DL1113_59_67,DL,not negated,not negated,not negated,not negated,not negated,not negated,not negated


## Function to calculate scores

In [8]:
def create_score_record(prediction_df, prediction_method, average_method, category='all'):
    """
    Wrapper for precision_recall_fscore_support().

    Returns list containing method parameters and scores that can be used as row in DataFrame.
    See https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html
    """
    if average_method == 'binary':
        pos_label = 'negated'
    else:
        pos_label=1
    
     # Using dropna because our robbert implementation does not work for some examples.
    if category == 'all':
        # Calculate score for all categories. 
        subset = prediction_df[['label', prediction_method]].dropna()
    else:
        # Calculate scores for specific category
        subset = prediction_df.loc[prediction_df.category == category, ['label', prediction_method]].dropna()
    return [average_method, category, prediction_method] + (list(precision_recall_fscore_support(subset.label, subset[prediction_method], labels=['negated', 'not negated'], average = average_method, pos_label=pos_label)))

def create_score_dataframe(score_list):
    scores = pd.DataFrame(score_list, columns=['average_method', 'category', 'prediction_method', 'precision', 'recall', 'f1', 'support'])
    scores.drop(['support'], axis=1, inplace=True)
    return scores

## Compare different ways of calculating scores

In [9]:
prediction_method = 'bilstm_cv'
score_list = []
for average_method in AVERAGE_METHODS:
    score_list.append(create_score_record(predictions, prediction_method, average_method, category = 'all'))
create_score_dataframe(score_list)

Unnamed: 0,average_method,category,prediction_method,precision,recall,f1
0,binary,all,bilstm_cv,0.925059,0.897727,0.911188
1,micro,all,bilstm_cv,0.97546,0.97546,0.97546
2,macro,all,bilstm_cv,0.954229,0.942933,0.948476
3,weighted,all,bilstm_cv,0.975218,0.97546,0.975306


## Scores per category (average: binary)

In [10]:
average_method = 'binary'
score_list = []
for category in DOCUMENT_CATEGORIES:
    for prediction_method in PREDICTION_METHODS:
        score_list.append(create_score_record(predictions, prediction_method, average_method, category = category))
score_df = create_score_dataframe(score_list)
score_df

Unnamed: 0,average_method,category,prediction_method,precision,recall,f1
0,binary,DL,rule_based,0.892583,0.920844,0.906494
1,binary,DL,bilstm_cv,0.95664,0.931398,0.94385
2,binary,DL,robbert_512,0.955381,0.970667,0.962963
3,binary,GP,rule_based,0.671024,0.804178,0.731591
4,binary,GP,bilstm_cv,0.885117,0.885117,0.885117
5,binary,GP,robbert_512,0.94586,0.883929,0.913846
6,binary,RD,rule_based,0.901254,0.966387,0.932685
7,binary,RD,bilstm_cv,0.932773,0.932773,0.932773
8,binary,RD,robbert_512,0.963287,0.958261,0.960767
9,binary,SP,rule_based,0.808153,0.836228,0.821951


In [11]:
# Create pretty table
pretty_df = score_df.round(3)
pretty_df = pretty_df[['category', 'prediction_method', 'precision', 'recall', 'f1']]
pretty_df.rename(columns=PRETTY_NAMES, inplace=True)
pretty_df.replace(PRETTY_NAMES, inplace=True)
pretty_df

Unnamed: 0,Letter category,Prediction method,Precision,Recall,F1
0,Discharge letters,Rule-based,0.893,0.921,0.906
1,Discharge letters,BiLSTM,0.957,0.931,0.944
2,Discharge letters,RobBERT,0.955,0.971,0.963
3,General Practitioner entries,Rule-based,0.671,0.804,0.732
4,General Practitioner entries,BiLSTM,0.885,0.885,0.885
5,General Practitioner entries,RobBERT,0.946,0.884,0.914
6,Radiology reports,Rule-based,0.901,0.966,0.933
7,Radiology reports,BiLSTM,0.933,0.933,0.933
8,Radiology reports,RobBERT,0.963,0.958,0.961
9,Specialist letters,Rule-based,0.808,0.836,0.822


In [12]:
print(pretty_df.to_latex(index=False))

\begin{tabular}{llrrr}
\toprule
             Letter category & Prediction method &  Precision &  Recall &    F1 \\
\midrule
           Discharge letters &        Rule-based &      0.893 &   0.921 & 0.906 \\
           Discharge letters &            BiLSTM &      0.957 &   0.931 & 0.944 \\
           Discharge letters &           RobBERT &      0.955 &   0.971 & 0.963 \\
General Practitioner entries &        Rule-based &      0.671 &   0.804 & 0.732 \\
General Practitioner entries &            BiLSTM &      0.885 &   0.885 & 0.885 \\
General Practitioner entries &           RobBERT &      0.946 &   0.884 & 0.914 \\
           Radiology reports &        Rule-based &      0.901 &   0.966 & 0.933 \\
           Radiology reports &            BiLSTM &      0.933 &   0.933 & 0.933 \\
           Radiology reports &           RobBERT &      0.963 &   0.958 & 0.961 \\
          Specialist letters &        Rule-based &      0.808 &   0.836 & 0.822 \\
          Specialist letters &            BiLS

## Scores per category (average: micro)

In [13]:
average_method = 'micro'
score_list = []
for category in DOCUMENT_CATEGORIES:
    for prediction_method in PREDICTION_METHODS:
        score_list.append(create_score_record(predictions, prediction_method, average_method, category = category))
create_score_dataframe(score_list)

Unnamed: 0,average_method,category,prediction_method,precision,recall,f1
0,micro,DL,rule_based,0.974249,0.974249,0.974249
1,micro,DL,bilstm_cv,0.984979,0.984979,0.984979
2,micro,DL,robbert_512,0.98935,0.98935,0.98935
3,micro,GP,rule_based,0.933451,0.933451,0.933451
4,micro,GP,bilstm_cv,0.974087,0.974087,0.974087
5,micro,GP,robbert_512,0.979907,0.979907,0.979907
6,micro,RD,rule_based,0.977464,0.977464,0.977464
7,micro,RD,bilstm_cv,0.978279,0.978279,0.978279
8,micro,RD,robbert_512,0.985938,0.985938,0.985938
9,micro,SP,rule_based,0.945441,0.945441,0.945441


## Scores per category (average: macro)

In [14]:
average_method = 'macro'
score_list = []
for category in DOCUMENT_CATEGORIES:
    for prediction_method in PREDICTION_METHODS:
        score_list.append(create_score_record(predictions, prediction_method, average_method, category = category))
create_score_dataframe(score_list)

Unnamed: 0,average_method,category,prediction_method,precision,recall,f1
0,macro,DL,rule_based,0.940055,0.951734,0.945781
1,macro,DL,bilstm_cv,0.972963,0.962389,0.96759
2,macro,DL,robbert_512,0.975244,0.981562,0.978372
3,macro,GP,rule_based,0.822744,0.877031,0.846804
4,macro,GP,bilstm_cv,0.935257,0.935257,0.935257
5,macro,GP,robbert_512,0.965045,0.938496,0.951237
6,macro,RD,rule_based,0.947343,0.972993,0.959576
7,macro,RD,bilstm_cv,0.95991,0.95991,0.95991
8,macro,RD,robbert_512,0.977077,0.97513,0.9761
9,macro,SP,rule_based,0.889469,0.900516,0.894868


## Scores per category (average: weighted)

In [15]:
average_method = 'weighted'
score_list = []
for category in DOCUMENT_CATEGORIES:
    for prediction_method in PREDICTION_METHODS:
        score_list.append(create_score_record(predictions, prediction_method, average_method, category = category))
create_score_dataframe(score_list)

Unnamed: 0,average_method,category,prediction_method,precision,recall,f1
0,weighted,DL,rule_based,0.974656,0.974249,0.974418
1,weighted,DL,bilstm_cv,0.984862,0.984979,0.984894
2,weighted,DL,robbert_512,0.98944,0.98935,0.989385
3,weighted,GP,rule_based,0.940242,0.933451,0.936029
4,weighted,GP,bilstm_cv,0.974087,0.974087,0.974087
5,weighted,GP,robbert_512,0.979604,0.979907,0.979612
6,weighted,RD,rule_based,0.97854,0.977464,0.977778
7,weighted,RD,bilstm_cv,0.978279,0.978279,0.978279
8,weighted,RD,robbert_512,0.985912,0.985938,0.985923
9,weighted,SP,rule_based,0.946292,0.945441,0.945822


## Scores for all categories combined (average: binary)

In [16]:
# Calculate F1 Scores, drops rows containing NA for the only method that has NA's (robbert)
score_list = []
for prediction_method in PREDICTION_METHODS:
    score_list.append(create_score_record(predictions, prediction_method, average_method='binary', category='all'))
print(len(predictions))
create_score_dataframe(score_list)

12551


Unnamed: 0,average_method,category,prediction_method,precision,recall,f1
0,binary,all,rule_based,0.823622,0.891477,0.856207
1,binary,all,bilstm_cv,0.925059,0.897727,0.911188
2,binary,all,robbert_512,0.952204,0.928571,0.940239


In [17]:
# Calculate F1 Scores, drops rows containing NA for all methods
predictions_no_nan = predictions.copy()[['entity_id', 'label', 'bilstm_cv', 'rule_based', 'robbert_512']].dropna()
score_list = []
for prediction_method in PREDICTION_METHODS:
    score_list.append(create_score_record(predictions_no_nan, prediction_method, average_method='binary', category = 'all'))
print(len(predictions_no_nan))
create_score_dataframe(score_list)

10917


Unnamed: 0,average_method,category,prediction_method,precision,recall,f1
0,binary,all,rule_based,0.83196,0.91707,0.872445
1,binary,all,bilstm_cv,0.934608,0.91707,0.925756
2,binary,all,robbert_512,0.952204,0.928571,0.940239


## Investigate equal recall rule based and bilstm
Equal recall in both methods could be an indictation of a processing error. Investigate this by looking at examples and confusion matrix.

In [18]:
from sklearn.metrics import confusion_matrix
pd.set_option('display.max_rows', 2000)
# predictions_no_nan.head(2000)

In [19]:
predictions_no_nan.loc[1486]

entity_id      DL1647_71_77
label               negated
bilstm_cv           negated
rule_based      not negated
robbert_512         negated
Name: 1486, dtype: object

In [20]:
predictions_no_nan.loc[1970]

entity_id      DL1802_38_46
label               negated
bilstm_cv       not negated
rule_based          negated
robbert_512         negated
Name: 1970, dtype: object

In [21]:
confusion_matrix(predictions_no_nan.label, predictions_no_nan.rule_based)

array([[1515,  137],
       [ 306, 8959]])

In [22]:
confusion_matrix(predictions_no_nan.label, predictions_no_nan.bilstm_cv)

array([[1515,  137],
       [ 106, 9159]])

## Ensemble method

In [23]:
def majority_vote(row):
    predicted_negated = 0
    for method in PREDICTION_METHODS:
        if row[method] == 'negated':
            predicted_negated += 1
    if predicted_negated > 1:
        return 'negated'
    else:
        return 'not negated'

predictions_no_nan['ensemble'] = predictions_no_nan.apply(majority_vote, axis=1)

In [24]:
score_list.append(create_score_record(predictions_no_nan, 'ensemble', average_method='binary', category = 'all'))
score_df = create_score_dataframe(score_list)
score_df

Unnamed: 0,average_method,category,prediction_method,precision,recall,f1
0,binary,all,rule_based,0.83196,0.91707,0.872445
1,binary,all,bilstm_cv,0.934608,0.91707,0.925756
2,binary,all,robbert_512,0.952204,0.928571,0.940239
3,binary,all,ensemble,0.948543,0.92615,0.937213


In [25]:
# Create pretty table
pretty_df = score_df.round(3)
pretty_df = pretty_df[['prediction_method', 'precision', 'recall', 'f1']]
pretty_df.rename(columns=PRETTY_NAMES, inplace=True)
pretty_df.replace(PRETTY_NAMES, inplace=True)
print(pretty_df.to_latex(index=False))

\begin{tabular}{lrrr}
\toprule
Prediction method &  Precision &  Recall &    F1 \\
\midrule
       Rule-based &      0.832 &   0.917 & 0.872 \\
           BiLSTM &      0.935 &   0.917 & 0.926 \\
          RobBERT &      0.952 &   0.929 & 0.940 \\
         Ensemble &      0.949 &   0.926 & 0.937 \\
\bottomrule
\end{tabular}



## Add other RobBERT models
### Load data

In [26]:
robbert3epochs_predictions_file = robbert_predictions_file = result_dir / 'robbert_v2_dutch_base_3_64_512_predictions.csv.gz'
robbertdapts_predictions_file = robbert_predictions_file = result_dir / 'robbert_v2_dutch_base_DAPT_3_64_512_predictions.csv.gz'
medrobertanl_predictions_file = robbert_predictions_file = result_dir / 'MedRoBERTa_3_2_512_predictions.csv.gz'

In [27]:
# Load robbert (3 epochs) predictions
robbert3epochs_predictions = pd.read_csv(robbert3epochs_predictions_file)
robbert3epochs_predictions.drop(['category', 'label'], axis=1, inplace=True)
robbert3epochs_predictions.shape
robbert3epochs_predictions.head()

Unnamed: 0,entity_id,robbert_v2_dutch_base_3_64_512
0,DL1111_32_46,not negated
1,DL1111_272_280,not negated
2,DL1111_363_377,not negated
3,DL1112_22_28,negated
4,DL1113_59_67,not negated


In [28]:
# Load robbert dapt predictions
robbertdapts_predictions = pd.read_csv(robbertdapts_predictions_file)
robbertdapts_predictions.drop(['category', 'label'], axis=1, inplace=True)
robbertdapts_predictions.shape
robbertdapts_predictions.head()

Unnamed: 0,entity_id,robbert_v2_dutch_base_DAPT_3_64_512
0,DL1111_32_46,not negated
1,DL1111_272_280,not negated
2,DL1111_363_377,not negated
3,DL1112_22_28,negated
4,DL1113_59_67,not negated


In [29]:
# Load robbert dapt predictions
medrobertanl_predictions = pd.read_csv(medrobertanl_predictions_file)
medrobertanl_predictions.drop(['category', 'label'], axis=1, inplace=True)
medrobertanl_predictions.shape
medrobertanl_predictions.head()

Unnamed: 0,entity_id,MedRoBERTa_3_2_512
0,DL1111_32_46,not negated
1,DL1111_272_280,not negated
2,DL1111_363_377,not negated
3,DL1112_22_28,negated
4,DL1113_59_67,not negated


In [30]:
robbert_predictions = robbert2epochs_predictions.copy()
robbert_predictions = pd.merge(left=robbert_predictions, right = robbert3epochs_predictions, left_on='entity_id', right_on='entity_id')
robbert_predictions = pd.merge(left=robbert_predictions, right = robbertdapts_predictions, left_on='entity_id', right_on='entity_id')
robbert_predictions = pd.merge(left=robbert_predictions, right = medrobertanl_predictions, left_on='entity_id', right_on='entity_id')
robbert_predictions = pd.merge(left=robbert_predictions, right = annotated_data, left_on='entity_id', right_on='entity_id')

In [31]:
robbert_predictions.drop(['robbert_128', 'robbert_32'], axis=1, inplace=True)
robbert_predictions.rename(columns={'robbert_512': 'robbert_2epochs',
                                    'robbert_v2_dutch_base_3_64_512': 'robbert_3epochs',
                                    'robbert_v2_dutch_base_DAPT_3_64_512': 'robbert_dapt',
                                    'MedRoBERTa_3_2_512': 'medrobertanl'}, inplace=True)

In [32]:
robbert_predictions.head()

Unnamed: 0,entity_id,robbert_2epochs,robbert_3epochs,robbert_dapt,medrobertanl,category,label
0,DL1111_32_46,not negated,not negated,not negated,not negated,DL,not negated
1,DL1111_272_280,not negated,not negated,not negated,not negated,DL,not negated
2,DL1111_363_377,not negated,not negated,not negated,not negated,DL,not negated
3,DL1112_22_28,negated,negated,negated,negated,DL,negated
4,DL1113_59_67,not negated,not negated,not negated,not negated,DL,not negated


In [33]:
print(f'Total entities: {len(robbert_predictions)}')
robbert_predictions.dropna(inplace=True)
print(f'Entities for which there is a prediction for all methods: {len(robbert_predictions)}')

Total entities: 12551
Entities for which there is a prediction for all methods: 10561


### Compare RobBERT methods based on all available entities

In [34]:
# Calculate F1 Scores, drops rows containing NA for all methods
score_list = []
for prediction_method in ROBBERT_PREDICTION_METHODS:
    score_list.append(create_score_record(robbert_predictions, prediction_method, average_method='binary', category = 'all'))
print(len(robbert_predictions))
create_score_dataframe(score_list)

10561


Unnamed: 0,average_method,category,prediction_method,precision,recall,f1
0,binary,all,robbert_2epochs,0.952681,0.932099,0.942278
1,binary,all,robbert_3epochs,0.954373,0.92963,0.941839
2,binary,all,robbert_dapt,0.952055,0.943827,0.947923
3,binary,all,medrobertanl,0.95758,0.947531,0.952529


In [35]:
pretty_df = score_df.round(3)
pretty_df = pretty_df[['prediction_method', 'precision', 'recall', 'f1']]
pretty_df.rename(columns=PRETTY_NAMES, inplace=True)
pretty_df.replace(PRETTY_NAMES, inplace=True)
print(pretty_df.to_latex(index=False))

\begin{tabular}{lrrr}
\toprule
Prediction method &  Precision &  Recall &    F1 \\
\midrule
       Rule-based &      0.832 &   0.917 & 0.872 \\
           BiLSTM &      0.935 &   0.917 & 0.926 \\
          RobBERT &      0.952 &   0.929 & 0.940 \\
         Ensemble &      0.949 &   0.926 & 0.937 \\
\bottomrule
\end{tabular}



### Compare RobBERT methods per category

In [36]:
average_method = 'binary'
score_list = []
for category in DOCUMENT_CATEGORIES:
    for prediction_method in ROBBERT_PREDICTION_METHODS:
        score_list.append(create_score_record(robbert_predictions, prediction_method, average_method, category = category))
score_df = create_score_dataframe(score_list)
score_df

Unnamed: 0,average_method,category,prediction_method,precision,recall,f1
0,binary,DL,robbert_2epochs,0.954667,0.972826,0.963661
1,binary,DL,robbert_3epochs,0.957447,0.978261,0.967742
2,binary,DL,robbert_dapt,0.957784,0.986413,0.971888
3,binary,DL,medrobertanl,0.969945,0.964674,0.967302
4,binary,GP,robbert_2epochs,0.945161,0.890578,0.917058
5,binary,GP,robbert_3epochs,0.963576,0.884498,0.922345
6,binary,GP,robbert_dapt,0.964401,0.905775,0.934169
7,binary,GP,medrobertanl,0.963077,0.951368,0.957187
8,binary,RD,robbert_2epochs,0.962898,0.957821,0.960352
9,binary,RD,robbert_3epochs,0.954783,0.964851,0.95979


In [37]:
pretty_df = score_df.round(3)
pretty_df = pretty_df[['prediction_method', 'precision', 'recall', 'f1']]
pretty_df.rename(columns=PRETTY_NAMES, inplace=True)
pretty_df.replace(PRETTY_NAMES, inplace=True)
print(pretty_df.to_latex(index=False))

\begin{tabular}{lrrr}
\toprule
 Prediction method &  Precision &  Recall &    F1 \\
\midrule
RobBERT (2 epochs) &      0.955 &   0.973 & 0.964 \\
RobBERT (3 epochs) &      0.957 &   0.978 & 0.968 \\
      RobBERT DAPT &      0.958 &   0.986 & 0.972 \\
     MedRoBERTa.nl &      0.970 &   0.965 & 0.967 \\
RobBERT (2 epochs) &      0.945 &   0.891 & 0.917 \\
RobBERT (3 epochs) &      0.964 &   0.884 & 0.922 \\
      RobBERT DAPT &      0.964 &   0.906 & 0.934 \\
     MedRoBERTa.nl &      0.963 &   0.951 & 0.957 \\
RobBERT (2 epochs) &      0.963 &   0.958 & 0.960 \\
RobBERT (3 epochs) &      0.955 &   0.965 & 0.960 \\
      RobBERT DAPT &      0.946 &   0.960 & 0.953 \\
     MedRoBERTa.nl &      0.954 &   0.953 & 0.953 \\
RobBERT (2 epochs) &      0.940 &   0.887 & 0.913 \\
RobBERT (3 epochs) &      0.942 &   0.864 & 0.901 \\
      RobBERT DAPT &      0.944 &   0.910 & 0.927 \\
     MedRoBERTa.nl &      0.945 &   0.918 & 0.931 \\
\bottomrule
\end{tabular}

