# Evaluation

In [2]:
from pathlib import Path
from evaluation_utils import print_statistics, get_document_text
from sklearn.metrics import precision_recall_fscore_support
import pandas as pd
import json
import numpy as np
import scipy as sc

pd.set_option('display.max_rows', 100)

# Categories in DCC dataset
DOCUMENT_CATEGORIES = ['DL', 'GP', 'RD', 'SP']

# Predictions methods used in this project
#PREDICTION_METHODS = ['rule_based', 'bilstm_cv', 'robbert_512_2', 'MedRoBERTa_3_2_512', 
#                      'robbert_v2_dutch_base_DAPT_2_64_512','mv_ensemble', 'mv_ensemble_all']
PREDICTION_METHODS = ['robbert_v2_dutch_base_DAPT_3_32_512',
                      'robbert_v2_dutch_base_DAPT_3_32_32__centeredVal',
                      'robbert_v2_dutch_base_3_32_512',
                      'robbert_v2_dutch_base_3_32_32__centeredVal',
                      'MedRoBERTa_3_32_512',
                      'MedRoBERTa_3_32_32__centeredVal',
                      'robbert_v2_dutch_base_10_128_512_zeroShot',
                      'robbert_v2_dutch_base_DAPT_10_128_512_zeroShot',
                      'MedRoBERTa_10_128_512_zeroShot',     
                      'rule_based', 
                      'bilstm_cv',
                      'mv_ensemble',
                      'mv_ensemble_all']

# Pretty names:
PRETTY_NAMES = {'rule_based': 'Rule-based', 
                'bilstm_cv': 'BiLSTM', 
                'robbert_512': 'RobBERT',
                'ensemble': 'Ensemble',
                'average_method': 'Weighting average', 
                'category': 'Letter category',
                'prediction_method': 'Prediction method',
                'precision': 'Precision', 
                'recall': 'Recall',
                'f1': 'F1',
                'GP': 'General Practitioner entries',
                'SP': 'Specialist letters',
                'RD': 'Radiology reports',
                'DL': 'Discharge letters'}

# See https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html
AVERAGE_METHODS = ['binary', 'micro', 'macro', 'weighted']

# Input and output
data_dir = Path('data')
annotation_file = data_dir / 'emc-dcc_ann.json'
dcc_dir = data_dir / 'EMCDutchClinicalCorpus'
result_dir = Path('results')
bilstm_predictions_cv_file = result_dir / 'bilstm_predictions_cv.csv.gz'
rule_based_predictions_file = result_dir / 'rule-based_predictions.csv.gz'

medroberta_prediction_file = result_dir / 'MedRoBERTa_3_32_512_predictions.csv.gz'
medroberta_prediction2_file = result_dir / 'MedRoBERTa_3_32_32__centeredVal_predictions.csv.gz'

robbert_dapt_prediction_file = result_dir /'robbert_v2_dutch_base_DAPT_3_32_512_predictions.csv.gz'
robbert_dapt_prediction2_file = result_dir /'robbert_v2_dutch_base_DAPT_3_32_32__centeredVal_predictions.csv.gz'

robbert_prediction_file = result_dir / 'robbert_v2_dutch_base_3_32_512_predictions.csv.gz'
robbert_prediction2_file = result_dir / 'robbert_v2_dutch_base_3_32_32__centeredVal_predictions.csv.gz'

robbert_predict_zeroShot_file = result_dir /'robbert_v2_dutch_base_10_128_512_zeroShot_predictions.csv.gz'
robbert_dapt_predict_zeroShot_file = result_dir /'robbert_v2_dutch_base_DAPT_10_128_512_zeroShot_predictions.csv.gz'
medroberta_predict_zeroShot_file = result_dir /'MedRoBERTa_10_128_512_zeroShot_predictions.csv.gz'

merged_result_file = result_dir / 'merged_results.csv.gz'



# Load annotated data
with open(annotation_file) as f:
    annotations = json.load(f)

In [3]:
# Load biLSTM (cross validation) predictions
bilstm_predictions_cv = pd.read_csv(bilstm_predictions_cv_file)
print(bilstm_predictions_cv.shape)
print(bilstm_predictions_cv.head())

(12551, 2)
        entity_id    bilstm_cv
0    DL1111_32_46  not negated
1  DL1111_272_280  not negated
2  DL1111_363_377  not negated
3    DL1116_32_41  not negated
4  DL1116_137_148  not negated


In [4]:
# Load rule based predictions
ruled_based_predictions = pd.read_csv(rule_based_predictions_file)
ruled_based_predictions.drop(['annotation'], axis=1, inplace=True)
print(ruled_based_predictions.shape)
print(ruled_based_predictions.head())

(12551, 2)
        entity_id   rule_based
0    DL1111_32_46  not negated
1  DL1111_272_280  not negated
2  DL1111_363_377  not negated
3    DL1112_22_28      negated
4    DL1113_59_67  not negated


In [6]:
# Load medroberta results
robbert_prediction = pd.read_csv(robbert_prediction_file)
robbert_prediction.drop(['category', 'label'], axis=1, inplace=True)
robbert_prediction.drop([c for c in robbert_prediction.columns if 'Unnamed' in c], axis=1, 
                        errors='ignore', inplace=True)

print(robbert_prediction.shape)

(12419, 2)


In [7]:
# Load RobBERT DAPT
robbert_dapt_prediction = pd.read_csv(robbert_dapt_prediction_file)
robbert_dapt_prediction.drop(['category', 'label'], axis=1, inplace=True)
robbert_dapt_prediction.drop([c for c in robbert_dapt_prediction.columns if 'Unnamed' in c], axis=1, 
                        errors='ignore', inplace=True)
print(robbert_dapt_prediction.shape)



(12419, 2)


In [9]:
# Load medroberta
medroberta_prediction = pd.read_csv(medroberta_prediction_file)
medroberta_prediction.drop(['category', 'label'], axis=1, inplace=True)
medroberta_prediction.drop([c for c in medroberta_prediction.columns if 'Unnamed' in c], axis=1, 
                        errors='ignore', inplace=True)
print(medroberta_prediction.shape)


(12431, 2)


In [11]:
'''
robbert_predict_zeroShot  = pd.read_csv(robbert_predict_zeroShot_file)
robbert_dapt_predict_zeroShot = pd.read_csv(robbert_dapt_predict_zeroShot_file)
medroberta_predict_zeroShot = pd.read_csv(medroberta_predict_zeroShot_file)

robbert_predict_zeroShot.drop(['category', 'label'], axis=1, inplace=True)
robbert_dapt_predict_zeroShot.drop(['category', 'label'], axis=1, inplace=True)
medroberta_predict_zeroShot.drop(['category', 'label'], axis=1, inplace=True)
'''

"\nrobbert_predict_zeroShot  = pd.read_csv(robbert_predict_zeroShot_file)\nrobbert_dapt_predict_zeroShot = pd.read_csv(robbert_dapt_predict_zeroShot_file)\nmedroberta_predict_zeroShot = pd.read_csv(medroberta_predict_zeroShot_file)\n\nrobbert_predict_zeroShot.drop(['category', 'label'], axis=1, inplace=True)\nrobbert_dapt_predict_zeroShot.drop(['category', 'label'], axis=1, inplace=True)\nmedroberta_predict_zeroShot.drop(['category', 'label'], axis=1, inplace=True)\n"

In [12]:
# Load annotated data
annotated_records = []
for document in annotations['projects'][0]['documents']:
    document_name = document['name']
    text = document['text']

    for annotation in document['annotations']:

        # Extract data
        start_char = annotation['start']
        end_char = annotation['end']
        negation_value = annotation['meta_anns']['Negation']['value']

        # Create custom ID
        entity_id = f'{document_name}_{start_char}_{end_char}'
        
        # Extract category
        if 'DL' in document_name:
            category = 'DL'
        elif 'GP' in document_name:
            category = 'GP'
        elif 'RD' in document_name:
            category = 'RD'
        else:
            category = 'SP'
        
        # Create row
        annotated_records.append([entity_id, category, negation_value])

annotated_data = pd.DataFrame(annotated_records, columns=['entity_id', 'category', 'label'])
print(annotated_data.shape)
annotated_data.head()

(12551, 3)


Unnamed: 0,entity_id,category,label
0,DL1111_32_46,DL,not negated
1,DL1111_272_280,DL,not negated
2,DL1111_363_377,DL,not negated
3,DL1112_22_28,DL,negated
4,DL1113_59_67,DL,not negated


In [14]:
robbert_prediction = robbert_prediction[~robbert_prediction.duplicated(keep='last')]
#robbert_prediction2 = robbert_prediction2[~robbert_prediction2.duplicated(keep='last')]

robbert_dapt_prediction = robbert_dapt_prediction[~robbert_dapt_prediction.duplicated(keep='last')]
#robbert_dapt_prediction2 = robbert_dapt_prediction2[~robbert_dapt_prediction2.duplicated(keep='last')]

medroberta_prediction = medroberta_prediction[~medroberta_prediction.duplicated(keep='last')]
#medroberta_prediction2 = medroberta_prediction2[~medroberta_prediction2.duplicated(keep='last')]

#robbert_predict_zeroShot = robbert_predict_zeroShot[~robbert_predict_zeroShot.duplicated(keep='last')]
#robbert_dapt_predict_zeroShot = robbert_dapt_predict_zeroShot[~robbert_dapt_predict_zeroShot.duplicated(keep='last')]
#medroberta_predict_zeroShot = medroberta_predict_zeroShot[~medroberta_predict_zeroShot.duplicated(keep='last')]

## Merge annotations from different methods

In [19]:
results = pd.merge(left=annotated_data, right = bilstm_predictions_cv, left_on='entity_id', right_on='entity_id')
results = pd.merge(left=results, right = ruled_based_predictions, left_on='entity_id', right_on='entity_id')

results = pd.merge(left=results, right = robbert_prediction, left_on='entity_id', right_on='entity_id')
#results = pd.merge(left=results, right = robbert_prediction2, left_on='entity_id', right_on='entity_id')

results = pd.merge(left=results, right = medroberta_prediction, left_on='entity_id', right_on='entity_id')
#results = pd.merge(left=results, right = medroberta_prediction2, left_on='entity_id', right_on='entity_id')

results = pd.merge(left=results, right = robbert_dapt_prediction, left_on='entity_id', right_on='entity_id')
#results = pd.merge(left=results, right = robbert_dapt_prediction2, left_on='entity_id', right_on='entity_id')

#results = pd.merge(left=results, right = robbert_predict_zeroShot, left_on='entity_id', right_on='entity_id')
#results = pd.merge(left=results, right = robbert_dapt_predict_zeroShot, left_on='entity_id', right_on='entity_id')
#results = pd.merge(left=results, right = medroberta_predict_zeroShot, left_on='entity_id', right_on='entity_id')


results.to_csv(merged_result_file, index=False, compression='gzip', line_terminator='\n')
print(results.shape)
results.head(5)

(12419, 8)


Unnamed: 0,entity_id,category,label,bilstm_cv,rule_based,robbert_v2_dutch_base_3_32_512,MedRoBERTa_3_32_512,robbert_v2_dutch_base_DAPT_3_32_512
0,DL1111_32_46,DL,not negated,not negated,not negated,not negated,not negated,not negated
1,DL1111_272_280,DL,not negated,not negated,not negated,not negated,not negated,not negated
2,DL1111_363_377,DL,not negated,not negated,not negated,not negated,not negated,not negated
3,DL1112_22_28,DL,negated,negated,negated,negated,negated,negated
4,DL1113_59_67,DL,not negated,not negated,not negated,not negated,not negated,not negated


In [46]:
results.entity_id.str.split("_").apply(lambda x: x[0]).nunique()

5365

In [31]:
results['mv_ensemble_all'] = results[['bilstm_cv', 'rule_based', 
                                      'MedRoBERTa_3_32_512', 'robbert_v2_dutch_base_DAPT_3_32_512',
                                      'robbert_v2_dutch_base_3_32_512']].apply(
                                        lambda x: sc.stats.mode(x)[0][0], axis=1)
results['mv_ensemble'] = results[['bilstm_cv', 'rule_based', 'robbert_v2_dutch_base_3_32_512']].apply(
                                        lambda x: sc.stats.mode(x)[0][0], axis=1)

In [48]:
sum(results.label=='negated'), \
sum((results.label=='negated') & (results.mv_ensemble_all=='negated')), \
sum((results.label=='negated') & (results.bilstm_cv=='negated')),\
sum((results.label=='negated') & (results.rule_based=='negated')),\
sum((results.label=='negated') & (results.robbert_v2_dutch_base_3_32_512=='negated')),\
sum((results.label=='negated') & (results.robbert_v2_dutch_base_3_32_512!='negated') & 
    (results.bilstm_cv!='negated') & 
    (results.rule_based!='negated')
    )

(1748, 1641, 1575, 1559, 1639, 61)

In [53]:
results[(results.label=='negated') & (results.robbert_v2_dutch_base_3_32_512!='negated') & 
        (results.MedRoBERTa_3_32_512!='negated') &
        (results.robbert_v2_dutch_base_DAPT_3_32_512!='negated') &
        (results.bilstm_cv!='negated') & 
        (results.rule_based!='negated')].shape

(43, 10)

## Function to calculate scores

In [37]:
def create_score_record(results, prediction_method, average_method, category='all'):
    """
    Wrapper for precision_recall_fscore_support().

    Returns list containing method parameters and scores that can be used as row in DataFrame.
    See https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html
    """
    if average_method == 'binary':
        pos_label = 'negated'
    else:
        pos_label=1
    
     # Using dropna because our robbert implementation does not work for some examples.
    if category == 'all':
        # Calculate score for all categories. 
        subset = results[['label', prediction_method]].dropna()
    else:
        # Calculate scores for specific category
        subset = results.loc[results.category == category, ['label', prediction_method]].dropna()
    return [average_method, category, prediction_method] + \
            (list(precision_recall_fscore_support(subset.label, subset[prediction_method], 
                                                  labels=['negated', 'not negated'], average = 
                                                  average_method, pos_label=pos_label)))

def create_score_dataframe(score_list):
    scores = pd.DataFrame(score_list, columns=['average_method', 'category', 
                                               'prediction_method', 'precision', 
                                               'recall', 'f1', 'support'])
    scores.drop(['support'], axis=1, inplace=True)
    return scores

In [38]:
results.dropna().groupby('category').size()

category
DL    2797
GP    3335
RD    3677
SP    2629
dtype: int64

## Compare different ways of calculating scores

In [39]:
prediction_method = 'bilstm_cv'
score_list = []
for average_method in AVERAGE_METHODS:
    score_list.append(create_score_record(results.dropna(),
                                          prediction_method, 
                                          average_method, category = 'all'))
create_score_dataframe(score_list)

Unnamed: 0,average_method,category,prediction_method,precision,recall,f1
0,binary,all,bilstm_cv,0.926488,0.901249,0.913694
1,micro,all,bilstm_cv,0.97588,0.97588,0.97588
2,macro,all,bilstm_cv,0.955131,0.944723,0.949838
3,weighted,all,bilstm_cv,0.975659,0.97588,0.975741


## Scores per category (average: binary)

In [40]:
average_method = 'binary'
score_list = []
for category in DOCUMENT_CATEGORIES:
    for prediction_method in PREDICTION_METHODS:
        score_list.append(create_score_record(results.dropna(), prediction_method, average_method, category = category))
create_score_dataframe(score_list).round(2)

Unnamed: 0,average_method,category,prediction_method,precision,recall,f1
0,binary,DL,robbert_v2_dutch_base_DAPT_3_32_512,0.96,0.98,0.97
1,binary,DL,robbert_v2_dutch_base_DAPT_3_32_32__centeredVal,0.96,0.97,0.97
2,binary,DL,robbert_v2_dutch_base_3_32_512,0.95,0.97,0.96
3,binary,DL,robbert_v2_dutch_base_3_32_32__centeredVal,0.95,0.98,0.97
4,binary,DL,MedRoBERTa_3_32_512,0.97,0.97,0.97
5,binary,DL,MedRoBERTa_3_32_32__centeredVal,0.97,0.97,0.97
6,binary,DL,robbert_v2_dutch_base_10_128_512_zeroShot,0.92,0.74,0.82
7,binary,DL,robbert_v2_dutch_base_DAPT_10_128_512_zeroShot,0.93,0.8,0.86
8,binary,DL,MedRoBERTa_10_128_512_zeroShot,0.98,0.87,0.93
9,binary,DL,rule_based,0.9,0.92,0.91


## Scores for all categories combined (average: binary)

In [42]:
# Calculate F1 Scores, drops rows containing NA for the only method that has NA's (robbert)
score_list = []
for prediction_method in PREDICTION_METHODS:
    score_list.append(create_score_record(results.dropna(), prediction_method, average_method, category = 'all'))
create_score_dataframe(score_list).round(3)

Unnamed: 0,average_method,category,prediction_method,precision,recall,f1
0,binary,all,robbert_v2_dutch_base_DAPT_3_32_512,0.953,0.945,0.949
1,binary,all,robbert_v2_dutch_base_DAPT_3_32_32__centeredVal,0.952,0.947,0.949
2,binary,all,robbert_v2_dutch_base_3_32_512,0.951,0.938,0.945
3,binary,all,robbert_v2_dutch_base_3_32_32__centeredVal,0.95,0.931,0.94
4,binary,all,MedRoBERTa_3_32_512,0.955,0.94,0.948
5,binary,all,MedRoBERTa_3_32_32__centeredVal,0.956,0.938,0.947
6,binary,all,robbert_v2_dutch_base_10_128_512_zeroShot,0.886,0.654,0.753
7,binary,all,robbert_v2_dutch_base_DAPT_10_128_512_zeroShot,0.895,0.754,0.818
8,binary,all,MedRoBERTa_10_128_512_zeroShot,0.94,0.821,0.876
9,binary,all,rule_based,0.826,0.892,0.858


## Investigate equal recall rule based and bilstm
Equal recall in both methods could be an indictation of a processing error. Investigate this by looking at examples and confusion matrix.

In [20]:
from sklearn.metrics import confusion_matrix
pd.set_option('display.max_rows', 2000)
# predictions_no_nan.head(2000)

In [21]:
results.dropna().loc[1486]

entity_id                                         DL1647_71_77
category                                                    DL
label                                                  negated
bilstm                                                 negated
bilstm_cv                                              negated
rule_based                                         not negated
robbert_v2_dutch_base_2_64_512                         negated
robbert_v2_dutch_base_3_64_512                         negated
robbert_v2_dutch_base_3_32_512                         negated
robbert_v2_dutch_base_3_32_32                          negated
robbert_v2_dutch_base_3_32_64                          negated
MedRoBERTa_2_64_512                                    negated
MedRoBERTa_3_64_512                                    negated
MedRoBERTa_3_32_512                                    negated
robbert_v2_dutch_base_DAPT_2_64_512                    negated
robbert_v2_dutch_base_DAPT_3_64_512                    

In [22]:
results.dropna().loc[1970]

entity_id                                         DL1802_38_46
category                                                    DL
label                                                  negated
bilstm                                                 negated
bilstm_cv                                          not negated
rule_based                                             negated
robbert_v2_dutch_base_2_64_512                         negated
robbert_v2_dutch_base_3_64_512                         negated
robbert_v2_dutch_base_3_32_512                         negated
robbert_v2_dutch_base_3_32_32                          negated
robbert_v2_dutch_base_3_32_64                          negated
MedRoBERTa_2_64_512                                    negated
MedRoBERTa_3_64_512                                    negated
MedRoBERTa_3_32_512                                    negated
robbert_v2_dutch_base_DAPT_2_64_512                    negated
robbert_v2_dutch_base_DAPT_3_64_512                    

In [23]:
confusion_matrix(results.dropna().label, results.dropna().rule_based)

array([[ 1572,   204],
       [  331, 10346]])

In [24]:
confusion_matrix(results.dropna().label, results.dropna().bilstm_cv)

array([[ 1588,   188],
       [  126, 10551]])

In [25]:
confusion_matrix(results.dropna().label, results.dropna().MedRoBERTa_3_2_512)

AttributeError: 'DataFrame' object has no attribute 'MedRoBERTa_3_2_512'

## Ensemble method

In [None]:
def majority_vote(row):
    predicted_negated = 0
    for method in PREDICTION_METHODS:
        if row[method] == 'negated':
            predicted_negated += 1
    if predicted_negated > 1:
        return 'negated'
    else:
        return 'not negated'

predictions_no_nan['ensemble'] = results.dropna().apply(majority_vote, axis=1)

In [None]:
score_list.append(create_score_record(predictions_no_nan, 'ensemble', average_method='binary', category = 'all'))
score_df = create_score_dataframe(score_list)
score_df

In [None]:
# Create pretty table
pretty_df = score_df.round(3)
pretty_df = pretty_df[['prediction_method', 'precision', 'recall', 'f1']]
pretty_df.rename(columns=PRETTY_NAMES, inplace=True)
pretty_df.replace(PRETTY_NAMES, inplace=True)
print(pretty_df.to_latex(index=False))