# Evaluation

In [1]:
from pathlib import Path
from evaluation_utils import print_statistics, get_document_text
from sklearn.metrics import precision_recall_fscore_support
import pandas as pd
import json
import numpy as np
import scipy as sc

# Categories in DCC dataset
DOCUMENT_CATEGORIES = ['DL', 'GP', 'RD', 'SP']

# Predictions methods used in this project
#PREDICTION_METHODS = ['rule_based', 'bilstm_cv', 'robbert_512_2', 'MedRoBERTa_3_2_512', 
#                      'robbert_v2_dutch_base_DAPT_2_64_512','mv_ensemble', 'mv_ensemble_all']
PREDICTION_METHODS = [ 'MedRoBERTa_3_2_512', 'robbert_v2_dutch_base_DAPT_3_64_512', 
                      'robbert_v2_dutch_base_3_64_512', 'medroberta_journals_ARGUS_nvvc_IBD_RA_3_64_512',
                     'mv_ensemble', 'mv_ensemble_all']

# Pretty names:
PRETTY_NAMES = {'rule_based': 'Rule-based', 
                'bilstm_cv': 'BiLSTM', 
                'robbert_512': 'RobBERT',
                'ensemble': 'Ensemble'
                'average_method': 'Weighting average', 
                'category': 'Letter category',
                'prediction_method': 'Prediction method',
                'precision': 'Precision', 
                'recall': 'Recall',
                'f1': 'F1',
                'GP': 'General Practitioner entries',
                'SP': 'Specialist letters',
                'RD': 'Radiology reports',
                'DL': 'Discharge letters'}

# See https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html
AVERAGE_METHODS = ['binary', 'micro', 'macro', 'weighted']

# Input and output
data_dir = Path('data')
annotation_file = data_dir / 'emc-dcc_ann.json'
dcc_dir = data_dir / 'EMCDutchClinicalCorpus'
result_dir = Path('results')
bilstm_predictions_file = result_dir / 'bilstm_predictions.csv.gz'
bilstm_predictions_cv_file = result_dir / 'bilstm_predictions_cv.csv.gz'
rule_based_predictions_file = result_dir / 'rule-based_predictions.csv.gz'
robbert_predictions_file = result_dir / 'robbert_predictions.csv.gz'
medroberta_predictions_file = result_dir / 'MedRoBERTa_3_2_512_predictions.csv.gz'
robbert_dapt_predicton_file= result_dir / 'robbert_v2_dutch_base_DAPT_2_64_512_predictions.csv.gz'
medroberta_dapt_prediction_file = result_dir / 'medroberta_journals_ARGUS_nvvc_IBD_RA_2_64_512_predictions.csv.gz'
robbert_dapt_prediction2_file = result_dir /'robbert_v2_dutch_base_DAPT_3_64_512_predictions.csv.gz'
robbert_prediction2_file = result_dir /'robbert_v2_dutch_base_3_64_512_predictions.csv.gz'
medroberta_dapt_prediction2_file = result_dir /'medroberta_journals_ARGUS_nvvc_IBD_RA_3_64_512_predictions.csv.gz'
merged_result_file = result_dir / 'merged_results.csv.gz'



# Load annotated data
with open(annotation_file) as f:
    annotations = json.load(f)

SyntaxError: invalid syntax (2297603330.py, line 19)

In [None]:
# Load biLSTM (complete model) predictions
bilstm_predictions = pd.read_csv(bilstm_predictions_file, sep=',')
print(bilstm_predictions.shape)
bilstm_predictions.head()

In [None]:
# Load biLSTM (cross validation) predictions
bilstm_predictions_cv = pd.read_csv(bilstm_predictions_cv_file)
print(bilstm_predictions_cv.shape)
bilstm_predictions_cv.head()

In [None]:
# Load rule based predictions
ruled_based_predictions = pd.read_csv(rule_based_predictions_file)
ruled_based_predictions.drop(['annotation'], axis=1, inplace=True)
print(ruled_based_predictions.shape)
ruled_based_predictions.head()

In [None]:
# Load robbert predictions
robbert_predictions = pd.read_csv(robbert_predictions_file)
robbert_predictions.drop(['category', 'label', 'bilstm', 'bilstm_cv', 'rule_based'], axis=1, inplace=True)
robbert_predictions.rename(columns={'robbert_512_2': 'robbert_512',
                                    'robbert_128_2': 'robbert_128',
                                    'robbert_32_2': 'robbert_32'}, inplace=True)
robbert_predictions.shape
robbert_predictions.head()

Unnamed: 0,entity_id,robbert_512_2,robbert_128_2,robbert_32_2,robbert_32_1
0,DL1111_32_46,not negated,not negated,not negated,not negated
1,DL1111_272_280,not negated,not negated,,
2,DL1111_363_377,not negated,not negated,,
3,DL1112_22_28,negated,negated,negated,negated
4,DL1113_59_67,not negated,not negated,not negated,not negated


In [6]:
# Load medroberta results
medroberta_predictions = pd.read_csv(medroberta_predictions_file)
medroberta_predictions.drop(['category', 'label'], axis=1, inplace=True)
medroberta_predictions.shape
medroberta_predictions.head()

Unnamed: 0,entity_id,MedRoBERTa_3_2_512
0,DL1111_32_46,not negated
1,DL1111_272_280,not negated
2,DL1111_363_377,not negated
3,DL1112_22_28,negated
4,DL1113_59_67,not negated


In [7]:
# Load RobBERT DAPT
robbert_dapt_predicton = pd.read_csv(robbert_dapt_predicton_file)
robbert_dapt_predicton.drop(['category', 'label'], axis=1, inplace=True)
robbert_dapt_predicton.shape
robbert_dapt_predicton.head()

Unnamed: 0,entity_id,robbert_v2_dutch_base_DAPT_2_64_512
0,DL1111_32_46,not negated
1,DL1111_272_280,not negated
2,DL1111_363_377,not negated
3,DL1112_22_28,negated
4,DL1113_59_67,not negated


In [8]:
# Load RobBERT DAPT
medroberta_dapt_prediction = pd.read_csv(medroberta_dapt_prediction_file)
medroberta_dapt_prediction.drop(['category', 'label'], axis=1, inplace=True)
medroberta_dapt_prediction.shape
medroberta_dapt_prediction.head()

Unnamed: 0,entity_id,medroberta_journals_ARGUS_nvvc_IBD_RA_2_64_512
0,DL1111_32_46,not negated
1,DL1111_272_280,not negated
2,DL1111_363_377,not negated
3,DL1112_22_28,negated
4,DL1113_59_67,not negated


In [9]:
# Load RobBERT DAPT, 3 epochs
robbert_dapt_prediction2 = pd.read_csv(robbert_dapt_prediction2_file)
robbert_dapt_prediction2.drop(['category', 'label'], axis=1, inplace=True)
robbert_dapt_prediction2.shape
robbert_dapt_prediction2.head()


Unnamed: 0,entity_id,robbert_v2_dutch_base_DAPT_3_64_512
0,DL1111_32_46,not negated
1,DL1111_272_280,not negated
2,DL1111_363_377,not negated
3,DL1112_22_28,negated
4,DL1113_59_67,not negated


In [10]:
# Load RobBERT, 3 epochs
robbert_prediction2 = pd.read_csv(robbert_prediction2_file)
robbert_prediction2.drop(['category', 'label'], axis=1, inplace=True)
robbert_prediction2.shape
robbert_prediction2.head()


Unnamed: 0,entity_id,robbert_v2_dutch_base_3_64_512
0,DL1111_32_46,not negated
1,DL1111_272_280,not negated
2,DL1111_363_377,not negated
3,DL1112_22_28,negated
4,DL1113_59_67,not negated


In [11]:
# Load RobBERT, 3 epochs
medroberta_dapt_prediction2 = pd.read_csv(medroberta_dapt_prediction2_file)
medroberta_dapt_prediction2.drop(['category', 'label'], axis=1, inplace=True)
medroberta_dapt_prediction2.shape
medroberta_dapt_prediction2.head()


Unnamed: 0,entity_id,medroberta_journals_ARGUS_nvvc_IBD_RA_3_64_512
0,DL1111_32_46,not negated
1,DL1111_272_280,not negated
2,DL1111_363_377,not negated
3,DL1112_22_28,negated
4,DL1113_59_67,not negated


In [12]:
# Load annotated data
annotated_records = []
for document in annotations['projects'][0]['documents']:
    document_name = document['name']
    text = document['text']

    for annotation in document['annotations']:

        # Extract data
        start_char = annotation['start']
        end_char = annotation['end']
        negation_value = annotation['meta_anns']['Negation']['value']

        # Create custom ID
        entity_id = f'{document_name}_{start_char}_{end_char}'
        
        # Extract category
        if 'DL' in document_name:
            category = 'DL'
        elif 'GP' in document_name:
            category = 'GP'
        elif 'RD' in document_name:
            category = 'RD'
        else:
            category = 'SP'
        
        # Create row
        annotated_records.append([entity_id, category, negation_value])

annotated_data = pd.DataFrame(annotated_records, columns=['entity_id', 'category', 'label'])
print(annotated_data.shape)
annotated_data.head()

(12551, 3)


Unnamed: 0,entity_id,category,label
0,DL1111_32_46,DL,not negated
1,DL1111_272_280,DL,not negated
2,DL1111_363_377,DL,not negated
3,DL1112_22_28,DL,negated
4,DL1113_59_67,DL,not negated


## Merge annotations from different methods

In [13]:
results = pd.merge(left=annotated_data, right = bilstm_predictions, left_on='entity_id', right_on='entity_id')
results = pd.merge(left=results, right = bilstm_predictions_cv, left_on='entity_id', right_on='entity_id')
results = pd.merge(left=results, right = ruled_based_predictions, left_on='entity_id', right_on='entity_id')
results = pd.merge(left=results, right = robbert_predictions, left_on='entity_id', right_on='entity_id')
results = pd.merge(left=results, right = medroberta_predictions, left_on='entity_id', right_on='entity_id')
results = pd.merge(left=results, right = robbert_dapt_predicton, left_on='entity_id', right_on='entity_id')
results = pd.merge(left=results, right = medroberta_dapt_prediction, left_on='entity_id', right_on='entity_id')
results = pd.merge(left=results, right = robbert_dapt_prediction2, left_on='entity_id', right_on='entity_id')
results = pd.merge(left=results, right = robbert_prediction2, left_on='entity_id', right_on='entity_id')
results = pd.merge(left=results, right = medroberta_dapt_prediction2, left_on='entity_id', right_on='entity_id')
results.to_csv(merged_result_file, index=False, compression='gzip', line_terminator='\n')
print(results.shape)
results.head(5)

(12551, 16)


Unnamed: 0,entity_id,category,label,bilstm,bilstm_cv,rule_based,robbert_512_2,robbert_128_2,robbert_32_2,robbert_32_1,MedRoBERTa_3_2_512,robbert_v2_dutch_base_DAPT_2_64_512,medroberta_journals_ARGUS_nvvc_IBD_RA_2_64_512,robbert_v2_dutch_base_DAPT_3_64_512,robbert_v2_dutch_base_3_64_512,medroberta_journals_ARGUS_nvvc_IBD_RA_3_64_512
0,DL1111_32_46,DL,not negated,not negated,not negated,not negated,not negated,not negated,not negated,not negated,not negated,not negated,not negated,not negated,not negated,not negated
1,DL1111_272_280,DL,not negated,not negated,not negated,not negated,not negated,not negated,,,not negated,not negated,not negated,not negated,not negated,not negated
2,DL1111_363_377,DL,not negated,not negated,not negated,not negated,not negated,not negated,,,not negated,not negated,not negated,not negated,not negated,not negated
3,DL1112_22_28,DL,negated,negated,negated,negated,negated,negated,negated,negated,negated,negated,negated,negated,negated,negated
4,DL1113_59_67,DL,not negated,not negated,not negated,not negated,not negated,not negated,not negated,not negated,not negated,not negated,not negated,not negated,not negated,not negated


In [14]:
results.drop(['robbert_128_2', 'robbert_32_2', 'robbert_32_1'], axis=1, inplace=True)

In [15]:
results['mv_ensemble_all'] = results[['bilstm', 'rule_based', 
                                      'MedRoBERTa_3_2_512', 'robbert_v2_dutch_base_DAPT_3_64_512',
                                      'robbert_v2_dutch_base_3_64_512']].apply(
                                        lambda x: sc.stats.mode(x)[0][0], axis=1)
results['mv_ensemble'] = results[['bilstm', 'rule_based', 'robbert_512_2']].apply(
                                        lambda x: sc.stats.mode(x)[0][0], axis=1)

## Function to calculate scores

In [16]:
def create_score_record(results, prediction_method, average_method, category='all'):
    """
    Wrapper for precision_recall_fscore_support().

    Returns list containing method parameters and scores that can be used as row in DataFrame.
    See https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html
    """
    if average_method == 'binary':
        pos_label = 'negated'
    else:
        pos_label=1
    
     # Using dropna because our robbert implementation does not work for some examples.
    if category == 'all':
        # Calculate score for all categories. 
        subset = prediction_df[['label', prediction_method]].dropna()
    else:
        # Calculate scores for specific category
        subset = results.loc[results.category == category, ['label', prediction_method]].dropna()
    return [average_method, category, prediction_method] + \
            (list(precision_recall_fscore_support(subset.label, subset[prediction_method], 
                                                  labels=['negated', 'not negated'], average = 
                                                  average_method, pos_label=pos_label)))

def create_score_dataframe(score_list):
    scores = pd.DataFrame(score_list, columns=['average_method', 'category', 
                                               'prediction_method', 'precision', 
                                               'recall', 'f1', 'support'])
    scores.drop(['support'], axis=1, inplace=True)
    return scores

In [17]:
results.dropna().groupby('category').size()

category
DL    2549
GP    2639
RD    3062
SP    2147
dtype: int64

## Compare different ways of calculating scores

In [18]:
prediction_method = 'bilstm_cv'
score_list = []
for average_method in AVERAGE_METHODS:
    score_list.append(create_score_record(results.dropna(), prediction_method, average_method, category = 'all'))
create_score_dataframe(score_list)

Unnamed: 0,average_method,category,prediction_method,precision,recall,f1
0,binary,all,bilstm_cv,0.936278,0.921167,0.928661
1,micro,all,bilstm_cv,0.978071,0.978071,0.978071
2,macro,all,bilstm_cv,0.960933,0.954836,0.957852
3,weighted,all,bilstm_cv,0.977947,0.978071,0.977998


## Scores per category (average: binary)

In [28]:
average_method = 'binary'
score_list = []
for category in DOCUMENT_CATEGORIES:
    for prediction_method in PREDICTION_METHODS:
        score_list.append(create_score_record(results.dropna(), prediction_method, average_method, category = category))
create_score_dataframe(score_list).replace('robbert_512_2', 'RobBERT')\
                                  .replace('robbert_v2_dutch_base_DAPT_2_64_512', 'RobBERT DAPT')\
                                  .round(2)

Unnamed: 0,average_method,category,prediction_method,precision,recall,f1
0,binary,DL,MedRoBERTa_3_2_512,0.97,0.97,0.97
1,binary,DL,robbert_v2_dutch_base_DAPT_3_64_512,0.96,0.99,0.97
2,binary,DL,robbert_v2_dutch_base_3_64_512,0.96,0.98,0.97
3,binary,DL,medroberta_journals_ARGUS_nvvc_IBD_RA_3_64_512,0.96,0.97,0.96
4,binary,DL,mv_ensemble,0.98,0.99,0.98
5,binary,DL,mv_ensemble_all,0.97,0.99,0.98
6,binary,GP,MedRoBERTa_3_2_512,0.97,0.96,0.96
7,binary,GP,robbert_v2_dutch_base_DAPT_3_64_512,0.96,0.91,0.94
8,binary,GP,robbert_v2_dutch_base_3_64_512,0.96,0.9,0.93
9,binary,GP,medroberta_journals_ARGUS_nvvc_IBD_RA_3_64_512,0.96,0.93,0.95


## Scores per category (average: micro)

In [None]:
average_method = 'micro'
score_list = []
for category in DOCUMENT_CATEGORIES:
    for prediction_method in PREDICTION_METHODS:
        score_list.append(create_score_record(results, prediction_method, average_method, category = category))
create_score_dataframe(score_list)

## Scores per category (average: macro)

In [None]:
average_method = 'macro'
score_list = []
for category in DOCUMENT_CATEGORIES:
    for prediction_method in PREDICTION_METHODS:
        score_list.append(create_score_record(results, prediction_method, average_method, category = category))
create_score_dataframe(score_list)

## Scores per category (average: weighted)

In [None]:
average_method = 'weighted'
score_list = []
for category in DOCUMENT_CATEGORIES:
    for prediction_method in PREDICTION_METHODS:
        score_list.append(create_score_record(results, prediction_method, average_method, category = category))
create_score_dataframe(score_list)

## Scores for all categories combined (average: binary)

In [25]:
# Calculate F1 Scores, drops rows containing NA for the only method that has NA's (robbert)
score_list = []
for prediction_method in PREDICTION_METHODS:
    score_list.append(create_score_record(results.dropna(), prediction_method, average_method, category = 'all'))
create_score_dataframe(score_list).round(2)

Unnamed: 0,average_method,category,prediction_method,precision,recall,f1
0,binary,all,MedRoBERTa_3_2_512,0.96,0.95,0.95
1,binary,all,robbert_v2_dutch_base_DAPT_3_64_512,0.95,0.95,0.95
2,binary,all,robbert_v2_dutch_base_3_64_512,0.95,0.93,0.94
3,binary,all,medroberta_journals_ARGUS_nvvc_IBD_RA_3_64_512,0.95,0.94,0.95
4,binary,all,mv_ensemble,0.97,0.95,0.96
5,binary,all,mv_ensemble_all,0.97,0.95,0.96


In [None]:
# Calculate F1 Scores, drops rows containing NA for all methods
predictions_no_nan = predictions.copy()[['entity_id', 'label', 'bilstm_cv', 'rule_based', 'robbert_512']].dropna()
score_list = []
for prediction_method in PREDICTION_METHODS:
    score_list.append(create_score_record(predictions_no_nan, prediction_method, average_method='binary', category = 'all'))
print(len(predictions_no_nan))
create_score_dataframe(score_list)

## Investigate equal recall rule based and bilstm
Equal recall in both methods could be an indictation of a processing error. Investigate this by looking at examples and confusion matrix.

In [None]:
from sklearn.metrics import confusion_matrix
pd.set_option('display.max_rows', 2000)
# predictions_no_nan.head(2000)

In [None]:
predictions_no_nan.loc[1486]

In [None]:
predictions_no_nan.loc[1970]

In [None]:
confusion_matrix(predictions_no_nan.label, predictions_no_nan.rule_based)

In [None]:
confusion_matrix(predictions_no_nan.label, predictions_no_nan.bilstm_cv)

## Ensemble method

In [None]:
def majority_vote(row):
    predicted_negated = 0
    for method in PREDICTION_METHODS:
        if row[method] == 'negated':
            predicted_negated += 1
    if predicted_negated > 1:
        return 'negated'
    else:
        return 'not negated'

predictions_no_nan['ensemble'] = predictions_no_nan.apply(majority_vote, axis=1)

In [None]:
score_list.append(create_score_record(predictions_no_nan, 'ensemble', average_method='binary', category = 'all'))
score_df = create_score_dataframe(score_list)
score_df

In [None]:
# Create pretty table
pretty_df = score_df.round(3)
pretty_df = pretty_df[['prediction_method', 'precision', 'recall', 'f1']]
pretty_df.rename(columns=PRETTY_NAMES, inplace=True)
pretty_df.replace(PRETTY_NAMES, inplace=True)
print(pretty_df.to_latex(index=False))