# Evaluation

In [1]:
from pathlib import Path
from evaluation_utils import print_statistics, get_document_text
from sklearn.metrics import precision_recall_fscore_support
import pandas as pd
import json
import numpy as np

# Categories in DCC dataset
DOCUMENT_CATEGORIES = ['DL', 'GP', 'RD', 'SP']

# Primary predictions methods
PREDICTION_METHODS = ['rule_based', 'bilstm_cv', 'robbert_512']

# Pretty names:
PRETTY_NAMES = {'rule_based': 'Rule-based', 
                'bilstm_cv': 'BiLSTM', 
                'robbert_512': 'RobBERT',
                'ensemble': 'Ensemble'
                'average_method': 'Weighting average', 
                'category': 'Letter category',
                'prediction_method': 'Prediction method',
                'precision': 'Precision', 
                'recall': 'Recall',
                'f1': 'F1',
                'GP': 'General Practitioner entries',
                'SP': 'Specialist letters',
                'RD': 'Radiology reports',
                'DL': 'Discharge letters'}

# See https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html
AVERAGE_METHODS = ['binary', 'micro', 'macro', 'weighted']

# Input and output
data_dir = Path('data')
annotation_file = data_dir / 'emc-dcc_ann.json'
dcc_dir = data_dir / 'EMCDutchClinicalCorpus'
result_dir = Path('results')
bilstm_predictions_file = result_dir / 'bilstm_predictions.csv.gz'
bilstm_predictions_cv_file = result_dir / 'bilstm_predictions_cv.csv.gz'
rule_based_predictions_file = result_dir / 'rule-based_predictions.csv.gz'
robbert_predictions_file = result_dir / 'robbert_predictions.csv.gz'
merged_predictions_file = result_dir / 'merged_predictions.csv.gz'

# Load annotated data
with open(annotation_file) as f:
    annotations = json.load(f)

SyntaxError: invalid syntax (2297603330.py, line 19)

In [None]:
# Load biLSTM (complete model) predictions
bilstm_predictions = pd.read_csv(bilstm_predictions_file, sep=',')
print(bilstm_predictions.shape)
bilstm_predictions.head()

In [None]:
# Load biLSTM (cross validation) predictions
bilstm_predictions_cv = pd.read_csv(bilstm_predictions_cv_file)
print(bilstm_predictions_cv.shape)
bilstm_predictions_cv.head()

In [None]:
# Load rule based predictions
ruled_based_predictions = pd.read_csv(rule_based_predictions_file)
ruled_based_predictions.drop(['annotation'], axis=1, inplace=True)
print(ruled_based_predictions.shape)
ruled_based_predictions.head()

In [None]:
# Load robbert predictions
robbert_predictions = pd.read_csv(robbert_predictions_file)
robbert_predictions.drop(['category', 'label', 'bilstm', 'bilstm_cv', 'rule_based'], axis=1, inplace=True)
robbert_predictions.rename(columns={'robbert_512_2': 'robbert_512',
                                    'robbert_128_2': 'robbert_128',
                                    'robbert_32_2': 'robbert_32'}, inplace=True)
robbert_predictions.shape
robbert_predictions.head()

In [None]:
# Load annotated data
annotated_records = []
for document in annotations['projects'][0]['documents']:
    document_name = document['name']
    text = document['text']

    for annotation in document['annotations']:

        # Extract data
        start_char = annotation['start']
        end_char = annotation['end']
        negation_value = annotation['meta_anns']['Negation']['value']

        # Create custom ID
        entity_id = f'{document_name}_{start_char}_{end_char}'
        
        # Extract category
        if 'DL' in document_name:
            category = 'DL'
        elif 'GP' in document_name:
            category = 'GP'
        elif 'RD' in document_name:
            category = 'RD'
        else:
            category = 'SP'
        
        # Create row
        annotated_records.append([entity_id, category, negation_value])

annotated_data = pd.DataFrame(annotated_records, columns=['entity_id', 'category', 'label'])
print(annotated_data.shape)
annotated_data.head()

## Merge annotations from different methods

In [None]:
predictions = pd.merge(left=annotated_data, right = bilstm_predictions, left_on='entity_id', right_on='entity_id')
predictions = pd.merge(left=predictions, right = bilstm_predictions_cv, left_on='entity_id', right_on='entity_id')
predictions = pd.merge(left=predictions, right = ruled_based_predictions, left_on='entity_id', right_on='entity_id')
predictions = pd.merge(left=predictions, right = robbert_predictions, left_on='entity_id', right_on='entity_id')
predictions.to_csv(merged_predictions_file, index=False, compression='gzip', line_terminator='\n')
print(predictions.shape)
predictions.head(5)

## Function to calculate scores

In [None]:
def create_score_record(prediction_df, prediction_method, average_method, category='all'):
    """
    Wrapper for precision_recall_fscore_support().

    Returns list containing method parameters and scores that can be used as row in DataFrame.
    See https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html
    """
    if average_method == 'binary':
        pos_label = 'negated'
    else:
        pos_label=1
    
     # Using dropna because our robbert implementation does not work for some examples.
    if category == 'all':
        # Calculate score for all categories. 
        subset = prediction_df[['label', prediction_method]].dropna()
    else:
        # Calculate scores for specific category
        subset = prediction_df.loc[prediction_df.category == category, ['label', prediction_method]].dropna()
    return [average_method, category, prediction_method] + (list(precision_recall_fscore_support(subset.label, subset[prediction_method], labels=['negated', 'not negated'], average = average_method, pos_label=pos_label)))

def create_score_dataframe(score_list):
    scores = pd.DataFrame(score_list, columns=['average_method', 'category', 'prediction_method', 'precision', 'recall', 'f1', 'support'])
    scores.drop(['support'], axis=1, inplace=True)
    return scores

## Compare different ways of calculating scores

In [None]:
prediction_method = 'bilstm_cv'
score_list = []
for average_method in AVERAGE_METHODS:
    score_list.append(create_score_record(predictions, prediction_method, average_method, category = 'all'))
create_score_dataframe(score_list)

## Scores per category (average: binary)

In [None]:
average_method = 'binary'
score_list = []
for category in DOCUMENT_CATEGORIES:
    for prediction_method in PREDICTION_METHODS:
        score_list.append(create_score_record(predictions, prediction_method, average_method, category = category))
score_df = create_score_dataframe(score_list)
score_df

In [None]:
# Create pretty table
pretty_df = score_df.round(3)
pretty_df = pretty_df[['category', 'prediction_method', 'precision', 'recall', 'f1']]
pretty_df.rename(columns=PRETTY_NAMES, inplace=True)
pretty_df.replace(PRETTY_NAMES, inplace=True)
pretty_df

In [2]:
print(pretty_df.to_latex(index=False))

NameError: name 'pretty_df' is not defined

## Scores per category (average: micro)

In [None]:
average_method = 'micro'
score_list = []
for category in DOCUMENT_CATEGORIES:
    for prediction_method in PREDICTION_METHODS:
        score_list.append(create_score_record(predictions, prediction_method, average_method, category = category))
create_score_dataframe(score_list)

## Scores per category (average: macro)

In [None]:
average_method = 'macro'
score_list = []
for category in DOCUMENT_CATEGORIES:
    for prediction_method in PREDICTION_METHODS:
        score_list.append(create_score_record(predictions, prediction_method, average_method, category = category))
create_score_dataframe(score_list)

## Scores per category (average: weighted)

In [None]:
average_method = 'weighted'
score_list = []
for category in DOCUMENT_CATEGORIES:
    for prediction_method in PREDICTION_METHODS:
        score_list.append(create_score_record(predictions, prediction_method, average_method, category = category))
create_score_dataframe(score_list)

## Scores for all categories combined (average: binary)

In [None]:
# Calculate F1 Scores, drops rows containing NA for the only method that has NA's (robbert)
score_list = []
for prediction_method in PREDICTION_METHODS:
    score_list.append(create_score_record(predictions, prediction_method, average_method='binary', category='all'))
print(len(predictions))
create_score_dataframe(score_list)

In [None]:
# Calculate F1 Scores, drops rows containing NA for all methods
predictions_no_nan = predictions.copy()[['entity_id', 'label', 'bilstm_cv', 'rule_based', 'robbert_512']].dropna()
score_list = []
for prediction_method in PREDICTION_METHODS:
    score_list.append(create_score_record(predictions_no_nan, prediction_method, average_method='binary', category = 'all'))
print(len(predictions_no_nan))
create_score_dataframe(score_list)

## Investigate equal recall rule based and bilstm
Equal recall in both methods could be an indictation of a processing error. Investigate this by looking at examples and confusion matrix.

In [None]:
from sklearn.metrics import confusion_matrix
pd.set_option('display.max_rows', 2000)
# predictions_no_nan.head(2000)

In [None]:
predictions_no_nan.loc[1486]

In [None]:
predictions_no_nan.loc[1970]

In [None]:
confusion_matrix(predictions_no_nan.label, predictions_no_nan.rule_based)

In [None]:
confusion_matrix(predictions_no_nan.label, predictions_no_nan.bilstm_cv)

## Ensemble method

In [None]:
def majority_vote(row):
    predicted_negated = 0
    for method in PREDICTION_METHODS:
        if row[method] == 'negated':
            predicted_negated += 1
    if predicted_negated > 1:
        return 'negated'
    else:
        return 'not negated'

predictions_no_nan['ensemble'] = predictions_no_nan.apply(majority_vote, axis=1)

In [None]:
score_list.append(create_score_record(predictions_no_nan, 'ensemble', average_method='binary', category = 'all'))
score_df = create_score_dataframe(score_list)
score_df

In [None]:
# Create pretty table
pretty_df = score_df.round(3)
pretty_df = pretty_df[['prediction_method', 'precision', 'recall', 'f1']]
pretty_df.rename(columns=PRETTY_NAMES, inplace=True)
pretty_df.replace(PRETTY_NAMES, inplace=True)
print(pretty_df.to_latex(index=False))