# Compare errors and error categories
In previous notebooks we did cross validation for a set of entities to show how well our methods are able to predict a negation. Afterwards we extracted the errors per fold, manually checked what kind of errors were made and grouped the errors into categories. This notebook compares the errors and error categories between the different methods, to see whether certain types of errors are made by specific methods.

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio
from pathlib import Path
from sklearn.metrics import cohen_kappa_score

# Allow up to 2000 rows when pandas displays a table
pd.set_option('display.max_rows', 2000)

# Disable MathJax in the Kaleido export scope to solve an issue with PDF images,
# see https://github.com/plotly/plotly.py/issues/3469
pio.kaleido.scope.mathjax = None

In [None]:
# Pretty names for plots
# Keys: internal identifiers of a method, or of a combination of methods.
pretty_names_error_methods = dict(
    rule_based='Rule-based',
    bilstm='BiLSTM',
    robbert='RobBERT',
    bilstm_and_rule_based='BiLSTM & rule-based',
    robbert_and_rule_based='RobBERT & rule-based',
    robbert_and_bilstm='RobBERT & BiLSTM',
    all='All',
)

# Keys: internal identifiers of the error categories.
pretty_names_error_categories = dict(
    annotation_error='Annotation error',
    negation_of_different_term='Negation of different term',
    ambiguous='Ambiguous',
    speculation='Speculation',
    other='Other',
    punctuation='Punctuation',
    minus='Minus',
    scope='Scope',
    uncommon_negation='Uncommon negation',
    wrong_modality='Wrong modality',
)

In [None]:
# Set file paths
# Error-analysis files: one false-positive and one false-negative file per method
error_analysis_dir = Path('data/error_analyses/')
fp_rule_based_file = error_analysis_dir.joinpath('false-positives_rule-based.csv')
fn_rule_based_file = error_analysis_dir.joinpath('false-negatives_rule-based.csv')
fp_bilstm_file = error_analysis_dir.joinpath('false-positives_bilstm.ods')
fn_bilstm_file = error_analysis_dir.joinpath('false-negatives_bilstm.ods')
fp_robbert_file = error_analysis_dir.joinpath('false-positives_robbert.csv')
fn_robbert_file = error_analysis_dir.joinpath('false-negatives_robbert.csv')

# File used below to select the entities that have a prediction in all methods
predictions_file = Path('results/merged_predictions.csv.gz')

# Output directory for the exported figures; created when it does not exist yet
figure_dir = Path('figures')
figure_dir.mkdir(exist_ok=True)

In [None]:
# Open files and normalize
def _load_error_categories(reader, path, id_column, category_column, method,
                           normalize_spaces=True, **reader_options):
    """Read one error-analysis file into a single-column frame indexed by entity id.

    Parameters
    ----------
    reader : callable
        pandas reader to use (`pd.read_csv` or `pd.read_excel`).
    path : path-like
        File to read.
    id_column, category_column : str
        Names of the entity-id and category columns in the file.
    method : str
        Name given to the category column in the result.
    normalize_spaces : bool
        When True, spaces inside the values are replaced by underscores.
    **reader_options
        Extra keyword arguments passed to `reader` (e.g. `sep`).

    Returns
    -------
    pandas.DataFrame
        One column named `method`, with an index named 'entity_id'.
    """
    categories = reader(path, usecols=[id_column, category_column],
                        index_col=id_column, **reader_options)
    categories = categories.rename(columns={category_column: method})
    categories.index.name = 'entity_id'
    if normalize_spaces:
        categories = categories.replace(' ', '_', regex=True)
    return categories


fp_rule_based = _load_error_categories(pd.read_csv, fp_rule_based_file,
                                       'entity_id', 'category', 'rule_based', sep=';')
fn_rule_based = _load_error_categories(pd.read_csv, fn_rule_based_file,
                                       'entity_id', 'category', 'rule_based', sep=';')

# The BiLSTM spreadsheets name the id column 'entity id' (with a space)
fp_bilstm = _load_error_categories(pd.read_excel, fp_bilstm_file,
                                   'entity id', 'category', 'bilstm')
fn_bilstm = _load_error_categories(pd.read_excel, fn_bilstm_file,
                                   'entity id', 'category', 'bilstm')

# The RobBERT files use an 'error_type' column; their values are used as-is
# NOTE(review): presumably these already contain underscores instead of spaces - confirm
fp_robbert = _load_error_categories(pd.read_csv, fp_robbert_file,
                                    'entity_id', 'error_type', 'robbert',
                                    normalize_spaces=False, sep=',')
fn_robbert = _load_error_categories(pd.read_csv, fn_robbert_file,
                                    'entity_id', 'error_type', 'robbert',
                                    normalize_spaces=False, sep=',')

In [None]:
# Print statistics per method
error_tables_per_method = {
    'rule based': (fp_rule_based, fn_rule_based),
    'BiLSTM': (fp_bilstm, fn_bilstm),
    'RoBBERT': (fp_robbert, fn_robbert),
}

total_number_of_errors = 0
for method_label, (false_positives, false_negatives) in error_tables_per_method.items():
    print(f'Number of false positives using {method_label}: {len(false_positives)}')
    print(f'Number of false negatives using {method_label}: {len(false_negatives)}')
    total_number_of_errors += len(false_positives) + len(false_negatives)

# Sum over all six tables
print(f'Total number of errors: {total_number_of_errors}')

In [None]:
def create_count_table(rule_based, bilstm, robbert):
    """Count how often each error category occurs per method.

    Parameters
    ----------
    rule_based, bilstm, robbert : pandas.DataFrame or pandas.Series
        Error categories of one method: a single-column frame or a Series.

    Returns
    -------
    pandas.DataFrame
        One row per category plus a final 'Total' row, one column per method.
        All values are strings; a category a method never produced is '0'.
        Row labels are mapped through `pretty_names_error_categories`, column
        labels through `pretty_names_error_methods`.
    """
    count_table = pd.DataFrame({'rule_based': rule_based.value_counts(),
                                'bilstm': bilstm.value_counts(),
                                'robbert': robbert.value_counts()}).fillna(0)
    # DataFrame.value_counts() returns a one-level MultiIndex. Flatten it
    # directly instead of going through the auto-generated 'level_0' column,
    # which only exists when the index is an unnamed MultiIndex (Series input
    # or inputs sharing a column name would raise a KeyError).
    count_table.index = count_table.index.get_level_values(0)
    count_table.index.name = None
    count_table.loc['Total'] = count_table.sum(numeric_only=True, axis=0)
    # Counts became floats through the NaN fill; show them as whole numbers
    count_table = count_table.astype('int').astype('string')
    count_table = count_table.rename(pretty_names_error_categories)
    return count_table.rename(columns=pretty_names_error_methods)

In [None]:
def create_count_table_with_perc(rule_based, bilstm, robbert):
    """Count the error categories per method and add each category's share in percent.

    Parameters
    ----------
    rule_based, bilstm, robbert : pandas.DataFrame or pandas.Series
        Error categories of one method: a single-column frame or a Series.

    Returns
    -------
    pandas.DataFrame
        One row per category plus a final 'Total' row. Every method has a count
        column followed by a '<method>_perc' column with the percentage rounded
        to a whole number. All values are strings; a category a method never
        produced is '0'. Row labels are mapped through
        `pretty_names_error_categories`, column labels through
        `pretty_names_error_methods`. The 'Total' of a percentage column is the
        sum of the rounded percentages, so it can differ slightly from 100.
    """
    columns = {}
    for method, categories in (('rule_based', rule_based),
                               ('bilstm', bilstm),
                               ('robbert', robbert)):
        columns[method] = categories.value_counts()
        # Scale to percent *before* rounding. round(p, 2) * 100 can yield e.g.
        # 28.999999999999996, which the integer cast below would truncate to 28.
        columns[f'{method}_perc'] = (categories.value_counts(normalize=True) * 100).round()
    count_table = pd.DataFrame(columns).fillna(0)
    # DataFrame.value_counts() returns a one-level MultiIndex. Flatten it
    # directly instead of relying on the auto-generated 'level_0' column.
    count_table.index = count_table.index.get_level_values(0)
    count_table.index.name = None
    count_table.loc['Total'] = count_table.sum(numeric_only=True, axis=0)
    count_table = count_table.astype('int').astype('string')
    count_table = count_table.rename(pretty_names_error_categories)
    return count_table.rename(columns=pretty_names_error_methods)

## Create unfiltered count tables

In [None]:
# false_positive_counts = create_count_table(fp_rule_based, fp_bilstm, fp_robbert)
# false_positive_counts

In [None]:
# false_negative_counts = create_count_table(fn_rule_based, fn_bilstm, fn_robbert)
# false_negative_counts

## Filter entities
Remove entities that don't have a prediction in all methods.

In [None]:
# Load entities that have a prediction in all methods
# Only the entity_id column of the predictions file is read.
predictions = pd.read_csv(predictions_file, usecols=['entity_id'])
entities = predictions['entity_id'].tolist()
number_of_entities = len(entities)
print(f'Total number of entities with a prediction: {number_of_entities}')

In [None]:
# Keep, in every error table, only the rows whose entity id occurs in `entities`
(fp_rule_based, fn_rule_based,
 fp_bilstm, fn_bilstm,
 fp_robbert, fn_robbert) = (
    error_table[error_table.index.isin(entities)]
    for error_table in (fp_rule_based, fn_rule_based,
                        fp_bilstm, fn_bilstm,
                        fp_robbert, fn_robbert)
)

## Create filtered count tables

In [None]:
# Count table of the false-positive error categories per method (filtered entities)
false_positive_counts = create_count_table(fp_rule_based, fp_bilstm, fp_robbert)
# false_positive_counts

In [None]:
# print(false_positive_counts.to_latex(index=True))

In [None]:
# False positives: counts and percentages per error category and method
false_positive_counts_perc = create_count_table_with_perc(fp_rule_based, fp_bilstm, fp_robbert)
false_positive_counts_perc

In [None]:
# Print the false-positive table as LaTeX source, including the row labels
print(false_positive_counts_perc.to_latex(index=True))

In [None]:
# Count table of the false-negative error categories per method (filtered entities)
false_negative_counts = create_count_table(fn_rule_based, fn_bilstm, fn_robbert)
# false_negative_counts

In [None]:
# False negatives: counts and percentages per error category and method
false_negative_counts_perc = create_count_table_with_perc(fn_rule_based, fn_bilstm, fn_robbert)
false_negative_counts_perc

In [None]:
# Print the false-negative table as LaTeX source, including the row labels
print(false_negative_counts_perc.to_latex(index=True))

## Concat false positives and negatives
For creating figures and comparing error categories, we concatenated false positives and negatives.

In [None]:
# Stack the false positives and false negatives of each method into one table per method
errors_rule_based = pd.concat([fp_rule_based, fn_rule_based])
errors_bilstm = pd.concat([fp_bilstm, fn_bilstm])
errors_robbert = pd.concat([fp_robbert, fn_robbert])

In [None]:
# Merge all errors based on entity_id
# axis=1 places the per-method tables side by side, aligned on the index;
# a method without an error for an entity gets a missing value in its column.
error_categories = pd.concat([errors_rule_based, errors_bilstm, errors_robbert], axis=1)
print(f'Number of entities with an error in at least 1 method: {error_categories.shape[0]}')
error_categories.head()

## Methods compared based on errors

In [None]:
# Convert errors to binary format:
# True where a method has an error category for the entity, False where the value is missing
errors = error_categories.notna()
errors.head()

In [None]:
def check_errors_multiple_methods(row):
    """Name the method, or combination of methods, that made an error.

    `row` is indexed by 'robbert', 'bilstm' and 'rule_based'; each value is
    interpreted as a boolean error flag.

    Returns the method name when exactly one flag is set, '<a>_and_<b>' when
    exactly two are set, and 'all' otherwise (this also covers a row in which
    no flag is set).
    """
    flags = (bool(row['robbert']), bool(row['bilstm']), bool(row['rule_based']))

    # (robbert, bilstm, rule_based) -> label
    labels = {
        (True, False, False): 'robbert',
        (False, True, False): 'bilstm',
        (False, False, True): 'rule_based',
        (True, True, False): 'robbert_and_bilstm',
        (True, False, True): 'robbert_and_rule_based',
        (False, True, True): 'bilstm_and_rule_based',
    }
    return labels.get(flags, 'all')

# Label every row with the method(s) that made an error for that entity
errors['errors'] = errors.apply(check_errors_multiple_methods, axis=1)
errors.head()

In [None]:
# Create a table of the number of errors per method
method_order = ["all", "rule_based", "bilstm", "robbert",
                "bilstm_and_rule_based", "robbert_and_rule_based", "robbert_and_bilstm"]
errors_counts = (
    errors.errors.value_counts()
    # fill_value=0: a combination that never occurs is a count of 0, not NaN
    # (NaN would also turn the integer counts into floats)
    .reindex(method_order, fill_value=0)
    .rename(pretty_names_error_methods)
)
errors_counts.reset_index()

In [None]:
# Plot whether methods make the same or different errors
# Pass explicit, already-labelled columns to plotly. The implicit 'index' key
# only matches when the Series index has no name, which depends on the pandas
# version that produced `errors_counts`.
errors_counts_long = errors_counts.rename_axis('Method').reset_index(name='Number of errors')
fig = px.bar(errors_counts_long, x='Method', y='Number of errors',
             title='Methods compared based on errors',
             template='plotly_white')
fig.update_layout(showlegend=False, font_family="Serif")
fig.show()

In [None]:
# Export the current figure as PDF into the figure directory
pio.write_image(fig, figure_dir / 'fig1-methods-compared-on-errors.pdf')

## Error categories compared between methods

In [None]:
# Create long format table of errors per method & category
# One row per (entity, method) pair that has a category; the entity index is kept.
error_categories_m = (
    error_categories
    .melt(ignore_index=False, var_name='Method', value_name='Category')
    .dropna()
)
print(f'Total number of errors: {error_categories_m.shape[0]}')
error_categories_m.head()

In [None]:
# Add pretty names
# Assign the result back instead of calling replace(inplace=True) on a selected
# column: that is a chained in-place operation, which raises a warning in
# recent pandas versions and leaves the frame unchanged under Copy-on-Write.
error_categories_m['Method'] = error_categories_m['Method'].replace(pretty_names_error_methods)
error_categories_m['Category'] = error_categories_m['Category'].replace(pretty_names_error_categories)
error_categories_m.head()

In [None]:
# Create wide formatted table of number of errors per category & method
# reset_index(name='Count') names the count column explicitly. Renaming the
# column `0` only works on pandas versions where value_counts() returns an
# unnamed Series; newer versions name it 'count', so 'Count' would be missing.
error_category_counts = error_categories_m.value_counts().reset_index(name='Count')
error_category_counts.pivot(columns='Category', index='Method')

In [None]:
# Plot errors per category
# The label key must match the column name exactly ('Count', not 'count'),
# otherwise the axis keeps the raw column name.
fig = px.bar(error_category_counts, x='Category', color='Method', y='Count',
             title='Error per category',
             labels={'Count': 'Number of errors'},
             template='plotly_white')
fig.update_layout(font_family="Serif")
fig.show()

In [None]:
# Export the current figure as PDF into the figure directory
pio.write_image(fig, figure_dir / 'fig2-errors-per-category.pdf')

In [None]:
# Plot errors per method
# The label key must match the column name exactly ('Count', not 'count'),
# otherwise the axis keeps the raw column name.
fig = px.bar(error_category_counts, x='Method', color='Category', y='Count',
             title='Errors per method',
             labels={'Count': 'Number of errors'},
             template='plotly_white')
fig.update_layout(font_family="Serif")
fig.show()

In [None]:
# Export the current figure as PDF into the figure directory
pio.write_image(fig, figure_dir / 'fig3-errors-per-prediction-method.pdf')

## Check error categorization
A few errors were made by multiple methods. We categorized all errors per method, so there could be a discrepancy in how these were categorized. This section assesses this difference.

In [None]:
# Select the errors made by at least 2 methods:
# rows with a non-missing category in two or more of the method columns.
# .copy() gives an independent frame, so columns can be added to it later.
errors_at_least_2_methods = error_categories[error_categories.notna().sum(axis=1) >= 2].copy()
errors_at_least_2_methods.head()

In [None]:
# Share of the errors that were made by at least 2 methods
total_errors = errors.shape[0]
number_errors = errors_at_least_2_methods.shape[0]
percentage_shared = round((number_errors / total_errors) * 100, 2)

print(f'Total number of errors: {total_errors}')
print(f'Number of errors in at least 2 methods: {number_errors} ({percentage_shared}%)')

In [None]:
def check_categorization(row):
    """Check whether the different annotators agree on the category of common errors.

    `row` exposes the attributes `robbert`, `bilstm` and `rule_based`, each
    holding the assigned category or a missing value.

    Returns True when all non-missing categories are identical (also when
    fewer than two are present), False otherwise.
    """
    # pd.isna recognises every kind of missing value. The former identity test
    # against `np.NaN` only matched the NaN singleton, and that alias no longer
    # exists in NumPy 2.0.
    categories = {category
                  for category in (row.robbert, row.bilstm, row.rule_based)
                  if not pd.isna(category)}
    return len(categories) <= 1

# Flag per row whether all categories assigned to the error are identical
errors_at_least_2_methods['annotators_agree'] = errors_at_least_2_methods.apply(check_categorization, axis=1)

In [None]:
# Number and percentage of shared errors with identical / different categories
agreement_flags = errors_at_least_2_methods["annotators_agree"]

in_agreement = sum(agreement_flags)
percentage_in_agreement = round((in_agreement / number_errors) * 100, 2)
print(f'Number of errors for which annotators assigned same category: {in_agreement} ({percentage_in_agreement}%)')

not_in_agreement = errors_at_least_2_methods.shape[0] - sum(agreement_flags)
percentage_not_in_agreement = round((not_in_agreement / number_errors) * 100, 2)
print(f'Number of errors for which annotators assigned different category: {not_in_agreement} ({percentage_not_in_agreement}%)')

In [None]:
# Show the first rows for which the assigned categories differ
errors_at_least_2_methods[~errors_at_least_2_methods["annotators_agree"]].head()

In [None]:
def extract_error_category_combination(row):
    """Describe which distinct categories were assigned to one error.

    `row` is indexed by 'rule_based', 'bilstm' and 'robbert'; missing values
    are ignored.

    Returns the distinct categories joined by ' & ', in alphabetical order.
    """
    categories = {x for x in (row['rule_based'], row['bilstm'], row['robbert'])
                  if not pd.isna(x)}
    # Sort before joining: a set has no defined order, so the same combination
    # could otherwise get two labels ('A & B' and 'B & A') and be counted twice.
    return " & ".join(sorted(categories))

# Replace the category identifiers by their pretty names, then store per row
# the combination of categories that was assigned
errors_at_least_2_methods.replace(pretty_names_error_categories, inplace=True)
errors_at_least_2_methods['combinations'] = errors_at_least_2_methods.apply(extract_error_category_combination, axis=1)
errors_at_least_2_methods.head()

In [None]:
# Keep only the shared errors for which the assigned categories differ
different_errors_at_least_2_methods = errors_at_least_2_methods[~errors_at_least_2_methods.annotators_agree].copy()
different_errors_at_least_2_methods.head()

In [None]:
# Create a table of the number of occurrences per combination of assigned
# categories, for the errors on which the annotators disagree
combinations_counts = different_errors_at_least_2_methods.combinations.value_counts()

# Show head in DataFrame
combinations_counts.reset_index().head()

In [None]:
# Plot the counts of the 6 most frequent combinations
# Pass explicit, already-labelled columns to plotly. The implicit 'index' key
# only matches when the Series index has no name, which depends on the pandas
# version that produced `combinations_counts`. The axis label spelling
# ('occurrences') is corrected as well.
top_combinations = (
    combinations_counts.head(6)
    .rename_axis('Combination')
    .reset_index(name='Number of occurrences')
)
fig = px.bar(top_combinations, x='Combination', y='Number of occurrences',
             title='Combinations of labeled errors',
             template='plotly_white')
fig.update_layout(showlegend=False,
                  font_family="Serif"
)
fig.show()

In [None]:
# Export the current figure as PDF into the figure directory
pio.write_image(fig, figure_dir / 'fig4-errors-category-combinations.pdf')

In [None]:
# Cohen's kappa per pair of methods, computed on the errors that both methods
# of the pair made (rows with a missing category in either column are dropped)
kappa_pairs = [
    ("Cohen's Kappa score rule based & BiLSTM:  ", 'rule_based', 'bilstm'),
    ("Cohen's Kappa score rule based & RobBERT: ", 'rule_based', 'robbert'),
    ("Cohen's Kappa score BiLSTM & RobBERT:     ", 'bilstm', 'robbert'),
]

for kappa_label, first_method, second_method in kappa_pairs:
    paired = errors_at_least_2_methods[[first_method, second_method]].copy().dropna()
    kappa = cohen_kappa_score(paired[first_method], paired[second_method])
    print(f"{kappa_label}{round(kappa, 2)}")