In [1]:
from pathlib import Path
from evaluation_utils import get_document_text
import pandas as pd
import numpy as np
import json

dcc_dir = Path('data') / 'EMCDutchClinicalCorpus'
data_dir = Path('data')
result_dir = Path('results')
bilstm_result_file = result_dir / 'bilstm_predictions_cv.csv.gz'
robbert_result_file = result_dir / 'robbert_predictions.csv.gz'
merged_result_file = result_dir / 'merged_results.csv.gz'
annotation_file = data_dir / 'emc-dcc_ann.json'

# Load results
results = pd.read_csv(merged_result_file)
results.head()

Unnamed: 0,entity_id,category,label,bilstm,bilstm_cv,rule_based,robbert_512_2,robbert_128_2,robbert_32_2
0,DL1111_32_46,DL,not negated,not negated,not negated,not negated,not negated,not negated,not negated
1,DL1111_272_280,DL,not negated,not negated,not negated,not negated,not negated,not negated,
2,DL1111_363_377,DL,not negated,not negated,not negated,not negated,not negated,not negated,
3,DL1112_22_28,DL,negated,negated,negated,negated,negated,negated,negated
4,DL1113_59_67,DL,not negated,not negated,not negated,not negated,not negated,not negated,not negated


# RobBERT

In [2]:
results = pd.read_csv(robbert_result_file)
print(results.shape[0], results.isna().sum())
results.head()

12551 entity_id           0
category            0
label               0
bilstm              0
bilstm_cv           0
rule_based          0
robbert_512_2    1634
robbert_128_2    2212
robbert_32_2     6194
dtype: int64


Unnamed: 0,entity_id,category,label,bilstm,bilstm_cv,rule_based,robbert_512_2,robbert_128_2,robbert_32_2
0,DL1111_32_46,DL,not negated,not negated,not negated,not negated,not negated,not negated,not negated
1,DL1111_272_280,DL,not negated,not negated,not negated,not negated,not negated,not negated,
2,DL1111_363_377,DL,not negated,not negated,not negated,not negated,not negated,not negated,
3,DL1112_22_28,DL,negated,negated,negated,negated,negated,negated,negated
4,DL1113_59_67,DL,not negated,not negated,not negated,not negated,not negated,not negated,not negated


## Overall

In [3]:
results.category.unique()

array(['DL', 'GP', 'RD', 'SP'], dtype=object)

In [4]:
FN = results[(results.label=='negated') & (results.robbert_512_2=='not negated')].shape[0]
TN = results[(results.label=='not negated') & (results.robbert_512_2=='not negated')].shape[0]
FP = results[(results.label=='not negated') & (results.robbert_512_2=='negated')].shape[0]
TP = results[(results.label=='negated') & (results.robbert_512_2=='negated')].shape[0]

print("="*50)
print("Overall accuracy")
print("="*50)
print(f"\tTP:{TP} \tFP:{FP} \n\n\tFN:{FN} \t\tTN:{TN} \n")

ACC = (TP+TN)/(TP+TN+FP+FN)
SPEC = TN/(TN+FP)
SENS = TP/(TP+FN)
PPV = TP/(TP+FP)
NPV = TN/(TN+FN)
print("-"*80)
print(f"ACC:{round(ACC,3)},\tSENS:{round(SENS,3)},\tSPEC:{round(SPEC,3)},\tPPV:{round(PPV,3)},\tNPV:{round(NPV,3)}")
print("-"*80+"\n")

#################

FN = results[(results.category=='DL') & (results.label=='negated') & (results.robbert_512_2=='not negated')].shape[0]
TN = results[(results.category=='DL') & (results.label=='not negated') & (results.robbert_512_2=='not negated')].shape[0]
FP = results[(results.category=='DL') & (results.label=='not negated') & (results.robbert_512_2=='negated')].shape[0]
TP = results[(results.category=='DL') & (results.label=='negated') & (results.robbert_512_2=='negated')].shape[0]

print("="*50)
print("DL accuracy")
print("="*50)
print(f"\tTP:{TP} \t\tFP:{FP} \n\n\tFN:{FN} \t\tTN:{TN} \n")

ACC = (TP+TN)/(TP+TN+FP+FN)
SPEC = TN/(TN+FP)
SENS = TP/(TP+FN)
PPV = TP/(TP+FP)
NPV = TN/(TN+FN)
print("-"*80)
print(f"ACC:{round(ACC,3)},\tSENS:{round(SENS,3)},\tSPEC:{round(SPEC,3)},\tPPV:{round(PPV,3)},\tNPV:{round(NPV,3)}")
print("-"*80+"\n")

#################

FN = results[(results.category=='SP') & (results.label=='negated') & (results.robbert_512_2=='not negated')].shape[0]
TN = results[(results.category=='SP') & (results.label=='not negated') & (results.robbert_512_2=='not negated')].shape[0]
FP = results[(results.category=='SP') & (results.label=='not negated') & (results.robbert_512_2=='negated')].shape[0]
TP = results[(results.category=='SP') & (results.label=='negated') & (results.robbert_512_2=='negated')].shape[0]

print("="*50)
print("SP accuracy")
print("="*50)
print(f"\tTP:{TP} \t\tFP:{FP} \n\n\tFN:{FN} \t\tTN:{TN} \n")

ACC = (TP+TN)/(TP+TN+FP+FN)
SPEC = TN/(TN+FP)
SENS = TP/(TP+FN)
PPV = TP/(TP+FP)
NPV = TN/(TN+FN)
print("-"*80)
print(f"ACC:{round(ACC,3)},\tSENS:{round(SENS,3)},\tSPEC:{round(SPEC,3)},\tPPV:{round(PPV,3)},\tNPV:{round(NPV,3)}")
print("-"*80+"\n")

#################

FN = results[(results.category=='RD') & (results.label=='negated') & (results.robbert_512_2=='not negated')].shape[0]
TN = results[(results.category=='RD') & (results.label=='not negated') & (results.robbert_512_2=='not negated')].shape[0]
FP = results[(results.category=='RD') & (results.label=='not negated') & (results.robbert_512_2=='negated')].shape[0]
TP = results[(results.category=='RD') & (results.label=='negated') & (results.robbert_512_2=='negated')].shape[0]

print("="*50)
print("RD accuracy")
print("="*50)
print(f"\tTP:{TP} \t\tFP:{FP} \n\n\tFN:{FN} \t\tTN:{TN} \n")

ACC = (TP+TN)/(TP+TN+FP+FN)
SPEC = TN/(TN+FP)
SENS = TP/(TP+FN)
PPV = TP/(TP+FP)
NPV = TN/(TN+FN)
print("-"*80)
print(f"ACC:{round(ACC,3)},\tSENS:{round(SENS,3)},\tSPEC:{round(SPEC,3)},\tPPV:{round(PPV,3)},\tNPV:{round(NPV,3)}")
print("-"*80+"\n")

#################

FN = results[(results.category=='GP') & (results.label=='negated') & (results.robbert_512_2=='not negated')].shape[0]
TN = results[(results.category=='GP') & (results.label=='not negated') & (results.robbert_512_2=='not negated')].shape[0]
FP = results[(results.category=='GP') & (results.label=='not negated') & (results.robbert_512_2=='negated')].shape[0]
TP = results[(results.category=='GP') & (results.label=='negated') & (results.robbert_512_2=='negated')].shape[0]

print("="*50)
print("GP accuracy")
print("="*50)
print(f"\tTP:{TP} \t\tFP:{FP} \n\n\tFN:{FN} \t\tTN:{TN} \n")

ACC = (TP+TN)/(TP+TN+FP+FN)
SPEC = TN/(TN+FP)
SENS = TP/(TP+FN)
PPV = TP/(TP+FP)
V = TN/(TN+FN)

print("-"*80)
print(f"ACC:{round(ACC,3)},\tSENS:{round(SENS,3)},\tSPEC:{round(SPEC,3)},\tPPV:{round(PPV,3)},\tNPV:{round(NPV,3)}")
print("-"*80+"\n")

Overall accuracy
	TP:1534 	FP:77 

	FN:118 		TN:9188 

--------------------------------------------------------------------------------
ACC:0.982,	SENS:0.929,	SPEC:0.992,	PPV:0.952,	NPV:0.987
--------------------------------------------------------------------------------

DL accuracy
	TP:364 		FP:17 

	FN:11 		TN:2237 

--------------------------------------------------------------------------------
ACC:0.989,	SENS:0.971,	SPEC:0.992,	PPV:0.955,	NPV:0.995
--------------------------------------------------------------------------------

SP accuracy
	TP:322 		FP:22 

	FN:44 		TN:1913 

--------------------------------------------------------------------------------
ACC:0.971,	SENS:0.88,	SPEC:0.989,	PPV:0.936,	NPV:0.978
--------------------------------------------------------------------------------

RD accuracy
	TP:551 		FP:21 

	FN:24 		TN:2604 

--------------------------------------------------------------------------------
ACC:0.986,	SENS:0.958,	SPEC:0.992,	PPV:0.963,	NPV:0.991
-----

In [5]:
scores = []

FN = results[(results.label=='negated') & (results.robbert_512_2=='not negated')].shape[0]
TN = results[(results.label=='not negated') & (results.robbert_512_2=='not negated')].shape[0]
FP = results[(results.label=='not negated') & (results.robbert_512_2=='negated')].shape[0]
TP = results[(results.label=='negated') & (results.robbert_512_2=='negated')].shape[0]

ACC = (TP+TN)/(TP+TN+FP+FN)
SPEC = TN/(TN+FP)
SENS = TP/(TP+FN)
PPV = TP/(TP+FP)
NPV = TN/(TN+FN)

missing_perc = results.robbert_512_2.isna().sum()/results.shape[0]

scores.append({'blocksize': 512, 'acc': ACC, 'spec': SPEC, 'sens': SENS, 'ppv': PPV, 'npv': NPV, 'miss':missing_perc})

############################

FN = results[(results.label=='negated') & (results.robbert_128_2=='not negated')].shape[0]
TN = results[(results.label=='not negated') & (results.robbert_128_2=='not negated')].shape[0]
FP = results[(results.label=='not negated') & (results.robbert_128_2=='negated')].shape[0]
TP = results[(results.label=='negated') & (results.robbert_128_2=='negated')].shape[0]

ACC = (TP+TN)/(TP+TN+FP+FN)
SPEC = TN/(TN+FP)
SENS = TP/(TP+FN)
PPV = TP/(TP+FP)
NPV = TN/(TN+FN)

missing_perc = results.robbert_128_2.isna().sum()/results.shape[0]

scores.append({'blocksize': 128, 'acc': ACC, 'spec': SPEC, 'sens': SENS, 'ppv': PPV, 'npv': NPV, 'miss':missing_perc})

############################

FN = results[(results.label=='negated') & (results.robbert_32_2=='not negated')].shape[0]
TN = results[(results.label=='not negated') & (results.robbert_32_2=='not negated')].shape[0]
FP = results[(results.label=='not negated') & (results.robbert_32_2=='negated')].shape[0]
TP = results[(results.label=='negated') & (results.robbert_32_2=='negated')].shape[0]

ACC = (TP+TN)/(TP+TN+FP+FN)
SPEC = TN/(TN+FP)
SENS = TP/(TP+FN)
PPV = TP/(TP+FP)
NPV = TN/(TN+FN)

missing_perc = results.robbert_32_2.isna().sum()/results.shape[0]

scores.append({'blocksize': 32, 'acc': ACC, 'spec': SPEC, 'sens': SENS, 'ppv': PPV, 'npv': NPV, 'miss':missing_perc})

scores_df = pd.DataFrame(scores)

In [6]:
scores_df

Unnamed: 0,blocksize,acc,spec,sens,ppv,npv,miss
0,512,0.982138,0.991689,0.928571,0.952204,0.98732,0.130189
1,128,0.981913,0.991897,0.926443,0.953655,0.986829,0.176241
2,32,0.982382,0.992808,0.921842,0.956667,0.986623,0.493506


Going from a block size of $512$ to a block-size of $32$ does not lead to a noticeable performance decrease, it does decrease the computation time by a factor of $10$. We have to note here that due to the current batch process we skip label tokens that are outside the block size. 

## False negatives

In [7]:
false_negatives = results[(results.label == 'negated') & (results.robbert_512_2== 'not negated')]
false_negatives.head()
print(false_negatives.shape[0])

false_positives = results[(results.label == 'not negated') & (results.robbert_512_2== 'negated')]
false_positives.head()
print(false_positives.shape[0])

118
77


In [8]:
# Show issue with a random record
random_entity = false_negatives.entity_id.tolist()[115]
text, start, end = get_document_text(random_entity, dcc_dir, results)

Entity: zwelling (8-16)

         entity_id category    label   bilstm    bilstm_cv   rule_based  \
12100  SP1962_8_16       SP  negated  negated  not negated  not negated   

      robbert_512_2 robbert_128_2 robbert_32_2  
12100   not negated   not negated  not negated  


In [9]:
# Show issue with a random record
random_entity = false_positives.entity_id.tolist()[69]
text, start, end = get_document_text(random_entity, dcc_dir, results)

Entity: ontsteking (22-32)

          entity_id category        label   bilstm    bilstm_cv rule_based  \
11681  SP1789_22_32       SP  not negated  negated  not negated    negated   

      robbert_512_2 robbert_128_2 robbert_32_2  
11681       negated       negated  not negated  


In [10]:
error_types = {'false_negative': {'uncommon': [0,5,8,10,12,22,36,47,51,53,63,66,70,77,
                                               78,79,80,84,85,88,89,90,96,97,99,103,104,
                                               107,112,113,114], 
                                  'annotation_error': [1,19,21,25,30,31,41,62,68,72,75,76,83,87,98,117],
                                  'uncertainty': [2,3,4,6,7,37,46,56,57,60,61,62,67,71,109],
                                  'long_distance': [9,40,50,52],
                                  'minus': [11,13,14,15,16,17,18,20,23,24,26,
                                            27,28,29,32,33,34,35,38,39,42,43,
                                            44,45,48,49,81,115],
                                  'other': [47,55,58,59,64,74,91,92,95,105,106,108,116],
                                  'list': [69,73,102],
                                  'sentence_structure': [82,86,93,94,95,96,101,110,111],
                                  'punctuation': []
                                 },
              'false_positive': {'annotation_error': [0,1,2,8,10,13,14,39,42,
                                                      43,49,53,54,55,57,59,71,
                                                      74,75],
                                 'negation_of_different_term': [3,4,5,6,9,12,15,18,26,27,
                                                                28,29,36,37,38,40,48,50,
                                                                58,65,70],
                                 'uncertainty': [7,11,16,41,44,52,68],
                                 'grammar': [],
                                 'punctuation': [30,23,32,21,20],
                                 'other': [17,19,24,25,35,46,47,51,60,61,64,
                                           72,73, 22,31,33,34,45,62,63,66,76],
                                 'list': [],                                                 
                                 'hyphen': [56,67,69]
              }}

In [11]:
tmp = pd.DataFrame(error_types)
tmp['false_negative'] = tmp['false_negative'].apply(lambda x: x if isinstance(x,list) else [])
tmp['false_positive'] = tmp['false_positive'].apply(lambda x: x if isinstance(x,list) else [])
tmp['fn_count'], tmp['fp_count'] = zip(*tmp.apply(lambda x: (len(x[0]), len(x[1])), axis=1))
tmp[['fn_perc','fp_perc']] = tmp[['fn_count', 'fp_count']]/tmp[['fn_count', 'fp_count']].sum(axis=0)

In [12]:
tmp[['fn_count', 'fp_count', 'fn_perc', 'fp_perc']].round(4)

Unnamed: 0,fn_count,fp_count,fn_perc,fp_perc
uncommon,31,0,0.2605,0.0
annotation_error,16,19,0.1345,0.2468
uncertainty,15,7,0.1261,0.0909
long_distance,4,0,0.0336,0.0
minus,28,0,0.2353,0.0
other,13,22,0.1092,0.2857
list,3,0,0.0252,0.0
sentence_structure,9,0,0.0756,0.0
punctuation,0,5,0.0,0.0649
negation_of_different_term,0,21,0.0,0.2727


In [13]:
false_negative_map = {_v:k for k,v in error_types['false_negative'].items() for _v in v}
false_positive_map = {_v:k for k,v in error_types['false_positive'].items() for _v in v}

In [14]:
false_negatives.reset_index(inplace=True, drop=True)
false_negatives.loc[:, 'error_type'] = "UNDEFINED"
false_negatives.loc[:, 'error_type'] = false_negatives.index.map(false_negative_map)

false_positives.reset_index(inplace=True, drop=True)
false_positives.loc[:, 'error_type'] = "UNDEFINED"
false_positives.loc[:, 'error_type'] = false_positives.index.map(false_positive_map)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  false_negatives.loc[:, 'error_type'] = "UNDEFINED"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  false_negatives.loc[:, 'error_type'] = false_negatives.index.map(false_negative_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  false_positives.loc[:, 'error_type'] = "UNDEFINED"
A value is trying 

In [16]:
args = [dcc_dir, results]
kwargs = {'print_html': False, 'print_text': False, 'obfuscate_entity': True}

false_positives['pattern'] = false_positives['entity_id'].apply(lambda x: get_document_text(x, *args, **kwargs)[0])
false_negatives['pattern'] = false_negatives['entity_id'].apply(lambda x: get_document_text(x, *args, **kwargs)[0])

false_positives[['entity_id', 'error_type', 'pattern']].to_csv("results/false-positives_robbert.csv", index=False)
false_negatives[['entity_id', 'error_type', 'pattern']].to_csv("results/false-negatives_robbert.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  false_positives['pattern'] = false_positives['entity_id'].apply(lambda x: get_document_text(x, *args, **kwargs)[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  false_negatives['pattern'] = false_negatives['entity_id'].apply(lambda x: get_document_text(x, *args, **kwargs)[0])


# biLSTM

## False negatives biLSTM

In [None]:
# Select false negatives
false_negatives = results[(results.label == 'negated') & (results.bilstm == 'not negated')]
false_negatives.head()

In [None]:
# Show issue with a random record
random_entity = false_negatives.sample(1).entity_id.tolist()[0]
text, start, end = get_document_text(random_entity, dcc_dir, results)

In [None]:
# Count number of false negatives caused by -
count = 0
for index, record in false_negatives.iterrows():
    text, start, end = get_document_text(record.entity_id, dcc_dir, Path('data'), print_text=False)
    if text[end:end+1] == '-':
        count += 1
print(f'{count} of {false_negatives.shape[0]} ({round((count / false_negatives.shape[0]) * 100)}%) false negatives caused by negation described as "-"')

In [None]:
# Show text for all errors in BiLSTM cross-validation
model_pred = pd.read_csv(bilstm_result_file)

# Load annotated data
with open(annotation_file) as f:
    annotations = json.load(f)
result = []
for document in annotations['projects'][0]['documents']:
    document_name = document['name']
    text = document['text']

    for annotation in document['annotations']:

        # Extract data
        start_char = annotation['start']
        end_char = annotation['end']
        negation_value = annotation['meta_anns']['Negation']['value']

        # Create custom ID
        entity_id = f'{document_name}_{start_char}_{end_char}'
        result.append([entity_id, negation_value])
ann_labels = pd.DataFrame(result, columns=['entity_id', 'label'])  
cmp_labels = pd.merge(left=ann_labels, right = model_pred, left_on='entity_id', right_on='entity_id')
all_errors = cmp_labels[cmp_labels.label != cmp_labels.bilstm_cv]

for error_id, series in all_errors.iterrows():
    get_document_text(cmp_labels["entity_id"].iloc[error_id], dcc_dir, cmp_labels)

# Rule based

## False negatives


In [None]:
FNs_rule = results[(results.label == 'negated') & (results.rule_based == 'not negated')]['entity_id']
len(FNs_rule)

In [None]:
g_FNs_rule = iter(FNs_rule.to_list())

In [None]:
# iterate through one doc at a time
get_document_text(next(g_FNs_rule), dcc_dir, results);

### Observations:
1. Most frequent false negatives are a "list of negations", e.g.:
    - "`ENT`-" or "`ENT` -" or "`ENT`:-" (58 cases)
    - "`ENT`: nee" or "`ENT`: geen" or "`ENT`: negatief" (7 cases)
2. 2nd most frequent are actually not false negatives, but labeling errors (22 cases)
    - negation is labelled as part of the entity, e.g. "geen bijwerking" is the entity (GP1665_79_94, GP1681_0_11, GP2567_126_137)
    - no negation actually present (GP1558_64_70, GP2796_56_63, GP2967_0_6, RD1951_465_472, SP1164_179_186)
    - entity is a (sub)heading of report; not actually present (e.g. SP1188_26_31; only occurs in SP)
3. A negation trigger is missing that could easily be added
    - "neg" / "negatief" (13 cases)
    - "pleit tegen" (4 cases)
    - "niet voorafgegaan" (8 cases)
    - words like "niet" en "geen" that occur directly next to `ENT`, so have a scope of 1; probably too many false positives with broader scope (12 cases)

### Categories

In [None]:
# Load entity_id, snippet containing error, error category, and (absence of) trigger involved
errors_FN = pd.read_csv(result_dir / 'false-negatives_rule-based.csv', sep = ';')
errors_FN.head()

In [None]:
def print_category_counts(df, err_type):
    cnt = df['category'].value_counts()
    tot = df.shape[0]
    print(f'{err_type} ({tot} total)')
    print(pd.concat([cnt.rename('count'),
               (cnt / tot * 100).rename('percentage')],
              axis=1))

In [None]:
print_category_counts(errors_FN, 'False negatives')

In [None]:
def print_example_error(df, category):
    smpl = df[df['category'] == category].sample()
    get_document_text(smpl.entity_id.values[0], dcc_dir, results.iloc[:,0:3]);
    print(smpl)

In [None]:
print_example_error(errors_FN, 'sentence splitting')

## False positives


In [None]:
FPs_rule = results[(results.label == 'not negated') & (results.rule_based == 'negated')]['entity_id']
len(FPs_rule)

In [None]:
g_FPs_rule = iter(FPs_rule.to_list())

In [None]:
# iterate through one doc at a time
get_document_text(next(g_FPs_rule), dcc_dir, results);

### Observations:
- "wel" should be a termination trigger
- triggers like "geen" and "niet" should have a reduced scope (maybe even just 1?)
    - might also help to add punctuation like `,` and `;` as termination triggers

### Categories

In [None]:
# Load entity_id, snippet containing error, error category, and (absence of) trigger involved
errors_FP = pd.read_csv(result_dir / 'false-positives_rule-based.csv', sep = ';')
errors_FP.head()

In [None]:
print_category_counts(errors_FP, 'False positives')

In [None]:
print_example_error(errors_FP, 'missing termination trigger')

# Ensemble

In [None]:
results[['label', 'bilstm', 'bilstm_cv', 'rule_based', 
         'robbert_512_2', 'robbert_128_2', 'robbert_32_2']]=\
results[['label', 'bilstm', 'bilstm_cv', 'rule_based', 
         'robbert_512_2', 'robbert_128_2', 'robbert_32_2']].apply(lambda x: x.map({'not negated':0,
                                                                                   'negated': 1}), 
                                                                  axis=1)
results.dropna(subset=['robbert_512_2'], inplace=True)
results.drop(['bilstm', 'robbert_128_2', 'robbert_32_2'], axis=1, inplace=True)

In [None]:
'''
ACC = (TP+TN)/(TP+TN+FP+FN)
SPEC = TN/(TN+FP)
SENS = TP/(TP+FN)
PPV = TP/(TP+FP)
NPV = TN/(TN+FN)
'''

results['ensemble_mv'] = results[['bilstm_cv', 'rule_based', 'robbert_512_2']].\
                            apply(lambda x: round(np.mean(x)),axis=1)

In [None]:
def confusion_matrix(ys,est='ensemble_mv'):
    y_est = ys[est]
    y_true = ys['label']
    TP = sum((y_est==y_true) & (y_true==1))
    TN = sum((y_est==y_true) & (y_true==0))
    FP = sum((y_est!=y_true) & (y_true==0))
    FN = sum((y_est!=y_true) & (y_true==1))
    
    ACC = (TP+TN)/(TP+TN+FP+FN)
    SPEC = TN/(TN+FP)
    SENS = TP/(TP+FN)
    PPV = TP/(TP+FP)
    NPV = TN/(TN+FN)

    return round(ACC,3),round(SPEC,3),round(SENS,3),round(PPV,3),round(NPV,3)

results[['category', 'label', 'ensemble_mv']].groupby('category').apply(confusion_matrix)

In [None]:
results[['category', 'label', 'bilstm_cv']].groupby('category').apply(lambda x: confusion_matrix(x,est='bilstm_cv'))

In [None]:
results[['category', 'label', 'rule_based']].groupby('category').apply(lambda x: confusion_matrix(x,est='rule_based'))

In [None]:
results[['category', 'label', 'robbert_512_2']].groupby('category').apply(lambda x: confusion_matrix(x,est='robbert_512_2'))