### This file compares the approaches for predicting appropriateness (table 4 in the paper)

In [None]:
import json
import numpy as np
import pandas as pd

from scipy.stats import wilcoxon
from scipy.stats import ttest_ind

In [None]:
# Names of the dimensions that appear in the per-dimension result keys,
# in the order the keys are listed in RESULTDIMS.
_DIMENSIONS = [
    'Inappropriateness',
    'Missing Intelligibility',
    'Confusing Reasoning',
    'Unclear Meaning',
    'Missing Relevance',
    'Missing Commitment',
    'Missing Openness',
    'Missing Seriousness',
    'Excessive Intensity',
    'Emotional Deception',
    'Toxic Emotions',
    'Reason Unclassified',
    'Detrimental Orthography',
    'Other Reasons',
]

# Metric suffixes used for every single dimension.
_METRICS = ['macroF1', 'precision', 'recall']

# Keys looked up in each test_results.json: three metrics per dimension,
# followed by the aggregate keys (note: 'F1', not 'macroF1', for the mean).
RESULTDIMS = [
    'eval_{}_{}'.format(dimension, metric)
    for dimension in _DIMENSIONS
    for metric in _METRICS
] + [
    'eval_mean_F1',
    'eval_mean_precision',
    'eval_mean_recall',
]

In [None]:
# Directory prefix (relative, with trailing slash) that the result paths
# below are concatenated onto.
model_dir = "../../data/models/"

In [None]:
# Sub-directory names under model_dir, one per compared approach.
# The three RoBERTa variants share a common name prefix.
_ROBERTA = 'multilabel-roberta-baseline'

approaches = [
    'human-upperbound',
    'majority-baseline',
    _ROBERTA,
    _ROBERTA + '-shuffle',
    _ROBERTA + '-wo-issue',
    'random-baseline',
]

In [None]:
### Average all the folds
# For every approach, load the test results of all 5 repeats x 5 folds
# (25 runs) and store the per-metric mean over those runs in test_dict.
# test_dict maps each key in RESULTDIMS to a list with one averaged value
# per approach, in the same order as `approaches`.
test_dict = {x: [] for x in RESULTDIMS}
for approach in approaches:
    tmp_results = []
    for repeat in range(5):
        for fold in range(5):
            path = model_dir + approach + '/fold{}.{}/test_results.json'.format(repeat, fold)
            with open(path, 'r') as f:
                tmp_results.append(json.load(f))
    # Average (np.mean) each reported metric across runs; the table built
    # from this is meant to show scores, not their spread. Keys are taken
    # from the first run, so all runs are assumed to share the same keys.
    d = {
        metric: np.mean([result[metric] for result in tmp_results])
        for metric in tmp_results[0]
    }
    for dim in RESULTDIMS:
        test_dict[dim].append(d[dim])

In [None]:
# Attach the approach names as an extra column so each row can be identified.
test_dict.update(approach=approaches)

In [None]:
# One row per approach, one column per entry of test_dict.
df = pd.DataFrame(test_dict)

In [None]:
### Print F1-scores (table 4 in the paper)
# Columns to show: the approach name, the mean F1, then the per-dimension
# macro-F1 scores in the order used for the table.
f1_columns = ['approach', 'eval_mean_F1'] + [
    'eval_{}_macroF1'.format(dimension)
    for dimension in (
        'Inappropriateness',
        'Toxic Emotions',
        'Excessive Intensity',
        'Emotional Deception',
        'Missing Commitment',
        'Missing Seriousness',
        'Missing Openness',
        'Missing Intelligibility',
        'Unclear Meaning',
        'Missing Relevance',
        'Confusing Reasoning',
        'Other Reasons',
        'Detrimental Orthography',
        'Reason Unclassified',
    )
]
# Best mean F1 first, values rounded to four decimals; kept as the last
# expression so the notebook renders the resulting frame.
df[f1_columns].sort_values(by='eval_mean_F1', ascending=False).round(4)

In [None]:
# Rebuild test_dict, this time keeping the individual score of every run
# (5 repeats x 5 folds) instead of an aggregate: each key in RESULTDIMS maps
# to a list with one list of per-run values per approach.
test_dict = {dim: [] for dim in RESULTDIMS}
for approach in approaches:
    result_paths = [
        model_dir + approach + '/fold{}.{}/test_results.json'.format(repeat, fold)
        for repeat in range(5)
        for fold in range(5)
    ]
    per_run = []
    for result_path in result_paths:
        with open(result_path, 'r') as handle:
            per_run.append(json.load(handle))
    for dim in RESULTDIMS:
        scores = [run[dim] for run in per_run]
        test_dict[dim].append(scores)

In [None]:
### Check significance of all approaches
# For every F1 metric, run a paired Wilcoxon signed-rank test between each
# unordered pair of approaches on their per-run scores (test_dict[dim][i] is
# the list of scores of approaches[i]) and print whether the difference is
# significant at the ALPHA level.
ALPHA = 0.05  # significance threshold for the p-value
for dim in RESULTDIMS:
    if 'F1' not in dim:
        continue
    for i, approach1 in enumerate(approaches):
        for j, approach2 in enumerate(approaches):
            # i < j visits each pair once and skips self-comparisons.
            if i < j:
                w, p = wilcoxon(test_dict[dim][i], test_dict[dim][j], mode='exact')
                print((dim, approach1, approach2))
                print(p <= ALPHA)