### This file combines the annotations from our study to create the final corpus used for the automated prediction:

In [None]:
import pandas as pd
import krippendorff
import seaborn as sn
import matplotlib.pyplot as plt
import numpy as np
import ast
import math
from collections import Counter
from itertools import combinations
from statsmodels.stats import inter_rater as irr

In [None]:
# Show full cell contents when printing data frames (no column-width truncation).
pd.options.display.max_colwidth = None

In [None]:
# Root of the data folder, given relative to the working directory.
data_dir = '../../data/'

# Study annotations. The "result" column holds the string form of a Python
# literal, which is parsed back into an object here.
annotations_df = pd.read_csv(data_dir + 'annotations/study.annotation.csv')
annotations_df["result"] = annotations_df["result"].map(ast.literal_eval)

# Annotated posts together with their 'types' value.
dataset_df = pd.read_csv(data_dir + 'appropriateness-corpus/annotation_dataset_types.csv')

# Numeric code for each possible answer to the appropriateness question.
label2type = {
    "not": 1,
    "partial": 2,
    "fully": 3,
}
# Lookup table: post id -> 'types' value of that post.
id_to_type = dict(zip(dataset_df['id'].tolist(), dataset_df['types'].tolist()))

In [None]:
# Attach to every annotation the 'types' value of the post it refers to
# (raises KeyError for a post id that is missing from id_to_type).
annotations_df["types"] = annotations_df["post_id"].map(lambda post_id: id_to_type[post_id])

In [None]:
### Binarize the annotations
def process_results(x, label, sub):
    """Return 1 if the annotation result ``x`` carries ``label``, else 0.

    Parameters:
        x: dict with the answers of one annotation. It must contain the key
            "appropriatenessQuestion" whose value is a key of ``label2type``.
        label: name of the (sub-)category to look for.
        sub: False to read the top-level yes/no answer stored under
            ``label + "Question"``; True to look for a sub-category.

    Returns:
        0 whenever the post was rated with appropriateness type 3. Otherwise:
        - ``sub`` is False: 1 if ``x[label + "Question"] == 'yes'``, else 0.
        - ``sub`` is True and ``label != 'other'``: 1 if ``label`` occurs
          among the values of ``x``, else 0.
        - ``sub`` is True and ``label == 'other'``: 1 if ``'other'`` is a key
          of ``x``, else 0.

    Raises:
        KeyError: if "appropriatenessQuestion" is missing or unknown, or if
            ``sub`` is False and ``label + "Question"`` is missing.
    """
    # Type 3 posts never receive any label.
    if label2type[x["appropriatenessQuestion"]] == 3:
        return 0

    if not sub:
        return 1 if x[label + "Question"] == 'yes' else 0

    # NOTE(review): the previous code read
    #     1 if <label present> else 0 if <parent question == 'yes'> else 0
    # which Python groups as `1 if A else (0 if B else 0)`. The parent-question
    # test could therefore never change the result (it could only raise
    # KeyError when the parent key was absent). The dead lookup is removed
    # here; the returned labels are unchanged. If sub-labels were meant to be
    # gated on the parent answer, that has to be added explicitly.
    if label != 'other':
        return 1 if label in x.values() else 0
    return 1 if label in x else 0

In [None]:
# Numeric appropriateness rating of every annotation.
annotations_df["Appropriate (1-3 or ?)"] = annotations_df["result"].apply(
    lambda x: label2type[x["appropriatenessQuestion"]]
)

# Column to create -> (label, sub) arguments handed to process_results.
# Note that column "4.2" looks up the label 'other' with sub=True.
binary_column_args = {
    "1": ('emotion', False),
    "1.1": ('emotion1', True),
    "1.2": ('emotion2', True),
    "2": ('commitment', False),
    "2.1": ('commitment1', True),
    "2.2": ('commitment2', True),
    "3": ('confusion', False),
    "3.1": ('confusion1', True),
    "3.2": ('confusion2', True),
    "3.3": ('confusion3', True),
    "4": ('other', False),
    "4.1": ('other1', True),
    "4.2": ('other', True),
}
for column, (label, sub) in binary_column_args.items():
    annotations_df[column] = annotations_df["result"].apply(process_results, args=(label, sub))

In [None]:
### Map annotator ids from the interface to annotators and batches
# One row per batch (batch0 ... batch13); each row lists the interface ids that
# belong to annotator 6, annotator 8 and annotator 7, in that order.
_interface_ids_per_batch = [
    ('6', '8', '7'),     # batch0
    ('10', '9', '11'),   # batch1
    ('13', '12', '14'),  # batch2
    ('19', '17', '15'),  # batch3
    ('21', '25', '18'),  # batch4
    ('24', '27', '22'),  # batch5
    ('28', '30', '23'),  # batch6
    ('31', '33', '26'),  # batch7
    ('35', '34', '29'),  # batch8
    ('37', '39', '32'),  # batch9
    ('44', '40', '36'),  # batch10
    ('46', '43', '38'),  # batch11
    ('48', '45', '41'),  # batch12
    ('49', '47', '42'),  # batch13
]
user_dict = {
    interface_id: annotator
    for batch_ids in _interface_ids_per_batch
    for interface_id, annotator in zip(batch_ids, (6, 8, 7))
}

# Replace interface ids by annotator ids; ids not listed above become -1.
annotations_df['user_id'] = annotations_df['user_id'].map(
    lambda raw_id: user_dict.get(str(raw_id), -1)
)

In [None]:
# One data frame per annotator (6, 7, 8), each sorted by post id.
user_dfs = []
for user in (6, 7, 8):
    is_user = annotations_df["user_id"] == user
    tmp_df = annotations_df.loc[is_user].sort_values(by="post_id")
    # Ratings 1 and 2 are mapped to 1, every other rating to 0.
    tmp_df['Appropriate (binary)'] = tmp_df['Appropriate (1-3 or ?)'].map(
        lambda rating: int(rating in (1, 2))
    )
    user_dfs.append(tmp_df)

In [None]:
### Create a proper sentence from the issue
def process_issue(x):
    """Turn a hyphen-separated issue string into a readable sentence.

    Hyphens are replaced by spaces, surrounding whitespace is stripped and the
    text is passed through ``str.capitalize`` (first character upper-cased,
    all remaining characters lower-cased). A colon is appended unless the text
    already ends with one of '.', '!', '?' or ':'.

    Parameters:
        x: the raw issue string.

    Returns:
        The cleaned issue string. An input that is empty after stripping is
        returned as an empty string.
    """
    x = x.replace('-', ' ').strip().capitalize()
    # The previous check compared the last character against a *list*
    # (`x[-1] != [...]`), which is always true, so ':' was appended even after
    # existing end punctuation; it also failed with IndexError on empty text.
    if x and not x.endswith(('.', '!', '?', ':')):
        x = x + ':'
    return x

In [None]:
### Create full annotator agreement corpus
# Corpus column -> per-annotator column that holds the binary label.
label_columns = {
    'Inappropriateness': 'Appropriate (binary)',
    'Toxic Emotions': '1',
    'Excessive Intensity': '1.1',
    'Emotional Deception': '1.2',
    'Missing Commitment': '2',
    'Missing Seriousness': '2.1',
    'Missing Openness': '2.2',
    'Missing Intelligibility': '3',
    'Unclear Meaning': '3.1',
    'Missing Relevance': '3.2',
    'Confusing Reasoning': '3.3',
    'Other Reasons': '4',
    'Detrimental Orthography': '4.1',
    'Reason Unclassified': '4.2',
}

# Post metadata is read from the first annotator's frame.
# NOTE(review): assumes all annotators labelled the same posts, so the
# post_id-sorted frames line up row by row -- confirm.
data = {
    'post_id': user_dfs[0]['post_id'].values,
    'source_dataset': user_dfs[0]['types'].values,
    'issue': [process_issue(issue) for issue in user_dfs[0]['issue'].values],
    'post_text': user_dfs[0]['post_text'].values,
}
for corpus_column, annotator_column in label_columns.items():
    # Transpose so that each inner list holds the labels of all annotators for one row.
    votes_per_row = np.array([user_df[annotator_column].tolist() for user_df in user_dfs]).T.tolist()
    # Keep the label only if all annotators agree; 4 marks a disagreement.
    data[corpus_column] = [votes[0] if len(set(votes)) == 1 else 4 for votes in votes_per_row]

dataset_df = pd.DataFrame(data=data)
dataset_df.to_csv(data_dir+'appropriateness-corpus/appropriateness_corpus_full.csv', index=False)

In [None]:
### Create majority annotator agreement corpus
# Corpus column -> per-annotator column that holds the binary label.
label_columns = {
    'Inappropriateness': 'Appropriate (binary)',
    'Toxic Emotions': '1',
    'Excessive Intensity': '1.1',
    'Emotional Deception': '1.2',
    'Missing Commitment': '2',
    'Missing Seriousness': '2.1',
    'Missing Openness': '2.2',
    'Missing Intelligibility': '3',
    'Unclear Meaning': '3.1',
    'Missing Relevance': '3.2',
    'Confusing Reasoning': '3.3',
    'Other Reasons': '4',
    'Detrimental Orthography': '4.1',
    'Reason Unclassified': '4.2',
}

# Post metadata is read from the first annotator's frame.
# NOTE(review): assumes all annotators labelled the same posts, so the
# post_id-sorted frames line up row by row -- confirm.
data = {
    'post_id': user_dfs[0]['post_id'].values,
    'source_dataset': user_dfs[0]['types'].values,
    'issue': [process_issue(issue) for issue in user_dfs[0]['issue'].values],
    'post_text': user_dfs[0]['post_text'].values,
}
for corpus_column, annotator_column in label_columns.items():
    # Transpose so that each inner list holds the labels of all annotators for one row.
    votes_per_row = np.array([user_df[annotator_column].tolist() for user_df in user_dfs]).T.tolist()
    # Most frequent label per row. 'Missing Seriousness' and 'Missing Openness'
    # used to be written as booleans (`... == 1`); they are now 0/1 integers
    # like every other label column.
    data[corpus_column] = [max(set(votes), key=votes.count) for votes in votes_per_row]

dataset_df = pd.DataFrame(data=data)
dataset_df.to_csv(data_dir+'appropriateness-corpus/appropriateness_corpus_majority.csv', index=False)

In [None]:
### Create conservative annotator agreement corpus (used in the paper)
# Corpus column -> per-annotator column that holds the binary label.
label_columns = {
    'Inappropriateness': 'Appropriate (binary)',
    'Toxic Emotions': '1',
    'Excessive Intensity': '1.1',
    'Emotional Deception': '1.2',
    'Missing Commitment': '2',
    'Missing Seriousness': '2.1',
    'Missing Openness': '2.2',
    'Missing Intelligibility': '3',
    'Unclear Meaning': '3.1',
    'Missing Relevance': '3.2',
    'Confusing Reasoning': '3.3',
    'Other Reasons': '4',
    'Detrimental Orthography': '4.1',
    'Reason Unclassified': '4.2',
}

# Post metadata is read from the first annotator's frame.
# NOTE(review): assumes all annotators labelled the same posts, so the
# post_id-sorted frames line up row by row -- confirm.
data = {
    'post_id': user_dfs[0]['post_id'].values,
    'source_dataset': user_dfs[0]['types'].values,
    'issue': [process_issue(issue) for issue in user_dfs[0]['issue'].values],
    'post_text': user_dfs[0]['post_text'].values,
}
for corpus_column, annotator_column in label_columns.items():
    # Transpose so that each inner list holds the labels of all annotators for one row.
    votes_per_row = np.array([user_df[annotator_column].tolist() for user_df in user_dfs]).T.tolist()
    # A label is set as soon as one annotator assigned it (maximum of the votes).
    # 'Missing Seriousness' and 'Missing Openness' used to be written as
    # booleans (`max(x) == 1`); they are now 0/1 integers like every other
    # label column.
    data[corpus_column] = [max(votes) for votes in votes_per_row]

dataset_df = pd.DataFrame(data=data)
dataset_df.to_csv(data_dir+'appropriateness-corpus/appropriateness_corpus_conservative.csv', index=False)

In [None]:
### Create liberal annotator agreement corpus
# Corpus column -> per-annotator column that holds the binary label.
label_columns = {
    'Inappropriateness': 'Appropriate (binary)',
    'Toxic Emotions': '1',
    'Excessive Intensity': '1.1',
    'Emotional Deception': '1.2',
    'Missing Commitment': '2',
    'Missing Seriousness': '2.1',
    'Missing Openness': '2.2',
    'Missing Intelligibility': '3',
    'Unclear Meaning': '3.1',
    'Missing Relevance': '3.2',
    'Confusing Reasoning': '3.3',
    'Other Reasons': '4',
    'Detrimental Orthography': '4.1',
    'Reason Unclassified': '4.2',
}

# Post metadata is read from the first annotator's frame.
# NOTE(review): assumes all annotators labelled the same posts, so the
# post_id-sorted frames line up row by row -- confirm.
data = {
    'post_id': user_dfs[0]['post_id'].values,
    'source_dataset': user_dfs[0]['types'].values,
    'issue': [process_issue(issue) for issue in user_dfs[0]['issue'].values],
    'post_text': user_dfs[0]['post_text'].values,
}
for corpus_column, annotator_column in label_columns.items():
    # Transpose so that each inner list holds the labels of all annotators for one row.
    votes_per_row = np.array([user_df[annotator_column].tolist() for user_df in user_dfs]).T.tolist()
    # A label is set only if every annotator assigned it (minimum of the votes).
    # 'Missing Seriousness' and 'Missing Openness' used to be written as
    # booleans (`min(x) == 1`); they are now 0/1 integers like every other
    # label column.
    data[corpus_column] = [min(votes) for votes in votes_per_row]

dataset_df = pd.DataFrame(data=data)
dataset_df.to_csv(data_dir+'appropriateness-corpus/appropriateness_corpus_liberal.csv', index=False)

In [None]:
### Create mean annotator agreement corpus
# Corpus column -> per-annotator column that holds the binary label.
label_columns = {
    'Inappropriateness': 'Appropriate (binary)',
    'Toxic Emotions': '1',
    'Excessive Intensity': '1.1',
    'Emotional Deception': '1.2',
    'Missing Commitment': '2',
    'Missing Seriousness': '2.1',
    'Missing Openness': '2.2',
    'Missing Intelligibility': '3',
    'Unclear Meaning': '3.1',
    'Missing Relevance': '3.2',
    'Confusing Reasoning': '3.3',
    'Other Reasons': '4',
    'Detrimental Orthography': '4.1',
    'Reason Unclassified': '4.2',
}

# Post metadata is read from the first annotator's frame.
# NOTE(review): assumes all annotators labelled the same posts, so the
# post_id-sorted frames line up row by row -- confirm.
data = {
    'post_id': user_dfs[0]['post_id'].values,
    'source_dataset': user_dfs[0]['types'].values,
    'issue': [process_issue(issue) for issue in user_dfs[0]['issue'].values],
    'post_text': user_dfs[0]['post_text'].values,
}
for corpus_column, annotator_column in label_columns.items():
    # Transpose so that each inner list holds the labels of all annotators for one row.
    votes_per_row = np.array([user_df[annotator_column].tolist() for user_df in user_dfs]).T.tolist()
    # Average of the annotators' binary labels.
    data[corpus_column] = [np.mean(votes) for votes in votes_per_row]

dataset_df = pd.DataFrame(data=data)
dataset_df.to_csv(data_dir+'appropriateness-corpus/appropriateness_corpus_mean.csv', index=False)

In [None]:
### Create predictions of each annotator in isolation (used for the human upperbound in the paper)
# Corpus column -> per-annotator column that holds the binary label.
label_columns = {
    'Inappropriateness': 'Appropriate (binary)',
    'Toxic Emotions': '1',
    'Excessive Intensity': '1.1',
    'Emotional Deception': '1.2',
    'Missing Commitment': '2',
    'Missing Seriousness': '2.1',
    'Missing Openness': '2.2',
    'Missing Intelligibility': '3',
    'Unclear Meaning': '3.1',
    'Missing Relevance': '3.2',
    'Confusing Reasoning': '3.3',
    'Other Reasons': '4',
    'Detrimental Orthography': '4.1',
    'Reason Unclassified': '4.2',
}

# `i` is the position of the annotator's frame in user_dfs, `ann` the
# annotator id that ends up in the file name.
for i, ann in enumerate([6,7,8]):
    # Post metadata is always read from the first annotator's frame.
    data = {
        'post_id': user_dfs[0]['post_id'].values,
        'source_dataset': user_dfs[0]['types'].values,
        'issue': [process_issue(issue) for issue in user_dfs[0]['issue'].values],
        'post_text': user_dfs[0]['post_text'].values,
    }
    for corpus_column, annotator_column in label_columns.items():
        # Rows of this matrix are annotators; row i holds this annotator's labels.
        votes = np.array([user_df[annotator_column].tolist() for user_df in user_dfs])
        data[corpus_column] = votes[i].tolist()
    dataset_df = pd.DataFrame(data=data)
    dataset_df.to_csv(data_dir+'appropriateness-corpus/appropriateness_corpus_annotator{}.csv'.format(ann), index=False)