### This file combines the annotations from our study and computes table 1 of the paper

In [None]:
import pandas as pd
import krippendorff
import seaborn as sn
from scipy import stats
import matplotlib.pyplot as plt
import numpy as np
import ast
import math
from collections import Counter
from itertools import combinations
from statsmodels.stats import inter_rater as irr

In [None]:
# Show full cell contents when printing dataframes (no column-width truncation).
pd.options.display.max_colwidth = None

In [None]:
# Root folder of the study data, relative to this notebook.
data_dir = '../../data/'

# Study annotations; the "result" column is stored as a Python-literal string
# and is parsed back into a Python object for every row.
annotations_df = pd.read_csv(data_dir+'annotations/study.annotation.csv')
parsed_results = annotations_df["result"].apply(ast.literal_eval)
annotations_df["result"] = parsed_results

# Corpus metadata providing the `types` value of every annotated post `id`.
dataset_df = pd.read_csv(data_dir+'appropriateness-corpus//annotation_dataset_types.csv')

# Numeric code of each appropriateness answer: "not" -> 1, "partial" -> 2, "fully" -> 3.
label2type = {answer: code for code, answer in enumerate(("not", "partial", "fully"), start=1)}

# Lookup table from post id to its `types` value.
id_to_type = dict(zip(dataset_df['id'].tolist(), dataset_df['types'].tolist()))

In [None]:
# Attach the corpus `types` value to every annotation via its post id.
# The lookup is strict: a post id missing from id_to_type raises KeyError.
annotations_df["types"] = annotations_df["post_id"].apply(id_to_type.__getitem__)

In [None]:
### Binarize the annotations
def process_results(x, label, sub):
    """Binarize one answer of an annotation result into 1 or 3.

    Args:
        x: One parsed annotation result. It is indexed like a dict and must
            contain the key "appropriatenessQuestion", whose value must be a
            key of the module-level ``label2type`` mapping.
        label: Name of the question or sub-label to binarize, e.g.
            'emotion' (first level) or 'emotion1' (second level).
        sub: False for a first-level question, True for a second-level
            sub-label.

    Returns:
        3 whenever the appropriateness answer maps to 3 in ``label2type``.
        Otherwise:
          * first level (``sub`` is False): 1 if ``x[label + "Question"]``
            equals 'yes', else 3;
          * second level with ``label == 'other'``: 1 if 'other' is a key of
            ``x``, else 3;
          * any other second-level label: 1 if ``label`` occurs among the
            values of ``x``, else 3.

    Raises:
        KeyError: if "appropriatenessQuestion" is missing, its value is not in
            ``label2type``, or (first level only) ``label + "Question"`` is
            missing.
    """
    # Nothing is flagged when the overall rating maps to 3.
    if label2type[x["appropriatenessQuestion"]] == 3:
        return 3

    if not sub:
        return 1 if x[label + "Question"] == 'yes' else 3

    # The original expression continued with
    # `else 3 if x[<parent>+"Question"] == 'yes' else 3`; both outcomes were 3,
    # so that lookup never influenced the result and has been dropped.
    if label == 'other':
        # 'other' is detected by the presence of the key itself.
        return 1 if label in x else 3
    # Every other sub-label is detected among the stored answer values.
    return 1 if label in x.values() else 3

In [None]:
# Overall appropriateness answer, converted to its numeric code.
annotations_df["Appropriate (1-3 or ?)"] = annotations_df["result"].apply(
    lambda result: label2type[result["appropriatenessQuestion"]]
)

# (column name, label passed to process_results, sub flag), in the order the
# columns are added to the dataframe.
_binarized_columns = [
    ("1", 'emotion', False),
    ("1.1", 'emotion1', True),
    ("1.2", 'emotion2', True),
    ("2", 'commitment', False),
    ("2.1", 'commitment1', True),
    ("2.2", 'commitment2', True),
    ("3", 'confusion', False),
    ("3.1", 'confusion1', True),
    ("3.2", 'confusion2', True),
    ("3.3", 'confusion3', True),
    ("4", 'other', False),
    ("4.1", 'other1', True),
    ("4.2", 'other', True),
]
for _column, _label, _is_sub in _binarized_columns:
    # Series.apply forwards `args` as extra positional arguments, so each row
    # is evaluated as process_results(result, _label, _is_sub).
    annotations_df[_column] = annotations_df["result"].apply(process_results, args=(_label, _is_sub))

In [None]:
### Map annotator ids from the interface to annotators and batches
# One row per batch; each row lists the interface ids belonging to
# annotators 6, 8 and 7, in that order.
_annotator_order = (6, 8, 7)
_interface_ids_per_batch = [
    ('6', '8', '7'),     # batch0
    ('10', '9', '11'),   # batch1
    ('13', '12', '14'),  # batch2
    ('19', '17', '15'),  # batch3
    ('21', '25', '18'),  # batch4
    ('24', '27', '22'),  # batch5
    ('28', '30', '23'),  # batch6
    ('31', '33', '26'),  # batch7
    ('35', '34', '29'),  # batch8
    ('37', '39', '32'),  # batch9
    ('44', '40', '36'),  # batch10
    ('46', '43', '38'),  # batch11
    ('48', '45', '41'),  # batch12
    ('49', '47', '42'),  # batch13
]
user_dict = {
    interface_id: annotator
    for batch_ids in _interface_ids_per_batch
    for interface_id, annotator in zip(batch_ids, _annotator_order)
}

# Replace interface ids by annotator ids; ids not listed above become -1.
annotations_df['user_id'] = annotations_df['user_id'].apply(lambda raw_id: user_dict.get(str(raw_id), -1))

In [None]:
# Annotator ids taking part in the study.
users = [6, 7, 8]

# Every subset of the annotators (including the empty one), ordered by size.
output = [
    list(subset)
    for size in range(len(users) + 1)
    for subset in combinations(users, size)
]

# Agreement needs at least two annotators, so keep only those subsets.
relevant_combinations = [subset for subset in output if len(subset) > 1]

In [None]:
### These are the possible variables that can be used to aggregate the annotations
# '-1' is an extra entry next to the types that occur in the data.
types = [*annotations_df["types"].unique().tolist(), '-1']
app_modes = ['Appropriate (1-3 or ?)', 'Appropriate (1,2)', 'Appropriate (2,3)']
levels = ['first_level', 'second_level']
eval_modes = ['all', 'with_agreement', 'only_inappropriate']
combine_modes = ['min', 'max', 'majority', 'full']

### These are the variables used to produce table 1 in the paper
# With debug switched off, every option list is narrowed to a single setting.
debug = False
if not debug:
    relevant_combinations = [users]
    types = ['-1']
    app_modes = ['Appropriate (1-3 or ?)']
    levels = ['first_level']
    eval_modes = ['all']
    combine_modes = ['min']

In [None]:
for type_ in types:
    for combination in relevant_combinations:
        print('Users: '+str(combination))
        user_dfs = []
        for user in combination:
            tmp_df = annotations_df[annotations_df["user_id"]==user].sort_values("post_id")
            tmp_df['Appropriate (1,2)'] = tmp_df['Appropriate (1-3 or ?)'].apply(lambda x: 1 if x in [1,2] else 3)
            tmp_df['Appropriate (2,3)'] = tmp_df['Appropriate (1-3 or ?)'].apply(lambda x: 3 if x != 1 else 1)
            if type_ == '-1':
                user_dfs.append(tmp_df)
            else:
                user_dfs.append(tmp_df[tmp_df['types']==type_])

        for app_mode in app_modes:
            print('Users: '+str(combination)+' App mode: '+str(app_mode))
            if app_mode == 'Appropriate (1-3 or ?)':
                print([len(user_df[app_mode].tolist()) for user_df in user_dfs])
                app_krippendorffs_alpha = krippendorff.alpha(reliability_data=[user_df[app_mode].tolist() for user_df in user_dfs], level_of_measurement="ordinal")
            else:
                app_krippendorffs_alpha = krippendorff.alpha(reliability_data=[user_df[app_mode].tolist() for user_df in user_dfs], level_of_measurement="nominal")
            agg = irr.aggregate_raters(np.array([user_df[app_mode].tolist() for user_df in user_dfs]).T)
            app_fleiss_kappa = irr.fleiss_kappa(agg[0], method='fleiss')

            min_inappropriate_vote = [np.nanmin(x) for x in np.array([user_df[app_mode].tolist() for user_df in user_dfs]).T.tolist()]
            max_inappropriate_vote = [max(x) for x in np.array([user_df[app_mode].tolist() for user_df in user_dfs]).T.tolist()]
            majority_inappropriate_vote = [max(set(x), key=x.count) if len(set(x))!=len(combination) else 4 for x in np.array([user_df[app_mode].tolist() for user_df in user_dfs]).T.tolist()]
            full_inappropriate_vote = [x[0] if len(set(x)) == 1 else 4 for x in np.array([user_df[app_mode].tolist() for user_df in user_dfs]).T.tolist()]

            min_1_vote = [np.nanmin(x) for x in np.array([user_df['1'].tolist() for user_df in user_dfs]).T.tolist()]
            max_1_vote = [max(x) for x in np.array([user_df['1'].tolist() for user_df in user_dfs]).T.tolist()]
            majority_1_vote = [max(set(x), key=x.count) for x in np.array([user_df['1'].tolist() for user_df in user_dfs]).T.tolist()]
            full_1_vote = [x[0] if len(set(x)) == 1 else 4 for x in np.array([user_df['1'].tolist() for user_df in user_dfs]).T.tolist()]

            min_2_vote = [np.nanmin(x) for x in np.array([user_df['2'].tolist() for user_df in user_dfs]).T.tolist()]
            max_2_vote = [max(x) for x in np.array([user_df['2'].tolist() for user_df in user_dfs]).T.tolist()]
            majority_2_vote = [max(set(x), key=x.count) for x in np.array([user_df['2'].tolist() for user_df in user_dfs]).T.tolist()]
            full_2_vote = [x[0] if len(set(x)) == 1 else 4 for x in np.array([user_df['2'].tolist() for user_df in user_dfs]).T.tolist()]

            min_3_vote = [np.nanmin(x) for x in np.array([user_df['3'].tolist() for user_df in user_dfs]).T.tolist()]
            max_3_vote = [max(x) for x in np.array([user_df['3'].tolist() for user_df in user_dfs]).T.tolist()]
            majority_3_vote = [max(set(x), key=x.count) for x in np.array([user_df['3'].tolist() for user_df in user_dfs]).T.tolist()]
            full_3_vote = [x[0] if len(set(x)) == 1 else 4 for x in np.array([user_df['3'].tolist() for user_df in user_dfs]).T.tolist()]

            min_4_vote = [np.nanmin(x) for x in np.array([user_df['4'].tolist() for user_df in user_dfs]).T.tolist()]
            max_4_vote = [max(x) for x in np.array([user_df['4'].tolist() for user_df in user_dfs]).T.tolist()]
            majority_4_vote = [max(set(x), key=x.count) for x in np.array([user_df['4'].tolist() for user_df in user_dfs]).T.tolist()]
            full_4_vote = [x[0] if len(set(x)) == 1 else 4 for x in np.array([user_df['4'].tolist() for user_df in user_dfs]).T.tolist()]

            for user_df in user_dfs:
                user_df['min_inappropriate_vote'] = min_inappropriate_vote
                user_df['max_inappropriate_vote'] = max_inappropriate_vote
                user_df['majority_inappropriate_vote'] = majority_inappropriate_vote
                user_df['full_inappropriate_vote'] = full_inappropriate_vote

                user_df['min_1_vote'] = min_1_vote
                user_df['max_1_vote'] = max_1_vote
                user_df['majority_1_vote'] = majority_1_vote
                user_df['full_1_vote'] = full_1_vote

                user_df['min_2_vote'] = min_2_vote
                user_df['max_2_vote'] = max_2_vote
                user_df['majority_2_vote'] = majority_2_vote
                user_df['full_2_vote'] = full_2_vote

                user_df['min_3_vote'] = min_3_vote
                user_df['max_3_vote'] = max_3_vote
                user_df['majority_3_vote'] = majority_3_vote
                user_df['full_3_vote'] = full_3_vote

                user_df['min_4_vote'] = min_4_vote
                user_df['max_4_vote'] = max_4_vote
                user_df['majority_4_vote'] = majority_4_vote
                user_df['full_4_vote'] = full_4_vote


            for level in levels:
                for eval_mode in eval_modes:
                    for combine_mode in combine_modes:
                        print('Users: '+str(combination)+' App mode: '+str(app_mode)+' Combine mode: '+combine_mode)
                        count_1 = list(np.zeros(14))
                        count_2 = list(np.zeros(14))
                        count_3 = list(np.zeros(14))
                        undecided = list(np.zeros(14))
                        majority_agreement = list(np.zeros(14))
                        full_agreement = list(np.zeros(14))
                        
                        majority_agreement_1 = list(np.zeros(14))
                        full_agreement_1 = list(np.zeros(14))
                        majority_agreement_2 = list(np.zeros(14))
                        full_agreement_2 = list(np.zeros(14))
                        majority_agreement_3 = list(np.zeros(14))
                        full_agreement_3 = list(np.zeros(14))
                        
                        undecided = list(np.zeros(14))
                        fleiss_kappa = list(np.zeros(14))
                        krippendorffs_alpha = list(np.zeros(14))
                        fleiss_kappa[0] = app_fleiss_kappa
                        krippendorffs_alpha[0 ]= app_krippendorffs_alpha
                        for p, col_name in enumerate(['1','1.1','1.2','2','2.1','2.2','3','3.1','3.2','3.3','4','4.1','4.2']):
                            preds = []
                            tmp_sum = 0
                            try:
                                if combine_mode == 'min':
                                    for j in range(len(user_dfs)):
                                        tmp_df = user_dfs[j]
                                        if eval_mode == 'only_inappropriate':
                                            if level == 'second_level' and len(col_name) == 3:
                                                if col_name[0] == '1':
                                                    preds.append(tmp_df[(tmp_df['min_inappropriate_vote'].isin([1,2])) & (tmp_df['min_1_vote'].isin([1]))][col_name].tolist())
                                                    tmp_sum += max([1 if x == 1  else 0 for x in tmp_df[(tmp_df['min_inappropriate_vote'].isin([1,2])) & (tmp_df['min_1_vote'].isin([1]))][col_name].tolist()])
                                                elif col_name[0] == '2':
                                                    preds.append(tmp_df[(tmp_df['min_inappropriate_vote'].isin([1,2])) & (tmp_df['min_2_vote'].isin([1]))][col_name].tolist())
                                                    tmp_sum += max([1 if x == 1  else 0 for x in tmp_df[(tmp_df['min_inappropriate_vote'].isin([1,2])) & (tmp_df['min_2_vote'].isin([1]))][col_name].tolist()])
                                                elif col_name[0] == '3':
                                                    preds.append(tmp_df[(tmp_df['min_inappropriate_vote'].isin([1,2])) & (tmp_df['min_3_vote'].isin([1]))][col_name].tolist())
                                                    tmp_sum += max([1 if x == 1  else 0 for x in tmp_df[(tmp_df['min_inappropriate_vote'].isin([1,2])) & (tmp_df['min_3_vote'].isin([1]))][col_name].tolist()])
                                                elif col_name[0] == '4':
                                                    preds.append(tmp_df[(tmp_df['min_inappropriate_vote'].isin([1,2])) & (tmp_df['min_4_vote'].isin([1]))][col_name].tolist())
                                                    tmp_sum += max([1 if x == 1  else 0 for x in tmp_df[(tmp_df['min_inappropriate_vote'].isin([1,2])) & (tmp_df['min_4_vote'].isin([1]))][col_name].tolist()])
                                            else:                                                      
                                                preds.append(tmp_df[tmp_df['min_inappropriate_vote'].isin([1,2])][col_name].tolist())
                                                tmp_sum += max([1 if x == 1  else 0 for x in tmp_df[tmp_df['min_inappropriate_vote'].isin([1,2])][col_name].tolist()])
                                        elif eval_mode == 'with_agreement': # does not work for this category
                                            if level == 'second_level' and len(col_name) == 3:
                                                if col_name[0] == '1':
                                                    preds.append(tmp_df[(tmp_df['min_inappropriate_vote'].isin([1,2,3])) & (tmp_df['min_1_vote'].isin([1,3]))][col_name].tolist())
                                                    tmp_sum += max([1 if x == 1  else 0 for x in tmp_df[(tmp_df['min_inappropriate_vote'].isin([1,2,3])) & (tmp_df['min_1_vote'].isin([1,3]))][col_name].tolist()])
                                                elif col_name[0] == '2':
                                                    preds.append(tmp_df[(tmp_df['min_inappropriate_vote'].isin([1,2,3])) & (tmp_df['min_2_vote'].isin([1,3]))][col_name].tolist())
                                                    tmp_sum += max([1 if x == 1  else 0 for x in tmp_df[(tmp_df['min_inappropriate_vote'].isin([1,2,3])) & (tmp_df['min_2_vote'].isin([1,3]))][col_name].tolist()])
                                                elif col_name[0] == '3':
                                                    preds.append(tmp_df[(tmp_df['min_inappropriate_vote'].isin([1,2,3])) & (tmp_df['min_3_vote'].isin([1,3]))][col_name].tolist())
                                                    tmp_sum += max([1 if x == 1  else 0 for x in tmp_df[(tmp_df['min_inappropriate_vote'].isin([1,2,3])) & (tmp_df['min_3_vote'].isin([1,3]))][col_name].tolist()])
                                                elif col_name[0] == '4':
                                                    preds.append(tmp_df[(tmp_df['min_inappropriate_vote'].isin([1,2,3])) & (tmp_df['min_4_vote'].isin([1,3]))][col_name].tolist())
                                                    tmp_sum += max([1 if x == 1  else 0 for x in tmp_df[(tmp_df['min_inappropriate_vote'].isin([1,2,3])) & (tmp_df['min_4_vote'].isin([1,3]))][col_name].tolist()])
                                            else:                                                      
                                                preds.append(tmp_df[tmp_df['min_inappropriate_vote'].isin([1,2,3,4])][col_name].tolist())
                                                tmp_sum += max([1 if x == 1  else 0 for x in tmp_df[tmp_df['min_inappropriate_vote'].isin([1,2,3,4])][col_name].tolist()])
                                        else:
                                            preds.append(tmp_df[col_name].tolist())
                                            tmp_sum += max([1 if x == 1  else 0 for x in tmp_df[col_name].tolist()])
                                        tmp_counter = dict(Counter(user_dfs[0]['min_inappropriate_vote'].tolist()))
                                if combine_mode == 'max':
                                    for j in range(len(user_dfs)):
                                        tmp_df = user_dfs[j]
                                        if eval_mode == 'only_inappropriate':
                                            if level == 'second_level' and len(col_name) == 3:
                                                if col_name[0] == '1':
                                                    preds.append(tmp_df[(tmp_df['max_inappropriate_vote'].isin([1,2])) & (tmp_df['max_1_vote'].isin([1]))][col_name].tolist())
                                                    tmp_sum += max([1 if x == 1  else 0 for x in tmp_df[(tmp_df['max_inappropriate_vote'].isin([1,2])) & (tmp_df['max_1_vote'].isin([1]))][col_name].tolist()])
                                                elif col_name[0] == '2':
                                                    preds.append(tmp_df[(tmp_df['max_inappropriate_vote'].isin([1,2])) & (tmp_df['max_2_vote'].isin([1]))][col_name].tolist())
                                                    tmp_sum += max([1 if x == 1  else 0 for x in tmp_df[(tmp_df['max_inappropriate_vote'].isin([1,2])) & (tmp_df['max_2_vote'].isin([1]))][col_name].tolist()])
                                                elif col_name[0] == '3':
                                                    preds.append(tmp_df[(tmp_df['max_inappropriate_vote'].isin([1,2])) & (tmp_df['max_3_vote'].isin([1]))][col_name].tolist())
                                                    tmp_sum += max([1 if x == 1  else 0 for x in tmp_df[(tmp_df['max_inappropriate_vote'].isin([1,2])) & (tmp_df['max_3_vote'].isin([1]))][col_name].tolist()])
                                                elif col_name[0] == '4':
                                                    preds.append(tmp_df[(tmp_df['max_inappropriate_vote'].isin([1,2])) & (tmp_df['max_4_vote'].isin([1]))][col_name].tolist())
                                                    tmp_sum += max([1 if x == 1  else 0 for x in tmp_df[(tmp_df['max_inappropriate_vote'].isin([1,2])) & (tmp_df['max_4_vote'].isin([1]))][col_name].tolist()])
                                            else:                                                      
                                                preds.append(tmp_df[tmp_df['max_inappropriate_vote'].isin([1,2])][col_name].tolist())
                                                tmp_sum += max([1 if x == 1  else 0 for x in tmp_df[tmp_df['max_inappropriate_vote'].isin([1,2])][col_name].tolist()])
                                        elif eval_mode == 'with_agreement': # does not work for this category
                                            if level == 'second_level' and len(col_name) == 3:
                                                if col_name[0] == '1':
                                                    preds.append(tmp_df[(tmp_df['max_inappropriate_vote'].isin([1,2,3])) & (tmp_df['max_1_vote'].isin([1,3]))][col_name].tolist())
                                                    tmp_sum += max([1 if x == 1  else 0 for x in tmp_df[(tmp_df['max_inappropriate_vote'].isin([1,2,3])) & (tmp_df['max_1_vote'].isin([1,3]))][col_name].tolist()])
                                                elif col_name[0] == '2':
                                                    preds.append(tmp_df[(tmp_df['max_inappropriate_vote'].isin([1,2,3])) & (tmp_df['max_2_vote'].isin([1,3]))][col_name].tolist())
                                                    tmp_sum += max([1 if x == 1  else 0 for x in tmp_df[(tmp_df['max_inappropriate_vote'].isin([1,2,3])) & (tmp_df['max_2_vote'].isin([1,3]))][col_name].tolist()])
                                                elif col_name[0] == '3':
                                                    preds.append(tmp_df[(tmp_df['max_inappropriate_vote'].isin([1,2,3])) & (tmp_df['max_3_vote'].isin([1,3]))][col_name].tolist())
                                                    tmp_sum += max([1 if x == 1  else 0 for x in tmp_df[(tmp_df['max_inappropriate_vote'].isin([1,2,3])) & (tmp_df['max_3_vote'].isin([1,3]))][col_name].tolist()])
                                                elif col_name[0] == '4':
                                                    preds.append(tmp_df[(tmp_df['max_inappropriate_vote'].isin([1,2,3])) & (tmp_df['max_4_vote'].isin([1,3]))][col_name].tolist())
                                                    tmp_sum += max([1 if x == 1  else 0 for x in tmp_df[(tmp_df['max_inappropriate_vote'].isin([1,2,3])) & (tmp_df['max_4_vote'].isin([1,3]))][col_name].tolist()])
                                            else:                                                      
                                                preds.append(tmp_df[tmp_df['max_inappropriate_vote'].isin([1,2,3,4])][col_name].tolist())
                                                tmp_sum += max([1 if x == 1  else 0 for x in tmp_df[tmp_df['max_inappropriate_vote'].isin([1,2,3,4])][col_name].tolist()])
                                        else:
                                            preds.append(tmp_df[col_name].tolist())
                                            tmp_sum += max([1 if x == 1  else 0 for x in tmp_df[col_name].tolist()])
                                        tmp_counter = dict(Counter(user_dfs[0]['max_inappropriate_vote'].tolist()))
                                elif combine_mode == 'majority':
                                    for j in range(len(user_dfs)):
                                        tmp_df = user_dfs[j]
                                        if eval_mode == 'only_inappropriate':
                                            if level == 'second_level' and len(col_name) == 3:
                                                if col_name[0] == '1':
                                                    preds.append(tmp_df[(tmp_df['majority_inappropriate_vote'].isin([1,2])) & (tmp_df['majority_1_vote'].isin([1]))][col_name].tolist())
                                                    tmp_sum += max([1 if x == 1  else 0 for x in tmp_df[(tmp_df['majority_inappropriate_vote'].isin([1,2])) & (tmp_df['majority_1_vote'].isin([1]))][col_name].tolist()])
                                                elif col_name[0] == '2':
                                                    preds.append(tmp_df[(tmp_df['majority_inappropriate_vote'].isin([1,2])) & (tmp_df['majority_2_vote'].isin([1]))][col_name].tolist())
                                                    tmp_sum += max([1 if x == 1  else 0 for x in tmp_df[(tmp_df['majority_inappropriate_vote'].isin([1,2])) & (tmp_df['majority_2_vote'].isin([1]))][col_name].tolist()])
                                                elif col_name[0] == '3':
                                                    preds.append(tmp_df[(tmp_df['majority_inappropriate_vote'].isin([1,2])) & (tmp_df['majority_3_vote'].isin([1]))][col_name].tolist())
                                                    tmp_sum += max([1 if x == 1  else 0 for x in tmp_df[(tmp_df['majority_inappropriate_vote'].isin([1,2])) & (tmp_df['majority_3_vote'].isin([1]))][col_name].tolist()])
                                                elif col_name[0] == '4':
                                                    preds.append(tmp_df[(tmp_df['majority_inappropriate_vote'].isin([1,2])) & (tmp_df['majority_4_vote'].isin([1]))][col_name].tolist())
                                                    tmp_sum += max([1 if x == 1  else 0 for x in tmp_df[(tmp_df['majority_inappropriate_vote'].isin([1,2])) & (tmp_df['majority_4_vote'].isin([1]))][col_name].tolist()])
                                            else:                                                      
                                                preds.append(tmp_df[tmp_df['majority_inappropriate_vote'].isin([1,2])][col_name].tolist())
                                                tmp_sum += max([1 if x == 1  else 0 for x in tmp_df[tmp_df['majority_inappropriate_vote'].isin([1,2])][col_name].tolist()])
                                        elif eval_mode == 'with_agreement':
                                            if level == 'second_level' and len(col_name) == 3:
                                                if col_name[0] == '1':
                                                    preds.append(tmp_df[(tmp_df['majority_inappropriate_vote'].isin([1,2,3])) & (tmp_df['majority_1_vote'].isin([1,3]))][col_name].tolist())
                                                    tmp_sum += max([1 if x == 1  else 0 for x in tmp_df[(tmp_df['majority_inappropriate_vote'].isin([1,2,3])) & (tmp_df['majority_1_vote'].isin([1,3]))][col_name].tolist()])
                                                elif col_name[0] == '2':
                                                    preds.append(tmp_df[(tmp_df['majority_inappropriate_vote'].isin([1,2,3])) & (tmp_df['majority_2_vote'].isin([1,3]))][col_name].tolist())
                                                    tmp_sum += max([1 if x == 1  else 0 for x in tmp_df[(tmp_df['majority_inappropriate_vote'].isin([1,2,3])) & (tmp_df['majority_2_vote'].isin([1,3]))][col_name].tolist()])
                                                elif col_name[0] == '3':
                                                    preds.append(tmp_df[(tmp_df['majority_inappropriate_vote'].isin([1,2,3])) & (tmp_df['majority_3_vote'].isin([1,3]))][col_name].tolist())
                                                    tmp_sum += max([1 if x == 1  else 0 for x in tmp_df[(tmp_df['majority_inappropriate_vote'].isin([1,2,3])) & (tmp_df['majority_3_vote'].isin([1,3]))][col_name].tolist()])
                                                elif col_name[0] == '4':
                                                    preds.append(tmp_df[(tmp_df['majority_inappropriate_vote'].isin([1,2,3])) & (tmp_df['majority_4_vote'].isin([1,3]))][col_name].tolist())
                                                    tmp_sum += max([1 if x == 1  else 0 for x in tmp_df[(tmp_df['majority_inappropriate_vote'].isin([1,2,3])) & (tmp_df['majority_4_vote'].isin([1,3]))][col_name].tolist()])
                                            else:                                                      
                                                preds.append(tmp_df[tmp_df['majority_inappropriate_vote'].isin([1,2,3])][col_name].tolist())
                                                tmp_sum += max([1 if x == 1  else 0 for x in tmp_df[tmp_df['majority_inappropriate_vote'].isin([1,2,3])][col_name].tolist()])
                                        else:                                              
                                            preds.append(tmp_df[tmp_df['majority_inappropriate_vote'].isin([1,2,3,4])][col_name].tolist())
                                            tmp_sum += max([1 if x == 1  else 0 for x in tmp_df[tmp_df['majority_inappropriate_vote'].isin([1,2,3,4])][col_name].tolist()])
                                        tmp_counter = dict(Counter(user_dfs[0]['majority_inappropriate_vote'].tolist()))
                                elif combine_mode == 'full':
                                    for j in range(len(user_dfs)):
                                        tmp_df = user_dfs[j]
                                        if eval_mode == 'only_inappropriate':
                                            if level == 'second_level' and len(col_name) == 3:
                                                if col_name[0] == '1':
                                                    preds.append(tmp_df[(tmp_df['full_inappropriate_vote'].isin([1,2])) & (tmp_df['full_1_vote'].isin([1]))][col_name].tolist())
                                                    tmp_sum += max([1 if x == 1  else 0 for x in tmp_df[(tmp_df['full_inappropriate_vote'].isin([1,2])) & (tmp_df['full_1_vote'].isin([1]))][col_name].tolist()])
                                                elif col_name[0] == '2':
                                                    preds.append(tmp_df[(tmp_df['full_inappropriate_vote'].isin([1,2])) & (tmp_df['full_2_vote'].isin([1]))][col_name].tolist())
                                                    tmp_sum += max([1 if x == 1  else 0 for x in tmp_df[(tmp_df['full_inappropriate_vote'].isin([1,2])) & (tmp_df['full_2_vote'].isin([1]))][col_name].tolist()])
                                                elif col_name[0] == '3':
                                                    preds.append(tmp_df[(tmp_df['full_inappropriate_vote'].isin([1,2])) & (tmp_df['full_3_vote'].isin([1]))][col_name].tolist())
                                                    tmp_sum += max([1 if x == 1  else 0 for x in tmp_df[(tmp_df['full_inappropriate_vote'].isin([1,2])) & (tmp_df['full_3_vote'].isin([1]))][col_name].tolist()])
                                                elif col_name[0] == '4':
                                                    preds.append(tmp_df[(tmp_df['full_inappropriate_vote'].isin([1,2])) & (tmp_df['full_4_vote'].isin([1]))][col_name].tolist())
                                                    tmp_sum += max([1 if x == 1  else 0 for x in tmp_df[(tmp_df['full_inappropriate_vote'].isin([1,2])) & (tmp_df['full_4_vote'].isin([1]))][col_name].tolist()])
                                            else:                                                      
                                                preds.append(tmp_df[tmp_df['full_inappropriate_vote'].isin([1,2])][col_name].tolist())
                                                tmp_sum += max([1 if x == 1  else 0 for x in tmp_df[tmp_df['full_inappropriate_vote'].isin([1,2])][col_name].tolist()])
                                        elif eval_mode == 'with_agreement':
                                            if level == 'second_level' and len(col_name) == 3:
                                                if col_name[0] == '1':
                                                    preds.append(tmp_df[(tmp_df['full_inappropriate_vote'].isin([1,2,3])) & (tmp_df['full_1_vote'].isin([1,3]))][col_name].tolist())
                                                    tmp_sum += max([1 if x == 1  else 0 for x in tmp_df[(tmp_df['full_inappropriate_vote'].isin([1,2,3])) & (tmp_df['full_1_vote'].isin([1,3]))][col_name].tolist()])
                                                elif col_name[0] == '2':
                                                    preds.append(tmp_df[(tmp_df['full_inappropriate_vote'].isin([1,2,3])) & (tmp_df['full_2_vote'].isin([1,3]))][col_name].tolist())
                                                    tmp_sum += max([1 if x == 1  else 0 for x in tmp_df[(tmp_df['full_inappropriate_vote'].isin([1,2,3])) & (tmp_df['full_2_vote'].isin([1,3]))][col_name].tolist()])
                                                elif col_name[0] == '3':
                                                    preds.append(tmp_df[(tmp_df['full_inappropriate_vote'].isin([1,2,3])) & (tmp_df['full_3_vote'].isin([1,3]))][col_name].tolist())
                                                    tmp_sum += max([1 if x == 1  else 0 for x in tmp_df[(tmp_df['full_inappropriate_vote'].isin([1,2,3])) & (tmp_df['full_3_vote'].isin([1,3]))][col_name].tolist()])
                                                elif col_name[0] == '4':
                                                    preds.append(tmp_df[(tmp_df['full_inappropriate_vote'].isin([1,2,3])) & (tmp_df['full_4_vote'].isin([1,3]))][col_name].tolist())
                                                    tmp_sum += max([1 if x == 1  else 0 for x in tmp_df[(tmp_df['full_inappropriate_vote'].isin([1,2,3])) & (tmp_df['full_4_vote'].isin([1,3]))][col_name].tolist()])
                                            else:                                                      
                                                preds.append(tmp_df[tmp_df['full_inappropriate_vote'].isin([1,2,3])][col_name].tolist())
                                                tmp_sum += max([1 if x == 1  else 0 for x in tmp_df[tmp_df['full_inappropriate_vote'].isin([1,2,3])][col_name].tolist()])
                                        else:                                              
                                            preds.append(tmp_df[tmp_df['full_inappropriate_vote'].isin([1,2,3,4])][col_name].tolist())
                                            tmp_sum += max([1 if x == 1  else 0 for x in tmp_df[tmp_df['full_inappropriate_vote'].isin([1,2,3,4])][col_name].tolist()])
                                        tmp_counter = dict(Counter(user_dfs[0]['full_inappropriate_vote'].tolist()))
                            except:
                                pass
                            # Copy the label frequencies currently held in tmp_counter into slot 0
                            # of the per-label count lists.  Labels are visited in ascending order,
                            # so if both 0 and 3 are present the count for 3 is what remains in
                            # count_3[0].
                            for i in range(5):
                                if i not in tmp_counter:
                                    continue
                                label_total = tmp_counter[i]
                                if i in (0, 3):
                                    count_3[0] = label_total
                                elif i == 1:
                                    count_1[0] = label_total
                                elif i == 2:
                                    count_2[0] = label_total
                                else:
                                    # i == 4: kept apart from the 1/2/3 counts
                                    undecided[0] = label_total

                            # Transpose the app_mode column of every frame in user_dfs so that each
                            # row holds all votes given to one item (presumably one frame per
                            # annotator -- confirm against where user_dfs is built).
                            _item_votes = np.array([user_df[app_mode].tolist() for user_df in user_dfs]).T.tolist()
                            _n_items = len(_item_votes)
                            # Per item: how often each distinct vote occurs, ascending, so that
                            # [-1] is the largest count and [-2] the runner-up.
                            _vote_freqs = [sorted(Counter(votes).values()) for votes in _item_votes]

                            # Share of items with a strict plurality / with unanimous votes.
                            majority_agreement[0] = sum(freq[-1] > freq[-2] if len(freq) > 1 else True for freq in _vote_freqs) / _n_items
                            full_agreement[0] = sum(1 for votes in _item_votes if len(set(votes)) == 1) / _n_items

                            # NOTE(review): the majority_agreement_<k> values compare the largest vote
                            # COUNT with k, whereas full_agreement_<k> compares the agreed LABEL with
                            # k.  Behaviour is kept as found; confirm which one is intended.
                            majority_agreement_1[0] = sum(freq[-1] == 1 and (len(freq) == 1 or freq[-1] > freq[-2]) for freq in _vote_freqs) / _n_items
                            full_agreement_1[0] = sum(1 for votes in _item_votes if len(set(votes)) == 1 and votes[0] == 1) / _n_items
                            majority_agreement_2[0] = sum(freq[-1] == 2 and (len(freq) == 1 or freq[-1] > freq[-2]) for freq in _vote_freqs) / _n_items
                            full_agreement_2[0] = sum(1 for votes in _item_votes if len(set(votes)) == 1 and votes[0] == 2) / _n_items
                            majority_agreement_3[0] = sum(freq[-1] == 3 and (len(freq) == 1 or freq[-1] > freq[-2]) for freq in _vote_freqs) / _n_items
                            full_agreement_3[0] = sum(1 for votes in _item_votes if len(set(votes)) == 1 and votes[0] == 3) / _n_items
                            
                            # Only evaluate the current column when at least one collected vote
                            # equals 1 (tmp_sum counts that above); otherwise just report it.
                            if tmp_sum >= 1:
                                # preds holds one list of votes per appended frame; transposing
                                # gives one row per item.  Built once and reused below.
                                _pred_matrix = np.array(preds).T
                                _pred_rows = _pred_matrix.tolist()
                                agg = irr.aggregate_raters(_pred_matrix)

                                # Both reliability measures are best effort: a failure is stored
                                # as None.  Catch Exception rather than everything so that
                                # KeyboardInterrupt / SystemExit still stop the run.
                                try:
                                    tmp_krippendorffs_alpha = krippendorff.alpha(reliability_data=preds, level_of_measurement="nominal")
                                except Exception:
                                    tmp_krippendorffs_alpha = None
                                try:
                                    tmp_fleiss_kappa = irr.fleiss_kappa(agg[0], method='fleiss')
                                except Exception:
                                    tmp_fleiss_kappa = None

                                krippendorffs_alpha[p+1] = tmp_krippendorffs_alpha
                                fleiss_kappa[p+1] = tmp_fleiss_kappa

                                # Collapse the votes of each item into a single label.
                                if combine_mode == 'min':
                                    # NOTE(review): an earlier comment on this line said "max" must
                                    # be used, yet the code takes the (NaN-ignoring) minimum --
                                    # confirm which is intended.
                                    tmp_inappropriate_vote = [np.nanmin(row) for row in _pred_rows]
                                elif combine_mode == 'max':
                                    tmp_inappropriate_vote = [max(row) for row in _pred_rows]
                                elif combine_mode == 'majority':
                                    # Most frequent vote; 4 when every rater in the combination
                                    # voted differently.
                                    tmp_inappropriate_vote = [max(set(row), key=row.count) if len(set(row)) != len(combination) else 4 for row in _pred_rows]
                                elif combine_mode == 'full':
                                    # The shared vote when unanimous, otherwise 4.
                                    tmp_inappropriate_vote = [row[0] if len(set(row)) == 1 else 4 for row in _pred_rows]

                                tmp_counter = dict(Counter(tmp_inappropriate_vote))
                                for i in range(5):
                                    if i not in tmp_counter:
                                        continue
                                    if i in (0, 3):
                                        count_3[p+1] = tmp_counter[i]
                                    elif i == 1:
                                        count_1[p+1] = tmp_counter[1]
                                    elif i == 2:
                                        count_2[p+1] = tmp_counter[2]
                                    else:
                                        # NOTE(review): the count for label 4 overwrites
                                        # count_2[p+1], while the baseline tally writes label 4
                                        # to undecided[0].  Kept as found because it changes the
                                        # exported numbers -- confirm whether this is deliberate.
                                        count_2[p+1] = tmp_counter[4]

                                # Per item: occurrence counts of the distinct votes, ascending.
                                _pred_freqs = [sorted(Counter(row).values()) for row in _pred_rows]
                                majority_agreement[p+1] = sum(freq[-1] > freq[-2] if len(freq) > 1 else True for freq in _pred_freqs) / len(_pred_rows)
                                full_agreement[p+1] = sum(1 for row in _pred_rows if len(set(row)) == 1) / len(_pred_rows)
                            else:
                                print('No annotations')


                        # Columns that enter the Kendall correlation: the overall label followed
                        # by every dimension / sub-dimension column.
                        _corr_columns = ['Appropriate (1-3 or ?)','1','1.1','1.2','2','2.1','2.2','3','3.1','3.2','3.3','4','4.1','4.2']
                        # Vote column used to filter rows in the agreement-based combine modes.
                        _vote_columns = {'majority': 'majority_inappropriate_vote', 'full': 'full_inappropriate_vote'}

                        _corr_inputs = None
                        if combine_mode in ('min', 'max'):
                            # No row filtering in these modes.
                            _corr_inputs = [user_df[_corr_columns] for user_df in user_dfs]
                        elif combine_mode in ('majority', 'full'):
                            _vote_column = _vote_columns[combine_mode]
                            _kept_votes = [1, 2] if eval_mode == 'only_inappropriate' else [1, 3]
                            _corr_inputs = [user_df[user_df[_vote_column].isin(_kept_votes)][_corr_columns] for user_df in user_dfs]

                        # Average the per-frame correlation matrices element-wise.  For any other
                        # combine_mode corrMatrix is left untouched, exactly as before.
                        if _corr_inputs is not None:
                            corrMatrix = np.mean([frame.corr(method='kendall').values for frame in _corr_inputs], axis=0)


                        # Assemble the result table: one row per label column, holding the vote
                        # counts, the agreement scores and that row's averaged Kendall correlations.
                        _row_labels = ['Appropriate (1-3 or ?)','1','1.1','1.2','2','2.1','2.2','3','3.1','3.2','3.3','4','4.1','4.2']
                        data = {
                            '': list(_row_labels),
                            'count_1': count_1,
                            'count_2': count_2,
                            'count_3': count_3,
                            'full_agreement': full_agreement,
                            'majority_agreement': majority_agreement,
                            'krippendorffs_alpha': krippendorffs_alpha,
                            # The first correlation column is stored unrounded ...
                            'Appropriate (1-3 or ?)': corrMatrix[0],
                        }
                        # ... while every remaining one is rounded to four decimals.
                        for _pos, _label in enumerate(_row_labels[1:], start=1):
                            data[_label] = np.round(corrMatrix[_pos], 4)

                        # One CSV per parameter combination; the settings are encoded in the file name.
                        tmp_df = pd.DataFrame(data=data)
                        tmp_df.to_csv(
                            data_dir+'annotations/study.annotation.evaluation.{}.{}.{}.{}.{}.{}.csv'.format(app_mode, eval_mode, combine_mode, level, type_, combination),
                            index=False,
                        )