In [1]:
import pandas as pd
from sklearn.metrics import ndcg_score
import random
from scipy import stats
import numpy as np
from ast import literal_eval
from scipy.optimize import minimize
from itertools import combinations

In [2]:
# Do not truncate long cell contents (e.g. full post texts) when pandas renders frames.
pd.set_option('display.max_colwidth', None)

In [4]:
    # NOTE(review): `columns` is not referenced anywhere in this cell; presumably it
    # documents the schema of the raw CSV -- confirm before removing.
    columns = ['a_id', 'user_id', 'post_id', 'post_text', 'annotation_date', 'result', 'issue', 'comments']
    df = pd.read_csv('../../data/style-transfer/study_pairs_abs_results.csv')

    # `result` is stored as the string form of a Python literal; parse it into a dict.
    df['result'] = df['result'].apply(literal_eval)

    # Turn the trailing number of each answer string into a score:
    #   app     -> last character as int
    #   sim     -> trailing number minus 5 (the two-character ending '10' is handled explicitly)
    #   fluency -> last two characters as int, minus 10
    # NOTE(review): presumably the answer ids run 1-5 / 6-10 / 11-15 over the three
    # questions, which would put every score on a 1-5 scale -- confirm against the survey.
    df['app'] = df['result'].apply(lambda x: int(x['otherErrorQuestion1'][-1]))
    df['sim'] = df['result'].apply(lambda x: int(x['otherErrorQuestion2'][-1])-5 if x['otherErrorQuestion2'][-2:] != '10' else int(x['otherErrorQuestion2'][-2:])-5)
    df['fluency'] = df['result'].apply(lambda x: int(x['otherErrorQuestion3'][-2:])-10)

    # post_id is split on '_': first field -> id_source, second field -> id_model.
    df['id_source'] = df['post_id'].apply(lambda x: x.split('_')[0])
    df['id_model'] = df['post_id'].apply(lambda x: x.split('_')[1])

    # keep only annotations made by users 2-11
    df = df[df['user_id'].isin([2,3,4,5,6,7,8,9,10,11])]
    print(len(df))

    # print # of annotations per user
    print(df['user_id'].value_counts())

    # only keep post_ids that appear exactly 5 times in the df
    post_id_counts = df['post_id'].value_counts()
    ids_to_keep = post_id_counts[post_id_counts == 5].index.tolist()
    df = df[df['post_id'].isin(ids_to_keep)]
    print(len(df))

    # calc mean of app, sim, fluency for each model.
    # numeric_only=True is required: the frame also holds text columns (post_text,
    # result, ...). Averaging those only produced a FutureWarning on older pandas
    # and raises a TypeError on pandas >= 2.0.
    df_mean = df.groupby(['id_model']).mean(numeric_only=True).reset_index()
    for score_col in ['app', 'sim', 'fluency']:
        df_mean[score_col] = df_mean[score_col].apply(lambda x: round(x, 2))
    print(df_mean[['id_model', 'app', 'sim', 'fluency']])

    # Build the rating rows for an agreement computation between annotators
    # [2,3,4,5,6] and [7,8,9,10,11]. Alpha itself is NOT computed in this cell;
    # only the NaN-padded rows are prepared.
    df_1 = df[df['user_id'].isin([2,3,4,5,6])]
    df_2 = df[df['user_id'].isin([7,8,9,10,11])]
    print('Calculating agreement')

    def _ratings_by_annotator(frame, dim):
        """Return one list of `dim` scores per annotator in `frame`, ordered by post_id."""
        return [
            frame[frame['user_id'] == annotator].sort_values('post_id')[dim].tolist()
            for annotator in frame.user_id.unique()
        ]

    def _padded_reliability_rows(dim):
        """Return (rows_group_1, rows_group_2) for `dim`, NaN-padded to a common layout.

        Rows of the first group are followed by as many NaNs as the first annotator
        of the second group has ratings; rows of the second group are preceded by
        as many NaNs as the first annotator of the first group has ratings.
        An empty group contributes no padding.
        """
        rows_1 = _ratings_by_annotator(df_1, dim)
        rows_2 = _ratings_by_annotator(df_2, dim)
        pad_after = [np.nan] * (len(rows_2[0]) if rows_2 else 0)
        pad_before = [np.nan] * (len(rows_1[0]) if rows_1 else 0)
        return [row + pad_after for row in rows_1], [pad_before + row for row in rows_2]

    rd_df1_app, rd_df2_app = _padded_reliability_rows('app')
    rd_df1_sim, rd_df2_sim = _padded_reliability_rows('sim')
    rd_df1_fluency, rd_df2_fluency = _padded_reliability_rows('fluency')

6750
9     678
8     678
7     678
11    678
10    678
4     672
6     672
3     672
2     672
5     672
Name: user_id, dtype: int64
6750
   id_model   app   sim  fluency
0  10a-00ss  4.11  3.22     4.48
1  40a-60ss  3.55  4.56     3.75
2  50a-50ss  3.78  3.69     3.97
3  60a-40ss  3.99  3.50     4.32
4     human  4.05  3.93     4.28
5  instruct  3.83  4.31     4.10
Calculating agreement


  df_mean = df.groupby(['id_model']).mean().reset_index()


In [4]:
post_ids = []
for dim in ['sim','app','fluency']:
    with open('../../data/style-transfer/study.annotation.evaluation.MACE.{}.csv'.format(dim), 'w') as fp:  
        for post_id in df[df['user_id'].isin([2,3,4,5,6])].post_id.unique():
            post_user_votes = []
            if dim == 'sim':
                post_ids.append(post_id) 
            for user_id in [2,3,4,5,6]:
                post_user_vote = str(df[(df['post_id']==post_id) & (df['user_id']==user_id)][dim].tolist()[0])
                post_user_votes.append(post_user_vote)
            fp.write(','.join(post_user_votes)+'\n')
    !module load Java && ../repo/MACE/MACE --prefix {dim} /bigwork/nhwpziet/appropriateness-style-transfer/data/style-transfer/study.annotation.evaluation.MACE.{dim}.csv
    !module load Java && ../repo/MACE/MACE --distribution --prefix distribution.{dim} /bigwork/nhwpziet/appropriateness-style-transfer/data/style-transfer/study.annotation.evaluation.MACE.{dim}.csv
    
for dim in ['sim','app','fluency']:
    with open('../../data/style-transfer/study.annotation2.evaluation.MACE.{}.csv'.format(dim), 'w') as fp:  
        for post_id in df[df['user_id'].isin([7,8,9,10,11])].post_id.unique():
            post_user_votes = []
            if dim == 'sim':
                post_ids.append(post_id) 
            for user_id in [7,8,9,10,11]:
                post_user_vote = str(df[(df['post_id']==post_id) & (df['user_id']==user_id)][dim].tolist()[0])
                post_user_votes.append(post_user_vote)
            fp.write(','.join(post_user_votes)+'\n')
    !module load Java && ../repo/MACE/MACE --prefix {dim+'2'} /bigwork/nhwpziet/appropriateness-style-transfer/data/style-transfer/study.annotation2.evaluation.MACE.{dim}.csv
    !module load Java && ../repo/MACE/MACE --distribution --prefix distribution.{dim+'2'} /bigwork/nhwpziet/appropriateness-style-transfer/data/style-transfer/study.annotation2.evaluation.MACE.{dim}.csv

Module for Java, version 11.0.18 loaded
Reading CSV file '/bigwork/nhwpziet/appropriateness-style-transfer/data/style-transfer/study.annotation.evaluation.MACE.sim.csv'
....................100
....................200
....................300
....................400
....................500
....................600
..............
stats:
	672 instances,
	5 labels [5, 4, 3, 2, 1],
	5 annotators

Running Variational Bayes EM training with the following settings:
	50 iterations
	10 restarts
	smoothing = 0.002
	alpha = 0.5
	beta = 0.5

Restart 1
initial log marginal likelihood = -5302.310835727014
final log marginal likelihood = -4267.33723447076

Restart 2
initial log marginal likelihood = -5268.958187116642
final log marginal likelihood = -4267.337513347807

Restart 3
initial log marginal likelihood = -5338.670796272917
final log marginal likelihood = -4267.337888939685

Restart 4
initial log marginal likelihood = -5360.07826441742
final log marginal likelihood = -4267.336356555964

Restart 5

In [5]:
# Collect the MACE predictions: the files of the first annotator group are read
# first, then the '...2' files of the second group are appended to the same lists.
data = {'post_id': post_ids, 'sim': [], 'app': [], 'fluency': []}
for dim in ['sim','app','fluency','sim2','app2','fluency2']:
    dim_name_file = './{}.prediction'.format(dim.replace(' ','_'))
    with open(dim_name_file, 'r') as fp:
        # one integer label per line
        lines = [int(line.rstrip('\n')) for line in fp]
    # 'sim2' / 'app2' / 'fluency2' extend the column of the matching dimension
    target_dim = dim[:-1] if dim.endswith('2') else dim
    data[target_dim] += lines

In [35]:
# Sanity check: number of 'sim' labels collected from the MACE prediction files.
# It must equal len(data['post_id']) for the DataFrame construction to succeed.
len(data['sim'])

1103

In [6]:
# One row per post_id with the MACE labels for sim / app / fluency.
mace_df = pd.DataFrame(data)

In [8]:
    # Split every post_id on '_' once: field 0 becomes id_source, field 1 id_model.
    post_id_fields = mace_df['post_id'].apply(lambda post_id: post_id.split('_'))
    mace_df['id_source'] = post_id_fields.apply(lambda fields: fields[0])
    mace_df['id_model'] = post_id_fields.apply(lambda fields: fields[1])

In [10]:
    # Mean MACE label per model, rounded to two decimals.
    # numeric_only=True keeps the string columns (post_id, id_source) out of the
    # aggregation; without it pandas >= 2.0 raises a TypeError and older versions
    # emit a FutureWarning.
    df_mean = mace_df.groupby(['id_model']).mean(numeric_only=True).reset_index()
    for score_col in ['app', 'sim', 'fluency']:
        df_mean[score_col] = df_mean[score_col].apply(lambda x: round(x, 2))
    print(df_mean[['id_model', 'app', 'sim', 'fluency']])


   id_model   app   sim  fluency
0  10a-00ss  3.77  2.65     4.16
1  40a-60ss  2.70  4.75     2.89
2  50a-50ss  3.15  3.38     3.34
3  60a-40ss  3.50  2.96     3.77
4     human  3.60  3.48     3.82
5  instruct  3.22  4.17     3.40


In [14]:
# Load the four variants of the appropriateness corpus.
conservative_df = pd.read_csv(data_dir+'appropriateness-corpus/appropriateness_corpus_conservative.csv')
liberal_df = pd.read_csv(data_dir+'appropriateness-corpus/appropriateness_corpus_liberal.csv')
majority_df = pd.read_csv(data_dir+'appropriateness-corpus/appropriateness_corpus_majority.csv')
full_df = pd.read_csv(data_dir+'appropriateness-corpus/appropriateness_corpus_full.csv')

# Attach the ids from dataset_df to every variant. The assignment is purely
# positional, so it assumes each corpus file has the same row order as
# dataset_df -- TODO confirm.
for corpus_df in (conservative_df, liberal_df, majority_df, full_df):
    corpus_df['source_id'] = dataset_df['full_id'].tolist()

In [15]:
# Preview the first five rows of dataset_df ordered by 'id'.
dataset_df.sort_values('id').head(5)

Unnamed: 0,full_id,source,issue,batch,types,id,Inappropriateness,Toxic Emotions,Excessive Intensity,Emotional Deception,Missing Commitment,Missing Seriousness,Missing Openness,Missing Intelligibility,Unclear Meaning,Missing Relevance,Confusing Reasoning,Other Reasons,Detrimental Orthography,Reason Unclassified
0,arg284049,students should wear what they like and feel free about their clothes,is-the-school-uniform-a-good-or-bad-idea-,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0
1,arg339127,"people cant be forced to wear school uniforms, i mean each person has theri own wish whether they want to or dont want to wear school uniforms. I think each principal should think once again regarding the uniforms",is-the-school-uniform-a-good-or-bad-idea-,0,0,1,1,0,0,0,1,0,1,1,0,0,1,1,0,1
2,412,"That form of argument degrades this forum, and will cause the arguments to fall to the lowest common denominator. <br/> This word, ""indisputable"", I do not think it means what you think it does. I dispute your claim from personal experience, if nothing else. That makes it disputable. I think Yahoo! has chat rooms more attuned to your style of debate. Check them out.",firefox-vs-internet-explorer,0,0,2,1,1,0,1,1,1,1,1,1,1,0,0,0,0
3,arg33226,I wouldnt turn her in becuase she is my wife. She made a mistake that we can get over it. If she trusted me by telling me what she did then I couldn't do that to her.,if-your-spouse-committed-murder-and-he-or-she-confided-in-you-would-you-turn-them-in-,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,arg33285,No I wouldn't turn in my spouse. Just because the girl that i married killed someone doesn't make them a terrible person. People make mistakes.,if-your-spouse-committed-murder-and-he-or-she-confided-in-you-would-you-turn-them-in-,0,0,4,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [16]:
# Preview the first five rows of the conservative corpus ordered by 'post_id',
# for a visual comparison with the dataset_df preview.
conservative_df.sort_values('post_id').head(5)

Unnamed: 0,post_id,source_dataset,issue,post_text,Inappropriateness,Toxic Emotions,Excessive Intensity,Emotional Deception,Missing Commitment,Missing Seriousness,Missing Openness,Missing Intelligibility,Unclear Meaning,Missing Relevance,Confusing Reasoning,Other Reasons,Detrimental Orthography,Reason Unclassified,source_id
0,0,0,Is the school uniform a good or bad idea:,students should wear what they like and feel free about their clothes,1,0,0,0,0,False,False,1,0,0,1,0,0,0,arg284049
1,1,0,Is the school uniform a good or bad idea:,"people cant be forced to wear school uniforms, i mean each person has theri own wish whether they want to or dont want to wear school uniforms. I think each principal should think once again regarding the uniforms",1,0,0,0,1,False,True,1,0,0,1,1,0,1,arg339127
2,2,0,Firefox vs internet explorer:,"That form of argument degrades this forum, and will cause the arguments to fall to the lowest common denominator.\r\nThis word, ""indisputable"", I do not think it means what you think it does. I dispute your claim from personal experience, if nothing else. That makes it disputable. I think Yahoo! has chat rooms more attuned to your style of debate. Check them out.",1,1,0,1,1,True,True,1,1,1,0,0,0,0,412
3,3,0,If your spouse committed murder and he or she confided in you would you turn them in:,I wouldnt turn her in becuase she is my wife. She made a mistake that we can get over it. If she trusted me by telling me what she did then I couldn't do that to her.,0,0,0,0,0,False,False,0,0,0,0,0,0,0,arg33226
4,4,0,If your spouse committed murder and he or she confided in you would you turn them in:,No I wouldn't turn in my spouse. Just because the girl that i married killed someone doesn't make them a terrible person. People make mistakes.,1,0,0,0,1,False,True,0,0,0,0,0,0,0,arg33285


In [21]:
# Krippendorff's alpha (nominal) between the labels in dataset_df and the
# conservative corpus, one value per dimension. The two label lists are paired
# by position after sorting dataset_df by 'id' and the corpus by 'post_id'.
dataset_sorted = dataset_df.sort_values('id')
corpus_sorted = conservative_df.sort_values('post_id')

corr_dict = dict.fromkeys(dim_names)
for dim_name in dim_names:
    x1 = list(map(int, dataset_sorted[dim_name].tolist()))
    x2 = list(map(int, corpus_sorted[dim_name].tolist()))
    krippendorffs_alpha = krippendorff.alpha(
        reliability_data=[x1, x2],
        level_of_measurement="nominal",
    )
    corr_dict[dim_name] = np.round(krippendorffs_alpha, 2)

corr_dict

{'Inappropriateness': 0.95,
 'Toxic Emotions': 1.0,
 'Excessive Intensity': 1.0,
 'Emotional Deception': 1.0,
 'Missing Commitment': 1.0,
 'Missing Seriousness': 1.0,
 'Missing Openness': 0.96,
 'Missing Intelligibility': 1.0,
 'Unclear Meaning': 1.0,
 'Missing Relevance': 1.0,
 'Confusing Reasoning': 0.95,
 'Other Reasons': 1.0,
 'Detrimental Orthography': 1.0,
 'Reason Unclassified': 1.0}

In [20]:
# Krippendorff's alpha (nominal) between the labels in dataset_df and the
# liberal corpus, one value per dimension. The two label lists are paired
# by position after sorting dataset_df by 'id' and the corpus by 'post_id'.
dataset_sorted = dataset_df.sort_values('id')
corpus_sorted = liberal_df.sort_values('post_id')

corr_dict = dict.fromkeys(dim_names)
for dim_name in dim_names:
    x1 = list(map(int, dataset_sorted[dim_name].tolist()))
    x2 = list(map(int, corpus_sorted[dim_name].tolist()))
    krippendorffs_alpha = krippendorff.alpha(
        reliability_data=[x1, x2],
        level_of_measurement="nominal",
    )
    corr_dict[dim_name] = np.round(krippendorffs_alpha, 2)

corr_dict

{'Inappropriateness': 0.16,
 'Toxic Emotions': 0.14,
 'Excessive Intensity': -0.08,
 'Emotional Deception': 0.05,
 'Missing Commitment': -0.03,
 'Missing Seriousness': 0.27,
 'Missing Openness': -0.12,
 'Missing Intelligibility': -0.03,
 'Unclear Meaning': -0.07,
 'Missing Relevance': -0.04,
 'Confusing Reasoning': -0.04,
 'Other Reasons': 0.08,
 'Detrimental Orthography': 0.13,
 'Reason Unclassified': -0.01}

In [18]:
# Krippendorff's alpha (nominal) between the labels in dataset_df and the
# majority corpus, one value per dimension. The two label lists are paired
# by position after sorting dataset_df by 'id' and the corpus by 'post_id'.
dataset_sorted = dataset_df.sort_values('id')
corpus_sorted = majority_df.sort_values('post_id')

corr_dict = dict.fromkeys(dim_names)
for dim_name in dim_names:
    x1 = list(map(int, dataset_sorted[dim_name].tolist()))
    x2 = list(map(int, corpus_sorted[dim_name].tolist()))
    krippendorffs_alpha = krippendorff.alpha(
        reliability_data=[x1, x2],
        level_of_measurement="nominal",
    )
    corr_dict[dim_name] = np.round(krippendorffs_alpha, 2)

corr_dict

{'Inappropriateness': 0.54,
 'Toxic Emotions': 0.45,
 'Excessive Intensity': 0.3,
 'Emotional Deception': 0.41,
 'Missing Commitment': 0.3,
 'Missing Seriousness': 0.54,
 'Missing Openness': 0.22,
 'Missing Intelligibility': 0.41,
 'Unclear Meaning': 0.19,
 'Missing Relevance': 0.22,
 'Confusing Reasoning': 0.19,
 'Other Reasons': 0.31,
 'Detrimental Orthography': 0.42,
 'Reason Unclassified': -0.01}

In [19]:
# Krippendorff's alpha (nominal) between the labels in dataset_df and the
# full corpus, one value per dimension. The two label lists are paired
# by position after sorting dataset_df by 'id' and the corpus by 'post_id'.
# NOTE(review): the stored output of this cell is identical to the liberal
# corpus result -- verify that the two corpus files really differ.
dataset_sorted = dataset_df.sort_values('id')
corpus_sorted = full_df.sort_values('post_id')

corr_dict = dict.fromkeys(dim_names)
for dim_name in dim_names:
    x1 = list(map(int, dataset_sorted[dim_name].tolist()))
    x2 = list(map(int, corpus_sorted[dim_name].tolist()))
    krippendorffs_alpha = krippendorff.alpha(
        reliability_data=[x1, x2],
        level_of_measurement="nominal",
    )
    corr_dict[dim_name] = np.round(krippendorffs_alpha, 2)

corr_dict

{'Inappropriateness': 0.16,
 'Toxic Emotions': 0.14,
 'Excessive Intensity': -0.08,
 'Emotional Deception': 0.05,
 'Missing Commitment': -0.03,
 'Missing Seriousness': 0.27,
 'Missing Openness': -0.12,
 'Missing Intelligibility': -0.03,
 'Unclear Meaning': -0.07,
 'Missing Relevance': -0.04,
 'Confusing Reasoning': -0.04,
 'Other Reasons': 0.08,
 'Detrimental Orthography': 0.13,
 'Reason Unclassified': -0.01}