# Compute human-algorithm correlation

* Computes agreement between human annotators

In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import matplotlib.pyplot as plt

In [2]:
'''load the annotation results and the triplet definitions'''
df_result = (
    pd.read_csv('anon_results.csv')
    .rename_axis('id')   # name the default row index 'id' ...
    .reset_index()       # ... and turn it into a regular column
)
df_triplet = pd.read_csv('triplets.csv').set_index('triplet_id')

## 1. Basic Stats

In [3]:
'''number of labels given by each user, most active users first'''
(
    df_result
    .groupby('user_id')['id']
    .count()
    .sort_values(ascending=False)
)

user_id
user_5     1753
user_8     1618
user_1     1518
user_0     1239
user_22    1029
user_38    1007
user_15     604
user_37     407
user_19     190
user_34     188
user_16     170
user_40     109
user_41     100
user_35      88
user_25      86
user_6       72
user_2       68
user_9       67
user_4       46
user_7       45
user_3       40
user_20      37
user_18      35
user_11      35
user_28      34
user_24      21
user_13      20
user_26      14
user_30      11
user_36       9
user_39       8
user_21       8
user_32       7
user_23       6
user_10       5
user_27       5
user_33       3
user_12       3
user_29       2
user_17       2
user_31       2
user_14       1
Name: id, dtype: int64

In [4]:
'''filter users with low answer count (<10)'''
# Minimum number of answers a user must have given to be kept.
filter_count = 10
user_count = df_result.groupby('user_id')['id'].count()
# Keep users with at least `filter_count` answers. The comparison is inclusive
# so that only users with fewer than `filter_count` answers are dropped, as the
# cell header states (a strict `>` would also drop users with exactly 10).
l_filtered_users = user_count[user_count >= filter_count].index.to_list()
print(l_filtered_users)
print(len(l_filtered_users))

['user_0', 'user_1', 'user_11', 'user_13', 'user_15', 'user_16', 'user_18', 'user_19', 'user_2', 'user_20', 'user_22', 'user_24', 'user_25', 'user_26', 'user_28', 'user_3', 'user_30', 'user_34', 'user_35', 'user_37', 'user_38', 'user_4', 'user_40', 'user_41', 'user_5', 'user_6', 'user_7', 'user_8', 'user_9']
29


In [5]:
# number of rows in the triplet table
df_triplet.shape[0]

1752

## 2. Inter-user agreement score for each user

### Agreement metric from (Gordo and Larlus, 2017)
For each triplet,

$o_1, o_2$: the number of times the first (resp. second) image was chosen.  
$o_3$: the number of times people did not pick either of the two images.  

Agreement score is computed as 

$$
s = (\frac{o_1+o_2 - 1}{o_1 + o_2 + o_3 - 1}) (\frac{o_i - 1}{o_1 + o_2 - 1})
$$
where $o_i$, $i\in\{1,2\}$, is the vote count of the image chosen by the user.

### Our agreement metric
Let  
$o_1, o_2$ : the number of times the first (resp. second) image was chosen.  
$p$ : the number of times both images were chosen.  
$q$ : the number of times neither image was chosen.


If a person chooses the first or the second image,  
$$
s = \frac{o_i + 0.5 p - 1}{o_1 + o_2 + p + q - 1}
$$
where $o_i$, $i\in\{1,2\}$, is the vote count of the image chosen by the user.

If a person chooses the "both" option,
$$
s = \frac{0.5o_1 + 0.5o_2 + p - 1}{o_1 + o_2 + p + q - 1}
$$

If a person chooses the "neither" option,
$$
s = 0
$$

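For each person, the score is averaged over all the triplets that he/she labeled. Note that the implementation below only scores triplets that received at least two first/second-image votes, and it leaves a person's "neither" answers out of the average.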

In [6]:
'''prepare answer_cnt: number of votes per answer option for every triplet'''
# Answer codes found in the results file and the column name given to each.
answer_names = {0: 'o1', 1: 'o2', 2: 'both', 3: 'neither'}
answer_cnt = (
    df_result
    .pivot_table(index='triplet_id', columns='answer', values='id', aggfunc='count')
    # Make sure all four option columns exist even if an option was never
    # selected; otherwise the 'both' / 'neither' lookups below raise KeyError.
    .reindex(columns=list(answer_names))
    .fillna(0)   # a triplet/option pair with no votes counts as 0
    .rename(columns=answer_names)
)
# o3 pools every answer that did not single out one image ("both" or "neither").
answer_cnt['o3'] = answer_cnt['both'] + answer_cnt['neither']
answer_cnt.head()

answer,o1,o2,both,neither,o3
triplet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3,5.0,0.0,1.0,0.0,1.0
4,0.0,5.0,0.0,0.0,0.0
9,7.0,0.0,0.0,0.0,0.0
14,5.0,0.0,0.0,0.0,0.0
15,4.0,1.0,1.0,0.0,1.0


In [7]:
'''user agreement: mean agreement score of every retained user'''


def _agreement_score(choice, o1, o2, p, q):
    """Return the agreement score of one answer on one triplet.

    Parameters
    ----------
    choice : answer code of the user; 0 (first image), 1 (second image) or
        2 (both). Any value other than 0 or 2 is treated as 1, so callers are
        expected to filter out other codes beforehand.
    o1, o2, p, q : number of votes the triplet received for the first image,
        the second image, "both" and "neither" respectively.

    The counts come from all answers, including the user's own, which is why 1
    is subtracted in both the numerator and the denominator.
    """
    n_other_votes = o1 + o2 + p + q - 1
    if choice == 2:
        return (0.5 * o1 + 0.5 * o2 + p - 1) / n_other_votes
    oi = o1 if choice == 0 else o2
    return (oi + 0.5 * p - 1) / n_other_votes


records = []
for user in tqdm(l_filtered_users, desc='user agreement'):
    df_answer_person = df_result[df_result['user_id'] == user].sort_values('triplet_id')
    l_score = []
    for triplet_id, choice in zip(df_answer_person['triplet_id'], df_answer_person['answer']):
        # Only first / second / both answers are scored.
        # NOTE(review): the metric write-up above gives "neither" a score of 0,
        # which would lower the averages; here such answers are excluded from
        # the average instead -- confirm which one is intended.
        if choice not in (0, 1, 2):
            continue

        votes = answer_cnt.loc[triplet_id]
        o1, o2, p, q = votes['o1'], votes['o2'], votes['both'], votes['neither']

        # Requires at least two active (first/second image) votes for the
        # triplet; this also keeps the denominator of the score >= 1.
        if o1 + o2 < 2:
            continue

        l_score.append(_agreement_score(choice, o1, o2, p, q))

    # A user without any scorable answer gets NaN (without numpy's
    # "mean of empty slice" warning).
    score = np.mean(l_score) if l_score else np.nan
    records.append({'user': user, 'score': score})

user_agree = pd.DataFrame(records, columns=['user', 'score'])
# numeric_only=True: the 'user' column holds strings, and recent pandas
# versions raise a TypeError instead of silently skipping it.
print(user_agree.mean(numeric_only=True), user_agree.std(numeric_only=True))

user_0
user_1
user_11
user_13
user_15
user_16
user_18
user_19
user_2
user_20
user_22
user_24
user_25
user_26
user_28
user_3
user_30
user_34
user_35
user_37
user_38
user_4
user_40
user_41
user_5
user_6
user_7
user_8
user_9
score    0.726913
dtype: float64 score    0.050892
dtype: float64
