# Compute human-algorithm correlation

* 사람실험 결과와 알고리즘 출력 간의 correlation을 계산한다.
* 데이터 자체에 대한 분석은 별도의 노트북인 `human_label.ipynb` 를 참고한다.

In [3]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import matplotlib.pyplot as plt

In [4]:
# Load the results table and the triplet table; triplets are keyed by their id
# so that later cells can look them up with `df_triplet.loc[triplet_id]`.
df_result = pd.read_csv('results_v1_201911141704_final.csv')
df_triplet = pd.read_csv('triplets_v1_201911102026.csv').set_index('triplet_id')

## 1. Basic Stats

In [5]:
'''users and the numers of their labels'''
# Non-null `id` count per user, largest first (displayed as the cell result).
(
    df_result
    .groupby('user_id')['id']
    .count()
    .sort_values(ascending=False)
)

user_id
cjhan                    9017
erin122                  4908
epsilon.kim              3002
wookee3                  2422
woong.ssang              2263
edwin.kang               1283
yoomin618                1075
hexa.ai                  1039
JRW                      1013
chico2121                 519
jinhyun.b                 474
eunjin                    392
IceAmericano              345
LSW                       298
Julyeon Seo               274
Dray.Choe                 255
Jinyeong                  251
hyunji                    250
cnuh                      211
jihoon.lee                205
yejun                     200
robert.p                  136
hihello2                  102
Jonggwon                   87
motherfathergentleman      76
eos73                      73
jay.mini                   50
junojuno                   45
sungbin.lim                31
ian.theman                 31
hans                       28
kwon.g                     25
scarlett.heo               21
dh

In [6]:
'''filter users with low answer count (<10)'''
filter_count = 10
# Number of (non-null) answers recorded for each user.
user_count = df_result.groupby('user_id')['id'].count()
# NOTE(review): the comparison is strict, so users with exactly `filter_count`
# answers are dropped as well, although the description above says "<10".
# Confirm which threshold is intended.
l_filtered_users = user_count.loc[user_count > filter_count].index.tolist()
print(l_filtered_users)
print(len(l_filtered_users))

['Dray.Choe', 'IceAmericano', 'JRW', 'Jinyeong', 'Jonggwon', 'Julyeon Seo', 'LSW', 'chico2121', 'cjhan', 'cnuh', 'dhkwak', 'edwin.kang', 'eos73', 'epsilon.kim', 'erin122', 'eunjin', 'hans', 'hexa.ai', 'hihello', 'hihello2', 'hyunji', 'ian.theman', 'jay.mini', 'jihoon.lee', 'jinhyun.b', 'junojuno', 'kwon.g', 'motherfathergentleman', 'robert.p', 'scarlett.heo', 'sungbin.lim', 'wookee3', 'woong.ssang', 'yejun', 'yoomin618']
35


## 2. Inter-user agreement score for each user

### Agreement metric from (Gordo and Larlus, 2017)
For each triplet,

$o_1, o_2$: the number of times the first (resp. second) image was chosen.  
$o_3$: the number of times people did not pick either of the two images.  

Agreement score is computed as 

$$
s = (\frac{o_1+o_2 - 1}{o_1 + o_2 + o_3 - 1}) (\frac{o_i - 1}{o_1 + o_2 - 1})
$$
where $o_i$, $i\in\{1,2\}$, is the vote count of the image chosen by the user.

## Our agreement metric
Let  
$o_1, o_2$ : the number of times the first (resp. second) image was chosen.  
$p$ : the number of times both images were chosen.  
$q$ : the number of times neither image was chosen.


If a person chooses the first or the second image,  
$$
s = \frac{o_i + 0.5 p - 1}{o_1 + o_2 + p + q - 1}
$$
where $o_i$, $i\in\{1,2\}$, is the vote count of the image chosen by the user.

If a person chooses the "both" option,
$$
s = \frac{0.5o_1 + 0.5o_2 + p - 1}{o_1 + o_2 + p + q - 1}
$$

If a person chooses the "neither" option,
$$
s = 0
% s = \frac{q - 1}{o_1 + o_2 + p + q - 1}
$$

For each person, the score is averaged over all the triplets he/she labeled.

In [7]:
'''prepare answer_cnt'''
# One row per triplet, one column per answer code, holding how many times that
# answer was given (0 where an answer never occurred for the triplet).
answer_cnt = (
    df_result[['id', 'triplet_id', 'answer']]
    .pivot_table(index='triplet_id', columns='answer', aggfunc='count')
    .fillna(0)
    .droplevel(0, axis=1)  # drop the outer 'id' level left by the pivot
    .rename(columns={0: 'o1', 1: 'o2', 2: 'both', 3: 'neither'})
)
# o3 pools every vote that did not single out one image.
answer_cnt['o3'] = answer_cnt['both'] + answer_cnt['neither']
answer_cnt.head()

answer,o1,o2,both,neither,o3
triplet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1.0,2.0,0.0,0.0,0.0
2,0.0,2.0,2.0,0.0,2.0
3,4.0,0.0,1.0,0.0,1.0
4,0.0,3.0,0.0,0.0,0.0
5,5.0,0.0,0.0,0.0,0.0


In [8]:
'''user agreement'''
# For every retained user, average a leave-one-out agreement score over the
# triplets that user labelled, then report the mean and standard deviation of
# those per-user averages.
user_agree = {'user': [], 'score': []}
for user in l_filtered_users:
    print(user)
    df_answer_person = df_result[df_result['user_id'] == user].sort_values('triplet_id')
    l_score = []
    for _, row in df_answer_person.iterrows():
        choice = row['answer']
        # Only "first" (0), "second" (1) and "both" (2) answers are scored.
        # NOTE(review): the notebook text describes a "neither" answer as
        # scoring 0, but such answers are skipped here and therefore do not
        # enter the user's mean -- confirm which behaviour is intended.
        if choice not in (0, 1, 2):
            continue

        answer = answer_cnt.loc[row['triplet_id']]
        o1, o2, p, q = answer['o1'], answer['o2'], answer['both'], answer['neither']

        # Require at least two votes for a specific image on this triplet.
        if o1 + o2 < 2:
            continue

        # The "- 1" terms remove the user's own vote from the counts.
        n_others = o1 + o2 + p + q - 1
        if choice == 0:
            s = (o1 + 0.5 * p - 1) / n_others
        elif choice == 1:
            s = (o2 + 0.5 * p - 1) / n_others
        else:  # choice == 2, i.e. "both"
            s = (0.5 * o1 + 0.5 * o2 + p - 1) / n_others

        l_score.append(s)

    # A user without any scorable answer gets NaN (np.mean([]) would return
    # the same value but emit a RuntimeWarning).
    score = np.mean(l_score) if l_score else np.nan
    user_agree['user'].append(user)
    user_agree['score'].append(score)
user_agree = pd.DataFrame(user_agree)
# numeric_only=True keeps the string `user` column out of the reduction;
# recent pandas versions raise TypeError for it instead of dropping it.
print(user_agree.mean(numeric_only=True), user_agree.std(numeric_only=True))

Dray.Choe
IceAmericano
JRW
Jinyeong
Jonggwon
Julyeon Seo
LSW
chico2121
cjhan
cnuh
dhkwak
edwin.kang
eos73
epsilon.kim
erin122
eunjin
hans
hexa.ai
hihello
hihello2
hyunji
ian.theman
jay.mini
jihoon.lee
jinhyun.b
junojuno
kwon.g
motherfathergentleman
robert.p
scarlett.heo
sungbin.lim
wookee3
woong.ssang
yejun
yoomin618
score    0.690198
dtype: float64 score    0.040811
dtype: float64


## Agreement between algorithm and human

Let  
$o_1, o_2$ : the number of times the first (resp. second) image was chosen.  
$p$ : the number of times the "both" icon was chosen.  
$q$ : the number of times the "neither" icon was chosen.


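If an algorithm chooses the first or the second image, it receives the following agreement score  
$$
s = \frac{o_i + 0.5 p}{o_1 + o_2 + p + q}
$$
where $o_i$, $i\in\{1,2\}$, is the vote count of the image chosen by the algorithm. Unlike the inter-user score, nothing is subtracted here, because the algorithm's choice is not one of the counted human votes.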

In [11]:
def read_similarity_data(algo, df_triplet,
                         result_path='/data/project/rw/viewer_CBIR/viewer/vg_coco_results/'):
    '''Pre-fetch the similarity scores an algorithm assigned to every triplet target.

    For each distinct ``query_id`` in ``df_triplet`` the file
    ``<result_path>/<algo>/<query_id>.tsv`` is read as a header-less,
    tab-separated table whose first column is an image id and whose second
    column is a similarity score.

    Args:
        algo: name of the sub-directory of ``result_path`` holding the
            algorithm's result files.
        df_triplet: DataFrame with ``query_id``, ``target_id1`` and
            ``target_id2`` columns.
        result_path: root directory of the result files. Defaults to the
            location this notebook was originally run against.

    Returns:
        dict mapping ``query_id`` to ``{target_id: similarity}``. Targets
        that do not appear in the result file map to NaN.
    '''
    d_sim = {}
    for query_id, by_qid in df_triplet.groupby('query_id'):
        target_ids = [int(s) for s in by_qid['target_id1']]
        target_ids += [int(s) for s in by_qid['target_id2']]

        data = pd.read_csv(os.path.join(result_path, algo, f'{query_id}.tsv'),
                           delimiter='\t', header=None)
        data = data.rename(columns={0: 'id', 1: 'sim'}).set_index('id')
        # reindex (rather than .loc) so that ids missing from the file become
        # NaN instead of raising.
        d_sim[query_id] = data['sim'].reindex(target_ids).to_dict()
    return d_sim

In [12]:
def read_result(algo, query_id, l_target_ids,
                result_path='/data/project/rw/viewer_CBIR/viewer/vg_coco_results/'):
    '''Read one query's result file and return the scores of the given targets.

    The file ``<result_path>/<algo>/<query_id>.tsv`` is read as a header-less,
    tab-separated table whose first column is an image id and whose second
    column is a similarity score.

    Args:
        algo: name of the sub-directory of ``result_path`` holding the
            algorithm's result files.
        query_id: id of the query; used as the file stem.
        l_target_ids: ids to look up; each one is converted with ``int``.
        result_path: root directory of the result files. Defaults to the
            location this notebook was originally run against.

    Returns:
        dict mapping each requested id to its similarity score.

    Note:
        The lookup uses ``.loc`` with a list of labels, so ids absent from
        the file are not tolerated by recent pandas versions (``KeyError``).
    '''
    data = pd.read_csv(os.path.join(result_path, algo, f'{query_id}.tsv'),
                       delimiter='\t', header=None)
    data = data.rename(columns={0: 'id', 1: 'sim'}).set_index('id')
    wanted = [int(s) for s in l_target_ids]
    return data.loc[wanted].to_dict()['sim']

In [25]:
# Algorithms to evaluate; each name is a result sub-directory.
l_algorithms = ['vg_han_tmb_bin_epoch_19']
# Alternative set:
# l_algorithms = ['v1_resnetweight05_epoch_10_rerank', 'gen_v1_resnet07_epoch_99_rerank', 'gen_v1_resnet07_epoch_99']
algo_scores = {'triplet_id': answer_cnt.index.tolist()}
# ResNet similarities are the fallback when an algorithm scored neither target.
d_resnet_sim = read_similarity_data('resnet', df_triplet)
for algo in tqdm(l_algorithms):
    d_sim_result = read_similarity_data(algo, df_triplet)
    l_score = []
    for triplet_id, row in answer_cnt.iterrows():
        o1, o2, o3, p, q = (row[col] for col in ('o1', 'o2', 'o3', 'both', 'neither'))

        # Triplets with fewer than two single-image votes get a NaN score.
        if o1 + o2 < 2:
            l_score.append(np.nan)
            continue

        # Look up the algorithm's similarity for both candidate images.
        triplet = df_triplet.loc[triplet_id]
        query_id = triplet['query_id']
        target1, target2 = triplet['target_id1'], triplet['target_id2']
        sim1 = d_sim_result[query_id][target1]
        sim2 = d_sim_result[query_id][target2]

        # Missing similarities (NaN): if both are missing fall back to the
        # ResNet scores, otherwise the missing side loses the comparison.
        missing1, missing2 = np.isnan(sim1), np.isnan(sim2)
        if missing1 and missing2:
            sim1 = d_resnet_sim[query_id][target1]
            sim2 = d_resnet_sim[query_id][target2]
        elif missing1:
            sim1 = - np.inf
        elif missing2:
            sim2 = - np.inf

        # The second image is taken unless the first is strictly more similar.
        oi = o1 if sim1 > sim2 else o2

        # No "- 1" terms here, unlike the inter-user score computed earlier.
        l_score.append((oi + 0.5 * p) / (o1 + o2 + p + q))
    algo_scores[algo] = l_score
algo_scores = pd.DataFrame(algo_scores)


100%|██████████| 1/1 [00:06<00:00,  6.52s/it]


In [26]:
# Column-wise mean of the score table. With pandas' default skipna behaviour the
# NaN scores recorded for triplets with too few votes are ignored.
# The `triplet_id` entry is only a by-product of averaging every column.
algo_scores.mean()

triplet_id                 4995.389622
vg_han_tmb_bin_epoch_19       0.581261
dtype: float64

In [24]:
# Column-wise mean of whatever `algo_scores` currently holds (NaN scores are
# skipped by pandas' default); the `triplet_id` entry carries no meaning.
algo_scores.mean()

triplet_id    4995.389622
boo_tfidf        0.593685
dtype: float64

In [102]:
# Column-wise mean of whatever `algo_scores` currently holds (NaN scores are
# skipped by pandas' default); the `triplet_id` entry carries no meaning.
algo_scores.mean()

triplet_id                 4995.389622
gwl_rerank                    0.580386
gwl_gen_wordonly_rerank       0.590975
dtype: float64