In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import collections
import scipy.stats

import warnings
# Suppress every warning for the rest of the session. Note that this also hides
# deprecation / future-behaviour warnings raised by numpy, pandas and scipy.
warnings.filterwarnings('ignore')

In [2]:
from ranking import RankingMeasures

In [11]:
# Datasets grouped by category; each name is also a sub-directory name used
# when the data and result files are loaded.
data_keys = {
    'idea': ['bike', 'cheat', 'meeting', 'night', 'visitor'],
    'design': ['ai_character', 'olympic'],
}
data_cats = ['idea', 'design']

# Flat list of all datasets, in category order ('idea' first, then 'design').
data_keys_arr = [key for cat in data_cats for key in data_keys[cat]]

# One CrowDEA variant per value in d_arr (named 'crowdea_<d>'), followed by
# the two baselines.
d_arr = [2, 3]
methods = ['crowdea_{}'.format(d) for d in d_arr]
methods.extend(['crowdbt', 'bt'])

In [12]:
data_dir = Path('../data/')
result_dir = Path('../result/')

# Result-file name templates of the two baselines; '{}' receives the swept
# parameter value (beta for BT, lambda for CrowdBT).
baseline_file_patterns = {'bt': 'bt_x_beta{}.dat',
                          'crowdbt': 'crowdbt_x_lambda{}.dat'}
baseline_params = [0.001, 0.01, 0.1]

# One record per (dataset, method, viewpoint[, param]) for each measure.
ndcg5_data = []
ndcg10_data = []


def _record_ndcg(p, p_truth, data_key, method, viewpoint, param=None):
    """Append the nDCG@5 and nDCG@10 of `p` against `p_truth` to the record lists.

    Parameters
    ----------
    p : array-like
        Predicted scores, one per item.
    p_truth : array-like
        Ground-truth scores of the same items for one viewpoint.
    data_key, method, viewpoint : str
        Identifiers stored alongside the score.
    param : float, optional
        Parameter value of the baseline run; the 'param' key is only stored
        when it is given (CrowDEA records have none).
    """
    rm = RankingMeasures(p, p_truth)
    for k, records in ((5, ndcg5_data), (10, ndcg10_data)):
        record = {'data_key': data_key, 'ndcg%d' % k: rm.nDCG(k),
                  'method': method, 'viewpoint': viewpoint}
        if param is not None:
            record['param'] = param
        records.append(record)


for data_cat in data_cats:
    for data_key in data_keys[data_cat]:
        truth_df = pd.read_csv(data_dir / data_cat / data_key / 'truth.tsv', index_col=0, sep='\t')
        result_dir_sub = result_dir / data_cat / data_key

        # Only the truth columns whose name starts with '*' are evaluated.
        viewpoints = [col for col in truth_df.columns if col.startswith('*')]

        for method in methods:
            if method in baseline_file_patterns:
                # Baselines: one score vector per parameter value, compared
                # directly with every viewpoint.
                for param in baseline_params:
                    file_name = baseline_file_patterns[method].format(param)
                    p = np.loadtxt(result_dir_sub / file_name)
                    for viewpoint in viewpoints:
                        p_truth = truth_df.loc[:, viewpoint].values
                        _record_ndcg(p, p_truth, data_key, method, viewpoint, param)
            else:
                # CrowDEA: the method name ends in '_<d>', which selects the file.
                d = int(method.split('_')[-1])
                x = np.loadtxt(result_dir_sub / 'crowdea_x_alpha0.1_d{}.dat'.format(d))

                for viewpoint in viewpoints:
                    p_truth = truth_df.loc[:, viewpoint].values
                    # Least-squares weights that map the columns of x onto this
                    # viewpoint's ground truth. rcond is passed explicitly so
                    # the singular-value cut-off does not depend on the
                    # installed numpy version.
                    v_opt = np.linalg.lstsq(x, p_truth, rcond=None)[0]
                    p = x.dot(v_opt)
                    _record_ndcg(p, p_truth, data_key, method, viewpoint)

score_dfs = {'ndcg5': pd.DataFrame(ndcg5_data),
             'ndcg10': pd.DataFrame(ndcg10_data)}

In [13]:
# Select the best parameter of BT and CrowdBT for each (measure, data_key),
# then keep only the baseline rows that were scored with that parameter.
#
# NOTE: this cell replaces score_dfs[measure] with the filtered frame (which no
# longer has the 'param' column), so re-run the cell that builds score_dfs
# before re-running this one.
best_params = collections.defaultdict(dict)
baseline_methods = ['bt', 'crowdbt']

# Seeded generator so that the random tie-breaking below is reproducible.
tie_break_rng = np.random.RandomState(0)

for measure in score_dfs.keys():
    score_df = score_dfs[measure]
    for method in baseline_methods:
        # Average only the score column: the frame also holds the string column
        # 'viewpoint', which cannot be averaged (TypeError in pandas >= 2.0).
        avg_scores = (score_df.loc[score_df.method == method]
                      .groupby(['data_key', 'method', 'param'])[measure]
                      .mean())
        for data_key in data_keys_arr:
            by_param = avg_scores.loc[(data_key, method)]
            # Shuffle first: idxmax returns the first maximum, so ties are
            # broken by the (seeded) shuffle order.
            by_param = by_param.sample(frac=1, random_state=tie_break_rng)
            best_params[measure][(method, data_key)] = by_param.idxmax()

for measure in list(score_dfs.keys()):
    score_df = score_dfs[measure]
    is_baseline = score_df['method'].isin(baseline_methods)
    # Best parameter for each row's (method, data_key); NaN for CrowDEA rows.
    best_for_row = pd.Series(
        [best_params[measure].get(key)
         for key in zip(score_df['method'], score_df['data_key'])],
        index=score_df.index, dtype=float)
    keep = ~is_baseline | (score_df['param'] == best_for_row)
    # Build a new frame instead of mutating the input in place; rows with a
    # missing value in any remaining column are dropped as before.
    score_dfs[measure] = score_df.loc[keep].drop(columns='param').dropna()

In [14]:
# Mean score per (data_key, method) for every measure. Only the score column is
# aggregated: the frames also contain the string column 'viewpoint', and
# groupby(...).mean() raises TypeError on non-numeric columns in pandas >= 2.0.
avg_dfs = {
    measure: score_df.groupby(['data_key', 'method'])[[measure]].mean()
    for measure, score_df in score_dfs.items()
}

In [15]:
# "win" is True if CrowDEA performs better than BT and CrowdBT
# The flag is stored in a 'win' column of avg_dfs[measure], and only for the
# CrowDEA rows; ties with a baseline count as a win (>=).
for measure in score_dfs.keys():
    mean_scores = avg_dfs[measure]
    for data_key in mean_scores.index.levels[0]:
        baseline_scores = [mean_scores.loc[(data_key, name), measure]
                           for name in ('bt', 'crowdbt')]
        for method in [m for m in methods if 'crowdea' in m]:
            own_score = mean_scores.loc[(data_key, method), measure]
            beats_all = all(own_score >= other for other in baseline_scores)
            mean_scores.loc[(data_key, method), 'win'] = bool(beats_all)

In [16]:
# "stats_win" is True if CrowDEA is the statistically significant (p < 0.05) winner by the Wilcoxon signed rank test 
# The flag is stored in a 'stats_win' column of avg_dfs[measure]. A CrowDEA
# variant gets True only if it is already a 'win' and the Wilcoxon signed-rank
# test against each of the two baselines yields p <= 0.05 (inclusive).
for measure in score_dfs.keys():
    mean_scores = avg_dfs[measure]
    per_view = score_dfs[measure]
    for data_key in mean_scores.index.levels[0]:
        in_dataset = per_view.data_key == data_key
        for method in [m for m in methods if 'crowdea' in m]:
            if not mean_scores.loc[(data_key, method), 'win']:
                # Not even a winner on average: skip the tests.
                stats_win = False
            else:
                own = per_view.loc[(per_view.method == method) & in_dataset, measure].values
                # The test is paired, so both score vectors must list the
                # viewpoints in the same order.
                p_values = []
                for baseline in ('bt', 'crowdbt'):
                    other = per_view.loc[(per_view.method == baseline) & in_dataset, measure].values
                    p_values.append(scipy.stats.wilcoxon(own, other)[1])
                stats_win = all(p_value <= 0.05 for p_value in p_values)
            mean_scores.loc[(data_key, method), 'stats_win'] = stats_win

In [17]:
# Reshape the per-(data_key, method) summaries into dataset x method tables,
# one table per measure and per quantity (mean score, 'win', 'stats_win').
result_dfs, win_dfs, stats_win_dfs = {}, {}, {}

for measure, summary in avg_dfs.items():
    targets = ((measure, result_dfs),
               ('win', win_dfs),
               ('stats_win', stats_win_dfs))
    for column, store in targets:
        table = pd.DataFrame(index=data_keys_arr, columns=methods)
        for data_key in data_keys_arr:
            for method in methods:
                table.loc[data_key, method] = summary.loc[(data_key, method), column]
        store[measure] = table

---

# nDCG@5

In [18]:
# Mean nDCG@5 per dataset (rows) and method (columns).
result_dfs['ndcg5']

Unnamed: 0,crowdea_2,crowdea_3,crowdbt,bt
bike,0.790658,0.805714,0.779258,0.77152
cheat,0.844024,0.85892,0.766565,0.767964
meeting,0.837196,0.847141,0.814916,0.816918
night,0.847652,0.850171,0.789876,0.79048
visitor,0.915786,0.899715,0.824591,0.818361
ai_character,0.901221,0.913235,0.911642,0.902411
olympic,0.909284,0.921099,0.925774,0.925774


In [19]:
# 'win' flags for nDCG@5. The flag is only computed for the CrowDEA methods,
# so the baseline columns are empty.
print('Does CrowDEA perform better than the baselines?')
win_dfs['ndcg5']

Does CrowDEA perform better than the baselines?


Unnamed: 0,crowdea_2,crowdea_3,crowdbt,bt
bike,True,True,,
cheat,True,True,,
meeting,True,True,,
night,True,True,,
visitor,True,True,,
ai_character,False,True,,
olympic,False,False,,


In [20]:
# 'stats_win' flags for nDCG@5 (Wilcoxon signed-rank test against both
# baselines); only computed for the CrowDEA methods.
print('Is CrowDEA the statistically significant winner?')
stats_win_dfs['ndcg5'] # statistical winner or not

Is CrowDEA the statistically significant winner?


Unnamed: 0,crowdea_2,crowdea_3,crowdbt,bt
bike,False,True,,
cheat,True,True,,
meeting,True,False,,
night,True,True,,
visitor,True,True,,
ai_character,False,False,,
olympic,False,False,,


# nDCG@10

In [21]:
# Mean nDCG@10 per dataset (rows) and method (columns).
result_dfs['ndcg10']

Unnamed: 0,crowdea_2,crowdea_3,crowdbt,bt
bike,0.813167,0.827508,0.800301,0.79844
cheat,0.85344,0.881363,0.790706,0.795173
meeting,0.851563,0.866054,0.825414,0.823989
night,0.859483,0.869703,0.80832,0.808924
visitor,0.925417,0.91418,0.834817,0.832233
ai_character,0.919311,0.924941,0.920829,0.911494
olympic,0.923439,0.929849,0.936815,0.936976


In [22]:
# 'win' flags for nDCG@10. The flag is only computed for the CrowDEA methods,
# so the baseline columns are empty.
print('Does CrowDEA perform better than the baselines?')
win_dfs['ndcg10']

Does CrowDEA perform better than the baselines?


Unnamed: 0,crowdea_2,crowdea_3,crowdbt,bt
bike,True,True,,
cheat,True,True,,
meeting,True,True,,
night,True,True,,
visitor,True,True,,
ai_character,False,True,,
olympic,False,False,,


In [23]:
# 'stats_win' flags for nDCG@10 (Wilcoxon signed-rank test against both
# baselines); only computed for the CrowDEA methods.
print('Is CrowDEA the statistically significant winner?')
stats_win_dfs['ndcg10']

Is CrowDEA the statistically significant winner?


Unnamed: 0,crowdea_2,crowdea_3,crowdbt,bt
bike,False,True,,
cheat,True,True,,
meeting,True,True,,
night,False,True,,
visitor,True,True,,
ai_character,False,False,,
olympic,False,False,,
