In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import collections
import scipy.stats

import warnings
warnings.filterwarnings('ignore')

In [2]:
from ranking import RankingMeasures

In [3]:
# Dataset categories and, for each category, the keys of its datasets.
data_cats = ['idea', 'design']
data_keys = {
    'idea': ['bike', 'cheat', 'festival', 'meeting', 'name', 'night', 'trash', 'visitor'],
    'design': ['ai_character', 'olympic'],
}
# All dataset keys in one flat list: idea datasets first, then design datasets.
data_keys_arr = [key for cat in data_cats for key in data_keys[cat]]

# Values of d; each one yields a method name of the form 'crowdea_<d>'.
d_arr = [2, 3, 5, 10]
# Compared methods: the CrowDEA variants followed by the two baselines.
methods = ['crowdea_{}'.format(d) for d in d_arr] + ['crowdbt', 'bt']

In [6]:
data_dir = Path('../data/')
result_dir = Path('../result/')

# File-name template of each baseline's result file; '{}' receives the parameter value.
baseline_file_templates = {'bt': 'bt_x_beta{}.dat',
                           'crowdbt': 'crowdbt_x_lambda{}.dat'}

# One record per (dataset, method, viewpoint) -- plus 'param' for the baselines.
# The lists are converted to DataFrames once, after the loops.
ndcg5_data = []
ndcg10_data = []

for data_cat in data_cats:
    for data_key in data_keys[data_cat]:
        truth_df = pd.read_csv(data_dir / data_cat / data_key / 'truth.tsv', index_col=0, sep='\t')
        result_dir_sub = result_dir / data_cat / data_key

        for method in methods:
            if method in baseline_file_templates:
                # BT / CrowdBT: one result file per parameter value; the loaded scores
                # are compared with the ground truth as they are.
                for param in [0.001, 0.01, 0.1]:
                    p = np.loadtxt(result_dir_sub / baseline_file_templates[method].format(param))

                    for viewpoint in truth_df.columns:
                        # Only the columns whose name starts with '*' are evaluated.
                        if viewpoint[0] == '*':
                            p_truth = truth_df.loc[:, viewpoint].values
                            rm = RankingMeasures(p, p_truth)
                            ndcg5 = rm.nDCG(5)
                            ndcg10 = rm.nDCG(10)

                            ndcg5_data.append({'data_key': data_key, 'ndcg5': ndcg5,
                                               'method': method, 'viewpoint': viewpoint,
                                               'param': param})
                            ndcg10_data.append({'data_key': data_key, 'ndcg10': ndcg10,
                                                'method': method, 'viewpoint': viewpoint,
                                                'param': param})
            else:
                # CrowDEA: the method name is 'crowdea_<d>' and <d> selects the result file.
                d = int(method.rsplit('_', 1)[-1])
                x = np.loadtxt(result_dir_sub / 'crowdea_x_alpha0.1_d{}.dat'.format(d))

                for viewpoint in truth_df.columns:
                    # Only the columns whose name starts with '*' are evaluated.
                    if viewpoint[0] == '*':
                        p_truth = truth_df.loc[:, viewpoint].values
                        # Least-squares fit of the ground truth by a linear combination of
                        # the columns of x; the fitted values are the scores to be ranked.
                        v_opt = np.linalg.lstsq(x, p_truth)[0]
                        p = np.dot(x, v_opt)

                        rm = RankingMeasures(p, p_truth)
                        ndcg5 = rm.nDCG(5)
                        ndcg10 = rm.nDCG(10)

                        ndcg5_data.append({'data_key': data_key, 'ndcg5': ndcg5,
                                           'method': method, 'viewpoint': viewpoint})
                        ndcg10_data.append({'data_key': data_key, 'ndcg10': ndcg10,
                                            'method': method, 'viewpoint': viewpoint})

# Long-format score tables, one per measure.
score_dfs = dict(ndcg5=pd.DataFrame(ndcg5_data),
                 ndcg10=pd.DataFrame(ndcg10_data))

In [7]:
# Select the best parameter of BT and CrowdBT for each (measure, data_key) and keep only
# the baseline scores obtained with that parameter.
#
# NOTE: this cell removes the 'param' column from the frames in `score_dfs`, so it can
# only be run once per run of the cell that builds `score_dfs`.
baseline_methods = ['bt', 'crowdbt']

# Ties between equally good parameters are broken at random; a seeded generator keeps
# the selection (and every table derived from it) reproducible across runs.
tie_break_rng = np.random.RandomState(0)

best_params = collections.defaultdict(dict)

for measure in score_dfs.keys():
    score_df = score_dfs[measure]
    for method in baseline_methods:
        # Average the score column only: the frame also holds the string column
        # 'viewpoint', which makes a frame-wide mean() raise on pandas >= 2.0.
        avg_scores = (score_df.loc[score_df.method == method]
                      .groupby(['data_key', 'method', 'param'])[measure]
                      .mean())
        for data_key in data_keys_arr:
            scores_by_param = avg_scores.loc[(data_key, method)]
            # Shuffle first; idxmax() then returns the first maximum in shuffled order.
            shuffled = scores_by_param.sample(frac=1, random_state=tie_break_rng)
            best_params[measure][(method, data_key)] = shuffled.idxmax()

for measure in score_dfs.keys():
    score_df = score_dfs[measure]
    is_baseline = score_df['method'].isin(baseline_methods).values
    # Best parameter for the (method, data_key) of every row; NaN for non-baseline rows.
    chosen_param = np.array(
        [best_params[measure].get((row_method, row_key), np.nan)
         for row_method, row_key in zip(score_df['method'], score_df['data_key'])],
        dtype=float)
    keep = ~is_baseline | (score_df['param'].values == chosen_param)
    # Build a new frame instead of mutating the old one row by row; dropna() also
    # discards rows whose score itself is missing, as before.
    score_dfs[measure] = score_df.loc[keep].drop('param', axis=1).dropna()

In [8]:
# Mean score per (data_key, method) for every measure.
# Only the score column is averaged: the frames still contain the string column
# 'viewpoint', and a frame-wide mean() raises a TypeError on pandas >= 2.0.
# The double brackets keep the result a DataFrame so further columns can be added to it.
avg_dfs = {}
for measure in score_dfs.keys():
    avg_dfs[measure] = score_dfs[measure].groupby(['data_key', 'method'])[[measure]].mean()

In [9]:
# "win" is True if the mean score of a CrowDEA variant is at least as high as the mean
# scores of both baselines (BT and CrowdBT) on the same dataset.
crowdea_methods = [name for name in methods if 'crowdea' in name]

for measure in score_dfs:
    avg_df = avg_dfs[measure]
    for data_key in avg_df.index.levels[0]:
        # The baseline means do not depend on the CrowDEA variant: look them up once.
        bt = avg_df.loc[(data_key, 'bt'), measure]
        crowdbt = avg_df.loc[(data_key, 'crowdbt'), measure]
        for method in crowdea_methods:
            crowdea = avg_df.loc[(data_key, method), measure]
            win = bool(crowdea >= bt and crowdea >= crowdbt)
            # Only CrowDEA rows receive a value; baseline rows stay empty in this column.
            avg_df.loc[(data_key, method), 'win'] = win

In [10]:
# "stats_win" is True if CrowDEA wins on average ("win") and, in addition, the Wilcoxon
# signed rank tests against both BT and CrowdBT give p-values of at most 0.05.
for measure in score_dfs:
    avg_df = avg_dfs[measure]
    score_df = score_dfs[measure]

    for data_key in avg_df.index.levels[0]:
        in_dataset = score_df.data_key == data_key

        for method in [name for name in methods if 'crowdea' in name]:
            stats_win = False
            # The tests are run only when CrowDEA is already the winner on average.
            if avg_df.loc[(data_key, method), 'win']:
                # NOTE(review): the test pairs the samples by position, so this assumes the
                # rows of every method are stored in the same viewpoint order -- confirm.
                crowdea = score_df.loc[in_dataset & (score_df.method == method), measure].values
                bt = score_df.loc[in_dataset & (score_df.method == 'bt'), measure].values
                crowdbt = score_df.loc[in_dataset & (score_df.method == 'crowdbt'), measure].values

                p_bt = scipy.stats.wilcoxon(crowdea, bt).pvalue
                p_crowdbt = scipy.stats.wilcoxon(crowdea, crowdbt).pvalue
                stats_win = bool(p_crowdbt <= 0.05 and p_bt <= 0.05)

            avg_df.loc[(data_key, method), 'stats_win'] = stats_win

In [11]:
# Pivot the per-(data_key, method) statistics into dataset x method tables, one table
# per measure: mean score, "win" flag and "stats_win" flag.
result_dfs, win_dfs, stats_win_dfs = {}, {}, {}

for measure in avg_dfs.keys():
    avg_df = avg_dfs[measure]
    # Each output dictionary is filled from one column of avg_df.
    for target, column in ((result_dfs, measure),
                           (win_dfs, 'win'),
                           (stats_win_dfs, 'stats_win')):
        table = pd.DataFrame(index=data_keys_arr, columns=methods)
        for data_key in data_keys_arr:
            for method in methods:
                table.loc[data_key, method] = avg_df.loc[(data_key, method), column]
        target[measure] = table

---

# nDCG@5

In [12]:
# Mean nDCG@5 per dataset (rows) and method (columns).
result_dfs['ndcg5']

Unnamed: 0,crowdea_2,crowdea_3,crowdea_5,crowdea_10,crowdbt,bt
bike,0.793789,0.805335,0.881,0.915908,0.783778,0.776317
cheat,0.842544,0.860716,0.895187,0.926938,0.764033,0.767304
festival,0.892667,0.887558,0.908616,0.923702,0.889673,0.893922
meeting,0.833529,0.845075,0.901019,0.930292,0.815407,0.81714
name,0.887196,0.880632,0.92283,0.930801,0.882275,0.887968
night,0.850008,0.852706,0.904812,0.945671,0.790344,0.790872
trash,0.751081,0.76541,0.865976,0.927372,0.729031,0.741193
visitor,0.914111,0.901487,0.916528,0.926727,0.824421,0.820267
ai_character,0.904642,0.913784,0.919834,0.952307,0.910755,0.90652
olympic,0.905486,0.91986,0.94581,0.968021,0.920892,0.920892


In [11]:
# True where the mean nDCG@5 of a CrowDEA variant is at least that of both baselines.
# The flag is only computed for CrowDEA variants, so the baseline columns are empty.
print('Does CrowDEA perform better than the baselines?')
win_dfs['ndcg5']

Does CrowDEA perform better than the baselines?


Unnamed: 0,crowdea_2,crowdea_3,crowdea_5,crowdea_10,crowdbt,bt
bike,True,True,True,True,,
cheat,True,True,True,True,,
festival,False,False,True,True,,
meeting,True,True,True,True,,
name,False,False,True,True,,
night,True,True,True,True,,
trash,True,True,True,True,,
visitor,True,True,True,True,,
ai_character,False,True,True,True,,
olympic,False,False,True,True,,


In [12]:
# True where CrowDEA wins on average and both Wilcoxon signed rank tests (against BT
# and against CrowdBT) give a p-value of at most 0.05.
print('Is CrowDEA the statistically significant winner?')
stats_win_dfs['ndcg5'] # statistically significant winner or not (nDCG@5)

Is CrowDEA the statistically significant winner?


Unnamed: 0,crowdea_2,crowdea_3,crowdea_5,crowdea_10,crowdbt,bt
bike,False,True,True,True,,
cheat,True,True,True,True,,
festival,False,False,False,False,,
meeting,False,False,True,True,,
name,False,False,False,True,,
night,True,True,True,True,,
trash,False,False,True,True,,
visitor,True,True,True,True,,
ai_character,False,False,False,True,,
olympic,False,False,True,True,,


# nDCG@10

In [13]:
# Mean nDCG@10 per dataset (rows) and method (columns).
result_dfs['ndcg10']

Unnamed: 0,crowdea_2,crowdea_3,crowdea_5,crowdea_10,crowdbt,bt
bike,0.813298,0.825318,0.898272,0.928652,0.80092,0.799512
cheat,0.851581,0.880723,0.913096,0.938447,0.786951,0.792312
festival,0.905463,0.904559,0.921414,0.93444,0.908128,0.904145
meeting,0.847738,0.862977,0.913141,0.93432,0.824912,0.823487
name,0.898588,0.895936,0.925749,0.940117,0.892287,0.898044
night,0.859963,0.869808,0.910069,0.946631,0.806921,0.807651
trash,0.783389,0.795525,0.881158,0.939599,0.746629,0.758148
visitor,0.923405,0.914187,0.925377,0.937279,0.833791,0.832353
ai_character,0.920204,0.92426,0.926365,0.95613,0.919259,0.913606
olympic,0.919439,0.928334,0.948274,0.97015,0.932265,0.932426


In [14]:
# True where the mean nDCG@10 of a CrowDEA variant is at least that of both baselines.
# The flag is only computed for CrowDEA variants, so the baseline columns are empty.
print('Does CrowDEA perform better than the baselines?')
win_dfs['ndcg10']

Does CrowDEA perform better than the baselines?


Unnamed: 0,crowdea_2,crowdea_3,crowdea_5,crowdea_10,crowdbt,bt
bike,True,True,True,True,,
cheat,True,True,True,True,,
festival,False,False,True,True,,
meeting,True,True,True,True,,
name,True,False,True,True,,
night,True,True,True,True,,
trash,True,True,True,True,,
visitor,True,True,True,True,,
ai_character,True,True,True,True,,
olympic,False,False,True,True,,


In [15]:
# True where CrowDEA wins on average and both Wilcoxon signed rank tests (against BT
# and against CrowdBT) give a p-value of at most 0.05.
print('Is CrowDEA the statistically significant winner?')
stats_win_dfs['ndcg10']

Is CrowDEA the statistically significant winner?


Unnamed: 0,crowdea_2,crowdea_3,crowdea_5,crowdea_10,crowdbt,bt
bike,False,True,True,True,,
cheat,True,True,True,True,,
festival,False,False,False,False,,
meeting,True,True,True,True,,
name,False,False,False,True,,
night,False,True,True,True,,
trash,False,False,True,True,,
visitor,True,True,True,True,,
ai_character,False,False,False,True,,
olympic,False,False,True,True,,


---

In [16]:
# Display name of every dataset in the LaTeX tables: the capitalized key, except for
# 'ai_character', which is shown as 'Character'.
data_key_tex = {
    key: 'Character' if key == 'ai_character' else key.capitalize()
    for key in data_keys_arr
}

In [18]:
def build_latex_table(result_df, win_df, stats_win_df):
    """Render one result table as the lines of a LaTeX ``tabular`` environment.

    The layout is taken from the notebook-level configuration: ``methods`` (one column
    per method, CrowDEA variants first, then CrowdBT and BT), ``d_arr`` (sub-header of
    the CrowDEA columns), ``data_keys`` / ``data_keys_arr`` (row order and the rule
    between the idea and the design datasets) and ``data_key_tex`` (row labels).

    Parameters
    ----------
    result_df : pandas.DataFrame
        Scores, indexed by dataset key with one column per method.
    win_df : pandas.DataFrame
        Same layout; truthy where a CrowDEA variant beats both baselines on average.
    stats_win_df : pandas.DataFrame
        Same layout; truthy where that win is also statistically significant.

    Returns
    -------
    list of str
        The lines of the table, without trailing newlines.
    """
    n_crowdea = len(d_arr)

    lines = ['\\begin{tabular}{|l||%s}' % ('c|' * len(methods)),
             '\\hline']

    # Header row 1: the CrowDEA group spans its d-columns, the baselines span two rows.
    header = ['\\multirow{2}{*}{Dataset}',
              '\\multicolumn{%d}{c|}{%s}' % (n_crowdea, '\\textsc{CrowDEA}'),
              '\\multirow{2}{*}{%s}' % '\\textsc{CrowdBT}',
              '\\multirow{2}{*}{%s}' % '\\textsc{BT}']
    lines.append('&'.join(header) + '\\\\')
    # The partial rule covers exactly the CrowDEA columns (table columns 2 .. n_crowdea + 1),
    # so it follows d_arr instead of being fixed to four columns.
    lines.append('\\cline{2-%d}' % (n_crowdea + 1))

    # Header row 2: one 'd=<d>' label per CrowDEA column, empty cells under the baselines.
    subheader = [''] + ['d=%d' % d for d in d_arr] + ['', '']
    lines.append('&'.join(subheader) + '\\\\')
    lines.append('\\hline')
    lines.append('\\hline')

    # A rule separates the idea datasets from the design datasets; it goes after the
    # last idea dataset, whichever key that is.
    last_idea_key = data_keys['idea'][-1] if data_keys['idea'] else None

    for data_key in data_keys_arr:
        cells = [data_key_tex[data_key]]
        for method in methods:
            v = result_df.loc[data_key, method]
            is_crowdea = 'crowdea' in method
            # Only CrowDEA cells are highlighted: bold for a win on average, bold and
            # underlined for a statistically significant win.
            if is_crowdea and stats_win_df.loc[data_key, method]:
                cell = '\\underline{$\\mathbf{%.3f}$}' % v
            elif is_crowdea and win_df.loc[data_key, method]:
                cell = '$\\mathbf{%.3f}$' % v
            else:
                cell = '${%.3f}$' % v
            cells.append(cell)

        lines.append('&'.join(cells) + '\\\\')

        if data_key == last_idea_key:
            lines.append('\\hline')

    lines.append('\\hline')
    lines.append('\\end{tabular}')
    return lines


measure = 'ndcg5'
result_df = result_dfs[measure]
win_df = win_dfs[measure]
stats_win_df = stats_win_dfs[measure]

lines = build_latex_table(result_df, win_df, stats_win_df)

print('\n'.join(lines))

\begin{tabular}{|l||c|c|c|c|c|c|}
\hline
\multirow{2}{*}{Dataset}&\multicolumn{4}{c|}{\textsc{CrowDEA}}&\multirow{2}{*}{\textsc{CrowdBT}}&\multirow{2}{*}{\textsc{BT}}\\
\cline{2-5}
&d=2&d=3&d=5&d=10&&\\
\hline
\hline
Bike&$\mathbf{0.794}$&\underline{$\mathbf{0.805}$}&\underline{$\mathbf{0.881}$}&\underline{$\mathbf{0.916}$}&${0.784}$&${0.776}$\\
Cheat&\underline{$\mathbf{0.843}$}&\underline{$\mathbf{0.861}$}&\underline{$\mathbf{0.895}$}&\underline{$\mathbf{0.927}$}&${0.764}$&${0.767}$\\
Festival&${0.893}$&${0.888}$&$\mathbf{0.909}$&$\mathbf{0.924}$&${0.890}$&${0.894}$\\
Meeting&$\mathbf{0.834}$&$\mathbf{0.845}$&\underline{$\mathbf{0.901}$}&\underline{$\mathbf{0.930}$}&${0.815}$&${0.817}$\\
Name&${0.887}$&${0.881}$&$\mathbf{0.923}$&\underline{$\mathbf{0.931}$}&${0.882}$&${0.888}$\\
Night&\underline{$\mathbf{0.850}$}&\underline{$\mathbf{0.853}$}&\underline{$\mathbf{0.905}$}&\underline{$\mathbf{0.946}$}&${0.790}$&${0.791}$\\
Trash&$\mathbf{0.751}$&$\mathbf{0.765}$&\underline{$\mathbf{0.86