In [1]:
import gzip
from glob import glob
import json
import pandas as pd
from statistics import mean
from tqdm import tqdm
from numpy import isnan
#pip install pyircor --ignore-requires-python --no-deps --break-system-packages
# modify from collections.abc import Iterable
from pyircor.tauap import tauap_b

def corr(df, col_a, col_b):
    return tauap_b(df[col_a], df[col_b])


PERCENTAGE_TOP_SYSTEMS = 0.75
TRUTH_FIELD = 'ground-truth-evaluation-top-10'
MEASURE = 'ndcg@10'

def evaluation_against_pool(f):
    ret = {}
    for k in f.keys():
        for method in f[k].keys():
            if method not in ret:
                ret[method] = {
                    'corpus-size': [],
                    'kendall-without-post-judgments': [],
                    'kendall-without-post-judgments-condensed': [],
                    'kendall-with-post-judgments': [],
                }

            ret[method]['corpus-size'] += [f[k][method]['corpus-size']]

            ndcg_scores = []
            for run, run_eval in f[k][method]['runs'].items():
                ndcg_scores += [{
                    'run': run,
                    'evaluation-with-post-judgments': run_eval['evaluation-with-post-judgments'][MEASURE],
                    'evaluation-without-post-judgments': run_eval['evaluation-without-post-judgments'][MEASURE],
                    'evaluation-without-post-judgments-condensed': run_eval['evaluation-without-post-judgments'][MEASURE + '-condensed'],
                    'ground-truth-evaluation': run_eval[TRUTH_FIELD][MEASURE]
                }]

            ndcg_scores = pd.DataFrame(ndcg_scores)
            ndcg_scores = ndcg_scores.sort_values('ground-truth-evaluation', ascending=False)
            ndcg_scores = ndcg_scores.head(int(len(ndcg_scores)*PERCENTAGE_TOP_SYSTEMS))

            ret[method]['kendall-without-post-judgments'] += [corr(ndcg_scores, 'evaluation-without-post-judgments', 'ground-truth-evaluation')]
            ret[method]['kendall-without-post-judgments-condensed'] += [corr(ndcg_scores, 'evaluation-without-post-judgments-condensed', 'ground-truth-evaluation')]
            ret[method]['kendall-with-post-judgments'] += [corr(ndcg_scores, 'evaluation-with-post-judgments', 'ground-truth-evaluation')]

    return ret


def evaluation_against_unpooled(f):
    ret = {}
    for k in f.keys():
        for method in f[k].keys():
            if method not in ret:
                ret[method] = {
                    'corpus-size': [],
                    'ndcg-scores': [],
                }

            ret[method]['corpus-size'] += [f[k][method]['corpus-size']]

            ndcg_scores = []
            for run_eval in f[k][method]['runs'].values():
                if run_eval['is-in-leave-out-group']:
                    ret[method]['ndcg-scores'] += [{
                        'evaluation-with-post-judgments': run_eval['evaluation-with-post-judgments'][MEASURE],
                        'evaluation-without-post-judgments': run_eval['evaluation-without-post-judgments'][MEASURE],
                        'evaluation-without-post-judgments-condensed': run_eval['evaluation-without-post-judgments'][MEASURE + '-condensed'],
                        'ground-truth-evaluation': run_eval[TRUTH_FIELD][MEASURE]
                    }]

    for method in ret.keys():
        ndcg_scores = pd.DataFrame(ret[method]['ndcg-scores'])
        ndcg_scores = ndcg_scores.sort_values('ground-truth-evaluation', ascending=False)
        ndcg_scores = ndcg_scores.head(int(len(ndcg_scores)*PERCENTAGE_TOP_SYSTEMS))
        
        ret[method]['kendall-without-post-judgments'] = [corr(ndcg_scores, 'evaluation-without-post-judgments', 'ground-truth-evaluation')]
        ret[method]['kendall-with-post-judgments'] = [corr(ndcg_scores, 'evaluation-with-post-judgments', 'ground-truth-evaluation')]
        ret[method]['kendall-without-post-judgments-condensed'] = [corr(ndcg_scores, 'evaluation-without-post-judgments-condensed', 'ground-truth-evaluation')]

    return ret

def evaluation(against_pool=True):
    df = []

    for i in tqdm(glob('../data/processed/evaluation-**.json.gz')):
        display_name = i.split('evaluation-')[1].split('.')[0]

        # TODO: Fix Eval with only unjudged. against each other. Do the "correct einsortieren test" via explicit other tests.

        with gzip.open(i, 'rt') as f:
            if against_pool:
                eval_entry_in_progress = evaluation_against_pool(json.load(f))
            else:
                eval_entry_in_progress = evaluation_against_unpooled(json.load(f))

        for method, eval_for_method in eval_entry_in_progress.items():

            df += [{
                'Dataset': display_name,
                'Subsampling': method, 
                'Corpus Size': mean(eval_for_method['corpus-size']),
                'kendall-without-post-judgments': mean(eval_for_method['kendall-without-post-judgments']),
                'kendall-without-post-judgments-condensed': mean(eval_for_method['kendall-without-post-judgments-condensed']),
                'kendall-with-post-judgments': mean(eval_for_method['kendall-with-post-judgments']),
            }]

    return pd.DataFrame(df)


In [2]:
def create_df(against_pool):
    df = evaluation(against_pool)
    df['Dataset'] = df['Dataset'].apply(lambda i: i.split('-')[0])
    
    df = df.groupby(['Dataset', 'Subsampling']) \
           .agg({'Corpus Size': 'mean', 'kendall-without-post-judgments': 'mean', 'kendall-with-post-judgments': 'mean', 'kendall-without-post-judgments-condensed': 'mean'}) \
           .reset_index()
        
    df.sort_values(['Dataset', 'Corpus Size']).round(3)
    return df

df_against_pool = create_df(True)
df_all_unpooled = create_df(False)

100%|█████████████████████████████████████████████████████████████████████████████████| 9/9 [00:06<00:00,  1.37it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 9/9 [00:01<00:00,  6.18it/s]


In [3]:
df_against_pool

Unnamed: 0,Dataset,Subsampling,Corpus Size,kendall-without-post-judgments,kendall-with-post-judgments,kendall-without-post-judgments-condensed
0,clueweb09,complete-corpus,629140.0,0.930034,1.0,0.832334
1,clueweb09,loft-1000,1000.0,0.792981,0.792981,0.69026
2,clueweb09,loft-10000,10000.0,0.798956,0.798956,0.693133
3,clueweb09,re-rank-top-1000-bm25,48091.0,0.919194,0.936062,0.807336
4,clueweb09,top-10-run-pool,8914.825414,0.944385,0.944385,0.831539
5,clueweb09,top-100-run-pool,73939.618659,0.934096,0.980128,0.832291
6,clueweb09,top-1000-run-pool,604957.268815,0.931115,0.993926,0.832334
7,clueweb09,top-25-run-pool,20587.24824,0.956036,0.966723,0.831703
8,clueweb09,top-50-run-pool,38878.354736,0.94003,0.973855,0.83177
9,clueweb12,complete-corpus,435859.5,0.895274,1.0,0.938517


In [4]:
df_all_unpooled

Unnamed: 0,Dataset,Subsampling,Corpus Size,kendall-without-post-judgments,kendall-with-post-judgments,kendall-without-post-judgments-condensed
0,clueweb09,complete-corpus,629140.0,0.56465,1.0,0.585553
1,clueweb09,loft-1000,1000.0,0.278011,0.278011,0.28132
2,clueweb09,loft-10000,10000.0,0.372561,0.372291,0.359627
3,clueweb09,re-rank-top-1000-bm25,48091.0,0.265054,0.435443,0.29752
4,clueweb09,top-10-run-pool,8914.825414,0.611418,0.611418,0.579075
5,clueweb09,top-100-run-pool,73939.618659,0.617504,0.826774,0.584557
6,clueweb09,top-1000-run-pool,604957.268815,0.57369,0.945882,0.585553
7,clueweb09,top-25-run-pool,20587.24824,0.705919,0.733542,0.583362
8,clueweb09,top-50-run-pool,38878.354736,0.657894,0.781369,0.584336
9,clueweb12,complete-corpus,435859.5,0.302094,1.0,0.698547


In [14]:
def f(df, method, measurement, dataset):
    df = df[df['Dataset'] == dataset]
    max_score = df[df['Subsampling'] != 'complete-corpus'][measurement].max() - 0.0001
    
    df = df[df['Subsampling'] == method]
    assert len(df) == 1
    df = df[measurement].iloc[0]
    style = '\\bfseries' if method != 'complete-corpus' and (df) >= max_score else ''
    
    return "{" + style + (" {:.3f}".format(df)).replace('0.', '.') + "}"
    
def line(method, df):
    ret = []
    for dataset in ['clueweb09', 'clueweb12', 'msmarco', 'disks45']:
        for measurement in ['kendall-without-post-judgments', 'kendall-without-post-judgments-condensed', 'kendall-with-post-judgments']:
            ret += [f(df, method, measurement, dataset)]
    return ' & '.join(ret)
    

def print_table(df):
    return """
\\begin{tabular}{@{}lccc@{\\hspace{1.3em}}ccc@{\\hspace{1.3em}}ccc@{\\hspace{1.3em}}ccc@{}}
\\toprule
\\bfseries Sampling     &   \\multicolumn{3}{c@{\\hspace{1em}}}{\\bfseries ClueWeb09}  & \\multicolumn{3}{c@{\\hspace{1em}}}{\\bfseries ClueWeb12}     & \\multicolumn{3}{c@{\\hspace{1em}}}{\\bfseries MS~MARCO} & \\multicolumn{3}{c@{\\hspace{1em}}}{\\bfseries Robust04}                                              \\\\
\\cmidrule(r@{1em}){2-4}
\\cmidrule(r@{1em}){5-7}
\\cmidrule(r@{1em}){8-10}
\\cmidrule(){11-13}

& $\\tau$ & $\\tau_{C}$ & $\\tau_{PJ}$ & $\\tau$ & $\\tau_{C}$ & $\\tau_{PJ}$ & $\\tau$ & $\\tau_{C}$ & $\\tau_{PJ}$ & $\\tau$ & $\\tau_{C}$ & $\\tau_{PJ}$ \\\\

\\midrule

BM25 & """ + line('re-rank-top-1000-bm25', df) + """ \\\\


\\midrule

LOFT$_{1k}$ & """ + line('loft-1000', df) + """ \\\\
LOFT$_{10k}$ & """ + line('loft-10000', df) + """ \\\\

\\midrule

Pool$_J$ & """ + line('top-10-run-pool', df) + """ \\\\
Pool$_{25}$ & """ + line('top-25-run-pool', df) + """ \\\\
Pool$_{50}$ & """ + line('top-50-run-pool', df) + """ \\\\
Pool$_{100}$ & """ + line('top-100-run-pool', df) + """ \\\\
Pool$_{1000}$ & """ + line('top-1000-run-pool', df) + """ \\\\

\\midrule

Full & """ + line('complete-corpus', df) + """ \\\\

\\bottomrule

\\end{tabular}
"""

In [15]:
print(print_table(df_against_pool))


\begin{tabular}{@{}lccc@{\hspace{1.3em}}ccc@{\hspace{1.3em}}ccc@{\hspace{1.3em}}ccc@{}}
\toprule
\bfseries Sampling     &   \multicolumn{3}{c@{\hspace{1em}}}{\bfseries ClueWeb09}  & \multicolumn{3}{c@{\hspace{1em}}}{\bfseries ClueWeb12}     & \multicolumn{3}{c@{\hspace{1em}}}{\bfseries MS~MARCO} & \multicolumn{3}{c@{\hspace{1em}}}{\bfseries Robust04}                                              \\
\cmidrule(r@{1em}){2-4}
\cmidrule(r@{1em}){5-7}
\cmidrule(r@{1em}){8-10}
\cmidrule(){11-13}

& $\tau$ & $\tau_{C}$ & $\tau_{PJ}$ & $\tau$ & $\tau_{C}$ & $\tau_{PJ}$ & $\tau$ & $\tau_{C}$ & $\tau_{PJ}$ & $\tau$ & $\tau_{C}$ & $\tau_{PJ}$ \\

\midrule

BM25 & { .919} & { .807} & { .936} & { .894} & { .922} & { .938} & { .847} & { .827} & { .836} & { .980} & { .945} & { .994} \\


\midrule

LOFT$_{1k}$ & { .793} & { .690} & { .793} & { .775} & { .775} & { .775} & { .776} & { .774} & { .776} & { .940} & { .904} & { .940} \\
LOFT$_{10k}$ & { .799} & { .693} & { .799} & { .763} & { .762} & { .765}

In [16]:
print(print_table(df_all_unpooled))


\begin{tabular}{@{}lccc@{\hspace{1.3em}}ccc@{\hspace{1.3em}}ccc@{\hspace{1.3em}}ccc@{}}
\toprule
\bfseries Sampling     &   \multicolumn{3}{c@{\hspace{1em}}}{\bfseries ClueWeb09}  & \multicolumn{3}{c@{\hspace{1em}}}{\bfseries ClueWeb12}     & \multicolumn{3}{c@{\hspace{1em}}}{\bfseries MS~MARCO} & \multicolumn{3}{c@{\hspace{1em}}}{\bfseries Robust04}                                              \\
\cmidrule(r@{1em}){2-4}
\cmidrule(r@{1em}){5-7}
\cmidrule(r@{1em}){8-10}
\cmidrule(){11-13}

& $\tau$ & $\tau_{C}$ & $\tau_{PJ}$ & $\tau$ & $\tau_{C}$ & $\tau_{PJ}$ & $\tau$ & $\tau_{C}$ & $\tau_{PJ}$ & $\tau$ & $\tau_{C}$ & $\tau_{PJ}$ \\

\midrule

BM25 & { .265} & { .298} & { .435} & { .211} & { .541} & { .542} & { .601} & { .593} & { .627} & { .888} & { .859} & { .949} \\


\midrule

LOFT$_{1k}$ & { .278} & { .281} & { .278} & { .329} & { .329} & { .329} & { .294} & { .294} & { .294} & { .416} & { .392} & { .416} \\
LOFT$_{10k}$ & { .373} & { .360} & { .372} & { .366} & { .337} & { .373}