In [1]:
import gzip
from glob import glob
import json
import pandas as pd
from statistics import mean
from tqdm import tqdm
from numpy import isnan

def corr(df, col_a, col_b):
    """Return Kendall's tau between two columns of ``df``.

    Args:
        df: DataFrame containing both columns.
        col_a: Name of the first column.
        col_b: Name of the second column.

    Returns:
        The Kendall correlation between ``col_a`` and ``col_b``.

    Raises:
        ValueError: If the correlation between the two columns is larger
            than the correlation of ``col_a`` with itself (sanity check).
    """
    matrix = df[[col_a, col_b]].corr('kendall')
    assert len(matrix) == 2

    # First row of the correlation matrix: col_a against itself and against col_b.
    first_row = matrix.iloc[0].to_dict()
    self_corr = first_row[col_a]
    cross_corr = first_row[col_b]

    if cross_corr > self_corr:
        raise ValueError('Could not handle', first_row)

    return cross_corr


# Fraction (not a percentage, despite the name) of runs kept for the correlation:
# after sorting the runs by ground-truth score in descending order, only the
# first int(n * PERCENTAGE_TOP_SYSTEMS) runs are used.
PERCENTAGE_TOP_SYSTEMS = 0.75
# Key of the per-run entry that is read as the ground-truth evaluation.
TRUTH_FIELD = 'ground-truth-evaluation-top-10'
# Name of the effectiveness measure read from every evaluation entry.
MEASURE = 'ndcg@10'

def evaluation_against_pool(f):
    """Correlate every run's scores with the ground truth, once per outer entry.

    Args:
        f: Parsed evaluation file, a nested mapping of the form
            ``{key: {method: {'corpus-size': ..., 'runs': {run: {...}}}}}``.
            The meaning of the outer key is not visible here (presumably one
            repetition/leave-out group -- TODO confirm).

    Returns:
        dict mapping each method to three parallel lists with one element per
        outer entry: ``'corpus-size'``, ``'kendall-without-post-judgments'``
        and ``'kendall-with-post-judgments'``.
    """
    results = {}

    for group in f.values():
        for method, method_data in group.items():
            entry = results.setdefault(method, {
                'corpus-size': [],
                'kendall-without-post-judgments': [],
                'kendall-with-post-judgments': [],
            })
            entry['corpus-size'].append(method_data['corpus-size'])

            # One row per run, holding the MEASURE score of all three evaluations.
            rows = [
                {
                    'run': run,
                    'evaluation-with-post-judgments': run_eval['evaluation-with-post-judgments'][MEASURE],
                    'evaluation-without-post-judgments': run_eval['evaluation-without-post-judgments'][MEASURE],
                    'ground-truth-evaluation': run_eval[TRUTH_FIELD][MEASURE],
                }
                for run, run_eval in method_data['runs'].items()
            ]

            # Keep only the best runs according to the ground truth.
            ranked = pd.DataFrame(rows).sort_values('ground-truth-evaluation', ascending=False)
            n_top = int(len(ranked) * PERCENTAGE_TOP_SYSTEMS)
            top_systems = ranked.head(n_top)

            entry['kendall-without-post-judgments'].append(
                corr(top_systems, 'evaluation-without-post-judgments', 'ground-truth-evaluation')
            )
            entry['kendall-with-post-judgments'].append(
                corr(top_systems, 'evaluation-with-post-judgments', 'ground-truth-evaluation')
            )

    return results


def evaluation_against_unpooled(f):
    """Correlate the scores of leave-out-group runs with the ground truth.

    Unlike ``evaluation_against_pool``, only runs whose
    ``'is-in-leave-out-group'`` flag is truthy are used; they are collected
    across all outer entries of ``f`` and a single correlation is computed
    per method.

    Args:
        f: Parsed evaluation file, a nested mapping of the form
            ``{key: {method: {'corpus-size': ..., 'runs': {run: {...}}}}}``.

    Returns:
        dict mapping each method to ``'corpus-size'`` (one element per outer
        entry), ``'ndcg-scores'`` (the collected per-run score dicts) and the
        one-element lists ``'kendall-without-post-judgments'`` and
        ``'kendall-with-post-judgments'``.
    """
    results = {}

    for group in f.values():
        for method, method_data in group.items():
            entry = results.setdefault(method, {
                'corpus-size': [],
                'ndcg-scores': [],
            })
            entry['corpus-size'].append(method_data['corpus-size'])

            # Collect the scores of the runs that belong to the leave-out group.
            entry['ndcg-scores'].extend(
                {
                    'evaluation-with-post-judgments': run_eval['evaluation-with-post-judgments'][MEASURE],
                    'evaluation-without-post-judgments': run_eval['evaluation-without-post-judgments'][MEASURE],
                    'ground-truth-evaluation': run_eval[TRUTH_FIELD][MEASURE],
                }
                for run_eval in method_data['runs'].values()
                if run_eval['is-in-leave-out-group']
            )

    for entry in results.values():
        # Keep only the best runs according to the ground truth.
        ranked = pd.DataFrame(entry['ndcg-scores']).sort_values('ground-truth-evaluation', ascending=False)
        n_top = int(len(ranked) * PERCENTAGE_TOP_SYSTEMS)
        top_systems = ranked.head(n_top)

        # Stored as one-element lists; evaluation() takes the mean of these lists.
        entry['kendall-without-post-judgments'] = [
            corr(top_systems, 'evaluation-without-post-judgments', 'ground-truth-evaluation')
        ]
        entry['kendall-with-post-judgments'] = [
            corr(top_systems, 'evaluation-with-post-judgments', 'ground-truth-evaluation')
        ]

    return results

def evaluation(against_pool=True):
    """Evaluate every processed evaluation file and summarise it per method.

    Reads all files matching ``../data/processed/evaluation-**.json.gz``,
    evaluates each one and averages the per-method results.

    Args:
        against_pool: If true, files are evaluated with
            ``evaluation_against_pool``; otherwise with
            ``evaluation_against_unpooled``.

    Returns:
        pandas.DataFrame with one row per (file, method) and the columns
        ``'Dataset'``, ``'Subsampling'``, ``'Corpus Size'``,
        ``'kendall-without-post-judgments'`` and
        ``'kendall-with-post-judgments'``. Rows follow the sorted file order.
    """
    # Choose the per-file evaluator once instead of branching for every file.
    evaluate_file = evaluation_against_pool if against_pool else evaluation_against_unpooled

    rows = []

    # glob() returns paths in arbitrary order; sorting makes the row order of
    # the result reproducible across machines and runs.
    # NOTE: without recursive=True the '**' in the pattern behaves like '*'.
    for path in tqdm(sorted(glob('../data/processed/evaluation-**.json.gz'))):
        # File name part between 'evaluation-' and the first following '.'.
        display_name = path.split('evaluation-')[1].split('.')[0]

        # TODO: Fix the evaluation with only unjudged runs against each other.
        # Do the "sorted into the correct position" test via explicit other tests.

        with gzip.open(path, 'rt') as f:
            per_method = evaluate_file(json.load(f))

        for method, eval_for_method in per_method.items():
            rows.append({
                'Dataset': display_name,
                'Subsampling': method,
                'Corpus Size': mean(eval_for_method['corpus-size']),
                'kendall-without-post-judgments': mean(eval_for_method['kendall-without-post-judgments']),
                'kendall-with-post-judgments': mean(eval_for_method['kendall-with-post-judgments']),
            })

    return pd.DataFrame(rows)


In [2]:
# Evaluate on the leave-out-group runs only (against_pool=False), reduce every
# dataset name to the part before its first '-', and average per dataset and
# subsampling method.
df = (
    evaluation(against_pool=False)
    .assign(Dataset=lambda frame: frame['Dataset'].apply(lambda name: name.split('-')[0]))
    .groupby(['Dataset', 'Subsampling'])
    .agg({
        'Corpus Size': 'mean',
        'kendall-without-post-judgments': 'mean',
        'kendall-with-post-judgments': 'mean',
    })
    .reset_index()
)

# Last expression of the cell: rendered as the cell's output table.
df.sort_values(['Dataset', 'Corpus Size']).round(3)

  0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 5/5 [00:05<00:00,  1.19s/it]


Unnamed: 0,Dataset,Subsampling,Corpus Size,kendall-without-post-judgments,kendall-with-post-judgments
1,clueweb09,loft-1000,1000.0,0.372,0.372
4,clueweb09,top-10-run-pool,7497.506,0.726,0.726
2,clueweb09,loft-10000,10000.0,0.541,0.541
7,clueweb09,top-25-run-pool,17578.606,0.776,0.778
8,clueweb09,top-50-run-pool,33533.198,0.776,0.811
3,clueweb09,re-rank-top-1000-bm25,47809.667,0.343,0.527
5,clueweb09,top-100-run-pool,64185.694,0.736,0.869
6,clueweb09,top-1000-run-pool,541356.213,0.673,0.938
0,clueweb09,complete-corpus,566038.0,0.652,1.0
10,clueweb12,loft-1000,1000.0,0.357,0.357


In [24]:
# Evaluate on the leave-out-group runs only (against_pool=False), reduce every
# dataset name to the part before its first '-', and average per dataset and
# subsampling method.
df = (
    evaluation(against_pool=False)
    .assign(Dataset=lambda frame: frame['Dataset'].apply(lambda name: name.split('-')[0]))
    .groupby(['Dataset', 'Subsampling'])
    .agg({
        'Corpus Size': 'mean',
        'kendall-without-post-judgments': 'mean',
        'kendall-with-post-judgments': 'mean',
    })
    .reset_index()
)

# Last expression of the cell: rendered as the cell's output table.
df.sort_values(['Dataset', 'Corpus Size']).round(3)

100%|██████████| 5/5 [00:00<00:00, 30.40it/s]


Unnamed: 0,Dataset,Subsampling,Corpus Size,kendall-without-post-judgments,kendall-with-post-judgments
2,clueweb09,top-10-run-pool,7497.506,0.726,0.726
5,clueweb09,top-25-run-pool,17578.606,0.776,0.778
6,clueweb09,top-50-run-pool,33533.198,0.776,0.811
1,clueweb09,re-rank-top-1000-bm25,47809.667,0.343,0.527
3,clueweb09,top-100-run-pool,64185.694,0.736,0.869
4,clueweb09,top-1000-run-pool,541356.213,0.673,0.938
0,clueweb09,complete-corpus,566038.0,0.652,1.0
9,clueweb12,top-10-run-pool,6590.589,0.717,0.717
12,clueweb12,top-25-run-pool,14608.185,0.606,0.746
13,clueweb12,top-50-run-pool,26611.476,0.522,0.848


In [25]:
# Evaluate on all runs (against_pool=True), reduce every dataset name to the
# part before its first '-', and average per dataset and subsampling method.
df = (
    evaluation(against_pool=True)
    .assign(Dataset=lambda frame: frame['Dataset'].apply(lambda name: name.split('-')[0]))
    .groupby(['Dataset', 'Subsampling'])
    .agg({
        'Corpus Size': 'mean',
        'kendall-without-post-judgments': 'mean',
        'kendall-with-post-judgments': 'mean',
    })
    .reset_index()
)

# Last expression of the cell: rendered as the cell's output table.
df.sort_values(['Dataset', 'Corpus Size']).round(3)

100%|██████████| 5/5 [00:00<00:00,  7.83it/s]


Unnamed: 0,Dataset,Subsampling,Corpus Size,kendall-without-post-judgments,kendall-with-post-judgments
2,clueweb09,top-10-run-pool,7497.506,0.953,0.953
5,clueweb09,top-25-run-pool,17578.606,0.96,0.966
6,clueweb09,top-50-run-pool,33533.198,0.937,0.973
1,clueweb09,re-rank-top-1000-bm25,47809.667,0.904,0.947
3,clueweb09,top-100-run-pool,64185.694,0.926,0.979
4,clueweb09,top-1000-run-pool,541356.213,0.919,0.993
0,clueweb09,complete-corpus,566038.0,0.916,1.0
9,clueweb12,top-10-run-pool,6590.589,0.947,0.947
12,clueweb12,top-25-run-pool,14608.185,0.913,0.961
13,clueweb12,top-50-run-pool,26611.476,0.889,0.979
