# Evaluation on Robust04

In [1]:
from glob import glob
import pandas as pd
from trectools import TrecQrel
import json
from tqdm import tqdm
from numpy import isnan

# Directory with the evaluation result files; every '*.jsonl' file in it is
# loaded further down in this cell.
EVAL_DIR = '../resources/eval/trec-system-runs/trec13/'
# Relevance judgments read with trectools. The number of distinct query ids in
# qrel.qrels_data is used as the denominator when scores are averaged.
qrel = TrecQrel('../resources/unprocessed/topics-and-qrels/qrels.robust04.txt')

RUNS_TO_SKIP = set()


# TODO Add unit tests here.
def load_eval_file(file_name, qrels=None):
    """Load one evaluation file and average every measure per pooling strategy.

    The file is parsed as a single JSON document. Its ``'task'`` entry
    provides the run identifier (``'run'``) and the measure name
    (``'measure'``); every other top-level entry maps a pool name to a list
    of per-query result dicts. Each result dict carries ``'run_file'``,
    ``'query'`` and one or more score fields.

    Args:
        file_name: Path of the evaluation file to read.
        qrels: Object exposing a ``qrels_data`` DataFrame with a ``'query'``
            column. When omitted, the module-level ``qrel`` is used.

    Returns:
        ``None`` when the run is listed in ``RUNS_TO_SKIP``. Otherwise a
        DataFrame with one row per non-empty pool, holding ``'run'``,
        ``'pooling'`` and one ``'<measure>-<score field>'`` column per score
        field.

    Raises:
        AssertionError: If the results of one pool refer to different run
            files.
        ZeroDivisionError: If the qrels contain no queries.
    """
    # Honour the explicit argument; only fall back to the global qrels when
    # the caller did not pass any.
    if qrels is None:
        qrels = qrel
    unique_queries = set(qrels.qrels_data['query'].astype(str).unique())

    # The context manager guarantees the file handle is closed again.
    with open(file_name, 'r') as handle:
        eval_result = json.load(handle)

    task = eval_result['task']
    if task['run'] in RUNS_TO_SKIP:
        return None

    ret = []
    for pool_name, results in eval_result.items():
        if pool_name == 'task':
            continue
        if not results:
            # A pool without any result has no run file and no scores.
            continue

        run_file = results[0]['run_file']
        scores = {}

        for result in results:
            assert result['run_file'] == run_file, \
                'Pool ' + str(pool_name) + ' mixes results of several run files'
            for eval_measure, raw_score in result.items():
                if eval_measure in ('run_file', 'query'):
                    continue

                parsed_score = float(raw_score)
                if isnan(parsed_score):
                    # NaN scores count as zero.
                    parsed_score = 0
                measure_name = task['measure'] + '-' + eval_measure
                scores.setdefault(measure_name, []).append(parsed_score)

        current_entry = {'run': run_file, 'pooling': pool_name}
        # Divide by the number of judged queries rather than the number of
        # results, so queries without a result contribute a score of zero.
        current_entry.update(
            {k: sum(v) / len(unique_queries) for k, v in scores.items()}
        )
        ret.append(current_entry)

    return pd.DataFrame(ret)

# Evaluate every result file in EVAL_DIR, showing a progress bar meanwhile.
df = []
for eval_file in tqdm(glob(EVAL_DIR + '*.jsonl')):
    df.append(load_eval_file(eval_file))

# Skipped runs come back as None; stack the remaining frames into one table.
df = pd.concat([frame for frame in df if frame is not None])
df

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 880/880 [00:26<00:00, 33.42it/s]


Unnamed: 0,run,pooling,unjudged@20-UNJ@20,unjudged@10-UNJ@10,ndcg@20-NDCG@20,condensed-ndcg@10-NDCG@10,ndcg@10-NDCG@10,condensed-ndcg@20-NDCG@20,residual-ndcg@10-MIN-NDCG@10,residual-ndcg@10-MAX-NDCG@10,residual-ndcg@20-MIN-NDCG@20,residual-ndcg@20-MAX-NDCG@20
0,src/main/resources/processed/normalized-runs/t...,complete-pool,0.012851,,,,,,,,,
1,src/main/resources/processed/normalized-runs/t...,depth-10-pool-incomplete-for-apl,0.166064,,,,,,,,,
2,src/main/resources/processed/normalized-runs/t...,depth-20-pool-incomplete-for-apl,0.068474,,,,,,,,,
0,src/main/resources/processed/normalized-runs/t...,complete-pool,,0.011245,,,,,,,,
1,src/main/resources/processed/normalized-runs/t...,depth-10-pool-incomplete-for-nlp,,0.075502,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
1,src/main/resources/processed/normalized-runs/t...,depth-10-pool-incomplete-for-sab,,,,,0.411813,,,,,
2,src/main/resources/processed/normalized-runs/t...,depth-20-pool-incomplete-for-sab,,,,,0.415456,,,,,
0,src/main/resources/processed/normalized-runs/t...,complete-pool,,,,,,,,,0.501578,0.506952
1,src/main/resources/processed/normalized-runs/t...,depth-10-pool-incomplete-for-vtu,,,,,,,,,0.529686,0.593904


In [2]:
def rename_pooling(pool):
    """Map a raw pool name to its short label.

    ``'complete-pool'`` becomes ``'complete'``. Names starting with
    ``'depth-10-pool-incomplete-for-'`` or ``'depth-20-pool-incomplete-for-'``
    become ``'depth-10-incomplete'`` or ``'depth-20-incomplete'``; whatever
    follows the prefix is dropped.

    Raises:
        ValueError: If the pool name matches none of the known patterns.
    """
    if pool == 'complete-pool':
        return 'complete'

    prefix_to_label = (
        ('depth-10-pool-incomplete-for-', 'depth-10-incomplete'),
        ('depth-20-pool-incomplete-for-', 'depth-20-incomplete'),
    )
    for prefix, label in prefix_to_label:
        if pool.startswith(prefix):
            return label

    raise ValueError('I cant handle ' + str(pool))

def rename_measure(m):
    """Return the short name of a known measure column, or None otherwise.

    Only the five depth-10 measure columns listed below are recognised; any
    other value yields ``None``.
    """
    known_measures = (
        ('unjudged@10-UNJ@10', 'unjudged@10'),
        ('condensed-ndcg@10-NDCG@10', 'condensed-ndcg@10'),
        ('residual-ndcg@10-MIN-NDCG@10', 'residual-ndcg@10-min'),
        ('residual-ndcg@10-MAX-NDCG@10', 'residual-ndcg@10-max'),
        ('ndcg@10-NDCG@10', 'ndcg@10'),
    )
    for raw_name, short_name in known_measures:
        if m == raw_name:
            return short_name
    return None

def process_row(df_row):
    """Turn one row into a dict keyed by (short pool name, short measure name).

    Only values whose type is exactly ``float`` or ``int`` and which are not
    NaN are considered, and only columns that ``rename_measure`` recognises
    are kept.

    Raises:
        AssertionError: If two columns of the row map to the same key.
    """
    row = df_row.to_dict()
    pool = rename_pooling(row['pooling'])

    renamed = {}
    for column, value in row.items():
        is_number = type(value) is float or type(value) is int
        # isnan is only evaluated for numbers, so strings never reach it.
        if not is_number or isnan(value):
            continue

        measure = rename_measure(column)
        if measure is None:
            continue

        key = (pool, measure)
        assert key not in renamed
        renamed[key] = value

    return renamed


def process_df(df):
    """Merge all rows of a single run into a one-row DataFrame.

    The resulting row holds every ``(pool, measure)`` entry produced by
    ``process_row`` for the rows of ``df`` plus a ``'run'`` column.

    Raises:
        AssertionError: If the rows belong to different runs, or if two rows
            produce the same ``(pool, measure)`` key.
    """
    run = df.iloc[0]['run']

    merged = {}
    for _, row in df.iterrows():
        assert row['run'] == run
        row_entries = process_row(row)
        for key, value in row_entries.items():
            assert key not in merged
            merged[key] = value

    merged['run'] = run
    return pd.DataFrame([merged])

# Collapse the per-pool rows of every run into one wide row whose columns are
# (pooling, measure) tuples plus 'run'.
aggregated_df = df.groupby('run').apply(process_df)
aggregated_df

Unnamed: 0_level_0,Unnamed: 1_level_0,"(complete, unjudged@10)","(depth-10-incomplete, unjudged@10)","(depth-20-incomplete, unjudged@10)","(complete, condensed-ndcg@10)","(depth-10-incomplete, condensed-ndcg@10)","(depth-20-incomplete, condensed-ndcg@10)","(complete, ndcg@10)","(depth-10-incomplete, ndcg@10)","(depth-20-incomplete, ndcg@10)","(complete, residual-ndcg@10-min)","(complete, residual-ndcg@10-max)","(depth-10-incomplete, residual-ndcg@10-min)","(depth-10-incomplete, residual-ndcg@10-max)","(depth-20-incomplete, residual-ndcg@10-min)","(depth-20-incomplete, residual-ndcg@10-max)",run
run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
src/main/resources/processed/normalized-runs/trec-system-runs/trec13/robust/input.JuruDes.gz,0,0.027309,0.127309,0.079116,0.480516,0.497944,0.488577,0.477433,0.474850,0.476176,0.477433,0.499208,0.474850,0.572838,0.476176,0.538110,src/main/resources/processed/normalized-runs/t...
src/main/resources/processed/normalized-runs/trec-system-runs/trec13/robust/input.JuruDesAggr.gz,0,0.024900,0.123695,0.076305,0.489913,0.507116,0.497938,0.486739,0.483572,0.485236,0.486739,0.505757,0.483572,0.574730,0.485236,0.542324,src/main/resources/processed/normalized-runs/t...
src/main/resources/processed/normalized-runs/trec-system-runs/trec13/robust/input.JuruDesLaMd.gz,0,0.027309,0.126104,0.078715,0.483690,0.501251,0.491498,0.480607,0.478435,0.479353,0.480607,0.502383,0.478435,0.576973,0.479353,0.541321,src/main/resources/processed/normalized-runs/t...
src/main/resources/processed/normalized-runs/trec-system-runs/trec13/robust/input.JuruDesQE.gz,0,0.026506,0.130924,0.076707,0.473808,0.490829,0.482790,0.471068,0.469241,0.470151,0.471068,0.489008,0.469241,0.566817,0.470151,0.530545,src/main/resources/processed/normalized-runs/t...
src/main/resources/processed/normalized-runs/trec-system-runs/trec13/robust/input.JuruDesSwQE.gz,0,0.026908,0.128514,0.077510,0.486532,0.502866,0.494495,0.483397,0.480658,0.482937,0.483397,0.504775,0.480658,0.579936,0.482937,0.544430,src/main/resources/processed/normalized-runs/t...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
src/main/resources/processed/normalized-runs/trec-system-runs/trec13/robust/input.vtumtitle.gz,0,0.030924,0.132129,0.074297,0.445968,0.472387,0.454978,0.444179,0.445754,0.443295,0.444179,0.466938,0.445754,0.538931,0.443295,0.500270,src/main/resources/processed/normalized-runs/t...
src/main/resources/processed/normalized-runs/trec-system-runs/trec13/robust/input.wdo25qla1.gz,0,0.017671,0.106426,0.060241,0.455229,0.479846,0.466035,0.453859,0.459801,0.457390,0.453859,0.466013,0.459801,0.544741,0.457390,0.504669,src/main/resources/processed/normalized-runs/t...
src/main/resources/processed/normalized-runs/trec-system-runs/trec13/robust/input.wdoqdn1.gz,0,0.007631,0.051807,0.023293,0.466556,0.484592,0.473690,0.466166,0.474925,0.470120,0.466166,0.469198,0.474925,0.509744,0.470120,0.484854,src/main/resources/processed/normalized-runs/t...
src/main/resources/processed/normalized-runs/trec-system-runs/trec13/robust/input.wdoqla1.gz,0,0.006024,0.043775,0.017269,0.510959,0.531447,0.517309,0.510931,0.521754,0.516554,0.510931,0.512586,0.521754,0.546850,0.516554,0.523905,src/main/resources/processed/normalized-runs/t...


In [14]:
def report_for_row(df_row, depth):
    """Compare the incomplete-pool scores of one run with the complete pool.

    The reference value is the ``ndcg@<depth>`` score on the complete pool.
    Every error is ``reference - score on the depth-<depth> incomplete pool``,
    so a negative error means the incomplete-pool score is higher than the
    reference.

    Args:
        df_row: Mapping-like row that is indexed by ``'run'`` and by
            ``(pool, measure)`` tuples for the requested depth.
        depth: Depth used both for the pool name and for the measure names,
            e.g. ``10``.

    Returns:
        A dict with ``'run'``, ``'unjudged'`` and four ``'error-...@<depth>'``
        entries. The keys carry the requested depth, so reports for different
        depths are labelled correctly.

    Raises:
        KeyError: If the row lacks one of the required entries (for a plain
            dict row).
    """
    incomplete_pool = f'depth-{depth}-incomplete'
    ground_truth = float(df_row[('complete', f'ndcg@{depth}')])

    def error_of(measure):
        # Signed difference between the reference and the incomplete pool.
        return ground_truth - float(df_row[(incomplete_pool, measure)])

    return {
        'run': df_row['run'],
        'unjudged': float(df_row[(incomplete_pool, f'unjudged@{depth}')]),
        f'error-condensed-ndcg@{depth}': error_of(f'condensed-ndcg@{depth}'),
        f'error-all-zero-ndcg@{depth}': error_of(f'ndcg@{depth}'),
        f'error-min-residual-ndcg@{depth}': error_of(f'residual-ndcg@{depth}-min'),
        f'error-max-residual-ndcg@{depth}': error_of(f'residual-ndcg@{depth}-max'),
    }

# Build the depth-10 report: one row per run with its unjudged@10 value and
# the error of each incomplete-pool score relative to the complete pool.
df_report = pd.DataFrame([dict(i) for i in aggregated_df.apply(lambda i: report_for_row(i, 10), axis=1)])
df_report

Unnamed: 0,run,unjudged,error-condensed-ndcg@10,error-all-zero-ndcg@10,error-min-residual-ndcg@10,error-max-residual-ndcg@10
0,src/main/resources/processed/normalized-runs/t...,0.127309,-0.020512,0.002583,0.002583,-0.095405
1,src/main/resources/processed/normalized-runs/t...,0.123695,-0.020377,0.003167,0.003167,-0.087991
2,src/main/resources/processed/normalized-runs/t...,0.126104,-0.020644,0.002172,0.002172,-0.096366
3,src/main/resources/processed/normalized-runs/t...,0.130924,-0.019761,0.001827,0.001827,-0.095750
4,src/main/resources/processed/normalized-runs/t...,0.128514,-0.019469,0.002739,0.002739,-0.096539
...,...,...,...,...,...,...
105,src/main/resources/processed/normalized-runs/t...,0.132129,-0.028208,-0.001575,-0.001575,-0.094752
106,src/main/resources/processed/normalized-runs/t...,0.106426,-0.025987,-0.005942,-0.005942,-0.090882
107,src/main/resources/processed/normalized-runs/t...,0.051807,-0.018427,-0.008759,-0.008759,-0.043578
108,src/main/resources/processed/normalized-runs/t...,0.043775,-0.020516,-0.010823,-0.010823,-0.035919
