# Analyze qrels: label distribution of complete, unjudged, and incomplete judgments per corpus

In [2]:
import json
import pandas as pd

# Read one JSON record per line. The context manager closes the file handle
# deterministically, and blank lines are skipped so that a stray or trailing
# newline in the file does not crash json.loads.
with open('overview-over-qrels.jsonl', 'r') as qrels_overview_file:
    df_raw = [json.loads(line) for line in qrels_overview_file if line.strip()]

# Collect the raw counts: corpus -> qrel type -> label -> list of counts
# (one list entry per record that mentions the label).
df = {}
for record in df_raw:
    per_corpus = df.setdefault(
        record['corpus'],
        {'complete': {}, 'unjudged': {}, 'incomplete': {}},
    )

    for qrel_type, label_counts in record['qrels'].items():
        for label, count in label_counts.items():
            per_corpus[qrel_type].setdefault(label, []).append(count)

# Build one row per corpus holding, for every (qrel type, label) pair, the
# share of that label among all counts of the qrel type.
summary_rows = []
for corpus, per_type in df.items():
    row = {'corpus': corpus}

    for qrel_type, labels in per_type.items():
        totals = {label: sum(counts) for label, counts in labels.items()}
        cnt_all = sum(totals.values())

        for label, total in totals.items():
            # A qrel type whose counts are all zero has no defined
            # distribution; report NaN instead of raising ZeroDivisionError.
            row[(qrel_type, int(label))] = total / cnt_all if cnt_all else float('nan')

    summary_rows.append(row)

df_sum = pd.DataFrame(summary_rows)
df_sum

Unnamed: 0,corpus,"(complete, 0)","(complete, 2)","(complete, 1)","(unjudged, 0)","(unjudged, 2)","(unjudged, 1)","(incomplete, 0)","(incomplete, 2)","(incomplete, 1)","(complete, 3)","(complete, 4)","(unjudged, 3)","(unjudged, 4)","(incomplete, 3)","(incomplete, 4)"
0,trec-covid,0.629795,0.214591,0.155614,0.747959,0.232653,0.019388,0.628049,0.214324,0.157627,,,,,,
1,ClueWeb09,0.779112,0.050649,0.155667,0.801625,0.038258,0.147245,0.746183,0.068773,0.167987,0.006997,0.007574,0.006302,0.006571,0.008015,0.009042
2,ClueWeb12,0.654629,0.085206,0.243682,0.671119,0.076557,0.239844,0.639059,0.093372,0.247305,0.014937,0.001547,0.011064,0.001415,0.018593,0.00167
3,Robust04,0.944087,0.003311,0.052603,0.958155,0.001505,0.04034,0.798735,0.021966,0.179299,,,,,,


### Load Data for Corpus TREC-COVID

In [53]:
from trectools import TrecQrel
from tqdm import tqdm
import json

def calc_aggregate_qrels(qrels_incomplete, qrels_complete):
    """Count relevance labels of the complete qrels, split by pool membership.

    Every (query, docid) pair of ``qrels_complete`` is counted under its
    label in ``'complete'`` and additionally in ``'incomplete'`` when the pair
    is also present in ``qrels_incomplete``, otherwise in ``'unjudged'``.
    The label is always taken from ``qrels_complete``; negative labels are
    clamped to 0.

    Args:
        qrels_incomplete: Path to a qrels file (str) or an object exposing a
            ``qrels_data`` DataFrame with ``query``, ``docid`` and ``rel``
            columns.
        qrels_complete: Same as ``qrels_incomplete``, for the complete qrels.

    Returns:
        ``{'complete': {...}, 'unjudged': {...}, 'incomplete': {...}}`` where
        each inner dict maps a label to its count. Every label seen in the
        complete qrels is present in all three inner dicts (possibly with 0).

    Note:
        Pairs that occur only in ``qrels_incomplete`` have no label in the
        complete qrels and are ignored (previously they raised ``KeyError``).
    """
    if isinstance(qrels_incomplete, str):
        qrels_incomplete = TrecQrel(qrels_incomplete)
    if isinstance(qrels_complete, str):
        qrels_complete = TrecQrel(qrels_complete)

    def _label_by_pair(qrels):
        # Iterate over the columns directly instead of DataFrame.iterrows(),
        # which builds a Series per row and is far slower on large qrels.
        frame = qrels.qrels_data
        return {
            (int(query), docid): max(int(rel), 0)
            for query, docid, rel in zip(frame['query'], frame['docid'], frame['rel'])
        }

    judged_in_incomplete = _label_by_pair(qrels_incomplete)
    labels_complete = _label_by_pair(qrels_complete)

    ret = {'complete': {}, 'unjudged': {}, 'incomplete': {}}

    for pair, label in labels_complete.items():
        # Make sure every bucket reports the label, even with a count of 0.
        for counts in ret.values():
            counts.setdefault(label, 0)

        ret['complete'][label] += 1

        if pair in judged_in_incomplete:
            ret['incomplete'][label] += 1
        else:
            ret['unjudged'][label] += 1

    return ret

tmp = calc_aggregate_qrels(
    '../resources/unprocessed/topics-and-qrels/qrels.trec-covid-incomplete.txt',
    '../resources/unprocessed/topics-and-qrels/qrels.trec-covid-complete.txt',
)

# Mode 'w' truncates the overview file, so this record becomes its first line.
# The context manager guarantees the record is flushed and the handle closed.
with open('overview-over-qrels.jsonl', 'w') as overview_file:
    json.dump({'corpus': 'trec-covid', 'qrels': tmp}, overview_file)

In [21]:

# gzip is imported here because no earlier cell imports it; without this the
# cell only works when a later cell has already been executed.
import gzip

# Close the gzip handle as soon as the JSON document has been parsed.
with gzip.open('../resources/processed/pool-documents-per-run-trec-system-runs-trec18-web.adhoc.json.gz') as pool_file:
    tmp = json.load(pool_file)
tmp.keys()

dict_keys(['groups', 'pool_entries'])

In [62]:
import gzip
import sys
sys.path.append('../python/')
from run_file_processing import IncompletePools
from evaluation_util import __adjust_qrels_to_pool

def load_groups(name, qrels, corpus):
    """Append one aggregated qrels record per group to 'overview-over-qrels.jsonl'.

    For every entry in the ``'groups'`` mapping of the pool file, the single
    pool whose name starts with ``'depth-10-pool-incomplete-for'`` is built for
    the group's first run, the complete qrels are adjusted to that pool, and
    the label counts from ``calc_aggregate_qrels`` are appended as one JSON
    line.

    Args:
        name: Suffix of the pool file
            ``../resources/processed/pool-documents-per-run-{name}.json.gz``.
        qrels: File name of the complete qrels inside
            ``../resources/unprocessed/topics-and-qrels/``.
        corpus: Corpus name stored in the ``'corpus'`` field of each record.

    Raises:
        AssertionError: If not exactly one matching depth-10 pool is found.
    """
    group_file = f'../resources/processed/pool-documents-per-run-{name}.json.gz'
    with gzip.open(group_file) as pool_file:
        pool_data = json.load(pool_file)
    qrels_complete = TrecQrel(f'../resources/unprocessed/topics-and-qrels/{qrels}')

    for group, runs in tqdm(pool_data['groups'].items()):
        # NOTE(review): a fresh IncompletePools is built per group, as before;
        # it could be hoisted out of the loop if it is stateless - confirm.
        pooling = IncompletePools(pool_per_run_file=group_file)
        pooling = {
            k: v
            for k, v in pooling.create_incomplete_pools_for_run(runs[0])
            if k.lower().startswith('depth-10-pool-incomplete-for')
        }
        pooling = list(pooling.values())
        assert len(pooling) == 1
        pooling = pooling[0]

        # Presumably restricts the complete qrels to the documents of the
        # pool; verify against evaluation_util.
        qrels_incomplete = __adjust_qrels_to_pool(qrels_complete, pooling)

        out = {
            'corpus': corpus,
            'group': group,
            'qrels': calc_aggregate_qrels(qrels_complete=qrels_complete, qrels_incomplete=qrels_incomplete),
        }

        # Serialise first and write separator plus record through one handle
        # that is closed right away: no leaked file objects, no reliance on
        # garbage-collection order, and no stray newline if encoding fails.
        record = json.dumps(out)
        with open('overview-over-qrels.jsonl', 'a') as overview_file:
            overview_file.write('\n' + record)
    
# TREC 18 web track (topics 1-50), judged on ClueWeb09.
load_groups(
    name='trec-system-runs-trec18-web.adhoc',
    qrels='qrels.web.1-50.txt',
    corpus='ClueWeb09',
)

In [58]:
# Remaining ClueWeb09 web tracks: (pool file suffix, complete qrels file).
for run_name, qrels_file in [
    ('trec-system-runs-trec19-web.adhoc', 'qrels.web.51-100.txt'),
    ('trec-system-runs-trec20-web.adhoc', 'qrels.web.101-150.txt'),
    ('trec-system-runs-trec21-web.adhoc', 'qrels.web.151-200.txt'),
]:
    load_groups(run_name, qrels_file, 'ClueWeb09')

100%|██████████| 20/20 [01:46<00:00,  5.33s/it]
100%|██████████| 14/14 [00:52<00:00,  3.73s/it]
100%|██████████| 12/12 [00:38<00:00,  3.24s/it]


In [59]:
# ClueWeb12 web tracks: (pool file suffix, complete qrels file).
for run_name, qrels_file in [
    ('trec-system-runs-trec22-web', 'qrels.web.201-250.txt'),
    ('trec-system-runs-trec23-web', 'qrels.web.251-300.txt'),
]:
    load_groups(run_name, qrels_file, 'ClueWeb12')

100%|██████████| 14/14 [00:48<00:00,  3.43s/it]
100%|██████████| 12/12 [00:36<00:00,  3.05s/it]


In [64]:
# TREC 13 robust track, judged on Robust04.
load_groups(
    name='trec-system-runs-trec13-robust',
    qrels='qrels.robust04.txt',
    corpus='Robust04',
)

100%|██████████| 14/14 [18:38<00:00, 79.89s/it]
