In [3]:
from tira.third_party_integrations import ir_datasets
from trectools import TrecQrel, TrecRun, TrecEval
import pandas as pd
import json
from glob import glob
from tqdm import tqdm
from statistics import mean

# Maps each timestamp key to its dataset id; load_qrels prefixes the id with
# 'ir-benchmarks/' when loading the dataset.
timestamp_to_ir_datasets_id = dict(
    t1='longeval-short-july-20230513-training',
    t2='longeval-long-september-20230513-training',
    t3='longeval-2023-01-20240423-training',
    t4='longeval-2023-06-20240418-training',
    t5='longeval-2023-08-20240418-training',
)

# Split identifiers (as strings) that the evaluation loop iterates over.
SPLITS = [str(split_index) for split_index in range(3)]

def load_qrels(timestamp, split):
    """Build the relevance judgments used to evaluate one split of a timestamp.

    Reads ``../data/splits.json`` and the qrels of the dataset registered for
    ``timestamp`` in ``timestamp_to_ir_datasets_id``. A judgment is kept only
    if its document is not listed under ``splits[timestamp][split]`` and its
    query id is contained in ``TIMESTAMP_TO_QIDS[timestamp]``.

    Args:
        timestamp: Key of ``timestamp_to_ir_datasets_id`` (e.g. ``'t1'``).
        split: Split key as used in ``splits.json`` (e.g. ``'0'``). A split
            that is missing from the file skips no documents.

    Returns:
        A ``TrecQrel`` whose ``qrels_data`` is a DataFrame with the columns
        ``query``, ``q0``, ``docid`` and ``rel``.

    Raises:
        KeyError: if ``timestamp`` is unknown to ``splits.json``,
            ``TIMESTAMP_TO_QIDS`` or ``timestamp_to_ir_datasets_id``.
    """
    # Context manager so the file handle is closed instead of being leaked.
    with open('../data/splits.json') as splits_file:
        splits = json.load(splits_file)

    docs_to_skip = set(splits[timestamp].get(split, []))
    allowed_qids = TIMESTAMP_TO_QIDS[timestamp]

    dataset = ir_datasets.load(f'ir-benchmarks/{timestamp_to_ir_datasets_id[timestamp]}')
    records = [
        {"query": qrel.query_id, "q0": 0, "docid": qrel.doc_id, "rel": qrel.relevance}
        for qrel in dataset.qrels_iter()
        if qrel.doc_id not in docs_to_skip and qrel.query_id in allowed_qids
    ]

    qrels = TrecQrel()
    # Explicit columns keep the schema intact even if every judgment was filtered out.
    qrels.qrels_data = pd.DataFrame(records, columns=["query", "q0", "docid", "rel"])
    return qrels

def load_run(system, timestamp, split):
    """Load the run file of ``system`` for one timestamp and split.

    Systems whose name contains ``'keyquery'`` are read from
    ``../src/keyqueries/<timestamp>/BM25-split-<split>.run.gz``. Every other
    system is resolved by globbing
    ``../data/results/<system>_<timestamp>_F<split>*``.

    Args:
        system: Name of the retrieval system (e.g. ``'BM25'``).
        timestamp: Timestamp key used in the file names (e.g. ``'t1'``).
        split: Split identifier used in the file names (e.g. ``'0'``).

    Returns:
        The ``TrecRun`` constructed from the resolved path.

    Raises:
        AssertionError: if the glob does not match exactly one file.
    """
    if 'keyquery' in system:
        return TrecRun(f'../src/keyqueries/{timestamp}/BM25-split-{split}.run.gz')

    pattern = f'../data/results/{system}_{timestamp}_F{split}*'
    matches = glob(pattern)
    # Name the pattern and the hits so a missing or ambiguous run is easy to track down.
    assert len(matches) == 1, (
        f'expected exactly one run file matching {pattern!r}, found {len(matches)}: {matches}'
    )
    return TrecRun(matches[0])

# Query ids per timestamp, collected from the split-'0' keyquery run.
# load_qrels drops every judgment whose query id is not in this mapping.
TIMESTAMP_TO_QIDS = dict()

for timestamp in timestamp_to_ir_datasets_id.keys():
    TIMESTAMP_TO_QIDS[timestamp] = {*load_run('keyquery', timestamp, '0').run_data['query'].unique()}


In [5]:
# One record per (system, timestamp) with the scores averaged over all splits.
eval_records = []

# The qrels depend only on (timestamp, split), not on the system, so each
# combination is loaded once and reused for every system.
qrels_by_split = {}

for system in tqdm(['BM25', 'BM25+qrel_boost', 'BM25+RM3', 'BM25+RF', 'keyquery']):
    for timestamp in timestamp_to_ir_datasets_id:
        ndcg_scores = []
        ndcg_condensed_scores = []

        for split in SPLITS:
            if (timestamp, split) not in qrels_by_split:
                qrels_by_split[(timestamp, split)] = load_qrels(timestamp, split)
            qrels = qrels_by_split[(timestamp, split)]

            run = load_run(system, timestamp, split)
            te = TrecEval(run, qrels)
            # nDCG at depth 10, once as-is and once with removeUnjudged=True (the "condensed" variant).
            ndcg_scores.append(te.get_ndcg(depth=10, removeUnjudged=False))
            ndcg_condensed_scores.append(te.get_ndcg(depth=10, removeUnjudged=True))

        eval_records.append({
            'system': system,
            'nDCG': mean(ndcg_scores),
            'nDCG(Condensed)': mean(ndcg_condensed_scores),
            'timestamp': timestamp,
        })

df_eval = pd.DataFrame(eval_records)


100%|██████████| 4/4 [02:38<00:00, 39.58s/it]


In [7]:
# Show the per-system, per-timestamp scores (means over the splits) computed above.
df_eval

Unnamed: 0,system,nDCG,nDCG(Condensed),timestamp
0,BM25,0.037853,0.119678,t1
1,BM25,0.061275,0.168502,t2
2,BM25,0.046825,0.14719,t3
3,BM25,0.073174,0.210581,t4
4,BM25,0.044083,0.125556,t5
5,BM25+qrel_boost,0.088845,0.140361,t1
6,BM25+qrel_boost,0.111177,0.189993,t2
7,BM25+qrel_boost,0.076477,0.158495,t3
8,BM25+qrel_boost,0.144286,0.235989,t4
9,BM25+qrel_boost,0.067434,0.132823,t5
