In [1]:
from tira.third_party_integrations import ir_datasets
from trectools import TrecQrel, TrecRun, TrecEval
import pandas as pd
import json
from glob import glob
from tqdm import tqdm
from statistics import mean

# Short timestamp label -> dataset id. load_qrels prefixes the id with
# 'ir-benchmarks/' when it loads the dataset.
timestamp_to_ir_datasets_id = dict(
    t1='longeval-short-july-20230513-training',
    t2='longeval-long-september-20230513-training',
    t3='longeval-2023-01-20240423-training',
    t4='longeval-2023-06-20240418-training',
    t5='longeval-2023-08-20240418-training',
)

# Split identifiers used by the per-split evaluation; load_qrels looks them up
# in the per-timestamp entries of ../data/splits.json.
SPLITS = [str(split_index) for split_index in range(3)]

def load_qrels(timestamp, split):
    """Build a TrecQrel with the judgments of one timestamp, minus one split.

    Args:
        timestamp: Key of ``timestamp_to_ir_datasets_id`` (e.g. ``'t1'``).
        split: Key into the per-timestamp mapping of ``../data/splits.json``.
            Documents listed under that key are dropped from the judgments;
            a key that is not present in the mapping drops nothing.

    Returns:
        A ``TrecQrel`` whose ``qrels_data`` is a DataFrame with the columns
        ``query``, ``q0``, ``docid`` and ``rel``. Only judgments whose query
        id is in ``TIMESTAMP_TO_QIDS[timestamp]`` are kept.

    Raises:
        KeyError: If ``timestamp`` is missing from the splits file, from
            ``timestamp_to_ir_datasets_id`` or from ``TIMESTAMP_TO_QIDS``.
    """
    # Use a context manager so the file handle is closed deterministically.
    with open('../data/splits.json') as splits_file:
        splits = json.load(splits_file)

    docs_to_skip = set(splits[timestamp].get(split, []))
    # Look the query-id set up once instead of once per judgment.
    allowed_queries = TIMESTAMP_TO_QIDS[timestamp]

    dataset = ir_datasets.load(f'ir-benchmarks/{timestamp_to_ir_datasets_id[timestamp]}')
    rows = [
        {"query": qrel.query_id, "q0": 0, "docid": qrel.doc_id, "rel": qrel.relevance}
        for qrel in dataset.qrels_iter()
        if qrel.doc_id not in docs_to_skip and qrel.query_id in allowed_queries
    ]

    qrels = TrecQrel()
    # Explicit columns keep the frame well-formed even if every judgment was
    # filtered out (a list of zero dicts would yield a column-less frame).
    qrels.qrels_data = pd.DataFrame(rows, columns=["query", "q0", "docid", "rel"])
    return qrels

def load_run(system, timestamp, split):
    """Load the run file of ``system`` for one timestamp and split.

    The file location is chosen by substring match on ``system``:

    * contains ``'keyquery'``: ``../src/keyqueries/{timestamp}/BM25-split-{split}.run.gz``
    * contains ``'castorini'``, ``'MonoT5'`` or ``'ColBERT'``:
      ``../data/results_baseline/{system}-{timestamp}-split-{split}.run.gz``
    * anything else: the single file matching
      ``../data/results/{system}_{timestamp}_F{split}*``. For
      ``split == 'no-split'`` the ``F0`` pattern is used and the
      lexicographically first match is taken.

    Args:
        system: Name of the retrieval system.
        timestamp: Timestamp label such as ``'t1'``.
        split: Split identifier, or ``'no-split'``.

    Returns:
        A ``TrecRun``. If ``timestamp`` is already registered in
        ``TIMESTAMP_TO_QIDS``, ``run_data`` is restricted to the queries
        listed there; otherwise the run is returned unfiltered.

    Raises:
        AssertionError: If the glob lookup does not resolve to exactly one file.
    """
    if 'keyquery' in system:
        run_name = f'../src/keyqueries/{timestamp}/BM25-split-{split}.run.gz'
    elif 'castorini' in system or 'MonoT5' in system or 'ColBERT' in system:
        run_name = f'../data/results_baseline/{system}-{timestamp}-split-{split}.run.gz'
    else:
        # 'no-split' is looked up through the F0 pattern; build that pattern
        # directly instead of globbing for 'Fno-split*' first and discarding it.
        fold = '0' if split == 'no-split' else split
        pattern = f'../data/results/{system}_{timestamp}_F{fold}*'
        # glob() does not guarantee an order, so sort to make the pick below
        # reproducible across machines and file systems.
        candidates = sorted(glob(pattern))
        if split == 'no-split':
            candidates = candidates[:1]
        assert len(candidates) == 1, (
            f'Expected exactly one run file matching {pattern!r}, '
            f'found {len(candidates)}: {candidates}'
        )
        run_name = candidates[0]

    ret = TrecRun(run_name)
    # TIMESTAMP_TO_QIDS is populated by calling this function, so the timestamp
    # may not be registered yet; in that case the run stays unfiltered.
    if timestamp in TIMESTAMP_TO_QIDS:
        ret.run_data = ret.run_data[ret.run_data['query'].isin(TIMESTAMP_TO_QIDS[timestamp])]
    return ret

# Query ids per timestamp, taken from the 'keyquery' run of split '0'.
# The dict has to exist (empty) before the loop starts: load_run checks it and
# only filters a run once its timestamp has been registered here.
TIMESTAMP_TO_QIDS = {}

for timestamp in timestamp_to_ir_datasets_id:
    TIMESTAMP_TO_QIDS[timestamp] = set(
        load_run('keyquery', timestamp, '0').run_data['query'].unique()
    )


In [2]:
# Mean nDCG@10 (plain and with unjudged documents removed) per system and
# timestamp, averaged over SPLITS.

# The qrels depend only on (timestamp, split), not on the system, so load each
# combination once up front instead of once per system.
# NOTE(review): assumes TrecEval.get_ndcg does not modify the qrels it is given
# (one qrels object was already shared by both get_ndcg calls below) - confirm.
qrels_cache = {
    (timestamp, split): load_qrels(timestamp, split)
    for timestamp in timestamp_to_ir_datasets_id
    for split in SPLITS
}

# Collect plain records first; df_eval is only ever bound to the final frame.
eval_records = []

for system in tqdm(['castorini-list-in-t5-150', 'ColBERT', 'MonoT5', 'BM25', 'BM25+qrel_boost', 'BM25+RM3', 'BM25+RF', 'keyquery']):
    for timestamp in timestamp_to_ir_datasets_id:
        ndcg_scores = []
        ndcg_condensed_scores = []

        for split in SPLITS:
            qrels = qrels_cache[(timestamp, split)]
            run = load_run(system, timestamp, split)
            te = TrecEval(run, qrels)
            ndcg_scores.append(te.get_ndcg(depth=10, removeUnjudged=False))
            ndcg_condensed_scores.append(te.get_ndcg(depth=10, removeUnjudged=True))

        eval_records.append({'system': system, 'nDCG': mean(ndcg_scores), 'nDCG(Condensed)': mean(ndcg_condensed_scores), 'timestamp': timestamp})

df_eval = pd.DataFrame(eval_records)


100%|██████████| 8/8 [03:26<00:00, 25.77s/it]


In [3]:
# Display the per-system, per-timestamp nDCG@10 means (averaged over SPLITS).
df_eval

Unnamed: 0,system,nDCG,nDCG(Condensed),timestamp
0,castorini-list-in-t5-150,0.155152,0.313036,t1
1,castorini-list-in-t5-150,0.170719,0.348666,t2
2,castorini-list-in-t5-150,0.171815,0.363999,t3
3,castorini-list-in-t5-150,0.161821,0.341931,t4
4,castorini-list-in-t5-150,0.146046,0.284338,t5
5,ColBERT,0.15342,0.314012,t1
6,ColBERT,0.173054,0.345721,t2
7,ColBERT,0.166332,0.360213,t3
8,ColBERT,0.150544,0.337569,t4
9,ColBERT,0.137852,0.284589,t5


In [4]:
# nDCG@10 (plain and with unjudged documents removed) per system and timestamp,
# evaluated with the single split identifier 'no-split'.
# NOTE: this rebinds df_eval, replacing any frame an earlier cell stored there.

# The qrels depend only on the timestamp here, not on the system, so load them
# once per timestamp instead of once per system.
# NOTE(review): assumes TrecEval.get_ndcg does not modify the qrels it is given
# (one qrels object was already shared by both get_ndcg calls below) - confirm.
qrels_cache = {
    timestamp: load_qrels(timestamp, 'no-split')
    for timestamp in timestamp_to_ir_datasets_id
}

# Collect plain records first; df_eval is only ever bound to the final frame.
eval_records = []

for system in tqdm(['castorini-list-in-t5-150', 'ColBERT', 'MonoT5', 'BM25', 'BM25+qrel_boost', 'BM25+RM3', 'BM25+RF', 'keyquery']):
    for timestamp in timestamp_to_ir_datasets_id:
        ndcg_scores = []
        ndcg_condensed_scores = []

        for split in ['no-split']:
            qrels = qrels_cache[timestamp]
            run = load_run(system, timestamp, split)
            te = TrecEval(run, qrels)
            ndcg_scores.append(te.get_ndcg(depth=10, removeUnjudged=False))
            ndcg_condensed_scores.append(te.get_ndcg(depth=10, removeUnjudged=True))

        eval_records.append({'system': system, 'nDCG': mean(ndcg_scores), 'nDCG(Condensed)': mean(ndcg_condensed_scores), 'timestamp': timestamp})

df_eval = pd.DataFrame(eval_records)


100%|██████████| 8/8 [01:05<00:00,  8.14s/it]


In [5]:
# Display the per-system, per-timestamp nDCG@10 values of the 'no-split' evaluation.
df_eval

Unnamed: 0,system,nDCG,nDCG(Condensed),timestamp
0,castorini-list-in-t5-150,0.202593,0.400641,t1
1,castorini-list-in-t5-150,0.204249,0.413311,t2
2,castorini-list-in-t5-150,0.201737,0.425332,t3
3,castorini-list-in-t5-150,0.19788,0.413066,t4
4,castorini-list-in-t5-150,0.160521,0.316616,t5
5,ColBERT,0.198358,0.401825,t1
6,ColBERT,0.206863,0.409158,t2
7,ColBERT,0.20054,0.420195,t3
8,ColBERT,0.183769,0.407958,t4
9,ColBERT,0.150839,0.315206,t5
