In [11]:
from tira.third_party_integrations import ir_datasets
from trectools import TrecQrel, TrecRun, TrecEval
import pandas as pd
import json
from glob import glob
from tqdm import tqdm
from statistics import mean

# Short timestamp label -> dataset id; load_qrels loads each id with the
# 'ir-benchmarks/' prefix.
timestamp_to_ir_datasets_id = dict(
    t1='longeval-short-july-20230513-training',
    t2='longeval-long-september-20230513-training',
    t3='longeval-2023-01-20240423-training',
    t4='longeval-2023-06-20240418-training',
    t5='longeval-2023-08-20240418-training',
)

# Split names as strings ('0', '1', '2').
SPLITS = [str(split_index) for split_index in range(3)]

def load_qrels(timestamp, split):
    """Build a ``TrecQrel`` holding the relevance judgments for one timestamp.

    Judgments come from the ``ir-benchmarks`` dataset registered for
    ``timestamp`` in ``timestamp_to_ir_datasets_id``. A judgment is dropped when
    its document id is listed under ``splits[timestamp][split]`` in
    ``../data/splits.json`` or when its query id is not contained in
    ``TIMESTAMP_TO_QIDS[timestamp]``.

    Args:
        timestamp: Key of ``timestamp_to_ir_datasets_id`` (e.g. ``'t1'``).
        split: Key into ``splits[timestamp]``; a split name that is not
            present there excludes no documents.

    Returns:
        A ``TrecQrel`` whose ``qrels_data`` is a DataFrame with the columns
        ``query``, ``q0``, ``docid`` and ``rel``.

    Raises:
        KeyError: If ``timestamp`` is missing from the splits file,
            ``timestamp_to_ir_datasets_id`` or ``TIMESTAMP_TO_QIDS``.
    """
    # Use a context manager so the file handle is closed deterministically.
    with open('../data/splits.json') as splits_file:
        splits = json.load(splits_file)

    docs_to_skip = set(splits[timestamp].get(split, []))
    allowed_qids = TIMESTAMP_TO_QIDS[timestamp]

    dataset = ir_datasets.load(f'ir-benchmarks/{timestamp_to_ir_datasets_id[timestamp]}')
    records = [
        {"query": qrel.query_id, "q0": 0, "docid": qrel.doc_id, "rel": qrel.relevance}
        for qrel in dataset.qrels_iter()
        if qrel.doc_id not in docs_to_skip and qrel.query_id in allowed_qids
    ]

    qrels = TrecQrel()
    # Explicit columns keep the schema intact even if every judgment was filtered out.
    qrels.qrels_data = pd.DataFrame(records, columns=["query", "q0", "docid", "rel"])
    return qrels

def load_run(system, timestamp, split):
    """Load the run of ``system`` for ``timestamp`` as a ``TrecRun``.

    The run file is located by system name:

    * names containing ``keyquery`` use the BM25 keyquery run for the split,
    * names containing ``castorini``, ``MonoT5`` or ``ColBERT`` use the
      baseline results for the split,
    * every other system must match exactly one file in
      ``../data/results_full``; ``split`` is not used in this case.

    When ``TIMESTAMP_TO_QIDS`` has an entry for ``timestamp``, the run is
    restricted to the queries listed there.
    """
    if 'keyquery' in system:
        path = f'../src/keyqueries/{timestamp}/BM25-split-{split}.run.gz'
    elif any(marker in system for marker in ('castorini', 'MonoT5', 'ColBERT')):
        path = f'../data/results_baseline/{system}-{timestamp}-split-{split}.run.gz'
    else:
        pattern = f'../data/results_full/{system}_{timestamp}_F*'
        matches = glob(pattern)
        # Show what the pattern resolved to before the assertion fires.
        if len(matches) != 1:
            print(pattern, '->', matches)
        assert len(matches) == 1
        path = matches[0]

    run = TrecRun(path)
    if timestamp in TIMESTAMP_TO_QIDS:
        is_known_query = run.run_data['query'].isin(TIMESTAMP_TO_QIDS[timestamp])
        run.run_data = run.run_data[is_known_query]
    return run

# The empty dict has to exist before the loop: load_run looks the timestamp up
# in TIMESTAMP_TO_QIDS and only filters the run when an entry is present.
TIMESTAMP_TO_QIDS = {}

# Per timestamp, collect the query ids present in the keyquery run of split '0'.
for timestamp in timestamp_to_ir_datasets_id:
    keyquery_run = load_run('keyquery', timestamp, '0')
    TIMESTAMP_TO_QIDS[timestamp] = set(keyquery_run.run_data['query'].unique())


In [12]:
def _strip_leading_zero(text):
    """Drop the integer-part zero of a formatted number ('0.155' -> '.155')."""
    if text.startswith('0.'):
        return text[1:]
    if text.startswith('-0.'):
        return '-' + text[2:]
    return text


def table_line(df, approach):
    """Format the scores of one system as the cells of a LaTeX table row.

    Args:
        df: DataFrame with the columns ``system``, ``timestamp``, ``nDCG``
            and ``nDCG(Condensed)``.
        approach: Value of the ``system`` column to render.

    Returns:
        Ten cells joined by ``' & '``: ``nDCG`` for t1..t5 followed by
        ``nDCG(Condensed)`` for t1..t5, each with three decimals and without
        the leading zero (``0.155`` becomes ``.155``).

    Raises:
        AssertionError: If a timestamp does not have exactly one row for
            ``approach``.
    """
    rows = df[df['system'] == approach]
    cells = []
    for measure in ['nDCG', 'nDCG(Condensed)']:
        for timestamp in ['t1', 't2', 't3', 't4', 't5']:
            score = rows[rows['timestamp'] == timestamp]
            assert len(score) == 1, (
                f'expected exactly one row for system={approach!r}, '
                f'timestamp={timestamp!r}, got {len(score)}'
            )
            formatted = '{:.3f}'.format(score.iloc[0][measure])
            # Only the integer-part zero is removed; a blanket replace of '0.'
            # would also turn '10.500' into '1.500'.
            cells.append(_strip_leading_zero(formatted))
    return ' & '.join(cells)

def plot_table(df):
    """Print the LaTeX results table for all systems.

    One row is emitted per system; the score cells of each row come from
    ``table_line(df, system)``. The table is written to stdout and the
    function returns ``None`` (the result of ``print``).
    """
    # NOTE(review): the dataset ids for t3-t5 in timestamp_to_ir_datasets_id
    # contain 2023-01 / 2023-06 / 2023-08 while the column labels below read
    # 01/24, 06/24, 08/24 -- confirm the labels are intended.
    pad = '    '
    row_end = ' \\\\'

    baseline_rows = [
        ('BM25', 'BM25'),
        ('ColBERT', 'ColBERT'),
        ('List-in-T5', 'castorini-list-in-t5-150'),
        ('monoT5', 'MonoT5'),
    ]
    feedback_rows = [
        ('BM25$_{Boost}$', 'BM25+qrel_boost'),
        ('BM25$_{RM3}$', 'BM25+RM3'),
        ('BM25$_{RF}$', 'BM25+RF'),
        ('BM25$_{keyquery}$', 'keyquery'),
    ]

    def render(rows):
        # One table row per (label, system) pair, in the given order.
        return [pad + label + ' & ' + table_line(df, system) + row_end for label, system in rows]

    # Whitespace-only entries (pad) are kept so the printed text is unchanged.
    lines = [
        '',
        '\\begin{tabular}{@{}l@{}cccccccccc@{}}',
        pad + '\\toprule',
        pad + "\\bfseries System & \\multicolumn{5}{c}{\\bfseries nDCG@10} & \\multicolumn{5}{c}{\\bfseries nDCG@10$^{'}$}\\\\",
        pad + '\\cmidrule(r@{.25em}){2-6}',
        pad + '\\cmidrule(l@{.25em}){7-11}',
        pad,
        pad + '& 07/23 & 09/23 & 01/24 & 06/24 & 08/24 & 07/23 & 09/23 & 01/24 & 06/24 & 08/24\\\\',
        pad,
        pad + '\\midrule',
        '',
        *render(baseline_rows),
        pad,
        pad + '\\midrule',
        pad,
        *render(feedback_rows),
        '',
        '\\bottomrule',
        '\\end{tabular}',
        '',
    ]
    return print('\n'.join(lines))

In [13]:
systems = ['castorini-list-in-t5-150', 'ColBERT', 'MonoT5', 'BM25', 'BM25+qrel_boost', 'BM25+RM3', 'BM25+RF', 'keyquery']

# load_qrels takes only the timestamp and the split, so the judgments are
# loaded once per timestamp and shared by all systems instead of being
# rebuilt for every (system, timestamp) pair.
# NOTE(review): assumes TrecEval does not modify the qrels it is given -- confirm.
qrels_by_timestamp = {
    timestamp: load_qrels(timestamp, 'no-split')
    for timestamp in timestamp_to_ir_datasets_id
}

# One record per (system, timestamp); the DataFrame is built once at the end.
eval_records = []

for system in tqdm(systems):
    for timestamp in timestamp_to_ir_datasets_id:
        run = load_run(system, timestamp, 'no-split')
        te = TrecEval(run, qrels_by_timestamp[timestamp])

        eval_records.append({
            'system': system,
            # nDCG@10 over the run as retrieved.
            'nDCG': te.get_ndcg(depth=10, removeUnjudged=False),
            # nDCG@10 computed with removeUnjudged=True.
            'nDCG(Condensed)': te.get_ndcg(depth=10, removeUnjudged=True),
            'timestamp': timestamp,
        })

df_eval = pd.DataFrame(eval_records)


100%|██████████| 8/8 [01:09<00:00,  8.71s/it]


In [14]:
# Display the per-system, per-timestamp scores held in df_eval.
df_eval

Unnamed: 0,system,nDCG,nDCG(Condensed),timestamp
0,castorini-list-in-t5-150,0.202593,0.400641,t1
1,castorini-list-in-t5-150,0.204249,0.413311,t2
2,castorini-list-in-t5-150,0.201737,0.425332,t3
3,castorini-list-in-t5-150,0.19788,0.413066,t4
4,castorini-list-in-t5-150,0.160521,0.316616,t5
5,ColBERT,0.198358,0.401825,t1
6,ColBERT,0.206863,0.409158,t2
7,ColBERT,0.20054,0.420195,t3
8,ColBERT,0.183769,0.407958,t4
9,ColBERT,0.150839,0.315206,t5


In [15]:
# Print the LaTeX table built from the scores in df_eval.
plot_table(df_eval)


\begin{tabular}{@{}l@{}cccccccccc@{}}
    \toprule
    \bfseries System & \multicolumn{5}{c}{\bfseries nDCG@10} & \multicolumn{5}{c}{\bfseries nDCG@10$^{'}$}\\
    \cmidrule(r@{.25em}){2-6}
    \cmidrule(l@{.25em}){7-11}
    
    & 07/23 & 09/23 & 01/24 & 06/24 & 08/24 & 07/23 & 09/23 & 01/24 & 06/24 & 08/24\\
    
    \midrule

    BM25 & .155 & .184 & .172 & .175 & .134 & .471 & .492 & .516 & .486 & .379 \\
    ColBERT & .198 & .207 & .201 & .184 & .151 & .402 & .409 & .420 & .408 & .315 \\
    List-in-T5 & .203 & .204 & .202 & .198 & .161 & .401 & .413 & .425 & .413 & .317 \\
    monoT5 & .202 & .219 & .197 & .202 & .154 & .405 & .410 & .415 & .411 & .314 \\
    
    \midrule
    
    BM25$_{Boost}$ & .355 & .372 & .287 & .364 & .271 & .529 & .546 & .541 & .540 & .412 \\
    BM25$_{RM3}$ & .147 & .181 & .163 & .174 & .134 & .478 & .490 & .524 & .492 & .388 \\
    BM25$_{RF}$ & .303 & .332 & .241 & .262 & .191 & .606 & .611 & .590 & .552 & .426 \\
    BM25$_{keyquery}$ & .252 & .285