In [23]:
from tira.third_party_integrations import ir_datasets
from trectools import TrecQrel, TrecRun, TrecEval
import pandas as pd
import json
from glob import glob
from tqdm import tqdm
from statistics import mean
from scipy.stats import ttest_ind

# Short timestamp label -> dataset id suffix; load_qrels appends the suffix to
# 'ir-benchmarks/' when loading the dataset. Insertion order (t1..t5) is the
# order in which the evaluation loops visit the timestamps.
timestamp_to_ir_datasets_id = dict(
    t1='longeval-short-july-20230513-training',
    t2='longeval-long-september-20230513-training',
    t3='longeval-2023-01-20240423-training',
    t4='longeval-2023-06-20240418-training',
    t5='longeval-2023-08-20240418-training',
)

# Split identifiers, kept as strings because they are used as JSON keys and
# inside run file names. NOTE(review): not referenced in the cells shown here.
SPLITS = ['0', '1', '2']

def load_qrels(timestamp, split):
    """Load the relevance judgments of one timestamp as a ``TrecQrel``.

    Judgments are dropped when their document id is listed under
    ``splits[timestamp][split]`` in ``../data/splits.json`` or when their
    query id is not in ``TIMESTAMP_TO_QIDS[timestamp]``. A ``split`` key that
    is absent from the JSON excludes no documents.

    Args:
        timestamp: Key of ``timestamp_to_ir_datasets_id`` (e.g. ``'t1'``).
        split: Key inside ``splits[timestamp]`` naming the documents to skip.

    Returns:
        A ``TrecQrel`` whose ``qrels_data`` is a DataFrame with the columns
        ``query``, ``q0``, ``docid`` and ``rel``.

    Raises:
        KeyError: If ``timestamp`` is unknown to the splits file,
            ``timestamp_to_ir_datasets_id`` or ``TIMESTAMP_TO_QIDS``.
    """
    # Context manager so the file handle is closed instead of leaked.
    with open('../data/splits.json', encoding='utf-8') as splits_file:
        splits = json.load(splits_file)

    docs_to_skip = set(splits[timestamp].get(split, []))

    dataset = ir_datasets.load(f'ir-benchmarks/{timestamp_to_ir_datasets_id[timestamp]}')
    allowed_qids = TIMESTAMP_TO_QIDS[timestamp]

    rows = [
        {"query": qrel.query_id, "q0": 0, "docid": qrel.doc_id, "rel": qrel.relevance}
        for qrel in dataset.qrels_iter()
        if qrel.doc_id not in docs_to_skip and qrel.query_id in allowed_qids
    ]

    qrels = TrecQrel()
    # Explicit columns keep the frame's schema stable even when no judgment survives the filter.
    qrels.qrels_data = pd.DataFrame(rows, columns=["query", "q0", "docid", "rel"])
    return qrels

def load_run(system, timestamp, split):
    """Load the run of ``system`` at ``timestamp`` as a ``TrecRun``.

    The run file is located from the system name:

    * contains ``'keyquery'``: ``../src/keyqueries/<timestamp>/BM25-split-<split>.run.gz``
    * contains ``'rm3-BM25'``: ``../src/keyqueries/<timestamp>/rm3-BM25-split-<split>.run.gz``
    * contains ``'castorini'``, ``'MonoT5'`` or ``'ColBERT'``:
      ``../data/results_baseline/<system>-<timestamp>-split-<split>.run.gz``
    * anything else: the single file matching
      ``../data/results_full/<system>_<timestamp>_F*`` (``split`` is not used here).

    When ``timestamp`` already has an entry in ``TIMESTAMP_TO_QIDS`` the run is
    restricted to those query ids; otherwise it is returned unfiltered.

    Raises:
        AssertionError: If the glob fallback does not match exactly one file
            (the pattern and its matches are printed first).
    """
    if 'keyquery' in system:
        run_path = f'../src/keyqueries/{timestamp}/BM25-split-{split}.run.gz'
    elif 'rm3-BM25' in system:
        run_path = f'../src/keyqueries/{timestamp}/rm3-BM25-split-{split}.run.gz'
    elif any(marker in system for marker in ('castorini', 'MonoT5', 'ColBERT')):
        run_path = f'../data/results_baseline/{system}-{timestamp}-split-{split}.run.gz'
    else:
        candidates = glob(f'../data/results_full/{system}_{timestamp}_F*')
        if len(candidates) != 1:
            # Show what was (not) found before the assertion below fires.
            print(f'../data/results_full/{system}_{timestamp}_F*', '->', candidates)
        assert len(candidates) == 1
        run_path = candidates[0]

    trec_run = TrecRun(run_path)
    if timestamp not in TIMESTAMP_TO_QIDS:
        return trec_run

    keep_mask = trec_run.run_data['query'].isin(TIMESTAMP_TO_QIDS[timestamp])
    trec_run.run_data = trec_run.run_data[keep_mask]
    return trec_run

# Query ids to evaluate per timestamp; read by load_qrels and load_run.
TIMESTAMP_TO_QIDS = {}

# Filled one timestamp at a time. load_run only filters by TIMESTAMP_TO_QIDS when the
# timestamp is already a key, so each 'keyquery' split-0 run is loaded unfiltered here
# and its query ids then become the filter for every later load of that timestamp.
# The dict must therefore exist (empty) before this loop runs.
for timestamp in list(timestamp_to_ir_datasets_id):
    TIMESTAMP_TO_QIDS[timestamp] = set(load_run('keyquery', timestamp, '0').run_data['query'].unique())


In [39]:
# Significance threshold: alpha = 0.05 divided by 8 (Bonferroni correction).
# NOTE(review): the 8 presumably is the number of systems compared in the table — confirm.
P_VALUE_BONFERRONI_CORRECTED = 0.05/8

# LaTeX superscript keyed by (significant vs. BM25, significant vs. MonoT5).
# \phantom reserves the space of the marker that is not shown.
_SIGNIFICANCE_SUFFIX = {
    (True, True): '$^{\\dagger\\ddagger}$',
    (True, False): '$^{\\dagger\\phantom{\\ddagger}}$',
    (False, True): '$^{\\ddagger\\phantom{\\dagger}}$',
    (False, False): '$^{\\phantom{\\dagger\\ddagger}}$',
}


def _format_score(value):
    """Format ``value`` with three decimals and without the leading zero (0.155 -> '.155')."""
    text = '{:.3f}'.format(value)
    # Strip only a zero that is the whole integer part, so '10.000' stays intact.
    if text.startswith('0.'):
        return text[1:]
    if text.startswith('-0.'):
        return '-' + text[2:]
    return text


def table_line(df, approach):
    """Render the table cells of one system as a ``' & '``-joined LaTeX string.

    Produces ten cells: the five timestamps ``t1``..``t5`` for ``nDCG``
    followed by the same five for ``nDCG(Condensed)``. Each cell is the score
    followed by a superscript: a dagger when the BM25 p-value column is at or
    below ``P_VALUE_BONFERRONI_CORRECTED``, a double dagger when the MonoT5
    p-value column is.

    Args:
        df: Frame with the columns ``system``, ``timestamp``, ``nDCG``,
            ``nDCG(Condensed)`` and the four ``*_p_value`` columns
            (``BM25``/``MonoT5``, plain and ``_condensed``).
        approach: Value of the ``system`` column to render.

    Returns:
        The ten cells joined by ``' & '``.

    Raises:
        AssertionError: If ``df`` does not hold exactly one row for
            ``approach`` at some timestamp.
        KeyError: If a required column is missing.
    """
    rows = df[df['system'] == approach]
    cells = []
    for measure in ['nDCG', 'nDCG(Condensed)']:
        p_value_column = '_condensed_p_value' if 'Condensed' in measure else '_p_value'
        for timestamp in ['t1', 't2', 't3', 't4', 't5']:
            score = rows[rows['timestamp'] == timestamp]
            # Checked before indexing so a missing or duplicated row reports this
            # message instead of an IndexError from iloc or a silently ignored duplicate.
            assert len(score) == 1, (
                f'expected exactly one row for system={approach!r}, '
                f'timestamp={timestamp!r}, found {len(score)}'
            )
            row = score.iloc[0]

            significant_bm25 = bool(row[f'BM25{p_value_column}'] <= P_VALUE_BONFERRONI_CORRECTED)
            significant_monot5 = bool(row[f'MonoT5{p_value_column}'] <= P_VALUE_BONFERRONI_CORRECTED)

            cells.append(_format_score(row[measure]) + _SIGNIFICANCE_SUFFIX[(significant_bm25, significant_monot5)])
    return ' & '.join(cells)

def plot_table(df):
    """Print the LaTeX ``tabular`` with one row per evaluated system.

    The first group of rows holds BM25, BM25+RM3, ColBERT, List-in-T5 and
    monoT5; after a ``\\midrule`` follow BM25+qrel_boost, BM25+RF and keyquery.
    The cells of every row come from ``table_line(df, <system id>)``.

    Args:
        df: Evaluation frame passed through to ``table_line``.

    Returns:
        None; the table is written to stdout.
    """
    def render_rows(labelled_systems):
        # One table row per (LaTeX label, value of the 'system' column) pair.
        return [
            '    ' + label + ' & ' + table_line(df, system_id) + ' \\\\'
            for label, system_id in labelled_systems
        ]

    upper_rows = render_rows([
        ('BM25', 'BM25'),
        ('BM25$_{RM3}$', 'BM25+RM3'),
        ('ColBERT', 'ColBERT'),
        ('List-in-T5', 'castorini-list-in-t5-150'),
        ('monoT5', 'MonoT5'),
    ])
    lower_rows = render_rows([
        ('BM25$_{Boost}$', 'BM25+qrel_boost'),
        ('BM25$_{RF}$', 'BM25+RF'),
        ('BM25$_{keyquery}$', 'keyquery'),
    ])

    # Whitespace-only entries reproduce the indented blank lines of the template.
    header = [
        '',
        '\\begin{tabular}{@{}l@{}cccccccccc@{}}',
        '    \\toprule',
        "    \\bfseries System & \\multicolumn{5}{c}{\\bfseries nDCG@10} & \\multicolumn{5}{c}{\\bfseries nDCG@10$^{'}$}\\\\",
        '    \\cmidrule(r@{.25em}){2-6}',
        '    \\cmidrule(l@{.25em}){7-11}',
        '    ',
        '    & 07/23 & 09/23 & 01/24 & 06/24 & 08/24 & 07/23 & 09/23 & 01/24 & 06/24 & 08/24\\\\',
        '    ',
        '    \\midrule',
        '',
    ]
    separator = ['    ', '    \\midrule', '    ']
    footer = ['', '\\bottomrule', '\\end{tabular}', '']

    print('\n'.join(header + upper_rows + separator + lower_rows + footer))

In [40]:
def _per_query_ndcg_at_10(trec_eval, remove_unjudged):
    """Return the per-query nDCG@10 values of ``trec_eval`` as a list, NaN replaced by 0.0.

    Args:
        trec_eval: A ``TrecEval`` instance.
        remove_unjudged: Forwarded as ``removeUnjudged`` to ``get_ndcg``.
    """
    per_query = trec_eval.get_ndcg(depth=10, removeUnjudged=remove_unjudged, per_query=True)['NDCG@10']
    return [0.0 if pd.isna(score) else score for score in per_query]


# Per-query scores of the two reference systems: baseline_scores[system][timestamp][measure].
baseline_scores = {'BM25': {}, 'MonoT5': {}}

for system in baseline_scores.keys():
    for timestamp in timestamp_to_ir_datasets_id:
        qrels = load_qrels(timestamp, 'no-split')
        run = load_run(system, timestamp, 'no-split')
        te = TrecEval(run, qrels)
        baseline_scores[system][timestamp] = {
            'ndcg': _per_query_ndcg_at_10(te, remove_unjudged=False),
            'ndcg_condensed': _per_query_ndcg_at_10(te, remove_unjudged=True),
        }

# One record per (system, timestamp); turned into a frame once at the end.
eval_rows = []

for system in tqdm(['castorini-list-in-t5-150', 'ColBERT', 'MonoT5', 'BM25', 'BM25+qrel_boost', 'BM25+RM3', 'BM25+RF', 'keyquery']):
    for timestamp in timestamp_to_ir_datasets_id:
        qrels = load_qrels(timestamp, 'no-split')
        run = load_run(system, timestamp, 'no-split')
        te = TrecEval(run, qrels)

        ndcg_scores = _per_query_ndcg_at_10(te, remove_unjudged=False)
        ndcg_condensed_scores = _per_query_ndcg_at_10(te, remove_unjudged=True)

        bm25_reference = baseline_scores['BM25'][timestamp]
        monot5_reference = baseline_scores['MonoT5'][timestamp]

        # NOTE(review): ttest_ind treats the two score lists as independent samples.
        # If both lists cover the same queries in the same order, a paired test
        # (scipy.stats.ttest_rel) may be the intended comparison — confirm before changing.
        eval_rows.append({
            'system': system,
            'nDCG': te.get_ndcg(depth=10, removeUnjudged=False),
            'nDCG(Condensed)': te.get_ndcg(depth=10, removeUnjudged=True),
            'timestamp': timestamp,
            'BM25_condensed_p_value': ttest_ind(bm25_reference['ndcg_condensed'], ndcg_condensed_scores).pvalue,
            'MonoT5_condensed_p_value': ttest_ind(monot5_reference['ndcg_condensed'], ndcg_condensed_scores).pvalue,

            'BM25_p_value': ttest_ind(bm25_reference['ndcg'], ndcg_scores).pvalue,
            'MonoT5_p_value': ttest_ind(monot5_reference['ndcg'], ndcg_scores).pvalue,
        })

df_eval = pd.DataFrame(eval_rows)


100%|██████████| 8/8 [01:15<00:00,  9.47s/it]


In [41]:
# Print the LaTeX results table for the evaluation frame built in the previous cell.
plot_table(df_eval)


\begin{tabular}{@{}l@{}cccccccccc@{}}
    \toprule
    \bfseries System & \multicolumn{5}{c}{\bfseries nDCG@10} & \multicolumn{5}{c}{\bfseries nDCG@10$^{'}$}\\
    \cmidrule(r@{.25em}){2-6}
    \cmidrule(l@{.25em}){7-11}
    
    & 07/23 & 09/23 & 01/24 & 06/24 & 08/24 & 07/23 & 09/23 & 01/24 & 06/24 & 08/24\\
    
    \midrule

    BM25 & .155$^{\phantom{\dagger\ddagger}}$ & .184$^{\phantom{\dagger\ddagger}}$ & .172$^{\phantom{\dagger\ddagger}}$ & .175$^{\phantom{\dagger\ddagger}}$ & .134$^{\phantom{\dagger\ddagger}}$ & .471$^{\phantom{\dagger\ddagger}}$ & .492$^{\ddagger\phantom{\dagger}}$ & .516$^{\ddagger\phantom{\dagger}}$ & .486$^{\ddagger\phantom{\dagger}}$ & .379$^{\ddagger\phantom{\dagger}}$ \\
    BM25$_{RM3}$ & .147$^{\ddagger\phantom{\dagger}}$ & .181$^{\phantom{\dagger\ddagger}}$ & .163$^{\phantom{\dagger\ddagger}}$ & .174$^{\phantom{\dagger\ddagger}}$ & .134$^{\phantom{\dagger\ddagger}}$ & .478$^{\ddagger\phantom{\dagger}}$ & .490$^{\ddagger\phantom{\dagger}}$ & .524$^{\