In [6]:
from tira.third_party_integrations import ir_datasets
from trectools import TrecQrel, TrecRun, TrecEval
import pandas as pd
import json
from glob import glob
from tqdm import tqdm
from statistics import mean
import math
from statistics import stdev
from scipy.stats import ttest_ind

timestamp_to_ir_datasets_id = {
    't1': 'longeval-short-july-20230513-training',
    't2': 'longeval-long-september-20230513-training',
    't3': 'longeval-2023-01-20240423-training',
    't4': 'longeval-2023-06-20240418-training',
    't5': 'longeval-2023-08-20240418-training',
}

docs_to_skip = {i: set() for i in ['t0', 't1', 't2', 't3', 't4', 't5']}

for _, i in pd.read_csv('../data/document-groups-judged-extended.csv.gz').iterrows():
    previously_labeled = False
    for k in ['t0', 't1', 't2', 't3', 't4', 't5']:
        if not str(i[k]).startswith('doc'):
            continue
        if previously_labeled:
            docs_to_skip[k].add(str(i[k]))
        previously_labeled = True

def load_qrels(timestamp):
    dataset = ir_datasets.load(f'ir-benchmarks/{timestamp_to_ir_datasets_id[timestamp]}')
    qrels = TrecQrel()
    qrels.qrels_data = []
    skipped = 0
    for qrel in dataset.qrels_iter():
        if qrel.doc_id in docs_to_skip[timestamp] or qrel.query_id not in TIMESTAMP_TO_QIDS[timestamp]:
            skipped += 1
            continue
        qrels.qrels_data += [{"query": qrel.query_id, "q0": 0, "docid": qrel.doc_id, "rel": qrel.relevance}]
    qrels.qrels_data = pd.DataFrame(qrels.qrels_data)
    print('Timestamp', timestamp, 'skipped:', skipped, 'remaining:', len(qrels.qrels_data))
    return qrels

def load_run(system, timestamp, split):
    if 'keyquery' in system:
        run_name = f'../src/keyqueries/{timestamp}/BM25-split-{split}.run.gz'
    elif 'rm3-BM25' in system:
        run_name = f'../src/keyqueries/{timestamp}/rm3-BM25-split-{split}.run.gz'
    elif 'castorini' in system or 'MonoT5' in system or 'ColBERT' in system:
        run_name = f'../data/results_baseline/{system}-{timestamp}-split-{split}.run.gz'
    else:
        run_name = glob(f'../data/results_full/{system}_{timestamp}_F*')
        assert len(run_name) == 1
        run_name = run_name[0]
    ret = TrecRun(run_name)
    if timestamp in TIMESTAMP_TO_QIDS:
        ret.run_data = ret.run_data[ret.run_data['query'].isin(TIMESTAMP_TO_QIDS[timestamp])]
    return ret

TIMESTAMP_TO_QIDS = {}

for timestamp in list(timestamp_to_ir_datasets_id):
    TIMESTAMP_TO_QIDS[timestamp] = set(load_run('keyquery', timestamp, '0').run_data['query'].unique())


In [12]:
df_eval = {}

for system in tqdm(['BM25', 'BM25+qrel_boost', 'BM25+RF', 'keyquery']):
    df_eval[system] = {}
    for timestamp in timestamp_to_ir_datasets_id:

        qrels = load_qrels(timestamp)
        run = load_run(system, timestamp, 'no-split')
        te = TrecEval(run, qrels)
        df_eval[system][timestamp] = {
            'ndcg': {q: i['NDCG@10'] for q, i in te.get_ndcg(depth=10, removeUnjudged=False, per_query=True).iterrows()},
            'ndcg_condensed': {q: i['NDCG@10'] for q, i in te.get_ndcg(depth=10, removeUnjudged=True, per_query=True).iterrows()}
        }


  0%|          | 0/4 [00:00<?, ?it/s]

Timestamp t1 skipped: 11843 remaining: 374
Timestamp t2 skipped: 12702 remaining: 765
Timestamp t3 skipped: 8186 remaining: 1599
Timestamp t4 skipped: 87067 remaining: 1234
Timestamp t5 skipped: 154142 remaining: 2028


 25%|██▌       | 1/4 [00:11<00:33, 11.29s/it]

Timestamp t1 skipped: 11843 remaining: 374
Timestamp t2 skipped: 12702 remaining: 765
Timestamp t3 skipped: 8186 remaining: 1599
Timestamp t4 skipped: 87067 remaining: 1234
Timestamp t5 skipped: 154142 remaining: 2028


 50%|█████     | 2/4 [00:25<00:25, 12.78s/it]

Timestamp t1 skipped: 11843 remaining: 374
Timestamp t2 skipped: 12702 remaining: 765
Timestamp t3 skipped: 8186 remaining: 1599
Timestamp t4 skipped: 87067 remaining: 1234
Timestamp t5 skipped: 154142 remaining: 2028


 75%|███████▌  | 3/4 [00:38<00:12, 12.97s/it]

Timestamp t1 skipped: 11843 remaining: 374
Timestamp t2 skipped: 12702 remaining: 765
Timestamp t3 skipped: 8186 remaining: 1599
Timestamp t4 skipped: 87067 remaining: 1234
Timestamp t5 skipped: 154142 remaining: 2028


100%|██████████| 4/4 [00:44<00:00, 11.03s/it]


In [13]:
df = []

for system in tqdm(['BM25+qrel_boost', 'BM25+RF', 'keyquery']):    
    for timestamp in timestamp_to_ir_datasets_id:
        i = {'system': system, 'timestamp': timestamp}
        for measure in ['ndcg_condensed']:
            baseline_scores = df_eval['BM25'][timestamp][measure]
            system_scores = df_eval[system][timestamp][measure]
            score_diffs = []

            for topic in baseline_scores:
                baseline_score = baseline_scores[topic]
                system_score = system_scores[topic]

                if math.isnan(baseline_score):
                    continue
                if math.isnan(system_score):
                    continue

                score_diffs += [system_score-baseline_score]
            i[measure] = mean(score_diffs)
            i[measure + ' (std-dev)'] = stdev(score_diffs)
            i[measure + ' (p-value)'] = ttest_ind(score_diffs, [0 for i in score_diffs]).pvalue
            
        df += [i]

df = pd.DataFrame(df)
df

100%|██████████| 3/3 [00:00<00:00, 101.92it/s]


Unnamed: 0,system,timestamp,ndcg_condensed,ndcg_condensed (std-dev),ndcg_condensed (p-value)
0,BM25+qrel_boost,t1,0.0,0.0,
1,BM25+qrel_boost,t2,0.0,0.0,
2,BM25+qrel_boost,t3,0.0,0.0,
3,BM25+qrel_boost,t4,0.0,0.0,
4,BM25+qrel_boost,t5,0.0,0.0,
5,BM25+RF,t1,-0.034338,0.110895,0.053766
6,BM25+RF,t2,0.000365,0.134765,0.979165
7,BM25+RF,t3,0.022368,0.145804,0.068592
8,BM25+RF,t4,0.01241,0.080714,0.155737
9,BM25+RF,t5,0.006177,0.145822,0.672298


In [14]:
# we compare three systems against the baseline
P_VALUE_CORRECTED = 0.05/3

def table_line(df, approach):
    df = df[df['system'] == approach]
    ret = []
    for measure in ['ndcg_condensed']:
        for timestamp in ['t1', 't2', 't3', 't4', 't5']:
            score = df[df['timestamp'] == timestamp]
            assert len(score) == 1
            std_dev = score.iloc[0][measure + ' (std-dev)']
            p_value = score.iloc[0][measure + ' (p-value)']
            score = score.iloc[0][measure]
            style = '^{\\phantom{*}}' if math.isnan(p_value) or p_value > P_VALUE_CORRECTED else '^{*}'

            ret += [('+' if score >= 0 else '') + '{:.3f}'.format(score) + '$'+ style +'_{\\color{gray}\\pm' + '{:.3f}'.format(std_dev).replace('0.', '.') + '}$']
    return ' & '.join(ret)

def plot_table(df):
    return print('''

\\begin{tabular}{@{}lcccccccccc@{}}
    \\toprule
    \\bfseries System & \\bfseries 07/23 & \\bfseries 09/23 & \\bfseries 01/24 & \\bfseries 06/24 & \\bfseries 08/24 \\\\
    
    \\midrule

    
    BM25$_{Boost}$ & ''' + table_line(df, 'BM25+qrel_boost') + ''' \\\\
    BM25$_{RF}$ & ''' + table_line(df, 'BM25+RF') + ''' \\\\
    BM25$_{keyquery}$ & ''' + table_line(df, 'keyquery') + ''' \\\\

\\bottomrule
\\end{tabular}
''')

plot_table(df)



\begin{tabular}{@{}lcccccccccc@{}}
    \toprule
    \bfseries System & \bfseries 07/23 & \bfseries 09/23 & \bfseries 01/24 & \bfseries 06/24 & \bfseries 08/24 \\
    
    \midrule

    
    BM25$_{Boost}$ & +0.000$^{\phantom{*}}_{\color{gray}\pm.000}$ & +0.000$^{\phantom{*}}_{\color{gray}\pm.000}$ & +0.000$^{\phantom{*}}_{\color{gray}\pm.000}$ & +0.000$^{\phantom{*}}_{\color{gray}\pm.000}$ & +0.000$^{\phantom{*}}_{\color{gray}\pm.000}$ \\
    BM25$_{RF}$ & -0.034$^{\phantom{*}}_{\color{gray}\pm.111}$ & +0.000$^{\phantom{*}}_{\color{gray}\pm.135}$ & +0.022$^{\phantom{*}}_{\color{gray}\pm.146}$ & +0.012$^{\phantom{*}}_{\color{gray}\pm.081}$ & +0.006$^{\phantom{*}}_{\color{gray}\pm.146}$ \\
    BM25$_{keyquery}$ & -0.010$^{\phantom{*}}_{\color{gray}\pm.084}$ & +0.012$^{\phantom{*}}_{\color{gray}\pm.153}$ & +0.032$^{*}_{\color{gray}\pm.105}$ & -0.001$^{\phantom{*}}_{\color{gray}\pm.065}$ & +0.002$^{\phantom{*}}_{\color{gray}\pm.085}$ \\

\bottomrule
\end{tabular}

