In [2]:
import pyterrier as pt
from tqdm import tqdm

# Call pt.init() only when PyTerrier reports that it has not been started yet,
# so re-running this cell does not initialise it a second time.
if not pt.started():
    pt.init()

# Marker inside a qid: the text before it is used as the query id, and
# variant rows are selected by the suffix f'{sep}{run_id}'.
sep = '___'

def to_trec_run(r):
    """Wrap a results frame in a ``trectools.TrecRun``.

    The frame is copied first, so the caller's data is left untouched. Two
    columns are added to the copy:

    * ``query`` -- the ``qid`` value up to (not including) the first
      occurrence of the module-level ``sep``; a ``qid`` that does not
      contain ``sep`` is kept as is.
    * ``docid`` -- a copy of the ``docno`` column.

    Parameters
    ----------
    r : pandas.DataFrame
        Results with a string column ``qid`` and a column ``docno``.

    Returns
    -------
    trectools.TrecRun
        Run whose ``run_data`` is the augmented copy of ``r``.
    """
    from trectools import TrecRun

    tr = TrecRun()
    tr.run_data = r.copy()

    # partition() stops at the first separator, which yields the same head
    # as split(sep)[0] without building the full list of pieces.
    tr.run_data['query'] = tr.run_data['qid'].map(lambda qid: qid.partition(sep)[0])
    tr.run_data['docid'] = tr.run_data['docno']

    print(f'Created run with {len(tr.run_data)} lines')

    return tr
    
def reciprocal_renk_fusion(input_run, output_run, num_runs=20):
    """Fuse the rankings stored in one run file with reciprocal rank fusion.

    The file at ``input_run`` is read with ``pt.io.read_results`` and split
    into several runs by ``qid``:

    * rows whose ``qid`` does not contain ``sep`` form the first run;
    * for every ``i`` in ``0..num_runs`` (inclusive), rows whose ``qid`` ends
      with ``f'{sep}{i}'`` form one further run; ids without rows are skipped.

    All runs are passed to ``trectools.fusion.reciprocal_rank_fusion`` and the
    fused result is written to ``output_run`` with ``pt.io.write_results``.

    Parameters
    ----------
    input_run : str
        Path of the run file to read.
    output_run : str
        Path of the fused run file to write. Its parent directory is created
        if it does not exist yet.
    num_runs : int, optional
        Highest run id (qid suffix) that is looked for.

    Raises
    ------
    AssertionError
        If no row has a ``qid`` without ``sep``.
    """
    import os
    from trectools import fusion

    run = pt.io.read_results(input_run)
    print(f'Process run with {len(run)} lines.')

    qids = run['qid']

    # regex=False: the separator is matched literally, so a separator that
    # contains regex metacharacters cannot change which rows are selected.
    all_runs = [to_trec_run(run[~qids.str.contains(sep, regex=False)])]
    assert len(all_runs[0].run_data) > 0, 'no rows without the separator in qid'

    for run_id in range(num_runs + 1):
        variant_rows = run[qids.str.endswith(f'{sep}{run_id}')]

        if len(variant_rows) < 1:
            continue

        all_runs.append(to_trec_run(variant_rows))
        print(f'Run with id {run_id} has {len(variant_rows)} documents')

    print(f'Fuse {len(all_runs)} runs')
    fused_run = fusion.reciprocal_rank_fusion(all_runs).run_data

    # Rename the trectools columns back to the names used when reading the
    # run; pop + assign keeps the resulting column order of the frame.
    fused_run['qid'] = fused_run.pop('query')
    fused_run['docno'] = fused_run.pop('docid')

    # Make sure the target directory exists so the write cannot fail after
    # the fusion has already been computed.
    out_dir = os.path.dirname(output_run)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    pt.io.write_results(fused_run, output_run)

In [5]:
# Settings that select the single run file to fuse in this cell.
dataset = 'longeval-heldout-20230513-training'
wmodel, variants, prompt = 'BM25', '10', '1'

# Input and output paths are derived from the settings above.
input_run = f'{dataset}/query-variant-runs/{wmodel}-{variants}-variants-prompt-{prompt}-run.txt.gz'
output_run = f'all-results/{dataset}/{wmodel}-{variants}-variants-prompt-{prompt}-fused-run.txt.gz'

reciprocal_renk_fusion(input_run, output_run)

Process run with 549037 lines.
Created run with 97037 lines
Created run with 51000 lines
Run with id 0 has 51000 documents
Created run with 51000 lines
Run with id 1 has 51000 documents
Created run with 51000 lines
Run with id 2 has 51000 documents
Created run with 51000 lines
Run with id 3 has 51000 documents
Created run with 51000 lines
Run with id 4 has 51000 documents
Created run with 51000 lines
Run with id 5 has 51000 documents
Created run with 49000 lines
Run with id 6 has 49000 documents
Created run with 40000 lines
Run with id 7 has 40000 documents
Created run with 33000 lines
Run with id 8 has 33000 documents
Created run with 24000 lines
Run with id 9 has 24000 documents
Fuse 11 runs


In [3]:

# Values of the grid; every combination names one run file that is fused.
datasets = ['longeval-train-20230513-training']
wmodels = ['BM25', 'DPH', 'DirichletLM', 'LGD', 'PL2']
prompts = ['1', '2']
variant_counts = ['3', '5', '10']

for dataset in datasets:
    for wmodel in wmodels:
        for prompt in prompts:
            for variants in variant_counts:
                input_run = f'{dataset}/query-variant-runs/{wmodel}-{variants}-variants-prompt-{prompt}-run.txt.gz'
                output_run = f'all-results/{dataset}/{wmodel}-{variants}-variants-prompt-{prompt}-fused-run.txt.gz'

                reciprocal_renk_fusion(input_run, output_run)

Process run with 1666455 lines.
Created run with 658455 lines
Created run with 336000 lines
Run with id 0 has 336000 documents
Created run with 336000 lines
Run with id 1 has 336000 documents
Created run with 336000 lines
Run with id 2 has 336000 documents
Fuse 4 runs
Process run with 2338402 lines.
Created run with 658455 lines
Created run with 336000 lines
Run with id 0 has 336000 documents
Created run with 336000 lines
Run with id 1 has 336000 documents
Created run with 336000 lines
Run with id 2 has 336000 documents
Created run with 336000 lines
Run with id 3 has 336000 documents
Created run with 335947 lines
Run with id 4 has 335947 documents
Fuse 6 runs
Process run with 3556964 lines.
Created run with 658455 lines
Created run with 336000 lines
Run with id 0 has 336000 documents
Created run with 336000 lines
Run with id 1 has 336000 documents
Created run with 336000 lines
Run with id 2 has 336000 documents
Created run with 336000 lines
Run with id 3 has 336000 documents
Created ru

Created run with 513158 lines
Run with id 8 has 513158 documents
Created run with 513000 lines
Run with id 9 has 513000 documents
Fuse 11 runs
Process run with 1666455 lines.
Created run with 658455 lines
Created run with 336000 lines
Run with id 0 has 336000 documents
Created run with 336000 lines
Run with id 1 has 336000 documents
Created run with 336000 lines
Run with id 2 has 336000 documents
Fuse 4 runs
Process run with 2338402 lines.
Created run with 658455 lines
Created run with 336000 lines
Run with id 0 has 336000 documents
Created run with 336000 lines
Run with id 1 has 336000 documents
Created run with 336000 lines
Run with id 2 has 336000 documents
Created run with 336000 lines
Run with id 3 has 336000 documents
Created run with 335947 lines
Run with id 4 has 335947 documents
Fuse 6 runs
Process run with 3556964 lines.
Created run with 658455 lines
Created run with 336000 lines
Run with id 0 has 336000 documents
Created run with 336000 lines
Run with id 1 has 336000 documen

In [14]:
# Number of distinct qids in the fused run file written for the held-out dataset.
fused_run_path = 'all-results/longeval-heldout-20230513-training/BM25-10-variants-prompt-1-fused-run.txt.gz'
len(pt.io.read_results(fused_run_path)['qid'].unique())

98

In [4]:
# Prints a fixed marker when this cell is executed.
print('Done')

Done
