In [2]:
import pandas as pd
import sys
sys.path.append('../python')
from evaluation_util import evaluate_on_original_pool_only
from trectools import TrecQrel, TrecRun, TrecEval
from tqdm import tqdm
from cross_validation_util import cross_validation_experiment
from statistics import mean
from parametrized_bootstrapping_model import BootstrappingBySelectingMostLikelyDataPoint, FixedQuantileBootstrappingModel

def qrels_dict(f):
    ret = {}
    for _, l in tqdm(pd.read_csv(f, sep='\t').iterrows()):
        internal_id = (int(l['query_id']), l['doc_id'])
        if internal_id in ret:
            print(f'Duplicate {internal_id}')
            #raise ValueError('Can not happen')

        ret[internal_id] = int(l['score'])
    return ret

def load_qrels(with_additional=False):
    orig_qrels_dict = qrels_dict('../resources/beir/trec-covid-beir/qrels/test.tsv')
    add_qrels_dict = qrels_dict('../resources/unprocessed/topics-and-qrels/qrels.trec-covid-additional-judgments.tsv')
    
    df_ret = []
    for (qid, doc_id), score in orig_qrels_dict.items():
        if (qid, doc_id) in add_qrels_dict:
            continue
        
        df_ret += [{"query": str(qid),"q0": "0","docid": doc_id,"rel": score}]

    if with_additional:
        for (qid, doc_id), score in add_qrels_dict.items():
            df_ret += [{"query": str(qid),"q0": "Q0","docid": doc_id,"rel": score}]
    
    df_ret = pd.DataFrame(df_ret)
    
    ret = TrecQrel()
    ret.qrels_data = df_ret
    
    return ret

qrels_complete = TrecQrel('beir-evaluation-data/complete-annotation-trec-covid.txt')
qrels_with_unjudged = TrecQrel('beir-evaluation-data/incomplete-beir-trec-covid.txt')

print(len(qrels_with_unjudged.qrels_data))
print(len(qrels_complete.qrels_data))

runs = !ls beir-evaluation-data/runs/
runs = [i.split('-run.txt')[0] for i in runs]
runs = {i: TrecRun(f'beir-evaluation-data/runs/{i}-run.txt') for i in tqdm(runs)}


66334
67314


100%|██████████| 4/4 [00:00<00:00,  7.99it/s]


In [4]:
runs.keys()

dict_keys(['ance-09-01-2023', 'BBGhelani2', 'colbert-ranking-26-12-2022', 'tas-b-09-01-2023'])

In [11]:
def run_to_unjudged_docs_per_query(run):
    return {i['query']: i['UNJ@10'] for _, i in TrecEval(run, qrels_with_unjudged).get_unjudged(depth=10, per_query=True).reset_index().iterrows()}

def topics_with_unjudged_between(system, min_unjudged, max_unjudged):
    ret = run_to_unjudged_docs_per_query(runs[system])
    
    return [int(k) for k, v in ret.items() if v >= min_unjudged and v <= max_unjudged]

topics_with_unjudged_between('ance-09-01-2023', 0.5, 2)

topics_with_unjudged_between('ance-09-01-2023', 0.25, 0.4999999)

[12, 17, 22, 29, 32, 33, 35, 43, 9]

In [15]:
len(qrels_with_unjudged.qrels_data['query'].unique())

50

In [21]:
def eval_run(run_name, run, topics_to_filter, description):
    
    qrels_with_unjudged_filtered = TrecQrel()
    qrels_with_unjudged_filtered.qrels_data = qrels_with_unjudged.qrels_data.copy()
    
    qrels_complete_filtered = TrecQrel()
    qrels_complete_filtered.qrels_data = qrels_complete.qrels_data.copy()
    
    if topics_to_filter:
        qrels_with_unjudged_filtered.qrels_data = qrels_with_unjudged_filtered.qrels_data[qrels_with_unjudged_filtered.qrels_data['query'].astype(int).isin(topics_to_filter)]
        qrels_complete_filtered.qrels_data = qrels_complete_filtered.qrels_data[qrels_complete_filtered.qrels_data['query'].astype(int).isin(topics_to_filter)]
        run_data = run.run_data.copy()
        run = TrecRun()
        run.run_data = run_data[run_data['query'].astype(int).isin(topics_to_filter)]
        
    return {
        'system': run_name,
        'unjudged@10 (incomplete)': TrecEval(run, qrels_with_unjudged_filtered).get_unjudged(depth=10),
        'ndcg@10 (incomplete)': TrecEval(run, qrels_with_unjudged_filtered).get_ndcg(depth=10),
        'ndcg@10 (incomplete, removeUnjudged)': TrecEval(run, qrels_with_unjudged_filtered).get_ndcg(depth=10, removeUnjudged=True),
        'unjudged@10 (complete)': TrecEval(run, qrels_complete_filtered).get_unjudged(depth=10),
        'ndcg@10 (complete)': TrecEval(run, qrels_complete_filtered).get_ndcg(depth=10),
        'Number of Topics (unjudged)': len(qrels_with_unjudged_filtered.qrels_data['query'].unique()),
        'Number of Topics (judged)': len(qrels_complete_filtered.qrels_data['query'].unique()),
        'Description': description,
    }
    
df_eval = []
for run_name, run in tqdm(runs.items()):
    df_eval += [eval_run(run_name, run, [], 'All Topics')]
    df_eval += [eval_run(run_name, run, topics_with_unjudged_between(run_name, 0.25, 0.4999999), '25% to 50% unjudged')]
    df_eval += [eval_run(run_name, run, topics_with_unjudged_between(run_name, 0.5, 2), '50% to 100% unjudged')]

df_eval = pd.DataFrame(df_eval)
df_eval

100%|██████████| 4/4 [00:03<00:00,  1.18it/s]


Unnamed: 0,system,unjudged@10 (incomplete),ndcg@10 (incomplete),"ndcg@10 (incomplete, removeUnjudged)",unjudged@10 (complete),ndcg@10 (complete),Number of Topics (unjudged),Number of Topics (judged),Description
0,ance-09-01-2023,0.224,0.652447,0.772475,0.002,0.734673,50,50,All Topics
1,ance-09-01-2023,0.355556,0.489424,0.682926,0.0,0.650143,9,9,25% to 50% unjudged
2,ance-09-01-2023,0.655556,0.206787,0.547147,0.011111,0.35724,9,9,50% to 100% unjudged
3,BBGhelani2,0.0,0.678996,0.678996,0.0,0.678996,50,50,All Topics
4,BBGhelani2,0.0,0.678996,0.678996,0.0,0.678996,50,50,25% to 50% unjudged
5,BBGhelani2,0.0,0.678996,0.678996,0.0,0.678996,50,50,50% to 100% unjudged
6,colbert-ranking-26-12-2022,0.172,0.679529,0.769887,0.016,0.733639,50,50,All Topics
7,colbert-ranking-26-12-2022,0.333333,0.48511,0.640822,0.0,0.625857,6,6,25% to 50% unjudged
8,colbert-ranking-26-12-2022,0.628571,0.336554,0.678858,0.1,0.447473,7,7,50% to 100% unjudged
9,tas-b-09-01-2023,0.41,0.481254,0.704533,0.0,0.555436,50,50,All Topics


In [14]:
df_eval.sort_values('ndcg@10 (incomplete)', ascending=False).reset_index()

Unnamed: 0,index,system,unjudged@10 (incomplete),ndcg@10 (incomplete),"ndcg@10 (incomplete, removeUnjudged)",unjudged@10 (complete),ndcg@10 (complete)
0,2,colbert-ranking-26-12-2022,0.172,0.679529,0.769887,0.016,0.733639
1,1,BBGhelani2,0.0,0.678996,0.678996,0.0,0.678996
2,0,ance-09-01-2023,0.224,0.652447,0.772475,0.002,0.734673
3,3,tas-b-09-01-2023,0.41,0.481254,0.704533,0.0,0.555436


In [15]:
ret = []

for measure in ['residual-ndcg@10', 'bs-run-and-pool-dependent-1000-ndcg@10']:
    for system in ['colbert-ranking-26-12-2022', 'ance-09-01-2023', 'tas-b-09-01-2023']:
        ret += [evaluate_on_original_pool_only(
            run_file=f'beir-evaluation-data/runs/{system}-run.txt', 
            qrel_file='beir-evaluation-data/incomplete-beir-trec-covid.txt', 
            measure=measure
        )]


Bootstrapping:   0%|          | 0/50 [00:00<?, ?it/s][A
Bootstrapping:   2%|▏         | 1/50 [00:09<07:50,  9.60s/it][A
Bootstrapping:   4%|▍         | 2/50 [00:18<07:17,  9.12s/it][A
Bootstrapping:   6%|▌         | 3/50 [00:27<07:15,  9.26s/it][A
Bootstrapping:   8%|▊         | 4/50 [00:36<06:59,  9.12s/it][A
Bootstrapping:  10%|█         | 5/50 [00:47<07:12,  9.61s/it][A
Bootstrapping:  12%|█▏        | 6/50 [00:56<06:55,  9.44s/it][A
Bootstrapping:  14%|█▍        | 7/50 [01:04<06:24,  8.95s/it][A
Bootstrapping:  16%|█▌        | 8/50 [01:12<06:01,  8.60s/it][A
Bootstrapping:  18%|█▊        | 9/50 [01:19<05:41,  8.32s/it][A
Bootstrapping:  20%|██        | 10/50 [01:27<05:25,  8.15s/it][A
Bootstrapping:  22%|██▏       | 11/50 [01:35<05:11,  7.99s/it][A
Bootstrapping:  24%|██▍       | 12/50 [01:42<05:00,  7.90s/it][A
Bootstrapping:  26%|██▌       | 13/50 [01:51<04:59,  8.11s/it][A
Bootstrapping:  28%|██▊       | 14/50 [02:00<04:59,  8.33s/it][A
Bootstrapping:  30%|███    

In [16]:
import json
json.dump([{'depth-10-incomplete': i['complete-pool-depth-all']} for i in ret], open('beir-evaluation-data/beir-bootstrapping-results.jsonl', 'w'))

In [23]:
import json
tmp = json.load(open('beir-evaluation-data/beir-bootstrapping-results.jsonl', 'r'))

In [37]:
def predict_bs_most_likely(i, description, topics_to_include=None):
    i = i['depth-10-incomplete']
    model = BootstrappingBySelectingMostLikelyDataPoint('bs-run-and-pool-dependent-1000-ndcg@10')
    ret = []
    for l in i:
        if topics_to_include and int(l['query']) not in topics_to_include:
            continue

        t = model.predict([l['ndcg@10']])
        assert len(t) == 1
        
        ret += [t[0]]
    
    print(f'run-and-pool-based-bs-most-likely ({description}: {len(ret)} Topics):', i[0]['run_file'], ':', mean(ret))

def predict_bs_95_percent_quantile(i, description, topics_to_include=None):
    i = i['depth-10-incomplete']
    model = FixedQuantileBootstrappingModel('x', 95)
    ret = []
    num_topics = 0
    for l in i:
        if topics_to_include and int(l['query']) not in topics_to_include:
            continue

        t = model.predict([l['ndcg@10']])
        assert len(t) == 1
        
        ret += [t[0]]
    
    print(f'run-and-pool-based-bs-95-percent ({description}: {len(ret)} Topics):', i[0]['run_file'], ':', mean(ret))

def predict_max_residual(i, description, topics_to_include=None):
    i = i['depth-10-incomplete']
    ret = []
    num_topics = 0
    for l in i:
        if topics_to_include and int(l['query']) not in topics_to_include:
            continue

        ret += [l['MAX-NDCG@10']]
    
    print(f'Max-residual ({description}: {len(ret)} Topics):', (i[0]['run_file']), ':', mean(ret))


In [38]:
predict_bs_most_likely(tmp[3], 'All')
predict_bs_95_percent_quantile(tmp[3], 'All')
predict_max_residual(tmp[0], 'All')

print('\n\n')

predict_bs_most_likely(tmp[3], '25% to 50% unjudged', topics_with_unjudged_between('colbert-ranking-26-12-2022', 0.25, 0.4999999))
predict_bs_95_percent_quantile(tmp[3], '25% to 50% unjudged', topics_with_unjudged_between('colbert-ranking-26-12-2022', 0.25, 0.4999999))
predict_max_residual(tmp[0], '25% to 50% unjudged', topics_with_unjudged_between('colbert-ranking-26-12-2022', 0.25, 0.4999999))

print('\n\n')

predict_bs_most_likely(tmp[3], '>= 50% unjudged', topics_with_unjudged_between('colbert-ranking-26-12-2022', 0.5, 2))
predict_bs_95_percent_quantile(tmp[3], '>= 50% unjudged', topics_with_unjudged_between('colbert-ranking-26-12-2022', 0.5, 2))
predict_max_residual(tmp[0], '>= 50% unjudged', topics_with_unjudged_between('colbert-ranking-26-12-2022', 0.5, 2))

run-and-pool-based-bs-most-likely (All: 50 Topics): beir-evaluation-data/runs/colbert-ranking-26-12-2022-run.txt : 0.7409517338949408
run-and-pool-based-bs-95-percent (All: 50 Topics): beir-evaluation-data/runs/colbert-ranking-26-12-2022-run.txt : 0.7885473343788988
Max-residual (All: 50 Topics): beir-evaluation-data/runs/colbert-ranking-26-12-2022-run.txt : 0.8258838045277852



run-and-pool-based-bs-most-likely (25% to 50% unjudged: 6 Topics): beir-evaluation-data/runs/colbert-ranking-26-12-2022-run.txt : 0.6144498142386656
run-and-pool-based-bs-95-percent (25% to 50% unjudged: 6 Topics): beir-evaluation-data/runs/colbert-ranking-26-12-2022-run.txt : 0.741200372569417
Max-residual (25% to 50% unjudged: 6 Topics): beir-evaluation-data/runs/colbert-ranking-26-12-2022-run.txt : 0.7702190333334451



run-and-pool-based-bs-most-likely (>= 50% unjudged: 7 Topics): beir-evaluation-data/runs/colbert-ranking-26-12-2022-run.txt : 0.5168805655932145
run-and-pool-based-bs-95-percent (>= 50% unju

In [40]:
predict_bs_most_likely(tmp[4], 'All')
predict_bs_95_percent_quantile(tmp[4], 'All')
predict_max_residual(tmp[1], 'All')

print('\n\n')

predict_bs_most_likely(tmp[4], '25% to 50% unjudged',  topics_with_unjudged_between('ance-09-01-2023', 0.25, 0.4999999))
predict_bs_95_percent_quantile(tmp[4], '25% to 50% unjudged',  topics_with_unjudged_between('ance-09-01-2023', 0.25, 0.4999999))
predict_max_residual(tmp[1], '25% to 50% unjudged',  topics_with_unjudged_between('ance-09-01-2023', 0.25, 0.4999999))

print('\n\n')

predict_bs_most_likely(tmp[4], '>= 50% unjudged', topics_with_unjudged_between('ance-09-01-2023', 0.5, 2))
predict_bs_95_percent_quantile(tmp[4], '>= 50% unjudged', topics_with_unjudged_between('ance-09-01-2023', 0.5, 2))
predict_max_residual(tmp[1], '>= 50% unjudged', topics_with_unjudged_between('ance-09-01-2023', 0.5, 2))

run-and-pool-based-bs-most-likely (All: 50 Topics): beir-evaluation-data/runs/ance-09-01-2023-run.txt : 0.7474231965765935
run-and-pool-based-bs-95-percent (All: 50 Topics): beir-evaluation-data/runs/ance-09-01-2023-run.txt : 0.8041515476389878
Max-residual (All: 50 Topics): beir-evaluation-data/runs/ance-09-01-2023-run.txt : 0.8527441906345933



run-and-pool-based-bs-most-likely (25% to 50% unjudged: 9 Topics): beir-evaluation-data/runs/ance-09-01-2023-run.txt : 0.6600858116557217
run-and-pool-based-bs-95-percent (25% to 50% unjudged: 9 Topics): beir-evaluation-data/runs/ance-09-01-2023-run.txt : 0.7954460614144012
Max-residual (25% to 50% unjudged: 9 Topics): beir-evaluation-data/runs/ance-09-01-2023-run.txt : 0.8383318615648692



run-and-pool-based-bs-most-likely (>= 50% unjudged: 9 Topics): beir-evaluation-data/runs/ance-09-01-2023-run.txt : 0.3845120388175058
run-and-pool-based-bs-95-percent (>= 50% unjudged: 9 Topics): beir-evaluation-data/runs/ance-09-01-2023-run.txt : 0.54196

In [44]:
predict_bs_most_likely(tmp[5], 'All')
predict_bs_95_percent_quantile(tmp[5], 'All')
predict_max_residual(tmp[2], 'All')

print('\n\n')

predict_bs_most_likely(tmp[5], '25% to 50% unjudged', topics_with_unjudged_between('tas-b-09-01-2023', 0.25, 0.4999999))
predict_bs_95_percent_quantile(tmp[5], '25% to 50% unjudged', topics_with_unjudged_between('tas-b-09-01-2023', 0.25, 0.4999999))
predict_max_residual(tmp[2], '25% to 50% unjudged', topics_with_unjudged_between('tas-b-09-01-2023', 0.25, 0.4999999))

print('\n\n')

predict_bs_most_likely(tmp[5], '>= 50% unjudged', topics_with_unjudged_between('tas-b-09-01-2023', 0.5, 2))
predict_bs_95_percent_quantile(tmp[5], '>= 50% unjudged', topics_with_unjudged_between('tas-b-09-01-2023', 0.5, 2))
predict_max_residual(tmp[2], '>= 50% unjudged', topics_with_unjudged_between('tas-b-09-01-2023', 0.5, 2))

run-and-pool-based-bs-most-likely (All: 50 Topics): beir-evaluation-data/runs/tas-b-09-01-2023-run.txt : 0.6333194443585958
run-and-pool-based-bs-95-percent (All: 50 Topics): beir-evaluation-data/runs/tas-b-09-01-2023-run.txt : 0.7285839535510149
Max-residual (All: 50 Topics): beir-evaluation-data/runs/tas-b-09-01-2023-run.txt : 0.8705287615794982



run-and-pool-based-bs-most-likely (25% to 50% unjudged: 4 Topics): beir-evaluation-data/runs/tas-b-09-01-2023-run.txt : 0.8472441247665643
run-and-pool-based-bs-95-percent (25% to 50% unjudged: 4 Topics): beir-evaluation-data/runs/tas-b-09-01-2023-run.txt : 0.8937956298748013
Max-residual (25% to 50% unjudged: 4 Topics): beir-evaluation-data/runs/tas-b-09-01-2023-run.txt : 0.9017482283996704



run-and-pool-based-bs-most-likely (>= 50% unjudged: 24 Topics): beir-evaluation-data/runs/tas-b-09-01-2023-run.txt : 0.4586411896115204
run-and-pool-based-bs-95-percent (>= 50% unjudged: 24 Topics): beir-evaluation-data/runs/tas-b-09-01-2023-run.txt