In [2]:
import pandas as pd
import sys
sys.path.append('../python')
from evaluation_util import evaluate_on_original_pool_only
from trectools import TrecQrel, TrecRun, TrecEval
from tqdm import tqdm
from cross_validation_util import cross_validation_experiment
from statistics import mean
from parametrized_bootstrapping_model import BootstrappingBySelectingMostLikelyDataPoint, FixedQuantileBootstrappingModel

def qrels_dict(f):
    """Read a tab-separated qrels file into a ``{(query_id, doc_id): score}`` dict.

    The file at ``f`` is parsed with pandas and must provide the columns
    ``query_id``, ``doc_id`` and ``score``. Query ids and scores are cast to
    ``int``; document ids are kept as pandas parsed them.

    A (query_id, doc_id) pair that occurs more than once is reported on stdout
    and the later row overwrites the earlier one.
    """
    judgments = {}
    rows = pd.read_csv(f, sep='\t').iterrows()
    for _, row in tqdm(rows):
        key = (int(row['query_id']), row['doc_id'])
        if key in judgments:
            # Duplicates are only reported; they are not treated as an error.
            print(f'Duplicate {key}')

        judgments[key] = int(row['score'])
    return judgments

def load_qrels(with_additional=False):
    """Assemble a ``TrecQrel`` from the original and the additional judgment files.

    Original judgments whose (query, document) pair also appears in the
    additional-judgments file are always dropped from the original set.

    Args:
        with_additional: when true, every additional judgment is appended
            after the remaining original judgments; when false, only the
            remaining original judgments are returned.

    Returns:
        A ``TrecQrel`` whose ``qrels_data`` is a DataFrame with the columns
        ``query`` (str), ``q0``, ``docid`` and ``rel``.
    """
    orig_qrels_dict = qrels_dict('../resources/beir/trec-covid-beir/qrels/test.tsv')
    add_qrels_dict = qrels_dict('../resources/unprocessed/topics-and-qrels/qrels.trec-covid-additional-judgments.tsv')

    # One constant for the iteration column so that original and additional
    # rows carry the same value ("0" is what qrels files conventionally use).
    q0 = "0"

    rows = [
        {"query": str(qid), "q0": q0, "docid": doc_id, "rel": score}
        for (qid, doc_id), score in orig_qrels_dict.items()
        if (qid, doc_id) not in add_qrels_dict
    ]

    if with_additional:
        rows.extend(
            {"query": str(qid), "q0": q0, "docid": doc_id, "rel": score}
            for (qid, doc_id), score in add_qrels_dict.items()
        )

    # Explicit columns keep the schema stable even when no row was produced.
    df_ret = pd.DataFrame(rows, columns=["query", "q0", "docid", "rel"])

    ret = TrecQrel()
    ret.qrels_data = df_ret

    return ret

# Two judgment sets for the same collection: the complete annotation and the
# incomplete one that still leaves some retrieved documents unjudged.
qrels_complete = TrecQrel('beir-evaluation-data/complete-annotation-trec-covid.txt')
qrels_with_unjudged = TrecQrel('beir-evaluation-data/incomplete-beir-trec-covid.txt')

# Row counts, incomplete set first and complete set second.
for qrels_pool in (qrels_with_unjudged, qrels_complete):
    print(len(qrels_pool.qrels_data))

66334
67314


In [7]:
!mkdir beir-evaluation-data/runs/

In [11]:
# Copy the normalized tas-b run file into the local runs directory so that it
# is listed together with the other run files found there.
!cp ../resources/processed/normalized-runs/trec-system-runs/trec-covid/tas-b-09-01-2023-run.txt  beir-evaluation-data/runs/

In [13]:
runs = !ls beir-evaluation-data/runs/
runs = [i.split('-run.txt')[0] for i in runs]
runs = {i: TrecRun(f'beir-evaluation-data/runs/{i}-run.txt') for i in tqdm(runs)}

def eval_run(run_name, run):
    """Score one run at depth 10 against the incomplete and the complete qrels.

    Args:
        run_name: label stored under the ``system`` key.
        run: the run object handed to ``TrecEval``.

    Returns:
        A dict with the system label, the unjudged rate and nDCG at depth 10
        on the incomplete qrels (nDCG both with and without
        ``removeUnjudged=True``), and the unjudged rate and nDCG at depth 10
        on the complete qrels.
    """
    # A fresh evaluator is built for every single metric.
    def on_incomplete():
        return TrecEval(run, qrels_with_unjudged)

    def on_complete():
        return TrecEval(run, qrels_complete)

    scores = {'system': run_name}
    scores['unjudged@10 (incomplete)'] = on_incomplete().get_unjudged(depth=10)
    scores['ndcg@10 (incomplete)'] = on_incomplete().get_ndcg(depth=10)
    scores['ndcg@10 (incomplete, removeUnjudged)'] = on_incomplete().get_ndcg(depth=10, removeUnjudged=True)
    scores['unjudged@10 (complete)'] = on_complete().get_unjudged(depth=10)
    scores['ndcg@10 (complete)'] = on_complete().get_ndcg(depth=10)
    return scores
    
# One row per loaded run, in the iteration order of `runs`.
df_eval = pd.DataFrame(
    [eval_run(system_name, system_run) for system_name, system_run in tqdm(runs.items())]
)
df_eval


  0%|          | 0/4 [00:00<?, ?it/s][A
 25%|██▌       | 1/4 [00:00<00:00,  6.12it/s][A
100%|██████████| 4/4 [00:00<00:00, 12.34it/s][A

  0%|          | 0/4 [00:00<?, ?it/s][A
 25%|██▌       | 1/4 [00:00<00:00,  3.04it/s][A
 50%|█████     | 2/4 [00:00<00:00,  3.88it/s][A
 75%|███████▌  | 3/4 [00:00<00:00,  4.51it/s][A
100%|██████████| 4/4 [00:00<00:00,  4.04it/s][A


Unnamed: 0,system,unjudged@10 (incomplete),ndcg@10 (incomplete),"ndcg@10 (incomplete, removeUnjudged)",unjudged@10 (complete),ndcg@10 (complete)
0,ance-09-01-2023,0.224,0.652447,0.772475,0.002,0.734673
1,BBGhelani2,0.0,0.678996,0.678996,0.0,0.678996
2,colbert-ranking-26-12-2022,0.172,0.679529,0.769887,0.016,0.733639
3,tas-b-09-01-2023,0.41,0.481254,0.704533,0.0,0.555436


In [14]:
# Rank the systems by nDCG@10 on the incomplete qrels, best first. The previous
# row labels are kept as an `index` column by `reset_index()`.
(
    df_eval
    .sort_values(by='ndcg@10 (incomplete)', ascending=False)
    .reset_index()
)

Unnamed: 0,index,system,unjudged@10 (incomplete),ndcg@10 (incomplete),"ndcg@10 (incomplete, removeUnjudged)",unjudged@10 (complete),ndcg@10 (complete)
0,2,colbert-ranking-26-12-2022,0.172,0.679529,0.769887,0.016,0.733639
1,1,BBGhelani2,0.0,0.678996,0.678996,0.0,0.678996
2,0,ance-09-01-2023,0.224,0.652447,0.772475,0.002,0.734673
3,3,tas-b-09-01-2023,0.41,0.481254,0.704533,0.0,0.555436


In [15]:
# Results are ordered measure-major: entries 0-2 hold the first measure for the
# three systems, entries 3-5 hold the second measure for the same systems.
measures = ['residual-ndcg@10', 'bs-run-and-pool-dependent-1000-ndcg@10']
systems = ['colbert-ranking-26-12-2022', 'ance-09-01-2023', 'tas-b-09-01-2023']

ret = [
    evaluate_on_original_pool_only(
        run_file=f'beir-evaluation-data/runs/{system}-run.txt',
        qrel_file='beir-evaluation-data/incomplete-beir-trec-covid.txt',
        measure=measure,
    )
    for measure in measures
    for system in systems
]


Bootstrapping:   0%|          | 0/50 [00:00<?, ?it/s][A
Bootstrapping:   2%|▏         | 1/50 [00:09<07:50,  9.60s/it][A
Bootstrapping:   4%|▍         | 2/50 [00:18<07:17,  9.12s/it][A
Bootstrapping:   6%|▌         | 3/50 [00:27<07:15,  9.26s/it][A
Bootstrapping:   8%|▊         | 4/50 [00:36<06:59,  9.12s/it][A
Bootstrapping:  10%|█         | 5/50 [00:47<07:12,  9.61s/it][A
Bootstrapping:  12%|█▏        | 6/50 [00:56<06:55,  9.44s/it][A
Bootstrapping:  14%|█▍        | 7/50 [01:04<06:24,  8.95s/it][A
Bootstrapping:  16%|█▌        | 8/50 [01:12<06:01,  8.60s/it][A
Bootstrapping:  18%|█▊        | 9/50 [01:19<05:41,  8.32s/it][A
Bootstrapping:  20%|██        | 10/50 [01:27<05:25,  8.15s/it][A
Bootstrapping:  22%|██▏       | 11/50 [01:35<05:11,  7.99s/it][A
Bootstrapping:  24%|██▍       | 12/50 [01:42<05:00,  7.90s/it][A
Bootstrapping:  26%|██▌       | 13/50 [01:51<04:59,  8.11s/it][A
Bootstrapping:  28%|██▊       | 14/50 [02:00<04:59,  8.33s/it][A
Bootstrapping:  30%|███    

In [16]:
import json
# Persist the 'complete-pool-depth-all' part of every result. The file is
# written as one JSON array (despite the .jsonl extension), and the `with`
# block guarantees it is flushed and closed once the dump has finished.
with open('beir-evaluation-data/beir-bootstrapping-results.jsonl', 'w') as results_file:
    json.dump([{'depth-10-incomplete': i['complete-pool-depth-all']} for i in ret], results_file)

In [1]:
import json
# Reload the stored results; the `with` block closes the file handle again
# instead of leaving it open.
with open('beir-evaluation-data/beir-bootstrapping-results.jsonl', 'r') as results_file:
    tmp = json.load(results_file)

In [10]:
def predict_bs_most_likely(i):
    """Print the mean "most likely data point" prediction over all entries of a result.

    ``i`` is one stored result; its ``'depth-10-incomplete'`` list is read.
    For every entry the ``'ndcg@10'`` value is passed (as a one-element list)
    to the model, which must answer with exactly one prediction. The mean of
    those predictions is printed together with the run file of the first
    entry. Nothing is returned.
    """
    entries = i['depth-10-incomplete']
    model = BootstrappingBySelectingMostLikelyDataPoint('bs-run-and-pool-dependent-1000-ndcg@10')
    predictions = []

    for entry in entries:
        predicted = model.predict([entry['ndcg@10']])
        assert len(predicted) == 1

        predictions.append(predicted[0])

    print('run-and-pool-based-bs-most-likely: ', entries[0]['run_file'], ':', mean(predictions))

def predict_bs_95_percent_quantile(i):
    """Print the mean fixed-quantile prediction over all entries of a result.

    ``i`` is one stored result; its ``'depth-10-incomplete'`` list is read.
    The run file of the first entry is printed up front. For every entry the
    ``'ndcg@10'`` value is passed (as a one-element list) to the model, which
    must answer with exactly one prediction. The mean of those predictions is
    printed together with the run file. Nothing is returned.
    """
    entries = i['depth-10-incomplete']
    # NOTE(review): 95 presumably selects the 95% quantile and 'x' looks like a
    # placeholder name - confirm against FixedQuantileBootstrappingModel.
    model = FixedQuantileBootstrappingModel('x', 95)
    predictions = []
    print(entries[0]['run_file'])

    for entry in entries:
        predicted = model.predict([entry['ndcg@10']])
        assert len(predicted) == 1

        predictions.append(predicted[0])

    print('run-and-pool-based-bs-95-percent: ', entries[0]['run_file'], ':', mean(predictions))

def predict_max_residual(i):
    """Print the mean ``'MAX-NDCG@10'`` value over all entries of a result.

    ``i`` is one stored result; its ``'depth-10-incomplete'`` list is read.
    The mean is printed together with the run file of the first entry.
    Nothing is returned.

    Raises:
        IndexError / statistics.StatisticsError: if the entry list is empty.
    """
    i = i['depth-10-incomplete']
    ret = [l['MAX-NDCG@10'] for l in i]

    print('Max-residual', (i[0]['run_file']), ':', mean(ret))


In [12]:
predict_bs_most_likely(tmp[3])
predict_bs_95_percent_quantile(tmp[3])
predict_max_residual(tmp[0])


run-and-pool-based-bs-most-likely:  beir-evaluation-data/runs/colbert-ranking-26-12-2022-run.txt : 0.7409517338949408
beir-evaluation-data/runs/colbert-ranking-26-12-2022-run.txt
run-and-pool-based-bs-95-percent:  beir-evaluation-data/runs/colbert-ranking-26-12-2022-run.txt : 0.7885473343788988
Max-residual beir-evaluation-data/runs/colbert-ranking-26-12-2022-run.txt : 0.8258838045277852


In [13]:
predict_bs_most_likely(tmp[4])
predict_bs_95_percent_quantile(tmp[4])
predict_max_residual(tmp[1])

run-and-pool-based-bs-most-likely:  beir-evaluation-data/runs/ance-09-01-2023-run.txt : 0.7474231965765935
beir-evaluation-data/runs/ance-09-01-2023-run.txt
run-and-pool-based-bs-95-percent:  beir-evaluation-data/runs/ance-09-01-2023-run.txt : 0.8041515476389878
Max-residual beir-evaluation-data/runs/ance-09-01-2023-run.txt : 0.8527441906345933


In [14]:
predict_bs_most_likely(tmp[5])
predict_bs_95_percent_quantile(tmp[5])
predict_max_residual(tmp[2])

run-and-pool-based-bs-most-likely:  beir-evaluation-data/runs/tas-b-09-01-2023-run.txt : 0.6333194443585958
beir-evaluation-data/runs/tas-b-09-01-2023-run.txt
run-and-pool-based-bs-95-percent:  beir-evaluation-data/runs/tas-b-09-01-2023-run.txt : 0.7285839535510149
Max-residual beir-evaluation-data/runs/tas-b-09-01-2023-run.txt : 0.8705287615794982
