# Evaluation on BEIR

In [1]:
import pandas as pd
import sys
sys.path.append('../python')
from evaluation_util import evaluate_on_original_pool_only
from trectools import TrecQrel, TrecRun, TrecEval
from tqdm import tqdm
from cross_validation_util import cross_validation_experiment
from statistics import mean
from parametrized_bootstrapping_model import BootstrappingBySelectingMostLikelyDataPoint, FixedQuantileBootstrappingModel

def qrels_dict(f):
    """Load a tab-separated qrels file into a ``{(query_id, doc_id): score}`` dict.

    The file is read with ``pd.read_csv`` using a tab separator and must
    provide the columns ``query_id``, ``doc_id`` and ``score``.

    Args:
        f: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        dict mapping ``(int(query_id), doc_id)`` to ``int(score)``. When a
        pair occurs more than once, a ``Duplicate ...`` line is printed and
        the last occurrence wins.
    """
    df = pd.read_csv(f, sep='\t')
    ret = {}

    # Iterate column-wise instead of using iterrows(): iterrows() builds one
    # Series per row (slow) and may upcast values to a common dtype.
    # Passing the length lets tqdm render an actual progress bar.
    rows = zip(df['query_id'], df['doc_id'], df['score'])
    for query_id, doc_id, score in tqdm(rows, total=len(df)):
        internal_id = (int(query_id), doc_id)
        if internal_id in ret:
            print(f'Duplicate {internal_id}')
            #raise ValueError('Can not happen')

        ret[internal_id] = int(score)
    return ret

def load_qrels(with_additional=False):
    """Build a ``TrecQrel`` for TREC-COVID from the BEIR qrels and the additional judgments.

    Judgments from the BEIR test qrels are kept unless the same
    (query, document) pair also occurs in the additional-judgments file.
    With ``with_additional=True`` the additional judgments are appended
    afterwards, so for overlapping pairs only the additional score is present.

    Args:
        with_additional: Whether to append the additional judgments.

    Returns:
        A ``TrecQrel`` whose ``qrels_data`` is a DataFrame with the columns
        ``query``, ``q0``, ``docid`` and ``rel``.
    """
    orig_qrels_dict = qrels_dict('../resources/beir/trec-covid-beir/qrels/test.tsv')
    add_qrels_dict = qrels_dict('../resources/unprocessed/topics-and-qrels/qrels.trec-covid-additional-judgments.tsv')

    # Original judgments; pairs that also occur in the additional judgments are skipped.
    rows = [
        {"query": str(qid), "q0": "0", "docid": doc_id, "rel": score}
        for (qid, doc_id), score in orig_qrels_dict.items()
        if (qid, doc_id) not in add_qrels_dict
    ]

    if with_additional:
        # Use the same "0" marker as for the original judgments. This branch
        # used to emit "Q0", which gave the combined qrels two different markers.
        rows.extend(
            {"query": str(qid), "q0": "0", "docid": doc_id, "rel": score}
            for (qid, doc_id), score in add_qrels_dict.items()
        )

    # Explicit columns keep the frame well-formed even if no judgment is left.
    df_ret = pd.DataFrame(rows, columns=["query", "q0", "docid", "rel"])

    ret = TrecQrel()
    ret.qrels_data = df_ret

    return ret

# The commented-out statements below build both qrels variants with load_qrels
# and write them to the two files that are read further down
# (presumably this is how those files were created -- confirm before re-running).
#qrels_complete = load_qrels(True)
#qrels_with_unjudged = load_qrels()

#print(len(qrels_complete.qrels_data))
#print(len(qrels_with_unjudged.qrels_data))

# qrels_complete.qrels_data.to_csv(f'../resources/unprocessed/topics-and-qrels/qrels.trec-covid-complete.txt', sep=' ', header=False, index=False)

# qrels_with_unjudged.qrels_data.to_csv(f'../resources/unprocessed/topics-and-qrels/qrels.trec-covid-incomplete.txt', sep=' ', header=False, index=False)

# Load the two qrels files: "complete" and "incomplete".
qrels_complete = TrecQrel('../resources/unprocessed/topics-and-qrels/qrels.trec-covid-complete.txt')
qrels_with_unjudged = TrecQrel('../resources/unprocessed/topics-and-qrels/qrels.trec-covid-incomplete.txt')

# Number of judgments per variant; the incomplete count is printed first.
print(len(qrels_with_unjudged.qrels_data))
print(len(qrels_complete.qrels_data))

66334
67314


### Load Runs

In [2]:
# `!ls` is an IPython shell capture, so this cell only works inside IPython/Jupyter.
# It returns the file names in the run directory as a list of strings.
runs = !ls ../resources/processed/normalized-runs/trec-system-runs/trec-covid/
# Keep only the part before the '-run.txt' suffix, i.e. the system name.
runs = [i.split('-run.txt')[0] for i in runs]
# Rebind `runs` to a dict: system name -> TrecRun loaded from '<system>-run.txt'.
runs = {i: TrecRun(f'../resources/processed/normalized-runs/trec-system-runs/trec-covid/{i}-run.txt') for i in tqdm(runs)}

def eval_run(run_name, run):
    """Evaluate one run at depth 10 on the incomplete and the complete qrels.

    Args:
        run_name: System name, reported under the ``system`` key.
        run: The run object handed to ``TrecEval`` (a ``TrecRun`` in this notebook).

    Returns:
        dict with the system name, the unjudged@10 and nDCG@10 values on
        ``qrels_with_unjudged`` (nDCG once as-is and once with
        ``removeUnjudged=True``) and nDCG@10 on ``qrels_complete``.
    """
    # Build the evaluator for the incomplete qrels once and reuse it for all
    # three measures instead of constructing it again per measure.
    eval_incomplete = TrecEval(run, qrels_with_unjudged)

    return {
        'system': run_name,
        # The misspelled key ("unjudgeed") is kept on purpose: a later cell
        # sorts df_eval by exactly this column name.
        'unjudgeed@10 (incomplete)': eval_incomplete.get_unjudged(depth=10),
        'ndcg@10 (incomplete)': eval_incomplete.get_ndcg(depth=10),
        'ndcg@10 (incomplete, removeUnjudged)': eval_incomplete.get_ndcg(depth=10, removeUnjudged=True),
        'ndcg@10 (complete)': TrecEval(run, qrels_complete).get_ndcg(depth=10),
    }
    
# Evaluate every loaded run; each eval_run call contributes one row (a dict).
df_eval = []
for run_name, run in tqdm(runs.items()):
    df_eval += [eval_run(run_name, run)]

# Rebind df_eval from the list of row dicts to a DataFrame and display it.
df_eval = pd.DataFrame(df_eval)
df_eval

100%|██████████| 34/34 [00:01<00:00, 26.74it/s]
100%|██████████| 34/34 [00:17<00:00,  1.91it/s]


Unnamed: 0,system,unjudgeed@10 (incomplete),ndcg@10 (incomplete),"ndcg@10 (incomplete, removeUnjudged)",ndcg@10 (complete)
0,ance,0.224,0.652447,0.772475,0.734673
1,BBGhelani1,0.0,0.678996,0.678996,0.678996
2,BBGhelani2,0.0,0.678996,0.678996,0.678996
3,BioinfoUA-emb-q,0.0,0.462326,0.462326,0.462326
4,BioinfoUA-emb,0.0,0.524367,0.524367,0.524367
5,BioinfoUA-noadapt,0.0,0.523362,0.523362,0.523362
6,BITEM_BL,0.0,0.362629,0.362629,0.362629
7,BITEM_df,0.0,0.357621,0.357621,0.357621
8,BITEM_stem,0.0,0.350991,0.350991,0.350991
9,bm25,0.018,0.342661,0.345743,0.345438


In [3]:
# Order systems by their unjudged@10 value on the incomplete qrels, highest first.
# The column name carries the same spelling ("unjudgeed") as the key produced by eval_run.
df_eval.sort_values('unjudgeed@10 (incomplete)', ascending=False)

Unnamed: 0,system,unjudgeed@10 (incomplete),ndcg@10 (incomplete),"ndcg@10 (incomplete, removeUnjudged)",ndcg@10 (complete)
32,tas-b,0.41,0.481254,0.704533,0.555436
0,ance,0.224,0.652447,0.772475,0.734673
31,sentence-bert,0.222,0.585106,0.708086,0.658507
23,dpr,0.208,0.482703,0.597574,0.548894
16,colbert-pyterrier,0.09,0.707877,0.755696,0.741051
28,pl2,0.022,0.358082,0.363658,0.361608
9,bm25,0.018,0.342661,0.345743,0.345438
33,tf-idf,0.016,0.367447,0.369802,0.371849
30,run2,0.002,0.637752,0.638373,0.637752
8,BITEM_stem,0.0,0.350991,0.350991,0.350991


In [4]:
# Order systems by nDCG@10 on the incomplete qrels, best first.
# reset_index() keeps the previous row label as an 'index' column.
df_eval.sort_values('ndcg@10 (incomplete)', ascending=False).reset_index()

Unnamed: 0,index,system,unjudgeed@10 (incomplete),ndcg@10 (incomplete),"ndcg@10 (incomplete, removeUnjudged)",ndcg@10 (complete)
0,16,colbert-pyterrier,0.09,0.707877,0.755696,0.741051
1,2,BBGhelani2,0.0,0.678996,0.678996,0.678996
2,1,BBGhelani1,0.0,0.678996,0.678996,0.678996
3,29,run1,0.0,0.677685,0.677685,0.677685
4,0,ance,0.224,0.652447,0.772475,0.734673
5,17,CSIROmedNIR,0.0,0.650633,0.650633,0.650633
6,30,run2,0.002,0.637752,0.638373,0.637752
7,19,CSIROmed_RF,0.0,0.606938,0.606938,0.606938
8,31,sentence-bert,0.222,0.585106,0.708086,0.658507
9,4,BioinfoUA-emb,0.0,0.524367,0.524367,0.524367


In [86]:
# Order systems by nDCG@10 on the complete qrels, best first.
# NOTE(review): the stored output of this cell lists an 'unjudgeed@10 (complete)'
# column that eval_run does not produce, so it appears to come from an earlier
# version of the notebook -- re-run the cell to refresh it.
df_eval.sort_values('ndcg@10 (complete)', ascending=False).reset_index()

Unnamed: 0,index,system,ndcg@10 (incomplete),unjudgeed@10 (incomplete),ndcg@10 (complete),unjudgeed@10 (complete)
0,16,colbert-pyterrier,0.707877,0.09,0.741051,0.03
1,0,ance,0.652447,0.224,0.734673,0.0
2,2,BBGhelani2,0.678996,0.0,0.678996,0.0
3,1,BBGhelani1,0.678996,0.0,0.678996,0.0
4,29,run1,0.677685,0.0,0.677685,0.0
5,31,sentence-bert,0.585106,0.222,0.658507,0.0
6,17,CSIROmedNIR,0.650633,0.0,0.650633,0.0
7,30,run2,0.637752,0.002,0.637752,0.0
8,19,CSIROmed_RF,0.606938,0.0,0.606938,0.0
9,23,dpr,0.482703,0.208,0.548894,0.0


# Bootstrapping and Other Evaluations

In [22]:
# One result per (measure, system) pair, appended measure by measure:
# first all systems for 'residual-ndcg@10', then all systems for the bootstrapping measure.
ret = []

for measure in ['residual-ndcg@10', 'bs-run-and-pool-dependent-1000-ndcg@10']:
    for system in ['colbert-pyterrier', 'ance', 'sentence-bert', 'dpr', 'tas-b']:
        # Every run is evaluated against the incomplete qrels file.
        ret += [evaluate_on_original_pool_only(
            run_file=f'../resources/processed/normalized-runs/trec-system-runs/trec-covid/{system}-run.txt', 
            qrel_file='../resources/unprocessed/topics-and-qrels/qrels.trec-covid-incomplete.txt', 
            measure=measure
        )]

Bootstrapping: 100%|██████████| 50/50 [07:57<00:00,  9.55s/it]


In [28]:
import json

# Persist the results so that they can be reloaded without re-running the bootstrapping.
# NOTE: despite the '.jsonl' suffix the file holds one single JSON array.
# The context manager closes (and thereby flushes) the file deterministically.
with open('beir-bootstrapping-results.jsonl', 'w') as results_file:
    json.dump([{'depth-10-incomplete': i['complete-pool-depth-all']} for i in ret], results_file)

In [34]:
mkdir ../resources/eval/trec-system-runs/trec-covid

In [50]:
from statistics import mean

In [60]:
def predict_bla(i):
    """Return the mean model prediction over all entries of one result record.

    A ``BootstrappingBySelectingMostLikelyDataPoint`` model for
    'bs-run-and-pool-dependent-1000-ndcg@10' is asked for one prediction per
    entry; the run file of the first entry is printed as a label.

    Args:
        i: dict whose 'depth-10-incomplete' value is a non-empty sequence of
            dicts, each with an 'ndcg@10' value (the first one also needs 'run_file').

    Returns:
        The arithmetic mean of the per-entry predictions.
    """
    entries = i['depth-10-incomplete']
    model = BootstrappingBySelectingMostLikelyDataPoint('bs-run-and-pool-dependent-1000-ndcg@10')
    print(entries[0]['run_file'])

    predictions = []
    for entry in entries:
        # The model is queried with a single-element batch per entry.
        predicted = model.predict([entry['ndcg@10']])
        assert len(predicted) == 1
        predictions.append(predicted[0])

    return mean(predictions)
        

# NOTE(review): `tmp` is only assigned in a later cell (json.load of
# 'beir-bootstrapping-results.jsonl'), so this cell fails on a fresh top-to-bottom run.
# In the stored output, entry 4 is the colbert-pyterrier run.
predict_bla(tmp[4])

../resources/processed/normalized-runs/trec-system-runs/trec-covid/colbert-pyterrier-run.txt


0.7447383603904687

In [61]:
# In the stored output, entry 5 is the ance run.
predict_bla(tmp[5])

../resources/processed/normalized-runs/trec-system-runs/trec-covid/ance-run.txt


0.7465167706156675

In [62]:
# In the stored output, entry 6 is the sentence-bert run.
predict_bla(tmp[6])

../resources/processed/normalized-runs/trec-system-runs/trec-covid/sentence-bert-run.txt


0.6968039239522638

In [63]:
# In the stored output, entry 7 is the dpr run.
predict_bla(tmp[7])

../resources/processed/normalized-runs/trec-system-runs/trec-covid/dpr-run.txt


0.5571300711262028

In [65]:
# In the stored output, entry 9 is the tas-b run (entry 8 is skipped in this series).
predict_bla(tmp[9])

../resources/processed/normalized-runs/trec-system-runs/trec-covid/tas-b-run.txt


0.6289705901274139

In [2]:
import json

# Reload the results stored in 'beir-bootstrapping-results.jsonl'.
# The context manager closes the file as soon as parsing is done.
with open('beir-bootstrapping-results.jsonl', 'r') as results_file:
    tmp = json.load(results_file)

In [8]:
def predict_bla(i):
    """Return the mean ``FixedQuantileBootstrappingModel('x', 95)`` prediction for one record.

    The model is asked for one prediction per entry of
    ``i['depth-10-incomplete']``; the run file of the first entry is printed
    as a label. (The meaning of the arguments 'x' and 95 is defined by the
    model class -- presumably 95 selects a quantile; confirm there.)

    Args:
        i: dict whose 'depth-10-incomplete' value is a non-empty sequence of
            dicts, each with an 'ndcg@10' value (the first one also needs 'run_file').

    Returns:
        The arithmetic mean of the per-entry predictions.
    """
    entries = i['depth-10-incomplete']
    model = FixedQuantileBootstrappingModel('x', 95)
    print(entries[0]['run_file'])

    predictions = []
    for entry in entries:
        # The model is queried with a single-element batch per entry.
        predicted = model.predict([entry['ndcg@10']])
        assert len(predicted) == 1
        predictions.append(predicted[0])

    return mean(predictions)
        

# In the stored output, entry 4 is the colbert-pyterrier run.
predict_bla(tmp[4])

../resources/processed/normalized-runs/trec-system-runs/trec-covid/colbert-pyterrier-run.txt


0.7711735686320375

In [9]:
# In the stored output, entry 5 is the ance run.
predict_bla(tmp[5])

../resources/processed/normalized-runs/trec-system-runs/trec-covid/ance-run.txt


0.8036730585198735

In [10]:
# In the stored output, entry 6 is the sentence-bert run.
predict_bla(tmp[6])

../resources/processed/normalized-runs/trec-system-runs/trec-covid/sentence-bert-run.txt


0.7682683369435955

In [11]:
# In the stored output, entry 7 is the dpr run.
predict_bla(tmp[7])

../resources/processed/normalized-runs/trec-system-runs/trec-covid/dpr-run.txt


0.65940712072831

In [12]:
# In the stored output, entry 9 is the tas-b run (entry 8 is skipped in this series).
predict_bla(tmp[9])

../resources/processed/normalized-runs/trec-system-runs/trec-covid/tas-b-run.txt


0.7262074608719449

In [15]:
def predict_bla(i):
    """Return the mean 'MAX-NDCG@10' value over all entries of one result record.

    The run file of the first entry is printed as a label.

    Args:
        i: dict whose 'depth-10-incomplete' value is a non-empty sequence of
            dicts, each with a 'MAX-NDCG@10' value (the first one also needs 'run_file').

    Returns:
        The arithmetic mean of the 'MAX-NDCG@10' values.
    """
    entries = i['depth-10-incomplete']
    print(entries[0]['run_file'])

    max_scores = [entry['MAX-NDCG@10'] for entry in entries]
    return mean(max_scores)
        

# In the stored output, entry 0 is the colbert-pyterrier run.
predict_bla(tmp[0])

../resources/processed/normalized-runs/trec-system-runs/trec-covid/colbert-pyterrier-run.txt


0.7818903844140824

In [16]:
# In the stored output, entry 1 is the ance run.
predict_bla(tmp[1])

../resources/processed/normalized-runs/trec-system-runs/trec-covid/ance-run.txt


0.8527441906345933

In [17]:
# In the stored output, entry 2 is the sentence-bert run.
predict_bla(tmp[2])

../resources/processed/normalized-runs/trec-system-runs/trec-covid/sentence-bert-run.txt


0.8068369972447191

In [18]:
# In the stored output, entry 3 is the dpr run.
predict_bla(tmp[3])

../resources/processed/normalized-runs/trec-system-runs/trec-covid/dpr-run.txt


0.7050871673235765

In [21]:
# In the stored output, entry 8 is the tas-b run.
predict_bla(tmp[8])

../resources/processed/normalized-runs/trec-system-runs/trec-covid/tas-b-run.txt


0.8705287615794982