# Significance Tests with PyTerrier

In [2]:
import pyterrier as pt
import pandas as pd

# Directory with the run files that the experiment cells below read for the
# MS MARCO v1 topics. NOTE(review): the value ends with '/', and the cells append
# '/entrypage-random/' or '/entrypage-popular/', so the resulting paths contain '//'.
RUN_DIR='/mnt/ceph/storage/data-in-progress/data-teaching/theses/wstud-thesis-probst/retrievalExperiments/runs-ecir22/'
# Presumably the run directory for the MS MARCO v2 topics; it is not referenced
# by any of the cells shown here -- TODO confirm where it is used.
RUN_DIR_MARCO_V2='/mnt/ceph/storage/data-in-progress/data-teaching/theses/wstud-thesis-probst/retrievalExperiments/runs-marco-v2-ecir22/'
# Common path prefix of the qrel files; pt_qrels/pt_topics append a suffix such as
# 'v1/qrels.msmarco-entrypage-popular.txt' to it.
QREL_DIR = '/mnt/ceph/storage/data-tmp/2021/kibi9872/thesis-probst/Data/navigational-topics-and-qrels-ms-marco-'

# Initialise PyTerrier only if it has not been started yet, so that re-running
# this cell in the same kernel does not call pt.init() a second time.
if not pt.started():
    pt.init()

def pt_qrels(ret):
    """Load a qrel file into the column layout used by the experiments below.

    ``ret`` is the path suffix appended to ``QREL_DIR``. The file is parsed with
    trectools' ``TrecQrel``; the ``q0`` column is dropped and the remaining
    columns are renamed to ``qid`` (cast to ``str``), ``docno`` and ``label``
    (cast to ``int``). A new DataFrame is returned.
    """
    from trectools import TrecQrel

    column_names = {'query': 'qid', 'docid': 'docno', 'rel': 'label'}
    judgments = TrecQrel(QREL_DIR + ret).qrels_data

    # drop/rename/astype each return a new frame, so the frame held by the
    # TrecQrel object is left untouched.
    return (
        judgments
        .drop(columns=['q0'])
        .rename(columns=column_names)
        .astype({'qid': str, 'label': int})
    )

def pt_topics(ret):
    """Build a topics DataFrame from the query ids found in a qrel file.

    ``ret`` is the path suffix appended to ``QREL_DIR``. One row is produced per
    unique value of the qrel file's ``query`` column, with the id as ``qid``
    (string) and a placeholder text as ``query``.
    """
    from trectools import TrecQrel

    judged_qids = TrecQrel(QREL_DIR + ret).qrels_data['query'].unique()
    placeholder = 'Unused, only for significance tests for qid: '

    rows = [
        {'qid': str(qid), 'query': placeholder + str(qid)}
        for qid in judged_qids
    ]
    return pd.DataFrame(rows)

def trec_run(run_name):
    """Wrap the run file at path ``run_name`` as a PyTerrier transformer.

    The file is read with ``pt.io.read_results`` and the result is passed to
    ``get_transformer``; the returned object is what the cells below hand to
    ``pt.Experiment`` as a retrieval system.
    """
    from pyterrier.transformer import get_transformer

    ranking = pt.io.read_results(run_name)
    return get_transformer(ranking)


# Relevance judgments per topic set, keyed by '<corpus version>-<topic type>'.
QRELS = {
    topic_set: pt_qrels(qrel_file)
    for topic_set, qrel_file in (
        ('v1-popular', 'v1/qrels.msmarco-entrypage-popular.txt'),
        ('v1-random', 'v1/qrels.msmarco-entrypage-random.txt'),
        ('v2-popular', 'v2/qrels.msmarco-v2-entrypage-popular.txt'),
        ('v2-random', 'v2/qrels.msmarco-v2-entrypage-random.txt'),
    )
}

# Topic frames per topic set, derived from the same qrel files as QRELS and
# keyed identically.
TOPICS = {
    topic_set: pt_topics(qrel_file)
    for topic_set, qrel_file in (
        ('v1-popular', 'v1/qrels.msmarco-entrypage-popular.txt'),
        ('v1-random', 'v1/qrels.msmarco-entrypage-random.txt'),
        ('v2-popular', 'v2/qrels.msmarco-v2-entrypage-popular.txt'),
        ('v2-random', 'v2/qrels.msmarco-v2-entrypage-random.txt'),
    )
}

In [3]:
# Maps the approach label shown in the result tables to the file name of its run.
# The experiment cells below resolve these names under
# RUN_DIR + '/entrypage-random/' or RUN_DIR + '/entrypage-popular/'.
APPROACH_TO_MARCO_V1_RUN_FILE={
    # BM25 on anchor text; the 'cc-YY-NN' part presumably names the crawl
    # snapshot the anchor text was taken from -- TODO confirm.
    'BM25@2016-07': 'run.cc-16-07-anchortext.bm25-default.txt',
    'BM25@2017-04': 'run.cc-17-04-anchortext.bm25-default.txt',
    'BM25@2018-13': 'run.cc-18-13-anchortext.bm25-default.txt',
    'BM25@2019-47': 'run.cc-19-47-anchortext.bm25-default.txt',
    'BM25@2020-05': 'run.cc-20-05-anchortext.bm25-default.txt',
    'BM25@2021-04': 'run.cc-21-04-anchortext.bm25-default.txt',
    'BM25@16--21': 'run.cc-combined-anchortext.bm25-default.txt',
    # BM25 on document content, title only, and ORCAS.
    'BM25@Content': 'run.ms-marco-content.bm25-default.txt',
    'BM25@Title': 'run.msmarco-document-v1-title-only.pos+docvectors+raw.bm25-default.txt',
    'BM25@Orcas': 'run.orcas.bm25-default.txt',
    # DeepCT variants, distinguished by the data named in the file name.
    'DeepCT@Anchor': 'run.ms-marco-deepct-v1-anserini-docs-cc-2019-47-sampled-test-overlap-removed-389979.bm25-default.txt',
    'DeepCT@Orcas': 'run.ms-marco-deepct-v1-anserini-docs-orcas-sampled-test-overlap-removed-390009.bm25-default.txt',
    'DeepCT@Train':'run.ms-marco-deepct-v1-anserini-docs-ms-marco-training-set-test-overlap-removed-389973.bm25-default.txt',
    # Re-rankers applied on top of the BM25 content run (per the file names).
    'MonoT5': 'run.ms-marco-content.bm25-mono-t5-maxp.txt',
    'MonoBERT': 'run.ms-marco-content.bm25-mono-bert-maxp.txt',
    # LambdaMART runs; the suffix after '@' matches the 'cta'/'ctoa'/'cto'/'ct'
    # token in the file name.
    'LambdaMART@CTA':'run.ms-marco.lambda-mart-cta-trees-1000.txt',
    'LambdaMART@CTOA':'run.ms-marco.lambda-mart-ctoa-trees-1000.txt',
    'LambdaMART@CTO':'run.ms-marco.lambda-mart-cto-trees-1000.txt',
    'LambdaMART@CT':'run.ms-marco.lambda-mart-ct-trees-1000.txt',
}

# Maps approach labels to run file names for the MS MARCO v2 experiments.
# This mapping is not referenced by any of the cells shown here.
# NOTE(review): several entries (most anchor-text runs, MonoT5, MonoBERT and all
# LambdaMART runs) use exactly the same file names as the v1 mapping; presumably
# they are resolved under RUN_DIR_MARCO_V2 instead of RUN_DIR -- TODO confirm.
APPROACH_TO_MARCO_V2_RUN_FILE={
    'BM25@Content': 'run.msmarco-doc-v2.bm25-default.txt',
    'BM25@Orcas': 'run.orcas-ms-marco-v2.bm25-default.txt',
    'BM25@2016-07': 'run.cc-16-07-anchortext.bm25-default.txt',
    'BM25@2017-04': 'run.cc-17-04-anchortext.bm25-default.txt',
    'BM25@2018-13': 'run.cc-18-13-anchortext.bm25-default.txt',
    # NOTE(review): the only anchor-text entry carrying a '-v2' suffix -- verify
    # against the directory contents.
    'BM25@2019-47': 'run.cc-19-47-anchortext-v2.bm25-default.txt',
    'BM25@2020-05': 'run.cc-20-05-anchortext.bm25-default.txt',
    'BM25@2021-04': 'run.cc-21-04-anchortext.bm25-default.txt',
    'BM25@16--21': 'run.cc-union-16-to-21-anchortext-1000.bm25-default.txt',
    'DeepCT@Anchor': 'run.ms-marco-deepct-v2-anserini-docs-cc-2019-47-sampled-test-overlap-removed-389979.bm25-default.txt',
    'DeepCT@Orcas': 'run.ms-marco-deepct-v2-anserini-docs-orcas-sampled-test-overlap-removed-390009.bm25-default.txt',
    'DeepCT@Train':'run.ms-marco-deepct-v2-anserini-docs-ms-marco-training-set-test-overlap-removed-389973.bm25-default.txt',
    'MonoT5': 'run.ms-marco-content.bm25-mono-t5-maxp.txt',
    'MonoBERT': 'run.ms-marco-content.bm25-mono-bert-maxp.txt',
    'LambdaMART@CTA':'run.ms-marco.lambda-mart-cta-trees-1000.txt',
    'LambdaMART@CTOA':'run.ms-marco.lambda-mart-ctoa-trees-1000.txt',
    'LambdaMART@CTO':'run.ms-marco.lambda-mart-cto-trees-1000.txt',
    'LambdaMART@CT':'run.ms-marco.lambda-mart-ct-trees-1000.txt',
}

### Comparison of MRR for Anchor Text approaches to DeepCT

In [4]:
# The first entry (DeepCT@Anchor) is the baseline that every other run is
# tested against (baseline=0 below).
approaches = [
    'DeepCT@Anchor',
    'BM25@2016-07',
    'BM25@2017-04',
    'BM25@2018-13',
    'BM25@2019-47',
    'BM25@2020-05',
    'BM25@2021-04',
    'BM25@16--21',
]
runs = [
    (approach, trec_run(RUN_DIR + '/entrypage-random/' + APPROACH_TO_MARCO_V1_RUN_FILE[approach]))
    for approach in approaches
]
systems = [system for _, system in runs]
labels = [label for label, _ in runs]

# Reciprocal rank on the random entry-page topics, with test='t' against the
# baseline and correction='b' applied to the p-values.
pt.Experiment(
    systems,
    TOPICS['v1-random'],
    QRELS['v1-random'],
    ['recip_rank'],
    labels,
    baseline=0,
    test='t',
    correction='b',
)

Unnamed: 0,name,recip_rank,recip_rank +,recip_rank -,recip_rank p-value,recip_rank reject,recip_rank p-value corrected
0,DeepCT@Anchor,0.428108,,,,False,
1,BM25@2016-07,0.585702,46.0,23.0,0.004533772,True,0.03627018
2,BM25@2017-04,0.617538,48.0,18.0,0.00112328,True,0.008986241
3,BM25@2018-13,0.682118,50.0,17.0,8.122389e-06,True,6.497911e-05
4,BM25@2019-47,0.614973,51.0,20.0,0.0009633959,True,0.007707167
5,BM25@2020-05,0.615233,50.0,20.0,0.0007568068,True,0.006054455
6,BM25@2021-04,0.623853,50.0,20.0,0.0004120504,True,0.003296403
7,BM25@16--21,0.742475,56.0,10.0,2.399932e-09,True,1.919945e-08


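Result: All BM25 models retrieving on anchor text improve statistically significantly upon the MRR of 0.43 achieved by DeepCT trained on anchor text.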

### Comparison of MRR for BM25 on Content with DeepCT trained on anchor text, DeepCT, MonoT5, MonoBERT, and LambdaMART

In [5]:
# The first entry (BM25@Content) is the baseline that every other run is
# tested against (baseline=0 below).
approaches = [
    'BM25@Content',
    'DeepCT@Orcas',
    'DeepCT@Anchor',
    'DeepCT@Train',
    'MonoT5',
    'MonoBERT',
    'LambdaMART@CTA',
    'LambdaMART@CTOA',
    'LambdaMART@CTO',
    'LambdaMART@CT',
]
runs = [
    (approach, trec_run(RUN_DIR + '/entrypage-random/' + APPROACH_TO_MARCO_V1_RUN_FILE[approach]))
    for approach in approaches
]
systems = [system for _, system in runs]
labels = [label for label, _ in runs]

# Reciprocal rank on the random entry-page topics, with test='t' against the
# baseline and correction='b' applied to the p-values.
pt.Experiment(
    systems,
    TOPICS['v1-random'],
    QRELS['v1-random'],
    ['recip_rank'],
    labels,
    baseline=0,
    test='t',
    correction='b',
)

Unnamed: 0,name,recip_rank,recip_rank +,recip_rank -,recip_rank p-value,recip_rank reject,recip_rank p-value corrected
0,BM25@Content,0.207601,,,,False,
1,DeepCT@Orcas,0.374346,58.0,8.0,1.38518e-06,True,1.38518e-05
2,DeepCT@Anchor,0.428108,63.0,5.0,5.402745e-09,True,5.402745e-08
3,DeepCT@Train,0.268742,39.0,24.0,0.08015659,False,0.8015659
4,MonoT5,0.380031,38.0,20.0,1.850124e-06,True,1.850124e-05
5,MonoBERT,0.344202,36.0,21.0,8.487255e-06,True,8.487255e-05
6,LambdaMART@CTA,0.422489,54.0,3.0,3.9057e-09,True,3.9057e-08
7,LambdaMART@CTOA,0.470258,56.0,1.0,1.431657e-11,True,1.431657e-10
8,LambdaMART@CTO,0.402079,54.0,6.0,1.381525e-06,True,1.381525e-05
9,LambdaMART@CT,0.261211,48.0,14.0,0.1017261,False,1.0


Result:
    
DeepCT trained on anchor text, DeepCT trained on the ORCAS query log, MonoT5, MonoBERT, and three of the LambdaMART models improve statistically significantly upon the MRR of 0.21 achieved by the BM25 retrieval on the content.

### Comparison of BM25 on Orcas with other Content-Only Models

In [6]:
# The first entry (BM25@Orcas) is the baseline that every other run is
# tested against (baseline=0 below).
approaches = [
    'BM25@Orcas',
    'BM25@Content',
    'DeepCT@Orcas',
    'DeepCT@Anchor',
    'DeepCT@Train',
    'MonoT5',
    'MonoBERT',
]
runs = [
    (approach, trec_run(RUN_DIR + '/entrypage-random/' + APPROACH_TO_MARCO_V1_RUN_FILE[approach]))
    for approach in approaches
]
systems = [system for _, system in runs]
labels = [label for label, _ in runs]

# Reciprocal rank on the random entry-page topics, with test='t' against the
# baseline and correction='b' applied to the p-values.
pt.Experiment(
    systems,
    TOPICS['v1-random'],
    QRELS['v1-random'],
    ['recip_rank'],
    labels,
    baseline=0,
    test='t',
    correction='b',
)

Unnamed: 0,name,recip_rank,recip_rank +,recip_rank -,recip_rank p-value,recip_rank reject,recip_rank p-value corrected
0,BM25@Orcas,0.587358,,,,False,
1,BM25@Content,0.207601,18.0,64.0,3.13861e-09,True,2.197027e-08
2,DeepCT@Orcas,0.374346,21.0,53.0,0.0006635362,True,0.004644753
3,DeepCT@Anchor,0.428108,23.0,48.0,0.004142828,True,0.0289998
4,DeepCT@Train,0.268742,16.0,60.0,3.040394e-07,True,2.128276e-06
5,MonoT5,0.380031,21.0,51.0,0.001817491,True,0.01272244
6,MonoBERT,0.344202,21.0,54.0,0.000278234,True,0.001947638


Result:
    
BM25 on ORCAS improves statistically significantly upon all content-only models.

### Comparison of all Anchor-Text Models with all other approaches

In [4]:
# The first entry (BM25@2016-07) is the baseline that every other run is
# tested against (baseline=0 below).
approaches = [
    'BM25@2016-07',
    'BM25@Content',
    'DeepCT@Anchor',
    'DeepCT@Orcas',
    'DeepCT@Train',
    'MonoT5',
    'MonoBERT',
    'BM25@Orcas',
    'LambdaMART@CTOA',
    'LambdaMART@CTO',
    'LambdaMART@CTA',
    'LambdaMART@CT',
]
runs = [
    (approach, trec_run(RUN_DIR + '/entrypage-popular/' + APPROACH_TO_MARCO_V1_RUN_FILE[approach]))
    for approach in approaches
]
systems = [system for _, system in runs]
labels = [label for label, _ in runs]

# Reciprocal rank on the popular entry-page topics, with test='t' against the
# baseline and correction='b' applied to the p-values.
pt.Experiment(
    systems,
    TOPICS['v1-popular'],
    QRELS['v1-popular'],
    ['recip_rank'],
    labels,
    baseline=0,
    test='t',
    correction='b',
)

Unnamed: 0,name,recip_rank,recip_rank +,recip_rank -,recip_rank p-value,recip_rank reject,recip_rank p-value corrected
0,BM25@2016-07,0.622108,,,,False,
1,BM25@Content,0.01971,1.0,95.0,3.3547990000000004e-27,True,4.025759e-26
2,DeepCT@Anchor,0.030247,1.0,95.0,6.075277e-27,True,7.290332999999999e-26
3,DeepCT@Orcas,0.020387,1.0,96.0,6.466991e-28,True,7.760389e-27
4,DeepCT@Train,0.015139,2.0,95.0,6.847513000000001e-28,True,8.217016e-27
5,MonoT5,0.019281,0.0,95.0,2.89847e-27,True,3.478164e-26
6,MonoBERT,0.019631,0.0,95.0,2.8921530000000003e-27,True,3.470583e-26
7,BM25@Orcas,0.283894,10.0,69.0,1.235129e-11,True,1.482155e-10
8,LambdaMART@CTOA,0.080452,3.0,91.0,3.994469e-23,True,4.793363000000001e-22
9,LambdaMART@CTO,0.070888,3.0,91.0,2.6952590000000003e-23,True,3.23431e-22


In [5]:
# The first entry (BM25@2017-04) is the baseline that every other run is
# tested against (baseline=0 below).
approaches = [
    'BM25@2017-04',
    'BM25@Content',
    'DeepCT@Anchor',
    'DeepCT@Orcas',
    'DeepCT@Train',
    'MonoT5',
    'MonoBERT',
    'BM25@Orcas',
    'LambdaMART@CTOA',
    'LambdaMART@CTO',
    'LambdaMART@CTA',
    'LambdaMART@CT',
]
runs = [
    (approach, trec_run(RUN_DIR + '/entrypage-popular/' + APPROACH_TO_MARCO_V1_RUN_FILE[approach]))
    for approach in approaches
]
systems = [system for _, system in runs]
labels = [label for label, _ in runs]

# Reciprocal rank on the popular entry-page topics, with test='t' against the
# baseline and correction='b' applied to the p-values.
pt.Experiment(
    systems,
    TOPICS['v1-popular'],
    QRELS['v1-popular'],
    ['recip_rank'],
    labels,
    baseline=0,
    test='t',
    correction='b',
)

Unnamed: 0,name,recip_rank,recip_rank +,recip_rank -,recip_rank p-value,recip_rank reject,recip_rank p-value corrected
0,BM25@2017-04,0.594353,,,,False,
1,BM25@Content,0.01971,0.0,96.0,1.374514e-25,True,1.649417e-24
2,DeepCT@Anchor,0.030247,0.0,96.0,2.60723e-25,True,3.1286759999999998e-24
3,DeepCT@Orcas,0.020387,1.0,96.0,3.469866e-26,True,4.163839e-25
4,DeepCT@Train,0.015139,1.0,96.0,4.8921779999999997e-26,True,5.870614000000001e-25
5,MonoT5,0.019281,0.0,96.0,1.184795e-25,True,1.421754e-24
6,MonoBERT,0.019631,0.0,96.0,1.1817870000000002e-25,True,1.418144e-24
7,BM25@Orcas,0.283894,11.0,67.0,2.826027e-10,True,3.391232e-09
8,LambdaMART@CTOA,0.080452,1.0,93.0,1.344188e-21,True,1.613026e-20
9,LambdaMART@CTO,0.070888,2.0,92.0,1.07814e-21,True,1.293768e-20


In [6]:
# The first entry (BM25@2018-13) is the baseline that every other run is
# tested against (baseline=0 below).
approaches = [
    'BM25@2018-13',
    'BM25@Content',
    'DeepCT@Anchor',
    'DeepCT@Orcas',
    'DeepCT@Train',
    'MonoT5',
    'MonoBERT',
    'BM25@Orcas',
    'LambdaMART@CTOA',
    'LambdaMART@CTO',
    'LambdaMART@CTA',
    'LambdaMART@CT',
]
runs = [
    (approach, trec_run(RUN_DIR + '/entrypage-popular/' + APPROACH_TO_MARCO_V1_RUN_FILE[approach]))
    for approach in approaches
]
systems = [system for _, system in runs]
labels = [label for label, _ in runs]

# Reciprocal rank on the popular entry-page topics, with test='t' against the
# baseline and correction='b' applied to the p-values.
pt.Experiment(
    systems,
    TOPICS['v1-popular'],
    QRELS['v1-popular'],
    ['recip_rank'],
    labels,
    baseline=0,
    test='t',
    correction='b',
)

Unnamed: 0,name,recip_rank,recip_rank +,recip_rank -,recip_rank p-value,recip_rank reject,recip_rank p-value corrected
0,BM25@2018-13,0.53459,,,,False,
1,BM25@Content,0.01971,1.0,95.0,8.248166e-22,True,9.897799e-21
2,DeepCT@Anchor,0.030247,0.0,95.0,1.636321e-22,True,1.963585e-21
3,DeepCT@Orcas,0.020387,0.0,96.0,3.076506e-23,True,3.691807e-22
4,DeepCT@Train,0.015139,0.0,96.0,2.043835e-23,True,2.452602e-22
5,MonoT5,0.019281,0.0,95.0,7.769732e-23,True,9.323679000000001e-22
6,MonoBERT,0.019631,0.0,95.0,7.582235e-23,True,9.098682e-22
7,BM25@Orcas,0.283894,15.0,66.0,6.52544e-07,True,7.830528e-06
8,LambdaMART@CTOA,0.080452,1.0,92.0,5.034594e-19,True,6.041513e-18
9,LambdaMART@CTO,0.070888,1.0,92.0,4.540724e-19,True,5.448869e-18


In [7]:
# The first entry (BM25@2019-47) is the baseline that every other run is
# tested against (baseline=0 below).
approaches = [
    'BM25@2019-47',
    'BM25@Content',
    'DeepCT@Anchor',
    'DeepCT@Orcas',
    'DeepCT@Train',
    'MonoT5',
    'MonoBERT',
    'BM25@Orcas',
    'LambdaMART@CTOA',
    'LambdaMART@CTO',
    'LambdaMART@CTA',
    'LambdaMART@CT',
]
runs = [
    (approach, trec_run(RUN_DIR + '/entrypage-popular/' + APPROACH_TO_MARCO_V1_RUN_FILE[approach]))
    for approach in approaches
]
systems = [system for _, system in runs]
labels = [label for label, _ in runs]

# Reciprocal rank on the popular entry-page topics, with test='t' against the
# baseline and correction='b' applied to the p-values.
pt.Experiment(
    systems,
    TOPICS['v1-popular'],
    QRELS['v1-popular'],
    ['recip_rank'],
    labels,
    baseline=0,
    test='t',
    correction='b',
)

Unnamed: 0,name,recip_rank,recip_rank +,recip_rank -,recip_rank p-value,recip_rank reject,recip_rank p-value corrected
0,BM25@2019-47,0.575995,,,,False,
1,BM25@Content,0.01971,1.0,95.0,1.533954e-24,True,1.840745e-23
2,DeepCT@Anchor,0.030247,0.0,95.0,3.5870040000000003e-25,True,4.3044049999999996e-24
3,DeepCT@Orcas,0.020387,0.0,96.0,5.673189e-26,True,6.807827e-25
4,DeepCT@Train,0.015139,0.0,96.0,3.4773969999999996e-26,True,4.172877e-25
5,MonoT5,0.019281,0.0,95.0,2.809035e-25,True,3.370842e-24
6,MonoBERT,0.019631,0.0,95.0,3.297982e-25,True,3.957578e-24
7,BM25@Orcas,0.283894,9.0,69.0,1.776304e-09,True,2.131564e-08
8,LambdaMART@CTOA,0.080452,3.0,90.0,4.968238e-22,True,5.961885e-21
9,LambdaMART@CTO,0.070888,1.0,92.0,2.062016e-22,True,2.474419e-21


In [8]:
# The first entry (BM25@2020-05) is the baseline that every other run is
# tested against (baseline=0 below).
approaches = [
    'BM25@2020-05',
    'BM25@Content',
    'DeepCT@Anchor',
    'DeepCT@Orcas',
    'DeepCT@Train',
    'MonoT5',
    'MonoBERT',
    'BM25@Orcas',
    'LambdaMART@CTOA',
    'LambdaMART@CTO',
    'LambdaMART@CTA',
    'LambdaMART@CT',
]
runs = [
    (approach, trec_run(RUN_DIR + '/entrypage-popular/' + APPROACH_TO_MARCO_V1_RUN_FILE[approach]))
    for approach in approaches
]
systems = [system for _, system in runs]
labels = [label for label, _ in runs]

# Reciprocal rank on the popular entry-page topics, with test='t' against the
# baseline and correction='b' applied to the p-values.
pt.Experiment(
    systems,
    TOPICS['v1-popular'],
    QRELS['v1-popular'],
    ['recip_rank'],
    labels,
    baseline=0,
    test='t',
    correction='b',
)

Unnamed: 0,name,recip_rank,recip_rank +,recip_rank -,recip_rank p-value,recip_rank reject,recip_rank p-value corrected
0,BM25@2020-05,0.546413,,,,False,
1,BM25@Content,0.01971,1.0,95.0,3.2834140000000005e-23,True,3.9400960000000004e-22
2,DeepCT@Anchor,0.030247,0.0,95.0,7.678368e-24,True,9.214042000000001e-23
3,DeepCT@Orcas,0.020387,0.0,96.0,1.314591e-24,True,1.5775090000000003e-23
4,DeepCT@Train,0.015139,0.0,96.0,8.141747e-25,True,9.770096e-24
5,MonoT5,0.019281,0.0,95.0,6.712497e-24,True,8.054996000000001e-23
6,MonoBERT,0.019631,0.0,95.0,8.284593999999999e-24,True,9.941513000000001e-23
7,BM25@Orcas,0.283894,12.0,67.0,1.235936e-07,True,1.483123e-06
8,LambdaMART@CTOA,0.080452,2.0,89.0,2.436762e-20,True,2.9241149999999995e-19
9,LambdaMART@CTO,0.070888,1.0,92.0,1.1150289999999999e-20,True,1.338035e-19


In [9]:
# The first entry (BM25@2021-04) is the baseline that every other run is
# tested against (baseline=0 below).
approaches = [
    'BM25@2021-04',
    'BM25@Content',
    'DeepCT@Anchor',
    'DeepCT@Orcas',
    'DeepCT@Train',
    'MonoT5',
    'MonoBERT',
    'BM25@Orcas',
    'LambdaMART@CTOA',
    'LambdaMART@CTO',
    'LambdaMART@CTA',
    'LambdaMART@CT',
]
runs = [
    (approach, trec_run(RUN_DIR + '/entrypage-popular/' + APPROACH_TO_MARCO_V1_RUN_FILE[approach]))
    for approach in approaches
]
systems = [system for _, system in runs]
labels = [label for label, _ in runs]

# Reciprocal rank on the popular entry-page topics, with test='t' against the
# baseline and correction='b' applied to the p-values.
pt.Experiment(
    systems,
    TOPICS['v1-popular'],
    QRELS['v1-popular'],
    ['recip_rank'],
    labels,
    baseline=0,
    test='t',
    correction='b',
)

Unnamed: 0,name,recip_rank,recip_rank +,recip_rank -,recip_rank p-value,recip_rank reject,recip_rank p-value corrected
0,BM25@2021-04,0.542634,,,,False,
1,BM25@Content,0.01971,1.0,96.0,2.2433450000000003e-22,True,2.692014e-21
2,DeepCT@Anchor,0.030247,0.0,96.0,3.795649e-23,True,4.554779e-22
3,DeepCT@Orcas,0.020387,0.0,97.0,7.747182e-24,True,9.296618e-23
4,DeepCT@Train,0.015139,0.0,97.0,3.176723e-24,True,3.812067e-23
5,MonoT5,0.019281,0.0,96.0,1.864243e-23,True,2.2370920000000003e-22
6,MonoBERT,0.019631,0.0,96.0,1.709393e-23,True,2.051272e-22
7,BM25@Orcas,0.283894,15.0,65.0,6.135173e-07,True,7.362208e-06
8,LambdaMART@CTOA,0.080452,2.0,92.0,9.052180999999999e-20,True,1.086262e-18
9,LambdaMART@CTO,0.070888,1.0,93.0,5.525136999999999e-20,True,6.630164999999999e-19


In [10]:
# The first entry (BM25@16--21) is the baseline that every other run is
# tested against (baseline=0 below).
approaches = [
    'BM25@16--21',
    'BM25@Content',
    'DeepCT@Anchor',
    'DeepCT@Orcas',
    'DeepCT@Train',
    'MonoT5',
    'MonoBERT',
    'BM25@Orcas',
    'LambdaMART@CTOA',
    'LambdaMART@CTO',
    'LambdaMART@CTA',
    'LambdaMART@CT',
]
runs = [
    (approach, trec_run(RUN_DIR + '/entrypage-popular/' + APPROACH_TO_MARCO_V1_RUN_FILE[approach]))
    for approach in approaches
]
systems = [system for _, system in runs]
labels = [label for label, _ in runs]

# Reciprocal rank on the popular entry-page topics, with test='t' against the
# baseline and correction='b' applied to the p-values.
pt.Experiment(
    systems,
    TOPICS['v1-popular'],
    QRELS['v1-popular'],
    ['recip_rank'],
    labels,
    baseline=0,
    test='t',
    correction='b',
)

Unnamed: 0,name,recip_rank,recip_rank +,recip_rank -,recip_rank p-value,recip_rank reject,recip_rank p-value corrected
0,BM25@16--21,0.548738,,,,False,
1,BM25@Content,0.01971,1.0,97.0,1.5874870000000002e-23,True,1.904984e-22
2,DeepCT@Anchor,0.030247,0.0,97.0,2.659767e-24,True,3.1917210000000005e-23
3,DeepCT@Orcas,0.020387,0.0,98.0,4.07735e-25,True,4.8928199999999996e-24
4,DeepCT@Train,0.015139,0.0,98.0,2.690777e-25,True,3.228933e-24
5,MonoT5,0.019281,0.0,97.0,1.1045979999999999e-24,True,1.325518e-23
6,MonoBERT,0.019631,0.0,97.0,1.075175e-24,True,1.2902100000000001e-23
7,BM25@Orcas,0.283894,13.0,65.0,2.481725e-08,True,2.97807e-07
8,LambdaMART@CTOA,0.080452,1.0,94.0,4.251118e-21,True,5.1013419999999995e-20
9,LambdaMART@CTO,0.070888,1.0,94.0,3.482613e-21,True,4.1791359999999996e-20


Result:

For queries pointing to popular entry pages, all BM25 models retrieving on anchor text outperform all other retrieval models statistically significantly.

### Compare BM25 on ORCAS for popular topics with all other non-anchor-approaches

In [11]:
# The first entry (BM25@Orcas) is the baseline that every other run is
# tested against (baseline=0 below).
approaches = [
    'BM25@Orcas',
    'BM25@Content',
    'DeepCT@Anchor',
    'DeepCT@Orcas',
    'DeepCT@Train',
    'MonoT5',
    'MonoBERT',
    'LambdaMART@CTOA',
    'LambdaMART@CTO',
    'LambdaMART@CTA',
    'LambdaMART@CT',
]
runs = [
    (approach, trec_run(RUN_DIR + '/entrypage-popular/' + APPROACH_TO_MARCO_V1_RUN_FILE[approach]))
    for approach in approaches
]
systems = [system for _, system in runs]
labels = [label for label, _ in runs]

# Reciprocal rank on the popular entry-page topics, with test='t' against the
# baseline and correction='b' applied to the p-values.
pt.Experiment(
    systems,
    TOPICS['v1-popular'],
    QRELS['v1-popular'],
    ['recip_rank'],
    labels,
    baseline=0,
    test='t',
    correction='b',
)

Unnamed: 0,name,recip_rank,recip_rank +,recip_rank -,recip_rank p-value,recip_rank reject,recip_rank p-value corrected
0,BM25@Orcas,0.283894,,,,False,
1,BM25@Content,0.01971,12.0,57.0,1.32369e-09,True,1.456059e-08
2,DeepCT@Anchor,0.030247,21.0,57.0,1.778996e-09,True,1.956896e-08
3,DeepCT@Orcas,0.020387,19.0,58.0,5.093902e-10,True,5.603292e-09
4,DeepCT@Train,0.015139,13.0,57.0,3.061334e-10,True,3.367467e-09
5,MonoT5,0.019281,3.0,58.0,2.79271e-09,True,3.07198e-08
6,MonoBERT,0.019631,3.0,57.0,3.11298e-09,True,3.424277e-08
7,LambdaMART@CTOA,0.080452,13.0,52.0,8.507425e-07,True,9.358167e-06
8,LambdaMART@CTO,0.070888,14.0,53.0,2.523759e-07,True,2.776134e-06
9,LambdaMART@CTA,0.062282,14.0,54.0,1.378659e-07,True,1.516524e-06
