# Evaluation of K-Fold-GridSearch

In [10]:
import pyterrier as pt
from glob import glob
from tqdm import tqdm
import pandas as pd
from trectools import TrecQrel, TrecRun, TrecEval
from pathlib import Path
import json
import argparse
import numpy as np

# Qrels loaded from each year's '*.helpful-only' file, keyed by track year.
QRELS_HELPFUL = {
    year: TrecQrel('../../../../third-party/health-misinfo-{}/misinfo-qrels-graded.helpful-only'.format(year))
    for year in (19, 20, 21)
}

# Qrels loaded from each year's '*.harmful-only' file, keyed by track year.
QRELS_HARMFUL = {
    year: TrecQrel('../../../../third-party/health-misinfo-{}/misinfo-qrels-graded.harmful-only'.format(year))
    for year in (19, 20, 21)
}

# RUNS[year][suffix] is the run read from '<year>/hmi-<year><suffix>/run.txt';
# the empty suffix addresses the plain 'hmi-<year>' directory.
RUNS = {}

for year in QRELS_HELPFUL.keys():
    RUNS[year] = {
        i: TrecRun('{0}/hmi-{0}{1}/run.txt'.format(year, i))
        for i in [
            '',
            '-rm3',
            '-rm3-kq',
            '-rm3rel-1',
            '-rm3-kqrel-1',
            '-rm3rel-2',
            '-rm3-kqrel-2',
            '-rm3rel-3',
            '-rm3-kqrel-3',
            '-rm3rel-4',
            '-rm3-kqrel-4',
            '-rm3rel-5',
            '-rm3-kqrel-5',
            '-bm25-move-to-top-rel1',
            '-bm25-move-to-top-rel2',
            '-bm25-move-to-top-rel3',
            '-bm25-move-to-top-rel4',
            '-bm25-move-to-top-rel5',
            '-castorini-monobert-large-msmarco',
            '-castorini-monot5-base-msmarco',
            '-trec',
            '-trec-without-top-1',
            '-trec-without-top-2',
        ]
    }

In [11]:
!cat 19/hmi-19-trec/params.json

[{"approach": "input.IELAB07_xWiki_q.gz", "base_dir": "/mnt/ceph/storage/data-in-progress/data-research/web-search/romcir22-keyquery/runs/hmi-19-trec"}, {"approach": "input.UWatMDSBM25_HC3.gz", "base_dir": "/mnt/ceph/storage/data-in-progress/data-research/web-search/romcir22-keyquery/runs/hmi-19-trec"}, {"approach": "input.UWatMDSBM25_HC3.gz", "base_dir": "/mnt/ceph/storage/data-in-progress/data-research/web-search/romcir22-keyquery/runs/hmi-19-trec"}]

In [12]:
!cat 20/hmi-20-trec-without-top-1/params.json

[{"approach": "input.adhoc_run6.gz", "base_dir": "/mnt/ceph/storage/data-in-progress/data-research/web-search/romcir22-keyquery/runs/hmi-20-trec-without-top-1"}, {"approach": "input.adhoc_run2.gz", "base_dir": "/mnt/ceph/storage/data-in-progress/data-research/web-search/romcir22-keyquery/runs/hmi-20-trec-without-top-1"}, {"approach": "input.cn-kq-td.gz", "base_dir": "/mnt/ceph/storage/data-in-progress/data-research/web-search/romcir22-keyquery/runs/hmi-20-trec-without-top-1"}]

In [13]:
!cat 20/hmi-20-trec/params.json

[{"approach": "input.h2oloo.m5.gz", "base_dir": "/mnt/ceph/storage/data-in-progress/data-research/web-search/romcir22-keyquery/runs/hmi-20-trec"}, {"approach": "input.h2oloo.m5.gz", "base_dir": "/mnt/ceph/storage/data-in-progress/data-research/web-search/romcir22-keyquery/runs/hmi-20-trec"}, {"approach": "input.h2oloo.m5.gz", "base_dir": "/mnt/ceph/storage/data-in-progress/data-research/web-search/romcir22-keyquery/runs/hmi-20-trec"}]

In [14]:
def eval_year(year, run):
    """Score one run against the helpful and the harmful qrels of a track year.

    Args:
        year: Key into QRELS_HELPFUL / QRELS_HARMFUL (19, 20 or 21 in this notebook).
        run: Run object handed to TrecEval (the notebook passes TrecRun instances).

    Returns:
        dict with three entries: 'helpful' and 'harmful' hold the value of
        get_ndcg(depth=10) against the respective qrels, 'help-harm' holds
        their difference.
    """
    # Compute each nDCG@10 exactly once and reuse it for the difference,
    # instead of re-evaluating both measures a second time.
    helpful = TrecEval(run, QRELS_HELPFUL[year]).get_ndcg(depth=10)
    harmful = TrecEval(run, QRELS_HARMFUL[year]).get_ndcg(depth=10)

    return {
        'helpful': helpful,
        'harmful': harmful,
        'help-harm': helpful - harmful,
    }
    
def df():
    """Tabulate the BM25, BM25+RM3 and BM25+KQ-RM3 runs across all three years.

    Returns:
        DataFrame with one row per system and the columns 'run', 'HMI-19',
        'HMI-20' and 'HMI-21'; every 'HMI-*' cell holds the dict returned by
        eval_year() for that year.

    NOTE(review): a later cell defines df(year) under the same name, which
    replaces this zero-argument version once that cell has been executed.
    """
    systems = [('BM25', ''), ('BM25+RM3', '-rm3'), ('BM25+KQ-RM3', '-rm3-kq')]

    records = []
    for display_name, value_name in systems:
        record = {'run': display_name}
        for year in (19, 20, 21):
            record['HMI-' + str(year)] = eval_year(year, RUNS[year][value_name])
        records.append(record)

    return pd.DataFrame(records)

In [15]:
df().stack().to_frame()

Unnamed: 0,Unnamed: 1,0
0,run,BM25
0,HMI-19,"{'helpful': 0.19391015303203746, 'harmful': 0...."
0,HMI-20,"{'helpful': 0.28785634929701587, 'harmful': 0...."
0,HMI-21,"{'helpful': 0.28504469657083625, 'harmful': 0...."
1,run,BM25+RM3
1,HMI-19,"{'helpful': 0.4485071672573105, 'harmful': 0.2..."
1,HMI-20,"{'helpful': 0.5219519177263294, 'harmful': 0.0..."
1,HMI-21,"{'helpful': 0.358922551571216, 'harmful': 0.11..."
2,run,BM25+KQ-RM3
2,HMI-19,"{'helpful': 0.5369469531409167, 'harmful': 0.1..."


In [16]:
def df(year):
    """Tabulate the scores of every compared system for a single track year.

    Args:
        year: Key into RUNS and the qrels dictionaries (19, 20 or 21).

    Returns:
        DataFrame with one row per system: the display label in 'run',
        followed by the entries of eval_year() for that system's run.
    """
    # (display label, key into RUNS[year]) in the order the rows should appear.
    systems = [
        ('BM25', ''),
        ('MonoBERT', '-castorini-monobert-large-msmarco'),
        ('MonoT5', '-castorini-monot5-base-msmarco'),
        ('Top1@TREC', '-trec'),
        ('Top2@TREC', '-trec-without-top-1'),
        ('Top3@TREC', '-trec-without-top-2'),
        ('BM25-To-Top (rel=1)', '-bm25-move-to-top-rel1'),
        ('BM25+RM3 (rel=1)', '-rm3rel-1'),
        ('BM25+KQ-RM3 (rel=1)', '-rm3-kqrel-1'),
        ('BM25-To-Top (rel=2)', '-bm25-move-to-top-rel2'),
        ('BM25+RM3 (rel=2)', '-rm3rel-2'),
        ('BM25+KQ-RM3 (rel=2)', '-rm3-kqrel-2'),
        ('BM25-To-Top (rel=3)', '-bm25-move-to-top-rel3'),
        ('BM25+RM3 (rel=3)', '-rm3rel-3'),
        ('BM25+KQ-RM3 (rel=3)', '-rm3-kqrel-3'),
        ('BM25-To-Top (rel=4)', '-bm25-move-to-top-rel4'),
        ('BM25+RM3 (rel=4)', '-rm3rel-4'),
        ('BM25+KQ-RM3 (rel=4)', '-rm3-kqrel-4'),
        ('BM25-To-Top (rel=5)', '-bm25-move-to-top-rel5'),
        ('BM25+RM3 (rel=5)', '-rm3rel-5'),
        ('BM25+KQ-RM3 (rel=5)', '-rm3-kqrel-5'),
        ('BM25+RM3 (rel=var)', '-rm3'),
        ('BM25+KQ-RM3 (rel=var)', '-rm3-kq'),
    ]

    records = [
        {'run': display_name, **eval_year(year, RUNS[year][value_name])}
        for display_name, value_name in systems
    ]

    return pd.DataFrame(records)

In [17]:
# One score table per track year; the dict keys become the outer column level.
dict_of_df = {'HMI ' + str(track_year): df(track_year) for track_year in (19, 20, 21)}

tmp = pd.concat(dict_of_df, axis=1)

# df(year) lists the systems in the same order for every year, so the labels of
# the first table serve as the shared row index and the per-year copies can go.
tmp['run'] = tmp[('HMI 19', 'run')]
tmp = tmp.drop(columns=[(collection, 'run') for collection in dict_of_df])
tmp = tmp.set_index('run')
tmp

Unnamed: 0_level_0,HMI 19,HMI 19,HMI 19,HMI 20,HMI 20,HMI 20,HMI 21,HMI 21,HMI 21
Unnamed: 0_level_1,helpful,harmful,help-harm,helpful,harmful,help-harm,helpful,harmful,help-harm
run,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
BM25,0.19391,0.35194,-0.15803,0.287856,0.035955,0.251901,0.285045,0.179838,0.105206
MonoBERT,0.210121,0.334238,-0.124117,0.16336,0.028425,0.134935,0.208775,0.119695,0.089079
MonoT5,0.222467,0.336615,-0.114149,0.298297,0.061047,0.237251,0.204353,0.134668,0.069685
Top1@TREC,0.211756,0.268357,-0.056601,0.661939,0.045547,0.616392,0.520841,0.07865,0.442191
Top2@TREC,0.262721,0.283187,-0.020466,0.430373,0.085243,0.34513,0.566625,0.074716,0.491909
Top3@TREC,0.169243,0.149781,0.019461,0.455615,0.047967,0.407648,0.532842,0.077741,0.455101
BM25-To-Top (rel=1),0.375172,0.245601,0.12957,0.373824,0.032754,0.341069,0.270049,0.083015,0.187034
BM25+RM3 (rel=1),0.41612,0.193297,0.222823,0.477831,0.05296,0.424872,0.365702,0.088355,0.277347
BM25+KQ-RM3 (rel=1),0.440623,0.204091,0.236532,0.468384,0.056682,0.411701,0.432491,0.075789,0.356702
BM25-To-Top (rel=2),0.467979,0.210975,0.257004,0.463534,0.035593,0.427942,0.304254,0.068585,0.235668


In [18]:
def f(x):
    """Format a number as fixed-point text with three decimal places."""
    return format(x, '.3f')

def row(i):
    """Render one row of the results frame as a line of the LaTeX table.

    Args:
        i: Row as yielded by DataFrame.iterrows(). i.name is the system label,
            optionally carrying a ' (rel=<value>)' suffix, and the scores are
            addressed by ('HMI <year>', <measure>) tuples.

    Returns:
        The cells feedback, system name and the nine formatted scores
        (helpful, harmful, help-harm for HMI 19, 20, 21), joined by ' & ' and
        terminated by a LaTeX line break.
    """
    label = i.name

    # Labels without a 'rel=' part get a placeholder in the feedback column.
    if 'rel=' in label:
        fb = label.split('rel=')[1].split(')')[0]
    else:
        fb = '---'

    cells = [fb, label.split(' (rel=')[0]]
    for collection in ('HMI 19', 'HMI 20', 'HMI 21'):
        for measure in ('helpful', 'harmful', 'help-harm'):
            cells.append(f(i[(collection, measure)]))

    # A single join keeps the column separator uniform across all cells.
    return ' & '.join(cells) + ' \\\\'

def print_table_effectiveness(df):
    """Assemble the LaTeX source of the effectiveness table.

    Args:
        df: Results frame whose rows are rendered one by one with row().

    Returns:
        The complete table* environment as one string: fixed preamble and
        column headings, one line per row of df, fixed closing lines.
    """
    header_lines = [
        '\\begin{table*}[bt]%',
        '\\caption{TBD.}%',
        '\\label{table-effectiveness}%',
        '\\renewcommand{\\tabcolsep}{5pt}%',
        '\\begin{tabular}{@{\\extracolsep{\\fill}}ll@{\\qquad}ccc@{\\quad}ccc@{\\quad}ccc@{}}',
        '\\toprule',
        '\\multicolumn{2}{@{}l@{}}{\\bfseries Retrieval system}  & \\multicolumn{3}{@{}c@{\\quad}}{\\bfseries HMI 19} & \\multicolumn{3}{@{}c@{\\quad}}{\\bfseries HMI 20} & \\multicolumn{3}{@{}c@{\\quad}}{\\bfseries HMI 21} \\\\',
        '\\cmidrule(r{1em}){1-2} \\cmidrule(r{1em}){3-5} \\cmidrule(r{1em}){6-8} \\cmidrule(){9-11}',
        'Feedback & System                    & Help             & Harm             & H-H            & Help             & Harm             & H-H            & Help             & Harm             & H-H            \\\\',
        '\\midrule',
    ]
    header = '\n'.join(header_lines) + '\n'

    # The blank after \end{table*} is part of the emitted text and kept as is.
    footer = '\n\\bottomrule\n\\end{tabular}%\n\\end{table*} \n'

    body = '\n'.join(row(record) for _, record in df.iterrows())

    return header + body + footer

print(print_table_effectiveness(tmp))

\begin{table*}[bt]%
\caption{TBD.}%
\label{table-effectiveness}%
\renewcommand{\tabcolsep}{5pt}%
\begin{tabular}{@{\extracolsep{\fill}}ll@{\qquad}ccc@{\quad}ccc@{\quad}ccc@{}}
\toprule
\multicolumn{2}{@{}l@{}}{\bfseries Retrieval system}  & \multicolumn{3}{@{}c@{\quad}}{\bfseries HMI 19} & \multicolumn{3}{@{}c@{\quad}}{\bfseries HMI 20} & \multicolumn{3}{@{}c@{\quad}}{\bfseries HMI 21} \\
\cmidrule(r{1em}){1-2} \cmidrule(r{1em}){3-5} \cmidrule(r{1em}){6-8} \cmidrule(){9-11}
Feedback & System                    & Help             & Harm             & H-H            & Help             & Harm             & H-H            & Help             & Harm             & H-H            \\
\midrule
--- & BM25 & 0.194 & 0.352  &  -0.158 & 0.288 & 0.036 & 0.252 & 0.285 & 0.180 & 0.105 \\
--- & MonoBERT & 0.210 & 0.334  &  -0.124 & 0.163 & 0.028 & 0.135 & 0.209 & 0.120 & 0.089 \\
--- & MonoT5 & 0.222 & 0.337  &  -0.114 & 0.298 & 0.061 & 0.237 & 0.204 & 0.135 & 0.070 \\
--- & Top1@TREC & 0.212 & 0.268  &

In [75]:
tmp.iloc[0].keys()

MultiIndex([('HMI 19',   'helpful'),
            ('HMI 19',   'harmful'),
            ('HMI 19', 'help-harm'),
            ('HMI 20',   'helpful'),
            ('HMI 20',   'harmful'),
            ('HMI 20', 'help-harm'),
            ('HMI 21',   'helpful'),
            ('HMI 21',   'harmful'),
            ('HMI 21', 'help-harm')],
           )

In [21]:
# NOTE(review): eval_df is not defined in any cell visible in this notebook —
# presumably left over from a deleted cell; confirm, then restore the definition
# or drop this cell, otherwise it raises NameError on a fresh kernel.
eval_df(21)

Unnamed: 0,run,ndcg_cut_10.helpful,ndcg_cut_10.harmful
0,,0.285045,0.179838
1,-rm3,0.358923,0.114926
2,-rm3-kq,0.458101,0.093002


# Configurations for KFoldGridSearch

In [1]:
!ls /mnt/ceph/storage/data-in-progress/data-research/web-search/romcir22-keyquery/runs/|grep 19

hmi-19
hmi-19-rm3
hmi-19-rm3-kq


```
./pt-k-fold-cross-validation.py --year 19 --base_dir hmi-19
./pt-k-fold-cross-validation.py --year 19 --base_dir hmi-19-rm3
./pt-k-fold-cross-validation.py --year 19 --base_dir hmi-19-rm3-kq
```

In [2]:
!ls /mnt/ceph/storage/data-in-progress/data-research/web-search/romcir22-keyquery/runs/|grep 20

hmi-20
hmi-20-rm3
hmi-20-rm3-kq


```
./pt-k-fold-cross-validation.py --year 20 --base_dir hmi-20
./pt-k-fold-cross-validation.py --year 20 --base_dir hmi-20-rm3
./pt-k-fold-cross-validation.py --year 20 --base_dir hmi-20-rm3-kq
```

In [3]:
!ls /mnt/ceph/storage/data-in-progress/data-research/web-search/romcir22-keyquery/runs/|grep 21

hmi-21
hmi-21-rm3
hmi-21-rm3-kq


```
./pt-k-fold-cross-validation.py --year 21 --base_dir hmi-21
./pt-k-fold-cross-validation.py --year 21 --base_dir hmi-21-rm3
./pt-k-fold-cross-validation.py --year 21 --base_dir hmi-21-rm3-kq
```

In [28]:
!cp -R ../../../../third-party/health-misinfo-19/runs/ /mnt/ceph/storage/data-in-progress/data-research/web-search/romcir22-keyquery/runs/hmi-19-trec
!cp -R ../../../../third-party/health-misinfo-20/runs/ /mnt/ceph/storage/data-in-progress/data-research/web-search/romcir22-keyquery/runs/hmi-20-trec

In [5]:
!mkdir -p /mnt/ceph/storage/data-in-progress/data-research/web-search/romcir22-keyquery/runs/hmi-21-trec
!cp ../../../../third-party/health-misinfo-21/runs/*.txt /mnt/ceph/storage/data-in-progress/data-research/web-search/romcir22-keyquery/runs/hmi-21-trec

# Run Cross Validation

In [6]:
import importlib
cross_validation = importlib.import_module('pt-k-fold-cross-validation')

PyTerrier 0.7.2 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)


No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [14]:
cross_validation.main(19, 'hmi-19-trec')

In [18]:
cross_validation.main(19, 'hmi-19-trec-without-top-1')

In [33]:
cross_validation.main(19, 'hmi-19-trec-without-top-2')

In [15]:
cross_validation.main(20, 'hmi-20-trec')

In [19]:
cross_validation.main(20, 'hmi-20-trec-without-top-1')

In [34]:
cross_validation.main(20, 'hmi-20-trec-without-top-2')

In [7]:
cross_validation.main(21, 'hmi-21-trec')

Fold 1
Best ndcg_cut.10 is 0.423895
Best setting is ['<pt-k-fold-cross-validation.RunFileTransformer object at 0x7fa534179c10> approach=WatSMC-Correct.txt', '<pt-k-fold-cross-validation.RunFileTransformer object at 0x7fa534179c10> base_dir=/mnt/ceph/storage/data-in-progress/data-research/web-search/romcir22-keyquery/runs/hmi-21-trec']
Fold 2
Best ndcg_cut.10 is 0.378320
Best setting is ['<pt-k-fold-cross-validation.RunFileTransformer object at 0x7fa534179c10> approach=vera_mt5_0.95.txt', '<pt-k-fold-cross-validation.RunFileTransformer object at 0x7fa534179c10> base_dir=/mnt/ceph/storage/data-in-progress/data-research/web-search/romcir22-keyquery/runs/hmi-21-trec']
Fold 3
Best ndcg_cut.10 is 0.351490
Best setting is ['<pt-k-fold-cross-validation.RunFileTransformer object at 0x7fa534179c10> approach=vera_mt5_0.5.txt', '<pt-k-fold-cross-validation.RunFileTransformer object at 0x7fa534179c10> base_dir=/mnt/ceph/storage/data-in-progress/data-research/web-search/romcir22-keyquery/runs/hmi-21

In [8]:
cross_validation.main(21, 'hmi-21-trec-without-top-1')

Fold 1
Best ndcg_cut.10 is 0.423895
Best setting is ['<pt-k-fold-cross-validation.RunFileTransformer object at 0x7fa534179940> approach=WatSMC-Correct.txt', '<pt-k-fold-cross-validation.RunFileTransformer object at 0x7fa534179940> base_dir=/mnt/ceph/storage/data-in-progress/data-research/web-search/romcir22-keyquery/runs/hmi-21-trec-without-top-1']
Fold 2
Best ndcg_cut.10 is 0.371307
Best setting is ['<pt-k-fold-cross-validation.RunFileTransformer object at 0x7fa534179940> approach=WatSMC-Correct.txt', '<pt-k-fold-cross-validation.RunFileTransformer object at 0x7fa534179940> base_dir=/mnt/ceph/storage/data-in-progress/data-research/web-search/romcir22-keyquery/runs/hmi-21-trec-without-top-1']
Fold 3
Best ndcg_cut.10 is 0.338048
Best setting is ['<pt-k-fold-cross-validation.RunFileTransformer object at 0x7fa534179940> approach=WatSMC-Correct.txt', '<pt-k-fold-cross-validation.RunFileTransformer object at 0x7fa534179940> base_dir=/mnt/ceph/storage/data-in-progress/data-research/web-searc

In [9]:
cross_validation.main(21, 'hmi-21-trec-without-top-2')

Fold 1
Best ndcg_cut.10 is 0.372051
Best setting is ['<pt-k-fold-cross-validation.RunFileTransformer object at 0x7fa3f40c08b0> approach=mt5_r.txt', '<pt-k-fold-cross-validation.RunFileTransformer object at 0x7fa3f40c08b0> base_dir=/mnt/ceph/storage/data-in-progress/data-research/web-search/romcir22-keyquery/runs/hmi-21-trec-without-top-2']
Fold 2
Best ndcg_cut.10 is 0.366150
Best setting is ['<pt-k-fold-cross-validation.RunFileTransformer object at 0x7fa3f40c08b0> approach=mt5_r.txt', '<pt-k-fold-cross-validation.RunFileTransformer object at 0x7fa3f40c08b0> base_dir=/mnt/ceph/storage/data-in-progress/data-research/web-search/romcir22-keyquery/runs/hmi-21-trec-without-top-2']
Fold 3
Best ndcg_cut.10 is 0.305664
Best setting is ['<pt-k-fold-cross-validation.RunFileTransformer object at 0x7fa3f40c08b0> approach=mt5_r.txt', '<pt-k-fold-cross-validation.RunFileTransformer object at 0x7fa3f40c08b0> base_dir=/mnt/ceph/storage/data-in-progress/data-research/web-search/romcir22-keyquery/runs/hm

In [2]:
!ls /mnt/ceph/storage/data-in-progress/data-research/web-search/romcir22-keyquery/runs/|grep trec

hmi-19-trec
hmi19-trec
hmi-19-trec-without-top-1
hmi-19-trec-without-top-2
hmi-20-trec
hmi20-trec
hmi-20-trec-without-top-1
hmi-20-trec-without-top-2


# Copy move-to-top approaches

In [9]:
for year in [19, 20, 21]:
    for rel in [1,2,3,4,5]:
        !mkdir -p  {year}/hmi-{year}-bm25-move-to-top-rel{rel}
        !cp "/mnt/ceph/storage/data-in-progress/data-research/web-search/romcir22-keyquery/runs/hmi-{year}-move-to-top/run.hmi{year}.bm25_bm25(k1=0.9,b=0.4)_sequential-relevance-feedback(relevat-docs-to-top;rel={rel})-default" {year}/hmi-{year}-bm25-move-to-top-rel{rel}/run.txt


# Significance tests

In [19]:
def is_significant(a, b):
    """Paired t-test between two score vectors, followed by a Bonferroni step.

    Args:
        a: Scores of the first system.
        b: Scores of the second system, paired element-wise with a.

    Returns:
        Tuple (reject, corrected): the first two results of multipletests()
        at alpha=0.05, i.e. the rejection decisions and the corrected p-values.

    NOTE(review): only the p-value of this one comparison is handed to
    multipletests(), so the Bonferroni correction covers a family of one test
    per call — confirm this is the intended correction.
    """
    from scipy.stats import ttest_rel
    from statsmodels.stats.multitest import multipletests

    p_value = ttest_rel(a, b).pvalue
    outcome = multipletests(p_value, alpha=0.05, method='bonferroni')

    return outcome[0], outcome[1]

def to_dict(df):
    """Map every index label of df to that row's 'NDCG@10' value.

    Args:
        df: Frame with an 'NDCG@10' column.

    Returns:
        dict from index label to the 'NDCG@10' entry of the row.

    Raises:
        AssertionError: if an index label occurs more than once.
    """
    scores_by_label = {}
    for label, record in df.iterrows():
        # A repeated label would silently overwrite an earlier score.
        assert label not in scores_by_label
        scores_by_label[label] = record['NDCG@10']
    return scores_by_label

def _per_query_ndcg(run, qrels):
    """Return the per-query NDCG@10 column of run against qrels, NaN replaced by 0.

    Assumes the per-query frame returned by get_ndcg() is indexed by topic id
    and has an 'NDCG@10' column.
    """
    return TrecEval(run, qrels).get_ndcg(depth=10, per_query=True).fillna(0)['NDCG@10']


def _scores_by_measure(year, run):
    """Return per-topic 'helpful', 'harmful' and 'help-harm' scores of one run."""
    helpful = _per_query_ndcg(run, QRELS_HELPFUL[year])
    harmful = _per_query_ndcg(run, QRELS_HARMFUL[year])

    return {
        'helpful': helpful,
        'harmful': harmful,
        # The difference is defined over the topics of the helpful evaluation;
        # a topic without a harmful score contributes a harmful score of 0.
        'help-harm': helpful - harmful.reindex(helpful.index, fill_value=0),
    }


def _paired_by_topic(baseline_scores, run_scores):
    """Align two per-topic score series on the union of their topics (missing = 0)."""
    topics = baseline_scores.index.union(run_scores.index)
    return (
        baseline_scores.reindex(topics, fill_value=0).to_numpy(),
        run_scores.reindex(topics, fill_value=0).to_numpy(),
    )


def significance_test_for_year(year, baseline_run, run):
    """Test whether run differs significantly from baseline_run in a track year.

    For each of the measures 'helpful', 'harmful' and 'help-harm' the
    per-topic NDCG@10 scores of both runs are compared with is_significant().

    The two score vectors are paired by topic id rather than by row position,
    so the paired test stays valid when the runs list topics in a different
    order or do not cover the same topics; a topic missing from one run is
    scored 0 for that run.

    Args:
        year: Key into QRELS_HELPFUL / QRELS_HARMFUL (19, 20 or 21).
        baseline_run: Run that serves as the reference system.
        run: Run that is compared against the reference.

    Returns:
        dict mapping 'helpful', 'harmful' and 'help-harm' to the rejection
        decision reported by is_significant() for that measure.
    """
    baseline = _scores_by_measure(year, baseline_run)
    candidate = _scores_by_measure(year, run)

    decisions = {}
    for measure in ('helpful', 'harmful', 'help-harm'):
        baseline_scores, run_scores = _paired_by_topic(baseline[measure], candidate[measure])
        decisions[measure] = is_significant(baseline_scores, run_scores)[0][0]

    return decisions


In [20]:
# Relevance-feedback systems as (display label, key into RUNS[year]); each one
# is tested against the run stored under '-trec' (labelled Top1@TREC).
feedback_runs = [
    ('BM25-To-Top (rel=5)', '-bm25-move-to-top-rel5'),
    ('BM25+RM3 (rel=5)', '-rm3rel-5'),
    ('BM25+KQ-RM3 (rel=5)', '-rm3-kqrel-5'),
    ('BM25-To-Top (rel=4)', '-bm25-move-to-top-rel4'),
    ('BM25+RM3 (rel=4)', '-rm3rel-4'),
    ('BM25+KQ-RM3 (rel=4)', '-rm3-kqrel-4'),
    ('BM25-To-Top (rel=3)', '-bm25-move-to-top-rel3'),
    ('BM25+RM3 (rel=3)', '-rm3rel-3'),
    ('BM25+KQ-RM3 (rel=3)', '-rm3-kqrel-3'),
    ('BM25-To-Top (rel=2)', '-bm25-move-to-top-rel2'),
    ('BM25+RM3 (rel=2)', '-rm3rel-2'),
    ('BM25+KQ-RM3 (rel=2)', '-rm3-kqrel-2'),
    ('BM25-To-Top (rel=1)', '-bm25-move-to-top-rel1'),
    ('BM25+RM3 (rel=1)', '-rm3rel-1'),
    ('BM25+KQ-RM3 (rel=1)', '-rm3-kqrel-1'),
    ('BM25+RM3 (rel=var)', '-rm3'),
    ('BM25+KQ-RM3 (rel=var)', '-rm3-kq'),
]

for year in [19, 20, 21]:
    for run_display_name, run_name in feedback_runs:
        print('\n\nComparison {} to Top1@TREC in {}'.format(run_display_name, year))
        print(significance_test_for_year(year, RUNS[year]['-trec'], RUNS[year][run_name]))




Comparison BM25-To-Top (rel=5) to Top1@TREC in 19
{'helpful': True, 'harmful': False, 'help-harm': True}


Comparison BM25+RM3 (rel=5) to Top1@TREC in 19
{'helpful': True, 'harmful': False, 'help-harm': True}


Comparison BM25+KQ-RM3 (rel=5) to Top1@TREC in 19
{'helpful': True, 'harmful': False, 'help-harm': True}


Comparison BM25-To-Top (rel=4) to Top1@TREC in 19
{'helpful': True, 'harmful': False, 'help-harm': True}


Comparison BM25+RM3 (rel=4) to Top1@TREC in 19
{'helpful': True, 'harmful': False, 'help-harm': True}


Comparison BM25+KQ-RM3 (rel=4) to Top1@TREC in 19
{'helpful': True, 'harmful': False, 'help-harm': True}


Comparison BM25-To-Top (rel=3) to Top1@TREC in 19
{'helpful': True, 'harmful': False, 'help-harm': True}


Comparison BM25+RM3 (rel=3) to Top1@TREC in 19
{'helpful': True, 'harmful': False, 'help-harm': True}


Comparison BM25+KQ-RM3 (rel=3) to Top1@TREC in 19
{'helpful': True, 'harmful': False, 'help-harm': True}


Comparison BM25-To-Top (rel=2) to Top1@TREC 

In [22]:
# Systems without relevance feedback as (display label, key into RUNS[year]);
# each one is tested against the run stored under '-trec' (labelled Top1@TREC).
baseline_systems = [
    ('BM25', ''),
    ('MonoBERT', '-castorini-monobert-large-msmarco'),
    ('MonoT5', '-castorini-monot5-base-msmarco'),
]

for year in [19, 20, 21]:
    for run_display_name, run_name in baseline_systems:
        print('\n\nComparison {} to Top1@TREC in {}'.format(run_display_name, year))
        print(significance_test_for_year(year, RUNS[year]['-trec'], RUNS[year][run_name]))




Comparison BM25 to Top1@TREC in 19
{'helpful': False, 'harmful': True, 'help-harm': True}


Comparison MonoBERT to Top1@TREC in 19
{'helpful': False, 'harmful': True, 'help-harm': False}


Comparison MonoT5 to Top1@TREC in 19
{'helpful': True, 'harmful': True, 'help-harm': False}


Comparison BM25 to Top1@TREC in 20
{'helpful': True, 'harmful': False, 'help-harm': True}


Comparison MonoBERT to Top1@TREC in 20
{'helpful': True, 'harmful': False, 'help-harm': True}


Comparison MonoT5 to Top1@TREC in 20
{'helpful': True, 'harmful': False, 'help-harm': True}


Comparison BM25 to Top1@TREC in 21
{'helpful': True, 'harmful': True, 'help-harm': True}


Comparison MonoBERT to Top1@TREC in 21
{'helpful': True, 'harmful': True, 'help-harm': True}


Comparison MonoT5 to Top1@TREC in 21
{'helpful': True, 'harmful': True, 'help-harm': True}


In [58]:
# For every feedback setting, test the '-rm3-kq<setting>' run (passed as the
# baseline) against the '-rm3<setting>' run; the empty setting selects the
# plain '-rm3-kq' / '-rm3' runs.
for year in [19, 20, 21]:
    for r in ['rel-1', 'rel-2', 'rel-3', 'rel-4', 'rel-5', '']:
        keyquery_run = RUNS[year]['-rm3-kq' + r]
        rm3_run = RUNS[year]['-rm3' + r]

        print('\n\nComparison {} to {}'.format(r, year))
        print(significance_test_for_year(year, keyquery_run, rm3_run))




Comparison rel-1 to 19
{'helpful': False, 'harmful': False, 'help-harm': False}


Comparison rel-2 to 19
{'helpful': True, 'harmful': False, 'help-harm': True}


Comparison rel-3 to 19
{'helpful': False, 'harmful': False, 'help-harm': False}


Comparison rel-4 to 19
{'helpful': False, 'harmful': False, 'help-harm': False}


Comparison rel-5 to 19
{'helpful': True, 'harmful': False, 'help-harm': False}


Comparison  to 19
{'helpful': True, 'harmful': False, 'help-harm': True}


Comparison rel-1 to 20
{'helpful': False, 'harmful': False, 'help-harm': False}


Comparison rel-2 to 20
{'helpful': False, 'harmful': False, 'help-harm': False}


Comparison rel-3 to 20
{'helpful': False, 'harmful': False, 'help-harm': False}


Comparison rel-4 to 20
{'helpful': False, 'harmful': False, 'help-harm': False}


Comparison rel-5 to 20
{'helpful': False, 'harmful': False, 'help-harm': False}


Comparison  to 20
{'helpful': False, 'harmful': False, 'help-harm': False}


Comparison rel-1 to 21
{'help