# Evaluation of Cross Validation on all Benchmarks

### Step 1 Load Libraries

In [2]:
import sys
sys.path.append('../python/')
from cross_validation_util import cross_validation_experiment
from parametrized_bootstrapping_model import ParametrizedBootstrappingModel, ReturnAlways1Model, ReturnAlways0Model,\
    LowerBoundFixedBudgetBootstrappingModel, UpperBoundFixedBudgetBootstrappingModel, LowerBoundDeltaModel, UpperBoundDeltaModel, \
    BootstrappingInducedByCondensedLists, BootstrappingBySelectingMostLikelyDataPoint, FixedQuantileBootstrappingModel

In [5]:
# Show the first line of the results file for the
# 'bs-run-and-pool-dependent-1000-ndcg@10-ndcg@10' measure in cross-val-tmp/.
# The recorded output is a single JSON record with the keys "run", "query" and "x".
!head -1 cross-val-tmp/bs-run-and-pool-dependent-1000-ndcg@10-ndcg@10-results.jsonl

{"run":"CiirAll1","query":"251","x":[0.6501002685,0.689539422,0.5632669329,0.5158727507,0.5632669329,0.6424715091,0.6027060863,0.6577290279,0.7051232101,0.6501002685,0.5158727507,0.595077327,0.6259186338,0.5708956922,0.6259186338,0.7293048448,0.673312816,0.689539422,0.6182898744,0.6424715091,0.6501002685,0.6268877211,0.5708956922,0.595077327,0.5794935389,0.595077327,0.6656840566,0.6819106626,0.5632669329,0.6819106626,0.5794935389,0.7369336042,0.6345164804,0.6268877211,0.6501002685,0.7525173923,0.5158727507,0.6742819032,0.6027060863,0.689539422,0.5794935389,0.6182898744,0.689539422,0.6577290279,0.610661115,0.673312816,0.6182898744,0.6577290279,0.7207069982,0.7369336042,0.595077327,0.610661115,0.5632669329,0.7843277864,0.7293048448,0.7369336042,0.7051232101,0.6819106626,0.7051232101,0.7525173923,0.610661115,0.6268877211,0.6259186338,0.673312816,0.6656840566,0.6268877211,0.6268877211,0.7525173923,0.6268877211,0.610661115,0.6656840566,0.6268877211,0.6501002685,0.5158727507,0.5632669329,0.5

In [4]:
# Cross-validation run on 'trec23' for a single input measure, using one
# FixedQuantileBootstrappingModel built for that same measure.
# NOTE(review): the second constructor argument (75) is presumably the quantile;
# confirm against parametrized_bootstrapping_model.
# NOTE(review): out_dir is presumably where the results are written, and the
# meaning of clean / failsave is defined in cross_validation_util — verify there.
cross_validation_experiment(
    trec='trec23',
    input_measure=['bs-run-and-pool-dependent-1000-ndcg@10-ndcg@10'],
    models=[FixedQuantileBootstrappingModel('bs-run-and-pool-dependent-1000-ndcg@10-ndcg@10', 75)],
    out_dir='cross-val-tmp',
    clean=True,
    working_dir='../resources',
    failsave=False,
)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 210/210 [00:03<00:00, 67.43it/s]
bs-run-and-pool-dependent-1000-ndcg@10-ndcg@10 on trec23: 100%|███████████████████████████████████████████████████| 30/30 [00:04<00:00,  6.91it/s]


In [4]:
# Cross-validation run on 'trec23' where the single input measure is a tuple that
# pairs the 'bs-run-and-pool-dependent2-1000-ndcg@10-ndcg@10' measure with
# 'condensed-ndcg@10'; the only model is BootstrappingInducedByCondensedLists.
# NOTE(review): the meaning of the first constructor argument (0.99) is not visible
# in this notebook — confirm against parametrized_bootstrapping_model.
# NOTE(review): out_dir is presumably where the results are written, and the
# meaning of clean / failsave is defined in cross_validation_util — verify there.
cross_validation_experiment(
    trec='trec23',
    input_measure=[('bs-run-and-pool-dependent2-1000-ndcg@10-ndcg@10', 'condensed-ndcg@10')],
    models=[BootstrappingInducedByCondensedLists(0.99, 'bs-run-and-pool-dependent2-1000-ndcg@10-ndcg@10')],
    out_dir='cross-val-tmp',
    clean=True,
    working_dir='../resources',
    failsave=False,
)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [00:06<00:00, 49.73it/s]
('bs-run-and-pool-dependent2-1000-ndcg@10-ndcg@10', 'condensed-ndcg@10') on trec23: 100%|█████████████████████████| 30/30 [00:03<00:00,  9.21it/s]


### Step 2 Run Cross Validation

In [4]:
# ToDo: Parallelize with slurm
# Runs one cross_validation_experiment call per listed benchmark, handing it all
# four bootstrap types at once. Only 'trec13' is active; the full benchmark list
# is kept in the commented-out loop header below.
# NOTE(review): the recorded output of this cell stops at 53% of the second
# bootstrap type and ends with
# "TypeError: the JSON object must be str, bytes or bytearray, not float",
# so this run did not complete.
# NOTE(review): this call passes bootstrap_types=, whereas the Step 1 cells pass
# input_measure= / models= — confirm which keyword arguments
# cross_validation_experiment currently accepts.
#for trec in ['trec13', 'trec18', 'trec19', 'trec20', 'trec21', 'trec22', 'trec23']:
for trec in ['trec13']:
    # Echo the benchmark name so the progress bars below can be attributed to it.
    print(trec)
    # This list does not depend on trec; it is rebuilt on every iteration.
    bootstrap_types = ['bs-p-1000-ndcg@10-ndcg@10', 'bs-run-and-pool-dependent-1000-ndcg@10-ndcg@10', 'bs-pool-dependent-1000-ndcg@10-ndcg@10', 'bs-run-dependent-1000-ndcg@10-ndcg@10']
    cross_validation_experiment(
        trec=trec,
        bootstrap_types=bootstrap_types,
    )

trec13


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 877/877 [02:08<00:00,  6.83it/s]
bs-p-1000-ndcg@10-ndcg@10 on trec13: 100%|██████████████████████████████████████████████████████████████████████| 110/110 [38:48<00:00, 21.16s/it]
bs-run-and-pool-dependent-1000-ndcg@10-ndcg@10 on trec13:  53%|██████████████████████████▎                       | 58/110 [19:42<17:40, 20.39s/it]


TypeError: the JSON object must be str, bytes or bytearray, not float

In [2]:
# ToDo: Parallelize with slurm
# One cross_validation_experiment call per (benchmark, bootstrap type) pair; each
# call receives a single-element bootstrap_types list.
# NOTE(review): this call passes bootstrap_types=, whereas the Step 1 cells pass
# input_measure= / models= — confirm which keyword arguments
# cross_validation_experiment currently accepts.
for trec in ['trec13', 'trec18', 'trec19', 'trec20', 'trec21', 'trec22', 'trec23']:
    for bootstrap_type in ['bs-p-1000-ndcg@10-ndcg@10', 'bs-run-and-pool-dependent-1000-ndcg@10-ndcg@10', 'bs-pool-dependent-1000-ndcg@10-ndcg@10', 'bs-run-dependent-1000-ndcg@10-ndcg@10']:
        cross_validation_experiment(
            trec=trec,
            bootstrap_types=[bootstrap_type],
        )

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 639/639 [00:26<00:00, 24.08it/s]
bs-p-1000-ndcg@10-ndcg@10 on trec18: 100%|████████████████████████████████████████████████████████████████████████| 71/71 [11:49<00:00, 10.00s/it]
bs-run-and-pool-dependent-1000-ndcg@10-ndcg@10 on trec18: 100%|███████████████████████████████████████████████████| 71/71 [14:36<00:00, 12.34s/it]
bs-pool-dependent-1000-ndcg@10-ndcg@10 on trec18: 100%|███████████████████████████████████████████████████████████| 71/71 [08:18<00:00,  7.02s/it]
bs-run-dependent-1000-ndcg@10-ndcg@10 on trec18: 100%|████████████████████████████████████████████████████████████| 71/71 [06:55<00:00,  5.85s/it]


In [16]:
def cell(approach, corpus, field):
    """Return the LaTeX source for a single table cell.

    Placeholder implementation: ``approach``, ``corpus`` and ``field`` are
    accepted but not used, so every cell is the same string — the value
    ``.000`` followed by a math-mode superscript that wraps the dagger, ast
    and ddagger markers in phantom commands.
    """
    markers = (r'\dagger', r'\ast', r'\ddagger')
    superscript = ''.join(r'\phantom{' + marker + '}' for marker in markers)
    return '.000$^{' + superscript + '}$'

def row(approach):
    """Return the nine data cells of one table row as a LaTeX fragment.

    Cells are produced by ``cell`` for every corpus (Robust04, CW09, CW12) and,
    within each corpus, for every field (Lower, Actual, Upper). Each cell is
    prefixed with the column separator ' & ', so the result starts with it.
    """
    corpora = ('Robust04', 'CW09', 'CW12')
    fields = ('Lower', 'Actual', 'Upper')
    return ''.join(
        ' & ' + cell(approach, corpus, field)
        for corpus in corpora
        for field in fields
    )

def table():
    """Build the complete LaTeX source of the per-topic RMSE table.

    The result is a ``table*`` environment with a caption, a label, a
    ten-column ``tabular`` header for Robust04 / CW09 / CW12, three rows for
    the residual and condensed-list approaches, a rule, three rows for the
    bootstrapping approaches, and the closing commands. Row contents come
    from ``row``. The returned string ends with a newline.
    """
    line_end = r'\\'

    def labelled_rows(*entries):
        # One output line per (label, approach) pair: label, a space, the cells, then the row terminator.
        return [label + ' ' + row(approach) + line_end for label, approach in entries]

    header = [
        r'\begin{table*}[t]',
        r'\caption{TBD. We report statistical significance according to students t-test with Bonferroni correction at p=0.05 to Min-Residuals ($\dagger$), Condensed Lists ($\ast$), and Max-Residuals ($\ddagger$).}',
        r'\label{table-per-topic-rmse-effectiveness}',
        # The trailing space is part of the emitted text.
        r'\renewcommand{\tabcolsep}{3.8pt} ',
        r'\centering',
        r'\small',
        '',
        r'\begin{tabular}{@{}l@{\hspace{2em}}c@{\hspace{.5em}}c@{\hspace{.5em}}c@{\hspace{2em}}c@{\hspace{.5em}}c@{\hspace{.5em}}c@{\hspace{2em}}c@{\hspace{.5em}}c@{\hspace{.5em}}c@{}}',
        r'\toprule',
        r'& \multicolumn{3}{c}{RMSE on Robust04} & \multicolumn{3}{c}{RMSE on CW09} & \multicolumn{3}{c}{RMSE on CW12} ' + line_end,
        r'\cmidrule(r{1em}){2-4} \cmidrule(r{1em}){5-7} \cmidrule{8-10}',
        '',
        r' & Lower                & Actual         & Upper   & Lower                & Actual         & Upper & Lower                & Actual         & Upper             ' + line_end,
        r'\midrule',
    ]
    baseline_rows = labelled_rows(
        ('Min Res.', 'MinResiduals'),
        ('Cond. Lists', 'Condensed'),
        ('Max Res.', 'MaxResiduals'),
    )
    divider = ['', r'\midrule', '']
    bootstrap_rows = labelled_rows(
        ('BS (R)', 'PBS-R-ML'),
        ('BS (P)', 'PBS-P-ML'),
        ('BS (R+P)', 'PBS-RP-ML'),
    )
    # The trailing space after \end{tabular} and the final empty entry (which
    # yields the terminating newline) are both part of the emitted text.
    footer = ['', r'\bottomrule', r'\end{tabular} ', r'\end{table*}', '']
    return '\n'.join(header + baseline_rows + divider + bootstrap_rows + footer)

# Print the generated LaTeX source of the table.
print(table())

\begin{table*}[t]
\caption{TBD. We report statistical significance according to students t-test with Bonferroni correction at p=0.05 to Min-Residuals ($\dagger$), Condensed Lists ($\ast$), and Max-Residuals ($\ddagger$).}
\label{table-per-topic-rmse-effectiveness}
\renewcommand{\tabcolsep}{3.8pt} 
\centering
\small

\begin{tabular}{@{}l@{\hspace{2em}}c@{\hspace{.5em}}c@{\hspace{.5em}}c@{\hspace{2em}}c@{\hspace{.5em}}c@{\hspace{.5em}}c@{\hspace{2em}}c@{\hspace{.5em}}c@{\hspace{.5em}}c@{}}
\toprule
& \multicolumn{3}{c}{RMSE on Robust04} & \multicolumn{3}{c}{RMSE on CW09} & \multicolumn{3}{c}{RMSE on CW12} \\
\cmidrule(r{1em}){2-4} \cmidrule(r{1em}){5-7} \cmidrule{8-10}

 & Lower                & Actual         & Upper   & Lower                & Actual         & Upper & Lower                & Actual         & Upper             \\
\midrule
Min Res.  & .000$^{\phantom{\dagger}\phantom{\ast}\phantom{\ddagger}}$ & .000$^{\phantom{\dagger}\phantom{\ast}\phantom{\ddagger}}$ & .000$^{\phantom{\d