# Create table `table-reconstruction-effectiveness`

### Import utilities and load datasets

In [2]:
%%time
import sys
sys.path.append('../python/')
from tqdm import tqdm
from glob import glob
import pandas as pd
from parametrized_bootstrapping_model import ParametrizedBootstrappingModel, ReturnAlways1Model, ReturnAlways0Model
from result_analysis_utils import load_ground_truth_data, load_evaluations, run_cross_validation, load_cross_validation_results, load_raw_evaluations
SEARCH_SPACE= [0, 1, 2] + list(range(5,96, 5)) + [98, 99, 100]
from io import StringIO
from trectools import TrecQrel
from sklearn.metrics import mean_squared_error
import numpy as np
from reconstruction_evaluation import ReconstructionEvaluation, DataConstruction
import json


CPU times: user 34 µs, sys: 0 ns, total: 34 µs
Wall time: 37.4 µs


### Utility Methods

In [3]:
def load_df(trec):
    """Load every evaluation input of one TREC edition into a single frame.

    Collects the per-run evaluation files below
    ``../resources/eval/trec-system-runs/<trec>/`` and appends the
    cross-validation results of the four bootstrapping variants stored below
    ``cross-validation-results/<trec>/``.

    Args:
        trec: Directory name of the TREC edition, e.g. ``'trec18'``.

    Returns:
        The result of ``load_evaluations`` over all collected inputs (the
        callers in this notebook index it like a pandas DataFrame).
    """
    eval_predictions = glob(f'../resources/eval/trec-system-runs/{trec}/*.jsonl')

    # Order matters only for the row order of the resulting frame; it is kept
    # identical to the original hand-written sequence.
    for variant in ('bs-p', 'bs-pool-dependent', 'bs-run-and-pool-dependent', 'bs-run-dependent'):
        results_path = f'cross-validation-results/{trec}/{variant}-1000-ndcg@10-ndcg@10-results.jsonl'
        # Materialise the results while the file is still open, then let the
        # context manager close the handle instead of leaking it.
        with open(results_path) as results_file:
            eval_predictions += list(load_cross_validation_results(results_file, depth=10, return_buffers=True))

    return load_evaluations(tqdm(eval_predictions))

def report_for_row(df_row, measure, depth):
    """Flatten the evaluation row of one run into one record per topic.

    Args:
        df_row: Mapping-like row. ``df_row['run']`` is the path/name of the run
            file; every other accessed entry is keyed by a
            ``(depth-level, column-name)`` tuple and holds a JSON string that
            decodes to a ``{topic: value}`` mapping.
        measure: Name of the effectiveness measure, e.g. ``'ndcg'``.
        depth: Evaluation depth, e.g. ``10``.

    Returns:
        A list with one dict per topic of the ground-truth column. Each dict
        has the keys ``'run'`` and ``'topic'`` plus one key per reported
        measure for which the topic has a value.

    Raises:
        ValueError: If a required column is missing from ``df_row`` or its
            content can not be decoded. The original error is attached as
            ``__cause__``.
    """
    incomplete = f'depth-{depth}-incomplete'
    suffix = f'1000-{measure}@{depth}-{measure}@{depth}'
    pbs_variants = [
        ('PBS', f'bs-p-{suffix}'),
        ('PBS-P', f'bs-pool-dependent-{suffix}'),
        ('PBS-RP', f'bs-run-and-pool-dependent-{suffix}'),
        ('PBS-R', f'bs-run-dependent-{suffix}'),
    ]
    bounds = ['-upper-bound-0.01', '-upper-bound-0.05', '-lower-bound-0.01', '-lower-bound-0.05']

    tmp = {'run': df_row['run'].split('/')[-1].replace('input.', '').replace('.gz', '')}

    # (display name, column key) pairs; the display names become the keys of
    # the returned records.
    # NOTE(review): the ground-truth column name is hard-coded to "ndcg"
    # independent of `measure` -- confirm that this is intended.
    measures = [
        ('unjudged', (incomplete, f'unjudged@{depth}')),
        (f'ground-truth-{measure}@{depth}', (f'depth-{depth}-complete', f'ndcg@{depth}')),
        (f'min-residual-{measure}@{depth}', (incomplete, f'residual-{measure}@{depth}-min')),
        (f'condensed-{measure}@{depth}', (incomplete, f'condensed-{measure}@{depth}')),
        (f'max-residual-{measure}@{depth}', (incomplete, f'residual-{measure}@{depth}-max')),
        ('always-1', (incomplete, 'always-1')),
        ('always-0', (incomplete, 'always-0')),
    ]

    # Point estimates of the four bootstrapping variants.
    measures += [
        (f'{k}-RMSE-{measure}@{depth}', (incomplete, f'pbs-rmse-{v}'))
        for k, v in pbs_variants
    ]

    # Upper/lower bounds; their column names repeat the part name twice.
    for k, v in pbs_variants:
        for m in bounds:
            part_name = f'pbs{m}-{v}'
            measures.append((f'{k}-RMSE{m}-{measure}@{depth}', (incomplete, f'{part_name}-{part_name}')))

    for display_name, m in measures:
        try:
            tmp[display_name] = json.loads(df_row[m])
        except Exception as e:
            # `Exception` instead of a bare except: KeyboardInterrupt and
            # SystemExit must not be converted into a ValueError.
            raise ValueError(f'Can not handle "{m}". Got {df_row.keys()}') from e

    ret = []

    # The ground-truth column defines which topics are reported.
    for topic in tmp[f'ground-truth-{measure}@{depth}']:
        entry = {'run': tmp['run'], 'topic': topic}
        for k, v in tmp.items():
            if k == 'run':
                continue

            # Measures without a value for this topic are left out.
            if topic in v:
                entry[k] = v[topic]
        ret.append(entry)

    return ret

def create_aggregated_df(df, measure, depth, loc):
    """Build the per-topic frame of the run stored at position ``loc`` of ``df``.

    The rows come from ``report_for_row`` and are ordered by descending
    ground-truth score; the index is renumbered from 0 afterwards.
    """
    records = report_for_row(df.iloc[loc], measure, depth)
    frame = pd.DataFrame([dict(record) for record in records])
    ranked = frame.sort_values(f'ground-truth-{measure}@{depth}', ascending=False)

    # drop=True discards the old index instead of keeping it as a column.
    return ranked.reset_index(drop=True)


def data_for_reconstruction_experiments(df, trec):
    """Group the per-topic scores of all usable runs by topic.

    A run is used when its aggregated frame can be built and, after keeping
    only topics with unjudged documents and dropping rows with missing values,
    enough topics remain.

    Returns:
        A dict mapping each topic to a list of records, one per run, with the
        keys ``'topic'``, ``'system'``, ``'ground_truth'`` and one key per
        reported measure.
    """
    # for robust04 we want 50 topics in the comparison and for the web tracks
    # 10 each (to get to the same number of overall topics)
    min_topics = 50 if trec == 'trec13' else 10

    # (name in the output record, column in the aggregated frame)
    reported = [
        ('Condensed', 'condensed-ndcg@10'),
        ('Min-Residual', 'min-residual-ndcg@10'),
        ('Max-Residual', 'max-residual-ndcg@10'),
        ('Always 1', 'always-1'),
        ('Always 0', 'always-0'),
    ]
    for bound in ['', '-upper-bound-0.01', '-upper-bound-0.05', '-lower-bound-0.01', '-lower-bound-0.05']:
        for prefix in ['', 'P-', 'RP-', 'R-']:
            reported.append((f'PBS-{prefix}RMSE{bound}', f'PBS-{prefix}RMSE{bound}-ndcg@10'))

    per_topic = {}

    # NOTE(review): positions are counted over the *unique* run names but used
    # as row positions -- this presumably assumes one row per run; confirm.
    for position in tqdm(range(len(df['run'].unique()))):
        try:
            run_scores = create_aggregated_df(df, 'ndcg', 10, position)
        except Exception:
            # Best effort: runs whose row can not be processed are skipped.
            continue

        run_scores = run_scores[run_scores['unjudged'] > 0].dropna()
        if len(run_scores) < min_topics:
            continue

        for _, row in run_scores.iterrows():
            record = {
                'topic': row['topic'],
                'system': row['run'],
                'ground_truth': row['ground-truth-ndcg@10'],
            }
            for label, column in reported:
                record[label] = row[column]

            per_topic.setdefault(row['topic'], []).append(record)

    return per_topic

def load_df_reconstruction(trec):
    """Compute precision, recall and F1 per approach and topic for one TREC edition.

    Returns:
        A DataFrame with the columns ``approach``, ``topic``, ``precision``,
        ``recall`` and ``f1``; one row per approach and topic.
    """
    per_topic = data_for_reconstruction_experiments(load_df(trec), trec)

    # Each approach is described by its (lower bound, estimate, upper bound) columns.
    approaches = {'Residuals': DataConstruction('Min-Residual', 'Condensed', 'Max-Residual')}
    for variant in ('RP', 'R', 'P'):
        estimate = f'PBS-{variant}-RMSE'
        for level in ('0.01', '0.05'):
            approaches[f'PBS-{variant}-{level}'] = DataConstruction(
                f'{estimate}-lower-bound-{level}', estimate, f'{estimate}-upper-bound-{level}'
            )

    evaluator = ReconstructionEvaluation()

    rows = [
        {
            'approach': approach_name,
            'topic': topic,
            'precision': evaluator.precision(topic_data),
            'recall': evaluator.recall(topic_data),
        }
        for approach_name, approach in approaches.items()
        for topic, topic_data in approach.construct_data_for_reconstruction_evaluation(per_topic).items()
    ]

    def f1_of(row):
        # Harmonic mean of precision and recall; defined as 0 when both are 0.
        total = row['precision'] + row['recall']
        if total == 0:
            return 0
        return 2*(row['precision']*row['recall'])/total

    result = pd.DataFrame(rows)
    result['f1'] = result.apply(f1_of, axis=1)

    return result

In [4]:
# Evaluate the reconstruction for TREC 18-21 and stack the per-edition frames.
df_reconstruction = pd.concat(
    [load_df_reconstruction(trec) for trec in ['trec18', 'trec19', 'trec20', 'trec21']]
)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 2769/2769 [00:11<00:00, 248.20it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 71/71 [00:00<00:00, 159.17it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 2184/2184 [00:09<00:00, 239.08it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 56/56 [00:00<00:00, 155.00it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 1443/1443 [00:06<00:00, 229.48it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 37/37 [00:00<00:00, 152.95it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 1248/12

In [6]:
# Average precision, recall and F1 over all rows of each approach.
(
    df_reconstruction
    .groupby('approach')[['precision', 'recall', 'f1']]
    .mean()
    .reset_index()
)

Unnamed: 0,approach,precision,recall,f1
0,PBS-P-0.01,0.89835,0.588781,0.690694
1,PBS-P-0.05,0.879752,0.692954,0.758657
2,PBS-R-0.01,0.917231,0.4462,0.565987
3,PBS-R-0.05,0.84947,0.644186,0.713381
4,PBS-RP-0.01,0.909014,0.735875,0.79694
5,PBS-RP-0.05,0.896813,0.764771,0.810585
6,Residuals,0.876198,0.096973,0.139747


In [7]:
# Evaluate the reconstruction for TREC 22-23 and stack the per-edition frames.
df_reconstruction = pd.concat(
    [load_df_reconstruction(trec) for trec in ['trec22', 'trec23']]
)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 1326/1326 [00:13<00:00, 100.39it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 34/34 [00:00<00:00, 153.70it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 1170/1170 [00:11<00:00, 101.24it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 152.37it/s]


In [8]:
# Average precision, recall and F1 over all rows of each approach.
(
    df_reconstruction
    .groupby('approach')[['precision', 'recall', 'f1']]
    .mean()
    .reset_index()
)

Unnamed: 0,approach,precision,recall,f1
0,PBS-P-0.01,0.884676,0.500202,0.624982
1,PBS-P-0.05,0.856901,0.633312,0.718714
2,PBS-R-0.01,0.905337,0.321554,0.451078
3,PBS-R-0.05,0.853869,0.548333,0.652172
4,PBS-RP-0.01,0.871468,0.672973,0.748974
5,PBS-RP-0.05,0.848363,0.740749,0.782476
6,Residuals,0.947782,0.104383,0.161988


In [14]:
def line(df, name):
    """Format the LaTeX cells of one table row for the approach ``name``.

    ``df`` must contain exactly one row whose ``approach`` equals ``name``.
    Its precision, recall and F1 are rounded to three decimals and followed by
    six ``---`` placeholder cells.
    """
    matching = df[df['approach'] == name]
    assert len(matching) == 1
    scores = matching.iloc[0].to_dict()

    precision, recall, f1 = (round(scores[key], 3) for key in ('precision', 'recall', 'f1'))

    return f'{precision} & {recall} & {f1} &  --- & --- & --- & --- & --- & ---'

def produce_table(df):
    """Return the LaTeX source of the reconstruction-effectiveness table.

    Only the first three numeric columns (the group under the Robust04
    header) are filled from ``df`` via ``line``; all remaining cells are
    ``---`` placeholders.

    Args:
        df: Frame with the columns ``approach``, ``precision``, ``recall``
            and ``f1``. ``line`` asserts that it holds exactly one row for
            each of ``'Residuals'``, ``'PBS-R'``, ``'PBS-P'`` and ``'PBS-RP'``.

    NOTE(review): ``load_df_reconstruction`` emits one row per topic and
    approach names such as ``'PBS-R-0.05'``, so its output presumably has to
    be aggregated and renamed before it is passed in here -- confirm.
    """
    return '''\\begin{table*}[t]
\\caption{Reconstruction effectiveness: Precision (how many of the system-pairs that I tell apart are correct?), Recall (how many of the apart system pairs do I find?), and F1 as the harmonic mean of precision and recall. All of this on the Topic Level. {\\color{red} ToDo: Look why precision of residuals is not 1, are these only the special cases that we discussed earlier?}}
\\label{table-reconstruction-effectiveness}
\\renewcommand{\\tabcolsep}{3.8pt} 
\\centering
\\small

\\begin{tabular}{@{}l@{\\hspace{2em}}c@{\\hspace{.5em}}c@{\\hspace{.5em}}c@{\\hspace{2em}}c@{\\hspace{.5em}}c@{\\hspace{.5em}}c@{\\hspace{2em}}c@{\\hspace{.5em}}c@{\\hspace{.5em}}c@{}}
\\toprule
& \\multicolumn{3}{c}{Reconstr. on Robust04} & \\multicolumn{3}{c}{Reconstr. on CW09} & \\multicolumn{3}{c}{Reconstr. on CW12} \\\\
\\cmidrule(r{1em}){2-4} \\cmidrule(r{1em}){5-7} \\cmidrule{8-10}

 & Precision                & Recall         & F1   & Precision                & Recall         & F1 & Precision                & Recall         & F1             \\\\
\\midrule
Residuals & ''' + line(df, 'Residuals') + '''\\\\
Min Res. +-x\\% & --- & --- & --- &  --- & --- & --- & --- & --- & ---\\\\
Cond. Lists +-x\\% & --- & --- & --- & --- & --- & --- & --- & --- & ---\\\\

\\midrule
BS (R) &  ''' + line(df, 'PBS-R') + '''\\\\
BS (P) &  ''' + line(df, 'PBS-P') + '''\\\\
BS (R+P) &  ''' + line(df, 'PBS-RP') + '''\\\\

\\bottomrule
\\end{tabular} 
\\end{table*} 
'''

# Print the generated LaTeX table source for the current `df_reconstruction`.
print(produce_table(df_reconstruction))

\begin{table*}[t]
\caption{Reconstruction effectiveness: Precision (how many of the system-pairs that I tell apart are correct?), Recall (how many of the apart system pairs do I find?), and F1 as the harmonic mean of precision and recall. All of this on the Topic Level. {\color{red} ToDo: Look why precision of residuals is not 1, are these only the special cases that we discussed earlier?}}
\label{table-reconstruction-effectiveness}
\renewcommand{\tabcolsep}{3.8pt} 
\centering
\small

\begin{tabular}{@{}l@{\hspace{2em}}c@{\hspace{.5em}}c@{\hspace{.5em}}c@{\hspace{2em}}c@{\hspace{.5em}}c@{\hspace{.5em}}c@{\hspace{2em}}c@{\hspace{.5em}}c@{\hspace{.5em}}c@{}}
\toprule
& \multicolumn{3}{c}{Reconstr. on Robust04} & \multicolumn{3}{c}{Reconstr. on CW09} & \multicolumn{3}{c}{Reconstr. on CW12} \\
\cmidrule(r{1em}){2-4} \cmidrule(r{1em}){5-7} \cmidrule{8-10}

 & Precision                & Recall         & F1   & Precision                & Recall         & F1 & Precision                & Recall