# Create table `table-reconstruction-effectiveness`

### Import utilities and load datasets

In [2]:
%%time
import sys
sys.path.append('../python/')
from tqdm import tqdm
from glob import glob
import pandas as pd
from parametrized_bootstrapping_model import ParametrizedBootstrappingModel, ReturnAlways1Model, ReturnAlways0Model
from result_analysis_utils import load_ground_truth_data, load_evaluations, run_cross_validation, load_cross_validation_results, load_raw_evaluations
SEARCH_SPACE= [0, 1, 2] + list(range(5,96, 5)) + [98, 99, 100]
from io import StringIO
from trectools import TrecQrel
from sklearn.metrics import mean_squared_error
import numpy as np
from reconstruction_evaluation import ReconstructionEvaluation, DataConstruction
import json

# Loading is slow (see the timing output of this cell), so it is skipped when a
# previous execution already left `df` and `unique_queries` in the kernel.
# NOTE(review): this guard is kernel-state dependent; on a fresh kernel the
# full load always runs.
if 'df' not in locals() or 'unique_queries' not in locals():
    # Evaluations of the submitted system runs.
    eval_predictions = glob('../resources/eval/trec-system-runs/trec13/*.jsonl')

    # Cross-validation results of the four bootstrapping variants, in the same
    # order as before. Each file is closed as soon as the loader is exhausted.
    # NOTE(review): assumes the buffers returned with return_buffers=True do not
    # read lazily from the file handle after list() has consumed the loader -- confirm.
    for cv_model in ['bs-p', 'bs-pool-dependent', 'bs-run-and-pool-dependent', 'bs-run-dependent']:
        with open(f'cross-validation-results/{cv_model}-1000-ndcg@10-ndcg@10-results.jsonl') as cv_results_file:
            eval_predictions += list(load_cross_validation_results(cv_results_file, depth=10, return_buffers=True))

    df = load_evaluations(tqdm(eval_predictions))

    # Topic ids (as strings) that occur in the Robust04 qrels.
    unique_queries = set(TrecQrel('../resources/unprocessed/topics-and-qrels/qrels.robust04.txt').qrels_data['query'].astype(str).unique())

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 3488/3488 [02:01<00:00, 28.81it/s]


CPU times: user 2min 59s, sys: 21.2 s, total: 3min 21s
Wall time: 6min 40s


In [3]:
def report_for_row(df_row, measure, depth):
    """Flatten one evaluation row into a list of per-topic records.

    Each looked-up cell of ``df_row`` is decoded with ``json.loads`` and is then
    used as a mapping from topic to value.

    Args:
        df_row: Mapping-like row. ``df_row['run']`` is the path/name of the run;
            all other values are looked up with ``(group, column)`` tuple keys.
        measure: Name of the effectiveness measure, e.g. ``'ndcg'``.
        depth: Evaluation depth used in the column names, e.g. ``10``.

    Returns:
        A list with one dict per topic of the ground-truth column. Every dict
        has the keys ``'run'`` and ``'topic'`` plus one key per reported
        measure for which a value exists for that topic.

    Raises:
        ValueError: If a required column is missing or cannot be decoded. The
            offending ``(group, column)`` key is the exception argument and the
            original error is chained as ``__cause__``.
    """
    # Reduce the run path to its bare name: drop directories, 'input.' and '.gz'.
    tmp = {'run': df_row['run'].split('/')[-1].replace('input.', '').replace('.gz', '')}
    incomplete = f'depth-{depth}-incomplete'

    # (display name, (column group, column name)) pairs to extract from the row.
    measures = [
        ('unjudged', (incomplete, f'unjudged@{depth}')),
        # NOTE(review): the ground-truth column is hard-coded to 'ndcg' and does
        # not follow `measure`; kept as is -- confirm whether this is intended.
        (f'ground-truth-{measure}@{depth}', (f'depth-{depth}-complete', f'ndcg@{depth}')),
        (f'min-residual-{measure}@{depth}', (incomplete, f'residual-{measure}@{depth}-min')),
        (f'condensed-{measure}@{depth}', (incomplete, f'condensed-{measure}@{depth}')),
        (f'max-residual-{measure}@{depth}', (incomplete, f'residual-{measure}@{depth}-max')),
        ('always-1', (incomplete, 'always-1')),
        ('always-0', (incomplete, 'always-0')),
    ]

    for k, v in [('PBS', f'bs-p-1000-{measure}@{depth}-{measure}@{depth}'), ('PBS-P', f'bs-pool-dependent-1000-{measure}@{depth}-{measure}@{depth}'), ('PBS-RP', f'bs-run-and-pool-dependent-1000-{measure}@{depth}-{measure}@{depth}'), ('PBS-R', f'bs-run-dependent-1000-{measure}@{depth}-{measure}@{depth}')]:
        for m in ['[0.6,1]', '', '[0.8,1]', '[1,2]', '[1,3]', '[0.1,5]', '[0.1,10]', '[0.1,100]', '[0,1]']:
            measures.append((f'{k}-RMSE{m}-{measure}@{depth}', (incomplete, f'pbs-rmse{m}-{v}')))

    for display_name, m in measures:
        try:
            tmp[display_name] = json.loads(df_row[m])
        except Exception as exc:
            # Report which column failed, but keep the root cause attached and
            # let KeyboardInterrupt/SystemExit propagate untouched.
            raise ValueError(m) from exc

    per_topic_values = {k: v for k, v in tmp.items() if k != 'run'}

    ret = []
    # The ground-truth column defines which topics are reported.
    for topic in tmp[f'ground-truth-{measure}@{depth}']:
        entry = {'run': tmp['run'], 'topic': topic}
        # Measures without a value for this topic are left out of the record.
        entry.update({k: v[topic] for k, v in per_topic_values.items() if topic in v})
        ret.append(entry)

    return ret

def create_aggregated_df(measure, depth, loc):
    """Build the per-topic frame for the run at position ``loc`` of the global ``df``.

    The records come from ``report_for_row``; the resulting frame is ordered by
    the ground-truth score (best topic first) and gets a fresh 0..n-1 index.
    """
    records = list(map(dict, report_for_row(df.iloc[loc], measure, depth)))
    by_ground_truth = pd.DataFrame(records).sort_values(
        f'ground-truth-{measure}@{depth}', ascending=False
    )
    # Discard the pre-sort index instead of materialising it as a column.
    return by_ground_truth.reset_index(drop=True)


def data_for_reconstruction_experiments():
    """Collect, per topic, the ground-truth and estimated ndcg@10 scores of the runs.

    Iterates over the first 110 row positions of the global ``df``. A run is
    skipped when its aggregated frame cannot be built, or when fewer than 50
    topics remain after keeping only rows with unjudged documents and dropping
    rows with missing values.

    Returns:
        dict mapping topic -> list of records. Each record holds ``'topic'``,
        ``'system'``, ``'ground_truth'`` and one entry per reported estimate.
    """
    # (record key, column of the aggregated frame) pairs; identical for every run.
    score_columns = [
        ('Condensed', 'condensed-ndcg@10'),
        ('Min-Residual', 'min-residual-ndcg@10'),
        ('Max-Residual', 'max-residual-ndcg@10'),
        ('Always 1', 'always-1'),
        ('Always 0', 'always-0'),
    ]
    score_columns.extend(
        (f'PBS-{p}RMSE{i}', f'PBS-{p}RMSE{i}-ndcg@10')
        for i in ['[0.6,1]', '', '[0.8,1]', '[1,2]', '[1,3]']
        for p in ['', 'P-', 'RP-', 'R-']
    )

    records_by_topic = {}
    for run_position in tqdm(range(110)):
        try:
            run_frame = create_aggregated_df('ndcg', 10, run_position)
        except Exception:
            # Best effort: rows for which the frame cannot be built are skipped.
            continue

        # Only topics with at least one unjudged document and complete scores.
        run_frame = run_frame[run_frame['unjudged'] > 0].dropna()
        if len(run_frame) < 50:
            continue

        for _, row in run_frame.iterrows():
            record = {
                'topic': row['topic'],
                'system': row['run'],
                'ground_truth': row['ground-truth-ndcg@10'],
            }
            for record_key, column in score_columns:
                record[record_key] = row[column]

            records_by_topic.setdefault(row['topic'], []).append(record)

    return records_by_topic

# Topic -> list of per-system score records; input of the reconstruction evaluation.
d = data_for_reconstruction_experiments()

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 110/110 [00:01<00:00, 105.57it/s]


In [9]:
# Each approach is described by three score columns of the records in `d`.
# NOTE(review): the arguments look like (lower bound, point estimate, upper bound) --
# confirm against DataConstruction.
reconstruction_approaches = {
    'Residuals': DataConstruction('Min-Residual', 'Condensed', 'Max-Residual'),
    'PBS-RP': DataConstruction('PBS-RP-RMSE[0.8,1]', 'PBS-RP-RMSE', 'PBS-RP-RMSE[1,3]'),
    'PBS-R': DataConstruction('PBS-R-RMSE[0.8,1]', 'PBS-R-RMSE', 'PBS-R-RMSE[1,3]'),
    'PBS-P': DataConstruction('PBS-P-RMSE[0.8,1]', 'PBS-P-RMSE', 'PBS-P-RMSE[1,3]'),
}

reconstruction_eval = ReconstructionEvaluation()

# One row per (approach, topic) with topic-level precision and recall.
df_reconstruction = pd.DataFrame([
    {
        'approach': approach_name,
        'topic': topic,
        'precision': reconstruction_eval.precision(topic_data),
        'recall': reconstruction_eval.recall(topic_data),
    }
    for approach_name, approach in reconstruction_approaches.items()
    for topic, topic_data in approach.construct_data_for_reconstruction_evaluation(d).items()
])

# Harmonic mean of precision and recall; defined as 0 when both are 0.
df_reconstruction['f1'] = df_reconstruction.apply(
    lambda row: (
        2 * (row['precision'] * row['recall']) / (row['precision'] + row['recall'])
        if (row['precision'] + row['recall']) != 0
        else 0
    ),
    axis=1,
)

# Average the topic-level scores per approach.
df_reconstruction = (
    df_reconstruction[['approach', 'precision', 'recall', 'f1']]
    .groupby('approach')
    .mean()
    .reset_index()
)
df_reconstruction

Unnamed: 0,approach,precision,recall,f1
0,PBS-P,0.936807,0.828982,0.869048
1,PBS-R,0.952959,0.835189,0.878572
2,PBS-RP,0.951172,0.836975,0.879491
3,Residuals,0.935703,0.415869,0.540361


In [14]:
def line(df, name):
    """Render the score cells of one table row for the approach ``name``.

    Args:
        df: Frame with the columns ``approach``, ``precision``, ``recall`` and
            ``f1``; it must contain exactly one row for ``name``.
        name: Value of the ``approach`` column to render.

    Returns:
        The LaTeX cells ``precision & recall & f1`` with exactly three decimals
        each, followed by six ``---`` placeholder cells.

    Raises:
        AssertionError: If ``df`` does not hold exactly one row for ``name``.
    """
    matches = df[df['approach'] == name]
    assert len(matches) == 1, f'expected exactly one row for approach {name!r}, found {len(matches)}'
    scores = matches.iloc[0].to_dict()

    # Fixed-point formatting keeps trailing zeros (0.830 rather than 0.83), so
    # every cell of the table has the same number of decimals.
    cells = ' & '.join(f'{scores[key]:.3f}' for key in ('precision', 'recall', 'f1'))

    return f'{cells} &  --- & --- & --- & --- & --- & ---'

def produce_table(df):
    """Return the LaTeX source of the table `table-reconstruction-effectiveness`.

    The rows for 'Residuals', 'PBS-R', 'PBS-P' and 'PBS-RP' are filled from
    ``df`` via ``line``; all remaining cells are ``---`` placeholders.
    """
    table_lines = [
        '\\begin{table*}[t]',
        '\\caption{Reconstruction effectiveness: Precision (how many of the system-pairs that I tell apart are correct?), Recall (how many of the apart system pairs do I find?), and F1 as the harmonic mean of precision and recall. All of this on the Topic Level. {\\color{red} ToDo: Look why precision of residuals is not 1, are these only the special cases that we discussed earlier?}}',
        '\\label{table-reconstruction-effectiveness}',
        '\\renewcommand{\\tabcolsep}{3.8pt} ',
        '\\centering',
        '\\small',
        '',
        '\\begin{tabular}{@{}l@{\\hspace{2em}}c@{\\hspace{.5em}}c@{\\hspace{.5em}}c@{\\hspace{2em}}c@{\\hspace{.5em}}c@{\\hspace{.5em}}c@{\\hspace{2em}}c@{\\hspace{.5em}}c@{\\hspace{.5em}}c@{}}',
        '\\toprule',
        '& \\multicolumn{3}{c}{Reconstr. on Robust04} & \\multicolumn{3}{c}{Reconstr. on CW09} & \\multicolumn{3}{c}{Reconstr. on CW12} \\\\',
        '\\cmidrule(r{1em}){2-4} \\cmidrule(r{1em}){5-7} \\cmidrule{8-10}',
        '',
        ' & Precision                & Recall         & F1   & Precision                & Recall         & F1 & Precision                & Recall         & F1             \\\\',
        '\\midrule',
        # Baselines.
        'Residuals & ' + line(df, 'Residuals') + '\\\\',
        'Min Res. +-x\\% & --- & --- & --- &  --- & --- & --- & --- & --- & ---\\\\',
        'Cond. Lists +-x\\% & --- & --- & --- & --- & --- & --- & --- & --- & ---\\\\',
        '',
        '\\midrule',
        # Bootstrapping variants.
        'BS (R) &  ' + line(df, 'PBS-R') + '\\\\',
        'BS (P) &  ' + line(df, 'PBS-P') + '\\\\',
        'BS (R+P) &  ' + line(df, 'PBS-RP') + '\\\\',
        '',
        '\\bottomrule',
        '\\end{tabular} ',
        '\\end{table*} ',
        # The empty last element yields the terminating newline.
        '',
    ]
    return '\n'.join(table_lines)

# Emit the LaTeX source of the table as plain text.
print(produce_table(df_reconstruction))

\begin{table*}[t]
\caption{Reconstruction effectiveness: Precision (how many of the system-pairs that I tell apart are correct?), Recall (how many of the apart system pairs do I find?), and F1 as the harmonic mean of precision and recall. All of this on the Topic Level. {\color{red} ToDo: Look why precision of residuals is not 1, are these only the special cases that we discussed earlier?}}
\label{table-reconstruction-effectiveness}
\renewcommand{\tabcolsep}{3.8pt} 
\centering
\small

\begin{tabular}{@{}l@{\hspace{2em}}c@{\hspace{.5em}}c@{\hspace{.5em}}c@{\hspace{2em}}c@{\hspace{.5em}}c@{\hspace{.5em}}c@{\hspace{2em}}c@{\hspace{.5em}}c@{\hspace{.5em}}c@{}}
\toprule
& \multicolumn{3}{c}{Reconstr. on Robust04} & \multicolumn{3}{c}{Reconstr. on CW09} & \multicolumn{3}{c}{Reconstr. on CW12} \\
\cmidrule(r{1em}){2-4} \cmidrule(r{1em}){5-7} \cmidrule{8-10}

 & Precision                & Recall         & F1   & Precision                & Recall         & F1 & Precision                & Recall