# Create table `table-reconstruction-effectiveness`

### Import utility and load datasets

In [2]:
%%time
import sys
sys.path.append('../python/')
from tqdm import tqdm
from glob import glob
import pandas as pd
from parametrized_bootstrapping_model import ParametrizedBootstrappingModel, ReturnAlways1Model, ReturnAlways0Model
from result_analysis_utils import load_ground_truth_data, load_evaluations, run_cross_validation, load_cross_validation_results, load_raw_evaluations
SEARCH_SPACE= [0, 1, 2] + list(range(5,96, 5)) + [98, 99, 100]
from io import StringIO
from trectools import TrecQrel
from sklearn.metrics import mean_squared_error
import numpy as np
from reconstruction_evaluation import ReconstructionEvaluation, DataConstruction
import json
from statistics import mean

CPU times: user 0 ns, sys: 30 µs, total: 30 µs
Wall time: 33.4 µs


In [11]:
!ls ../resources/processed/cross-validation-results/trec23|grep condensed

bs-p-1000-ndcg@10-ndcg@10-condensed-ndcg@10-results.jsonl
bs-pool-dependent-1000-ndcg@10-ndcg@10-condensed-ndcg@10-results.jsonl
bs-run-and-pool-dependent-1000-ndcg@10-ndcg@10-condensed-ndcg@10-results.jsonl
bs-run-and-pool-dependent2-1000-ndcg@10-ndcg@10-condensed-ndcg@10-results.jsonl
bs-run-dependent-1000-ndcg@10-ndcg@10-condensed-ndcg@10-results.jsonl
condensed-ndcg@10-results.jsonl


### Utility Methods

In [20]:
def load_df(trec):
    eval_predictions = glob(f'../resources/eval/trec-system-runs/{trec}/*.jsonl')
    eval_predictions += list(load_cross_validation_results(open(f'../resources/processed/cross-validation-results/{trec}/bs-p-1000-ndcg@10-ndcg@10-results.jsonl'), depth=10, return_buffers=True))
    eval_predictions += list(load_cross_validation_results(open(f'../resources/processed/cross-validation-results/{trec}/bs-pool-dependent-1000-ndcg@10-ndcg@10-results.jsonl'), depth=10, return_buffers=True))
    eval_predictions += list(load_cross_validation_results(open(f'../resources/processed/cross-validation-results/{trec}/bs-run-and-pool-dependent-1000-ndcg@10-ndcg@10-results.jsonl'), depth=10, return_buffers=True))
    eval_predictions += list(load_cross_validation_results(open(f'../resources/processed/cross-validation-results/{trec}/bs-run-and-pool-dependent2-1000-ndcg@10-ndcg@10-results.jsonl'), depth=10, return_buffers=True))
    eval_predictions += list(load_cross_validation_results(open(f'../resources/processed/cross-validation-results/{trec}/bs-run-dependent-1000-ndcg@10-ndcg@10-results.jsonl'), depth=10, return_buffers=True))
    
    eval_predictions += list(load_cross_validation_results(open(f'../resources/processed/cross-validation-results/{trec}/bs-p-1000-ndcg@10-ndcg@10-condensed-ndcg@10-results.jsonl'), depth=10, return_buffers=True))
    eval_predictions += list(load_cross_validation_results(open(f'../resources/processed/cross-validation-results/{trec}/bs-pool-dependent-1000-ndcg@10-ndcg@10-condensed-ndcg@10-results.jsonl'), depth=10, return_buffers=True))
    eval_predictions += list(load_cross_validation_results(open(f'../resources/processed/cross-validation-results/{trec}/bs-run-and-pool-dependent-1000-ndcg@10-ndcg@10-condensed-ndcg@10-results.jsonl'), depth=10, return_buffers=True))
    eval_predictions += list(load_cross_validation_results(open(f'../resources/processed/cross-validation-results/{trec}/bs-run-and-pool-dependent2-1000-ndcg@10-ndcg@10-condensed-ndcg@10-results.jsonl'), depth=10, return_buffers=True))
    eval_predictions += list(load_cross_validation_results(open(f'../resources/processed/cross-validation-results/{trec}/bs-run-dependent-1000-ndcg@10-ndcg@10-condensed-ndcg@10-results.jsonl'), depth=10, return_buffers=True))
    
    eval_predictions += list(load_cross_validation_results(open(f'../resources/processed/cross-validation-results/{trec}/condensed-ndcg@10-results.jsonl'), depth=10, return_buffers=True))
    eval_predictions += list(load_cross_validation_results(open(f'../resources/processed/cross-validation-results/{trec}/ndcg@10-results.jsonl'), depth=10, return_buffers=True))
    
    return load_evaluations(tqdm(eval_predictions))

def report_for_row(df_row, measure, depth):
    tmp = {'run': df_row['run'].split('/')[-1].replace('input.', '').replace('.gz', '')}
    measures = [
        ('unjudged', (f'depth-{depth}-incomplete', f'unjudged@{depth}')),
        (f'ground-truth-{measure}@{depth}', (f'depth-{depth}-complete', f'ndcg@{depth}')),
        (f'min-residual-{measure}@{depth}', (f'depth-{depth}-incomplete', f'residual-{measure}@{depth}-min')),
        (f'condensed-{measure}@{depth}', (f'depth-{depth}-incomplete', f'condensed-{measure}@{depth}')),
        (f'max-residual-{measure}@{depth}', (f'depth-{depth}-incomplete', f'residual-{measure}@{depth}-max')),
        (f'always-1', (f'depth-{depth}-incomplete', 'always-1')),
        (f'always-0', (f'depth-{depth}-incomplete', 'always-0')),
    ]
    
    for k,v in [('PBS', f'bs-p-1000-{measure}@{depth}-{measure}@{depth}'), ('PBS-P', f'bs-pool-dependent-1000-{measure}@{depth}-{measure}@{depth}'), ('PBS-RP', f'bs-run-and-pool-dependent-1000-{measure}@{depth}-{measure}@{depth}'), ('PBS-RP2', f'bs-run-and-pool-dependent2-1000-{measure}@{depth}-{measure}@{depth}'), ('PBS-R', f'bs-run-dependent-1000-{measure}@{depth}-{measure}@{depth}')]:
        for m in ['']:
            measures += [(f'{k}-RMSE{m}-{measure}@{depth}', (f'depth-{depth}-incomplete', f'pbs-rmse{m}-{v}'))]

    for k,v in [('PBS', f'bs-p-1000-{measure}@{depth}-{measure}@{depth}'), ('PBS-P', f'bs-pool-dependent-1000-{measure}@{depth}-{measure}@{depth}'), ('PBS-RP', f'bs-run-and-pool-dependent-1000-{measure}@{depth}-{measure}@{depth}'), ('PBS-RP2', f'bs-run-and-pool-dependent2-1000-{measure}@{depth}-{measure}@{depth}'), ('PBS-R', f'bs-run-dependent-1000-{measure}@{depth}-{measure}@{depth}')]:
        for m in ['0.8', '0.9', '0.95', '0.99']:
            internal_name = f'bs-ci-{m}-{v}-{v}-condensed-{measure}@{depth}'
            measures += [(f'{k}-CL-{m}-{measure}@{depth}', (f'depth-{depth}-incomplete', f'{internal_name}-{internal_name}'))]
    
    for k,v in [('PBS', f'bs-p-1000-{measure}@{depth}-{measure}@{depth}'), ('PBS-P', f'bs-pool-dependent-1000-{measure}@{depth}-{measure}@{depth}'), ('PBS-RP', f'bs-run-and-pool-dependent-1000-{measure}@{depth}-{measure}@{depth}'), ('PBS-RP2', f'bs-run-and-pool-dependent2-1000-{measure}@{depth}-{measure}@{depth}'), ('PBS-R', f'bs-run-dependent-1000-{measure}@{depth}-{measure}@{depth}')]:
        for m in ['-upper-bound-0.01', '-upper-bound-0.05', '-lower-bound-0.01', '-lower-bound-0.05']:
            part_name = f'pbs{m}-{v}'
            measures += [(f'{k}-RMSE{m}-{measure}@{depth}', (f'depth-{depth}-incomplete', f'{part_name}-{part_name}'))]
    
    
    for i in ['upper-bound-0.01', 'upper-bound-0.05', 'lower-bound-0.01', 'lower-bound-0.05']:
        measures += [(f'gsd-{i}-condensed-{measure}@{depth}',  (f'depth-{depth}-incomplete', f'gsd-{i}-condensed-{measure}@{depth}-condensed-{measure}@{depth}-gsd-{i}-condensed-{measure}@{depth}-condensed-{measure}@{depth}')),
                     (f'gsd-{i}-{measure}@{depth}',  (f'depth-{depth}-incomplete', f'gsd-{i}-{measure}@{depth}-{measure}@{depth}-gsd-{i}-{measure}@{depth}-{measure}@{depth}'))]
    
    for display_name, m in measures:
        try:
            tmp[display_name] = json.loads(df_row[m])
        except:
            raise ValueError(f'Can not handle "{m}". Got {df_row.keys()}')
    
    ret = []
    
    for topic in tmp[f'ground-truth-{measure}@{depth}']:
        entry = {'run': tmp['run'], 'topic': topic}
        for k, v in tmp.items():
            if k in ['run']:
                continue
            
            if topic in v:
                entry[k] = v[topic]
        ret += [entry]
    
    return ret

def create_aggregated_df(df, measure, depth, loc, runs_to_keep):
    if df.iloc[loc]['run'] not in runs_to_keep:
        return None
    ret = pd.DataFrame([dict(i) for i in report_for_row(df.iloc[loc], measure, depth)])
    ret = ret.sort_values(f'ground-truth-{measure}@{depth}', ascending=False).reset_index()
    del ret['index']
    return ret


def data_for_reconstruction_experiments(df, trec, failsave, runs_to_keep, min_unjudged=0):
    ret = {}
    for run in tqdm(range(len(df['run'].unique()))):
        try:
            tmp = create_aggregated_df(df, 'ndcg', 10, run, runs_to_keep)
            if tmp is None:
                continue
        except Exception as e:
            if not failsave:
                raise e
            
            continue
        tmp = tmp[tmp['unjudged'] > min_unjudged].dropna()
        # for robust04 we want 50 topics in the comparison and for the web tracks 10 each (to get to the same number of overall topics)
        #if len(tmp) < (50 if trec == 'trec13' else 10):
        #    continue
        if len(tmp) <= 1:
            print(len(tmp))
            continue

        measures_to_report = [('Condensed', 'condensed-ndcg@10'), ('Min-Residual', 'min-residual-ndcg@10'),
                    ('Max-Residual', 'max-residual-ndcg@10'), ('Always 1', 'always-1'), ('Always 0', 'always-0'),
                   ]

        for i in ['', '-upper-bound-0.01', '-upper-bound-0.05', '-lower-bound-0.01', '-lower-bound-0.05']:
            for p in ['', 'P-', 'RP-',  'RP2-', 'R-']:
                measures_to_report += [(f'PBS-{p}RMSE{i}', f'PBS-{p}RMSE{i}-ndcg@10')]

        for p in ['P', 'RP',  'RP2', 'R']:
            for m in ['0.8', '0.9', '0.95', '0.99']:
                measures_to_report += [(f'PBS-{p}-CL-{m}', f'PBS-{p}-CL-{m}-ndcg@10')]


        for i in ['upper-bound-0.01', 'upper-bound-0.05', 'lower-bound-0.01', 'lower-bound-0.05']:
                measures_to_report += [(f'GSD-Condensed-{i}', f'gsd-{i}-condensed-ndcg@10'), (f'GSD-{i}', f'gsd-{i}-ndcg@10')]
        
        for _, i in tmp.iterrows():
            to_add = {
                'topic': i['topic'],
                'system': i['run'],
                'ground_truth': i['ground-truth-ndcg@10']
            }
            
            for k,v in measures_to_report:
                to_add[k] = i[v]
            
            if i['topic'] not in ret:
                ret[i['topic']] = []
            
            ret[i['topic']] += [to_add]
    
    return ret

def load_df_reconstruction(trec, num_runs_to_keep=100000, failsave=True):
    runs_to_keep = pd.read_json('../resources/processed/ndcg-at-10-effectiveness.jsonl', lines=True)
    runs_to_keep = runs_to_keep[runs_to_keep['position'] < num_runs_to_keep]
    runs_to_keep = set(runs_to_keep['run'].unique())
    df = load_df(trec)
    d = data_for_reconstruction_experiments(df, trec, failsave, runs_to_keep)
    reconstruction_approaches = {
        'Residuals': DataConstruction('Min-Residual', 'Condensed', 'Max-Residual'),
        'MinResiduals': DataConstruction('Min-Residual', 'Min-Residual', 'Min-Residual'),
        'Condensed': DataConstruction('Condensed', 'Condensed', 'Condensed'),
        'Min-Condensed': DataConstruction('Min-Residual', 'Condensed', 'Condensed'),
        
        'PBS-P-CL-0.80': DataConstruction('Min-Residual', 'Min-Residual', 'PBS-P-CL-0.8'),
        'PBS-P-CL-0.90': DataConstruction('Min-Residual', 'Min-Residual', 'PBS-P-CL-0.9'),
        'PBS-P-CL-0.95': DataConstruction('Min-Residual', 'Min-Residual', 'PBS-P-CL-0.95'),
        'PBS-P-CL-0.99': DataConstruction('Min-Residual', 'Min-Residual', 'PBS-P-CL-0.99'),
        
        'PBS-R-CL-0.80': DataConstruction('Min-Residual', 'Min-Residual', 'PBS-R-CL-0.8'),
        'PBS-R-CL-0.90': DataConstruction('Min-Residual', 'Min-Residual', 'PBS-R-CL-0.9'),
        'PBS-R-CL-0.95': DataConstruction('Min-Residual', 'Min-Residual', 'PBS-R-CL-0.95'),
        'PBS-R-CL-0.99': DataConstruction('Min-Residual', 'Min-Residual', 'PBS-R-CL-0.99'),
        
        'PBS-RP-CL-0.80': DataConstruction('Min-Residual', 'Min-Residual', 'PBS-RP-CL-0.8'),
        'PBS-RP-CL-0.90': DataConstruction('Min-Residual', 'Min-Residual', 'PBS-RP-CL-0.9'),
        'PBS-RP-CL-0.95': DataConstruction('Min-Residual', 'Min-Residual', 'PBS-RP-CL-0.95'),
        'PBS-RP-CL-0.99': DataConstruction('Min-Residual', 'Min-Residual', 'PBS-RP-CL-0.99'),
        
        'PBS-RP2-CL-0.80': DataConstruction('Min-Residual', 'Min-Residual', 'PBS-RP2-CL-0.8'),
        'PBS-RP2-CL-0.90': DataConstruction('Min-Residual', 'Min-Residual', 'PBS-RP2-CL-0.9'),
        'PBS-RP2-CL-0.95': DataConstruction('Min-Residual', 'Min-Residual', 'PBS-RP2-CL-0.95'),
        'PBS-RP2-CL-0.99': DataConstruction('Min-Residual', 'Min-Residual', 'PBS-RP2-CL-0.99'),
        
        
        'PBS-RP-RMSE': DataConstruction('PBS-RP-RMSE', 'PBS-RP-RMSE', 'PBS-RP-RMSE'),
        'PBS-RP2-RMSE': DataConstruction('PBS-RP2-RMSE', 'PBS-RP2-RMSE', 'PBS-RP2-RMSE'),
        'PBS-R-RMSE': DataConstruction('PBS-R-RMSE', 'PBS-R-RMSE', 'PBS-R-RMSE'),
        'PBS-P-RMSE': DataConstruction('PBS-P-RMSE', 'PBS-P-RMSE', 'PBS-P-RMSE'),
        
        'PBS-RP-0.01': DataConstruction('PBS-RP-RMSE-lower-bound-0.01', 'PBS-RP-RMSE', 'PBS-RP-RMSE-upper-bound-0.01'),
        'PBS-RP-0.05': DataConstruction('PBS-RP-RMSE-lower-bound-0.05', 'PBS-RP-RMSE', 'PBS-RP-RMSE-upper-bound-0.05'),
        
        'PBS-RP2-0.01': DataConstruction('PBS-RP2-RMSE-lower-bound-0.01', 'PBS-RP2-RMSE', 'PBS-RP2-RMSE-upper-bound-0.01'),
        'PBS-RP2-0.05': DataConstruction('PBS-RP2-RMSE-lower-bound-0.05', 'PBS-RP2-RMSE', 'PBS-RP2-RMSE-upper-bound-0.05'),
        
        'PBS-R-0.01': DataConstruction('PBS-R-RMSE-lower-bound-0.01', 'PBS-R-RMSE', 'PBS-R-RMSE-upper-bound-0.01'),
        'PBS-R-0.05': DataConstruction('PBS-R-RMSE-lower-bound-0.05', 'PBS-R-RMSE', 'PBS-R-RMSE-upper-bound-0.05'),
        
        'PBS-P-0.01': DataConstruction('PBS-P-RMSE-lower-bound-0.01', 'PBS-P-RMSE', 'PBS-P-RMSE-upper-bound-0.01'),
        'PBS-P-0.05': DataConstruction('PBS-P-RMSE-lower-bound-0.05', 'PBS-P-RMSE', 'PBS-P-RMSE-upper-bound-0.05'),
    }

    df_reconstruction = []

    reconstruction_eval = ReconstructionEvaluation()

    for approach_name, approach in reconstruction_approaches.items():
        for topic, topic_data in approach.construct_data_for_reconstruction_evaluation(d).items():
            df_reconstruction += [{
                'approach': approach_name,
                'topic': topic,
                'precision': reconstruction_eval.precision(topic_data),
                'recall': reconstruction_eval.recall(topic_data),
            }]

    df_reconstruction = pd.DataFrame(df_reconstruction)
    df_reconstruction['f1'] = df_reconstruction.apply(lambda i: 0 if (i['precision']+i['recall']) == 0 else 2*(i['precision']*i['recall'])/(i['precision']+i['recall']), axis=1)
    
    return df_reconstruction

In [21]:
df_reconstruction = []
for trec in ['trec18', 'trec19', 'trec20', 'trec21']:
    df_reconstruction += [load_df_reconstruction(trec, 24, False)]
    
df_reconstruction_cw09 = pd.concat(df_reconstruction)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 4757/4757 [00:13<00:00, 343.47it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 71/71 [00:00<00:00, 317.11it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 3752/3752 [00:11<00:00, 331.66it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 56/56 [00:00<00:00, 219.37it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 2479/2479 [00:07<00:00, 318.79it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 37/37 [00:00<00:00, 141.28it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 2144/21

In [22]:
df_reconstruction_cw09[['approach', 'precision', 'recall', 'f1']]\
    .groupby('approach')\
    .mean()\
    .reset_index()

Unnamed: 0,approach,precision,recall,f1
0,Condensed,0.882361,0.882361,0.882361
1,Min-Condensed,0.96848,0.821652,0.884659
2,MinResiduals,0.913399,0.913399,0.913399
3,PBS-P-0.01,0.952396,0.775111,0.85015
4,PBS-P-0.05,0.932144,0.844817,0.885082
5,PBS-P-CL-0.80,0.971036,0.785717,0.864512
6,PBS-P-CL-0.90,0.970092,0.793131,0.868981
7,PBS-P-CL-0.95,0.96965,0.795504,0.87035
8,PBS-P-CL-0.99,0.968529,0.798776,0.871986
9,PBS-P-RMSE,0.909579,0.909579,0.909579


In [5]:
df_reconstruction_cw09[['approach', 'precision', 'recall', 'f1']]\
    .groupby('approach')\
    .mean()\
    .reset_index()

Unnamed: 0,approach,precision,recall,f1
0,Condensed,0.882361,0.882361,0.882361
1,MinResiduals,0.913399,0.913399,0.913399
2,PBS-P-0.01,0.952327,0.775993,0.850622
3,PBS-P-0.05,0.932569,0.843903,0.884687
4,PBS-P-RMSE,0.909473,0.909473,0.909473
5,PBS-R-0.01,0.976713,0.719269,0.822996
6,PBS-R-0.05,0.942553,0.814871,0.872014
7,PBS-R-RMSE,0.905585,0.905585,0.905585
8,PBS-RP-0.01,0.938635,0.864263,0.898423
9,PBS-RP-0.05,0.925929,0.880256,0.901889


In [23]:
df_reconstruction = []
for trec in ['trec22', 'trec23']:
    df_reconstruction += [load_df_reconstruction(trec, 22, False)]
    
df_reconstruction_cw12 = pd.concat(df_reconstruction)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 2278/2278 [00:07<00:00, 313.45it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 34/34 [00:00<00:00, 133.92it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 2010/2010 [00:06<00:00, 302.75it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 124.16it/s]


In [24]:
df_reconstruction_cw12[['approach', 'precision', 'recall', 'f1']]\
    .groupby('approach')\
    .mean()\
    .reset_index()

Unnamed: 0,approach,precision,recall,f1
0,Condensed,0.878306,0.878306,0.878306
1,Min-Condensed,0.95233,0.764419,0.843447
2,MinResiduals,0.848977,0.848977,0.848977
3,PBS-P-0.01,0.937232,0.713127,0.80596
4,PBS-P-0.05,0.911585,0.785939,0.842594
5,PBS-P-CL-0.80,0.959837,0.721641,0.817449
6,PBS-P-CL-0.90,0.958217,0.731488,0.823816
7,PBS-P-CL-0.95,0.956735,0.734033,0.825086
8,PBS-P-CL-0.99,0.95536,0.73713,0.826632
9,PBS-P-RMSE,0.87099,0.87099,0.87099


In [7]:
df_reconstruction_cw12[['approach', 'precision', 'recall', 'f1']]\
    .groupby('approach')\
    .mean()\
    .reset_index()

Unnamed: 0,approach,precision,recall,f1
0,Condensed,0.878306,0.878306,0.878306
1,MinResiduals,0.848977,0.848977,0.848977
2,PBS-P-0.01,0.936739,0.713653,0.806068
3,PBS-P-0.05,0.911342,0.78521,0.842034
4,PBS-P-RMSE,0.8701,0.8701,0.8701
5,PBS-R-0.01,0.976061,0.609957,0.742179
6,PBS-R-0.05,0.921748,0.732752,0.814285
7,PBS-R-RMSE,0.862355,0.862355,0.862355
8,PBS-RP-0.01,0.904087,0.815588,0.8562
9,PBS-RP-0.05,0.887024,0.854446,0.870199


In [8]:
df_reconstruction = []
for trec in ['trec13']:
    df_reconstruction += [load_df_reconstruction(trec, 82, True)]
    
df_reconstruction_r04 = pd.concat(df_reconstruction)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 4382/4382 [01:15<00:00, 58.11it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 110/110 [00:01<00:00, 79.11it/s]


In [9]:
df_reconstruction_r04[['approach', 'precision', 'recall', 'f1']]\
    .groupby('approach')\
    .mean()\
    .reset_index()

Unnamed: 0,approach,precision,recall,f1
0,Condensed,0.931028,0.931028,0.931028
1,MinResiduals,0.954159,0.954159,0.954159
2,PBS-P-0.01,0.976394,0.859667,0.911637
3,PBS-P-0.05,0.968767,0.892457,0.927716
4,PBS-P-RMSE,0.946714,0.946714,0.946714
5,PBS-R-0.01,0.979206,0.824309,0.892124
6,PBS-R-0.05,0.954028,0.890219,0.920339
7,PBS-R-RMSE,0.936278,0.936278,0.936278
8,PBS-RP-0.01,0.977114,0.862375,0.912918
9,PBS-RP-0.05,0.973803,0.869511,0.915995


In [9]:
def col(df, name):
    df = df[df['approach'] == name]
    assert len(df) == 1
    df = df.iloc[0].to_dict()
    
    return '{:.3f}'.format(df["precision"], 3) + ' & ' + '{:.3f}'.format(df["recall"]) + ' & ' + '{:.3f}'.format(df["f1"])
    
def line(name):
    
    df_r04 = df_reconstruction_r04[['approach', 'precision', 'recall', 'f1']]\
        .groupby('approach')\
        .mean()\
        .reset_index()
    
    df_cw09 = df_reconstruction_cw09[['approach', 'precision', 'recall', 'f1']]\
        .groupby('approach')\
        .mean()\
        .reset_index()
    
    df_cw12 = df_reconstruction_cw12[['approach', 'precision', 'recall', 'f1']]\
        .groupby('approach')\
        .mean()\
        .reset_index()
    
    return col(df_r04, name) + ' & ' + col(df_cw09, name) + ' & ' + col(df_cw12, name)

def produce_table():
    return '''\\begin{table*}[t]
\\caption{Reconstruction effectiveness: Precision (how many of the system-pairs that I tell apart are correct?), Recall (how many of the apart system pairs do I find?), and F1 as the harmonic mean of precision and recall. All of this on the Topic Level. {\\color{red} ToDo: Look why precision of residuals is not 1, are these only the special cases that we discussed earlier?}}
\\label{table-reconstruction-effectiveness}
\\renewcommand{\\tabcolsep}{3.8pt} 
\\centering
\\small

\\begin{tabular}{@{}l@{\\hspace{2em}}c@{\\hspace{.5em}}c@{\\hspace{.5em}}c@{\\hspace{2em}}c@{\\hspace{.5em}}c@{\\hspace{.5em}}c@{\\hspace{2em}}c@{\\hspace{.5em}}c@{\\hspace{.5em}}c@{}}
\\toprule
& \\multicolumn{3}{c}{Reconstr. on Robust04} & \\multicolumn{3}{c}{Reconstr. on CW09} & \\multicolumn{3}{c}{Reconstr. on CW12} \\\\
\\cmidrule(r{1em}){2-4} \\cmidrule(r{1em}){5-7} \\cmidrule{8-10}

 & Precision                & Recall         & F1   & Precision                & Recall         & F1 & Precision                & Recall         & F1             \\\\
\\midrule
Residuals & ''' + line('Residuals') + '''\\\\
Min Res. & ''' + line('MinResiduals') + '''\\\\
Cond. Lists & ''' + line('Condensed') + '''\\\\

\\midrule
BS (R$_{0.01}$) &  ''' + line('PBS-R-0.01') + '''\\\\
BS (P$_{0.01}$) &  ''' + line('PBS-P-0.01') + '''\\\\
BS (R+P$_{0.01}$) &  ''' + line('PBS-RP-0.01') + '''\\\\


\\midrule
BS (R$_{0.05}$) &  ''' + line('PBS-R-0.05') + '''\\\\
BS (P$_{0.05}$) &  ''' + line('PBS-P-0.05') + '''\\\\
BS (R+P$_{0.05}$) &  ''' + line('PBS-RP-0.05') + '''\\\\

\\bottomrule
\\end{tabular} 
\\end{table*} 
'''

print(produce_table())

KeyError: "['approach', 'precision', 'recall'] not in index"

### Helper-Methods to remove the effects of inefficient runs


In [None]:

def avg(i):
    i = json.loads(i)
    
    return mean(i.values())
df = []
for trec in tqdm(['trec13', 'trec18', 'trec19', 'trec20', 'trec21', 'trec22', 'trec23']):
    df_trec = load_evaluations(glob(f'../resources/eval/trec-system-runs/{trec}/*.jsonl'))
    df_trec['ndcg@10'] = df_trec[('depth-10-complete', 'ndcg@10')].apply(avg)
    df_trec = df_trec[['ndcg@10']].sort_values('ndcg@10', ascending=False).reset_index()[['run', 'ndcg@10']].reset_index()
    df_trec['position'] = df_trec['index']
    df_trec['trec'] = trec
    del df_trec['index']
    df += [df_trec]
df = pd.concat(df)
df.to_json('../resources/processed/ndcg-at-10-effectiveness.jsonl', lines=True, orient='records')
df