# Create table `table-reconstruction-effectiveness`

### Import utilities and load datasets

In [3]:
%%time
import sys
sys.path.append('../python/')
import pandas as pd
from sklearn.metrics import mean_squared_error
import numpy as np
from reconstruction_evaluation import load_df_reconstruction, load_preprocessed_reconstruction_or_from_cache
import json
from statistics import mean
from scipy.stats import ttest_ind

# To reload the data from scratch, uncomment the next line. It removes
# processed-evaluation-results.json (presumably the cache used by the loader below -- TODO confirm).
#!rm -Rf processed-evaluation-results.json

# Significance level before correcting for multiple comparisons.
P_VALUE = 0.05
# Bonferroni correction: divide the significance level by the number of comparisons (5).
# NOTE(review): col() below tests each approach against only three baselines
# (Residuals, Condensed, Min-Condensed) -- confirm that 5 is the intended divisor.
P_VALUE_BONFERONNI_CORRECTED = P_VALUE / 5

# Evaluation results; indexed below by the keys 'Robust04', 'CW09' and 'CW12'.
dfs = load_preprocessed_reconstruction_or_from_cache()

CPU times: user 2.83 s, sys: 535 ms, total: 3.36 s
Wall time: 3.36 s


### Create the Table

In [4]:
def significant(a, b, alpha=None):
    """Tell whether two samples differ significantly under an independent two-sample t-test.

    Args:
        a: First sample of observations (anything ``ttest_ind`` accepts).
        b: Second sample of observations.
        alpha: Significance threshold. When omitted, the notebook-wide
            ``P_VALUE_BONFERONNI_CORRECTED`` is used.

    Returns:
        bool: ``True`` if the p-value is strictly below ``alpha``. A NaN p-value
        never compares below the threshold and therefore yields ``False``.
    """
    if alpha is None:
        # Looked up at call time so that re-running the config cell changes the default.
        alpha = P_VALUE_BONFERONNI_CORRECTED

    _, p_value = ttest_ind(a, b)

    # Convert to a plain Python bool instead of leaking numpy.bool_ to callers.
    return bool(p_value < alpha)

def col(df, name):
    """Render the precision / recall / F1 cells of one approach as a LaTeX fragment.

    For each metric the mean over all rows of ``name`` is printed with three
    decimals (leading zero stripped), followed by superscript markers for every
    baseline (Residuals, Condensed, Min-Condensed) from which the approach
    differs significantly according to ``significant``. Markers of
    non-significant baselines are emitted inside ``\\phantom`` so that all
    cells keep the same width.

    Args:
        df: DataFrame with an 'approach' column and the numeric columns
            'precision', 'recall' and 'f1'.
        name: Value of the 'approach' column to render.

    Returns:
        str: The three cells joined by ' & '.

    Raises:
        AssertionError: If ``df`` has no rows for ``name``.
    """
    metrics = ['precision', 'recall', 'f1']

    # Restrict the aggregation to the metric columns: averaging every column
    # would fail on non-numeric columns in recent pandas versions.
    means = df.groupby('approach')[metrics].mean()
    assert name in means.index, 'No rows found for approach {!r}'.format(name)
    means = means.loc[name]

    # LaTeX marker and rows of each baseline the approach is tested against.
    baselines = [
        ('\\dagger', df[df['approach'] == 'Residuals']),
        ('\\ast', df[df['approach'] == 'Condensed']),
        ('\\ddagger', df[df['approach'] == 'Min-Condensed']),
    ]
    df_approach = df[df['approach'] == name]

    cells = []
    for k in metrics:
        sig = ''
        non_sig = ''

        for sign_name, vals in baselines:
            if significant(df_approach[k], vals[k]):
                sig += sign_name
            else:
                non_sig += sign_name

        # '0.123' -> '.123'; a value of '1.000' is left untouched by lstrip.
        value = '{:.3f}'.format(means[k]).lstrip('0')
        cells.append(value + '$^{' + sig + '\\phantom{' + non_sig + '}}$')

    return ' & '.join(cells)
    
def line(name):
    """Build the nine cells of one table row: three metrics on each of three corpora.

    Args:
        name: Approach identifier as stored in the 'approach' column.

    Returns:
        str: The ``col`` output for Robust04, CW09 and CW12, joined by ' & '.
    """
    wanted_columns = ['approach', 'precision', 'recall', 'f1']

    # Select the frames first so that a missing corpus fails before any rendering.
    frames = [dfs[corpus][wanted_columns] for corpus in ('Robust04', 'CW09', 'CW12')]

    return ' & '.join(col(frame, name) for frame in frames)

def produce_table():
    """Assemble the complete LaTeX source of the reconstruction-effectiveness table.

    The static parts (caption, label, column layout, header rows) are literal
    text; the data rows are filled in by calling ``line`` for the approaches
    'Residuals', 'MinResiduals', 'Condensed', 'PBS-RP-ML', 'Min-Condensed',
    'Min-PBS-RP-75', 'Min-PBS-RP-90' and 'Min-PBS-RP-95'.

    Note that the caption still contains an open ToDo that is rendered in red.

    Returns:
        str: A ``table*`` environment ready to be pasted into the paper.
    """
    return '''\\begin{table*}[t]
\\caption{Reconstruction effectiveness: Precision (how many of the system-pairs that I tell apart are correct?),
Recall (how many of the apart system pairs do I find?), and F1 as the harmonic mean of precision and recall.
All of this on the Topic Level. {\\color{red} ToDo: Look why precision of residuals is not 1, are these only the special cases that we discussed earlier?}
We report statistical significance according to students t-test with Bonferroni correction at p=0.05 to Residuals ($\\dagger$), Condensed Lists ($\\ast$), and Min-Condensed ($\\ddagger$).
}
\\label{table-reconstruction-effectiveness}
\\renewcommand{\\tabcolsep}{3.8pt} 
\\centering
\\small

\\begin{tabular}{@{}l@{\\hspace{2em}}c@{\\hspace{.5em}}c@{\\hspace{.5em}}c@{\\hspace{2em}}c@{\\hspace{.5em}}c@{\\hspace{.5em}}c@{\\hspace{2em}}c@{\\hspace{.5em}}c@{\\hspace{.5em}}c@{}}
\\toprule
& \\multicolumn{3}{c}{Reconstr. on Robust04} & \\multicolumn{3}{c}{Reconstr. on CW09} & \\multicolumn{3}{c}{Reconstr. on CW12} \\\\
\\cmidrule(r{1em}){2-4} \\cmidrule(r{1em}){5-7} \\cmidrule{8-10}

 & Precision                & Recall         & F1   & Precision                & Recall         & F1 & Precision                & Recall         & F1             \\\\

\\midrule

Residuals & ''' + line('Residuals') + '''\\\\
\\midrule

Min Res. & ''' + line('MinResiduals') + '''\\\\
Cond. Lists & ''' + line('Condensed') + '''\\\\
PBS-RP & ''' + line('PBS-RP-ML') + '''\\\\


\\midrule

Min-Condensed & ''' + line('Min-Condensed') + '''\\\\
BS (R+P)$_{75}$ &  ''' + line('Min-PBS-RP-75') + '''\\\\
BS (R+P)$_{90}$ &  ''' + line('Min-PBS-RP-90') + '''\\\\
BS (R+P)$_{95}$ &  ''' + line('Min-PBS-RP-95') + '''\\\\

\\bottomrule
\\end{tabular} 
\\end{table*} 
'''

# print() rather than a bare expression, so the LaTeX appears unescaped and can be copied verbatim.
print(produce_table())

\begin{table*}[t]
\caption{Reconstruction effectiveness: Precision (how many of the system-pairs that I tell apart are correct?),
Recall (how many of the apart system pairs do I find?), and F1 as the harmonic mean of precision and recall.
All of this on the Topic Level. {\color{red} ToDo: Look why precision of residuals is not 1, are these only the special cases that we discussed earlier?}
We report statistical significance according to students t-test with Bonferroni correction at p=0.05 to Residuals ($\dagger$), Condensed Lists ($\ast$), and Min-Condensed ($\ddagger$).
}
\label{table-reconstruction-effectiveness}
\renewcommand{\tabcolsep}{3.8pt} 
\centering
\small

\begin{tabular}{@{}l@{\hspace{2em}}c@{\hspace{.5em}}c@{\hspace{.5em}}c@{\hspace{2em}}c@{\hspace{.5em}}c@{\hspace{.5em}}c@{\hspace{2em}}c@{\hspace{.5em}}c@{\hspace{.5em}}c@{}}
\toprule
& \multicolumn{3}{c}{Reconstr. on Robust04} & \multicolumn{3}{c}{Reconstr. on CW09} & \multicolumn{3}{c}{Reconstr. on CW12} \\
\cmidrule(r{1

### Helper methods to remove the effects of inefficient runs


In [None]:

def avg(i):
    """Return the arithmetic mean of the values of a JSON-encoded object.

    Args:
        i: JSON text that decodes to a mapping with numeric values.

    Returns:
        The mean of the mapping's values as computed by ``statistics.mean``.
    """
    return mean(json.loads(i).values())
# Collect one frame of per-run nDCG@10 scores per TREC edition, then concatenate them.
# NOTE(review): `tqdm`, `glob` and `load_evaluations` are not imported in the cells
# visible here, and this cell has no execution count -- confirm it runs on a fresh kernel.
df = []
for trec in tqdm(['trec13', 'trec18', 'trec19', 'trec20', 'trec21', 'trec22', 'trec23']):
    # Load every .jsonl evaluation file of this edition (frame schema is defined by
    # load_evaluations, which is not visible here).
    df_trec = load_evaluations(glob(f'../resources/eval/trec-system-runs/{trec}/*.jsonl'))
    # Each entry of this column is JSON text; avg() reduces it to the mean of its values.
    df_trec['ndcg@10'] = df_trec[('depth-10-complete', 'ndcg@10')].apply(avg)
    # Sort by descending nDCG@10. The first reset_index() presumably turns the 'run'
    # index into a column (verify); the second one adds the 0-based rank as 'index'.
    df_trec = df_trec[['ndcg@10']].sort_values('ndcg@10', ascending=False).reset_index()[['run', 'ndcg@10']].reset_index()
    # Keep the rank under the name 'position' and tag every row with its edition.
    df_trec['position'] = df_trec['index']
    df_trec['trec'] = trec
    del df_trec['index']
    df += [df_trec]
# From here on `df` is a single DataFrame rather than a list of frames.
df = pd.concat(df)
# Persist as JSON lines, one record per row.
df.to_json('../resources/processed/ndcg-at-10-effectiveness.jsonl', lines=True, orient='records')
df