# Create Table `table-per-topic-rmse-effectiveness`

In [2]:
%%time
import sys
sys.path.append('../python/')
import pandas as pd
from sklearn.metrics import mean_squared_error
import numpy as np
from reconstruction_evaluation import load_df_reconstruction
import json
from statistics import mean
from scipy.stats import ttest_ind

CPU times: user 8 µs, sys: 11 µs, total: 19 µs
Wall time: 21.7 µs


### Load the data

In [3]:
# TREC identifiers handed to `load_df_reconstruction`, grouped by corpus.
corpora = {'Robust04': ['trec13'], 'CW09': ['trec18', 'trec19', 'trec20', 'trec21'], 'CW12': ['trec22', 'trec23']}

# Following the preprocessing of Zobel et al., only the top 75% of the runs are kept
# so that low-performing runs do not distort the results.
RUNS_TO_INCLUDE = {'Robust04': 82, 'CW09': 24, 'CW12': 22}

P_VALUE = 0.05
# Every value is tested against three baselines, hence the division by 3.
P_VALUE_BONFERONNI_CORRECTED = P_VALUE / 3

# One concatenated reconstruction frame per corpus, keyed by the corpus name.
dfs = {}
for corpus, trecs in corpora.items():
    frames_of_corpus = [
        load_df_reconstruction(trec, RUNS_TO_INCLUDE[corpus], True, min_unjudged=0, max_unjudged=0.7)
        for trec in trecs
    ]
    dfs[corpus] = pd.concat(frames_of_corpus)


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 5637/5637 [02:01<00:00, 46.46it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 110/110 [00:00<00:00, 164.27it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 3692/3692 [00:16<00:00, 220.77it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 71/71 [00:00<00:00, 464.52it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 2912/2912 [00:20<00:00, 142.84it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 56/56 [00:00<00:00, 360.77it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 1924/19

In [36]:
def load_data_for_approach(corpus, approach):
    """Collect the per-topic ground truth and predictions of one approach.

    'MinResiduals' and 'MaxResiduals' are read from the rows stored under the
    approach name 'Residuals', using the 'lower' respectively 'upper' entry of
    each prediction. Every other approach is looked up under its own name and
    uses the 'actual' entry.

    Args:
        corpus: Key into the module-level `dfs` dictionary.
        approach: Name of the approach to extract.

    Returns:
        A tuple `(ground_truth, actual)` of two equally long lists with one
        entry per item found in the `topic_data` of the matching rows.
    """
    if approach == 'MinResiduals':
        stored_approach, prediction_field = 'Residuals', 'lower'
    elif approach == 'MaxResiduals':
        stored_approach, prediction_field = 'Residuals', 'upper'
    else:
        stored_approach, prediction_field = approach, 'actual'

    corpus_frame = dfs[corpus]
    matching_rows = corpus_frame[corpus_frame['approach'] == stored_approach]

    ground_truth, actual = [], []
    for _, run in matching_rows.iterrows():
        for topic in run['topic_data']:
            ground_truth.append(topic['ground_truth'])
            actual.append(topic['prediction'][prediction_field])

    return ground_truth, actual

def p_value(ground_truth, a, b):
    """Return the p-value of a t-test between the squared errors of `a` and `b`.

    Only the squared error of every element is computed here; the "root mean"
    part of the RMSE belongs to the aggregation that happens elsewhere. The two
    lists of squared errors are compared with `scipy.stats.ttest_ind`.

    Args:
        ground_truth: Reference values.
        a: Predictions of the first approach, aligned with `ground_truth`.
        b: Predictions of the second approach, aligned with `ground_truth`.

    Returns:
        The p-value reported by `ttest_ind`.

    Raises:
        ValueError: If the three inputs do not all have the same length.
    """
    if not (len(ground_truth) == len(a) == len(b)):
        raise ValueError('Incompatible lengths...')

    positions = range(len(ground_truth))
    squared_errors_a = [(a[k] - ground_truth[k]) ** 2 for k in positions]
    squared_errors_b = [(b[k] - ground_truth[k]) ** 2 for k in positions]

    return ttest_ind(squared_errors_a, squared_errors_b)[1]
    

def _rmse(ground_truth, prediction):
    """Return the root mean squared error between ground truth and prediction.

    The root is taken explicitly because the `squared=False` switch of
    `mean_squared_error` is deprecated since scikit-learn 1.4 and was removed
    in 1.6; `np.sqrt` of the MSE works with every version.
    """
    return np.sqrt(mean_squared_error(ground_truth, prediction))


def load_result_df(corpus):
    """Build the RMSE / p-value result frame for all approaches on one corpus.

    For every approach three error variants are reported:

    * 'Lower': predictions are replaced by the element-wise maximum of
      prediction and ground truth, so values below the ground truth add no error.
    * 'Actual': predictions are used unchanged.
    * 'Upper': predictions are replaced by the element-wise minimum of
      prediction and ground truth, so values above the ground truth add no error.

    Each variant gets its RMSE plus the p-values (see `p_value`) against the
    identically transformed predictions of the baselines 'MinResiduals',
    'Condensed' and 'MaxResiduals'.

    Args:
        corpus: Corpus name understood by `load_data_for_approach`.

    Returns:
        A `pandas.DataFrame` with one row per approach and the columns
        'Approach', 'Corpus' and, for each variant, '<variant>' followed by
        '<variant> (P Value To <baseline>)' for the three baselines.
    """
    # Baseline predictions are loaded once; their ground truth is not needed because
    # every comparison uses the ground truth of the approach being evaluated.
    baselines = [
        (baseline, load_data_for_approach(corpus, baseline)[1])
        for baseline in ['MinResiduals', 'Condensed', 'MaxResiduals']
    ]

    # Each variant maps (prediction, ground_truth) to the values that are scored.
    variants = [
        ('Lower', np.maximum),
        ('Actual', lambda prediction, _ground_truth: prediction),
        ('Upper', np.minimum),
    ]

    approaches = ['MinResiduals', 'MaxResiduals', 'Condensed', 'PBS-R-ML', 'PBS-P-ML', 'PBS-RP-ML',
                  'PBS-R-75', 'PBS-P-75', 'PBS-PR-75', 'PBS-R-90', 'PBS-P-90', 'PBS-PR-90',
                  'PBS-R-95', 'PBS-P-95', 'PBS-PR-95']

    ret = []
    for approach in approaches:
        ground_truth, actual = load_data_for_approach(corpus, approach)

        # Insertion order defines the column order of the resulting frame.
        record = {'Approach': approach, 'Corpus': corpus}
        for variant, transform in variants:
            # Transform the approach's predictions once and reuse them for the RMSE
            # and for all three significance tests.
            scored = transform(actual, ground_truth)
            record[variant] = _rmse(ground_truth, scored)

            for baseline, baseline_prediction in baselines:
                record[variant + ' (P Value To ' + baseline + ')'] = p_value(
                    ground_truth, scored, transform(baseline_prediction, ground_truth)
                )

        ret.append(record)

    return pd.DataFrame(ret)

# Stack the per-corpus result frames; the row index is kept, so it restarts at 0 for every corpus.
df = pd.concat([load_result_df(corpus_name) for corpus_name in ('Robust04', 'CW09', 'CW12')])
df

Unnamed: 0,Approach,Corpus,Lower,Lower (P Value To MinResiduals),Lower (P Value To Condensed),Lower (P Value To MaxResiduals),Actual,Actual (P Value To MinResiduals),Actual (P Value To Condensed),Actual (P Value To MaxResiduals),Upper,Upper (P Value To MinResiduals),Upper (P Value To Condensed),Upper (P Value To MaxResiduals)
0,MinResiduals,Robust04,0.004222,1.0,5.976201e-146,0.0,0.057941,1.0,7.03899e-08,0.0,0.057787,1.0,1.604796e-42,1.931844e-74
1,MaxResiduals,Robust04,0.210111,0.0,0.0,1.0,0.210118,0.0,0.0,1.0,0.001746,1.931844e-74,9.519509e-45,1.0
2,Condensed,Robust04,0.062462,5.976201e-146,1.0,0.0,0.068168,7.03899e-08,1.0,0.0,0.027301,1.604796e-42,1.0,9.519509e-45
3,PBS-R-ML,Robust04,0.006552,0.0004181342,5.078367e-144,0.0,0.058047,0.9614837,8.78969e-08,0.0,0.057677,0.9604516,2.118802e-42,2.001612e-74
4,PBS-P-ML,Robust04,0.078448,6.5826029999999995e-108,6.922492e-13,0.0,0.083109,2.302487e-26,1.696858e-12,1.330822e-301,0.027441,1.9351889999999998e-41,0.9232189,3.264208e-36
5,PBS-RP-ML,Robust04,0.037425,6.850928999999999e-51,5.701288e-46,0.0,0.055602,0.2404322,4.395194e-14,0.0,0.04112,2.807997e-15,2.142858e-16,1.062199e-60
0,MinResiduals,CW09,0.008903,1.0,8.919301e-109,0.0,0.076349,1.0,4.359888e-05,0.0,0.075828,1.0,1.105545e-38,5.4670159999999996e-64
1,MaxResiduals,CW09,0.338173,0.0,0.0,1.0,0.338173,0.0,0.0,1.0,0.000423,5.4670159999999996e-64,3.542271e-30,1.0
2,Condensed,CW09,0.080643,8.919301e-109,1.0,0.0,0.087487,4.359888e-05,1.0,0.0,0.033921,1.105545e-38,1.0,3.542271e-30
3,PBS-R-ML,CW09,0.020921,2.681281e-07,2.033863e-93,0.0,0.077491,0.7123767,0.0002205808,0.0,0.074613,0.6993865,1.541829e-36,1.7857860000000002e-61


In [40]:
def cell(approach, corpus, field):
    """Render one table cell: the value of `field` plus its significance markers.

    The value is looked up in the module-level `df` for the given approach and
    corpus, printed with three decimals and without leading zeros. A baseline's
    marker is shown when the corresponding p-value is below
    `P_VALUE_BONFERONNI_CORRECTED`; the remaining markers are wrapped in
    `\\phantom{...}`.

    Raises:
        ValueError: If `df` does not hold exactly one row for the
            approach/corpus combination.
    """
    matches = df[(df['Approach'] == approach) & (df['Corpus'] == corpus)]
    if len(matches) != 1:
        raise ValueError('Can not happen')

    record = matches.iloc[0].to_dict()

    markers = {'MinResiduals': '\\dagger', 'Condensed': '\\ast', 'MaxResiduals': '\\ddagger'}
    shown, hidden = [], []
    for baseline, marker in markers.items():
        p = record[field + ' (P Value To ' + baseline + ')']
        if p < P_VALUE_BONFERONNI_CORRECTED:
            shown.append(marker)
        else:
            hidden.append(marker)

    number = "{:.3f}".format(record[field]).lstrip('0')
    return number + '$^{' + ''.join(shown) + '\\phantom{' + ''.join(hidden) + '}}$'

def row(approach):
    """Return the nine ` & `-prefixed cells of one approach.

    Cells are ordered corpus by corpus (Robust04, CW09, CW12) and, within each
    corpus, Lower, Actual, Upper.
    """
    corpora_in_order = ('Robust04', 'CW09', 'CW12')
    fields_in_order = ('Lower', 'Actual', 'Upper')
    return ''.join(
        ' & ' + cell(approach, corpus_name, field_name)
        for corpus_name in corpora_in_order
        for field_name in fields_in_order
    )

def table():
    """Assemble the LaTeX source of the `table-per-topic-rmse-effectiveness` table.

    The table has one column group (Lower, Actual, Upper) for each of
    Robust04, CW09 and CW12. The first three rows are the baselines
    (MinResiduals, Condensed, MaxResiduals); after a rule follow the rows for
    PBS-R-ML, PBS-P-ML and PBS-RP-ML. All cell contents come from `row`.

    NOTE(review): the caption still starts with the placeholder "TBD.".

    Returns:
        The complete `table*` environment as a string.
    """
    return """\\begin{table*}[t]
\\caption{TBD. We report statistical significance according to students t-test with Bonferroni correction at p=0.05 to Min-Residuals ($\\dagger$), Condensed Lists ($\\ast$), and Max-Residuals ($\\ddagger$).}
\\label{table-per-topic-rmse-effectiveness}
\\renewcommand{\\tabcolsep}{3.8pt} 
\\centering
\\small

\\begin{tabular}{@{}l@{\\hspace{2em}}c@{\\hspace{.5em}}c@{\\hspace{.5em}}c@{\\hspace{2em}}c@{\\hspace{.5em}}c@{\\hspace{.5em}}c@{\\hspace{2em}}c@{\\hspace{.5em}}c@{\\hspace{.5em}}c@{}}
\\toprule
& \\multicolumn{3}{c}{RMSE on Robust04} & \\multicolumn{3}{c}{RMSE on CW09} & \\multicolumn{3}{c}{RMSE on CW12} \\\\
\\cmidrule(r{1em}){2-4} \\cmidrule(r{1em}){5-7} \\cmidrule{8-10}

 & Lower                & Actual         & Upper   & Lower                & Actual         & Upper & Lower                & Actual         & Upper             \\\\
\\midrule
Min Res. """ + row('MinResiduals') + """\\\\
Cond. Lists """ + row('Condensed') + """\\\\
Max Res. """ + row('MaxResiduals') + """\\\\

\\midrule

BS (R) """ + row('PBS-R-ML') + """\\\\
BS (P) """ + row('PBS-P-ML') + """\\\\
BS (R+P) """ + row('PBS-RP-ML') + """\\\\

\\bottomrule
\\end{tabular} 
\\end{table*}
"""

# Emit the generated LaTeX source as plain text.
print(table())

\begin{table*}[t]
\caption{TBD. We report statistical significance according to students t-test with Bonferroni correction at p=0.05 to Min-Residuals ($\dagger$), Condensed Lists ($\ast$), and Max-Residuals ($\ddagger$).}
\label{table-per-topic-rmse-effectiveness}
\renewcommand{\tabcolsep}{3.8pt} 
\centering
\small

\begin{tabular}{@{}l@{\hspace{2em}}c@{\hspace{.5em}}c@{\hspace{.5em}}c@{\hspace{2em}}c@{\hspace{.5em}}c@{\hspace{.5em}}c@{\hspace{2em}}c@{\hspace{.5em}}c@{\hspace{.5em}}c@{}}
\toprule
& \multicolumn{3}{c}{RMSE on Robust04} & \multicolumn{3}{c}{RMSE on CW09} & \multicolumn{3}{c}{RMSE on CW12} \\
\cmidrule(r{1em}){2-4} \cmidrule(r{1em}){5-7} \cmidrule{8-10}

 & Lower                & Actual         & Upper   & Lower                & Actual         & Upper & Lower                & Actual         & Upper             \\
\midrule
Min Res.  & .004$^{\ast\ddagger\phantom{\dagger}}$ & .058$^{\ast\ddagger\phantom{\dagger}}$ & .058$^{\ast\ddagger\phantom{\dagger}}$ & .009$^{\ast\ddagg

### Create the Table