# Create Table `table-per-topic-rmse-effectiveness`

In [3]:
%%time
import sys
sys.path.append('../python/')
import pandas as pd
from sklearn.metrics import mean_squared_error
import numpy as np
from reconstruction_evaluation import load_df_reconstruction, calculate_error
import json
from statistics import mean
from scipy.stats import ttest_ind


# Family-wise significance level used for the t-tests reported in the table.
P_VALUE = 0.05
# Every value is tested against three baselines (MinResiduals, Condensed,
# MaxResiduals), so the per-test threshold is the family-wise level divided
# by 3 (Bonferroni correction).
P_VALUE_BONFERONNI_CORRECTED = P_VALUE / 3

# To reload the data from scratch, uncomment the next line.
# (presumably this file is the cache read by the loader below -- TODO confirm)
#!rm -Rf processed-evaluation-results.json

# `dfs` is indexed by corpus name further down; each entry is filtered on an
# 'approach' column and carries per-row 'topic_data'.
# NOTE(review): load_preprocessed_reconstruction_or_from_cache is not among the
# names imported in this cell -- confirm it is defined/imported somewhere,
# otherwise this line raises NameError on a fresh kernel (Restart & Run All).
dfs = load_preprocessed_reconstruction_or_from_cache()

CPU times: user 19 µs, sys: 0 ns, total: 19 µs
Wall time: 21.5 µs


### Load the data

In [37]:
def load_data_for_approach(corpus, approach):
    """Collect per-topic ground truth and predictions for one approach.

    'MinResiduals' and 'MaxResiduals' are not stored as approaches of their
    own: both are read from the stored 'Residuals' approach, taking the
    'lower' and 'upper' prediction field respectively. Every other approach
    is looked up under its own name and uses the 'actual' prediction field.

    Parameters
    ----------
    corpus : key into the module-level ``dfs`` mapping.
    approach : approach name, or one of the two residual aliases above.

    Returns
    -------
    (ground_truth, actual) : two lists of equal length, one entry per element
        of ``topic_data`` over all matching rows, in iteration order.
    """
    # alias -> (stored approach name, prediction field to read)
    residual_aliases = {
        'MinResiduals': ('Residuals', 'lower'),
        'MaxResiduals': ('Residuals', 'upper'),
    }
    approach, field_to_use = residual_aliases.get(approach, (approach, 'actual'))

    corpus_df = dfs[corpus]
    matching_rows = corpus_df[corpus_df['approach'] == approach]

    ground_truth = []
    actual = []
    for _, record in matching_rows.iterrows():
        for topic in record['topic_data']:
            ground_truth.append(topic['ground_truth'])
            actual.append(topic['prediction'][field_to_use])

    return ground_truth, actual

def p_value(ground_truth, a, b):
    """Return the t-test p-value between the squared errors of ``a`` and ``b``.

    Both prediction sequences are turned into per-element squared errors
    against ``ground_truth``; the two error samples are then compared with
    ``scipy.stats.ttest_ind``. Only the "squared" part of RMSE is applied here
    because the root-mean part is the aggregation that happens later.

    NOTE(review): ``ttest_ind`` treats the two samples as independent, while
    the inputs here are aligned position by position against the same
    ground truth -- confirm that an unpaired test is what is intended.

    Raises
    ------
    ValueError
        If the three sequences do not all have the same length.
    """
    size = len(ground_truth)
    if not (size == len(a) and len(a) == len(b)):
        raise ValueError('Incompatible lengths...')

    def squared_errors(predictions):
        # element-wise (prediction - truth)^2, indexed exactly like the inputs
        return [pow(predictions[i] - ground_truth[i], 2) for i in range(size)]

    _, p = ttest_ind(squared_errors(a), squared_errors(b))

    return p

def error_distribution(min_value, max_value, actual, predicted, normalized):
    """Apply ``calculate_error`` position by position and return the results.

    For every index ``i`` the helper is called as
    ``calculate_error(min_value[i], max_value[i], actual[i], predicted[i], normalized)``;
    what the helper computes is defined in ``reconstruction_evaluation`` and
    is not visible here.

    Raises
    ------
    ValueError
        If the four sequences do not all have the same length.
    """
    lengths_agree = (
        len(min_value) == len(max_value)
        and len(min_value) == len(actual)
        and len(min_value) == len(predicted)
    )
    if not lengths_agree:
        raise ValueError('Incompatible lengths...')

    return [
        calculate_error(min_value[idx], max_value[idx], actual[idx], predicted[idx], normalized)
        for idx in range(len(actual))
    ]

def _rmse(y_true, y_pred):
    """Root mean squared error of ``y_pred`` against ``y_true``.

    Computed as ``sqrt(MSE)`` rather than ``mean_squared_error(..., squared=False)``:
    the ``squared`` keyword is deprecated in scikit-learn 1.4 and removed in
    1.6, whereas the explicit square root works on every version.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))


def load_result_df(corpus):
    """Build the per-approach result frame for one corpus.

    For every approach the frame holds one row with:

    * ``Lower``  -- RMSE after clamping predictions with
      ``np.maximum(prediction, ground_truth)``, so predictions below the
      ground truth contribute zero error.
    * ``Actual`` -- RMSE of the unmodified predictions.
    * ``Upper``  -- RMSE after clamping with ``np.minimum(prediction, ground_truth)``,
      so predictions above the ground truth contribute zero error.
    * for each of the three, the ``p_value`` against the MinResiduals,
      Condensed and MaxResiduals baselines (baselines are clamped the same
      way as the approach they are compared with).
    * ``Errors`` / ``Errors (Normalized)`` -- the per-topic output of
      ``error_distribution`` with the MinResiduals / MaxResiduals predictions
      passed as min / max values.

    Column order is: Approach, Corpus, then Lower, Actual, Upper (each
    followed by its three p-value columns), then the two error columns.

    Raises
    ------
    ValueError
        Propagated from ``p_value`` / ``error_distribution`` when an approach
        does not yield as many predictions as the baselines.
    """
    approaches = [
        'MinResiduals', 'MaxResiduals', 'Condensed', 'PBS-R-ML', 'PBS-P-ML', 'PBS-RP-ML',
        'Min-PBS-R-75', 'Min-PBS-P-75', 'Min-PBS-RP-75',
        'Min-PBS-R-90', 'Min-PBS-P-90', 'Min-PBS-RP-90',
        'Min-PBS-R-95', 'Min-PBS-P-95', 'Min-PBS-RP-95',
    ]

    _, actual_min_residuals = load_data_for_approach(corpus, 'MinResiduals')
    _, actual_condensed = load_data_for_approach(corpus, 'Condensed')
    _, actual_max_residuals = load_data_for_approach(corpus, 'MaxResiduals')

    # Order matters: it defines the order of the p-value columns.
    baselines = [
        ('MinResiduals', actual_min_residuals),
        ('Condensed', actual_condensed),
        ('MaxResiduals', actual_max_residuals),
    ]

    # Order matters: it defines the order of the column groups.
    # Each entry maps (predictions, ground_truth) -> predictions to evaluate.
    variants = [
        ('Lower', np.maximum),
        ('Actual', lambda predictions, _truth: predictions),
        ('Upper', np.minimum),
    ]

    ret = []
    for approach in approaches:
        ground_truth, actual = load_data_for_approach(corpus, approach)

        record = {'Approach': approach, 'Corpus': corpus}

        for label, clamp in variants:
            # Clamp once per variant and reuse for the RMSE and all p-values.
            clamped = clamp(actual, ground_truth)
            record[label] = _rmse(ground_truth, clamped)

            for baseline_name, baseline_actual in baselines:
                # The baseline is clamped against this approach's ground truth,
                # exactly like the approach itself.
                record[label + ' (P Value To ' + baseline_name + ')'] = p_value(
                    ground_truth, clamped, clamp(baseline_actual, ground_truth)
                )

        record['Errors'] = error_distribution(
            actual_min_residuals, actual_max_residuals, ground_truth, actual, False
        )
        record['Errors (Normalized)'] = error_distribution(
            actual_min_residuals, actual_max_residuals, ground_truth, actual, True
        )

        ret.append(record)

    return pd.DataFrame(ret)

# One result frame per corpus, stacked in the order the table columns use.
# Each per-corpus frame keeps its own 0..n index, so index labels repeat in
# `df`; rows are selected later by the 'Approach' and 'Corpus' columns.
df = pd.concat([load_result_df(corpus) for corpus in ['Robust04', 'CW09', 'CW12']])

# Verify that the structure of the table is as expected.
df.head(2)

Unnamed: 0,Approach,Corpus,Lower,Lower (P Value To MinResiduals),Lower (P Value To Condensed),Lower (P Value To MaxResiduals),Actual,Actual (P Value To MinResiduals),Actual (P Value To Condensed),Actual (P Value To MaxResiduals),Upper,Upper (P Value To MinResiduals),Upper (P Value To Condensed),Upper (P Value To MaxResiduals),Errors,Errors (Normalized)
0,MinResiduals,Robust04,0.004222,1.0,5.976201e-146,0.0,0.057941,1.0,7.03899e-08,0.0,0.057787,1.0,1.604796e-42,1.931844e-74,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0.0, 0, 0, 0.0, 0.0, 0.0, 0..."
1,MaxResiduals,Robust04,0.210111,0.0,0.0,1.0,0.210118,0.0,0.0,1.0,0.001746,1.931844e-74,9.519509e-45,1.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.087471714101...","[0, 0, 0, 0, 0, 0, -1.0, 0, 0, -1.0, -1.0, -1...."


### Create The Table

In [6]:
def cell(approach, corpus, field):
    """Render one table cell as LaTeX: the value plus significance markers.

    The row of ``df`` matching ``approach`` and ``corpus`` is looked up, and
    ``field`` ('Lower', 'Actual' or 'Upper') is formatted with three decimals
    and without leading zeros. For each baseline whose p-value column is below
    ``P_VALUE_BONFERONNI_CORRECTED`` the baseline's symbol is shown as a
    superscript; the remaining symbols are emitted inside ``\\phantom`` so
    that every cell reserves the same width.

    Raises
    ------
    ValueError
        If ``df`` does not contain exactly one row for the combination.
    """
    matches = df[(df['Approach'] == approach) & (df['Corpus'] == corpus)]
    if len(matches) != 1:
        raise ValueError('Can not happen')

    record = matches.iloc[0].to_dict()

    # baseline name -> LaTeX marker, in the order the markers are printed
    markers = {
        'MinResiduals': '\\dagger',
        'Condensed': '\\ast',
        'MaxResiduals': '\\ddagger',
    }

    visible = []
    hidden = []
    for baseline, symbol in markers.items():
        is_significant = record[field + ' (P Value To ' + baseline + ')'] < P_VALUE_BONFERONNI_CORRECTED
        (visible if is_significant else hidden).append(symbol)

    formatted_value = "{:.3f}".format(record[field]).lstrip('0')

    return formatted_value + '$^{' + ''.join(visible) + '\\phantom{' + ''.join(hidden) + '}}$'

def row(approach):
    """Return the nine LaTeX cells of one table row for ``approach``.

    Cells are ordered corpus-major (Robust04, CW09, CW12) and, within a
    corpus, Lower / Actual / Upper. Every cell is prefixed with ' & ', so the
    result can be appended directly to the row label.
    """
    corpora = ['Robust04', 'CW09', 'CW12']
    fields = ['Lower', 'Actual', 'Upper']

    return ''.join(
        ' & ' + cell(approach, corpus, field)
        for corpus in corpora
        for field in fields
    )

def table():
    """Return the LaTeX source of the per-topic RMSE table.

    The result is a ``table*`` environment with a label column followed by
    nine value columns: Lower / Actual / Upper RMSE for Robust04, CW09 and
    CW12. Each data row is the row label concatenated with ``row(<approach>)``.
    Rows are grouped by ``midrule``: the three baselines (Min Res., Cond.
    Lists, Max Res.), the ML variants, and the 75 / 90 / 95 variants.

    TODO: the caption still starts with the placeholder text "TBD.".
    """
    return """\\begin{table*}[t]
\\caption{TBD. We report statistical significance according to students t-test with Bonferroni correction at p=0.05 to Min-Residuals ($\\dagger$), Condensed Lists ($\\ast$), and Max-Residuals ($\\ddagger$).}
\\label{table-per-topic-rmse-effectiveness}
\\renewcommand{\\tabcolsep}{3.8pt} 
\\centering
\\small

\\begin{tabular}{@{}l@{\\hspace{2em}}c@{\\hspace{.5em}}c@{\\hspace{.5em}}c@{\\hspace{2em}}c@{\\hspace{.5em}}c@{\\hspace{.5em}}c@{\\hspace{2em}}c@{\\hspace{.5em}}c@{\\hspace{.5em}}c@{}}
\\toprule
& \\multicolumn{3}{c}{RMSE on Robust04} & \\multicolumn{3}{c}{RMSE on CW09} & \\multicolumn{3}{c}{RMSE on CW12} \\\\
\\cmidrule(r{1em}){2-4} \\cmidrule(r{1em}){5-7} \\cmidrule{8-10}

 & Lower                & Actual         & Upper   & Lower                & Actual         & Upper & Lower                & Actual         & Upper             \\\\
\\midrule
Min Res. """ + row('MinResiduals') + """\\\\
Cond. Lists """ + row('Condensed') + """\\\\
Max Res. """ + row('MaxResiduals') + """\\\\

\\midrule

BS (R) """ + row('PBS-R-ML') + """\\\\
BS (P) """ + row('PBS-P-ML') + """\\\\
BS (R+P) """ + row('PBS-RP-ML') + """\\\\

\\midrule

BS (R, 75) """ + row('Min-PBS-R-75') + """\\\\
BS (P, 75) """ + row('Min-PBS-P-75') + """\\\\
BS (R+P, 75) """ + row('Min-PBS-RP-75') + """\\\\

\\midrule

BS (R, 90) """ + row('Min-PBS-R-90') + """\\\\
BS (P, 90) """ + row('Min-PBS-P-90') + """\\\\
BS (R+P, 90) """ + row('Min-PBS-RP-90') + """\\\\

\\midrule

BS (R, 95) """ + row('Min-PBS-R-95') + """\\\\
BS (P, 95) """ + row('Min-PBS-P-95') + """\\\\
BS (R+P, 95) """ + row('Min-PBS-RP-95') + """\\\\

\\bottomrule
\\end{tabular} 
\\end{table*}
"""

# print() rather than a bare expression: the cell output then shows single
# backslashes and can be pasted straight into a .tex file.
print(table())

\begin{table*}[t]
\caption{TBD. We report statistical significance according to students t-test with Bonferroni correction at p=0.05 to Min-Residuals ($\dagger$), Condensed Lists ($\ast$), and Max-Residuals ($\ddagger$).}
\label{table-per-topic-rmse-effectiveness}
\renewcommand{\tabcolsep}{3.8pt} 
\centering
\small

\begin{tabular}{@{}l@{\hspace{2em}}c@{\hspace{.5em}}c@{\hspace{.5em}}c@{\hspace{2em}}c@{\hspace{.5em}}c@{\hspace{.5em}}c@{\hspace{2em}}c@{\hspace{.5em}}c@{\hspace{.5em}}c@{}}
\toprule
& \multicolumn{3}{c}{RMSE on Robust04} & \multicolumn{3}{c}{RMSE on CW09} & \multicolumn{3}{c}{RMSE on CW12} \\
\cmidrule(r{1em}){2-4} \cmidrule(r{1em}){5-7} \cmidrule{8-10}

 & Lower                & Actual         & Upper   & Lower                & Actual         & Upper & Lower                & Actual         & Upper             \\
\midrule
Min Res.  & .004$^{\ast\ddagger\phantom{\dagger}}$ & .058$^{\ast\ddagger\phantom{\dagger}}$ & .058$^{\ast\ddagger\phantom{\dagger}}$ & .009$^{\ast\ddagg