In [1]:
import pandas as pd

In [2]:
# Load the experiment results table from the CSV file in the working directory.
results = pd.read_csv(filepath_or_buffer='results.csv')

In [4]:
# Show the column labels of the loaded frame to see which fields are available.
results.columns

Index(['timestamp', 'scale', 'Aagg', 'Agb', 'agg', 'predicate_strategy',
       'predicate_size', 'g1', 'g2', 'gs1', 'gs2', 'k', 'gamma', 'rho_query',
       'rho_expl', 'rho_topk', 'rho_influ', 'rho_rank', 'error_nq',
       'error_topk', 'error_infci', 'error_rnkci', 't_phase_1', 't_phase_2',
       't_phase_3', 'runtime', 'mings'],
      dtype='object')

In [125]:
def print_aggregated_result(aggregated_results, highlighter = None):
    r"""Print an aggregated results frame as LaTeX table rows.

    One line is printed per row, in the form ``& c1 & c2 & ... \\``. The
    cells are the index level(s) of ``aggregated_results`` (turned into
    columns via ``reset_index``) followed by the four error columns and
    ``runtime``.

    Parameters
    ----------
    aggregated_results : pandas.DataFrame
        Must contain the columns ``error_nq``, ``error_topk``,
        ``error_infci``, ``error_rnkci`` and ``runtime``. The frame is NOT
        modified; formatting is applied to a copy.
    highlighter : dict, optional
        Mapping with keys ``'attr'`` (a column or index-level name) and
        ``'val'``. Every row whose ``attr`` cell equals ``val`` has all of its
        cells wrapped in ``\textbf{...}``. The comparison happens after
        formatting, so for the metric columns ``val`` must be the formatted
        string.

    Raises
    ------
    KeyError
        If one of the required columns (or ``highlighter['attr']``) is missing.
    """
    error_cols = ['error_nq', 'error_topk', 'error_infci', 'error_rnkci']

    # Copy only the reported columns so the caller's frame keeps its numeric
    # values (assigning formatted strings in place would silently corrupt it).
    table = aggregated_results[error_cols + ['runtime']].copy()

    # Error columns are rendered as percentages with an escaped LaTeX '%'.
    for col in error_cols:
        table[col] = table[col].apply(lambda value: f'{value * 100:.2f}\\%')
    # Runtime is rounded to a whole number and suffixed with 's'.
    table['runtime'] = table['runtime'].apply(lambda value: f'{value:.0f}s')

    # Move the group key(s) out of the index so they become the leading cells.
    table = table.reset_index()

    for _, row in table.iterrows():
        cells = [f'{x}' for x in row.tolist()]
        if highlighter and row[highlighter['attr']] == highlighter['val']:
            cells = [f'\\textbf{{{cell}}}' for cell in cells]
        print('& ' + ' & '.join(cells) + ' \\\\')
            
def quick_check(results, param):
    r"""Print a LaTeX table fragment with per-group medians for ``param``.

    Emits a ``\midrule`` line, a ``\multirow{3}{*}{<param>}`` line, then one
    formatted row per group (via ``print_aggregated_result``), then a blank
    line.

    Parameters
    ----------
    results : pandas.DataFrame
        Raw results; grouped by ``param`` and reduced with the median.
    param : str or list of str
        Column name(s) to group by. The value is interpolated verbatim into
        the ``\multirow`` label.
    """
    # Backslashes are doubled: '\m' is not a valid escape sequence and triggers
    # a DeprecationWarning/SyntaxWarning. The printed text is unchanged.
    print('\t\\midrule')
    print(f'\t\\multirow{{3}}{{*}}{{{param}}}')
    # numeric_only=True keeps string columns out of the median; without it
    # pandas >= 2.0 raises TypeError on object-dtype columns.
    print_aggregated_result(results.groupby(param).median(numeric_only=True))
    print()
    
def quick_check_full(results):
    """Run ``quick_check`` on ``results`` once per parameter column, in a fixed order."""
    for column in (
        'rho_query',
        'rho_topk',
        'rho_influ',
        'rho_rank',
        'gamma',
        'k',
        'split_factor',
        'predicate_strategy',
        'scale',
    ):
        quick_check(results, column)

In [126]:
# Reload `results` from the per-question results file, then print the
# per-(Agb, g1, g2) medians as LaTeX rows.
results = pd.read_csv(filepath_or_buffer='results_questions.csv')
quick_check(results, param=['Agb', 'g1', 'g2'])

	\midrule
	\multirow{3}{*}{['Agb', 'g1', 'g2']}
& AGE & (40, 50] & (30, 40] & 0.26\% & 2.40\% & 5.93\% & 45.29\% & 27s \\
& AGE & (60, 70] & (20, 30] & 0.13\% & 0.00\% & 1.32\% & 2.69\% & 21s \\
& CITIZEN & Born in U.S & Not a citizen & 0.10\% & 0.00\% & 1.50\% & 25.40\% & 60s \\
& CITIZEN & Naturalized citizen & Born in U.S & 1.37\% & 0.00\% & 1.58\% & 21.50\% & 60s \\
& CLASSWKR & Federal government employee & State government employee & 0.41\% & 25.41\% & 27.88\% & 84.42\% & 6s \\
& CLASSWKR & Wage/salary, private & Self-employed, not incorporated & 0.34\% & 0.00\% & 3.09\% & 70.67\% & 38s \\
& EDUC & Bachelor's degree & High school diploma or equivalent & 0.03\% & 0.00\% & 1.47\% & 24.82\% & 31s \\
& EDUC & Master's degree & Bachelor's degree & 0.14\% & 2.48\% & 8.90\% & 79.16\% & 17s \\
& MARST & Divorced & Never married/single & 0.08\% & 0.00\% & 1.24\% & 45.68\% & 26s \\
& MARST & Married, spouse present & Never married/single & 0.02\% & 0.00\% & 0.43\% & 1.98\% & 55s \\
& RACE 

In [108]:
# Median metrics per Agb value, with the SEX row printed in bold.
# numeric_only=True: string columns would make median() raise on pandas >= 2.0.
print_aggregated_result(
    results.groupby('Agb').median(numeric_only=True),
    {'attr': 'Agb', 'val': 'SEX'},
)

& AGE & 0.27\% & 1.34\% & 5.85\% & 54.37\% & 26s \\
& CITIZEN & 0.11\% & 0.00\% & 1.45\% & 27.08\% & 59s \\
& CLASSWKR & 0.36\% & 0.00\% & 2.99\% & 66.06\% & 35s \\
& MARST & 0.02\% & 0.00\% & 0.42\% & 1.98\% & 51s \\
& RACE & 0.08\% & 0.00\% & 1.51\% & 4.89\% & 51s \\
& RELATE & 0.03\% & 0.00\% & 0.91\% & 1.96\% & 37s \\
& \textbf{SEX} & \textbf{0.02\%} & \textbf{0.00\%} & \textbf{0.37\%} & \textbf{1.38\%} & \textbf{65s} \\


In [109]:
# Unformatted per-Agb medians of the reported metrics. The columns are selected
# before aggregating so string columns never reach median() (TypeError on
# pandas >= 2.0); the resulting table is the same.
results.groupby('Agb')[['error_nq', 'error_topk', 'error_infci', 'error_rnkci', 'runtime']].median()

Unnamed: 0_level_0,error_nq,error_topk,error_infci,error_rnkci,runtime
Agb,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AGE,0.002654,0.013428,0.058489,0.543697,26.266555
CITIZEN,0.00109,0.0,0.014522,0.270796,58.550568
CLASSWKR,0.003641,0.0,0.029913,0.660577,35.070312
MARST,0.000232,0.0,0.004154,0.01982,50.85492
RACE,0.000803,0.0,0.015081,0.048889,50.96019
RELATE,0.000258,0.0,0.009088,0.019588,37.496479
SEX,0.000241,0.0,0.003666,0.013793,65.203422


In [122]:
# Load the control-experiment results used by the parameter sweeps below.
results_control = pd.read_csv(filepath_or_buffer='results_control.csv')

In [111]:
# Median metrics for the CNT and SUM aggregates only.
# numeric_only=True: string columns would make median() raise on pandas >= 2.0.
print_aggregated_result(
    results_control.query('agg == "CNT" or agg == "SUM"')
    .groupby('agg')
    .median(numeric_only=True)
)

& CNT & 6.22\% & 0.00\% & 0.07\% & 1.03\% & 63s \\
& SUM & 7.91\% & 0.00\% & 0.18\% & 0.69\% & 63s \\


In [112]:
# Median metrics per rho_query setting.
# numeric_only=True: string columns would make median() raise on pandas >= 2.0.
print_aggregated_result(results_control.groupby('rho_query').median(numeric_only=True))

& 0.01 & 0.08\% & 0.00\% & 0.36\% & 1.38\% & 65s \\
& 0.1 & 0.02\% & 0.00\% & 0.37\% & 1.38\% & 65s \\
& 1.0 & 0.01\% & 0.00\% & 0.37\% & 1.38\% & 65s \\


In [113]:
# Median metrics per rho_topk setting.
# numeric_only=True: string columns would make median() raise on pandas >= 2.0.
print_aggregated_result(results_control.groupby('rho_topk').median(numeric_only=True))

& 0.1 & 0.02\% & 0.00\% & 0.36\% & 1.38\% & 62s \\
& 0.5 & 0.02\% & 0.00\% & 0.37\% & 1.38\% & 65s \\
& 2.0 & 0.02\% & 0.00\% & 0.36\% & 1.38\% & 65s \\


In [121]:
# Median metrics per rho_influ setting.
# numeric_only=True: string columns would make median() raise on pandas >= 2.0.
print_aggregated_result(results_control.groupby('rho_influ').median(numeric_only=True))

& 0.1 & 0.02\% & 0.00\% & 0.82\% & 1.38\% & 65s \\
& 0.5 & 0.02\% & 0.00\% & 0.37\% & 1.38\% & 65s \\
& 2.0 & 0.02\% & 0.00\% & 0.19\% & 1.38\% & 65s \\


In [115]:
# Median metrics per rho_rank setting.
# numeric_only=True: string columns would make median() raise on pandas >= 2.0.
print_aggregated_result(results_control.groupby('rho_rank').median(numeric_only=True))

& 0.1 & 0.02\% & 0.00\% & 0.37\% & 1.90\% & 65s \\
& 1.0 & 0.02\% & 0.00\% & 0.37\% & 1.38\% & 65s \\
& 10.0 & 0.02\% & 0.00\% & 0.37\% & 1.03\% & 65s \\


In [116]:
# Median metrics per gamma setting.
# numeric_only=True: string columns would make median() raise on pandas >= 2.0.
print_aggregated_result(results_control.groupby('gamma').median(numeric_only=True))

& 0.9 & 0.02\% & 0.00\% & 0.34\% & 1.38\% & 65s \\
& 0.95 & 0.02\% & 0.00\% & 0.37\% & 1.38\% & 65s \\
& 0.99 & 0.03\% & 0.00\% & 0.41\% & 1.38\% & 65s \\


In [117]:
# Median metrics per scale setting.
# numeric_only=True: string columns would make median() raise on pandas >= 2.0.
print_aggregated_result(results_control.groupby('scale').median(numeric_only=True))

& 0.1 & 0.23\% & 0.00\% & 3.63\% & 64.05\% & 7s \\
& 0.5 & 0.05\% & 0.00\% & 0.73\% & 1.38\% & 31s \\
& 1.0 & 0.02\% & 0.00\% & 0.37\% & 1.38\% & 65s \\


In [118]:
# Median metrics per split_factor setting.
# numeric_only=True: string columns would make median() raise on pandas >= 2.0.
print_aggregated_result(results_control.groupby('split_factor').median(numeric_only=True))

& 0.1 & 0.02\% & 0.00\% & 0.36\% & 1.72\% & 65s \\
& 0.5 & 0.02\% & 0.00\% & 0.36\% & 1.38\% & 65s \\
& 0.9 & 0.02\% & 0.00\% & 0.37\% & 1.38\% & 65s \\


In [119]:
# Median metrics per predicate_strategy value.
# numeric_only=True: string columns would make median() raise on pandas >= 2.0.
print_aggregated_result(results_control.groupby('predicate_strategy').median(numeric_only=True))

& 1-way marginal & 0.02\% & 0.00\% & 0.37\% & 1.38\% & 65s \\
& 2-way marginal & 0.02\% & 0.00\% & 0.41\% & 0.02\% & 2957s \\
& 3-way marginal & 0.02\% & 0.00\% & 0.50\% & 0.00\% & 64358s \\


In [124]:
# Median metrics per k setting.
# numeric_only=True: string columns would make median() raise on pandas >= 2.0.
print_aggregated_result(results_control.groupby('k').median(numeric_only=True))

& 5 & 0.02\% & 0.00\% & 0.37\% & 1.38\% & 65s \\
& 10 & 0.02\% & 0.00\% & 0.56\% & 17.33\% & 70s \\
& 20 & 0.02\% & 0.51\% & 0.86\% & 48.32\% & 78s \\
