In [None]:
# imports
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import numpy as np
from sklearn.metrics import roc_curve

# add latex
import os
# Location of the TeX Live binaries used for LaTeX text rendering. It can be overridden
# with the TEXLIVE_BIN environment variable; the membership test keeps this cell
# idempotent, so re-running it does not keep growing PATH.
texlive_bin = os.environ.get('TEXLIVE_BIN', '/home/sundar/texlive/2022/bin/x86_64-linux')
if texlive_bin not in os.environ.get("PATH", "").split(os.pathsep):
    os.environ["PATH"] = os.pathsep.join(filter(None, [os.environ.get("PATH", ""), texlive_bin]))

# Add directory above current directory to path
import sys; sys.path.insert(0, '..')
from load_data import *

# graphing: render all text through LaTeX with the Libertine fonts, sans-serif family
plt.rcParams['font.size'] = 14
plt.rcParams['text.usetex'] = True
plt.rcParams['text.latex.preamble'] = r'\usepackage{libertine}\usepackage[libertine]{newtxmath} \usepackage{sfmath}'
plt.rcParams['font.family'] = 'sans-serif'

# number of repetitions read per experiment, and one colour per attack
reps = 500
attack_colors = ["#c159a1", "#6d392e", "#9b9c07"]

# where results are read from and where figures are written (created on demand)
data_dir = 'results/'
img_dir = data_dir + '/images'
save_img = True
os.makedirs(img_dir, exist_ok=True)

# per-series colours and scatter markers
colors = ['black', '#5bdb5f', '#0080ffff', '#f10800ff', '#ff9400ff', '#00e4f8ff']
markers = ['+', '1', 6, '*', 'd', 's']
# full model list, currently disabled:
# synth_models = ['NonPrivate', 'BayNet_3parents', 'RAP_2Kiters', 'CTGAN', 'IndHist']
# synth_model_labels = ['NonPrivate', 'BayNet', 'RAP', 'CTGAN', 'IndHist']
synth_models = ['NonPrivate']
synth_model_labels = ['NonPrivate']

# synthetic data sizes 10^1 .. 10^6 with matching LaTeX labels
synth_sizes = [10 ** exponent for exponent in range(1, 7)]
synth_size_labels = [f'$10^{exponent}$' for exponent in range(1, 7)]

# DP plots does not have 10^1
dp_synth_indices = [1, 3, 5]
dp_synth_size_labels = [synth_size_labels[idx] for idx in dp_synth_indices]

data_names = ['acs', 'fire']
data_labels = ['ACS', 'FIRE']

def get_attack(attack_name, results=None):
    """Select the rows of the privacy results that belong to one attack.

    Every attack is restricted to ``k == 3`` and ``n_queries == -1``; the
    reconstruction attack is additionally restricted to ``scale_type == 'cond'``.

    Args:
        attack_name: One of ``'recon'``, ``'dcr'`` or ``'inference'``.
        results: DataFrame to filter. Defaults to the module-level
            ``result_privacy`` frame, which is looked up at call time.

    Returns:
        The subset of ``results`` matching the attack's filter.

    Raises:
        ValueError: If ``attack_name`` is not one of the supported names
            (previously the function silently returned ``None``).
    """
    # name used by callers -> value stored in the 'attack_name' column
    stored_names = {'recon': 'recon', 'dcr': 'dcr', 'inference': 'infer'}
    if attack_name not in stored_names:
        raise ValueError(
            f'unknown attack_name {attack_name!r}; expected one of {sorted(stored_names)}'
        )
    if results is None:
        results = result_privacy

    mask = (
        (results['attack_name'] == stored_names[attack_name]) &
        (results['k'] == 3) &
        (results['n_queries'] == -1)
    )
    if attack_name == 'recon':
        # only the reconstruction attack distinguishes query scale types
        mask = mask & (results['scale_type'] == 'cond')
    return results[mask]

# Load the attack and utility results; utility rows are limited to the plotted sizes.
result_privacy = pd.read_csv(f'{data_dir}/results_privacy.csv')
result_utility = pd.read_csv(f'{data_dir}/results_utility.csv').loc[
    lambda frame: frame['synth_size'].isin(synth_sizes)
]

In [None]:
# plot utility (Avg 3-TVD & mean relative error): one row per metric, one column per dataset
tasks = [('avg_tvd', '3-TVD'), ('rel_mean', '$\\text{MRE}_{> 10}$')]
fig, axs = plt.subplots(len(tasks), 2)
first_label = True  # legend entries are taken from the first panel only
for i, (data_name, data_label) in enumerate(zip(data_names, data_labels)):
    result_data = result_utility[result_utility['data_name'] == data_name]
    for j, (task, task_label) in enumerate(tasks):
        ax = axs[j, i]
        for synth_model, color, synth_model_label in zip(synth_models, colors, synth_model_labels):
            curr_result = result_data[result_data['synth_model'] == synth_model]
            ax.plot(
                curr_result['synth_size'].to_numpy(),
                curr_result[task].to_numpy(),
                color=color,
                marker='o',
                label=synth_model_label if first_label else None,
            )
        ax.set_xscale('log')
        ax.set_xlabel('Synthetic data size ($m$)')

        # fixed y-range per metric, always with six evenly spaced ticks
        y_top = {'avg_tvd': 1.0, 'rel_mean': 1.5, 'f1_diff': 0.5}.get(task)
        if y_top is not None:
            ax.set_ylim(0.0, y_top)
            ax.set_yticks(np.linspace(0.0, y_top, 6))
        ax.set_xticks(synth_sizes)

        ax.set_ylabel(f'Error ({task_label})')
        ax.set_title(data_label)

        # aspect = 1 / data ratio
        ax.set_aspect(1.0/ax.get_data_ratio(), adjustable='box')

        first_label = False

# one shared legend below the panels
fig.legend(loc='lower center', ncol=3, bbox_to_anchor=(0.5, -0.05))
fig.set_size_inches(10, len(tasks) * 5)
plt.subplots_adjust(wspace=0.3, hspace=0.3)

if save_img:
    fig.savefig(f'{img_dir}/utility.pdf', bbox_inches='tight')

In [None]:
# plot privacy (only recon adversary): attack accuracy against synthetic data size
fig, axs = plt.subplots(1, 2)
result_recon = get_attack('recon')

first_label = True  # legend entries are taken from the first panel only
for i, (data_name, data_label) in enumerate(zip(data_names, data_labels)):
    result_recon_data = result_recon[result_recon['data_name'] == data_name]
    ax = axs.flat[i]
    for synth_model, color, synth_model_label in zip(synth_models, colors, synth_model_labels):
        curr_result = result_recon_data[result_recon_data['synth_model'] == synth_model]
        ax.plot(curr_result['synth_size'].to_numpy(), curr_result['acc'].to_numpy() * 100, color=color, marker='o', label=synth_model_label if first_label else None)
    # 50% baseline drawn over the configured sizes, not over whatever `curr_result`
    # the inner loop left behind (which is undefined when `synth_models` is empty).
    ax.plot(synth_sizes, np.full(len(synth_sizes), 50.0), color='black', linestyle='--', label='Random baseline' if first_label else None)
    ax.set_xscale('log')
    ax.set_xticks(synth_sizes)
    ax.set_xlabel('Synthetic data size ($m$)')
    ax.set_ylim(43, 105)
    # raw string: '\%' in a normal string literal is an invalid escape sequence
    ax.set_ylabel(r'Attack accuracy (\%)')
    ax.set_title(data_label)

    ax.set_aspect(1.0/ax.get_data_ratio(), adjustable='box')

    first_label = False

# one shared legend below the panels
fig.legend(loc='lower center', ncol=3, bbox_to_anchor=(0.5, -0.15))
fig.set_size_inches(10, 5)

if save_img:
    fig.savefig(f'{img_dir}/privacy_recon.pdf', bbox_inches='tight')

In [None]:
# plot privacy ROC (only recon adversary)
fig, axs = plt.subplots(1, 2)
synth_size = '1M'  # directory name of the synthetic data size whose solutions are loaded
for i, (data_name, data_label) in enumerate(zip(data_names, data_labels)):
    ax = axs.flat[i]

    # The row index from user.csv and the true secret bit depend only on the
    # repetition, so they are read once per dataset instead of once per model.
    secret_col = get_default_secret_bit(data_name)
    users = []
    y_true = []
    for rep in range(reps):
        rep_dir = f'{data_dir}/{data_name}/reps/rep_{rep}/'
        user = int(np.genfromtxt(f'{rep_dir}/user.csv'))
        df = pd.read_csv(f'{rep_dir}/df.csv.gz', compression='gzip')
        users.append(user)
        y_true.append(df.iloc[user, df.columns.get_loc(secret_col)])

    for synth_model, color, synth_model_label in zip(synth_models, colors, synth_model_labels):
        y_score = []
        for rep, user in enumerate(users):
            rep_dir = f'{data_dir}/{data_name}/reps/rep_{rep}/'
            # context manager closes the .npz archive after the score is read
            with np.load(f'{rep_dir}/{synth_model}/{synth_size}/simple/3way_cond/sol_-1_recon.npz') as archive:
                y_score.append(archive['arr_0'][user])

        fpr, tpr, _ = roc_curve(y_true, y_score)
        ax.plot(fpr, tpr, color=color, label=synth_model_label if i == 0 else '')
    # chance diagonal, independent of the last model's `fpr`
    ax.plot([0, 1], [0, 1], color='black', linestyle='--', label='Random baseline' if i == 0 else '')
    ax.set_xlabel('False positive rate')
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.set_ylabel('True positive rate')
    ax.set_title(data_label)

    ax.set_aspect(1.0/ax.get_data_ratio(), adjustable='box')

# one shared legend below the panels
fig.legend(loc='lower center', ncol=3, bbox_to_anchor=(0.5, -0.15))

fig.set_size_inches(10, 5)

if save_img:
    fig.savefig(f'{img_dir}/privacy_recon_roc.pdf', bbox_inches='tight')

In [None]:
# plot recon vs dcr vs inference for 1M
def plot_recon_vs_dcr_vs_infer(synth_size, fig=None, axs=None, legend=True):
    """Grouped bar chart of recon / dcr / inference attack accuracy per synthetic model.

    One panel is drawn per dataset in ``data_names``; each group of bars belongs to
    one entry of ``synth_models`` and contains one bar per attack.

    Args:
        synth_size: Value of the ``synth_size`` column to plot.
        fig: Existing figure to draw into. A new 1x2 figure is created when either
            ``fig`` or ``axs`` is ``None``.
        axs: Indexable collection of axes, one per dataset.
        legend: When true, add a legend to the second panel and resize the figure.

    Returns:
        ``(fig, axs)``.
    """
    if fig is None or axs is None:
        fig, axs = plt.subplots(1, 2)
        axs = axs.flat

    # (attack name for get_attack, legend label, bar colour)
    attacks = [
        ('recon', '$\\textrm{Adv}_{recon}$', attack_colors[0]),
        ('dcr', '$\\textrm{Adv}_{dcr}$', attack_colors[1]),
        ('inference', '$\\textrm{Adv}_{infer}$', attack_colors[2]),
    ]
    attack_frames = [get_attack(name) for name, _, _ in attacks]

    pos = np.arange(len(synth_models))
    # Bar width follows the number of bars per group (plus one bar of spacing), so
    # groups never overlap regardless of how many models are configured.
    width = 1 / (len(attacks) + 1)

    for i, (data_name, data_label) in enumerate(zip(data_names, data_labels)):
        ax = axs[i]
        results = np.zeros((len(synth_models), len(attacks)))
        yerrs = np.zeros_like(results)
        for row, synth_model in enumerate(synth_models):
            for col, frame in enumerate(attack_frames):
                accs = frame[
                    (frame['data_name'] == data_name) &
                    (frame['synth_model'] == synth_model) &
                    (frame['synth_size'] == synth_size)
                ]['acc'].to_numpy()
                if len(accs) == 0:
                    # no result for this model/attack/size: leave a zero-height bar
                    continue
                results[row, col] = accs[0]
                # sqrt(p * (1 - p) / reps) error bar
                # NOTE(review): assumes each accuracy is a proportion over `reps` runs - confirm
                yerrs[row, col] = np.sqrt(accs[0] * (1 - accs[0]) / reps)

        for col, (_, task_label, color) in enumerate(attacks):
            ax.bar(pos + width * col, results[:, col] * 100, color=color, label=task_label, width=width, yerr=yerrs[:, col] * 100, capsize=2.5)
        xlim_left, xlim_right = ax.get_xlim()
        ax.plot([xlim_left, xlim_right], [50, 50], color='black', linestyle='--', label='Random baseline')
        ax.set_ylim(43, 105)
        # raw string: '\%' in a normal string literal is an invalid escape sequence
        ax.set_ylabel(r'Attack accuracy (\%)')
        ax.set_title(data_label)
        # tick under the centre of each group of bars
        ax.set_xticks(pos + width * (len(attacks) - 1) / 2)
        ax.set_xticklabels(synth_model_labels)

    if legend:
        axs[1].legend(bbox_to_anchor=(1.55, 1.03125))
        fig.set_size_inches(12, 5)

    return fig, axs

# draw the attack comparison for the 10^6 synthetic data size and optionally save it
fig, _ = plot_recon_vs_dcr_vs_infer(synth_size=1_000_000)
if save_img:
    fig.savefig(fname=f'{img_dir}/recon_vs_dcr_vs_infer.pdf', bbox_inches='tight')

In [None]:
# Attack comparison for every synthetic data size: one row per size, one column per
# dataset. Column count and titles come from `data_names` / `data_labels` so the
# grid stays correct if the dataset list changes.
fig, axs = plt.subplots(len(synth_sizes), len(data_names), squeeze=False)
for row_axs, synth_size, synth_size_label in zip(axs, synth_sizes, synth_size_labels):
    plot_recon_vs_dcr_vs_infer(synth_size, fig=fig, axs=row_axs, legend=False)
    # override the plain dataset titles set by the helper
    for ax, data_label in zip(row_axs, data_labels):
        ax.set_title(f'{data_label}, Synthetic data size = {synth_size_label}')

# single legend, attached to the top-right panel
axs[0, -1].legend(bbox_to_anchor=(1.45, 1.03125))
fig.set_size_inches(15, len(synth_sizes) * 6)

if save_img:
    fig.savefig(f'{img_dir}/recon_vs_dcr_vs_infer_full.pdf', bbox_inches='tight')

In [None]:
def plot_priv_util_tradeoff(combined_df, synth_models, synth_model_labels, synth_colors=colors):
    """Scatter attack accuracy against utility error, one panel per (metric, dataset).

    Args:
        combined_df: Frame with the columns ``data_name``, ``synth_model``,
            ``synth_size``, ``acc`` and one column per error metric
            (``avg_tvd``, ``rel_mean``).
        synth_models: Values of ``synth_model`` to plot.
        synth_model_labels: Legend label for each entry of ``synth_models``.
        synth_colors: Colour for each entry of ``synth_models``.

    Returns:
        The created figure. Rows are error metrics, columns are the datasets in
        ``data_names``; marker shape encodes the synthetic data size.
    """
    combined_df = combined_df[combined_df['synth_model'].isin(synth_models)]

    tasks = [('avg_tvd', '3-TVD'), ('rel_mean', '$\\text{MRE}_{> 10}$')]
    # task -> (upper x limit, x ticks)
    x_axes = {
        'avg_tvd': (1, [0, 0.2, 0.4, 0.6, 0.8, 1.0]),
        'rel_mean': (1.6, [0, 0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6]),
        'f1_diff': (0.5, [0, 0.1, 0.2, 0.3, 0.4, 0.5]),
    }

    fig, axs = plt.subplots(len(tasks), 2)
    for i, (data_name, data_label) in enumerate(zip(data_names, data_labels)):
        combined_df_data = combined_df[combined_df['data_name'] == data_name]
        for j, (task, task_label) in enumerate(tasks):
            ax = axs.flat[2 * j + i]
            for synth_size, marker in zip(synth_sizes, markers):
                curr_df = combined_df_data[combined_df_data['synth_size'] == synth_size]
                for synth_model, synth_model_label, color in zip(synth_models, synth_model_labels, synth_colors):
                    model_df = curr_df[curr_df['synth_model'] == synth_model]
                    ax.scatter(model_df[task], model_df['acc'] * 100, c=color, alpha=0.5, s=250, label=synth_model_label, marker=marker)

            ax.set_xlabel(f'Error ({task_label})')
            # raw string: '\%' in a normal string literal is an invalid escape sequence
            ax.set_ylabel(r'$\text{Acc}_{max} (\%)$')

            if task in x_axes:
                x_max, x_ticks = x_axes[task]
                ax.set_xlim(0, x_max)
                ax.set_xticks(x_ticks)
                if task != 'f1_diff':
                    # plot good privacy line
                    ax.plot([0, x_max], [60, 60], color='black', linestyle='--')

            ax.set_ylim(50, 100)
            ax.set_yticks([50, 60, 70, 80, 90, 100])
            ax.set_title(data_label)

            if task != 'f1_diff':
                # plot good utility line
                ax.plot([0.2, 0.2], [50, 100], color='black', linestyle='--')

    # Hand-built legend: one coloured dot per model, one black marker per size.
    handles = [
        mlines.Line2D([], [], color=color, marker='.', linestyle='None',
                      markersize=10, label=synth_model_label)
        for synth_model_label, color in zip(synth_model_labels, synth_colors)
    ]
    handles += [
        mlines.Line2D([], [], color='black', marker=marker, linestyle='None',
                      markersize=10, label=f'$m$ = {synth_size_label}')
        for synth_size_label, marker in zip(synth_size_labels, markers)
    ]

    # Labels containing '$' (the DP epsilon labels) get a legend anchor further right.
    if synth_model_labels and '$' in synth_model_labels[0]:
        axs.flat[1].legend(handles=handles, bbox_to_anchor=(1.45, 1.03125))
    else:
        axs.flat[1].legend(handles=handles, bbox_to_anchor=(1, 1.03125))
    fig.set_size_inches(10, len(tasks) * 5)
    plt.subplots_adjust(wspace=0.3, hspace=0.3)

    return fig

In [None]:
# Pool the three attacks and keep the column-wise maximum per (dataset, model, size),
# then join with the utility results and plot the trade-off.
result_recon, result_dcr, result_inference = (
    get_attack(name) for name in ('recon', 'dcr', 'inference')
)
result_attack = (
    pd.concat([result_recon, result_dcr, result_inference])
    .groupby(['data_name', 'synth_model', 'synth_size'])
    .max()
)

combined_df = result_utility.merge(result_attack, how='inner', on=['data_name', 'synth_model', 'synth_size'])

fig = plot_priv_util_tradeoff(combined_df, synth_models, synth_model_labels)
if save_img:
    fig.savefig(fname=f'{img_dir}/privacy_utility_tradeoff.pdf', bbox_inches='tight')

In [None]:
# Reload the utility results and re-apply the synthetic-size filter used when the
# frame was first loaded. Later cells index the rows of `result_utility` by position
# (via `dp_synth_indices`), which only lines up with `synth_sizes` on the filtered frame.
result_utility = pd.read_csv(f'{data_dir}/results_utility.csv')
result_utility = result_utility[result_utility['synth_size'].isin(synth_sizes)]

# one trade-off figure per attack
for attack in ['recon', 'dcr', 'inference']:
    result_attack = get_attack(attack)

    combined_df = pd.merge(result_utility, result_attack, how='inner', on=['data_name', 'synth_model', 'synth_size'])

    fig = plot_priv_util_tradeoff(combined_df, synth_models, synth_model_labels)
    if save_img:
        fig.savefig(f'{img_dir}/privacy_utility_tradeoff_{attack}.pdf', bbox_inches='tight')

In [None]:
def plot_acc_vs_eps_dp(synth_model):
    """Bar-plot the accuracy of each attack against epsilon for a DP synthetic model.

    One panel is drawn per dataset; each epsilon gets a group of three bars
    (recon, dcr, inference). For every (dataset, model, attack) the last row of the
    filtered results is used.

    Args:
        synth_model: ``'RAP_2Kiters'`` or ``'PrivBayes_3parents'``.

    Returns:
        The created figure.

    Raises:
        ValueError: If ``synth_model`` is not one of the supported names.
    """
    # synth_model -> (name of the non-DP counterpart, prefix of the DP model names)
    model_names = {
        'RAP_2Kiters': ('RAP_2Kiters', 'RAP_2Kiters'),
        'PrivBayes_3parents': ('BayNet_3parents', 'PrivBayes_3parents'),
    }
    if synth_model not in model_names:
        # validate before creating a figure so no empty figure is left behind
        raise ValueError(f'unknown synth_model {synth_model!r}; expected one of {sorted(model_names)}')
    nondp_name, dp_name = model_names[synth_model]

    attacks = [('recon', '$\\textrm{Adv}_{recon}$'), ('dcr', '$\\textrm{Adv}_{dcr}$'), ('inference', '$\\textrm{Adv}_{infer}$')]
    # the attack frames do not depend on dataset or epsilon: compute them once
    attack_frames = [get_attack(attack) for attack, _ in attacks]

    epses = [1, 10, 100]
    eps_labels = ['1', '10', '100']

    pos = np.arange(len(epses))
    width = 1 / (len(attacks) + 1)

    fig, axs = plt.subplots(1, 2)
    for i, (data_name, data_label) in enumerate(zip(data_names, data_labels)):
        ax = axs.flat[i]
        results = []
        for eps in epses:
            # eps == 1000 selects the non-DP counterpart; it is not in `epses` at present
            model_name = nondp_name if eps == 1000 else f'{dp_name}_{eps}eps'
            results.append([
                frame[(frame['data_name'] == data_name) & (frame['synth_model'] == model_name)]['acc'].to_numpy()[-1]
                for frame in attack_frames
            ])
        results = np.array(results)
        # sqrt(p * (1 - p) / reps) error bar
        # NOTE(review): assumes each accuracy is a proportion over `reps` runs - confirm
        yerrs = np.sqrt(results * (1 - results) / reps)

        for k, ((attack, attack_label), color) in enumerate(zip(attacks, attack_colors)):
            ax.bar(pos + width * k, results[:, k] * 100, color=color, label=attack_label, width=width, yerr=yerrs[:, k] * 100, capsize=2.5)
        xlim_left, xlim_right = ax.get_xlim()
        ax.plot([xlim_left, xlim_right], [50, 50], color='black', linestyle='--', label='Random baseline')

        ax.set_ylim(43, 105)
        # raw string: '\%' in a normal string literal is an invalid escape sequence
        ax.set_ylabel(r'Attack accuracy (\%)')
        ax.set_title(data_label)
        # tick under the centre of each group of bars
        ax.set_xticks(pos + width * (len(attacks) - 1) / 2)
        ax.set_xticklabels(eps_labels)
        ax.set_xlabel('Epsilon ($\\varepsilon$)')

        ax.set_aspect(1.0/ax.get_data_ratio(), adjustable='box')

    axs.flat[1].legend(bbox_to_anchor=(1.65, 1.03125))
    fig.set_size_inches(10, 5)

    return fig

In [None]:
# accuracy of the attacks against epsilon for DP RAP
fig = plot_acc_vs_eps_dp(synth_model='RAP_2Kiters')
if save_img:
    fig.savefig(fname=f'{img_dir}/privacy_all_dp_rapdp.pdf', bbox_inches='tight')

In [None]:
# accuracy of the attacks against epsilon for PrivBayes
fig = plot_acc_vs_eps_dp(synth_model='PrivBayes_3parents')
if save_img:
    fig.savefig(fname=f'{img_dir}/privacy_all_dp_privbayes.pdf', bbox_inches='tight')

In [None]:
def plot_acc_vs_eps_dp(synth_model, attack_name=None):
    """Bar-plot attack accuracy against epsilon for a DP synthetic model.

    This definition replaces the earlier single-argument function of the same name.
    To keep the earlier call sites working whatever the cell execution order, both
    behaviours are supported:

    * ``attack_name`` given: one bar per synthetic data size in ``dp_synth_indices``
      for that single attack.
    * ``attack_name`` omitted: one bar per attack (recon, dcr, inference), using the
      last row of each filtered result, with a square aspect ratio - the behaviour of
      the single-argument definition.

    Args:
        synth_model: ``'RAP_2Kiters'`` or ``'PrivBayes_3parents'``.
        attack_name: Attack passed to ``get_attack``, or ``None`` for all attacks.

    Returns:
        The created figure.

    Raises:
        ValueError: If ``synth_model`` is not one of the supported names.
    """
    # synth_model -> (name of the non-DP counterpart, prefix of the DP model names)
    model_names = {
        'RAP_2Kiters': ('RAP_2Kiters', 'RAP_2Kiters'),
        'PrivBayes_3parents': ('BayNet_3parents', 'PrivBayes_3parents'),
    }
    if synth_model not in model_names:
        # validate before creating a figure so no empty figure is left behind
        raise ValueError(f'unknown synth_model {synth_model!r}; expected one of {sorted(model_names)}')
    nondp_name, dp_name = model_names[synth_model]

    epses = [1, 10, 100]
    eps_labels = ['1', '10', '100']

    # Each bar series is (results frame, positional row index, colour, legend label).
    if attack_name is None:
        attacks = [('recon', '$\\textrm{Adv}_{recon}$'), ('dcr', '$\\textrm{Adv}_{dcr}$'), ('inference', '$\\textrm{Adv}_{infer}$')]
        series = [
            (get_attack(attack), -1, color, attack_label)
            for (attack, attack_label), color in zip(attacks, attack_colors)
        ]
    else:
        attack_results = get_attack(attack_name)
        series = [
            (attack_results, synth_index, color, f'$m =$ {synth_size_label}')
            for synth_index, color, synth_size_label in zip(dp_synth_indices, colors[1:], dp_synth_size_labels)
        ]

    pos = np.arange(len(epses))
    width = 1 / (len(series) + 1)

    fig, axs = plt.subplots(1, 2)
    for i, (data_name, data_label) in enumerate(zip(data_names, data_labels)):
        ax = axs.flat[i]

        results = []
        for eps in epses:
            # eps == 1000 selects the non-DP counterpart; it is not in `epses` at present
            model_name = nondp_name if eps == 1000 else f'{dp_name}_{eps}eps'
            results.append([
                frame[(frame['data_name'] == data_name) & (frame['synth_model'] == model_name)]['acc'].to_numpy()[row]
                for frame, row, _, _ in series
            ])
        results = np.array(results)
        # sqrt(p * (1 - p) / reps) error bar
        # NOTE(review): assumes each accuracy is a proportion over `reps` runs - confirm
        yerrs = np.sqrt(results * (1 - results) / reps)

        for k, (_, _, color, series_label) in enumerate(series):
            ax.bar(pos + width * k, results[:, k] * 100, color=color, label=series_label, width=width, yerr=yerrs[:, k] * 100, capsize=2.5)
        xlim_left, xlim_right = ax.get_xlim()
        ax.plot([xlim_left, xlim_right], [50, 50], color='black', linestyle='--', label='Random baseline')

        ax.set_ylim(43, 105)
        # raw string: '\%' in a normal string literal is an invalid escape sequence
        ax.set_ylabel(r'Attack accuracy (\%)')
        ax.set_title(data_label)
        # tick under the centre of each group of bars
        ax.set_xticks(pos + width * (len(series) - 1) / 2)
        ax.set_xticklabels(eps_labels)
        ax.set_xlabel('Epsilon ($\\varepsilon$)')

        if attack_name is None:
            # only the all-attacks variant forces a square plotting box
            ax.set_aspect(1.0/ax.get_data_ratio(), adjustable='box')

    axs.flat[1].legend(bbox_to_anchor=(1.65, 1.03125))
    fig.set_size_inches(10, 5)

    return fig

In [None]:
# recon attack accuracy per synthetic data size and epsilon for DP RAP
fig = plot_acc_vs_eps_dp(synth_model='RAP_2Kiters', attack_name='recon')
if save_img:
    fig.savefig(fname=f'{img_dir}/privacy_recon_dp_rapdp.pdf', bbox_inches='tight')

In [None]:
# inference attack accuracy per synthetic data size and epsilon for PrivBayes
fig = plot_acc_vs_eps_dp(synth_model='PrivBayes_3parents', attack_name='inference')
if save_img:
    fig.savefig(fname=f'{img_dir}/privacy_infer_dp_privbayes.pdf', bbox_inches='tight')

In [None]:
def plot_util_vs_eps_dp(synth_model):
    """Bar-plot utility error against epsilon for a DP synthetic model.

    Rows of the figure are the error metrics (``avg_tvd``, ``rel_mean``), columns
    are the datasets in ``data_names``. Each epsilon gets one bar per entry of
    ``dp_synth_indices``; error bars come from the ``<metric>_std`` column. Values
    are read from the module-level ``result_utility`` frame by row position.

    Args:
        synth_model: ``'RAP_2Kiters'`` or ``'PrivBayes_3parents'``.

    Returns:
        The created figure.

    Raises:
        ValueError: If ``synth_model`` is not one of the supported names.
    """
    # synth_model -> (name of the non-DP counterpart, prefix of the DP model names)
    model_names = {
        'RAP_2Kiters': ('RAP_2Kiters', 'RAP_2Kiters'),
        'PrivBayes_3parents': ('BayNet_3parents', 'PrivBayes_3parents'),
    }
    if synth_model not in model_names:
        raise ValueError(f'unknown synth_model {synth_model!r}; expected one of {sorted(model_names)}')
    nondp_name, dp_name = model_names[synth_model]

    tasks = [('avg_tvd', '3-TVD'), ('rel_mean', '$\\text{MRE}_{>10}$')]
    # upper y limit per metric
    y_tops = {'avg_tvd': 1.0, 'rel_mean': 1.5, 'f1_diff': 0.5}

    epses = [1, 10, 100]
    eps_labels = ['1', '10', '100']

    pos = np.arange(len(epses))
    width = 1 / (len(epses) + 1)

    # eps == 1000 selects the non-DP counterpart; it is not in `epses` at present
    curr_synth_models = [nondp_name if eps == 1000 else f'{dp_name}_{eps}eps' for eps in epses]

    fig, axs = plt.subplots(len(tasks), 2)
    for i, (data_name, data_label) in enumerate(zip(data_names, data_labels)):
        result_data = result_utility[result_utility['data_name'] == data_name]
        # the per-model frames do not depend on the metric: filter once per dataset
        model_frames = [result_data[result_data['synth_model'] == name] for name in curr_synth_models]
        for j, (task, task_label) in enumerate(tasks):
            ax = axs.flat[2 * j + i]

            # shape (len(epses), len(dp_synth_indices)); rows are picked by position
            results = np.array([frame[task].to_numpy()[dp_synth_indices] for frame in model_frames])
            yerrs = np.array([frame[f'{task}_std'].to_numpy()[dp_synth_indices] for frame in model_frames])

            for k, (synth_index, color, synth_size_label) in enumerate(zip(dp_synth_indices, colors[1:], dp_synth_size_labels)):
                ax.bar(pos + width * k, results[:, k], color=color, label=f'$m =$ {synth_size_label}', width=width, yerr=yerrs[:, k], capsize=2.5)

            if task in y_tops:
                ax.set_ylim(0.0, y_tops[task])
                ax.set_yticks(np.linspace(0.0, y_tops[task], 6))
            ax.set_ylabel(f'Error ({task_label})')
            ax.set_title(data_label)
            ax.set_xticks(np.arange(len(epses)) + 0.25)
            ax.set_xticklabels(eps_labels)
            ax.set_xlabel('Epsilon ($\\varepsilon$)')

    axs.flat[1].legend(bbox_to_anchor=(1.45, 1.03125))
    fig.set_size_inches(10, len(tasks) * 5)
    plt.subplots_adjust(wspace=0.3, hspace=0.3)

    return fig

In [None]:
# plot utility against epsilon (Avg 3-TVD & mean relative error) for RAP
fig = plot_util_vs_eps_dp(synth_model='RAP_2Kiters')
if save_img:
    fig.savefig(fname=f'{img_dir}/utility_dp_rapdp.pdf', bbox_inches='tight')

In [None]:
# plot utility against epsilon (Avg 3-TVD & mean relative error) for privbayes
fig = plot_util_vs_eps_dp(synth_model='PrivBayes_3parents')
if save_img:
    fig.savefig(fname=f'{img_dir}/utility_dp_privbayes.pdf', bbox_inches='tight')

In [None]:
def plot_priv_util_tradeoff_dp(synth_model):
    """Privacy/utility trade-off plot for the DP variants (eps = 1, 10, 100) of a model.

    For every (dataset, model, size) the column-wise maximum over the recon, dcr and
    inference attack results is joined with the module-level ``result_utility`` frame
    and handed to ``plot_priv_util_tradeoff``.

    Args:
        synth_model: ``'RAP_2Kiters'`` or ``'PrivBayes_3parents'``.

    Returns:
        The figure created by ``plot_priv_util_tradeoff``.

    Raises:
        ValueError: If ``synth_model`` is not one of the supported names.
    """
    # synth_model -> (prefix of the DP model names, index into `colors` of the fourth,
    # currently unused colour that belongs to the commented-out non-DP entry)
    model_table = {
        'RAP_2Kiters': ('RAP_2Kiters', 2),
        'PrivBayes_3parents': ('PrivBayes_3parents', 1),
    }
    if synth_model not in model_table:
        raise ValueError(f'unknown synth_model {synth_model!r}; expected one of {sorted(model_table)}')
    dp_name, extra_color_index = model_table[synth_model]

    # strongest attack per (dataset, model, size)
    result_attack = (
        pd.concat([get_attack(name) for name in ('recon', 'dcr', 'inference')])
        .groupby(['data_name', 'synth_model', 'synth_size'])
        .max()
    )

    eps_labels = ['1', '10', '100']
    curr_synth_models = [f'{dp_name}_{eps}eps' for eps in eps_labels]
    curr_synth_model_labels = [f'$\\varepsilon = {eps}$' for eps in eps_labels]

    combined_df = pd.merge(result_utility, result_attack, how='inner', on=['data_name', 'synth_model', 'synth_size'])

    # one colour per epsilon; the trailing colour is not consumed while only three
    # models are plotted
    curr_colors = [attack_colors[0], attack_colors[1], attack_colors[2], colors[extra_color_index]]

    fig = plot_priv_util_tradeoff(combined_df, curr_synth_models, curr_synth_model_labels, synth_colors=curr_colors)

    return fig

In [None]:
# privacy/utility trade-off for DP RAP
fig = plot_priv_util_tradeoff_dp(synth_model='RAP_2Kiters')
if save_img:
    fig.savefig(fname=f'{img_dir}/privacy_utility_tradeoff_dp_rapdp.pdf', bbox_inches='tight')

In [None]:
# privacy/utility trade-off for PrivBayes
fig = plot_priv_util_tradeoff_dp(synth_model='PrivBayes_3parents')
if save_img:
    fig.savefig(fname=f'{img_dir}/privacy_utility_tradeoff_dp_privbayes.pdf', bbox_inches='tight')

In [None]:
# Marginal vs. conditional queries for the recon attack (all datasets except ACS).
# CTGAN and IndHist are dropped by name: slicing off the last two entries would
# leave no model at all when `synth_models` does not contain them.
excluded_models = ('CTGAN', 'IndHist')
curr_synth_models = [m for m in synth_models if m not in excluded_models]
curr_synth_model_labels = [l for m, l in zip(synth_models, synth_model_labels) if m not in excluded_models]
curr_data_names = data_names[1:] # drop ACS
curr_data_labels = data_labels[1:] # drop ACS

# squeeze=False keeps `axs` a 2-D array even for a 1x1 grid, so `axs.flat` works
fig, axs = plt.subplots(len(curr_data_names), len(curr_synth_models), squeeze=False)

results = result_privacy[
    (result_privacy['attack_name'] == 'recon') &
    (result_privacy['k'] == 3) &
    (result_privacy['n_queries'] == -1)
]

# (scale_type value, legend label, colour)
query_kinds = [
    ('normal', 'Marginal queries', attack_colors[0]),
    ('cond', 'Conditional queries', attack_colors[1]),
]

first_label = True  # legend entries are taken from the first panel only
for j, (data_name, data_label) in enumerate(zip(curr_data_names, curr_data_labels)):
    for i, (synth_model, synth_model_label) in enumerate(zip(curr_synth_models, curr_synth_model_labels)):
        ax = axs.flat[j * len(curr_synth_models) + i]

        curr_results = results[(results['data_name'] == data_name) & (results['synth_model'] == synth_model)]
        for scale_type, query_label, color in query_kinds:
            accs = curr_results[curr_results['scale_type'] == scale_type]['acc'].to_numpy()
            # error bars: sqrt(p * (1 - p) / 500), in percent
            ax.errorbar(synth_sizes, accs * 100, color=color, marker='o', label=query_label if first_label else None, yerr=np.sqrt(1 / 500 * accs * (1 - accs)) * 100, capsize=4, ecolor=color)

        ax.plot(synth_sizes, np.ones(len(synth_sizes)) * 50, color='black', linestyle='--', label='Random baseline' if first_label else None)
        ax.set_xscale('log')
        ax.set_xlabel('Synthetic data size ($m$)')
        ax.set_xticks(synth_sizes)
        ax.set_ylim(43, 105)
        ax.set_yticks(np.linspace(50, 100, 6))
        # '\\%' instead of '\%': the latter is an invalid escape sequence
        ax.set_ylabel(f'{data_label}\nAttack accuracy (\\%)')
        if j == 0:
            ax.set_title(synth_model_label)

        ax.label_outer()
        first_label = False

# one shared legend below the panels
fig.legend(loc='lower center', ncol=3, bbox_to_anchor=(0.5, -0.2))
fig.set_size_inches(len(curr_synth_models) * 4, len(curr_data_labels) * 4)
if save_img:
    fig.savefig(f'{img_dir}/cond_vs_mar_min.pdf', bbox_inches='tight')

In [None]:
# Marginal vs. conditional queries for the recon attack, all datasets.
# CTGAN and IndHist are dropped by name: slicing off the last two entries would
# leave no model at all when `synth_models` does not contain them.
excluded_models = ('CTGAN', 'IndHist')
curr_synth_models = [m for m in synth_models if m not in excluded_models]
curr_synth_model_labels = [l for m, l in zip(synth_models, synth_model_labels) if m not in excluded_models]
curr_colors = ['black', attack_colors[0], attack_colors[1], attack_colors[2]]

# squeeze=False keeps `axs` a 2-D array even when there is a single row or column
fig, axs = plt.subplots(len(curr_synth_models), len(data_names), squeeze=False)

results = result_privacy[
    (result_privacy['attack_name'] == 'recon') &
    (result_privacy['k'] == 3) &
    (result_privacy['n_queries'] == -1)
]

# (scale_type value, legend label, colour)
query_kinds = [
    ('normal', 'Marginal queries', attack_colors[0]),
    ('cond', 'Conditional queries', attack_colors[1]),
]

first_label = True  # legend entries are taken from the first panel only
for i, (data_name, data_label) in enumerate(zip(data_names, data_labels)):
    for j, (synth_model, synth_model_label) in enumerate(zip(curr_synth_models, curr_synth_model_labels)):
        ax = axs.flat[j * len(data_names) + i]

        curr_results = results[(results['data_name'] == data_name) & (results['synth_model'] == synth_model)]
        for scale_type, query_label, color in query_kinds:
            accs = curr_results[curr_results['scale_type'] == scale_type]['acc'].to_numpy()
            # error bars: sqrt(p * (1 - p) / 500), in percent
            ax.errorbar(synth_sizes, accs * 100, color=color, marker='o', label=query_label if first_label else None, yerr=np.sqrt(1 / 500 * accs * (1 - accs)) * 100, capsize=4, ecolor=color)

        ax.plot(synth_sizes, np.ones(len(synth_sizes)) * 50, color='black', linestyle='--', label='Random baseline' if first_label else None)
        ax.set_xscale('log')
        ax.set_xlabel('Synthetic data size ($m$)')
        ax.set_xticks(synth_sizes)
        ax.set_ylim(43, 105)
        ax.set_yticks(np.linspace(50, 100, 6))
        # '\\%' instead of '\%': the latter is an invalid escape sequence
        ax.set_ylabel(f'{synth_model_label}\nAttack accuracy (\\%)')
        if j == 0:
            ax.set_title(data_label)

        ax.label_outer()
        first_label = False

# one shared legend near the bottom of the figure
fig.legend(loc='lower center', ncol=3, bbox_to_anchor=(0.5, 0.05))
fig.set_size_inches(len(data_names) * 4, len(curr_synth_models) * 4)
if save_img:
    fig.savefig(f'{img_dir}/cond_vs_mar_all.pdf', bbox_inches='tight')

In [None]:
# Recon attack accuracy for different query sets (2-way, 3-way, 4-way, combined).
# CTGAN and IndHist are dropped by name: slicing off the last two entries would
# leave no model at all when `synth_models` does not contain them.
excluded_models = ('CTGAN', 'IndHist')
curr_synth_models = [m for m in synth_models if m not in excluded_models]
curr_synth_model_labels = [l for m, l in zip(synth_models, synth_model_labels) if m not in excluded_models]

# squeeze=False keeps `axs` a 2-D array even when there is a single row or column
fig, axs = plt.subplots(len(curr_synth_models), len(data_names), squeeze=False)

results = result_privacy[
    (result_privacy['attack_name'] == 'recon') &
    (result_privacy['scale_type'] == 'cond')
]
ks = [2, 3, 4, 234]
k_labels = ['All 2-way', 'All 3-way', '100K 4-way', '2 + 3 + 4-way']
curr_colors = ['black', attack_colors[0], attack_colors[1], attack_colors[2]]

first_label = True  # legend entries are taken from the first panel only
for i, (data_name, data_label) in enumerate(zip(data_names, data_labels)):
    for j, (synth_model, synth_model_label) in enumerate(zip(curr_synth_models, curr_synth_model_labels)):
        ax = axs.flat[j * len(data_names) + i]

        # NOTE(review): this filter used to sit inside the k-loop under
        # `if k == 2 or k == 3`, but it reassigned the shared frame, so from the
        # first k (= 2) onwards every k was restricted to n_queries == -1. That
        # effective behaviour is kept here; confirm whether the 4-way and combined
        # rows were meant to stay unfiltered.
        curr_results = results[
            (results['data_name'] == data_name) &
            (results['synth_model'] == synth_model) &
            (results['n_queries'] == -1)
        ]
        for k, k_label, color in zip(ks, k_labels, curr_colors):
            k_results = curr_results[curr_results['k'] == k]['acc'].to_numpy()

            # error bars: sqrt(p * (1 - p) / 500), in percent
            ax.errorbar(synth_sizes, k_results * 100, color=color, marker='o', label=f'{k_label} queries' if first_label else None, yerr=np.sqrt(1 / 500 * k_results * (1 - k_results)) * 100, capsize=4, ecolor=color)

        ax.plot(synth_sizes, np.ones(len(synth_sizes)) * 50, color='black', linestyle='--', label='Random baseline' if first_label else None)
        ax.set_xscale('log')
        ax.set_xlabel('Synthetic data size ($m$)')
        ax.set_xticks(synth_sizes)
        ax.set_ylim(43, 105)
        ax.set_yticks(np.linspace(50, 100, 6))
        # '\\%' instead of '\%': the latter is an invalid escape sequence
        ax.set_ylabel(f'{synth_model_label}\nAttack accuracy (\\%)')
        if j == 0:
            ax.set_title(data_label)

        ax.label_outer()
        first_label = False

# one shared legend near the bottom of the figure
fig.legend(loc='lower center', ncol=3, bbox_to_anchor=(0.5, 0.025))
fig.set_size_inches(len(data_names) * 4, len(curr_synth_models) * 4)
if save_img:
    fig.savefig(f'{img_dir}/2_vs_3_vs_4way.pdf', bbox_inches='tight')

In [None]:
from tqdm import tqdm
import numpy as np

reps = 500

# Average number of queries per repetition, keyed by dataset and then by k.
# The 4-way entry is fixed at 100000; the 2-way and 3-way entries are measured
# from the saved query results below, and 234 is the sum of the other three.
avg_n_queries = {
    data_name: {2: 0, 3: 0, 4: 100000, 234: 0}
    for data_name in ['acs', 'fire']
}

for data_name in avg_n_queries:
    for k in [2, 3]:
        for rep in tqdm(range(reps)):
            result_path = f'{data_dir}/{data_name}/reps/rep_{rep}/queries/simple/{k}way/result.npz'
            # np.load returns a lazily-read NpzFile for .npz archives; the
            # context manager closes its file handle instead of leaving one
            # open per repetition.
            with np.load(result_path) as query_results:
                # len() of the archive is the number of arrays stored in it.
                # NOTE(review): the halving presumably reflects two stored
                # arrays per query -- confirm against the code writing result.npz.
                n_queries = len(query_results) // 2
            avg_n_queries[data_name][k] += n_queries / reps

    avg_n_queries[data_name][234] = avg_n_queries[data_name][2] + avg_n_queries[data_name][3] + avg_n_queries[data_name][4]

avg_n_queries

In [None]:
# Plot every configured model except CTGAN and IndHist. Filtering by name
# (rather than slicing off the last two entries) stays correct when
# `synth_models` has been narrowed in the configuration cell, where `[:-2]`
# can produce an empty list and make plt.subplots fail.
excluded_models = ('CTGAN', 'IndHist')
kept_models = [
    (model, label)
    for model, label in zip(synth_models, synth_model_labels)
    if model not in excluded_models
]
curr_synth_models = [model for model, _ in kept_models]
curr_synth_model_labels = [label for _, label in kept_models]
curr_colors = ['black', attack_colors[0], attack_colors[1], attack_colors[2]]
curr_data_names = data_names[1:] # drop ACS
curr_data_labels = data_labels[1:] # drop ACS

# squeeze=False always yields a 2-D array of Axes, so indexing also works when
# the grid is 1x1 (plt.subplots would otherwise return a bare Axes object).
fig, axs = plt.subplots(len(curr_data_names), len(curr_synth_models), squeeze=False)

# Reconstruction attack rows with the 'cond' scale type.
results = result_privacy[
    (result_privacy['attack_name'] == 'recon') &
    (result_privacy['scale_type'] == 'cond')
]

# Query budgets to plot; -1 is drawn at the average query count for that k.
n_queriess = [10, 100, 1000, 10000, 100000, -1]
synth_size = 1000000

# This reduced figure draws only the 3-way series (index 1 of each list).
ks = [2, 3, 4]
k_labels = ['All 2-way', 'All 3-way', '100K 4-way']
k, k_label, color = ks[1], k_labels[1], curr_colors[1]

# Label series on the first panel only so fig.legend() has no duplicates.
first_label = True
for j, (data_name, data_label) in enumerate(zip(curr_data_names, curr_data_labels)):
    for i, (synth_model, synth_model_label) in enumerate(zip(curr_synth_models, curr_synth_model_labels)):
        # Row j is the dataset, column i is the model.
        ax = axs[j, i]

        curr_results = results[
            (results['data_name'] == data_name) &
            (results['synth_model'] == synth_model) &
            (results['synth_size'] == synth_size) &
            (results['k'] == k) &
            (results['n_queries'].isin(n_queriess))
        ]

        curr_n_queries, curr_accs = [], []
        for n_queries in n_queriess:
            # Budgets at or above the average available query count are skipped;
            # -1 is never skipped by this test and is handled just below.
            if n_queries >= avg_n_queries[data_name][k]:
                continue

            if n_queries == -1:
                curr_n_queries.append(avg_n_queries[data_name][k])
            else:
                curr_n_queries.append(n_queries)

            # First matching row's accuracy; raises IndexError if the row is missing.
            curr_accs.append(curr_results[curr_results['n_queries'] == n_queries]['acc'].to_numpy()[0])

        ax.plot(curr_n_queries, np.array(curr_accs) * 100, color=color, marker='o', label=k_label if first_label else None)

        # Dashed reference line at 50% accuracy.
        ax.plot(curr_n_queries, np.ones(len(curr_n_queries)) * 50, color='black', linestyle='--', label='Random baseline' if first_label else None)
        ax.set_xscale('log')
        # Raw string / doubled backslash: '\#' and '\%' are not valid Python
        # escapes; the text handed to LaTeX is unchanged.
        ax.set_xlabel(r'\# queries')
        ax.set_xticks([10, 100, 1000, 10000])
        ax.set_ylim(43, 105)
        ax.set_yticks(np.linspace(50, 100, 6))
        ax.set_ylabel(data_label)
        if j == 0:
            ax.set_title(f'{synth_model_label}\nAttack accuracy (\\%)')
        ax.label_outer()

        first_label = False

fig.legend(loc='lower center', ncol=3, bbox_to_anchor=(0.5, -0.2))
fig.set_size_inches(len(curr_synth_models) * 4, len(curr_data_names) * 4)

if save_img:
    fig.savefig(f'{img_dir}/acc_vs_n_queries_min.pdf', bbox_inches='tight')

In [None]:
# Plot every configured model except CTGAN and IndHist. Filtering by name
# (rather than slicing off the last two entries) stays correct when
# `synth_models` has been narrowed in the configuration cell, where `[:-2]`
# can produce an empty list and make plt.subplots fail.
excluded_models = ('CTGAN', 'IndHist')
kept_models = [
    (model, label)
    for model, label in zip(synth_models, synth_model_labels)
    if model not in excluded_models
]
curr_synth_models = [model for model, _ in kept_models]
curr_synth_model_labels = [label for _, label in kept_models]
curr_colors = ['black', attack_colors[0], attack_colors[1], attack_colors[2]]

# squeeze=False always yields a 2-D array of Axes, so indexing also works when
# the grid has a single row or column.
fig, axs = plt.subplots(len(curr_synth_models), len(data_names), squeeze=False)

# Reconstruction attack rows with the 'cond' scale type. The attack is selected
# through 'attack_name', the column every other cell of this notebook filters
# on (this cell previously used 'solver').
results = result_privacy[
    (result_privacy['attack_name'] == 'recon') &
    (result_privacy['scale_type'] == 'cond')
]

# Query budgets to plot; -1 is drawn at the average query count for that k.
n_queriess = [10, 100, 1000, 10000, 100000, -1]
synth_size = 1000000

ks = [2, 3, 4]
k_labels = ['All 2-way', 'All 3-way', '100K 4-way']

# Label series on the first panel only so fig.legend() has no duplicates.
first_label = True
for i, (data_name, data_label) in enumerate(zip(data_names, data_labels)):
    for j, (synth_model, synth_model_label) in enumerate(zip(curr_synth_models, curr_synth_model_labels)):
        # Row j is the model, column i is the dataset.
        ax = axs[j, i]

        # zip stops at the shortest input, so only the first three colours are used.
        for k, k_label, color in zip(ks, k_labels, curr_colors):
            curr_results = results[
                (results['data_name'] == data_name) &
                (results['synth_model'] == synth_model) &
                (results['synth_size'] == synth_size) &
                (results['k'] == k) &
                (results['n_queries'].isin(n_queriess))
            ]

            curr_n_queries, curr_accs = [], []
            for n_queries in n_queriess:
                # Budgets at or above the average available query count are
                # skipped; -1 is never skipped here and is handled just below.
                if n_queries >= avg_n_queries[data_name][k]:
                    continue

                if n_queries == -1:
                    curr_n_queries.append(avg_n_queries[data_name][k])
                else:
                    curr_n_queries.append(n_queries)

                # First matching row's accuracy; raises IndexError if the row is missing.
                curr_accs.append(curr_results[curr_results['n_queries'] == n_queries]['acc'].to_numpy()[0])

            ax.plot(curr_n_queries, np.array(curr_accs) * 100, color=color, marker='o', label=k_label if first_label else None)

        # Dashed 50% reference line, drawn over the x positions of the last
        # series plotted above (k = 4).
        ax.plot(curr_n_queries, np.ones(len(curr_n_queries)) * 50, color='black', linestyle='--', label='Random baseline' if first_label else None)
        ax.set_xscale('log')
        # Raw string / doubled backslash: '\#' and '\%' are not valid Python
        # escapes; the text handed to LaTeX is unchanged.
        ax.set_xlabel(r'\# queries')
        ax.set_xticks([10, 100, 1000, 10000, 100000])
        ax.set_ylim(43, 105)
        ax.set_yticks(np.linspace(50, 100, 6))
        ax.set_ylabel(f'{synth_model_label}\nAttack accuracy (\\%)')
        if j == 0:
            ax.set_title(data_label)
        ax.label_outer()

        first_label = False

fig.legend(loc='lower center', ncol=3, bbox_to_anchor=(0.5, 0.025))
fig.set_size_inches(len(data_names) * 4, len(curr_synth_models) * 4)

if save_img:
    fig.savefig(f'{img_dir}/acc_vs_n_queries.pdf', bbox_inches='tight')

## Reported Values

In [None]:
# Section 6.1: AUCs
# Reconstruction-attack rows for the configured models at the 10^6 synthetic size.
results = get_attack('recon')
at_largest_size = results['synth_size'] == 1000000
is_configured_model = results['synth_model'].isin(synth_models)
results[at_largest_size & is_configured_model]

In [None]:
# Section 6.2: Comparison to prior attacks
# Side-by-side accuracies of the three attacks for RAP_2Kiters at the 10^6
# synthetic size, one row per (data_name, synth_model, synth_size).
cols = ['data_name', 'synth_model', 'synth_size', 'acc']
join_keys = ['data_name', 'synth_model', 'synth_size']

results_recon = get_attack('recon')
results_recon = results_recon.loc[
    (results_recon['synth_size'] == 1000000) & (results_recon['synth_model'] == 'RAP_2Kiters'),
    cols,
]

results_dcr = get_attack('dcr')
results_dcr = results_dcr.loc[
    (results_dcr['synth_size'] == 1000000) & (results_dcr['synth_model'] == 'RAP_2Kiters'),
    cols,
]

results_infer = get_attack('inference')
results_infer = results_infer.loc[
    (results_infer['synth_size'] == 1000000) & (results_infer['synth_model'] == 'RAP_2Kiters'),
    cols,
]

# The first join suffixes the two clashing `acc` columns; the third attack's
# `acc` arrives unsuffixed and is renamed afterwards.
results = pd.merge(results_recon, results_dcr, how='inner', on=join_keys, suffixes=['_recon', '_dcr'])
results = pd.merge(results, results_infer, how='inner', on=join_keys)
results = results.rename(columns={'acc': 'acc_infer'})
results

In [None]:
# Section 6.3: Impact of synth data size
# Reconstruction-attack rows for RAP and BayNet at three synthetic sizes,
# ordered by model and then dataset.
results = get_attack('recon')
at_reported_size = results['synth_size'].isin([100, 1000, 1000000])
is_reported_model = results['synth_model'].isin(['RAP_2Kiters', 'BayNet_3parents'])
results[at_reported_size & is_reported_model].sort_values(['synth_model', 'data_name'])

In [None]:
# Section 6.4: Calculate tradeoff accs
privacy_key_cols = ['data_name', 'synth_model', 'synth_size']

df_privacy = pd.read_csv(f'{data_dir}/results_privacy.csv')
# Keep rows that either use all 3-way queries (k == 3, n_queries == -1) with
# the 'cond' scale type, or belong to the 'dcr' / 'infer' attacks.
# NOTE(review): other cells call get_attack('inference'); confirm 'infer' is
# the attack_name value actually stored in this CSV.
uses_all_3way_cond = (
    (df_privacy['k'] == 3) &
    (df_privacy['n_queries'] == -1) &
    (df_privacy['scale_type'] == 'cond')
)
is_other_attack = df_privacy['attack_name'].isin(['dcr', 'infer'])
df_privacy = df_privacy[uses_all_3way_cond | is_other_attack]
# One accuracy per (dataset, model, size). Only `acc` is aggregated, rather
# than taking the min of every column (including string columns) and then
# discarding the rest.
# NOTE(review): this keeps the *minimum* accuracy over the selected rows of
# each group, while the helper using it is named get_min_amax -- confirm that
# min (not max) across attacks is intended.
df_privacy = df_privacy.groupby(privacy_key_cols, as_index=False)['acc'].min()

df_utility = pd.read_csv(f'{data_dir}/results_utility.csv')

def get_min_amax(data_name, synth_model):
    """Return the lowest-accuracy row(s) among configurations meeting the utility thresholds.

    Utility rows for the given dataset and model are kept only when both
    ``avg_tvd`` and ``rel_mean`` are below 0.2; they are then inner-joined with
    ``df_privacy`` on (data_name, synth_model, synth_size). Every row whose
    ``acc`` equals the minimum is returned, so ties yield several rows and an
    empty selection yields an empty frame.
    """
    out_cols = ['data_name', 'synth_model', 'synth_size', 'avg_tvd', 'rel_mean', 'acc']

    is_requested = (df_utility['data_name'] == data_name) & (df_utility['synth_model'] == synth_model)
    is_useful = (df_utility['avg_tvd'] < 0.2) & (df_utility['rel_mean'] < 0.2)
    candidates = df_utility[is_requested & is_useful]

    joined = candidates.merge(df_privacy, on=['data_name', 'synth_model', 'synth_size'])
    lowest_acc = joined['acc'].min()
    return joined.loc[joined['acc'] == lowest_acc, out_cols]

def get_min_err(data_name, synth_model, err):
    """Return the lowest-error row(s) among configurations whose accuracy is below 0.6.

    Rows of ``df_privacy`` for the given dataset and model with ``acc`` < 0.6
    are inner-joined with ``df_utility`` on (data_name, synth_model,
    synth_size). ``err`` names the utility column to minimise; every row
    attaining the minimum is returned, so ties yield several rows and an empty
    selection yields an empty frame.
    """
    out_cols = ['data_name', 'synth_model', 'synth_size', 'avg_tvd', 'rel_mean', 'acc']

    is_requested = (df_privacy['data_name'] == data_name) & (df_privacy['synth_model'] == synth_model)
    is_private = df_privacy['acc'] < 0.6
    candidates = df_privacy[is_requested & is_private]

    joined = candidates.merge(df_utility, on=['data_name', 'synth_model', 'synth_size'])
    lowest_err = joined[err].min()
    return joined.loc[joined[err] == lowest_err, out_cols]

In [None]:
# ACS / BayNet_3parents: lowest-`acc` row(s) among sizes with avg_tvd and rel_mean both below 0.2.
get_min_amax('acs', 'BayNet_3parents')

In [None]:
# ACS / RAP_2Kiters: lowest-`acc` row(s) among sizes with avg_tvd and rel_mean both below 0.2.
get_min_amax('acs', 'RAP_2Kiters')

In [None]:
# FIRE / BayNet_3parents: lowest-`acc` row(s) among sizes with avg_tvd and rel_mean both below 0.2.
get_min_amax('fire', 'BayNet_3parents')

In [None]:
# FIRE / RAP_2Kiters: lowest-`acc` row(s) among sizes with avg_tvd and rel_mean both below 0.2.
get_min_amax('fire', 'RAP_2Kiters')

In [None]:
# ACS / CTGAN: lowest-rel_mean row(s) among sizes whose `acc` is below 0.6.
get_min_err('acs', 'CTGAN', 'rel_mean')

In [None]:
# ACS / BayNet_3parents: lowest-rel_mean row(s) among sizes whose `acc` is below 0.6.
get_min_err('acs', 'BayNet_3parents', 'rel_mean')

In [None]:
# ACS / RAP_2Kiters: lowest-rel_mean row(s) among sizes whose `acc` is below 0.6.
get_min_err('acs', 'RAP_2Kiters', 'rel_mean')

In [None]:
# FIRE / CTGAN: lowest-rel_mean row(s) among sizes whose `acc` is below 0.6.
get_min_err('fire', 'CTGAN', 'rel_mean')

In [None]:
# FIRE / BayNet_3parents: lowest-rel_mean row(s) among sizes whose `acc` is below 0.6.
get_min_err('fire', 'BayNet_3parents', 'rel_mean')

In [None]:
# FIRE / RAP_2Kiters: lowest-rel_mean row(s) among sizes whose `acc` is below 0.6.
get_min_err('fire', 'RAP_2Kiters', 'rel_mean')

In [None]:
# Section 6.5: Impact of DP
# Side-by-side accuracies of the three attacks for RAP_2Kiters and
# RAP_2Kiters_10eps at the 10^6 synthetic size.
cols = ['data_name', 'synth_model', 'synth_size', 'acc']
join_keys = ['data_name', 'synth_model', 'synth_size']
curr_synth_models = ['RAP_2Kiters', 'RAP_2Kiters_10eps']

results_recon = get_attack('recon')
results_recon = results_recon.loc[
    (results_recon['synth_size'] == 1000000) & (results_recon['synth_model'].isin(curr_synth_models)),
    cols,
]

results_dcr = get_attack('dcr')
results_dcr = results_dcr.loc[
    (results_dcr['synth_size'] == 1000000) & (results_dcr['synth_model'].isin(curr_synth_models)),
    cols,
]

results_infer = get_attack('inference')
results_infer = results_infer.loc[
    (results_infer['synth_size'] == 1000000) & (results_infer['synth_model'].isin(curr_synth_models)),
    cols,
]

# The first join suffixes the two clashing `acc` columns; the third attack's
# `acc` arrives unsuffixed and is renamed afterwards.
results = pd.merge(results_recon, results_dcr, how='inner', on=join_keys, suffixes=['_recon', '_dcr'])
results = pd.merge(results, results_infer, how='inner', on=join_keys)
results = results.rename(columns={'acc': 'acc_infer'})
results

In [None]:
# Reconstruction-attack rows for models whose name contains '_10eps' or
# '_100eps', at the 10^6 and 10^2 synthetic sizes.
results = get_attack('recon')
dp_name_tags = ('_10eps', '_100eps')
has_dp_tag = results['synth_model'].apply(lambda name: any(tag in name for tag in dp_name_tags))
at_reported_size = results['synth_size'].isin([1000000, 100])
results[has_dp_tag & at_reported_size]

In [None]:
# Inference-attack rows for models whose name contains '_10eps' or '_100eps',
# at the 10^6 and 10^2 synthetic sizes.
results = get_attack('inference')
dp_name_tags = ('_10eps', '_100eps')
has_dp_tag = results['synth_model'].apply(lambda name: any(tag in name for tag in dp_name_tags))
at_reported_size = results['synth_size'].isin([1000000, 100])
results[has_dp_tag & at_reported_size]

In [None]:
# Section 6.6: Computational overhead
# Blank lines are dropped while reading so they can neither reach float('')
# nor be mistaken for the header that precedes a timing value.
with open(f'{data_dir}/log_time.txt', 'r') as f:
    log_time = [line.rstrip() for line in f if line.strip()]

# parse log into results dictionary: results_dict[action][n_queries][n_rows]
n_queriess = [10, 100, 1000, 10000, 100000]
n_rowss = [10, 100, 1000, 10000, 100000, 1000000]
actions = ['gen_queries', 'process_queries', 'attack']
# Every reported time is divided by this count.
# NOTE(review): presumably the number of repetitions (`reps`) -- confirm.
n_timed_reps = 500
results_dict = {
    action: {
        n_queries: {
            n_rows: None
            for n_rows in n_rowss
        }
        for n_queries in n_queriess
    }
    for action in actions
}

# A line without a comma is a timing value; the line before it is its header,
# 'action,n_queries' or 'action,n_queries,n_rows'. Walking consecutive pairs
# means a value on the very first line has no header and is ignored, instead
# of being paired with the last line of the file through index -1.
for header_line, curr_line in zip(log_time, log_time[1:]):
    if ',' in curr_line:
        continue

    curr_value = float(curr_line)
    header_fields = header_line.split(',')
    curr_action, n_queries = header_fields[0], int(header_fields[1])
    if curr_action == 'gen_queries':
        # The gen_queries header carries no n_rows field, so its time is
        # recorded for every row count.
        for n_rows in n_rowss:
            results_dict[curr_action][n_queries][n_rows] = curr_value
    else:
        n_rows = int(header_fields[2])
        results_dict[curr_action][n_queries][n_rows] = curr_value

# One record (and one printed LaTeX table row) per row count, with the summed
# time of the three actions divided by n_timed_reps for each query budget.
records = []
for i, n_rows in enumerate(n_rowss):
    record = {'n_rows': n_rows}
    print(f'$10^{i + 1}$ & ', end='')
    for n_queries in n_queriess:
        elapsed_time = results_dict['gen_queries'][n_queries][n_rows] + \
                    results_dict['process_queries'][n_queries][n_rows] + \
                    results_dict['attack'][n_queries][n_rows]

        record[n_queries] = elapsed_time / n_timed_reps
        print(f'{elapsed_time / n_timed_reps:.2f} & ', end='')
    print()

    records.append(record)

results_time = pd.DataFrame.from_records(records)
results_time

In [None]:
# Section 6.6 Memory usage

# One float per line of the memory log.
with open(f'{data_dir}/memory_usage.txt', 'r') as f:
    memory_readings = [float(raw_line.rstrip()) for raw_line in f]
log_memory = np.array(memory_readings)

# Peak reading divided by 32, then by 1024.
# NOTE(review): 32 is presumably the number of repetitions sharing the
# measurement and 1024 a MB-to-GB conversion -- confirm.
peak_memory = log_memory.max()
amort_memory = (peak_memory / 32) / 1024
print(f'Average memory cost: {amort_memory:.2f} GB/rep')

In [None]:
# Appendix B: total number of possible queries
from itertools import combinations
from tqdm import tqdm

def get_n_queries(data_name, k, reps=500):
    """Average, over repetitions, the number of candidate queries of order ``k``.

    For each repetition the saved ``df.csv`` is read and the secret-bit column
    (``get_default_secret_bit(data_name)``) is dropped. Every combination of
    ``k - 1`` remaining attributes then contributes the product of those
    attributes' distinct-value counts.

    Parameters
    ----------
    data_name : str
        Dataset sub-directory under ``data_dir``.
    k : int
        Query order; ``k - 1`` non-secret attributes are combined per query.
    reps : int, default 500
        Number of repetition directories (``rep_0`` .. ``rep_{reps - 1}``) to
        average over. The default reproduces the previously hard-coded value.

    Returns
    -------
    float
        Mean query count across the repetitions.

    Raises
    ------
    FileNotFoundError
        If a repetition's ``df.csv`` is missing.
    ValueError
        If the secret-bit column is not present in a repetition's data.
    """
    secret_bit = get_default_secret_bit(data_name)
    avg_n_queries = 0
    for rep in tqdm(range(reps)):
        df = pd.read_csv(f'{data_dir}/{data_name}/reps/rep_{rep}/df.csv')
        cols_X = list(df.columns)
        cols_X.remove(secret_bit)
        df_matrix = df[cols_X].to_numpy()

        # number of unique values of each non-secret attribute (one per column)
        n_uniq_vals = [len(np.unique(column)) for column in df_matrix.T]

        n_queries = 0
        for attr_inds in combinations(range(len(n_uniq_vals)), k - 1):
            # Plain Python ints keep the product exact however large it grows.
            curr_n_queries = 1
            for attr_ind in attr_inds:
                curr_n_queries *= n_uniq_vals[attr_ind]

            n_queries += curr_n_queries

        avg_n_queries += n_queries / reps

    return avg_n_queries

# Average query count returned by get_n_queries for k = 4, shown as an (acs, fire) tuple.
get_n_queries('acs', 4), get_n_queries('fire', 4)

In [None]:
# FIRE / RAP_2Kiters rows at the 10^6 synthetic size for three query budgets
# (1000, 10000 and -1).
is_fire_rap = (result_privacy['data_name'] == 'fire') & (result_privacy['synth_model'] == 'RAP_2Kiters')
has_listed_budget = result_privacy['n_queries'].isin([1000, 10000, -1])
at_largest_size = result_privacy['synth_size'] == 1000000
result_privacy[is_fire_rap & (has_listed_budget & at_largest_size)]

In [None]:
# FIRE / RAP_2Kiters rows with k == 3 and n_queries == -1 at synthetic size 1000.
is_fire_rap = (result_privacy['data_name'] == 'fire') & (result_privacy['synth_model'] == 'RAP_2Kiters')
uses_all_3way = (result_privacy['n_queries'] == -1) & (result_privacy['k'] == 3)
at_size_1000 = result_privacy['synth_size'] == 1000
result_privacy[is_fire_rap & uses_all_3way & at_size_1000]