In [1]:
import sys
sys.path.append("../")
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib
import torch
import seaborn as sns
from opacus.accountants.rdp import RDPAccountant

import wilds
from wilds.common.grouper import CombinatorialGrouper


# Global plot styling applied once for the whole notebook.
sns.set_style("whitegrid")

plt.rcParams.update({
    # Let TeX do the typesetting.
    'text.usetex': True,
    # Force sans-serif math mode (for axes labels).
    'text.latex.preamble': r"""
\usepackage{sansmath}
\sansmath
""",
    # Use a sans-serif family for regular text as well ...
    'font.family': 'sans-serif',
    # ... chosen from this preference list.
    'font.sans-serif': 'Helvetica, Avant Garde, Computer Modern Sans serif',
})

# Font size referenced by the (currently commented-out) plotting snippets in this notebook.
fontsize = 15

In [2]:
from opacus.accountants.analysis.gdp import compute_mu_poisson, eps_from_mu

def get_sampling_weights(ds_name):
    """Return inverse-group-frequency sampling weights for the training split.

    The dataset is loaded through ``wilds.get_dataset`` (version "1.0", official
    split, rooted at ``../data``, downloading if needed) and its training subset
    is grouped by every metadata field of the dataset.

    Args:
        ds_name: dataset name forwarded to ``wilds.get_dataset``.

    Returns:
        ``(weights, n_train)`` where ``weights`` holds one value per training
        sample, proportional to ``1 / size_of_its_group`` and rescaled so that
        the weights sum to ``n_train``, and ``n_train`` is ``len`` of the
        training subset.
    """
    dataset = wilds.get_dataset(
        dataset=ds_name,
        version="1.0",
        root_dir="../data",
        download=True,
        split_scheme="official")

    grouper = CombinatorialGrouper(
        dataset=dataset,
        groupby_fields=dataset._metadata_fields)

    train_split = dataset.get_subset(
        "train",
        train_grouper=grouper,
        frac=1.0,
        subsample_to_minority=False)
    n_train = len(train_split)

    # Group id of every training sample plus the number of samples per group.
    group_ids, counts = grouper.metadata_to_group(
        train_split.metadata_array,
        return_counts=True)

    # Look up each sample's inverse group size, then normalise to sum to n_train.
    per_sample = (1 / counts)[group_ids]
    return per_sample / per_sample.sum() * n_train, n_train
    
def get_privacy_spent(sigma, epochs, n_iters, sample_rate, alphas,):
    """Account ``epochs * n_iters`` DP-SGD steps with an RDP accountant.

    Args:
        sigma: noise level; each step is recorded with ``sigma * grad_norm``.
        epochs: number of epochs.
        n_iters: number of steps per epoch.
        sample_rate: sampling rate recorded for every step.
        alphas: RDP orders forwarded to ``RDPAccountant.get_privacy_spent``.

    Returns:
        Whatever ``RDPAccountant.get_privacy_spent(delta=1e-5, alphas=alphas)``
        returns after all steps have been recorded.
    """
    # NOTE(review): `grad_norm` is not a parameter of this function; it is
    # looked up in the notebook's global namespace when a step is recorded,
    # so the result depends on which cell last assigned it. Confirm that
    # scaling the noise multiplier by the clipping norm is intended.
    rdp = RDPAccountant()
    all_steps = ((e, i) for e in range(epochs) for i in range(n_iters))
    for _step in all_steps:
        rdp.step(noise_multiplier=sigma * grad_norm, sample_rate=sample_rate)
    return rdp.get_privacy_spent(delta=1e-5, alphas=alphas)

def get_privacy_spent_v2(n_samples, sample_rate, sigma, epochs, weight):
    """Compute epsilon via Opacus' GDP helpers for Poisson-sampled training.

    Args:
        n_samples: training-set size; delta is set to ``1 / (2 * n_samples)``
            (an earlier revision hard-coded 162_770 here).
        sample_rate: base sampling rate; the number of steps is
            ``epochs / sample_rate``.
        sigma: noise multiplier.
        epochs: number of training epochs.
        weight: factor applied to ``sample_rate`` for the accounting only
            (the step count still uses the unscaled rate).

    Returns:
        The epsilon returned by ``eps_from_mu`` for the computed mu and delta.
    """
    target_delta = 1 / (2 * n_samples)
    gdp_mu = compute_mu_poisson(
        noise_multiplier=sigma,
        sample_rate=sample_rate * weight,
        steps=epochs / sample_rate,
    )
    return eps_from_mu(mu=gdp_mu, delta=target_delta)

def get_sigma_epsilon(ds_name, epochs, sample_rate, sigmas, grad_norm, weighted_sampling=True):
    """Map each noise multiplier to the epsilon spent after its epoch budget.

    Args:
        ds_name: dataset name, forwarded to ``get_sampling_weights``.
        epochs: iterable of epoch counts, paired positionally with ``sigmas``
            (pairs beyond the shorter of the two are ignored by ``zip``).
        sample_rate: base sampling rate of the training run.
        sigmas: iterable of noise multipliers.
        grad_norm: accepted for call-site compatibility but not used by the
            GDP accounting below (only the disabled RDP variant,
            ``get_privacy_spent``, involves a gradient norm).
        weighted_sampling: if True, the sampling rate is scaled by the largest
            per-sample weight; otherwise the scale factor is 1.0.

    Returns:
        dict ``{sigma: epsilon}``. Because the dict is keyed by sigma, repeated
        sigma values keep only the epsilon of their last occurrence.
    """
    # The training-set size is needed in both modes; the weights only matter
    # when sampling is weighted.
    weights, n_samples = get_sampling_weights(ds_name)
    weight = weights.max().item() if weighted_sampling else 1.0

    return {
        sigma: get_privacy_spent_v2(n_samples, sample_rate, sigma, epoch, weight)
        for epoch, sigma in zip(epochs, sigmas)
    }

In [3]:
def get_res(log_path, col_names):
    """Return the last logged row of the train and test evaluation tables.

    Args:
        log_path: directory containing ``train_eval.csv`` and ``test_eval.csv``.
        col_names: columns to extract from both files.

    Returns:
        ``(train_values, test_values)``: two lists holding the ``col_names``
        values of the final row of each CSV.
    """
    def last_row(csv_path):
        # Final row of the table, restricted to the requested columns.
        return pd.read_csv(csv_path)[col_names].values[-1].tolist()

    return last_row(f"{log_path}/train_eval.csv"), last_row(f"{log_path}/test_eval.csv")

In [4]:
# (split, hair colour, sex) labels for the table columns; train entries first,
# then test, matching the train/test concatenation used to build each row.
column_names = [
    (split, hair, sex)
    for split in ("train", "test")
    for hair in ("not blond", "blond")
    for sex in ("female", "male")
]
# Logged per-group accuracy columns, paired positionally with the labels above
# (so the `male:0` columns are labelled "female" and `male:1` "male").
col_names = [
    'acc_y:notblond_male:0',
    'acc_y:notblond_male:1',
    'acc_y:blond_male:0',
    'acc_y:blond_male:1',
]
# Row order of the training methods in the rendered table.
order = ['ERM', 'IWERM', 'gDRO', 'DP ERM', 'DP ERM IW']
def key_fn(x):
    """Sort key passed to ``DataFrame.sort_index``: turn index labels into ranks.

    Labels found in the global ``order`` list rank by their position in it;
    any other label ranks 0 if it is "train" and 1 otherwise.

    Args:
        x: iterable of index labels (one index level at a time).

    Returns:
        List of integer ranks, one per label, in input order.
    """
    def rank(label):
        if label in order:
            return order.index(label)
        return 0 if label == "train" else 1

    return [rank(label) for label in x]

base_path = "./data"

# Last-row train and test per-group accuracies for every training method,
# stored as one flat vector (train values followed by test values).
data = {}
for method, run_dir in [
    ("ERM", "celebA/erm-resnet50/"),
    ("IWERM", "celebA/iwerm-resnet50/"),
    ("DP ERM", "celebA/erm-dp_resnet50-lr1e-3-dpsgd_1e-5_0.1_1.0_0.0001/"),
    ("DP ERM IW", "celebA/weightederm-dp_resnet50-dpsgd_1e-5_0.1_1.0_0.0001/"),
]:
    log_path = os.path.join(base_path, run_dir)
    data[method] = np.concatenate(get_res(log_path, col_names))

# One row per method, columns labelled (split, hair colour, sex); the split
# level is then moved into the row index and rows are ordered with key_fn.
df = pd.DataFrame.from_dict(data).transpose()
df.columns = pd.MultiIndex.from_tuples(column_names)
df = df.stack(0).sort_index(axis=0, level=[0, 1], key=key_fn)

# Render as LaTeX, centring the value columns and the multicolumn headers.
latex_table = df.to_latex(float_format="%.2f", multirow=True)
print(latex_table.replace("llrrrr", "llcccc").replace("{l}", "{c}"))

\begin{tabular}{llcccc}
\toprule
          &      & \multicolumn{2}{c}{blond} & \multicolumn{2}{c}{not blond} \\
          &      & female & male &    female & male \\
\midrule
\multirow{2}{*}{ERM} & train &   1.00 & 0.99 &      1.00 & 1.00 \\
          & test &   0.80 & 0.42 &      0.97 & 1.00 \\
\cline{1-6}
\multirow{2}{*}{IWERM} & train &   0.98 & 0.99 &      0.98 & 0.99 \\
          & test &   0.87 & 0.49 &      0.95 & 0.98 \\
\cline{1-6}
\multirow{2}{*}{DP ERM} & train &   0.80 & 0.41 &      0.96 & 0.99 \\
          & test &   0.74 & 0.29 &      0.98 & 1.00 \\
\cline{1-6}
\multirow{2}{*}{DP ERM IW} & train &   0.94 & 0.96 &      0.88 & 0.90 \\
          & test &   0.92 & 0.85 &      0.91 & 0.92 \\
\bottomrule
\end{tabular}



In [4]:
def get_data(sigmas, log_paths, col_names, max_epoch=-1, early_stop=False, base_path="./data/"):
    """Collect train/test metrics of several runs at one evaluation row each.

    For every ``(sigma, log_path)`` pair the evaluation tables are read from
    ``<base_path>/<ds_name>/<log_path>/{train,test}_eval.csv`` (plus
    ``val_eval.csv`` when ``early_stop`` is set).

    NOTE: ``ds_name`` is not a parameter; it is read from the notebook's
    global namespace, so it must be assigned before this function is called.

    Args:
        sigmas: noise multipliers, paired positionally with ``log_paths``.
        log_paths: run directory names below ``<base_path>/<ds_name>/``.
        col_names: per-group accuracy columns used for the disparity.
        max_epoch: without early stopping, row ``max_epoch - 1`` is evaluated;
            with early stopping, only the first ``max_epoch`` validation rows
            are searched.
        early_stop: if True, evaluate the row with the highest validation
            ``acc_avg``.
        base_path: root directory of the logs.

    Returns:
        dict of numpy arrays (one entry per run) with keys ``sigma``,
        ``val epoch``, ``trn acc``, ``trn wg acc``, ``trn disparity``,
        ``tst acc``, ``tst wg acc``, ``epsilon`` and ``tst disparity``.
        A disparity is max minus min of the ``col_names`` values in the
        evaluated row. An empty dict is returned when there are no runs.
    """
    meta_cols = ["epoch", "acc_avg", "acc_wg"]
    results = {}

    def record(key, value):
        results.setdefault(key, []).append(value)

    def read_eval(log_path, file_name):
        return pd.read_csv(os.path.join(base_path, f"{ds_name}/{log_path}/{file_name}"))

    for sigma, log_path in zip(sigmas, log_paths):
        record("sigma", sigma)

        # NOTE(review): "val epoch" is stored as row index + 1 under early
        # stopping but as the raw row index otherwise; kept as-is because
        # previously saved results use this convention.
        if early_stop:
            val_res = read_eval(log_path, "val_eval.csv")[meta_cols + col_names].values
            epoch_no = val_res[:max_epoch, 1].argmax()
            record("val epoch", epoch_no + 1)
        else:
            epoch_no = max_epoch - 1
            record("val epoch", epoch_no)

        train_res = read_eval(log_path, "train_eval.csv")[meta_cols + col_names].values
        if epoch_no >= len(train_res):
            # The run logged fewer rows than requested (index == len is already
            # out of range): fall back to its last row and report the run.
            epoch_no = -1
            print(log_path, len(train_res))
        train_row = train_res[epoch_no]
        record("trn acc", train_row[1])
        record("trn wg acc", train_row[2])
        # Group columns start right after the three metadata columns.
        train_groups = train_row[len(meta_cols):]
        record("trn disparity", np.max(train_groups) - np.min(train_groups))

        test_cols = meta_cols + ["epsilon"] + col_names
        test_row = read_eval(log_path, "test_eval.csv")[test_cols].values.tolist()[epoch_no]
        record("tst acc", test_row[1])
        record("tst wg acc", test_row[2])
        record("epsilon", test_row[3])
        # The test table has one extra metadata column (epsilon); max and min
        # must come from the same evaluated row.
        test_groups = test_row[len(meta_cols) + 1:]
        record("tst disparity", np.max(test_groups) - np.min(test_groups))

    return {key: np.array(values) for key, values in results.items()}

In [323]:
ds_name = "celebA"
epochs = 30

col_names = ['acc_y:notblond_male:0', 'acc_y:notblond_male:1', 'acc_y:blond_male:0', 'acc_y:blond_male:1']

# DP ERM sweep: one run directory per noise multiplier. The directory names are
# derived from `sigmas` so the two lists cannot drift apart.
# (Entries for sigma=0.1 and sigma=10.0 were previously commented out of this sweep.)
sigmas = [0.35, 0.4, 0.5, 1.0, 5.0]
log_paths = [f"erm-dp_resnet50-lr1e-3-dpsgd_1e-5_{sigma}_1.0_0.0001" for sigma in sigmas]
data = get_data(sigmas, log_paths, col_names, max_epoch=epochs, early_stop=False, base_path="../logs/")
df1 = pd.DataFrame.from_dict(data)

# Weighted (importance-weighted) DP sweep, same layout.
sigmas = [1.0, 2.0, 3.0, 5.0, 10.0]
log_paths = [f"weightederm-dp_resnet50-dpsgd_1e-5_{sigma}_0.1_0.0001" for sigma in sigmas]
data = get_data(sigmas, log_paths, col_names, max_epoch=epochs, early_stop=False, base_path="../logs/")
df2 = pd.DataFrame.from_dict(data)

In [324]:
ds_name = "celebA"
sample_rate = 0.0001
grad_norm = 1.0

# Epsilon per noise multiplier, accounting the full `epochs` budget for every
# run. (An earlier variant merged df1/df2 on sigma and used each run's
# 'val epoch' instead of a constant epoch count.)
dp_sigmas = df1['sigma'].tolist()
eps = get_sigma_epsilon(ds_name, [epochs] * len(dp_sigmas), sample_rate, dp_sigmas, grad_norm=grad_norm, weighted_sampling=False)
eps = [eps[s] for s in dp_sigmas]

dpiw_sigmas = df2['sigma'].tolist()
weps = get_sigma_epsilon(ds_name, [epochs] * len(dpiw_sigmas), sample_rate, dpiw_sigmas, grad_norm=grad_norm, weighted_sampling=True)
weps = [weps[s] for s in dpiw_sigmas]

# Bundle the epsilons with the test metrics of both sweeps; the "dpiw" entries
# come from the weighted runs (df2).
ret = {"eps": eps, "weps": weps}
for metric in ("val epoch", "tst disparity", "tst acc", "tst wg acc"):
    ret[metric] = df1[metric].tolist()
    ret[f"{metric} dpiw"] = df2[metric].tolist()
joblib.dump(ret, f"data/disparity_{ds_name}.pkl")

print(ret)

#plt.plot(eps, df['tst disparity_x'].tolist(), label="DP")
#plt.plot(weps, df['tst disparity_y'].tolist(), label="DP IW")
#
#plt.xticks(fontsize=fontsize)
#plt.yticks(fontsize=fontsize)
#plt.xscale("log")
#plt.xlabel("Privacy budget $\epsilon$", fontsize=fontsize)
#plt.ylabel("Disparity", fontsize=fontsize)
#plt.legend(fontsize=fontsize, frameon=False)
#plt.tight_layout()
#plt.savefig(f"./figs/disparity_{ds_name}.png")
#plt.show()

{'eps': [19.308295030488967, 5.986238994520431, 1.6704210664616577, 0.25916170582883546, 0.0342371335515071], 'weps': [11.211052000565424, 3.8919350898153136, 2.3720271692939545, 1.3253629702536864, 0.6194620711495639], 'val epoch': [29, 29, 29, 29, 29], 'val epoch dpiw': [29, 29, 29, 29, 29], 'tst disparity': [0.7322347164154055, 0.715302646160126, 0.8302440494298935, 1.0, 1.0], 'tst disparity dpiw': [0.14462369680404663, 0.15380823612213135, 0.1383289098739624, 0.3144318461418152, 0.8349544405937195], 'tst acc': [0.9348261952400208, 0.931319534778595, 0.9172928333282472, 0.8667468428611755, 0.8667468428611755], 'tst acc dpiw': [0.8767157793045044, 0.8292756080627441, 0.8033263087272644, 0.6442240476608276, 0.1332531869411468], 'tst wg acc': [0.2555555701255798, 0.2722222208976745, 0.1555555611848831, 0.0, 0.0], 'tst wg acc dpiw': [0.7666666507720947, 0.7611111402511597, 0.7596549391746521, 0.5512939691543579, 0.0]}


In [256]:
# NOTE(review): leftover display cell. The `_x`/`_y`-suffixed columns in its saved
# output suggest `df` was once a pandas merge of the two sweeps, but no executed
# code in the visible cells builds such a frame; on a fresh top-to-bottom run `df`
# would presumably be the LaTeX-table frame instead. Consider deleting this cell.
df

Unnamed: 0,sigma,tst disparity_x,tst acc_x,val epoch_x,tst disparity_y,tst acc_y,val epoch_y
0,1.0,1.0,0.866747,1,0.10784,0.844404,19
1,5.0,1.0,0.866747,1,1.0,0.866747,1
2,10.0,1.0,0.866747,2,0.834954,0.866747,4


In [274]:
ds_name = "utkface"

# Logged per-group accuracy columns: five race groups for `male`, then the
# same five for `female`.
col_names = [
    'acc_y:male_race:White',
    'acc_y:male_race:Black',
    'acc_y:male_race:Asian',
    'acc_y:male_race:Indian',
    'acc_y:male_race:Others',
    'acc_y:female_race:White',
    'acc_y:female_race:Black',
    'acc_y:female_race:Asian',
    'acc_y:female_race:Indian',
    'acc_y:female_race:Others',
]

base_path = "./data"

# Last-row train and test per-group accuracies for every training method,
# stored as one flat vector (train values followed by test values).
data = {}
for method, run_dir in [
    ("ERM", "utkface/erm-resnet50/"),
    ("IWERM", "utkface/iwerm-resnet50/"),
    ("DP ERM", "utkface/erm-dp_resnet50-lr1e-3-dpsgd_1e-5_0.1_1.0_0.001/"),
    ("DP ERM IW", "utkface/weightederm-dp_resnet50-lr1e-3-dpsgd_1e-5_0.1_1.0_0.001/"),
]:
    log_path = os.path.join(base_path, run_dir)
    data[method] = np.concatenate(get_res(log_path, col_names))

In [275]:
# (split, sex, race) labels for the table columns, in the same order as the
# flat train-then-test vectors stored per method.
column_names = [
    (split, sex, race)
    for split in ["train", "test"]
    for sex in ["male", "female"]
    for race in ["White", "Black", "Asian", "Indian", "Others"]
]

# Row order of the training methods in the rendered table.
order = ['ERM', 'IWERM', 'DP ERM', 'DP ERM IW']
def key_fn(x):
    """Sort key passed to ``DataFrame.sort_index``: turn index labels into ranks.

    Labels found in the global ``order`` list rank by their position in it;
    any other label ranks 0 if it is "train" and 1 otherwise.

    Args:
        x: iterable of index labels (one index level at a time).

    Returns:
        List of integer ranks, one per label, in input order.
    """
    def rank(label):
        if label in order:
            return order.index(label)
        return 0 if label == "train" else 1

    return [rank(label) for label in x]

# One row per method, columns labelled (split, sex, race); the split level is
# then moved into the row index and rows are ordered with key_fn.
df = pd.DataFrame.from_dict(data).transpose()
df.columns = pd.MultiIndex.from_tuples(column_names)
df = df.stack(0).sort_index(axis=0, level=[0, 1], key=key_fn)

# Render as LaTeX, centring the value columns (with a rule between the two
# five-column halves) and the multicolumn headers.
latex_table = df.to_latex(float_format="%.2f", multirow=True)
print(latex_table.replace("llrrrrrrrrrr", "llccccc|ccccc").replace("{l}", "{c}"))

\begin{tabular}{llccccc|ccccc}
\toprule
          &      & \multicolumn{5}{c}{female} & \multicolumn{5}{c}{male} \\
          &      &  Asian & Black & Indian & Others & White & Asian & Black & Indian & Others & White \\
\midrule
\multirow{2}{*}{ERM} & train &   1.00 &  1.00 &   1.00 &   0.99 &  1.00 &  1.00 &  1.00 &   1.00 &   0.99 &  1.00 \\
          & test &   0.82 &  0.85 &   0.89 &   0.84 &  0.89 &  0.87 &  0.96 &   0.93 &   0.93 &  0.95 \\
\cline{1-12}
\multirow{2}{*}{IWERM} & train &   1.00 &  1.00 &   1.00 &   0.99 &  1.00 &  1.00 &  1.00 &   1.00 &   0.99 &  1.00 \\
          & test &   0.86 &  0.89 &   0.91 &   0.86 &  0.91 &  0.87 &  0.95 &   0.93 &   0.89 &  0.93 \\
\cline{1-12}
\multirow{2}{*}{DP ERM} & train &   0.87 &  0.89 &   0.94 &   0.90 &  0.90 &  0.86 &  0.96 &   0.92 &   0.85 &  0.91 \\
          & test &   0.87 &  0.90 &   0.93 &   0.96 &  0.93 &  0.76 &  0.92 &   0.88 &   0.77 &  0.88 \\
\cline{1-12}
\multirow{2}{*}{DP ERM IW} & train &   0.87 &  0.90 &   0.93

In [74]:
ds_name = "utkface"
epochs = 100

# Logged per-group accuracy columns used for the disparity computation.
col_names = [
    'acc_y:male_race:White',
    'acc_y:male_race:Black',
    'acc_y:male_race:Asian',
    'acc_y:male_race:Indian',
    'acc_y:male_race:Others',
    'acc_y:female_race:White',
    'acc_y:female_race:Black',
    'acc_y:female_race:Asian',
    'acc_y:female_race:Indian',
    'acc_y:female_race:Others',
]

# DP ERM sweep: one run directory per noise multiplier.
sigmas = [0.5, 0.8, 1.0, 5.0, 10.0]
log_paths = [f"erm-dp_resnet50-lr1e-3-dpsgd_1e-5_{sigma}_1.0_0.001" for sigma in sigmas]
data = get_data(sigmas, log_paths, col_names, max_epoch=epochs, early_stop=False, base_path="../logs/")
df1 = pd.DataFrame.from_dict(data)

# Weighted DP sweep (note: a different set of noise multipliers).
sigmas = [0.7, 1.0, 5.0, 10.0]
log_paths = [f"weightederm-dp_resnet50-lr1e-3-dpsgd_1e-5_{sigma}_1.0_0.001" for sigma in sigmas]
data = get_data(sigmas, log_paths, col_names, max_epoch=epochs, early_stop=False, base_path="../logs/")
df2 = pd.DataFrame.from_dict(data)

In [75]:
ds_name = "utkface"
sample_rate = 0.001
grad_norm = 1.0

# Epsilon per noise multiplier, accounting the full `epochs` budget for every
# run. (An earlier variant merged df1/df2 on sigma and evaluated both sweeps
# on the shared sigma values only.)
dp_sigmas = df1['sigma'].tolist()
eps = get_sigma_epsilon(ds_name, [epochs] * len(dp_sigmas), sample_rate, dp_sigmas, grad_norm=grad_norm, weighted_sampling=False)
eps = [eps[s] for s in dp_sigmas]

dpiw_sigmas = df2['sigma'].tolist()
weps = get_sigma_epsilon(ds_name, [epochs] * len(dpiw_sigmas), sample_rate, dpiw_sigmas, grad_norm=grad_norm, weighted_sampling=True)
weps = [weps[s] for s in dpiw_sigmas]

# Bundle the epsilons with the test metrics of both sweeps; the "dpiw" entries
# come from the weighted runs (df2).
ret = {"eps": eps, "weps": weps}
for metric in ("val epoch", "tst disparity", "tst acc", "tst wg acc"):
    ret[metric] = df1[metric].tolist()
    ret[f"{metric} dpiw"] = df2[metric].tolist()
joblib.dump(ret, f"data/disparity_{ds_name}.pkl")
print(ret)

#plt.plot(eps, df['tst disparity_x'].tolist(), label="DP")
#plt.plot(weps, df['tst disparity_y'].tolist(), label="DP IW")
#
#plt.xticks(fontsize=fontsize)
#plt.yticks(fontsize=fontsize)
#plt.xscale("log")
#plt.xlabel("Privacy budget $\epsilon$", fontsize=fontsize)
#plt.ylabel("Disparity", fontsize=fontsize)
#plt.legend(fontsize=fontsize, frameon=False)
#plt.tight_layout()
#plt.savefig(f"./figs/disparity_{ds_name}.png")
#plt.show()

Downloading dataset to ../data/UTKFace_v1.0...
You can also download the dataset manually at https://wilds.stanford.edu/downloads.
Downloading  to ../data/UTKFace_v1.0/archive.tar.gz


0Byte [00:00, ?Byte/s]


../data/UTKFace_v1.0/archive.tar.gz may be corrupted. Please try deleting it and rerunning this command.

Exception:  unknown url type: ''
problem with:  ../data/UTKFace_v1.0/39_1_20170116174525125.jpg.chip.jpg
problem with:  ../data/UTKFace_v1.0/61_1_20170109142408075.jpg.chip.jpg
problem with:  ../data/UTKFace_v1.0/61_1_20170109150557335.jpg.chip.jpg
Downloading dataset to ../data/UTKFace_v1.0...
You can also download the dataset manually at https://wilds.stanford.edu/downloads.
Downloading  to ../data/UTKFace_v1.0/archive.tar.gz


0Byte [00:00, ?Byte/s]


../data/UTKFace_v1.0/archive.tar.gz may be corrupted. Please try deleting it and rerunning this command.

Exception:  unknown url type: ''
problem with:  ../data/UTKFace_v1.0/39_1_20170116174525125.jpg.chip.jpg
problem with:  ../data/UTKFace_v1.0/61_1_20170109142408075.jpg.chip.jpg
problem with:  ../data/UTKFace_v1.0/61_1_20170109150557335.jpg.chip.jpg
{'eps': [11.40438095741977, 2.347376731680425, 1.506315400257187, 0.1899858365788877, 0.08729803886149753], 'weps': [12.855302476865257, 5.55425476644115, 0.6643310892945099, 0.30655830788646443], 'val epoch': [99, 99, 99, 99, 99], 'val epoch dpiw': [99, 99, 99, 99], 'tst disparity': [0.3632596731185913, 0.27713799476623535, 0.288329541683197, 0.9362154006958008, 0.6993197053670883], 'tst disparity dpiw': [0.31173253059387207, 0.2925248146057129, 0.48421278595924383, 0.6915723532438278], 'tst acc': [0.7737892866134644, 0.7378929257392883, 0.6858538389205933, 0.4953271150588989, 0.5278249979019165], 'tst acc dpiw': [0.7514868378639221, 0

# iNaturalist

In [5]:
ds_name = "inaturalist"
epochs = 20

# Logged per-class accuracy columns used for the disparity computation.
col_names = [
    'acc_y:Actinopterygii',
    'acc_y:Amphibia',
    'acc_y:Animalia',
    'acc_y:Arachnida',
    'acc_y:Aves',
    'acc_y:Chromista',
    'acc_y:Fungi',
    'acc_y:Insecta',
    'acc_y:Mammalia',
    'acc_y:Mollusca',
    'acc_y:Plantae',
    'acc_y:Protozoa',
    'acc_y:Reptilia',
]

# DP ERM sweep: one run directory per noise multiplier.
sigmas = [0.3, 0.36, 0.5, 1.0]
log_paths = [f"erm-dp_resnet18-lr1e-3-dpsgd_1e-5_{sigma}_10.0_0.0001" for sigma in sigmas]
data = get_data(sigmas, log_paths, col_names, max_epoch=epochs, early_stop=False, base_path="../logs/")
df1 = pd.DataFrame.from_dict(data)

# Weighted DP sweep (a different set of noise multipliers).
sigmas = [0.8, 2.0, 3.0, 4.0]
log_paths = [f"weightederm-dp_resnet18-dpsgd_1e-5_{sigma}_10.0_0.0001" for sigma in sigmas]
data = get_data(sigmas, log_paths, col_names, max_epoch=epochs, early_stop=False, base_path="../logs/")
df2 = pd.DataFrame.from_dict(data)

erm-dp_resnet18-lr1e-3-dpsgd_1e-5_0.5_10.0_0.0001 8


In [6]:
ds_name = "inaturalist"
sample_rate = 0.0001
grad_norm = 10.0

# Epsilon per noise multiplier, accounting the full `epochs` budget for every run.
dp_sigmas = df1['sigma'].tolist()
eps = get_sigma_epsilon(ds_name, [epochs] * len(dp_sigmas), sample_rate, dp_sigmas, grad_norm=grad_norm, weighted_sampling=False)
eps = [eps[s] for s in dp_sigmas]

dpiw_sigmas = df2['sigma'].tolist()
weps = get_sigma_epsilon(ds_name, [epochs] * len(dpiw_sigmas), sample_rate, dpiw_sigmas, grad_norm=grad_norm, weighted_sampling=True)
weps = [weps[s] for s in dpiw_sigmas]

# Bundle the epsilons with the test metrics of both sweeps; the "dpiw" entries
# come from the weighted runs (df2).
ret = {"eps": eps, "weps": weps}
for metric in ("val epoch", "tst disparity", "tst acc", "tst wg acc"):
    ret[metric] = df1[metric].tolist()
    ret[f"{metric} dpiw"] = df2[metric].tolist()
joblib.dump(ret, f"data/disparity_{ds_name}.pkl")
print(ret)

#plt.plot(eps, df['tst disparity_x'].tolist(), label="DP")
#plt.plot(weps, df['tst disparity_y'].tolist(), label="DP IW")
#
#plt.xticks(fontsize=fontsize)
#plt.yticks(fontsize=fontsize)
#plt.xscale("log")
#plt.xlabel("Privacy budget $\epsilon$", fontsize=fontsize)
#plt.ylabel("Disparity", fontsize=fontsize)
#plt.legend(fontsize=fontsize)
#plt.tight_layout()
#plt.savefig(f"./figs/disparity_{ds_name}.png")
#plt.show()

Downloading dataset to ../data/inaturalist_v1.0...
You can also download the dataset manually at https://wilds.stanford.edu/downloads.
Using downloaded and verified file: ../data/inaturalist_v1.0/archive.tar.gz
Extracting ../data/inaturalist_v1.0/archive.tar.gz to ../data/inaturalist_v1.0

../data/inaturalist_v1.0/archive.tar.gz may be corrupted. Please try deleting it and rerunning this command.

Exception:  Compressed file ended before the end-of-stream marker was reached
Downloading dataset to ../data/inaturalist_v1.0...
You can also download the dataset manually at https://wilds.stanford.edu/downloads.
Using downloaded and verified file: ../data/inaturalist_v1.0/archive.tar.gz
Extracting ../data/inaturalist_v1.0/archive.tar.gz to ../data/inaturalist_v1.0

../data/inaturalist_v1.0/archive.tar.gz may be corrupted. Please try deleting it and rerunning this command.

Exception:  Compressed file ended before the end-of-stream marker was reached
{'eps': [121.28333738780964, 11.8306143257