In [1]:
# Load the data and libraries
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')

# Load data files
import numpy as np
import urllib.request
import io

url_x = 'https://github.com/jnear/cs211-data-privacy/raw/master/slides/adult_processed_x.npy'
url_y = 'https://github.com/jnear/cs211-data-privacy/raw/master/slides/adult_processed_y.npy'

with urllib.request.urlopen(url_x) as url:
    f = io.BytesIO(url.read())
X = np.load(f)

with urllib.request.urlopen(url_y) as url:
    f = io.BytesIO(url.read())
y = np.load(f)

In [2]:
import torch

In [3]:
from timeit import default_timer as timer

In [4]:
def gaussian_mech_zCDP_vec(vec, sensitivity, rho):
    sigma = np.sqrt((sensitivity**2) / (2 * rho))
    return vec + np.random.normal(loc=0, scale=sigma, size=vec.shape)

In [5]:
# Split data into training and test sets
training_size = int(X.shape[0] * 0.8)

X_train = X[:training_size]
X_test = X[training_size:]

y_train = y[:training_size]
y_test = y[training_size:]

print('Train and test set sizes:', len(y_train), len(y_test))

Train and test set sizes: 36176 9044


# Clipping and Gradient definition

In [6]:
def L2_clip_array(vs , b):
    norms = np.linalg.norm(vs, ord = 2, axis = 1)
    ratios = vs/norms[:, None]
    results = np.where((norms > b)[:, None], b*ratios, vs)
    return results

In [12]:
def vgradient(theta_in, x_in, y_in, C):
    x = x_in
    y = y_in
    theta = theta_in
    exponent = y * np.dot(x, theta)
    rhs = (y/(1+np.exp(exponent)))
    gradients = -(x*rhs[:, None])
    return gradients
#     clipped_grads = L2_clip_array(gradients, C)
#     return np.sum(clipped_grads, axis = 0)

In [13]:
# Prediction: take a model (theta) and a single example (xi) and return its predicted label
def predict(xi, theta, bias=0):
    label = np.sign(xi @ theta + bias)
    return label

def accuracy(theta):
    return np.sum(predict(X_test, theta) == y_test)/X_test.shape[0]

# Baseline (gradient clipping DP-SGD)

In [148]:
def dp_gradient_descent(epochs, rho):
    rho_i = rho/epochs
    d = X_train.shape[1]
    theta = np.zeros(d)                 # leaks the number of features, without privacy
    clipping_param = 1
    num_examples = X_train.shape[0]     # leaks the number of training examples, without privacy

    BATCH_SIZE = 256
    num_batches = int(num_examples / BATCH_SIZE)
    batches_X = np.array_split(X, num_batches)
    batches_y = np.array_split(y, num_batches)

    for i in range(epochs):

        for xs, ys in zip(batches_X, batches_y):
            grads           = vgradient(theta, xs, ys, clipping_param)
            clipped_grads   = L2_clip_array(grads, clipping_param)
            avg_grad        = np.mean(clipped_grads, axis=0) # sensitivity is clipping_param / len(xs)
            #print(avg_grad)
            noisy_grad      = gaussian_mech_zCDP_vec(avg_grad, clipping_param/len(xs), rho_i)
            theta           = theta - (.1/(i+5))*np.sign(noisy_grad)

    return theta

In [149]:
def zcdp_eps(rho, delta):
    return rho + 2*np.sqrt(rho * np.log(1/delta))
zcdp_eps(0.001, 1e-5)

0.21559660262893474

In [150]:
rho = 0.001
epochs = 10
print('eps:', zcdp_eps(rho, 1e-5))
accs = [accuracy(dp_gradient_descent(epochs, rho)) for _ in range(10)]
print('mean:', np.mean(accs))
print('std:', np.std(accs))

eps: 0.21559660262893474
mean: 0.7937859354268022
std: 0.008198639943351002
