# Adult DP-SGD Training
This notebook trains a model on the Adult dataset with DP-SGD algorithms from the TensorFlow Privacy (tf-privacy) library. Its primary use case is running experiments to determine optimal hyperparameters for data pruning.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import feature_column
import tensorflow_datasets as tfds
import os
from os import path
import pickle
import time
import datetime

In [None]:
# import tf-privacy libraries
from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy
from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy_lib
from tensorflow_privacy.privacy.analysis.rdp_accountant import compute_rdp
from tensorflow_privacy.privacy.analysis.rdp_accountant import get_privacy_spent

from tensorflow_privacy.privacy.optimizers.dp_optimizer_keras import DPKerasSGDOptimizer
from tensorflow_privacy.privacy.optimizers.dp_optimizer_keras import DPKerasAdamOptimizer
from tensorflow_privacy.privacy.optimizers.dp_optimizer_keras_vectorized import VectorizedDPKerasSGDOptimizer
from tensorflow_privacy.privacy.optimizers.dp_optimizer_keras_vectorized import VectorizedDPKerasAdagradOptimizer

In [None]:
# Trim a CSV-backed dataset to a whole number of batches by dropping random rows.
def remove_random_entries(path, multiple=100, seed=None):
    """Load a CSV and drop random rows until the row count is a multiple of ``multiple``.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.
        multiple: The returned frame has a row count divisible by this value.
            Defaults to 100, the value that used to be hard-coded.
        seed: Optional seed for the row selection. When ``None`` (the default)
            NumPy's global random state is used, so the dropped rows differ
            between runs unless the caller has seeded it.

    Returns:
        A DataFrame with ``len(df) % multiple`` randomly chosen rows removed.
        Surviving rows keep their original index labels.

    Raises:
        ValueError: If ``multiple`` is smaller than 1.
    """
    if multiple < 1:
        raise ValueError("multiple must be a positive integer, got {}".format(multiple))

    df = pd.read_csv(path)
    n_excess = len(df) % multiple
    if n_excess == 0:
        # Already a whole multiple: nothing to drop.
        return df

    # Fall back to the global NumPy state so existing np.random.seed() calls
    # in a notebook keep working exactly as before.
    sampler = np.random if seed is None else np.random.RandomState(seed)
    rows_to_drop = sampler.choice(df.index, n_excess, replace=False)
    return df.drop(rows_to_drop)

### Load dataset

In [None]:
# Read the three one-hot encoded splits (train, test, validation), each trimmed
# to a row count divisible by 100 by remove_random_entries.
train_df = remove_random_entries("data/train-one-hot.csv")
test_df = remove_random_entries("data/test-one-hot.csv")
val_df = remove_random_entries("data/val-one-hot.csv")

# Detach the 'salary' label column from each split; pop() removes it in place,
# so the *_df frames hold features only from here on.
train_target_df = train_df.pop('salary')
test_target_df = test_df.pop('salary')
val_target_df = val_df.pop('salary')

In [None]:
# Show the training features; the 'salary' label column was popped off above.
train_df

In [None]:
# Training configuration shared by every run of the scan.
NUM_TRAIN_EXAMPLES = len(train_target_df)  # rows left after trimming the training split
EPOCHS = 100            # epochs trained per hyperparameter pair
BATCH_SIZE = 100
N_MICROBATCHES = 100    # same value as BATCH_SIZE, i.e. one example per microbatch
LEARNING_RATE = 0.001
DELTA = 1e-5            # target delta used when converting RDP to epsilon

# L2_NORM_CLIP and NOISE_MULTIPLIER are deliberately not fixed here: they are the
# quantities scanned by run_dpsgd_scan (earlier fixed values were 5.9 and 1.0).

### Calculating epsilon
A function that takes `steps`, `batch_size`, `num_training_examples` and `noise_multiplier` and returns the privacy budget (epsilon) spent after that many steps or, equivalently, after the corresponding number of epochs.

In [None]:
# Convert a number of DP-SGD steps into the privacy budget (epsilon) spent.
# Inputs: steps, batch_size, num_training_examples, noise_multiplier and,
# optionally, delta (falls back to the notebook-level DELTA).
def compute_epsilon(steps, batch_size, num_training_examples, noise_multiplier, delta=None):
    """Compute the epsilon spent after ``steps`` DP-SGD steps.

    Args:
        steps: Number of optimizer steps taken so far.
        batch_size: Number of examples per step.
        num_training_examples: Size of the training set.
        noise_multiplier: Noise multiplier used by the DP optimizer.
        delta: Target delta for the (epsilon, delta) guarantee. When ``None``
            (the default) the module-level ``DELTA`` constant is used, which
            matches the previous behaviour.

    Returns:
        The epsilon value, i.e. the first element of the tuple returned by
        ``get_privacy_spent``. ``float('inf')`` when ``noise_multiplier`` is 0,
        since no noise means no privacy guarantee.

    Raises:
        ValueError: If the implied sampling probability
            ``batch_size / num_training_examples`` is not within [0, 1].
    """
    if noise_multiplier == 0.0:
        return float('inf')

    # A sampling probability outside [0, 1] is meaningless; fail with a clear
    # message instead of passing it on to the accountant.
    if num_training_examples <= 0 or batch_size < 0 or batch_size > num_training_examples:
        raise ValueError(
            "batch_size ({}) must lie between 0 and num_training_examples ({})".format(
                batch_size, num_training_examples))

    if delta is None:
        # Fixed notebook-wide constant, not derived from the dataset size.
        delta = DELTA

    # Renyi orders to evaluate: 1.1 .. 10.9 in steps of 0.1, then the integers 12 .. 63.
    orders = [1 + x / 10. for x in range(1, 100)] + list(range(12, 64))
    sampling_probability = batch_size / num_training_examples
    rdp = compute_rdp(q=sampling_probability,
                      noise_multiplier=noise_multiplier,
                      steps=steps,
                      orders=orders)
    return get_privacy_spent(orders, rdp, target_delta=delta)[0]

## Running dp-sgd
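A function that takes a list of clipping norms and a list of noise multipliers, trains one model for every (noise multiplier, clipping norm) pair, and records accuracy, validation accuracy and epsilon at each epoch in a given list of epochs.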

In [None]:
# Scan over both clipping norm and noise multiplier, running one full training
# loop per pair, and return the collected results as a pandas DataFrame.
def _build_dp_model(l2_norm_clip, noise_multiplier, n_features):
    """Build and compile a fresh DP-SGD classifier for one hyperparameter pair."""
    optimizer = VectorizedDPKerasSGDOptimizer(
        l2_norm_clip=l2_norm_clip,
        noise_multiplier=noise_multiplier,
        num_microbatches=N_MICROBATCHES,
        learning_rate=LEARNING_RATE
    )
    model = tf.keras.Sequential([
        tf.keras.Input(shape=(n_features,)),
        tf.keras.layers.Dense(128, activation="relu"),
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dense(32, activation="relu"),
        tf.keras.layers.Dense(1)]  # single logit, paired with from_logits=True below
    )
    model.compile(
        optimizer=optimizer,
        # Reduction.NONE keeps one loss value per example instead of a batch mean;
        # tf-privacy's DP optimizers expect this unreduced (vector) loss.
        loss=tf.keras.losses.BinaryCrossentropy(from_logits=True,
                                                reduction=tf.losses.Reduction.NONE),
        # The model emits logits, so the decision boundary is 0, not 0.5. The plain
        # 'accuracy' string would threshold the raw logits at 0.5. The metric keeps
        # the name 'accuracy' so the history keys are unchanged.
        metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)])
    return model


def run_dpsgd_scan(l2_norm_clip_scan, noise_multiplier_scan, epoch_scan):
    """Train one DP-SGD model per (noise multiplier, clipping norm) pair.

    Each pair is trained for ``EPOCHS`` epochs on the notebook-level training
    data, with the validation split passed as ``validation_data``. For every
    epoch in ``epoch_scan`` the training accuracy, validation accuracy and the
    epsilon spent up to that epoch are recorded.

    Args:
        l2_norm_clip_scan: Sized sequence of clipping norms to try.
        noise_multiplier_scan: Sized sequence of noise multipliers to try.
        epoch_scan: Epochs (1-based) at which to record results. Every entry
            must lie in ``1..EPOCHS``.

    Returns:
        A DataFrame with columns ``epoch``, ``noise_multiplier``,
        ``clipping_norm``, ``acc``, ``val_acc`` and ``epsilon``; one row per
        (pair, epoch) combination.

    Raises:
        ValueError: If ``epoch_scan`` contains an epoch outside ``1..EPOCHS``.
            Checked before any training so a bad value cannot waste a run.
    """
    epoch_scan = list(epoch_scan)
    bad_epochs = [e for e in epoch_scan if not 1 <= e <= EPOCHS]
    if bad_epochs:
        raise ValueError(
            "epoch_scan entries must be between 1 and EPOCHS={}; got {}".format(EPOCHS, bad_epochs))

    start = time.time()

    columns = ['epoch', 'noise_multiplier', 'clipping_norm', 'acc', 'val_acc', 'epsilon']
    records = []

    total_loops = len(l2_norm_clip_scan) * len(noise_multiplier_scan)
    current_loop = 0

    for noise_multiplier in noise_multiplier_scan:
        for l2_norm_clip in l2_norm_clip_scan:
            # Reset Keras global state so each run starts from a clean graph.
            tf.keras.backend.clear_session()

            # Input width follows the data rather than a hard-coded column count.
            model = _build_dp_model(l2_norm_clip, noise_multiplier, train_df.shape[1])

            history = model.fit(train_df.values,
                                train_target_df.values,
                                validation_data=(val_df.values, val_target_df.values),
                                batch_size=BATCH_SIZE,
                                epochs=EPOCHS,
                                verbose=0)

            for epoch in epoch_scan:
                # Optimizer steps taken by the end of this epoch, as an integer count.
                steps = epoch * NUM_TRAIN_EXAMPLES // BATCH_SIZE
                records.append({
                    'epoch': epoch,
                    'noise_multiplier': noise_multiplier,
                    'clipping_norm': l2_norm_clip,
                    # history lists are 0-based, epochs are 1-based
                    'acc': history.history["accuracy"][epoch - 1],
                    'val_acc': history.history["val_accuracy"][epoch - 1],
                    'epsilon': compute_epsilon(steps,
                                               BATCH_SIZE,
                                               NUM_TRAIN_EXAMPLES,
                                               noise_multiplier),
                })

            # Progress report: validation accuracy of the fully trained model.
            loss, acc = model.evaluate(val_df.values, val_target_df.values, verbose=0)
            current_loop += 1
            print("# {} out of {} -- Noise: {} -- Clipping Norm: {} -- Accuracy {}".format(current_loop,
                                                                                           total_loops,
                                                                                           noise_multiplier,
                                                                                           l2_norm_clip,
                                                                                           acc))
            print("Elapsed time:", datetime.timedelta(seconds=time.time() - start))

    elapsed = datetime.timedelta(seconds=time.time() - start)
    # Build the frame once from the collected records; DataFrame.append was
    # removed in pandas 2.0.
    df = pd.DataFrame(records, columns=columns)
    print("Completed {} experiments in {}".format(total_loops, elapsed))
    return df

In [None]:
# Hyperparameter grids for the scan: every noise multiplier is paired with
# every clipping norm, and results are recorded at each listed epoch.
clipping_norm = [
    2, 2.5, 3, 3.5, 4, 4.5, 5, 5.5,        # coarse steps of 0.5 below 6
    6, 6.25, 6.5, 6.75,                    # steps of 0.25 from 6 upwards
    7, 7.25, 7.5, 7.75,
    8, 8.25, 8.5, 8.75,
    9.0, 9.25, 9.5, 9.75,
    10,
]
noise_multiplier = [
    0.9, 1.0, 1.1, 1.15, 1.2, 1.25, 1.3,
    1.4, 1.5, 1.6, 1.7, 1.8, 1.9,
    2.0, 2.1, 2.15, 2.2, 2.25, 2.3,
    2.4, 2.5,
]
epochs = [20, 40, 60, 80, 100]

In [None]:
# Run the full grid scan and collect every (pair, epoch) result in one dataframe.
df = run_dpsgd_scan(
    l2_norm_clip_scan=clipping_norm,
    noise_multiplier_scan=noise_multiplier,
    epoch_scan=epochs,
)

### Save output

In [None]:
# Save the dataframe. The output directory is created first so that the results
# of a long scan are not lost to a missing 'results/' folder.
results_path = 'results/dp_sgd_scan_results_4.csv'
os.makedirs(os.path.dirname(results_path), exist_ok=True)
df.to_csv(results_path, index=False)