# Dummy DP-SGD Script

This script incorporates an epsilon threshold into the training loop as a callback function.

In [None]:
import os
from os import path
import time
import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import feature_column
import tensorflow_datasets as tfds


In [None]:
from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy
from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy_lib
from tensorflow_privacy.privacy.analysis.rdp_accountant import compute_rdp
from tensorflow_privacy.privacy.analysis.rdp_accountant import get_privacy_spent

from tensorflow_privacy.privacy.optimizers.dp_optimizer_keras import DPKerasSGDOptimizer
from tensorflow_privacy.privacy.optimizers.dp_optimizer_keras import DPKerasAdamOptimizer

### Load dataset

In [None]:
def remove_random_entries(path, multiple=100):
    """Load a CSV and randomly drop rows until the row count divides evenly.

    Args:
        path: CSV location, joined onto the fixed project data directory.
            Following ``pathlib`` joining rules, an absolute path replaces
            that directory entirely.
        multiple: Positive integer that the returned row count must be
            divisible by. Defaults to 100, the value that used to be
            hard-coded here.

    Returns:
        A ``pandas.DataFrame`` with ``len(df) % multiple`` randomly chosen
        rows removed. Surviving rows keep their original index labels.

    Raises:
        ValueError: If ``multiple`` is not positive.
    """
    if multiple <= 0:
        raise ValueError("multiple must be a positive integer")
    data_dir = Path("/project/differential-privacy/adult_analysis/")
    df = pd.read_csv(data_dir / path)
    surplus = len(df) % multiple
    # Sampling uses NumPy's global RNG without replacement, so which rows are
    # dropped varies between runs unless the caller seeds np.random first.
    return df.drop(np.random.choice(df.index, surplus, replace=False))

In [None]:
# Load the training, test and validation splits.
# Each CSV goes through remove_random_entries, which randomly drops rows so
# that the remaining row count is a multiple of 100.
# The 'salary' column is popped out of every frame and kept as the target;
# after the pop, the *_df frames hold only the remaining (feature) columns.
train_df = remove_random_entries("data/train-one-hot.csv")
train_target_df = train_df.pop('salary')

test_df = remove_random_entries("data/test-one-hot.csv")
test_target_df = test_df.pop('salary')

val_df = remove_random_entries("data/val-one-hot.csv")
val_target_df = val_df.pop('salary')

### Computing Epsilon

In [None]:
# Hyperparameters shared by the privacy accounting and the training cells.
# Number of training rows left after the random trimming done at load time.
NUM_TRAIN_EXAMPLES=len(train_target_df.values)
# Upper bound on epochs passed to model.fit.
EPOCHS=100
# Batch size used for model.fit and for the sampling probability in compute_epsilon.
BATCH_SIZE=100
# Passed to the DP optimizer as num_microbatches.
N_MICROBATCHES=100
LEARNING_RATE=0.001
# Passed to the DP optimizer as l2_norm_clip.
L2_NORM_CLIP=2
# Passed to the DP optimizer as noise_multiplier and used by the RDP accountant.
NOISE_MULTIPLIER=2
# Target delta handed to get_privacy_spent when converting RDP to epsilon.
DELTA=1e-5

# Fail fast: each batch must split evenly into microbatches.
if BATCH_SIZE % N_MICROBATCHES != 0:
    raise ValueError('Batch size should be an integer multiple of the number of microbatches')

### Useful Methods

In [None]:
# Privacy accounting helper: takes a number of optimizer steps and returns the
# epsilon spent after those steps. Defaults come from the hyperparameter cell:
# > BATCH_SIZE
# > NUM_TRAIN_EXAMPLES
# > NOISE_MULTIPLIER
# > DELTA
def compute_epsilon(steps, batch_size=BATCH_SIZE, num_training_examples=NUM_TRAIN_EXAMPLES, noise_multiplier=NOISE_MULTIPLIER, delta=DELTA):
    """Compute the epsilon spent after ``steps`` training steps.

    Args:
        steps: Number of optimizer steps taken so far.
        batch_size: Examples per step; with ``num_training_examples`` it
            determines the sampling probability given to the accountant.
        num_training_examples: Size of the training set.
        noise_multiplier: Noise multiplier used during training. A value of
            zero means no noise, for which epsilon is infinite.
        delta: Target delta used when converting RDP to epsilon.

    Returns:
        The epsilon value (first element of ``get_privacy_spent``'s result),
        or ``float('inf')`` when ``noise_multiplier`` is zero.
    """
    # Use the argument, not the module-level constant, so that callers who
    # override noise_multiplier actually get accounting for their value.
    if noise_multiplier == 0.0:
        return float('inf')
    # RDP orders to evaluate: 1.1 .. 10.9 in steps of 0.1, then 12 .. 63.
    orders = [1 + x / 10. for x in range(1, 100)] + list(range(12, 64))
    sampling_probability = batch_size / num_training_examples
    rdp = compute_rdp(q=sampling_probability,
                      noise_multiplier=noise_multiplier,
                      steps=steps,
                      orders=orders)
    return get_privacy_spent(orders, rdp, target_delta=delta)[0]

In [None]:
def get_epsilon_per_epoch(total_epochs):
    """Tabulate the epsilon value for each epoch index below ``total_epochs``.

    For epoch index ``e`` the step count handed to ``compute_epsilon`` is
    ``e * len(train_target_df.values) / BATCH_SIZE``, so index 0 corresponds
    to zero steps.

    Returns:
        A ``pandas.DataFrame`` with columns ``'epochs'`` and ``'epsilon'``,
        one row per epoch index.
    """
    n_examples = len(train_target_df.values)
    epoch_ids = list(range(total_epochs))
    eps_values = [
        compute_epsilon(epoch_id * n_examples / BATCH_SIZE)
        for epoch_id in epoch_ids
    ]
    return pd.DataFrame({'epochs': epoch_ids, 'epsilon': eps_values})

## Define & train model

In [None]:
def get_compiled_model():
    """Build and compile the dense classifier trained with DP-SGD.

    The network takes 63 input features, passes them through three ReLU
    hidden layers (128, 64 and 32 units) and ends in a single linear unit
    whose output is treated as a logit by the loss.

    Returns:
        A compiled ``tf.keras.Sequential`` model using ``DPKerasSGDOptimizer``
        configured from the module-level hyperparameters.
    """
    net = tf.keras.Sequential()
    net.add(tf.keras.Input(shape=(63,)))
    for units in (128, 64, 32):
        net.add(tf.keras.layers.Dense(units, activation="relu"))
    # Final layer has no activation: the loss below is built with from_logits=True.
    net.add(tf.keras.layers.Dense(1))

    dp_optimizer = DPKerasSGDOptimizer(
        l2_norm_clip=L2_NORM_CLIP,
        noise_multiplier=NOISE_MULTIPLIER,
        num_microbatches=N_MICROBATCHES,
        learning_rate=LEARNING_RATE,
    )

    # Reduction.NONE keeps one loss value per example instead of a batch mean.
    # NOTE(review): presumably required by the DP optimizer's microbatching;
    # confirm against the TensorFlow Privacy docs.
    per_example_loss = tf.keras.losses.BinaryCrossentropy(
        from_logits=True,
        reduction=tf.losses.Reduction.NONE,
    )

    net.compile(
        optimizer=dp_optimizer,
        loss=per_example_loss,
        metrics=['accuracy'],
    )
    return net

In [None]:
# Build the compiled DP-SGD model and print its layer summary.
model = get_compiled_model()
model.summary()

### Training model

In [None]:
def get_epsilon(epoch):
    """Return the epsilon spent once the zero-based ``epoch`` has finished.

    The step count is ``(epoch + 1) * NUM_TRAIN_EXAMPLES / BATCH_SIZE``, i.e.
    all steps up to and including the given epoch.
    """
    completed_epochs = epoch + 1
    return compute_epsilon(completed_epochs * NUM_TRAIN_EXAMPLES / BATCH_SIZE)

In [None]:
class CustomCallback(tf.keras.callbacks.Callback):
    """Stop training once the epsilon for a finished epoch reaches a threshold.

    After each epoch the epsilon is obtained from ``get_epsilon``. While it
    stays below ``threshold`` the current weights are snapshotted. The first
    time it does not, training is stopped and the last snapshot (if any) is
    restored, so the model ends on the final epoch within the threshold.

    Args:
        threshold: Epsilon value that training must stay strictly below.
    """

    def __init__(self, threshold):
        super().__init__()
        self.threshold = threshold
        # Weights from the most recent epoch whose epsilon was below the
        # threshold; stays None until such an epoch has completed.
        self.best_weights = None

    def on_epoch_end(self, epoch, logs=None):
        """Check the epsilon for ``epoch``; snapshot weights or stop training."""
        eps = get_epsilon(epoch)
        print(f"Epoch: {epoch}, epsilon: {eps:.5f}")
        if eps < self.threshold:
            self.best_weights = self.model.get_weights()
            return

        self.model.stop_training = True
        if self.best_weights is None:
            # The very first epoch already reached the threshold, so there is
            # no valid snapshot; calling set_weights(None) would raise.
            print("No epoch stayed below the epsilon threshold; weights left unchanged.")
            return
        # revert to final valid epoch
        self.model.set_weights(self.best_weights)

In [None]:
# Train the model and report the wall-clock time spent in fit().
# Two callbacks run after each epoch:
#  - CustomCallback stops training once the computed epsilon is no longer below 0.3;
#  - EarlyStopping (patience=10, restore_best_weights=True) watches its default metric.
# NOTE(review): both callbacks may restore weights; confirm their interaction
# leaves the model at the intended epoch.
start = time.time()
history = model.fit(
    train_df.values,
    train_target_df.values,
    validation_data=(val_df.values, val_target_df.values),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS, 
    verbose=1,
    callbacks=[CustomCallback(threshold=0.3), tf.keras.callbacks.EarlyStopping(restore_best_weights=True, patience=10)],
)
end = time.time()
print("Total time:", datetime.timedelta(seconds=end - start))

## Results
Summary of the model performance. The defined methods can be used to compare multiple models.

In [None]:
# model plotter
def plot_model_results(history, clr, i="_alt"):
    """Draw train/validation loss and accuracy curves for one training run.

    Plots onto the module-level ``ax`` pair, which must exist before this is
    called: ``ax[0]`` receives the loss curves and ``ax[1]`` the accuracy
    curves.

    Args:
        history: Object whose ``history`` mapping holds the ``loss``,
            ``val_loss``, ``accuracy`` and ``val_accuracy`` series.
        clr: Matplotlib format prefix (e.g. a colour letter); training curves
            use it as-is, validation curves append ``--`` for a dashed line.
        i: Model identifier inserted into the legend labels.
    """
    train_fmt = f"{clr}"
    val_fmt = f"{clr}--"
    panels = (
        (ax[0], "loss", "$Loss$", "Loss", 2),
        (ax[1], "accuracy", "$Accuracy$", "Accuracy", 1.5),
    )
    for axis, metric, y_label, title, width in panels:
        axis.plot(history.history[metric], train_fmt,
                  label=f"M{i} Train {metric}", linewidth=width)
        axis.plot(history.history[f"val_{metric}"], val_fmt,
                  label=f"M{i} Val {metric}", linewidth=width)
        axis.set_xlabel("$Epochs$", fontsize=16)
        axis.set_ylabel(y_label, fontsize=16)
        axis.set_title(title, fontsize=18)
        axis.legend(frameon=False, fontsize=14)
    
# print results
def return_results(model, test_features, test_labels, i="0"):
    """Evaluate ``model`` on the given test data and print a one-line summary.

    Args:
        model: Object exposing ``evaluate(features, labels, verbose=...)``
            that returns a ``(loss, accuracy)`` pair.
        test_features: Features forwarded to ``model.evaluate``.
        test_labels: Labels forwarded to ``model.evaluate``.
        i: Model identifier shown in the printed line.
    """
    # Silent evaluation; only the formatted summary below is printed.
    test_loss, test_acc = model.evaluate(test_features, test_labels, verbose=0)
    accuracy_pct = 100 * test_acc
    print(f"M{i}|| Accuracy: {accuracy_pct:.2f}% --- Loss: {test_loss:.2f}")

In [None]:
# Print the test-set loss/accuracy summary for the trained model.
return_results(model, test_df.values, test_target_df.values)

# Create a two-panel figure (loss on the left, accuracy on the right).
# plot_model_results reads the module-level `ax` defined here, so this must
# run before any call to it.
f, ax = plt.subplots(figsize=(14, 6), ncols=2)

# Plot the training history of each model (here: a single model, drawn in green and labelled M1).
plot_model_results(history, "g", 1)