# Adult DP-SGD Analysis
The main Adult DP-SGD analysis notebook. It analyses the DP-SGD results from all of the differently pruned datasets and creates the final plots for the research project.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import feature_column
import tensorflow_datasets as tfds
from sklearn.metrics import auc
import os
from os import path
import pickle
import time
import datetime

### Averaged experiment results

In [None]:
# takes in dataframe and returns a new df averaged over runs
def get_mean_results(df, prune_type, prune_frac):
    """Average the results over runs for one pruning configuration.

    Rows of ``df`` whose ``prune_type`` and ``prune_frac`` match the given
    values are grouped by ``prune_frac``, ``epoch``, ``batches``,
    ``noise_multiplier`` and ``clipping_norm``; every remaining numeric
    column is averaged within each group.

    Args:
        df: results dataframe containing the grouping columns above plus
            ``prune_type`` and ``run_number``.
        prune_type: value of the ``prune_type`` column to select.
        prune_frac: value of the ``prune_frac`` column to select.

    Returns:
        A new dataframe with one row per group, the group keys restored as
        ordinary columns and the ``run_number`` column removed.

    Raises:
        KeyError: if one of the required columns is missing from ``df``.
    """
    group_keys = ["prune_frac", "epoch", "batches", "noise_multiplier", "clipping_norm"]
    selected = df.loc[(df['prune_type'] == prune_type) & (df['prune_frac'] == prune_frac)]
    # numeric_only=True: the string-valued `prune_type` column cannot be
    # averaged; pandas >= 2.0 raises a TypeError instead of silently
    # dropping it, so exclude non-numeric columns explicitly.
    averaged = selected.groupby(group_keys).mean(numeric_only=True).reset_index()
    # the averaged run index carries no meaning
    return averaged.drop(['run_number'], axis=1)

### Plotting tools 

In [None]:
# takes in dataframe and returns a new df averaged over experiments (runs)
def get_mean_results_bs(df, batch_size, prune_type, prune_frac):
    """Average the results over runs for one batch size and pruning setup.

    Rows of ``df`` whose ``batch_size``, ``prune_type`` and ``prune_frac``
    match the given values are grouped by ``prune_frac``, ``steps``,
    ``epochs``, ``noise_multiplier`` and ``clipping_norm``; every remaining
    numeric column is averaged within each group.

    Args:
        df: results dataframe containing the grouping columns above plus
            ``batch_size``, ``prune_type`` and ``run_number``.
        batch_size: value of the ``batch_size`` column to select.
        prune_type: value of the ``prune_type`` column to select.
        prune_frac: value of the ``prune_frac`` column to select.

    Returns:
        A new dataframe with one row per group, the group keys restored as
        ordinary columns and the ``run_number`` column removed.

    Raises:
        KeyError: if one of the required columns is missing from ``df``.
    """
    group_keys = ["prune_frac", "steps", "epochs", "noise_multiplier", "clipping_norm"]
    row_mask = (
        (df['batch_size'] == batch_size)
        & (df['prune_type'] == prune_type)
        & (df['prune_frac'] == prune_frac)
    )
    # numeric_only=True: the string-valued `prune_type` column cannot be
    # averaged; pandas >= 2.0 raises a TypeError instead of silently
    # dropping it, so exclude non-numeric columns explicitly.
    averaged = df.loc[row_mask].groupby(group_keys).mean(numeric_only=True).reset_index()
    # the averaged run index carries no meaning
    return averaged.drop(['run_number'], axis=1)

In [None]:
# draw run-averaged curves for the requested pruning strategy
def plot_maker(df, batch_size, prune_list, prune_type, x_axis, y_axis, opt_label, line_width):
    """Plot run-averaged curves on the module-level ``ax``.

    Args:
        df: results dataframe passed on to ``get_mean_results_bs``.
        batch_size: batch size to select.
        prune_list: prune fractions to plot (ignored for ``'none'``).
        prune_type: ``'none'`` plots the unpruned (fraction 0.0) curve,
            ``'random'`` / ``'influential'`` plot that strategy for every
            fraction, ``'both'`` plots influential then random for every
            fraction. Any other value plots nothing.
        x_axis: column used for the x values.
        y_axis: column used for the y values (scaled by ``get_x_y``).
        opt_label: legend label prefix.
        line_width: line width handed to ``ax.plot``.
    """
    # unpruned baseline: a single curve, labelled with the prefix only
    if prune_type == 'none':
        baseline = get_mean_results_bs(df, batch_size, prune_type, 0.0)
        xs, ys = get_x_y(baseline, x_axis, y_axis)
        ax.plot(xs, ys, label="{}".format(opt_label), linewidth=line_width)
        return

    # resolve which pruning strategies are drawn for every fraction
    if prune_type == "both":
        strategies = ['influential', 'random']
    elif prune_type in ("random", "influential"):
        strategies = [prune_type]
    else:
        strategies = []

    for prune_frac in prune_list:
        for strategy in strategies:
            averaged = get_mean_results_bs(df, batch_size, strategy, prune_frac)
            xs, ys = get_x_y(averaged, x_axis, y_axis)
            short_name = 'Infl' if strategy == 'influential' else 'Random'
            curve_label = "{} {} {:.0f}%".format(opt_label, short_name, prune_frac*100)
            ax.plot(xs, ys, label=curve_label, linewidth=line_width)
            
    

In [None]:
# draw run-averaged curves for the requested pruning strategy in a fixed colour
def plot_maker_color(df, batch_size, prune_list, prune_type, x_axis, y_axis, opt_label, color, line_width):
    """Plot run-averaged curves on the module-level ``ax`` in one colour.

    Args:
        df: results dataframe passed on to ``get_mean_results_bs``.
        batch_size: batch size to select.
        prune_list: prune fractions to plot (ignored for ``'none'``).
        prune_type: ``'none'`` plots the unpruned (fraction 0.0) curve,
            ``'random'`` / ``'influential'`` plot that strategy for every
            fraction, ``'both'`` plots influential then random for every
            fraction. Any other value plots nothing.
        x_axis: column used for the x values.
        y_axis: column used for the y values (scaled by ``get_x_y``).
        opt_label: legend label prefix.
        color: colour handed to ``ax.plot`` for every curve drawn.
        line_width: line width handed to ``ax.plot``.
    """
    # unpruned baseline: a single curve, labelled with the prefix only
    if prune_type == 'none':
        baseline = get_mean_results_bs(df, batch_size, prune_type, 0.0)
        xs, ys = get_x_y(baseline, x_axis, y_axis)
        ax.plot(xs, ys, label="{}".format(opt_label), color=color, linewidth=line_width)
        return

    # which strategies to draw per fraction; unknown types draw nothing
    strategies_by_type = {
        "random": ["random"],
        "influential": ["influential"],
        "both": ['influential', 'random'],
    }
    strategies = strategies_by_type.get(prune_type, [])

    for prune_frac in prune_list:
        for strategy in strategies:
            averaged = get_mean_results_bs(df, batch_size, strategy, prune_frac)
            xs, ys = get_x_y(averaged, x_axis, y_axis)
            short_name = 'Infl' if strategy == 'influential' else 'Random'
            curve_label = "{} {} {:.0f}%".format(opt_label, short_name, prune_frac*100)
            ax.plot(xs, ys, label=curve_label, color=color, linewidth=line_width)
            
    

In [None]:
# plot every individual experiment (run) of one configuration
def plot_maker_experiments(df, batch_size, prune_frac, prune_type, experiments, x_axis, y_axis, color, alpha, line_width, linestyle):
    """Plot one unlabelled curve per run on the module-level ``ax``.

    Args:
        df: results dataframe with ``batch_size``, ``prune_type``,
            ``prune_frac``, ``run_number`` and ``epochs`` columns.
        batch_size: batch size to select.
        prune_frac: prune fraction to select.
        prune_type: pruning strategy to select.
        experiments: number of runs; run numbers ``0 .. experiments-1``
            are plotted.
        x_axis: column used for the x values.
        y_axis: column used for the y values (scaled by ``get_x_y``).
        color, alpha, line_width, linestyle: styling handed to ``ax.plot``.
    """
    # the configuration filter does not depend on the run, so build it once
    config_rows = df.loc[(df['batch_size'] == batch_size) &
                         (df['prune_type'] == prune_type) &
                         (df['prune_frac'] == prune_frac)]
    for experiment in range(experiments):
        run_rows = config_rows.loc[config_rows['run_number'] == experiment]
        # numeric_only=True: the string-valued `prune_type` column cannot be
        # averaged; pandas >= 2.0 raises a TypeError instead of dropping it.
        per_epoch = run_rows.groupby(["epochs"]).mean(numeric_only=True).reset_index()
        per_epoch = per_epoch.drop(['run_number'], axis=1)
        x, y = get_x_y(per_epoch, x_axis, y_axis)
        ax.plot(x, y, color=color, alpha=alpha, linewidth=line_width, linestyle=linestyle)


In [None]:
# plot the run-averaged curve of a single pruning configuration
def plot_maker_averaged(df, batch_size, prune_frac, prune_type, x_axis, y_axis, alpha, line_width, linestyle, opt_label):
    """Plot one run-averaged curve on the module-level ``ax``.

    Args:
        df: results dataframe passed on to ``get_mean_results_bs``.
        batch_size: batch size to select.
        prune_frac: prune fraction to select.
        prune_type: pruning strategy to select.
        x_axis: column used for the x values.
        y_axis: column used for the y values (scaled by ``get_x_y``).
        alpha, line_width, linestyle: styling handed to ``ax.plot``.
        opt_label: legend label of the curve.
    """
    averaged = get_mean_results_bs(df, batch_size, prune_type, prune_frac)
    xs, ys = get_x_y(averaged, x_axis, y_axis)
    line_style = dict(alpha=alpha, linewidth=line_width, linestyle=linestyle, label=opt_label)
    ax.plot(xs, ys, **line_style)


In [None]:
# pick the x and y columns for plotting
def get_x_y(df, x_string, y_string):
    """Return ``(df[x_string], df[y_string] * 100)``.

    The y values are multiplied by 100; the plots label them as percent.
    """
    x_values = df[x_string]
    y_scaled = df[y_string] * 100
    return x_values, y_scaled

In [None]:
# integrate a line plot
def return_area_max_epsilon(epsilon, x_array, y_array):
    """Return the area under the curve ``(x_array, y_array)``.

    The area is computed by ``sklearn.metrics.auc``.

    NOTE(review): ``epsilon`` is accepted but not used, so the curve is not
    truncated at a maximum epsilon -- confirm whether that is intended.
    """
    area = auc(x_array, y_array)
    return area

### Optimise batch_size

In [None]:
# load the batch-size scan results
scan_results_csv = 'results/dp_sgd_pruning_results_cpave_bs500_double_batch_size_scan_all.csv'
df = pd.read_csv(scan_results_csv)
# Alternative kept for reference: concatenate the partial scan files
# (presumably what the "_all" file was assembled from -- TODO confirm).
#df_1 = pd.read_csv('results/dp_sgd_pruning_results_cpave_bs500_double_batch_size_scan.csv')
#df_2 = pd.read_csv('results/dp_sgd_pruning_results_cpave_bs500_double_batch_size_scan_2.csv')
#df_3 = pd.read_csv('results/dp_sgd_pruning_results_cpave_bs500_double_batch_size_scan_3.csv')
#df = pd.concat([df_1, df_2, df_3])

In [None]:
# preview the first five rows of the loaded results
df.head(n=5)

### Plot accuracy vs epsilon with experiments

In [None]:
# plot options
BATCH_SIZE = [10, 15, 20, 25, 30, 35, 40, 45, 50, 100] #, 125, 150, 175, 200]
PRUNE_FRAC_LIST = [0.0, 0.1]
PRUNE_TYPE = ['none', 'random', 'influential', 'both']
X_AXIS = ["epoch", "steps", "epsilon"]
Y_AXIS = ['val_acc', 'acc', 'steps', 'epochs']
LINE_WIDTH = 3


# single-panel figure: validation accuracy against epsilon
f, ax = plt.subplots(figsize=(12, 8), ncols=1)

x_axis = 'epsilon'
y_axis = 'val_acc'

# one unpruned ('none') run-averaged curve per batch size; the loop variable
# is the only source of `batch_size`, so no separate initial value is needed
for batch_size in BATCH_SIZE:

    plot_maker(df, batch_size, [0.00], 'none', x_axis, y_axis, "Batch Size {}".format(batch_size), 2.5)

ax.set_xlim(0.1, 1.)
ax.legend(frameon=False, fontsize=12)


# label through the axes object, like the later plot cells do
ax.set_xlabel("Epsilon [\u03B5]", fontsize=12)
ax.set_ylabel("Accuracy [%]", fontsize=12)

# savefig does not create missing directories, so make sure the target exists
os.makedirs("plots", exist_ok=True)
plt.savefig("plots/epsilon_performance_batch_size_scan.pdf")

# Final results:
- Epoch: 50
- Batch size (BS): 50
- Prune: Influence

In [None]:
# load the random-pruning and influence-pruning results
df_random = pd.read_csv('results/dp_sgd_pruning_results_random_1.csv')
df_infl = pd.read_csv('results/dp_sgd_pruning_results_infl_cp50_bs50_1.csv')
# stack both result sets into one frame; the row indices of the two files
# are kept as they are (no renumbering)
df = pd.concat(objs=[df_random, df_infl])

In [None]:
# preview the first five rows of the combined results
df.head(n=5)

### Comparison of experiments

In [None]:
# plot options
BATCH_SIZE = [10, 15, 20, 25, 30, 35, 40, 45, 50, 100] #, 125, 150, 175, 200]
PRUNE_FRAC_LIST = [0.0, 0.1]
PRUNE_TYPE = ['none', 'random', 'influential', 'both']
X_AXIS = ["epoch", "steps", "epsilon"]
Y_AXIS = ['val_acc', 'acc', 'steps', 'epochs']
LINE_WIDTH = 3


# single-panel figure: validation accuracy against epsilon
f, ax = plt.subplots(figsize=(12, 8), ncols=1)

x_axis = 'epsilon'
y_axis = 'val_acc'

batch_size = 40

# unpruned baseline: thick averaged curve plus the 10 individual runs (dashed)
plot_maker_color(df, batch_size, [0.0], 'none', x_axis, y_axis, "Batch Size {}".format(batch_size), 'lightcoral', 3)
plot_maker_experiments(df, batch_size, 0.0, 'none', 10, x_axis, y_axis, 'lightcoral', 1, 1, '--')

# 1% influential pruning: thick averaged curve plus the 10 individual runs (dashed)
plot_maker_color(df, batch_size, [0.01], 'influential', x_axis, y_axis, "Batch Size {}".format(batch_size), 'seagreen', 3)
plot_maker_experiments(df, batch_size, 0.01, 'influential', 10, x_axis, y_axis, 'seagreen', 1, 1, '--')

ax.set_xlim(0.1, 0.5)
ax.legend(frameon=False, fontsize=12)

# label through the axes object, like the later plot cells do
ax.set_xlabel("Epsilon [\u03B5]", fontsize=12)
ax.set_ylabel("Accuracy [%]", fontsize=12)

# savefig does not create missing directories, so make sure the target exists
os.makedirs("plots", exist_ok=True)
plt.savefig("plots/epsilon_performance_experiment_comparison.pdf")

### Random vs influential pruning

In [None]:
# plot options
BATCH_SIZE = [25, 30, 35, 40]
PRUNE_FRAC_LIST = [0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.2]
PRUNE_TYPE = ['none', 'random', 'influential', 'both']
X_AXIS = ["epoch", "steps", "epsilon"]
Y_AXIS = ['val_acc', 'acc', 'steps', 'epochs']
LINE_WIDTH = 3


# single-panel figure: validation accuracy against epsilon
f, ax = plt.subplots(figsize=(12, 8), ncols=1)

x_axis = 'epsilon'
y_axis = 'val_acc'

batch_size = 40

# unpruned baseline
plot_maker_color(df, batch_size, [0.0], 'none', x_axis, y_axis, "Batch Size {}".format(batch_size), 'lightcoral', LINE_WIDTH)

# influential pruning at 10% and 20%; both curves carry the "BS <n>" prefix
# (the 20% call previously formatted an empty string, dropping the prefix)
plot_maker_color(df, batch_size, [0.1], 'influential', x_axis, y_axis, "BS {}".format(batch_size), 'seagreen', LINE_WIDTH)
plot_maker_color(df, batch_size, [0.2], 'influential', x_axis, y_axis, "BS {}".format(batch_size), 'darkgreen', LINE_WIDTH)

# random pruning at 10% and 20%, labelled the same way
plot_maker_color(df, batch_size, [0.1], 'random', x_axis, y_axis, "BS {}".format(batch_size), 'goldenrod', LINE_WIDTH)
plot_maker_color(df, batch_size, [0.2], 'random', x_axis, y_axis, "BS {}".format(batch_size), 'darkorange', LINE_WIDTH)


ax.set_xlim(0.1, 0.7)
ax.set_ylabel("Accuracy [%]", fontsize=12)
ax.set_xlabel("Epsilon [\u03B5]", fontsize=12)
ax.legend(frameon=False, fontsize=12)

# savefig does not create missing directories, so make sure the target exists
os.makedirs("plots", exist_ok=True)
plt.savefig("plots/epsilon_performance_comparison_bs_40.pdf")

### Scanning over prune fractions

In [None]:
# plot options
BATCH_SIZE = [25, 30, 35, 40]
PRUNE_FRACS = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.2]
#PRUNE_FRACS = [0.1]
PRUNE_TYPE = ['none', 'random', 'influential']
X_AXIS = ["epoch", "steps", "epsilon"]
Y_AXIS = ['val_acc', 'acc', 'steps', 'epochs']
LINE_WIDTH = 3
#----------------------------#
x_axis = 'epsilon'
y_axis = 'val_acc'
batch_size = 40
#----------------------------#

# single-panel figure: validation accuracy against epsilon
f, ax = plt.subplots(figsize=(12, 8), ncols=1)

# one thin dashed run-averaged curve per influential prune fraction
for prune_frac in PRUNE_FRACS:
    plot_maker_averaged(df, batch_size, prune_frac, 'influential', x_axis, y_axis, 1, 1.5, '--', "Batch Size: {} Prune Frac: {:.0f}%".format(batch_size, prune_frac*100))


# unpruned baseline drawn last as a thick solid line
plot_maker_color(df, batch_size, [0.0], 'none', x_axis, y_axis, "Batch Size: {} Prune Frac: 0%".format(batch_size), 'lightcoral', 4)

ax.set_xlim(0.1, 0.70)
ax.set_ylabel("Accuracy [%]", fontsize=12)
ax.set_xlabel("Epsilon [\u03B5]", fontsize=12)
ax.legend(frameon=False, fontsize=12)

# savefig does not create missing directories, so make sure the target exists
os.makedirs("plots", exist_ok=True)
plt.savefig("plots/epsilon_performance_comparison_prune_frac_scan.pdf")