In [1]:
import matplotlib.pyplot as plt
import numpy as np

import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from IPython import display
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

torch.manual_seed(1)
np.random.seed(7)
sns.set(style="white", palette="muted", color_codes=True, context="talk")

%matplotlib inline
print(torch.__version__) 

1.4.0


# Set up the data

In [2]:
# Utility for loading up the dataset

def load_adult_data(path):
    column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education_num',
                    'martial_status', 'occupation', 'relationship', 'race', 'sex',
                    'capital_gain', 'capital_loss', 'hours_per_week', 'country', 'target']
    input_data = (pd.read_csv(path, names=column_names,
                              na_values="?", sep=r'\s*,\s*', engine='python')
                  .loc[lambda df: df['race'].isin(['White', 'Black'])])

    # targets; 1 when someone makes over 50k , otherwise 0
    y = (input_data['target'] == '>50K').astype(int)

    # features; note that the 'target' and sentive attribute columns are dropped
    X = (input_data
         .drop(columns=['target', 'fnlwgt'])
         .fillna('Unknown')
         .pipe(pd.get_dummies, drop_first=True))

    y = y.to_frame()
    for col in X.columns:
      X[col] = X[col].astype('float32')

    for col in y.columns:
      y[col] = y[col].astype('float32')

    print(f"features X: {X.shape[0]} samples, {X.shape[1]} attributes")
    print(f"targets y: {y.shape} samples")
    return X, y

class PandasDataSet(TensorDataset):
    def __init__(self, *dataframes):
        tensors = (self._df_to_tensor(df) for df in dataframes)
        super(PandasDataSet, self).__init__(*tensors)

    def _df_to_tensor(self, df):
        if isinstance(df, pd.Series):
            df = df.to_frame('dummy')
        return torch.from_numpy(df.values).float()

In [3]:
# load adult data set
path = 'adult.data'
# path = 'adult.data'
X, y = load_adult_data(path)

n_features = X.shape[1]

# split into train/test set
(X_train, X_test, y_train, y_test) = train_test_split(X, y, test_size=0.2, stratify=y, random_state=7)
print(X_train.head())

features X: 30940 samples, 95 attributes
targets y: (30940, 1) samples
        age  education_num  capital_gain  capital_loss  hours_per_week  \
27719  17.0            7.0           0.0           0.0            12.0   
936    47.0           14.0           0.0           0.0            25.0   
3936   46.0           11.0           0.0           0.0            38.0   
8500   45.0           14.0           0.0        1902.0            50.0   
3882   51.0           10.0           0.0           0.0            40.0   

       workclass_Local-gov  workclass_Never-worked  workclass_Private  \
27719                  0.0                     0.0                1.0   
936                    0.0                     0.0                1.0   
3936                   0.0                     0.0                0.0   
8500                   0.0                     0.0                1.0   
3882                   0.0                     0.0                1.0   

       workclass_Self-emp-inc  workclass_Self

In [4]:
# Set up training & testing data

train_data = PandasDataSet(X_train, y_train)
test_data = PandasDataSet(X_test, y_test)

train_loader = DataLoader(train_data, batch_size=16, shuffle=True, drop_last=True)
# test_loader = DataLoader(test_data, batch_size=16, shuffle=False)
test_loader = DataLoader(test_data, batch_size=17, shuffle=False, drop_last=True)

print('# training samples:', len(train_data))
print('# testing samples:', len(test_data))
print('# batches:', len(train_loader))

print('# training samples:', len(test_data))
print('# batches:', len(test_loader))

# training samples: 24752
# testing samples: 6188
# batches: 1547
# training samples: 6188
# batches: 364


# Define accuracy & Model

In [5]:
def accuracy(model, test_loader):
    correct = 0

    for examples, labels in test_loader:
        output = model.forward(examples)
        batch_correct = torch.sum(torch.abs(output - labels) < 0.5)
        correct += batch_correct

    acc = float(correct)/len(test_data)
    
    return acc

In [6]:
class Classifier(nn.Module):
    def __init__(self, n_features, n_hidden=32, p_dropout=0.2):
        super(Classifier, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(n_features, n_hidden),
            nn.ReLU(),
            nn.Dropout(p_dropout),
            nn.Linear(n_hidden, n_hidden),
            nn.ReLU(),
            nn.Dropout(p_dropout),
            nn.Linear(n_hidden, n_hidden),
            nn.ReLU(),
            nn.Dropout(p_dropout),
            nn.Linear(n_hidden, 1),
        )

    def forward(self, x):
        return torch.sigmoid(self.network(x))

# The calculation for immediate sensitivity

In [7]:
def grad_immediate_sensitivity(model, criterion, inputs, labels):
    inp = Variable(inputs, requires_grad=True)
    
    outputs = model.forward(inp)
    loss = criterion(outputs, labels)
    
    # (1) first-order gradient (wrt parameters)
    first_order_grads = torch.autograd.grad(loss, model.parameters(), retain_graph=True, create_graph=True)
    
    # (2) L2 norm of the gradient from (1)
    grad_l2_norm = torch.norm(torch.cat([x.view(-1) for x in first_order_grads]), p = 2)
    
    # (3) Gradient (wrt inputs) of the L2 norm of the gradient from (2)
    sensitivity_vec = torch.autograd.grad(grad_l2_norm, inp, retain_graph=True)[0]
    
    # (4) L2 norm of (3) - "immediate sensitivity"
    s = [torch.norm(v, p=2).numpy().item() for v in sensitivity_vec]

    loss.backward()
    return s

# Train the model, calculating immediate sensitivities

In [8]:
# reset the model
model = Classifier(n_features=n_features)
model_criterion = nn.BCELoss()
model_optimizer = optim.Adam(model.parameters(),lr=0.001)

# number of epochs and iterations
epochs = 10
iters = epochs * len(train_loader)

# parameters for Renyi differential privacy
alpha = 10
epsilon = 4.0
epsilon_iter = epsilon / iters

for epoch in range(epochs):
    print('Start of epoch %d' % (epoch,))
    all_sensitivities = []
    
    for x_batch_train, y_batch_train in train_loader:
        model_optimizer.zero_grad()
        batch_sensitivities = grad_immediate_sensitivity(model, model_criterion, x_batch_train, y_batch_train)
        
        all_sensitivities.extend(batch_sensitivities)
        batch_sensitivity = np.max(batch_sensitivities)
        
        # this is the scale of the Gaussian noise to be added to the sum of gradients for the batch
        sigma_squared_sum = (batch_sensitivity**2 * alpha) / (2 * epsilon_iter)
        
        # this is the scale of the Gaussian noise to be added to the average of the gradients
        sigma_squared_avg = sigma_squared_sum / len(x_batch_train)
        
        # print('Please add Gaussian noise with sigma^2 =', sigma_squared_avg)
        
        model_optimizer.step()

    print("Max sensitivity for the epoch:", np.max(all_sensitivities))
    print("Mean sensitivity for the epoch:", np.mean(all_sensitivities))
    print("Accuracy:", accuracy(model, test_loader))

    
print('Done training')

Start of epoch 0
Max sensitivity for the epoch: 11.69965934753418
Mean sensitivity for the epoch: 0.14907496710725135
Accuracy: 0.789754363283775
Start of epoch 1
Max sensitivity for the epoch: 20.885934829711914
Mean sensitivity for the epoch: 0.5359701581596688
Accuracy: 0.7918552036199095
Start of epoch 2
Max sensitivity for the epoch: 42.6502571105957
Mean sensitivity for the epoch: 0.7458748826960143
Accuracy: 0.8196509372979961
Start of epoch 3
Max sensitivity for the epoch: 41.214656829833984
Mean sensitivity for the epoch: 0.8558362779601039
Accuracy: 0.8173884938590821
Start of epoch 4
Max sensitivity for the epoch: 79.3251953125
Mean sensitivity for the epoch: 0.974116238308897
Accuracy: 0.8188429217840982
Start of epoch 5
Max sensitivity for the epoch: 95.23429107666016
Mean sensitivity for the epoch: 1.0523426674042957
Accuracy: 0.8152876535229476
Start of epoch 6
Max sensitivity for the epoch: 94.62689208984375
Mean sensitivity for the epoch: 1.1012829068802443
Accuracy: 0