In [2]:
# Mount Google Drive at /content/drive so later cells can read and write files
# under that path. `google.colab` is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

'''
Code inspired from:
https://github.com/BorealisAI/private-data-generation/tree/master/models, 
https://github.com/tensorflow/privacy/tree/master/research/pate_2017
'''

Mounted at /content/drive


# Data

## Load Credit Card Data

Do not run this section. It loads another dataset (credit-card data) that we tested earlier; the results were not satisfactory, so we switched to a different dataset.

In [None]:
# Read the credit-card CSV from the mounted Drive into `dataframe`.
# header=None makes pandas treat the first line of the file as data, not as
# column names. NOTE(review): confirm this file really has no header row.
from pandas import read_csv
dataframe = read_csv('/content/drive/MyDrive/Privacy/creditcard.csv', header=None)

In [None]:
from collections import Counter
# summarize the class distribution
# The last column of the frame is taken as the class label.
target = dataframe.values[:, -1]
counter = Counter(target)
# Report, for every class, how many rows it has and its share of the dataset.
num_rows = len(target)
for class_id, class_count in counter.items():
	per = class_count / num_rows * 100
	print('Class=%d, Count=%d, Percentage=%.3f%%' % (class_id, class_count, per))

In [None]:
import numpy as np

# Take features and labels from the credit-card frame loaded in this section:
# every column but the last is a feature, the last column is the class label.
# (Previously this cell read `X` and `y`, which are only assigned later in the
# notebook, so it failed on a fresh top-to-bottom run.)
credit_values = dataframe.values
credit_features, credit_labels = credit_values[:, :-1], credit_values[:, -1]

# Column vector of labels; -1 lets NumPy infer the row count instead of
# hard-coding the size of one particular CSV.
train_labels = credit_labels.reshape((-1, 1))
COND_num_classes = 2 # Number of classes

# One-hot encode: row i gets a 1.0 in the column given by its integer label.
train_labels_vec = np.zeros((len(train_labels), COND_num_classes), dtype='float32')
train_labels_vec[np.arange(len(train_labels)), credit_labels.astype(int)] = 1.0

train_data = credit_features.astype('float32')
print(train_data.shape,train_labels_vec.shape)

## Load Banknote Authentication Dataset

In [5]:
from pandas import read_csv
from scipy.special import expit
from sklearn.utils import shuffle
import pickle


def load_dataset(full_path = '/content/drive/MyDrive/Privacy/real_data_train.csv'):
  """Load a pickled table and split it into features and labels.

  Args:
    full_path: Path to a pickle file (despite the ``.csv`` extension of the
      default) whose payload exposes ``.values``, e.g. a pandas DataFrame.

  Returns:
    ``(X, y)`` where ``X`` holds every column except the last and ``y`` is the
    last column.
  """
  # NOTE: pickle.load can execute arbitrary code; only open files you trust.
  with open(full_path, 'rb') as handle:
    table = pickle.load(handle).values
  # Last column is the target, everything before it is an input feature.
  return table[:, :-1], table[:, -1]

# Load features/labels, pass every feature through the logistic sigmoid
# (scipy's `expit`), then shuffle the rows while keeping X and y aligned.
X, y = load_dataset()
X, y = shuffle(expit(X), y)
print(X.shape,y.shape)

(1000, 4) (1000,)


# Utils

In [None]:
# Copyright 2019 RBC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#

import torch
import math
import numpy as np
import pandas as pd
import torch.nn as nn
from sklearn.metrics import mutual_info_score



class Generator(nn.Module):
    """Two-layer MLP mapping latent vectors to synthetic rows.

    Args:
        latent_size: Width of the latent vector; the hidden layer is twice this.
        output_size: Number of feature columns to produce.
        conditional: If True the network expects one extra input column and
            emits exactly ``output_size`` columns. If False it takes only the
            latent vector and emits ``output_size + 1`` columns.
    """

    def __init__(self, latent_size, output_size, conditional=False):
        super().__init__()
        # Exactly one side of the network grows by one column, depending on mode.
        in_features = latent_size + 1 if conditional else latent_size
        out_features = output_size if conditional else output_size + 1
        hidden_features = 2 * latent_size
        self.main = nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, out_features),
        )

    def forward(self, x):
        """Apply the MLP to ``x``."""
        return self.main(x)


class Discriminator(nn.Module):
    """Two-layer MLP scoring rows of ``input_size + 1`` columns.

    Args:
        input_size: Number of feature columns; the network always expects one
            additional column on top of these.
        wasserstein: If False a final Sigmoid is appended to the network;
            if True the last linear layer's output is returned as-is.
    """

    def __init__(self, input_size, wasserstein=False):
        super().__init__()
        hidden_features = int(input_size / 2)
        layers = [
            nn.Linear(input_size + 1, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, 1),
        ]
        if not wasserstein:
            layers.append(nn.Sigmoid())
        # Positional construction names the children "0", "1", "2" (and "3").
        self.main = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to ``x``."""
        return self.main(x)


def weights_init(m):
    """Initialise an ``nn.Linear`` layer in place; other modules are left alone.

    Intended for ``module.apply(weights_init)``. Weights get Xavier-uniform
    initialisation and the bias, when the layer has one, is set to 0.01.

    Args:
        m: Any module; only exact ``nn.Linear`` instances are modified.
    """
    if type(m) is nn.Linear:
        nn.init.xavier_uniform_(m.weight)
        # Layers built with bias=False have m.bias set to None.
        if m.bias is not None:
            nn.init.constant_(m.bias, 0.01)


def pate(data, netTD, lap_scale):
    """Aggregate teacher votes on ``data`` and add Laplace noise to the counts.

    Each teacher votes 1 for a row when its output exceeds 0.5. The per-row
    vote count is perturbed with Laplace noise of scale ``1 / lap_scale`` and
    the noisy label is 1 when the noisy count exceeds half the teachers.

    Args:
        data: 2-D tensor of rows to label; results are placed on its device.
        netTD: Sequence of teacher discriminators exposing ``forward``.
        lap_scale: Inverse scale of the Laplace noise.

    Returns:
        ``(noisy_labels, clean_votes)``, both float64 tensors of shape
        ``(rows, 1)``: the noisy majority label and the un-noised vote count.
    """
    device = data.device
    num_rows = data.size()[0]

    # Votes are only thresholded, so no autograd graph is needed for teachers.
    with torch.no_grad():
        results = torch.zeros(len(netTD), num_rows, dtype=torch.int64, device=device)
        for i, teacher in enumerate(netTD):
            output = teacher.forward(data)
            # reshape(-1) (rather than squeeze) keeps a batch of one row 1-D.
            results[i] = (output > 0.5).reshape(-1).to(torch.int64)

    clean_votes = torch.sum(results, dim=0).unsqueeze(1).to(torch.float64)
    noise = torch.from_numpy(
        np.random.laplace(loc=0, scale=1/lap_scale, size=clean_votes.size())).to(device)
    noisy_results = clean_votes + noise
    noisy_labels = (noisy_results > len(netTD)/2).to(torch.float64)

    return noisy_labels, clean_votes


def moments_acc(num_teachers, clean_votes, lap_scale, l_list):
    """Per-moment privacy-cost increment for one batch of noisy-vote queries.

    For every moment order ``l`` and every query the increment is the smallest of
    the data-independent bound ``2 * lap_scale**2 * l * (l + 1)``, the
    worst-case bound ``2 * lap_scale * l`` and, where it applies, the
    data-dependent bound ``log((1-q) * ((1-q)/(1-e^{2*lap_scale}*q))**l
    + q * e^{2*lap_scale*l})``. Increments are summed over queries.

    This follows the PATE moments accountant (``logmgf_exact`` in the
    tensorflow/privacy pate_2017 analysis). The previous version compared the
    un-logged quantity against the bound and had no validity check on ``q``.

    Args:
        num_teachers: Number of teachers that voted.
        clean_votes: Tensor of un-noised positive-vote counts, one per query.
        lap_scale: Inverse scale of the Laplace noise used for the votes.
        l_list: Iterable of moment orders (tensor or sequence of numbers).

    Returns:
        1-D float64 tensor on ``clean_votes``' device, one entry per order.
    """
    priv_eps = 2.0 * lap_scale
    # |2*votes - teachers| is the gap between the two classes' vote counts.
    gap = lap_scale * torch.abs(2 * clean_votes - num_teachers)
    q = (2 + gap) / (4 * torch.exp(gap))

    # The data-dependent bound only holds for q < 1 / (e^{eps} + 1); this also
    # guarantees the denominator below is strictly positive.
    usable = q < 1.0 / (math.exp(priv_eps) + 1.0)
    denom = 1 - math.exp(priv_eps) * q
    safe_denom = torch.where(usable, denom, torch.ones_like(denom))

    update = []
    for l in l_list:
        order = float(l)
        data_independent = 2 * lap_scale * lap_scale * order * (order + 1)
        worst_case = priv_eps * order
        # Computed as a tensor so a huge exponent yields inf, not OverflowError.
        growth = torch.exp(torch.as_tensor(worst_case, dtype=q.dtype, device=q.device))
        t = (1 - q) * torch.pow((1 - q) / safe_denom, order) + q * growth
        log_t = torch.where(usable, torch.log(t), torch.full_like(t, worst_case))
        bound = torch.clamp(log_t, max=min(data_independent, worst_case))
        update.append(bound.sum())

    if not update:
        return torch.zeros(0, dtype=torch.float64, device=clean_votes.device)
    return torch.stack(update).to(torch.float64)


def mutual_information(labels_x: pd.Series, labels_y: pd.DataFrame):
    """Mutual information between one label series and one or more columns.

    Args:
        labels_x: Series of labels.
        labels_y: DataFrame of labels. A single column is used directly;
            several columns are merged row-wise into one space-separated
            string label per row.

    Returns:
        The value of ``mutual_info_score(labels_x, labels_y)``.
    """
    if labels_y.shape[1] == 1:
        labels_y = labels_y.iloc[:, 0]
    else:
        # Series.get_values() was removed in pandas 1.0; `.values` is available
        # in every version. str() lets non-string cells be joined as well.
        labels_y = labels_y.apply(lambda row: ' '.join(map(str, row.values)), axis=1)

    return mutual_info_score(labels_x, labels_y)


def normalize_given_distribution(frequencies):
    """Turn raw frequencies into a probability vector.

    Negative entries are treated as zero. If nothing positive remains the
    result is uniform; if the total is infinite the mass is split evenly
    between the infinite entries; otherwise entries are divided by their sum.

    Args:
        frequencies: Array-like of numbers.

    Returns:
        Float array of the same shape as the input.
    """
    weights = np.clip(np.array(frequencies, dtype=float), 0, None)
    total = weights.sum()

    # `not total > 0` (rather than `total <= 0`) also routes NaN totals here.
    if not total > 0:
        return np.full_like(weights, 1 / weights.size)
    if np.isinf(total):
        # Recurse on the 0/1 mask of infinite entries.
        return normalize_given_distribution(np.isinf(weights))
    return weights / total


# Model

## PATE GAN

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data_utils
import numpy as np
import math


class PATE_GAN:
    """GAN whose student discriminator learns only from noisy teacher votes.

    Each teacher discriminator is trained on its own contiguous slice of the
    real data. The student discriminator never sees real rows: it is trained on
    generated rows labelled by `pate` (noisy majority vote of the teachers),
    and the generator is trained against the student.

    Args:
        input_dim: Number of feature columns in the training data.
        z_dim: Width of the latent vector fed to the generator.
        num_teachers: Number of teacher discriminators / data partitions.
        target_epsilon: Stored on the instance; `train` does not consult it.
        target_delta: Delta used when converting moments to epsilon.
        conditional: If True, a class index sampled from ``class_ratios`` is
            appended to the generator and discriminator inputs.
        device: Torch device (or device string) for all networks and tensors.
            Defaults to CUDA when available, otherwise CPU.
    """

    # `train` runs outer steps 0..MAX_STEPS inclusive. The epsilon-based
    # stopping rule (`while epsilon < self.target_epsilon`) is not active.
    MAX_STEPS = 5000

    def __init__(self, input_dim, z_dim, num_teachers, target_epsilon, target_delta, conditional=True,
                 device=None):
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = torch.device(device)

        self.generator = Generator(z_dim, input_dim, conditional).to(self.device).double()
        self.student_disc = Discriminator(input_dim, wasserstein=False).to(self.device).double()
        self.teacher_disc = [Discriminator(input_dim, wasserstein=False).to(self.device).double()
                             for _ in range(num_teachers)]
        self.generator.apply(weights_init)
        self.student_disc.apply(weights_init)
        self.z_dim = z_dim
        self.num_teachers = num_teachers

        for teacher in self.teacher_disc:
            teacher.apply(weights_init)

        self.target_epsilon = target_epsilon
        self.target_delta = target_delta
        self.conditional = conditional

    @staticmethod
    def _class_ratio_tensor(class_ratios):
        """Convert class sampling weights to a float64 tensor, rejecting None."""
        if class_ratios is None:
            raise ValueError("class_ratios must be provided when conditional=True")
        return torch.as_tensor(class_ratios, dtype=torch.float64)

    def _sample_latent(self, num_rows):
        """Draw latent vectors from U(0, 1); used for both training and generation."""
        return torch.rand(num_rows, self.z_dim, dtype=torch.float64, device=self.device)

    def _generate_batch(self, num_rows, class_ratios):
        """Run the generator on fresh noise.

        Returns ``(fake, category)``; ``category`` is a ``(num_rows, 1)`` column
        of sampled class indices when conditional, otherwise None.
        """
        z = self._sample_latent(num_rows)
        if not self.conditional:
            return self.generator(z), None
        category = torch.multinomial(class_ratios, num_rows, replacement=True
                                     ).unsqueeze(1).to(self.device).double()
        return self.generator(torch.cat([z, category], dim=1)), category

    @staticmethod
    def _with_category(fake, category):
        """Append the category column to generated rows when there is one."""
        return fake if category is None else torch.cat([fake, category], dim=1)

    def _targets(self, num_rows, value):
        """Constant float64 target vector for the BCE loss."""
        return torch.full((num_rows,), value, dtype=torch.float64, device=self.device)

    def train(self, x_train, y_train, hyperparams):
        """Train teachers, student and generator for ``MAX_STEPS + 1`` steps.

        Args:
            x_train: Array-like of shape ``(rows, input_dim)``.
            y_train: Array-like of shape ``(rows,)``; appended to each real row
                as the extra discriminator input column.
            hyperparams: Object with attributes ``batch_size``,
                ``num_teacher_iters``, ``num_student_iters``, ``num_moments``,
                ``lap_scale``, ``lr`` and, when conditional, ``class_ratios``.

        Raises:
            ValueError: If there are fewer rows than teachers, or the model is
                conditional and no ``class_ratios`` are supplied.
        """
        batch_size = hyperparams.batch_size
        num_teacher_iters = hyperparams.num_teacher_iters
        num_student_iters = hyperparams.num_student_iters
        num_moments = hyperparams.num_moments
        lap_scale = hyperparams.lap_scale
        class_ratios = None
        if self.conditional:
            class_ratios = self._class_ratio_tensor(getattr(hyperparams, 'class_ratios', None))
        real_label = 1.0
        fake_label = 0.0

        alpha = torch.zeros(num_moments, dtype=torch.float64, device=self.device)
        l_list = torch.arange(1, num_moments + 1, dtype=torch.float64, device=self.device)
        criterion = nn.BCELoss()

        optimizer_g = optim.Adam(self.generator.parameters(), lr=hyperparams.lr)
        optimizer_sd = optim.Adam(self.student_disc.parameters(), lr=hyperparams.lr)
        optimizer_td = [optim.Adam(teacher.parameters(), lr=hyperparams.lr)
                        for teacher in self.teacher_disc]

        tensor_data = data_utils.TensorDataset(
            torch.as_tensor(np.asarray(x_train, dtype=np.float64), device=self.device),
            torch.as_tensor(np.asarray(y_train, dtype=np.float64), device=self.device))
        num_rows = len(tensor_data)
        if num_rows < self.num_teachers:
            raise ValueError("need at least one training row per teacher (%d rows, %d teachers)"
                             % (num_rows, self.num_teachers))

        # Disjoint, contiguous partitions; the last one ends at the final row.
        train_loader = []
        for teacher_id in range(self.num_teachers):
            start_id = teacher_id * num_rows // self.num_teachers
            end_id = (teacher_id + 1) * num_rows // self.num_teachers
            train_loader.append(data_utils.DataLoader(
                data_utils.Subset(tensor_data, range(start_id, end_id)),
                batch_size=batch_size, shuffle=True))

        epsilon = 0
        # Placeholder so the progress line still prints when num_student_iters is 0.
        err_sd = torch.tensor(float('nan'))

        for steps in range(self.MAX_STEPS + 1):
            for _ in range(num_teacher_iters):
                for i in range(self.num_teachers):
                    # One shuffled mini-batch from this teacher's partition.
                    inputs, categories = next(iter(train_loader[i]))
                    num_real = inputs.size(0)

                    # train teachers with real
                    optimizer_td[i].zero_grad()
                    real_rows = torch.cat([inputs, categories.unsqueeze(1)], dim=1)
                    output = self.teacher_disc[i](real_rows).reshape(-1)
                    err_d_real = criterion(output, self._targets(num_real, real_label))
                    err_d_real.backward()

                    # train teachers with fake; the fake batch has as many rows
                    # as the real batch so targets always match a short batch.
                    with torch.no_grad():
                        fake, category = self._generate_batch(num_real, class_ratios)
                    output = self.teacher_disc[i](self._with_category(fake, category)).reshape(-1)
                    err_d_fake = criterion(output, self._targets(num_real, fake_label))
                    err_d_fake.backward()
                    optimizer_td[i].step()

            # train the student discriminator
            for _ in range(num_student_iters):
                with torch.no_grad():
                    fake, category = self._generate_batch(batch_size, class_ratios)
                student_rows = self._with_category(fake, category)
                predictions, clean_votes = pate(student_rows, self.teacher_disc, lap_scale)
                outputs = self.student_disc(student_rows)

                # update the moments
                alpha = alpha + moments_acc(self.num_teachers, clean_votes, lap_scale, l_list)

                # update student
                err_sd = criterion(outputs, predictions)

                optimizer_sd.zero_grad()
                err_sd.backward()
                optimizer_sd.step()

            # train the generator
            optimizer_g.zero_grad()
            fake, category = self._generate_batch(batch_size, class_ratios)
            output = self.student_disc(self._with_category(fake, category)).reshape(-1)
            err_g = criterion(output, self._targets(batch_size, real_label))
            err_g.backward()
            optimizer_g.step()
            # if steps %1000 == 0:
            #   torch.save(self.generator.state_dict(), "/content/drive/MyDrive/Privacy/model_step%d.h5"%steps)
            #   torch.save(self.generator, "/content/drive/MyDrive/Privacy/entire_model_step%d.h5"%steps)

            # Calculate the privacy cost
            epsilon = torch.min((alpha - math.log(self.target_delta)) / l_list)

            if steps % 100 == 0:
                print("Step : ", steps, "Student Loss: ", err_sd.item(), "Generator Loss: ", err_g.item(), "Epsilon : ",
                      epsilon.item())

    def generate(self, num_rows, class_ratios, batch_size=1000):
        """Sample ``num_rows`` synthetic rows from the trained generator.

        Args:
            num_rows: Number of rows to produce; must be positive.
            class_ratios: Class sampling weights; only used when conditional.
            batch_size: Maximum number of rows generated per forward pass.

        Returns:
            NumPy array with ``num_rows`` rows. When conditional, the sampled
            class index is appended as the last column; otherwise the
            generator output is returned unchanged.

        Raises:
            ValueError: If ``num_rows`` is not positive, or the model is
                conditional and ``class_ratios`` is None.
        """
        if num_rows <= 0:
            raise ValueError("num_rows must be positive")
        ratios = self._class_ratio_tensor(class_ratios) if self.conditional else None

        synthetic_data = []
        with torch.no_grad():
            # Full batches first, then one short batch for any remainder.
            for start in range(0, num_rows, batch_size):
                rows = min(batch_size, num_rows - start)
                fake, category = self._generate_batch(rows, ratios)
                synthetic_data.append(self._with_category(fake, category).cpu().numpy())

        return np.concatenate(synthetic_data)


# Evaluate

In [None]:
import argparse
import numpy as np
import pandas as pd
import collections
import os
import pickle

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from scipy.special import expit

class_ratios = None
target_variable = 4
positive = True

# Buckets for the rows of each class; only the bucket selected by `positive`
# is filled, the other pair stays empty.
X_train_pos, y_train_pos = [], []
X_train_neg, y_train_neg = [], []

if positive:
  # keep only the positive rows (label 1)
  wanted_label, X_bucket, y_bucket = 1, X_train_pos, y_train_pos
else:
  # keep only the negative rows (label 0)
  wanted_label, X_bucket, y_bucket = 0, X_train_neg, y_train_neg

for row_features, row_label in zip(X, y):
  if row_label == wanted_label:
    y_bucket.append(row_label)
    X_bucket.append(row_features)

X_train = np.array(X_bucket)
y_train = np.array(y_bucket)
print(X_train.shape)

input_dim = X_train.shape[1]
print("Input dim:", input_dim)
# Latent width is a quarter of the input width, plus one when the input width
# is a multiple of four. It is floored at 1 because the formula otherwise gives
# 0 for input_dim in 1..3, i.e. a generator with zero-width latent/hidden layers.
z_dim = max(1, input_dim // 4 + (1 if input_dim % 4 == 0 else 0))
print("Z dim:", z_dim)

# TODO check True
conditional = False
# The typename matches the bound name (it was misspelled 'Hyperarams'), so the
# repr is accurate and instances can be pickled. Every field defaults to None.
Hyperparams = collections.namedtuple('Hyperparams','batch_size num_teacher_iters num_student_iters num_moments lap_scale class_ratios lr')
Hyperparams.__new__.__defaults__ = (None,) * len(Hyperparams._fields)

# Run configuration.
num_teachers = 10
target_epsilon = 8
batch_size = 16
student_iters = 5
teacher_iters = 5
num_moments = 100
lap_scale = 0.0001
target_delta = 1e-5

# Build the model and train it on the class-filtered rows selected above.
model = PATE_GAN(input_dim, z_dim, num_teachers, target_epsilon, target_delta, conditional)
model.train(X_train, y_train, Hyperparams(batch_size=batch_size, num_teacher_iters=teacher_iters,
                                          num_student_iters=student_iters, num_moments=num_moments,
                                          lap_scale=lap_scale, class_ratios=class_ratios, lr=1e-4))

## Generate New Dataset

In [None]:
# Sample synthetic rows; the last generated column is split off as y_syn.
syn_data = model.generate(4000, class_ratios)
X_syn, y_syn = syn_data[:, :-1], syn_data[:, -1]
print(X_syn.shape)
print(y_syn.shape)

# Pickle the results (the files keep their historical .csv extension).
# `with` closes each file. The negative-class labels now go to their own
# "_y_" file; previously they were written to the "_x_" path and overwrote
# the features that had just been saved there.
output_dir = "/content/drive/MyDrive/Privacy/generated data with pate"
class_tag = "pos" if positive else "neg"

with open("%s/pate_%s_x_low4000.csv" % (output_dir, class_tag), 'wb') as output:
  pickle.dump(X_syn, output)

# NOTE(review): this stores y_train (the real labels of the training subset),
# not y_syn, so its length need not match X_syn. Confirm this is intended.
with open("%s/pate_%s_y_low4000.csv" % (output_dir, class_tag), 'wb') as output:
  pickle.dump(y_train, output)

(1979, 30)
(1979,)


## Load Generated Dataset

In [None]:
import pickle
# Both files are read with pickle despite their .csv extension.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open('/content/drive/MyDrive/Privacy/synthetic_data_x.csv', 'rb') as f:
    data = pickle.load(f)
print(data)

# `data` is rebound here, so after this cell it holds the contents of the
# second file only.
with open('/content/drive/MyDrive/Privacy/synthetic_data_y.csv', 'rb') as f:
    data = pickle.load(f)
# print(data)

[[-0.06871043 -0.11243304 -0.0139605  ... -0.10280559 -0.29044451
   0.21550894]
 [-0.12334208 -1.51443969  1.53745991 ... -2.57993242 -3.428202
   2.38634704]
 [-0.49547485 -0.43342706  0.82718787 ... -0.8907703  -1.69688236
   1.16405087]
 ...
 [-0.38557645 -0.24448216  0.75410519 ... -0.93036286 -2.57071541
   1.42400039]
 [-0.37192661 -0.89022344  1.40215878 ... -1.26005326 -2.50563993
   0.43152854]
 [-0.17935156 -1.59148958  1.48924768 ... -2.73311052 -3.34220363
   1.55835475]]
[-1.13458813e-01 -3.45445870e-01 -4.83912181e-01  3.63954521e-02
 -2.88157157e-01 -5.32594697e-01 -1.18384900e-01  9.97940486e-02
 -2.39628383e-01 -2.16450680e+00  2.34119683e-01 -3.84500547e-01
 -1.38524935e+00  7.19098340e-02 -1.13392250e+00 -3.35737769e-01
 -1.19312353e+00 -1.43163906e+00  4.89505476e-01 -1.74986167e-02
 -2.08120607e-01  2.78912805e-01  2.25208036e-03 -9.06280842e-01
  1.91910146e-01 -1.52777441e-03  1.34045680e-01 -3.06395072e+00
 -2.85559853e-01 -1.17482253e+00 -1.61296529e+00 -8.577