In [1]:
# Mount Google Drive inside the Colab runtime; the data files read by the
# cells below are addressed through paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load Real Data

In [9]:
from pandas import read_csv
from scipy.special import expit
from sklearn.utils import shuffle
import pickle
import numpy as np

def load_dataset_train(full_path = '/content/drive/MyDrive/Privacy/real_data_train.csv'):
  """Load the real training table and split it into features and labels.

  The file is read with ``pickle`` even though the default path ends in
  ``.csv``. The unpickled object may be a pandas DataFrame or any other 2-D
  array-like; it is converted to a numpy array before being split.

  Args:
    full_path: Path of the pickled table. The last column is used as the label.

  Returns:
    Tuple ``(X, y)`` where ``X`` holds every column except the last one and
    ``y`` holds the last column.
  """
  # NOTE(security): pickle.load can execute arbitrary code stored in the file;
  # only point this at files you produced yourself.
  with open(full_path, 'rb') as f:
    data = pickle.load(f)
  # np.asarray yields the same array as DataFrame.values and also accepts a
  # pickled ndarray, which has no .values attribute
  data = np.asarray(data)
  # split into input and output elements
  X, y = data[:, :-1], data[:, -1]
  return X, y

# Load the real training split, then pass every feature through the logistic
# sigmoid (scipy.special.expit) so the inputs lie in (0, 1).
X_train, y_train = load_dataset_train()
X_train = expit(X_train)
# Sanity check: feature/label shapes and the first transformed row.
print(X_train.shape, y_train.shape)
print(X_train[0])


def load_dataset_test(full_path = '/content/drive/MyDrive/Privacy/real_data_test.csv'):
  """Load the real test table and split it into features and labels.

  The file is read with ``pickle`` even though the default path ends in
  ``.csv``. The unpickled object may be a pandas DataFrame or any other 2-D
  array-like; it is converted to a numpy array before being split.

  Args:
    full_path: Path of the pickled table. The last column is used as the label.

  Returns:
    Tuple ``(X, y)`` where ``X`` holds every column except the last one and
    ``y`` holds the last column.
  """
  # NOTE(security): pickle.load can execute arbitrary code stored in the file;
  # only point this at files you produced yourself.
  with open(full_path, 'rb') as f:
    data = pickle.load(f)
  # np.asarray yields the same array as DataFrame.values and also accepts a
  # pickled ndarray, which has no .values attribute
  data = np.asarray(data)
  # split into input and output elements
  X, y = data[:, :-1], data[:, -1]
  return X, y

# Load the real test split and apply the same logistic-sigmoid transform that
# is applied to the training features.
X_test, y_test = load_dataset_test()
X_test = expit(X_test)
# NOTE(review): the two disabled lines below would append the first 200
# training rows to the test set; enabling them mixes training data into the
# evaluation data.
# X_test = np.concatenate([X_test, X_train[:200]])
# y_test = np.concatenate([y_test, y_train[:200]])
print(X_test.shape)

(1000, 4) (1000,)
[0.99411941 0.99982842 0.05258531 0.18114637]
(572, 4)


# Load Synthetic Data

In [15]:
import pickle
import pandas as pd
from sklearn.utils import shuffle

# NOTE(security): pickle.load can execute arbitrary code stored in the file;
# only load files you generated yourself. The files carry a .csv extension
# but are read as pickles.
with open("/content/drive/MyDrive/Privacy/generated data with pate/pate_neg_x_low4000.csv", 'rb') as f:
    data1 = pickle.load(f)
df1 = pd.DataFrame(data1, columns = list(range(data1.shape[1])))

print(df1.shape)
with open("/content/drive/MyDrive/Privacy/generated data with pate/pate_pos_x_low4000.csv", 'rb') as f:
    data2 = pickle.load(f)
df2 = pd.DataFrame(data2, columns = list(range(data2.shape[1])))
print(df2.shape)

# Labels in concat order: every df1 row gets 1, every df2 row gets 0.
# Kept under its own name so the real-data `y_train` defined earlier is not
# overwritten (that would leave X_train and y_train with different lengths).
# NOTE(review): df1 comes from the "pate_neg" file yet is labelled 1, and df2
# ("pate_pos") is labelled 0 — confirm this matches the real data's encoding.
y_pate_ordered = np.array([1] * len(df1) + [0] * len(df2))
print(len(y_pate_ordered))

# Concat negative and positive data, and shuffle rows and labels together.
# A fixed random_state makes the row order reproducible across runs.
result = pd.concat([df1, df2])
X_train_pate, y_train_pate = shuffle(result, y_pate_ordered, random_state=0)

print(X_train_pate.shape)
print(y_train_pate.shape)

(2000, 4)
(2000, 4)
4000
(4000, 4)
(4000,)


# Evaluation Metrics

In [16]:
from keras import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras import backend as K
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
import tensorflow as tf

def recall_m(y_true, y_pred):
    """Recall of one batch: true positives / (actual positives + epsilon).

    Predictions are clipped to [0, 1] and rounded, so the cut-off is 0.5.
    Keras averages function metrics over batches, so the value reported by
    fit/evaluate is a mean of per-batch recalls.
    """
    # 1 where the label is positive and the rounded prediction agrees
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    # 1 for every positive label
    actual_mask = K.round(K.clip(y_true, 0, 1))
    # epsilon keeps the ratio finite when the batch has no positives
    return K.sum(hit_mask) / (K.sum(actual_mask) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision of one batch: true positives / (predicted positives + epsilon).

    Predictions are clipped to [0, 1] and rounded, so the cut-off is 0.5.
    Keras averages function metrics over batches, so the value reported by
    fit/evaluate is a mean of per-batch precisions.
    """
    # 1 where the label is positive and the rounded prediction agrees
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    # 1 for every prediction that rounds to the positive class
    flagged_mask = K.round(K.clip(y_pred, 0, 1))
    # epsilon keeps the ratio finite when nothing is predicted positive
    return K.sum(hit_mask) / (K.sum(flagged_mask) + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 of one batch: harmonic mean of precision_m and recall_m.

    An epsilon in the denominator keeps the result finite when both precision
    and recall are zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio


# Fully-connected binary classifier:
# 4 inputs -> 2 (relu) -> 10 (relu) -> dropout 0.2 -> 4 (relu) -> 1 (sigmoid).
model = Sequential(
    [
        Dense(units=2, activation='relu', input_dim=4),
        Dense(units=10, activation='relu'),
        Dropout(0.2),
        # Disabled extra hidden layer:
        # Dense(units = 10, activation = 'relu'),
        Dense(units=4, activation='relu'),
        Dense(units=1, activation='sigmoid'),
    ]
)

# Evaluate Private Data Generation Using a Neural Network

In [17]:
# Train the classifier on the synthetic (PATE) data, then evaluate it on the
# real test split.
# NOTE(review): `model` is built in an earlier cell, so re-running only this
# cell continues training the already-fitted weights — rebuild the model first
# for a clean run.
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy', f1_m, precision_m, recall_m])
model.fit(X_train_pate, y_train_pate, batch_size = 5, epochs = 10)

# evaluate() returns the loss followed by the metrics in the order they were
# passed to compile().
loss, accuracy, f1_score, precision, recall = model.evaluate(X_test, y_test, verbose=0)
print(loss, accuracy, f1_score, precision, recall)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
1.1753512620925903 0.7150349617004395 0.5264100432395935 0.907561719417572 0.38300126791000366
