In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import keras
from keras.datasets import cifar100
from keras import applications
from __future__ import print_function
from keras import optimizers
from keras.models import Sequential,Input,Model
from keras.models import load_model #save and load models
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D
from sklearn.preprocessing import LabelEncoder
import keras.backend as k

In [3]:
df = pd.read_csv('uber_census_multiclass.csv', encoding = "ISO-8859-1", engine='python')
df = df[['hour', 'lat', 'long', 'base', 'MedianIncomeByPlaceofBirth', 'MedianIncome', 'AvgPopulation', 'UserGroup']]
data = df
print(data.shape)
print(data.dtypes)
df1 = data

(9752, 8)
hour                            int64
lat                           float64
long                          float64
base                           object
MedianIncomeByPlaceofBirth    float64
MedianIncome                  float64
AvgPopulation                 float64
UserGroup                       int64
dtype: object


In [4]:
df1.head()

Unnamed: 0,hour,lat,long,base,MedianIncomeByPlaceofBirth,MedianIncome,AvgPopulation,UserGroup
0,59,40.7476,-74.002,B02617,49364.0,75862.0,10012.0,3
1,28,40.7385,-73.9733,B02617,51174.0,105170.0,4417.0,4
2,27,40.7045,-73.7286,B02617,26024.0,68854.0,3654.0,3
3,20,40.6323,-73.7,B02617,78274.0,211250.0,2732.0,4
4,0,40.9347,-73.9025,B02617,21628.0,27957.0,4379.0,1


In [5]:
from keras.utils import np_utils
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(df1['UserGroup'])
encoded_Y = encoder.transform(df1['UserGroup'])
print(encoded_Y)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_Y)
dummy_y.shape
print(dummy_y)
temp = pd.get_dummies(df1['UserGroup'])
dummy_y = temp.values

[3 4 3 ... 4 3 3]
[[0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0.]
 ...
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0.]]


In [6]:
names_cloud = data.columns.tolist()
print(names_cloud)
X = np.array(data[names_cloud])
print(X.shape)

['hour', 'lat', 'long', 'base', 'MedianIncomeByPlaceofBirth', 'MedianIncome', 'AvgPopulation', 'UserGroup']
(9752, 8)


In [7]:
# fig = plt.figure(figsize=(20,20))
# cols = 5
# rows = (float(data.shape[1]) / cols)
# for i, column in enumerate(data.columns):
#     a = fig.add_subplot(rows, cols, i + 1)
#     a.set_title(column)
#     if data.dtypes[column] == np.object:
#         data[column].value_counts().plot(kind="bar", axes=a)
#     else:
#         data[column].hist(axes=a)
#         plt.xticks(rotation="vertical")
# plt.subplots_adjust(hspace=0.7, wspace=0.2)

In [8]:
y = (data['UserGroup']).values
print(pd.value_counts(pd.Series(y)))
data.drop('UserGroup',axis=1, inplace =True,)
y = dummy_y

4    3428
2    2284
3    2200
1    1612
0     228
dtype: int64


In [9]:
categorical_features = data.select_dtypes(include=['object']).columns
print(categorical_features)
ohc_category = ['base']
df_ohc = pd.get_dummies(data, columns = ohc_category)
print(df_ohc.shape)
df_ohc.head()

Index(['base'], dtype='object')
(9752, 10)


Unnamed: 0,hour,lat,long,MedianIncomeByPlaceofBirth,MedianIncome,AvgPopulation,base_B02512,base_B02598,base_B02617,base_B02682
0,59,40.7476,-74.002,49364.0,75862.0,10012.0,0,0,1,0
1,28,40.7385,-73.9733,51174.0,105170.0,4417.0,0,0,1,0
2,27,40.7045,-73.7286,26024.0,68854.0,3654.0,0,0,1,0
3,20,40.6323,-73.7,78274.0,211250.0,2732.0,0,0,1,0
4,0,40.9347,-73.9025,21628.0,27957.0,4379.0,0,0,1,0


In [10]:
names_x = df_ohc.columns.tolist()
print("Target Variable: UserGroup")
print("Predictors: "+str(names_x))
x = np.array(df_ohc[names_x])
print("Number of data samples : {0:d}".format(x.shape[0]))
print("Number of Predictor Features : {0:d}".format(x.shape[1]))
#df_ohc["Age"] = df_ohc["Age"].astype(str).astype(float)
# from google.colab import files
# df_ohc.to_csv('df.csv')
# files.download('df.csv')
df_ohc[names_x]
print(x)

Target Variable: UserGroup
Predictors: ['hour', 'lat', 'long', 'MedianIncomeByPlaceofBirth', 'MedianIncome', 'AvgPopulation', 'base_B02512', 'base_B02598', 'base_B02617', 'base_B02682']
Number of data samples : 9752
Number of Predictor Features : 10
[[ 59.      40.7476 -74.002  ...   0.       1.       0.    ]
 [ 28.      40.7385 -73.9733 ...   0.       1.       0.    ]
 [ 27.      40.7045 -73.7286 ...   0.       1.       0.    ]
 ...
 [ 42.      40.6607 -73.9894 ...   1.       0.       0.    ]
 [ 32.      40.7729 -73.9213 ...   0.       1.       0.    ]
 [ 48.      40.6197 -73.9664 ...   0.       1.       0.    ]]


In [11]:
df_ohc.dtypes

hour                            int64
lat                           float64
long                          float64
MedianIncomeByPlaceofBirth    float64
MedianIncome                  float64
AvgPopulation                 float64
base_B02512                     uint8
base_B02598                     uint8
base_B02617                     uint8
base_B02682                     uint8
dtype: object

In [12]:
x = x.astype('float32')
print(x)
x = x/255
print(x)
batch_size = 64 #upto us
epochs = 200
lrate = 0.001
decay = 1e-7 
data_size = 2500
ns = 10 #number of shadow models for one data_size
nh = 8 #number of hidden layers
nout = 1
seed = 9
np.random.seed(seed)
sh = np.arange(x.shape[0])
print(sh)
np.random.shuffle(sh)
target_rep = np.zeros((1,x.shape[0]))
print(target_rep)
target_rep[0,:] = sh
print(sh)

[[ 59.      40.7476 -74.002  ...   0.       1.       0.    ]
 [ 28.      40.7385 -73.9733 ...   0.       1.       0.    ]
 [ 27.      40.7045 -73.7286 ...   0.       1.       0.    ]
 ...
 [ 42.      40.6607 -73.9894 ...   1.       0.       0.    ]
 [ 32.      40.7729 -73.9213 ...   0.       1.       0.    ]
 [ 48.      40.6197 -73.9664 ...   0.       1.       0.    ]]
[[ 0.23137255  0.15979451 -0.29020393 ...  0.          0.00392157
   0.        ]
 [ 0.10980392  0.15975882 -0.29009137 ...  0.          0.00392157
   0.        ]
 [ 0.10588235  0.15962549 -0.28913176 ...  0.          0.00392157
   0.        ]
 ...
 [ 0.16470589  0.15945373 -0.29015452 ...  0.00392157  0.
   0.        ]
 [ 0.1254902   0.15989372 -0.28988746 ...  0.          0.00392157
   0.        ]
 [ 0.1882353   0.15929295 -0.2900643  ...  0.          0.00392157
   0.        ]]
[   0    1    2 ... 9749 9750 9751]
[[0. 0. 0. ... 0. 0. 0.]]
[   8 5849 3725 ... 6782 4444 8574]


In [13]:
from tensorflow.keras.optimizers import Adam
k.clear_session()
xtr_target = x[sh[:data_size]]
ytr_target = y[sh[:data_size]]
xts_target = x[sh[data_size:data_size*2]]
yts_target = y[sh[data_size:2*data_size]]
print(xts_target.shape, yts_target)
shadow_rep = np.zeros((20,x.shape[0]-2*data_size))
sh1 = sh[2*data_size:]
xtr_att = np.zeros((2*data_size*ns,1))
ytr_att = np.zeros((2*data_size*ns,1))
xtr_att_truelabels = np.zeros((2*data_size*ns,))
model_target = Sequential()
model_target.add(Dense(nh, input_shape =(x.shape[1],), activation='relu', name = 'hidden'))
model_target.add(Dense(5, activation='softmax', name = 'output'))
opt = Adam(lr=lrate, decay=decay) 
model_target.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])
print(model_target.summary())
hist_target = model_target.fit(xtr_target, ytr_target,
                  batch_size = batch_size,
                  epochs = epochs,
                  validation_data=(xts_target, yts_target), shuffle=True, verbose=0)


(2500, 10) [[0 1 0 0 0]
 [0 0 0 1 0]
 [0 0 0 1 0]
 ...
 [0 0 0 0 1]
 [0 0 0 0 1]
 [0 1 0 0 0]]
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 hidden (Dense)              (None, 8)                 88        
                                                                 
 output (Dense)              (None, 5)                 45        
                                                                 
Total params: 133
Trainable params: 133
Non-trainable params: 0
_________________________________________________________________
None


  super(Adam, self).__init__(name, **kwargs)


In [14]:
print(xtr_target.shape)

(2500, 10)


In [15]:
print('\n\nFor target model with training datasize = %d'%data_size)
print('Training accuracy = ', hist_target.history['accuracy'][-1])
print('Validation accuracy = ', hist_target.history['val_accuracy'][-1])
model_target_name = 'UBER_CENSUS_target_'+str(data_size)+'.h5'
model_target.save(model_target_name)
ytemp_tr_target = model_target.predict(xtr_target)
ytemp_ts_target = model_target.predict(xts_target)
xts_att = np.vstack((ytemp_tr_target,ytemp_ts_target))
yts_att = np.zeros((2*data_size,1))
yts_att[data_size:2*data_size] = 1  
xts_att_truelabels = np.vstack((ytr_target,yts_target))
xts_att_dict = {'xts_att':xts_att,'yts_att':yts_att,'xts_att_truelabels':xts_att_truelabels}
fname = './att_test_data_'+str(data_size)
np.save(fname,xts_att_dict)
datafile = './UBER_CENSUS_target_'+str(data_size)
np.save(datafile,target_rep)



For target model with training datasize = 2500
Training accuracy =  0.8460000157356262
Validation accuracy =  0.8335999846458435


**IBM diffprivlib**

Only GaussianNB is used, and we implored the *IBM diffprivlib* library to implement privacy mechanisms for our multi-class classifier model trained on 9 features with five classes to predict from. 

In [16]:
!pip install diffprivlib

Collecting diffprivlib
  Downloading diffprivlib-0.5.0.tar.gz (87 kB)
[?25l[K     |███▊                            | 10 kB 25.6 MB/s eta 0:00:01[K     |███████▌                        | 20 kB 32.3 MB/s eta 0:00:01[K     |███████████▎                    | 30 kB 35.6 MB/s eta 0:00:01[K     |███████████████                 | 40 kB 33.3 MB/s eta 0:00:01[K     |██████████████████▊             | 51 kB 23.9 MB/s eta 0:00:01[K     |██████████████████████▌         | 61 kB 25.4 MB/s eta 0:00:01[K     |██████████████████████████▎     | 71 kB 21.0 MB/s eta 0:00:01[K     |██████████████████████████████  | 81 kB 22.2 MB/s eta 0:00:01[K     |████████████████████████████████| 87 kB 5.1 MB/s 
Collecting scipy>=1.5.0
  Downloading scipy-1.7.3-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (38.1 MB)
[K     |████████████████████████████████| 38.1 MB 1.2 MB/s 
Building wheels for collected packages: diffprivlib
  Building wheel for diffprivlib (setup.py) ... [?25l[?25hdone


In [17]:
from diffprivlib.models import GaussianNB

epsilons = np.logspace(-2, 2, 50)
bounds = ([4.3, 2.0, 1.1, 0.1], [7.9, 4.4, 6.9, 2.5])
accuracy = list()

ImportError: ignored

In [None]:
ytr = []
yts = []
for i in range(len(ytr_target)):
  index = np.nonzero(ytr_target[i])
  ytr.append(index[0][0]+1)

for i in range(len(yts_target)):
  index = np.nonzero(yts_target[i])
  yts.append(index[0][0]+1)

ytr_target_priv = np.asarray(ytr)
yts_target_priv = np.asarray(yts)
print(ytr_target_priv.shape)

**Training Differentially Private Model**

In [None]:
#Training & Calculating for Different Epsilons

for epsilon in epsilons:
  diffpriv_model = GaussianNB(epsilon=epsilon)
  diffpriv_model.fit(xtr_target, ytr_target_priv)
  #accuracy on test data for the model and appending them in a list
  accuracy.append(diffpriv_model.score(xts_target, yts_target_priv))

print(accuracy)

 

In [None]:
#Plotting the Accuracy vs epsilons for our multi-class classifier
plt.semilogx(epsilons, accuracy)
plt.title("Differentially private Naive Bayes accuracy")
plt.xlabel("epsilon")
plt.ylabel("Accuracy")
plt.show()

# Shadow models

In [None]:
for i in np.arange(ns):
    np.random.shuffle(sh1)
    shadow_rep[i,:] = sh1
    xtr_shadow = x[sh1[:data_size]]
    ytr_shadow = y[sh1[:data_size]]
    xts_shadow = x[sh1[data_size:2*data_size]]
    yts_shadow = y[sh1[data_size:2*data_size]]
    model_shadow = Sequential()
    model_shadow.add(Dense(nh, input_shape =(x.shape[1],), activation='relu', name = 'hidden'))
    model_shadow.add(Dense(5, activation='softmax', name = 'output'))
    opt = Adam(learning_rate=lrate, decay=decay) 
    model_shadow.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])
    if i == 0:
        print("Shadow Model Summary")
        print(model_shadow.summary())
    hist_shadow = model_shadow.fit(xtr_shadow, ytr_shadow,
                  batch_size = batch_size,
                  epochs = epochs,
                  validation_data=(xts_shadow, yts_shadow), shuffle=True, verbose=0)
    print("Shadow model no: %d"%i)
    print('\n\nFor shadow model with training datasize = %d'%data_size)
    print('Training accuracy = ', hist_shadow.history['accuracy'][-1])
    print('Validation accuracy = ', hist_shadow.history['val_accuracy'][-1])
    ytemp11 = model_shadow.predict(xtr_shadow)
    t1 = [[i] for i in np.argmax(ytemp11, axis=1)]
    
    ytemp22 = model_shadow.predict(xts_shadow)
    t2 = [[i] for i in np.argmax(ytemp22, axis=1)]


    model_shadow_name = 'UBER_CENSUS_shadow_'+str(data_size)+'_'+str(i)+'.h5'
    print(model_shadow_name)
    model_shadow.save(model_shadow_name)    
    
    print('****')
    #print(np.vstack((ytemp11,ytemp22)))
    # original_1 = np.argmax(ytemp11, axis=1) 
    # original_2 = np.argmax(ytemp22, axis=1) 
    # prediction_ = np.argmax(ytemp11, axis = 1)
    # print(prediction_)
    # print(ytemp11)
    # print(original_2)
    xtr_att[i*2*data_size:(i+1)*2*data_size] = np.vstack((t1, t2))
    ytr_att[((i*2)+1)*data_size:(i+1)*2*data_size] = 1

    #t3 = [[i] for i in np.argmax(ytr_shadow, axis=1)]
    #t4 = [[i] for i in np.argmax(yts_shadow, axis=1)]

    #xtr_att_truelabels[i*2*data_size:(i+1)*2*data_size] = np.hstack((t3,t4))
datafile = './UBER_CENSUS_shadow_'+str(data_size)
np.save(datafile,shadow_rep)
#xtr_att_dict = {'xtr_att':xtr_att,'ytr_att':ytr_att,'xtr_att_truelabels':xtr_att_truelabels}
xtr_att_dict = {'xtr_att':xtr_att,'ytr_att':ytr_att}
fname = './att_train_data_'+str(data_size)
np.save(fname,xtr_att_dict)

In [None]:
# t = [[i] for i in np.argmax(ytemp11, axis=1)]
# print(np.array(t))
print(xtr_att, xtr_att.shape)
print(ytr_att, ytr_att.shape)

print('****')
# print(xts_att)
xts_att_ = [[i] for i in np.argmax(xts_att, axis=1)]
t5 = np.array(xts_att_)
print(t5, t5.shape)
print(yts_att, yts_att.shape)

# Attack Models

In [None]:
model_attack = Sequential()
model_attack.add(Dense(nh, input_shape = (xtr_att.shape[1],), activation='sigmoid', name = 'hidden'))
model_attack.add(Dense(1, activation='sigmoid', name = 'output'))
opt = Adam(learning_rate=lrate, decay=decay) 
model_attack.compile(loss='binary_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])
print("Attack Model Summary")
print(model_attack.summary())
hist_attack = model_attack.fit(xtr_att, ytr_att,
                  batch_size = batch_size,
                  epochs = epochs,
                  validation_data=(t5, yts_att), shuffle=True, verbose=0)
print('\n\nFor attack model with training datasize = %d'%xtr_att.shape[0])
print('Training accuracy = ', hist_shadow.history['accuracy'][-1])
print('Validation accuracy = ', hist_shadow.history['val_accuracy'][-1])