<a href="https://colab.research.google.com/github/yannuma/PrivacyProject/blob/main/Model_Creation/Pyvacy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Privacy Algorithms for PyTorch: PyVacy**
---
We wanted to use the [Pyvacy](https://github.com/ChrisWaites/pyvacy) library to train our neural network models with privacy guarantees.
Unfortunately, the Pyvacy library is not well maintained.
This means that the latest Pyvacy GitHub version does not match the pip version. As a result, the examples on GitHub don't work, and there is no way to know which functions to use and which parameters they need.

That's why we switched to the better maintained library Opacus. The results can be viewed in the [main file](https://github.com/yannuma/PrivacyProject/blob/main/Model_Creation/Main_PrivacyProject.ipynb).

Nevertheless, we managed to train a model with DPSGD using Pyvacy (results below). However, we do not know which privacy algorithms are behind the Pyvacy functions.

In [None]:
# install and import necessary libraries
!pip install pyvacy
import tarfile
import torch
import requests
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
import torchvision
import torch.utils.data as torch_data
import torchvision.transforms as transforms
import numpy as np
from pyvacy import optim, analysis
import itertools
import time
import pandas as pd
from google.colab import files



In [None]:
#Download the dataset and save features and labels in two different lists
# A timeout keeps the cell from hanging forever if the server stops responding.
response = requests.get('https://www.comp.nus.edu.sg/~reza/files/dataset_texas.tgz', timeout=60)
if response.status_code != 200:
    # Stop here: continuing would either crash on the missing archive below
    # or silently reuse a stale archive left over from an earlier run.
    raise RuntimeError(f"Failed to download file: {response.status_code}")

with open('dataset_texas.tgz', 'wb') as file:
    file.write(response.content)
print("Download completed successfully.")

# NOTE(review): extractall() writes whatever member paths the downloaded archive
# contains; consider restricting extraction (e.g. tarfile extraction filters)
# if the source is not fully trusted.
with tarfile.open('dataset_texas.tgz') as f:
    f.extractall('data/')

# Each file is read as a list of raw text lines (one entry per line).
with open('data/texas/100/feats', 'r') as f:
    features = f.readlines()
with open('data/texas/100/labels', 'r') as f:
    labels = f.readlines()

print(len(features))
print(len(labels))

Download completed successfully.
67330
67330


In [None]:
class NetSeq(nn.Module):
    '''
    Classical Network Architecture: 3 Linear Layers

    This neural network consists of three linear layers.
    The first and second layers use the Tanh activation function, while the
    third layer outputs the final predictions (raw class scores).
    By default the input size is 6169, and it outputs 101 classes with two
    hidden layers of size 128.
    Tanh activation function (somehow) works better than classic activation
    functions like ReLU for DPSGD.

    Parameters (all optional; the defaults reproduce the original architecture):
    input_size: number of input features
    hidden_size: width of both hidden layers
    num_classes: number of output classes
    '''
    def __init__(self, input_size=6169, hidden_size=128, num_classes=101):
        super().__init__()
        # Attribute names fc1/fc2/fc3 are kept so existing state_dicts stay loadable.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        '''Map a batch of inputs to one score per class.'''
        x = torch.tanh(self.fc1(x))
        x = torch.tanh(self.fc2(x))
        # No activation on the last layer: it returns the raw scores.
        x = self.fc3(x)
        return x

def calc_accuracy(loader, network, device):
    '''
    Function to calculate the accuracy of a network for a given dataset.

    loader: iterable yielding (inputs, labels) batches
    network: callable mapping a batch of inputs to per-class scores
    device: decides if the calculations happen on CPU or GPU

    Returns a tuple (accuracy, total), where total is the number of evaluated
    samples. For an empty loader it returns (0.0, 0) instead of dividing by zero.
    '''
    correct = 0
    total = 0
    # Evaluation never needs gradients, independent of how the caller invokes us.
    with torch.no_grad():
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = network(inputs)
            # The predicted class is the index of the highest score in each row.
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    if total == 0:
        return (0.0, 0)
    return (correct / total, total)


In [None]:
def train_nn_pp(trainloader, testloader, lr, epochs, clip, noiseMult, batchSize, delta, network_out):
    '''
    Train the neural network with DPSGD and calculate the accuracy of the trained model.

    trainloader: basically dataset to train the model
    testloader: dataset to calculate the accuracy of the trained model
    lr: learning rate
    epochs: number of training iterations
    clip: clipping norm for the differential privacy mechanism
    noiseMult: noise multiplier passed to the DPSGD optimizer and the moments accountant
    batchSize: the size of the batch
    delta: delta parameter passed to the moments accountant
    network_out: boolean; if True the function returns the model with the best
        test accuracy (moved to CPU), otherwise the tuple (max accuracy, epsilon)
    '''
    # Per-epoch diagnostics; collected for inspection/debugging but not returned.
    losses = []
    train_accs = []
    test_accs = []

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    network = NetSeq().to(device)
    criterion = nn.CrossEntropyLoss()
    max_accuracy = 0
    best_model_path = 'best_model.pth'
    # Tracks whether THIS run wrote a checkpoint, so we never load a stale file.
    checkpoint_written = False

    # Define an optimizer from Pyvacy.
    # Note: the file binds the name `optim` twice (torch.optim, then pyvacy's
    # optim); DPSGD is taken from the pyvacy module, which is imported last.
    optimizer = optim.DPSGD(
        l2_norm_clip=clip,
        noise_multiplier=noiseMult,
        batch_size=batchSize,
        params=network.parameters(),
        lr=lr,
    )

    # Calculate the epsilon with the moments accountant
    epsilon = analysis.moments_accountant(
        N=len(trainloader.dataset),
        batch_size=batchSize,
        noise_multiplier=noiseMult,
        epochs=epochs,
        delta=delta,
    )

    for epoch in range(epochs):
        epoch_loss = []

        for inputs, labels in trainloader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = network(inputs)
            loss = criterion(outputs, labels)
            epoch_loss.append(loss.item())
            loss.backward()
            optimizer.step()

        losses.append(np.mean(epoch_loss))

        # Compute accuracy on training data
        with torch.no_grad():
            train_acc, _ = calc_accuracy(trainloader, network, device)
        train_accs.append(train_acc)

        # Validate all classes
        with torch.no_grad():
            test_acc, _ = calc_accuracy(testloader, network, device)
        test_accs.append(test_acc)

        improved = test_acc > max_accuracy
        if improved:
            max_accuracy = test_acc
        # Checkpoint only when the test accuracy improved, so the file really
        # holds the best model (previously it was overwritten every epoch).
        # The first epoch is always saved so a checkpoint from this run exists.
        if network_out and (improved or not checkpoint_written):
            torch.save(network.state_dict(), best_model_path)
            checkpoint_written = True

    if network_out:
        # With epochs == 0 nothing was saved; return the untrained network
        # instead of loading a leftover checkpoint from an earlier run.
        if checkpoint_written:
            network.load_state_dict(torch.load(best_model_path))
        network.to('cpu')
        return network
    else:
        return (max_accuracy, epsilon)

Unfortunately, there is no method (or we don't know what it is called) to set a target privacy budget (epsilon) for training in advance. Only a noise multiplier can be set, which influences the resulting privacy budget (as do other parameters such as the batch size and the number of epochs).

In [None]:
# Turn the raw text lines into numeric lists so a torch dataset can be built.
# Whitespace inside a feature line is dropped before splitting on commas.
features_list = [
    [int(value) for value in ''.join(feature.split()).split(',')]
    for feature in features
]
labels_list = [int(label.strip()) for label in labels]

# The first 80% of the samples are used for training, the rest for testing.
size = int(0.8 * len(features))
feat_tens_train = torch.tensor(features_list[:size], dtype=torch.float)
l_tens_train = torch.tensor(labels_list[:size], dtype=torch.long)
feat_tens_test = torch.tensor(features_list[size:], dtype=torch.float)
l_tens_test = torch.tensor(labels_list[size:], dtype=torch.long)


# Parameters used to train the neural network:
lr = 0.01
epochs = [20, 40, 60]
clip = 3
noiseMult = [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.1, 1.2]
batch = 128
delta = 1e-5
pyvacy_res = []

dataset_train = TensorDataset(feat_tens_train, l_tens_train)
trainloader = DataLoader(dataset_train, batch_size=batch, shuffle=True)
dataset_test = TensorDataset(feat_tens_test, l_tens_test)
testloader = DataLoader(dataset_test, batch_size=batch, shuffle=True)

# Train one model per (noise multiplier, epoch count) combination.
# product() iterates the epoch counts fastest, i.e. in nested-loop order.
for nm, epoch in itertools.product(noiseMult, epochs):
    start_time = time.time()
    out = train_nn_pp(trainloader, testloader, lr, epoch, clip, nm, batch, delta, False)
    end_time = time.time()
    elapsed_time = end_time - start_time
    # out is (max accuracy, epsilon) because network_out is False.
    pyvacy_res.append(
        dict(epsilon=out[1], epoch=epoch, accuracy=out[0], time=elapsed_time)
    )

# Collect the results in a table, save it as CSV and offer it for download.
df = pd.DataFrame(pyvacy_res)
df.to_csv('df_results_Pyvacy.csv', index=False)
files.download('df_results_Pyvacy.csv')
df

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,epsilon,epoch,accuracy,time
0,28.168824,20,0.372345,137.881667
1,38.321284,40,0.475865,275.346087
2,46.91545,60,0.499183,409.699776
3,12.278318,20,0.3794,136.543005
4,15.924232,40,0.455146,270.280533
5,18.967079,60,0.472152,412.085328
6,6.489703,20,0.371083,135.833283
7,8.207478,40,0.422991,271.550151
8,9.663488,60,0.425962,410.135968
9,3.981852,20,0.35727,137.403231
