# Section Project:

For the final project for this section, you're going to train a DP model using this PATE method on the MNIST dataset, provided below.

## Import Modules

In [1]:
import random
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.distributions as dists
import torch.utils.data as data

import torchvision.datasets as datasets
import torchvision.transforms as transforms

from syft.frameworks.torch.differential_privacy import pate

# Seed every random number generator this notebook relies on (Python's
# `random`, NumPy and PyTorch) with the same value.
_SEED = 0
random.seed(_SEED)
np.random.seed(_SEED)
torch.manual_seed(_SEED)

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
_device_name = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)
device



device(type='cuda', index=0)

## Prepare Data

### Load the MNIST Training & Test Datasets

In [2]:
def _label_histogram(targets):
    """Return a ``{label: count}`` dict for a tensor of integer labels."""
    labels, counts = torch.unique(targets, return_counts=True)
    return {label.item(): count.item() for label, count in zip(labels, counts)}


# Both splits are downloaded to ../data (if missing) and converted with ToTensor.
_to_tensor = transforms.ToTensor()
mnist_trainset = datasets.MNIST(root='../data', train=True, download=True, transform=_to_tensor)
mnist_testset = datasets.MNIST(root='../data', train=False, download=True, transform=_to_tensor)

# Dataset sizes.
print("Training Set Size:", len(mnist_trainset))
print("Test Set Size:", len(mnist_testset))
print()

# Value range of the raw (untransformed) pixel data across both splits.
_min_value = torch.min(mnist_trainset.data.min(), mnist_testset.data.min())
_max_value = torch.max(mnist_trainset.data.max(), mnist_testset.data.max())
print("Min Data Value:", _min_value)
print("Max Data Value:", _max_value)
print()

# Per-class instance counts of each split.
print("Train Label Counts:", _label_histogram(mnist_trainset.targets))
print("Test Label Counts:", _label_histogram(mnist_testset.targets))

Training Set Size: 60000
Test Set Size: 10000

Min Data Value: tensor(0, dtype=torch.uint8)
Max Data Value: tensor(255, dtype=torch.uint8)

Train Label Counts: {0: 5923, 1: 6742, 2: 5958, 3: 6131, 4: 5842, 5: 5421, 6: 5918, 7: 6265, 8: 5851, 9: 5949}
Test Label Counts: {0: 980, 1: 1135, 2: 1032, 3: 1010, 4: 982, 5: 892, 6: 958, 7: 1028, 8: 974, 9: 1009}


### Create Training Datasets

In [3]:
n_teachers = 250
batch_size = 30

# Every teacher receives an equally sized, contiguous, non-overlapping slice of
# the MNIST training set.
_teacher_dataset_len = len(mnist_trainset) // n_teachers


def _teacher_indices(i_teacher):
    """Return the training-set indices that belong to teacher ``i_teacher``."""
    start = i_teacher * _teacher_dataset_len
    return list(range(start, start + _teacher_dataset_len))


def _training_loader(dataset):
    """Build a shuffling DataLoader that drops the final incomplete batch."""
    return data.DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True)


teacher_datasets = [data.Subset(mnist_trainset, _teacher_indices(i)) for i in range(n_teachers)]

# The MNIST test set is split 90% / 10%: the first part is the dataset that will
# receive generated labels, the remainder is held out for evaluation.
_dp_split = int(len(mnist_testset) * 0.9)
dp_dataset = data.Subset(mnist_testset, list(range(_dp_split)))
test_dataset = data.Subset(mnist_testset, list(range(_dp_split, len(mnist_testset))))

teacher_dataloaders = [_training_loader(dataset) for dataset in teacher_datasets]
dp_dataloader = _training_loader(dp_dataset)
test_dataloader = data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, drop_last=False)

print("Size of each teacher dataset:", _teacher_dataset_len)
print("Size of the (soon to be) differentially private dataset:", len(dp_dataset))
print("Size of the test dataset:", len(test_dataset))

Size of each teacher dataset: 240
Size of the (soon to be) differentially private dataset: 9000
Size of the test dataset: 1000


## Build Models

### Define Classifier

In [4]:
class MNISTClassifier(nn.Module):
    """Small ConvNet mapping 1x28x28 images to 10 class logits.

    Three (BatchNorm -> Conv3x3 -> ReLU -> MaxPool) stages reduce the input to
    16x4x4 features, which are flattened to 256 values, batch-normalised and
    passed through a single linear layer.
    """

    def __init__(self):
        super().__init__()

        # Stage 0: 1x28x28 -> 4x14x14
        self.bn0 = nn.BatchNorm2d(1)
        self.conv0 = nn.Conv2d(1, 4, 3, padding=1)
        self.bn1 = nn.BatchNorm2d(4)
        self.maxpool0 = nn.MaxPool2d(2)

        # Stage 1: 4x14x14 -> 8x7x7
        self.conv1 = nn.Conv2d(4, 8, 3, padding=1)
        self.bn2 = nn.BatchNorm2d(8)
        self.maxpool1 = nn.MaxPool2d(2)

        # Stage 2: 8x7x7 -> 16x4x4 (the pooling is padded so 7 becomes 4)
        self.conv2 = nn.Conv2d(8, 16, 3, padding=1)
        self.maxpool2 = nn.MaxPool2d(2, padding=1)

        # Head: 16 * 4 * 4 = 256 features -> 10 logits
        self.bn3 = nn.BatchNorm1d(256)
        self.fc = nn.Linear(256, 10)

        self.activation = nn.ReLU()

    def forward(self, x):
        """Return the logits (one row of 10 values per image) for batch ``x``."""
        stages = (
            (self.bn0, self.conv0, self.maxpool0),
            (self.bn1, self.conv1, self.maxpool1),
            (self.bn2, self.conv2, self.maxpool2),
        )
        for norm, conv, pool in stages:
            x = pool(self.activation(conv(norm(x))))

        flat = x.view(-1, 256)
        return self.fc(self.bn3(flat))

### Create Teacher & DP Models

In [5]:
# Build all classifiers in one pass on the target device: the first
# `n_teachers` become the teachers and the last one is the DP (student) model.
*teachers, dp_model = [MNISTClassifier().to(device) for _ in range(n_teachers + 1)]

## Train Teachers

In [None]:
lr       = 1e-2
n_epochs = 5

# One Adam optimizer per teacher; the loss function is shared.
criterion          = nn.CrossEntropyLoss()
teacher_optimizers = [optim.Adam(model.parameters(), lr=lr) for model in teachers]

for model in teachers:
    model.train()

# Per-epoch lists (one entry per teacher) of the average loss / accuracy.
teachers_train_history = {'avg_losses': {}, 'avg_accuracies': {}}
for i_epoch in range(n_epochs):
    avg_losses, avg_accuracies = [], []

    teacher_setups = zip(teachers, teacher_dataloaders, teacher_optimizers)
    for i_model, (model, dataloader, optimizer) in enumerate(teacher_setups):
        instance_count, total_loss, correct_count = 0, 0., 0

        n_batches = len(dataloader)
        _prev_str_len = 0
        for i, (imgs, labels) in enumerate(dataloader):
            # Single-line progress indicator, padded so that it fully
            # overwrites the previously printed one.
            _batch_str = "Teacher {:d}/{:d}: ({:d}/{:d})".format(i_model, n_teachers-1, i, n_batches-1)
            print(_batch_str.ljust(_prev_str_len), end='\r')
            _prev_str_len = len(_batch_str)

            batch_len = imgs.size(0)
            instance_count += batch_len

            imgs, labels = imgs.to(device), labels.to(device)

            # Standard optimisation step.
            optimizer.zero_grad()
            outs = model(imgs)
            loss = criterion(outs, labels)
            loss.backward()
            optimizer.step()

            # The criterion returns the batch mean, so weight it by the batch
            # size to accumulate a per-instance total.
            total_loss += loss.item() * batch_len

            preds = torch.argmax(outs, dim=1)
            correct_count += (preds == labels).sum().item()

        avg_losses.append(total_loss / instance_count)
        avg_accuracies.append(correct_count / instance_count)

    # Epoch summary; the header is padded to wipe the last progress line.
    _epoch_str = "Epoch {:d}/{:d}".format(i_epoch, n_epochs-1).ljust(_prev_str_len)
    print(_epoch_str)
    print("    Avg Losses:", [round(avg_loss, 5) for avg_loss in avg_losses])
    print("    Avg Accuracies:", [round(avg_acc, 4) for avg_acc in avg_accuracies])
    print()

    teachers_train_history['avg_losses'][i_epoch]     = avg_losses
    teachers_train_history['avg_accuracies'][i_epoch] = avg_accuracies

Teacher 27/249: (2/7)

## Generate Labels for the Differentially Private Training Set

### Predict Labels Using Teacher Models

In [7]:
for model in teachers:
    model.eval()

# Unshuffled loader so that prediction column k corresponds to dp_dataset[k].
dataloader = data.DataLoader(dp_dataset, batch_size=256, shuffle=False, drop_last=False)
n_batches = len(dataloader)

# batches_of_preds[b][t] holds teacher t's predicted labels for batch b.
batches_of_preds = []

_prev_str_len = 0
with torch.no_grad():
    for i, (imgs, _) in enumerate(dataloader):
        imgs = imgs.to(device)

        batch_of_preds = []
        for j, model in enumerate(teachers):
            # Single-line progress indicator, padded to overwrite the previous one.
            _progress_str = "Batch {:d}/{:d} - Teacher {:d}/{:d}".format(i, n_batches-1, j, n_teachers-1)
            print(_progress_str.ljust(_prev_str_len), end='\r')
            _prev_str_len = len(_progress_str)

            outs  = model(imgs)
            preds = outs.argmax(dim=1)
            batch_of_preds.append(preds.cpu())

        batches_of_preds.append(batch_of_preds)

# Stack the teachers along dim 0 within each batch, then join the batches along
# dim 1: the result has one row per teacher and one column per image.
_stacked_batches = [torch.stack(batch_of_preds, dim=0) for batch_of_preds in batches_of_preds]
label_preds = torch.cat(_stacked_batches, dim=1)

print()
print(label_preds)

Batch 35/35 - Teacher 19/19
tensor([[7, 2, 1,  ..., 6, 9, 0],
        [7, 2, 1,  ..., 6, 9, 0],
        [7, 2, 1,  ..., 6, 9, 0],
        ...,
        [7, 2, 1,  ..., 6, 9, 0],
        [7, 2, 1,  ..., 6, 9, 0],
        [7, 2, 1,  ..., 6, 9, 0]])


### Get Label Counts for Each Image

In [8]:
# Count, for every image (column of label_preds), how many teachers voted for
# each of the 10 classes. The result has one row per class and one column per image.
_votes = label_preds.numpy()
_per_image_counts = [np.bincount(image_votes, minlength=10) for image_votes in _votes.T]
label_counts = torch.from_numpy(np.stack(_per_image_counts, axis=1))
print(label_counts)

tensor([[ 0,  0,  0,  ...,  0,  0, 20],
        [ 0,  0, 20,  ...,  0,  0,  0],
        [ 0, 20,  0,  ...,  0,  0,  0],
        ...,
        [20,  0,  0,  ...,  0,  2,  0],
        [ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ...,  0, 18,  0]])


### Obtain Labels from Noisy Counts with a Certain $\epsilon$ Value

In [33]:
epsilon = 0.3

# Zero-centred Laplace noise whose scale is 1 / epsilon.
_noise_scale = 1 / epsilon
noise_dist = dists.Laplace(loc=torch.tensor(0., dtype=torch.float),
                           scale=torch.tensor(_noise_scale, dtype=torch.float))

# Independent noise for each of the 10 class counts of every image.
_noise = noise_dist.sample((10, label_counts.size(1)))
noisy_counts = label_counts.float() + _noise

# The generated label of an image is the class with the largest noisy count.
generated_labels = noisy_counts.argmax(dim=0)
print(generated_labels)
print()

# Fraction of images whose noisy label equals the noise-free majority vote.
_majority_labels = label_counts.argmax(dim=0)
_agreement = (generated_labels == _majority_labels).float().mean().item()
print("Noisy Accuracy Against Predictions:", _agreement)

tensor([7, 2, 1,  ..., 6, 9, 0])

Noisy Accuracy Against Predictions: 0.9543333053588867


### Perform PATE Analysis to Check Information Leak

In [34]:
# Run PySyft's PATE analysis on the raw teacher predictions and the noisy
# labels generated from them; the returned value is displayed as the cell output.
# NOTE(review): presumably the result is (data-dependent epsilon,
# data-independent epsilon) - confirm against the PySyft documentation.
pate.perform_analysis(
    label_preds,
    generated_labels,
    epsilon,
    delta=1e-5,
    moments=1,
)



(3251.512925465141, 3251.51292546497)

### Assign the Generated Labels to the DP Training Set

In [None]:
# Overwrite, in place, the targets of the images that belong to dp_dataset with
# the generated labels. dp_dataset is a Subset view of mnist_testset, so it
# serves these labels from now on.
_dp_indices = torch.tensor(dp_dataset.indices, dtype=torch.long)
mnist_testset.targets[_dp_indices] = generated_labels

## Train the DP Model

In [None]:
lr       = 3e-3
n_epochs = 10

criterion    = nn.CrossEntropyLoss()
dp_optimizer = optim.Adam(dp_model.parameters(), lr=lr)

dp_model.train()

# Per-epoch average loss / accuracy of the DP model on the generated labels.
dp_train_history = {'avg_losses': {}, 'avg_accuracies': {}}
for i_epoch in range(n_epochs):
    instance_count, total_loss, correct_count = 0, 0., 0.

    n_batches = len(dp_dataloader)
    _prev_str_len = 0
    for i, (imgs, labels) in enumerate(dp_dataloader):
        # Single-line progress indicator, padded to overwrite the previous one.
        _batch_str = "Epoch {:d}/{:d}: ({:d}/{:d})".format(i_epoch, n_epochs-1, i, n_batches-1)
        print(_batch_str.ljust(_prev_str_len), end='\r')
        _prev_str_len = len(_batch_str)

        batch_len = imgs.size(0)
        instance_count += batch_len

        imgs, labels = imgs.to(device), labels.to(device)

        # Standard optimisation step.
        dp_optimizer.zero_grad()
        outs = dp_model(imgs)
        loss = criterion(outs, labels)
        loss.backward()
        dp_optimizer.step()

        # The criterion returns the batch mean, so weight it by the batch size
        # to accumulate a per-instance total.
        total_loss += loss.item() * batch_len

        preds = torch.argmax(outs, dim=1)
        correct_count += (preds == labels).sum().item()

    avg_loss     = total_loss / instance_count
    avg_accuracy = correct_count / instance_count

    # Blank line, the two epoch metrics, blank line.
    print("",
          "    Avg Loss: {:.6f}".format(avg_loss),
          "    Avg Accuracy: {:.4f}".format(avg_accuracy),
          "",
          sep='\n')

    dp_train_history['avg_losses'][i_epoch]     = avg_loss
    dp_train_history['avg_accuracies'][i_epoch] = avg_accuracy

## Evaluate the Resulting Model on the Test Set

In [None]:
# Evaluate the trained DP model on the held-out test split and report the
# per-instance average loss and accuracy.
dp_model.eval()

# Summed (not averaged) per batch so that dividing by the instance count at the
# end yields the exact per-instance mean even when the last batch is smaller.
criterion = nn.CrossEntropyLoss(reduction='sum')

instance_count = 0
total_loss     = 0.
correct_count  = 0.

# Progress is reported against the test loader itself, not against a loader
# left over from an earlier cell.
n_batches = len(test_dataloader)
with torch.no_grad():
    for i, (imgs, labels) in enumerate(test_dataloader):
        print("Batch {:d}/{:d}".format(i, n_batches-1), end='\r')

        instance_count += imgs.size(0)

        # Both tensors must be on the model's device for the loss and the
        # accuracy comparison below.
        imgs   = imgs.to(device)
        labels = labels.to(device)

        # Evaluate the DP model (not `model`, which is still bound to the last
        # teacher iterated over in an earlier cell).
        outs = dp_model(imgs)

        total_loss += criterion(outs, labels).item()

        preds = outs.argmax(dim=1)

        correct_count += (preds == labels).sum().item()

print()
print("Average Loss:", total_loss / instance_count)
print("Average Accuracy:", correct_count / instance_count)

1. Prepare Data
    1. Split the training dataset into `n + 1` smaller datasets where `n` is the number of teacher models
    2. Define a Dataset class and a DataLoader that can give batches of data for all `n` teacher datasets
2. Define Model(s)
    1. A simple ConvNet for both the main model and all teacher models
    2. If too slow: custom `nn.Module` that can process `n` batches at once for all `n` teachers
3. Train Teachers
4. Label Unlabeled Training Dataset in a Differentially Private Manner
    1. Generate raw labels
    2. PATE analysis to find a proper `epsilon` value
    3. Add proper noise to the label counts
    4. Take the labels with most counts
5. Train the Main Model On the Training Dataset with Generated Labels
6. Test on the Test Dataset