In [156]:
import numpy as np
import torch
import torch.nn as nn
import torchvision
import torchvision.datasets as datasets
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

# Single Model

In [141]:
def grad_immediate_sensitivity(model, criterion, inputs, labels, epoch):
    inp = Variable(inputs, requires_grad=True)
    
    outputs = model.forward(inp)
    loss = criterion(outputs, labels)
    
    # (1) first-order gradient (wrt parameters)
    first_order_grads = torch.autograd.grad(loss, model.parameters(), retain_graph=True, create_graph=True)
    
    # (2) L2 norm of the gradient from (1)
    grad_l2_norm = torch.norm(torch.cat([x.view(-1) for x in first_order_grads]), p = 2)
    
    # (3) Gradient (wrt inputs) of the L2 norm of the gradient from (2)
    sensitivity_vec = torch.autograd.grad(grad_l2_norm, inp, retain_graph=True)[0]
    
    # (4) L2 norm of (3) - "immediate sensitivity"
    s = [torch.norm(v, p=2).numpy().item() for v in sensitivity_vec]
    
    '''
    if epoch > 5:
        print(f"inputs: ",inp)
        print(f"outputs: ", outputs)
        print(f"loss: ", loss)
        print(f"first_order_grads: ", first_order_grads)
        print(f"grad_l2_norm:: ", grad_l2_norm)
        print(f"sensitivity_vec: ", sensitivity_vec)
        print(f"sensitivies: ", s)
    '''

    loss.backward()
    return loss, s

In [142]:
def mnist_accuracy(model, test_loader):
    correct = 0
    num_data = 0

    #grab a batch from the test loader
    for examples, labels in test_loader:
        outputs = model.forward(examples)
        
        #for each output in the batch, check if the label is correct
        for i, output in enumerate(outputs):
            num_data += 1
            
            max_i = np.argmax(output.detach().numpy())
            if max_i == labels[i]:
                correct += 1

    acc = float(correct)/num_data
    
    return acc

In [143]:
BATCH_SIZE = 16

mnist_trainset = datasets.MNIST(root='./data', train=True, download=True, transform=torchvision.transforms.ToTensor())
mnist_testset = datasets.MNIST(root='./data', train=False, download=True, transform=torchvision.transforms.ToTensor())
print(len(mnist_trainset))
print(len(mnist_testset))
train_loader = DataLoader(mnist_trainset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
test_loader = DataLoader(mnist_testset, batch_size=BATCH_SIZE, shuffle=False, drop_last=True)

# next(iter(train_loader))[0].shape --> torch.Size([16, 1, 28, 28])
# This means we have 16 examples of 28x28 pixels in grayscale (i.e. no rgb channels, hence the one).

60000
10000


In [165]:
class mnist_Classifier(nn.Module):
    def __init__(self):
        super(mnist_Classifier, self).__init__()
        self.conv1 = nn.Conv2d(1, 28, kernel_size=(5,5))
        self.conv2 = nn.Conv2d(28, 32, kernel_size=(5,5))
        self.fc1 = nn.Linear(32*20*20, 16)
        self.fc2 = nn.Linear(16, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = nn.ReLU()(x)
        x = self.conv2(x)
        x = nn.ReLU()(x)
        # print(x.size()) --> torch.Size([16, 32, 20, 20])
        x = x.view(-1, 32*20*20)
        x = self.fc1(x)
        x = nn.ReLU()(x)
        x = self.fc2(x)
        x = nn.ReLU()(x)
        return torch.softmax(x,dim=1)

In [166]:
def run_mnist():
    # reset the model
    model = mnist_Classifier()
    model_criterion = nn.CrossEntropyLoss()
    model_optimizer = optim.Adam(model.parameters(),lr=0.001)

    # number of epochs and iterations
    epochs = 30
    iters = epochs * BATCH_SIZE
    
    # plotting criteria
    train_losses = []
    test_accs = []

    for epoch in range(epochs):
        print('Start of epoch %d' % (epoch,))
        all_sensitivities = []
        sigmas = []

        for batch_id, (x_batch_train, y_batch_train) in enumerate(train_loader):
            
            #zero out the gradients from the previous iteration
            model_optimizer.zero_grad()
            
            #compute loss
            outputs = model.forward(x_batch_train)
            loss = model_criterion(outputs, y_batch_train)
            loss.backward()
            train_losses.append(loss.item())
            
            
            # perform the backpropagation
            model_optimizer.step()

        print("Average train loss:", np.mean(train_losses))
        test_accs.append(mnist_accuracy(model, test_loader))
        print("Accuracy:", test_accs[-1])
    return mnist_accuracy(model, test_loader), (train_losses, test_accs)

In [167]:
results = run_mnist()

Start of epoch 0


KeyboardInterrupt: 

# Two Models at the Same Time

In [147]:
def inf_loader_generator(dataloader):
    '''
    Generates a function that infinitely samples a dataloader
    '''
    while True:
        for x, y in dataloader:
            yield x, y

In [148]:
def mnist_accuracy_2(model, inf_gen, test_steps):
    correct = 0
    num_data = 0

    #grab a batch from the test loader
    for j in range(test_steps):
        examples, labels = next(inf_gen)
        outputs = model.forward(examples)

        #for each output in the batch, check if the label is correct
        for i, output in enumerate(outputs):
            num_data += 1

            max_i = np.argmax(output.detach().numpy())
            if max_i == labels[i]:
                correct += 1

    acc = float(correct)/num_data
    
    return acc

In [149]:
BATCH_SIZE_1 = 16
BATCH_SIZE_2 = 16

mnist_trainset_1 = datasets.MNIST(root='./data', train=True, download=True, transform=torchvision.transforms.ToTensor())
mnist_trainset_2 = datasets.MNIST(root='./data', train=True, download=True, transform=torchvision.transforms.ToTensor())
mnist_testset_1 = datasets.MNIST(root='./data', train=False, download=True, transform=torchvision.transforms.ToTensor())
mnist_testset_2 = datasets.MNIST(root='./data', train=False, download=True, transform=torchvision.transforms.ToTensor())

print(len(mnist_trainset_1))
print(len(mnist_testset_1))
print(len(mnist_trainset_2))
print(len(mnist_testset_2))

train_loader_1 = DataLoader(mnist_trainset_1, batch_size=BATCH_SIZE_1, shuffle=True, drop_last=True)
test_loader_1 = DataLoader(mnist_testset_1, batch_size=BATCH_SIZE_1, shuffle=True, drop_last=True)
train_loader_2 = DataLoader(mnist_trainset_2, batch_size=BATCH_SIZE_2, shuffle=True, drop_last=True)
test_loader_2 = DataLoader(mnist_testset_2, batch_size=BATCH_SIZE_2, shuffle=True, drop_last=True)

inf_train_1 = inf_loader_generator(train_loader_1)
inf_test_1 = inf_loader_generator(test_loader_1)
inf_train_2 = inf_loader_generator(train_loader_2)
inf_test_2 = inf_loader_generator(test_loader_2)

iters_per_epoch_train_1 = len(mnist_trainset_1)//BATCH_SIZE_1
iters_per_epoch_test_1 = len(mnist_testset_1)//BATCH_SIZE_1
iters_per_epoch_train_2 = len(mnist_trainset_2)//BATCH_SIZE_2
iters_per_epoch_test_2 = len(mnist_testset_2)//BATCH_SIZE_2

# next(iter(train_loader))[0].shape --> torch.Size([16, 1, 28, 28])
# This means we have 16 examples of 28x28 pixels in grayscale (i.e. no rgb channels, hence the one).

60000
10000
60000
10000


In [150]:
#Testing out the infinite loader - the batch_ids reset when the dataloader reshuffles. 
'''
train_gen_1 = inf_loader_generator(train_loader_1)
batch_ids = []
for i in range(10000):
    batch_id, (x_batch_train, y_batch_train) = next(train_gen_1)
    batch_ids.append(batch_id)
print(batch_ids)
'''
#next(iter(train_loader_1))
#next(iter(test_loader_1))

'\ntrain_gen_1 = inf_loader_generator(train_loader_1)\nbatch_ids = []\nfor i in range(10000):\n    batch_id, (x_batch_train, y_batch_train) = next(train_gen_1)\n    batch_ids.append(batch_id)\nprint(batch_ids)\n'

In [151]:
def run_two_mnist():
    # reset the model 1
    model_1 = mnist_Classifier()
    model_criterion_1 = nn.CrossEntropyLoss()
    model_optimizer_1 = optim.Adam(model_1.parameters(),lr=0.001)
    
    # reset the model 2
    model_2 = mnist_Classifier()
    model_criterion_2 = nn.CrossEntropyLoss()
    model_optimizer_2 = optim.Adam(model_2.parameters(),lr=0.001)

    # plotting criteria
    train_losses_1 = []
    test_accs_1 = []
    train_losses_2 = []
    test_accs_2 = []

    training = True
    model_steps_1 = 0
    model_steps_2 = 0
    train_steps_per_epoch_1 = 2000
    test_steps_per_epoch_1 = 300
    train_steps_per_epoch_2 = 2000
    test_steps_per_epoch_2 = 300
    print("Model 1: Start of Epoch %d" % (model_steps_1/train_steps_per_epoch_1))
    print("Model 2: Start of Epoch %d" % (model_steps_2/train_steps_per_epoch_2))
    while training:
            
        # model 1
        x_batch_train_1, y_batch_train_1 = next(inf_train_1)
        #zero out the gradients from the previous iteration
        model_optimizer_1.zero_grad()

        #compute loss
        outputs_1 = model_1.forward(x_batch_train_1)
        loss_1 = model_criterion_1(outputs_1, y_batch_train_1)
        loss_1.backward()
        train_losses_1.append(loss_1.item())
        
        #perform the backpropagation
        model_optimizer_1.step()
        model_steps_1 += 1
        
        if model_steps_1 % train_steps_per_epoch_1 == 0:
            print("Model 1 Average Train Loss:", np.mean(train_losses_1))
            test_accs_1.append(mnist_accuracy_2(model_1, inf_test_1, test_steps_per_epoch_1))
            print("Accuracy:", test_accs_1[-1])
            print("Model 1: Start of Epoch %d" % (model_steps_1/train_steps_per_epoch_1))
        
        # model 2
        x_batch_train_2, y_batch_train_2 = next(inf_train_2)
        #zero out the gradients from the previous iteration
        model_optimizer_2.zero_grad()

        #compute loss
        outputs_2 = model_2.forward(x_batch_train_2)
        loss_2 = model_criterion_2(outputs_2, y_batch_train_2)
        loss_2.backward()
        train_losses_2.append(loss_2.item())
        
        #perform the backpropagation
        model_optimizer_2.step()
        model_steps_2 += 1
        
        if model_steps_2 % train_steps_per_epoch_2 == 0:
            print("Model 2 Average Train Loss:", np.mean(train_losses_2))
            test_accs_2.append(mnist_accuracy_2(model_2, inf_test_2, test_steps_per_epoch_2))
            print("Accuracy:", test_accs_2[-1])
            print("Model 2: Start of Epoch %d" % (model_steps_2/train_steps_per_epoch_2))
        
        if model_steps_1 > (train_steps_per_epoch_1*50) + 1:
            break
        
    return [mnist_accuracy_2(model_1, inf_test_1, test_steps_per_epoch_1*6), (train_losses_1, test_accs_1),
            mnist_accuracy_2(model_1, inf_test_2, test_steps_per_epoch_2*6), (train_losses_2, test_accs_2)]

In [None]:
two_mnist_results = run_two_mnist()

Model 1: Start of Epoch 0
Model 2: Start of Epoch 0
Model 1 Average Train Loss: 1.8432904768586158
Accuracy: 0.6695833333333333
Model 1: Start of Epoch 1
Model 2 Average Train Loss: 1.9315890341997146
Accuracy: 0.6008333333333333
Model 2: Start of Epoch 1
Model 1 Average Train Loss: 1.8133660935759544
Accuracy: 0.684375
Model 1: Start of Epoch 2
Model 2 Average Train Loss: 1.9156818189024925
Accuracy: 0.589375
Model 2: Start of Epoch 2
Model 1 Average Train Loss: 1.7992913405696551
Accuracy: 0.7572916666666667
Model 1: Start of Epoch 3
Model 2 Average Train Loss: 1.9085230160752933
Accuracy: 0.5945833333333334
Model 2: Start of Epoch 3
Model 1 Average Train Loss: 1.7700483030229808
Accuracy: 0.7816666666666666
Model 1: Start of Epoch 4
Model 2 Average Train Loss: 1.905207740932703
Accuracy: 0.600625
Model 2: Start of Epoch 4
Model 1 Average Train Loss: 1.7520967221140862
Accuracy: 0.7829166666666667
Model 1: Start of Epoch 5
Model 2 Average Train Loss: 1.9025734634399414
Accuracy: 0.58

# Passing Gradients from One Model To the Second

In [None]:
BATCH_SIZE_1 = 16
BATCH_SIZE_2 = 16

mnist_trainset_1 = datasets.MNIST(root='./data', train=True, download=True, transform=torchvision.transforms.ToTensor())
mnist_trainset_2 = datasets.MNIST(root='./data', train=True, download=True, transform=torchvision.transforms.ToTensor())
mnist_testset_1 = datasets.MNIST(root='./data', train=False, download=True, transform=torchvision.transforms.ToTensor())
mnist_testset_2 = datasets.MNIST(root='./data', train=False, download=True, transform=torchvision.transforms.ToTensor())

print(len(mnist_trainset_1))
print(len(mnist_testset_1))
print(len(mnist_trainset_2))
print(len(mnist_testset_2))

train_loader_1 = DataLoader(mnist_trainset_1, batch_size=BATCH_SIZE_1, shuffle=True, drop_last=True)
test_loader_1 = DataLoader(mnist_testset_1, batch_size=BATCH_SIZE_1, shuffle=True, drop_last=True)
train_loader_2 = DataLoader(mnist_trainset_2, batch_size=BATCH_SIZE_2, shuffle=True, drop_last=True)
test_loader_2 = DataLoader(mnist_testset_2, batch_size=BATCH_SIZE_2, shuffle=True, drop_last=True)

inf_train_1 = inf_loader_generator(train_loader_1)
inf_test_1 = inf_loader_generator(test_loader_1)
inf_train_2 = inf_loader_generator(train_loader_2)
inf_test_2 = inf_loader_generator(test_loader_2)

iters_per_epoch_train_1 = len(mnist_trainset_1)//BATCH_SIZE_1
iters_per_epoch_test_1 = len(mnist_testset_1)//BATCH_SIZE_1
iters_per_epoch_train_2 = len(mnist_trainset_2)//BATCH_SIZE_2
iters_per_epoch_test_2 = len(mnist_testset_2)//BATCH_SIZE_2

# next(iter(train_loader))[0].shape --> torch.Size([16, 1, 28, 28])
# This means we have 16 examples of 28x28 pixels in grayscale (i.e. no rgb channels, hence the one).

In [None]:
def share_gradient_mnist():
    # reset the model 1
    model_1 = mnist_Classifier()
    model_criterion_1 = nn.CrossEntropyLoss()
    model_optimizer_1 = optim.Adam(model_1.parameters(),lr=0.001)
    
    # reset the model 2
    model_2 = mnist_Classifier()
    model_criterion_2 = nn.CrossEntropyLoss()
    model_optimizer_2 = optim.Adam(model_2.parameters(),lr=0.001)

    # plotting criteria
    train_losses_1 = []
    test_accs_1 = []
    train_losses_2 = []
    test_accs_2 = []

    training = True
    model_steps_1 = 0
    model_steps_2 = 0
    train_steps_per_epoch_1 = 2000
    test_steps_per_epoch_1 = 300
    train_steps_per_epoch_2 = 2000
    test_steps_per_epoch_2 = 300
    print("Model 1: Start of Epoch %d" % (model_steps_1/train_steps_per_epoch_1))
    print("Model 2: Start of Epoch %d" % (model_steps_2/train_steps_per_epoch_2))
    while training:
            
        # model 1
        x_batch_train_1, y_batch_train_1 = next(inf_train_1)
        #zero out the gradients from the previous iteration
        model_optimizer_1.zero_grad()

        #compute loss
        outputs_1 = model_1.forward(x_batch_train_1)
        loss_1 = model_criterion_1(outputs_1, y_batch_train_1)
        loss_1.backward()
        train_losses_1.append(loss_1.item())
        
        model_1_params = list(model_1.parameters())
        
        #perform the backpropagation
        model_optimizer_1.step()
        model_steps_1 += 1
        
        if model_steps_1 % train_steps_per_epoch_1 == 0:
            print("Model 1 Average Train Loss:", np.mean(train_losses_1))
            test_accs_1.append(mnist_accuracy_2(model_1, inf_test_1, test_steps_per_epoch_1))
            print("Accuracy:", test_accs_1[-1])
            print("Model 1: Start of Epoch %d" % (model_steps_1/train_steps_per_epoch_1))
        
        # model 2
        x_batch_train_2, y_batch_train_2 = next(inf_train_2)
        #zero out the gradients from the previous iteration
        model_optimizer_2.zero_grad()

        #compute loss
        outputs_2 = model_2.forward(x_batch_train_2)
        loss_2 = model_criterion_2(outputs_2, y_batch_train_2)
        loss_2.backward()
        train_losses_2.append(loss_2.item())
        
        # Add model 1's gradient to model 2
        model_2_params = list(model_2.parameters())
        for p,q in zip(model_1_params, model_2_params):
            q.grad += p.grad
        
        #perform the backpropagation
        model_optimizer_2.step()
        model_steps_2 += 1
        
        if model_steps_2 % train_steps_per_epoch_2 == 0:
            print("Model 2 Average Train Loss:", np.mean(train_losses_2))
            test_accs_2.append(mnist_accuracy_2(model_2, inf_test_2, test_steps_per_epoch_2))
            print("Accuracy:", test_accs_2[-1])
            print("Model 2: Start of Epoch %d" % (model_steps_2/train_steps_per_epoch_2))
            
        if model_steps_1 > (train_steps_per_epoch_1*50) + 1:
            break
        
    return [mnist_accuracy_2(model_1, inf_test_1, test_steps_per_epoch_1*6), (train_losses_1, test_accs_1),
            mnist_accuracy_2(model_2, inf_test_2, test_steps_per_epoch_2*6), (train_losses_2, test_accs_2)]

In [None]:
share_gradient_results = share_gradient_mnist()

In [113]:
print(len(example_params))
print("-----------")

'''
class mnist_Classifier(nn.Module):
    def __init__(self):
        super(mnist_Classifier, self).__init__()
        self.conv1 = nn.Conv2d(1, 28, kernel_size=(5,5))
        self.conv2 = nn.Conv2d(28, 32, kernel_size=(5,5))
        self.fc1 = nn.Linear(32*20*20, 16)
        self.fc2 = nn.Linear(16, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = nn.ReLU()(x)
        x = self.conv2(x)
        x = nn.ReLU()(x)
        # print(x.size()) --> torch.Size([16, 32, 20, 20])
        x = x.view(-1, 32*20*20)
        x = self.fc1(x)
        x = nn.ReLU()(x)
        x = self.fc2(x)
        x = nn.ReLU()(x)
        return torch.softmax(x,dim=1)
'''

for i in range(len(example_params)):
    print(len(example_params[i]))
          
print('-----------')
    
print(len(example_params[0]))
print(len(example_params[0][0]))
print(len(example_params[0][0][0]))

#print(example_params[0])
print(example_params[0][0])
print(example_params[0][0][0])
print(example_params[1])
#print(example_params[1][0])
#print(example_params[1][0][0])

print("------------")
print(len(example_params[0].grad))

8
-----------
28
28
32
32
16
16
10
10
-----------
28
1
5
tensor([[[ 0.1573,  0.0239, -0.1093,  0.1152, -0.0214],
         [-0.0398,  0.0588,  0.0438,  0.0889, -0.1593],
         [ 0.0337, -0.0114, -0.0363,  0.0549,  0.0545],
         [-0.0629, -0.0394, -0.0910, -0.0025, -0.0461],
         [ 0.0345, -0.1686, -0.0637, -0.1188, -0.0762]]],
       grad_fn=<SelectBackward>)
tensor([[ 0.1573,  0.0239, -0.1093,  0.1152, -0.0214],
        [-0.0398,  0.0588,  0.0438,  0.0889, -0.1593],
        [ 0.0337, -0.0114, -0.0363,  0.0549,  0.0545],
        [-0.0629, -0.0394, -0.0910, -0.0025, -0.0461],
        [ 0.0345, -0.1686, -0.0637, -0.1188, -0.0762]],
       grad_fn=<SelectBackward>)
Parameter containing:
tensor([ 0.1244, -0.0783,  0.1221,  0.1507, -0.0294, -0.0566,  0.1969,  0.1550,
         0.0330, -0.1946,  0.0554, -0.0257, -0.1056, -0.1283, -0.1869,  0.1958,
         0.1546, -0.1363,  0.0688, -0.0438, -0.0415, -0.1582, -0.1455,  0.1552,
         0.0025,  0.1939,  0.0101, -0.0620], requires_gra

# Both models sharing with each other

In [132]:
BATCH_SIZE_1 = 16
BATCH_SIZE_2 = 16

mnist_trainset_1 = datasets.MNIST(root='./data', train=True, download=True, transform=torchvision.transforms.ToTensor())
mnist_trainset_2 = datasets.MNIST(root='./data', train=True, download=True, transform=torchvision.transforms.ToTensor())
mnist_testset_1 = datasets.MNIST(root='./data', train=False, download=True, transform=torchvision.transforms.ToTensor())
mnist_testset_2 = datasets.MNIST(root='./data', train=False, download=True, transform=torchvision.transforms.ToTensor())

print(len(mnist_trainset_1))
print(len(mnist_testset_1))
print(len(mnist_trainset_2))
print(len(mnist_testset_2))

train_loader_1 = DataLoader(mnist_trainset_1, batch_size=BATCH_SIZE_1, shuffle=True, drop_last=True)
test_loader_1 = DataLoader(mnist_testset_1, batch_size=BATCH_SIZE_1, shuffle=True, drop_last=True)
train_loader_2 = DataLoader(mnist_trainset_2, batch_size=BATCH_SIZE_2, shuffle=True, drop_last=True)
test_loader_2 = DataLoader(mnist_testset_2, batch_size=BATCH_SIZE_2, shuffle=True, drop_last=True)

inf_train_1 = inf_loader_generator(train_loader_1)
inf_test_1 = inf_loader_generator(test_loader_1)
inf_train_2 = inf_loader_generator(train_loader_2)
inf_test_2 = inf_loader_generator(test_loader_2)

iters_per_epoch_train_1 = len(mnist_trainset_1)//BATCH_SIZE_1
iters_per_epoch_test_1 = len(mnist_testset_1)//BATCH_SIZE_1
iters_per_epoch_train_2 = len(mnist_trainset_2)//BATCH_SIZE_2
iters_per_epoch_test_2 = len(mnist_testset_2)//BATCH_SIZE_2

# next(iter(train_loader))[0].shape --> torch.Size([16, 1, 28, 28])
# This means we have 16 examples of 28x28 pixels in grayscale (i.e. no rgb channels, hence the one).

60000
10000
60000
10000


In [None]:
def share_both_mnist():
    # reset the model 1
    model_1 = mnist_Classifier()
    model_criterion_1 = nn.CrossEntropyLoss()
    model_optimizer_1 = optim.Adam(model_1.parameters(),lr=0.001)
    
    # reset the model 2
    model_2 = mnist_Classifier()
    model_criterion_2 = nn.CrossEntropyLoss()
    model_optimizer_2 = optim.Adam(model_2.parameters(),lr=0.001)

    # plotting criteria
    train_losses_1 = []
    test_accs_1 = []
    train_losses_2 = []
    test_accs_2 = []

    training = True
    model_steps_1 = 0
    model_steps_2 = 0
    train_steps_per_epoch_1 = 2000
    test_steps_per_epoch_1 = 300
    train_steps_per_epoch_2 = 2000
    test_steps_per_epoch_2 = 300
    print("Model 1: Start of Epoch %d" % (model_steps_1/train_steps_per_epoch_1))
    print("Model 2: Start of Epoch %d" % (model_steps_2/train_steps_per_epoch_2))
    while training:
            
        # model 1
        x_batch_train_1, y_batch_train_1 = next(inf_train_1)
        #zero out the gradients from the previous iteration
        model_optimizer_1.zero_grad()

        #compute loss
        outputs_1 = model_1.forward(x_batch_train_1)
        loss_1 = model_criterion_1(outputs_1, y_batch_train_1)
        loss_1.backward()
        train_losses_1.append(loss_1.item())
        
        model_1_params = list(model_1.parameters())
        
        #perform the backpropagation
        
        if model_steps_1 % train_steps_per_epoch_1 == 0:
            print("Model 1 Average Train Loss:", np.mean(train_losses_1))
            test_accs_1.append(mnist_accuracy_2(model_1, inf_test_1, test_steps_per_epoch_1))
            print("Accuracy:", test_accs_1[-1])
            print("Model 1: Start of Epoch %d" % (model_steps_1/train_steps_per_epoch_1))
        
        # model 2
        x_batch_train_2, y_batch_train_2 = next(inf_train_2)
        #zero out the gradients from the previous iteration
        model_optimizer_2.zero_grad()

        #compute loss
        outputs_2 = model_2.forward(x_batch_train_2)
        loss_2 = model_criterion_2(outputs_2, y_batch_train_2)
        loss_2.backward()
        train_losses_2.append(loss_2.item())
        
        # Add model 1's gradient to model 2
        model_2_params = list(model_2.parameters())
        for p,q in zip(model_1_params, model_2_params):
            q.grad += p.grad
        
        #perform the backpropagation
        # NEED TO MAKE A DEEP COPY HERE?~?
        model_optimizer_1.step()
        model_steps_1 += 1
        model_optimizer_2.step()
        model_steps_2 += 1
        
        if model_steps_2 % train_steps_per_epoch_2 == 0:
            print("Model 2 Average Train Loss:", np.mean(train_losses_2))
            test_accs_2.append(mnist_accuracy_2(model_2, inf_test_2, test_steps_per_epoch_2))
            print("Accuracy:", test_accs_2[-1])
            print("Model 2: Start of Epoch %d" % (model_steps_2/train_steps_per_epoch_2))
            
        if model_steps_1 > (train_steps_per_epoch_1*20) + 1:
            break
        
    return [mnist_accuracy_2(model_1, inf_test_1, test_steps_per_epoch_1*6), (train_losses_1, test_accs_1),
            mnist_accuracy_2(model_2, inf_test_2, test_steps_per_epoch_2*6), (train_losses_2, test_accs_2)]

In [None]:
share_both_results = share_both_mnist()

# No sharing vs 1-Sharing vs 2-Sharing Comparison

In [131]:
share_gradient_results[1][0]

[2.304023265838623,
 2.2921273708343506,
 2.3246054649353027,
 2.307582378387451,
 2.3011348247528076,
 2.310779333114624,
 2.29919171333313,
 2.3031556606292725,
 2.2981133460998535,
 2.3027493953704834,
 2.3015964031219482,
 2.303175449371338,
 2.298534870147705,
 2.294466018676758,
 2.296642780303955,
 2.3009636402130127,
 2.292402505874634,
 2.301198959350586,
 2.2488698959350586,
 2.273270606994629,
 2.2430574893951416,
 2.309187412261963,
 2.3498151302337646,
 2.3289361000061035,
 2.2262306213378906,
 2.2701902389526367,
 2.2388293743133545,
 2.199167490005493,
 2.249704360961914,
 2.238769769668579,
 2.218247652053833,
 2.2530252933502197,
 2.276012420654297,
 2.285748243331909,
 2.1479697227478027,
 2.1857051849365234,
 2.1023106575012207,
 2.128092050552368,
 2.264326810836792,
 2.2410314083099365,
 2.2151641845703125,
 2.2920875549316406,
 2.214625835418701,
 2.22855281829834,
 2.1520938873291016,
 2.134685516357422,
 2.246134042739868,
 2.177459955215454,
 2.1898415088653564

# OK. We're doing shared encoder now. 

In [204]:
def encdec_accuracy(encoder, decoder, inf_gen, test_steps):
    correct = 0
    num_data = 0

    #grab a batch from the test loader
    for j in range(test_steps):
        examples, labels = next(inf_gen)
        embedding = encoder.forward(examples)
        outputs = decoder.forward(embedding)

        #for each output in the batch, check if the label is correct
        for i, output in enumerate(outputs):
            num_data += 1

            max_i = np.argmax(output.detach().numpy())
            if max_i == labels[i]:
                correct += 1

    acc = float(correct)/num_data
    
    return acc

In [203]:
BATCH_SIZE_1 = 16
BATCH_SIZE_2 = 16

mnist_trainset_1 = datasets.MNIST(root='./data', train=True, download=True, transform=torchvision.transforms.ToTensor())
mnist_trainset_2 = datasets.MNIST(root='./data', train=True, download=True, transform=torchvision.transforms.ToTensor())
mnist_testset_1 = datasets.MNIST(root='./data', train=False, download=True, transform=torchvision.transforms.ToTensor())
mnist_testset_2 = datasets.MNIST(root='./data', train=False, download=True, transform=torchvision.transforms.ToTensor())

print(len(mnist_trainset_1))
print(len(mnist_testset_1))
print(len(mnist_trainset_2))
print(len(mnist_testset_2))

max_trainsteps_1 = len(mnist_trainset_1) // BATCH_SIZE_1
max_teststeps_1 = len(mnist_testset_1) // BATCH_SIZE_1
max_trainsteps_2 = len(mnist_trainset_2) // BATCH_SIZE_2
max_teststeps_2 = len(mnist_testset_2) // BATCH_SIZE_2

print(max_trainsteps_1)
print(max_teststeps_1)
print(max_trainsteps_2)
print(max_teststeps_2)

train_loader_1 = DataLoader(mnist_trainset_1, batch_size=BATCH_SIZE_1, shuffle=True, drop_last=True)
test_loader_1 = DataLoader(mnist_testset_1, batch_size=BATCH_SIZE_1, shuffle=True, drop_last=True)
train_loader_2 = DataLoader(mnist_trainset_2, batch_size=BATCH_SIZE_2, shuffle=True, drop_last=True)
test_loader_2 = DataLoader(mnist_testset_2, batch_size=BATCH_SIZE_2, shuffle=True, drop_last=True)

inf_train_1 = inf_loader_generator(train_loader_1)
inf_test_1 = inf_loader_generator(test_loader_1)
inf_train_2 = inf_loader_generator(train_loader_2)
inf_test_2 = inf_loader_generator(test_loader_2)

iters_per_epoch_train_1 = len(mnist_trainset_1)//BATCH_SIZE_1
iters_per_epoch_test_1 = len(mnist_testset_1)//BATCH_SIZE_1
iters_per_epoch_train_2 = len(mnist_trainset_2)//BATCH_SIZE_2
iters_per_epoch_test_2 = len(mnist_testset_2)//BATCH_SIZE_2

# next(iter(train_loader))[0].shape --> torch.Size([16, 1, 28, 28])
# This means we have 16 examples of 28x28 pixels in grayscale (i.e. no rgb channels, hence the one).

60000
10000
60000
10000
3750
625
3750
625


In [205]:
class mnist_Classifier(nn.Module):
    def __init__(self):
        super(mnist_Classifier, self).__init__()
        self.conv1 = nn.Conv2d(1, 28, kernel_size=(5,5))
        self.conv2 = nn.Conv2d(28, 32, kernel_size=(5,5))
        #ok, i undersatnd this now. the output from conv2 is 32 channels. the remaining width and height after
        # two 5x5 filters with NO PADDING and STRIDE 1 (defaults) is 20x20. So 32 channels * 20 h * 20 w
        self.fc1 = nn.Linear(32*20*20, 16)
        self.fc2 = nn.Linear(16, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = nn.ReLU()(x)
        x = self.conv2(x)
        x = nn.ReLU()(x)
        # print(x.size()) --> torch.Size([16, 32, 20, 20])
        x = x.view(-1, 32*20*20)
        x = self.fc1(x)
        x = nn.ReLU()(x)
        x = self.fc2(x)
        x = nn.ReLU()(x)
        return torch.softmax(x,dim=1)

class EncoderCNN(nn.Module):
    def __init__(self, input_size=1, hidden_size=28):
        super(EncoderCNN, self).__init__()
        self.hidden_size = hidden_size
        self.conv = nn.Conv2d(input_size, hidden_size, kernel_size=(5,5))
        self.output_shape = (hidden_size,24,24)

    def forward(self, x):
        x = self.conv(x)
        x = nn.ReLU()(x)
        return x
    
class DecoderCNN(nn.Module):
    def __init__(self, hidden_size=28):
        super(DecoderCNN, self).__init__()
        self.hidden_size = hidden_size
        self.conv = nn.Conv2d(hidden_size, 32, kernel_size=(5,5))
        self.fc1 = nn.Linear(32*20*20, 16)
        self.fc2 = nn.Linear(16, 10)

    def forward(self, x):
        x = self.conv(x)
        x = nn.ReLU()(x)
        x = x.view(-1, 32*20*20)
        x = self.fc1(x)
        x = nn.ReLU()(x)
        x = self.fc2(x)
        x = nn.ReLU()(x)
        return torch.softmax(x,dim=1)

In [None]:
def train_encdec():
    
    # initialize the models
    hidden_size = 28
    encoder = EncoderCNN(1, hidden_size)
    decoder = DecoderCNN(hidden_size)
    
    # encoder 
    encoder_optimizer = optim.Adam(encoder.parameters(),lr=0.001)
    
    # decoder 
    decoder_criterion = nn.CrossEntropyLoss()
    decoder_optimizer = optim.Adam(decoder.parameters(),lr=0.001)
    
    # number of epochs and iterations
    training = True
    epochs = 20
    train_steps_per_epoch = 2000
    test_steps_per_epoch = 300
    model_steps = 0
    
    # plotting criteria
    train_losses = []
    test_accs = []  
    
    print("Start of Epoch %d" % (model_steps/train_steps_per_epoch))
    while training:
        
        # get next batch
        x_batch_train, y_batch_train = next(inf_train_1)
        
        # zero out the gradients from the previous iteration
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        #compute loss
        embedding = encoder.forward(x_batch_train)
        outputs = decoder.forward(embedding)
        loss = decoder_criterion(outputs, y_batch_train)
        loss.backward()
        train_losses.append(loss.item())
        
        # perform the backpropagation
        encoder_optimizer.step()
        decoder_optimizer.step()
        
        model_steps += 1
        
        # perform the backpropagation
        if model_steps % train_steps_per_epoch == 0:
            print("Average Train Loss:", np.mean(train_losses))
            test_accs.append(encdec_accuracy(encoder, decoder, inf_test_1, test_steps_per_epoch))
            print("Test Accuracy:", test_accs[-1])
            print("Start of Epoch %d" % (model_steps/train_steps_per_epoch))
            
        if model_steps > (train_steps_per_epoch*epochs) + 1:
            break
    return [encdec_accuracy(encoder, decoder, inf_test_1, test_steps_per_epoch*2), (train_losses, test_accs)]

In [225]:
def train_enc2dec():
    
    # initialize the models
    hidden_size = 28
    encoder = EncoderCNN(1, hidden_size)
    decoder = DecoderCNN(hidden_size)
    decoder2 = DecoderCNN(hidden_size)
    
    # encoder 
    encoder_optimizer = optim.Adam(encoder.parameters(),lr=0.001)
    
    # decoder 
    decoder_criterion = nn.CrossEntropyLoss()
    decoder_optimizer = optim.Adam(decoder.parameters(),lr=0.001)
    decoder_criterion2 = nn.CrossEntropyLoss()
    decoder_optimizer2 = optim.Adam(decoder2.parameters(),lr=0.001)
    
    
    # number of epochs and iterations
    training = True
    epochs = 20
    train_steps_per_epoch = 2000
    test_steps_per_epoch = 300
    model_steps = 0
    
    # plotting criteria
    train_losses = []
    train_losses2 = []
    test_accs = []
    test_accs2 = []
    
    print("Start of Epoch %d" % (model_steps/train_steps_per_epoch))
    while training:
        
        # get next batch
        x_batch_train, y_batch_train = next(inf_train_1)
        x_batch_train2, y_batch_train2 = next(inf_train_2)
        
        # zero out the gradients from the previous iteration
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()
        decoder_optimizer2.zero_grad()

        # compute loss
        embedding = encoder.forward(x_batch_train)
        outputs = decoder.forward(embedding)
        loss = decoder_criterion(outputs, y_batch_train)
        loss.backward()
        train_losses.append(loss.item())
        
        # perform the backpropagation
        encoder_optimizer.step()
        decoder_optimizer.step()
        #DON'T FORGET TO RE-ZERO THE ENCODER GRADIENT!!! OTHERWISE YOU'RE REUSING THE FIRST GRAD
        encoder_optimizer.zero_grad()
    
        # compute loss and backprop for second model
        embedding = encoder.forward(x_batch_train2)
        outputs = decoder2.forward(embedding)
        loss = decoder_criterion2(outputs, y_batch_train2)
        loss.backward()
        train_losses2.append(loss.item())
        encoder_optimizer.step()
        decoder_optimizer2.step()
        
        model_steps += 1
        
        # perform the backpropagation
        if model_steps % train_steps_per_epoch == 0:
            print("Average Train Loss:", np.mean(train_losses))
            print("Average Train Loss2:", np.mean(train_losses2))
            test_accs.append(encdec_accuracy(encoder, decoder, inf_test_1, test_steps_per_epoch))
            test_accs2.append(encdec_accuracy(encoder, decoder2, inf_test_2, test_steps_per_epoch))
            print("Test Accuracy:", test_accs[-1])
            print("Test Accuracy2:", test_accs2[-1])
            print("Start of Epoch %d" % (model_steps/train_steps_per_epoch))
            
        if model_steps > (train_steps_per_epoch*epochs) + 1:
            break
    return [
        [encdec_accuracy(encoder, decoder, inf_test_1, test_steps_per_epoch*2), (train_losses, test_accs)],
        [encdec_accuracy(encoder, decoder2, inf_test_2, test_steps_per_epoch*2), (train_losses2, test_accs2)]
    ]

In [208]:
for i in range(20):
    train_encdec()
    print("----------------------")

Start of Epoch 0
Average Train Loss: 1.6856881410479545
Test Accuracy: 0.8764583333333333
Start of Epoch 1
Average Train Loss: 1.6145659938454628
Test Accuracy: 0.9714583333333333
Start of Epoch 2
Average Train Loss: 1.573092342098554
Test Accuracy: 0.9739583333333334
Start of Epoch 3
Average Train Loss: 1.5520765731930732
Test Accuracy: 0.9760416666666667
Start of Epoch 4
Average Train Loss: 1.5392769233345986
Test Accuracy: 0.975
Start of Epoch 5
Average Train Loss: 1.530297676285108
Test Accuracy: 0.9783333333333334
Start of Epoch 6
Average Train Loss: 1.5235731596691269
Test Accuracy: 0.9783333333333334
Start of Epoch 7
Average Train Loss: 1.5183095410987735
Test Accuracy: 0.9754166666666667
Start of Epoch 8
Average Train Loss: 1.5144596832725736
Test Accuracy: 0.9808333333333333
Start of Epoch 9
Average Train Loss: 1.5110546961665154
Test Accuracy: 0.9808333333333333
Start of Epoch 10
Average Train Loss: 1.5083946413018487
Test Accuracy: 0.97875
Start of Epoch 11
Average Train Los

KeyboardInterrupt: 

In [227]:
results = []
for i in range(10):
    results.append(train_enc2dec())

Start of Epoch 0
Average Train Loss: 1.8712317019104958
Average Train Loss2: 1.6492022513747215
Test Accuracy: 0.5779166666666666
Test Accuracy2: 0.8677083333333333
Start of Epoch 1
Average Train Loss: 1.8107159850001335
Average Train Loss2: 1.619957497626543
Test Accuracy: 0.6785416666666667
Test Accuracy2: 0.879375
Start of Epoch 2
Average Train Loss: 1.7870267972548803
Average Train Loss2: 1.6088859236240387
Test Accuracy: 0.6875
Test Accuracy2: 0.8779166666666667
Start of Epoch 3
Average Train Loss: 1.7744839192330837
Average Train Loss2: 1.6020357965826988
Test Accuracy: 0.671875
Test Accuracy2: 0.8820833333333333
Start of Epoch 4
Average Train Loss: 1.7662615366697312
Average Train Loss2: 1.5959362713456153
Test Accuracy: 0.6827083333333334
Test Accuracy2: 0.9610416666666667
Start of Epoch 5
Average Train Loss: 1.7607055297791958
Average Train Loss2: 1.5783692828516165
Test Accuracy: 0.6889583333333333
Test Accuracy2: 0.9772916666666667
Start of Epoch 6
Average Train Loss: 1.7568

Average Train Loss: 1.530325853925485
Average Train Loss2: 1.5966617670242602
Test Accuracy: 0.965
Test Accuracy2: 0.871875
Start of Epoch 13
Average Train Loss: 1.5268194014685494
Average Train Loss2: 1.5956694611523832
Test Accuracy: 0.9797916666666666
Test Accuracy2: 0.8754166666666666
Start of Epoch 14
Average Train Loss: 1.5235993537068366
Average Train Loss2: 1.594873136695226
Test Accuracy: 0.9785416666666666
Test Accuracy2: 0.875625
Start of Epoch 15
Average Train Loss: 1.5209957524612545
Average Train Loss2: 1.5942536441385746
Test Accuracy: 0.98125
Test Accuracy2: 0.8785416666666667
Start of Epoch 16
Average Train Loss: 1.5186903234369615
Average Train Loss2: 1.5939864382743836
Test Accuracy: 0.980625
Test Accuracy2: 0.8764583333333333
Start of Epoch 17
Average Train Loss: 1.516714824590418
Average Train Loss2: 1.59344484651751
Test Accuracy: 0.9829166666666667
Test Accuracy2: 0.873125
Start of Epoch 18
Average Train Loss: 1.5148578729472661
Average Train Loss2: 1.59309546969

Test Accuracy: 0.8747916666666666
Test Accuracy2: 0.968125
Start of Epoch 4
Average Train Loss: 1.60144730104208
Average Train Loss2: 1.5258106170892716
Test Accuracy: 0.885625
Test Accuracy2: 0.9816666666666667
Start of Epoch 5
Average Train Loss: 1.5978533722360928
Average Train Loss2: 1.5195492649277051
Test Accuracy: 0.8841666666666667
Test Accuracy2: 0.9785416666666666
Start of Epoch 6
Average Train Loss: 1.5950458746637617
Average Train Loss2: 1.5147152874299459
Test Accuracy: 0.88625
Test Accuracy2: 0.9802083333333333
Start of Epoch 7
Average Train Loss: 1.5930699144974352
Average Train Loss2: 1.5110556743443013
Test Accuracy: 0.8797916666666666
Test Accuracy2: 0.9722916666666667
Start of Epoch 8
Average Train Loss: 1.5914693107273843
Average Train Loss2: 1.5084449854824278
Test Accuracy: 0.8783333333333333
Test Accuracy2: 0.98
Start of Epoch 9
Average Train Loss: 1.5900892759561538
Average Train Loss2: 1.5062758719086646
Test Accuracy: 0.8877083333333333
Test Accuracy2: 0.97916

Average Train Loss: 1.661788646876812
Average Train Loss2: 1.6618691991392305
Test Accuracy: 0.78625
Test Accuracy2: 0.881875
Start of Epoch 17
Average Train Loss: 1.661386869897445
Average Train Loss2: 1.6609014891584715
Test Accuracy: 0.7785416666666667
Test Accuracy2: 0.8833333333333333
Start of Epoch 18
Average Train Loss: 1.6609882007805925
Average Train Loss2: 1.6599515284048882
Test Accuracy: 0.778125
Test Accuracy2: 0.8852083333333334
Start of Epoch 19
Average Train Loss: 1.6608090828627349
Average Train Loss2: 1.6590979658812284
Test Accuracy: 0.7808333333333334
Test Accuracy2: 0.8820833333333333
Start of Epoch 20
Start of Epoch 0
Average Train Loss: 1.8353909572958946
Average Train Loss2: 1.6326385554671288
Test Accuracy: 0.6875
Test Accuracy2: 0.9572916666666667
Start of Epoch 1
Average Train Loss: 1.7664109604656697
Average Train Loss2: 1.5686374396383762
Test Accuracy: 0.7820833333333334
Test Accuracy2: 0.9670833333333333
Start of Epoch 2
Average Train Loss: 1.734235445082

In [233]:
''' Return value from one training session
results.append(train_enc2dec())

result = 
[
  [encdec_accuracy(encoder, decoder, inf_test_1, test_steps_per_epoch*2), (train_losses, test_accs)],
  [encdec_accuracy(encoder, decoder2, inf_test_2, test_steps_per_epoch*2), (train_losses2, test_accs2)]
]

20 epochs trained 20 times, with two models per training. So 40 models each trained for 20 epochs.
'''

max_accs = []
max_accs2 = []

for result in results:
    max_accs.append(max(result[0][1][1]))
    max_accs2.append(max(result[1][1][1]))

print(max_accs)
print(max_accs2)
print(np.mean(max_accs))
print(np.mean(max_accs2))
print(np.mean(max_accs + max_accs2))

[0.69375, 0.791875, 0.9829166666666667, 0.9825, 0.7945833333333333, 0.8929166666666667, 0.88875, 0.78625, 0.983125, 0.6122916666666667]
[0.98375, 0.480625, 0.8877083333333333, 0.9829166666666667, 0.9804166666666667, 0.9816666666666667, 0.890625, 0.8891666666666667, 0.9802083333333333, 0.9775]
0.8408958333333334
0.9034583333333334
0.8721770833333332
