# Scalability of *in situ* backpropagation

In this notebook, we explore the scalability of *in situ* backpropagation as it pertains to the tradeoff between noise and energy efficiency and latency of photonic devices. 
- As far as scalability of the photonic advantage, we do our best to incorporate all of the different elements that contribute to the total energy consumption in the hybrid photonic neural network design, dominated by optoelectronic conversions and signal amplification, and any assumptions for this calculation are provided in the main text and/or Supplementary Material of the paper.
- As far as noise error scaling, we explore the tradeoffs of various errors (e.g., systematic in the various photonic elements and random noise at the photodetector). We then perform large-scale simulations on MNIST data to show that realistic problems can be solved using our approach in the presence of error.



In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from PIL import Image, ImageFilter
import numpy as np

In [3]:
y_train_path = data_file_path + "/images_manufacturer_train.txt"
y_test_path = data_file_path + "/images_manufacturer_test.txt"

def populate_set(path):
    manufacturer_dict = dict()
    with open(path, 'r') as file:
        for line in file:
            line = line.strip().split()
            if len(line) == 2:
                key = line[0]
                value = line[1]
                manufacturer_dict[key] = value
    return manufacturer_dict

y_train_dict = populate_set(y_train_path)
y_test_dict = populate_set(y_test_path)

In [5]:
# one hot encode the manufacturers
possible_labels = sorted(list(set(y_train_dict.values())))
print(possible_labels)
num_manufacturers = len(possible_labels)

# one_hot_dict = {}
# for manufacturer_idx in range(len(possible_labels)):
#     one_hot_dict[possible_labels[manufacturer_idx]] = np.zeros(num_manufacturers)
#     one_hot_dict[possible_labels[manufacturer_idx]][manufacturer_idx] = 1

manufacturer_dict = {}

for manufacturer_idx in range(len(possible_labels)):
    manufacturer_dict[possible_labels[manufacturer_idx]] = manufacturer_idx

['ATR', 'Airbus', 'Antonov', 'Beechcraft', 'Boeing', 'Canadair', 'Cessna', 'Dornier', 'Embraer', 'Eurofighter', 'Fairchild', 'Fokker', 'Ilyushin', 'Panavia', 'Piper', 'Robin', 'Saab', 'Supermarine', 'Tupolev', 'Yakovlev']


In [8]:
from utils import norm_inputs

WIDTH = 64

x_train_raw = np.zeros((len(y_train_dict), WIDTH, WIDTH))
x_test_raw = np.zeros((len(y_test_dict), WIDTH, WIDTH))
# y_train = np.zeros((len(y_train_dict), num_manufacturers))
y_train = np.zeros((len(y_train_dict),))
# y_test = np.zeros((len(y_test_dict), num_manufacturers))
y_test = np.zeros((len(y_test_dict),))


def populate():
    def populate_dataset(label_dict, data, labels):
        counter = 0
        for img_num in label_dict.keys():
            path = f"/Users/matthewho/Photonic_computing/simphox-notebooks/aircraft/fgvc-aircraft-2013b/data/images/{img_num}.jpg"
            img = Image.open(path)
            img = img.convert('L')                                  # grayscale
            width, height = img.size
            cropped_img = img.crop((0, 0, width, height - 20))        # crop the copyright
            filtered_img = cropped_img.filter(ImageFilter.GaussianBlur(radius = 5))    # not sure what to put as SD, putting 5 for now
            downsampled_img = filtered_img.resize((WIDTH, WIDTH))         # downsample to 256 x 256
            data[counter, :, :] = downsampled_img                     # add to x_train
            # labels[counter,:] = one_hot_dict[label_dict[img_num]]     # add correct label
            labels[counter] = manufacturer_dict[label_dict[img_num]]
            counter += 1
    populate_dataset(y_train_dict, x_train_raw, y_train)
    populate_dataset(y_test_dict, x_test_raw, y_test)

populate()

In [9]:
train_perm = np.random.permutation(x_train_raw.shape[0])
x_train_raw = x_train_raw[train_perm]
y_train = y_train[train_perm]
test_perm = np.random.permutation(x_test_raw.shape[0])
x_test_raw = x_test_raw[test_perm]
y_test = y_test[test_perm]

In [10]:
mean_pixel = np.mean(np.vstack([x_train_raw, x_test_raw]))
std_pixel = np.std(np.vstack([x_train_raw, x_test_raw]))

x_train = (x_train_raw - mean_pixel)/std_pixel
x_test = (x_test_raw - mean_pixel)/std_pixel

In [11]:
NUM_TRAIN = np.shape(x_train)[0]
NUM_TEST = np.shape(x_test)[0]

In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch.utils.data import sampler

import torchvision.datasets as dset
import torchvision.transforms as T

train_dataset = torch.utils.data.TensorDataset(torch.from_numpy(np.expand_dims(x_train, axis = 1)), torch.from_numpy(y_train))
test_dataset = torch.utils.data.TensorDataset(torch.from_numpy(np.expand_dims(x_test, axis = 1)), torch.from_numpy(y_test))

In [13]:
USE_GPU = True
dtype = torch.float32 # We will be using float throughout this tutorial.

if USE_GPU and torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Constant to control how frequently we print train loss.
print_every = 100
print('using device:', device)

using device: cpu


In [14]:
print(train_dataset[0][0].shape)

torch.Size([1, 64, 64])


In [15]:
batch_size = 64

# in theory they should be randomized already, but it doesn't hurt to randomly choose test data anyways
aircraft_train = DataLoader(train_dataset, batch_size=batch_size, 
                            sampler=sampler.SubsetRandomSampler(range(NUM_TRAIN)))

aircraft_test = DataLoader(test_dataset, batch_size=batch_size, 
                           sampler=sampler.SubsetRandomSampler(range(NUM_TEST)))

In [16]:
def flatten(x):
    N = x.shape[0]
    return x.view(N, -1)

class Flatten(nn.Module):
    def forward(self, x):
        return flatten(x)

In [17]:
import torch.nn.functional as F

def check_accuracy(loader, model):
    # print('Checking accuracy on set')   
    num_correct = 0
    num_samples = 0
    model.eval()  # set model to evaluation mode
    with torch.no_grad():
        for x, y in loader:
            x = x.to(device=device, dtype=dtype)  # move to device, e.g. GPU
            y = y.to(device=device, dtype=torch.long)
            scores = model(x)
            _, preds = scores.max(1)
            num_correct += (preds == y).sum()
            num_samples += preds.size(0)
        acc = float(num_correct) / num_samples
        print('Got %d / %d correct (%.2f)' % (num_correct, num_samples, 100 * acc))

def train(model, optimizer, epochs=50):
    """
    Train a model on CIFAR-10 using the PyTorch Module API.
    
    Inputs:
    - model: A PyTorch Module giving the model to train.
    - optimizer: An Optimizer object we will use to train the model
    - epochs: (Optional) A Python integer giving the number of epochs to train for
    
    Returns: Nothing, but prints model accuracies during training.
    """
    model = model.to(device=device)  # move the model parameters to CPU/GPU
    for e in range(epochs):
        print(f"Epoch number: {e}")
        for t, (x, y) in enumerate(aircraft_train):
            model.train()  # put model to training mode
            x = x.to(device=device, dtype=dtype)  # move to device, e.g. GPU
            y = y.to(device=device, dtype=torch.long)

            scores = model(x)
            loss = F.cross_entropy(scores, y)

            # Zero out all of the gradients for the variables which the optimizer
            # will update.
            optimizer.zero_grad()

            # This is the backwards pass: compute the gradient of the loss with
            # respect to each  parameter of the model.
            loss.backward()

            # Actually update the parameters of the model using the gradients
            # computed by the backwards pass.
            optimizer.step()

            if t % print_every == 0:
                print('Loss = %.4f' % (loss.item()))
                print("Train accuracy:")
                check_accuracy(aircraft_train, model)
                print("Test accuracy:")
                check_accuracy(aircraft_test, model)
                print()

In [19]:
channel_1 = 16
channel_2 = 8
learning_rate = 0.001

model = nn.Sequential(
    nn.Conv2d(1, channel_1, 5, padding=2),
    nn.ReLU(),
    nn.Dropout(p=0.2),
    nn.Conv2d(channel_1, channel_2, 3, padding=1),
    nn.ReLU(),
    nn.Dropout(p=0.2),
    Flatten(),
    nn.Dropout(p=0.8),
    nn.Linear(channel_2 * WIDTH * WIDTH, num_manufacturers)
)

optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9, nesterov=True)
train(model, optimizer)

Epoch number: 0
Loss = 2.9950
Train accuracy:
Got 373 / 2367 correct (15.76)
Test accuracy:
Got 375 / 2368 correct (15.84)

Epoch number: 1
Loss = 2.3649
Train accuracy:
Got 733 / 2367 correct (30.97)
Test accuracy:
Got 734 / 2368 correct (31.00)

Epoch number: 2
Loss = 2.1492
Train accuracy:
Got 753 / 2367 correct (31.81)
Test accuracy:
Got 730 / 2368 correct (30.83)

Epoch number: 3
Loss = 1.9978
Train accuracy:
Got 787 / 2367 correct (33.25)
Test accuracy:
Got 684 / 2368 correct (28.89)

Epoch number: 4
Loss = 2.0483
Train accuracy:
Got 839 / 2367 correct (35.45)
Test accuracy:
Got 716 / 2368 correct (30.24)

Epoch number: 5
Loss = 1.8814
Train accuracy:
Got 920 / 2367 correct (38.87)
Test accuracy:
Got 728 / 2368 correct (30.74)

Epoch number: 6
Loss = 1.9257
Train accuracy:
Got 960 / 2367 correct (40.56)
Test accuracy:
Got 749 / 2368 correct (31.63)

Epoch number: 7
Loss = 1.6156
Train accuracy:
Got 1027 / 2367 correct (43.39)
Test accuracy:
Got 765 / 2368 correct (32.31)

Epoch n