In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
from torch import nn
import torch.nn.functional as F

import torchvision

import torchsummary

from models.vgg import VGG
from utils.run import run_one_epoch

In [3]:
# Evaluation pipeline: convert to tensor, then normalize with fixed
# 3-channel mean/std constants.
transform_valid = torchvision.transforms.Compose(
    [
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(
            mean=(0.4914, 0.4822, 0.4465),
            std=(0.2023, 0.1994, 0.2010),
        ),
    ]
)

# Training pipeline: random crop (with padding) and random horizontal flip,
# followed by the exact same steps as the evaluation pipeline.
transform_train = torchvision.transforms.Compose(
    [
        torchvision.transforms.RandomCrop(size=32, padding=4),
        torchvision.transforms.RandomHorizontalFlip(),
        transform_valid,
    ]
)

# CIFAR-10 splits; both are stored under ./data and downloaded if missing.
trainset = torchvision.datasets.CIFAR10(
    root='./data',
    train=True,
    transform=transform_train,
    download=True,
)
validset = torchvision.datasets.CIFAR10(
    root='./data',
    train=False,
    transform=transform_valid,
    download=True,
)

# Only the training loader shuffles; everything else is shared between the two.
trainloader = torch.utils.data.DataLoader(
    dataset=trainset,
    batch_size=512,
    shuffle=True,
    num_workers=16,
    pin_memory=True,
)
validloader = torch.utils.data.DataLoader(
    dataset=validset,
    batch_size=512,
    shuffle=False,
    num_workers=16,
    pin_memory=True,
)

Files already downloaded and verified
Files already downloaded and verified


In [20]:
# Build the baseline network and print a per-layer summary for a single
# 3x32x32 input; the summary is computed on the CPU.
model = VGG('VGG16')
torchsummary.summary(model.cpu(), input_size=(3, 32, 32), device='cpu')

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 64, 32, 32]           1,792
       BatchNorm2d-2           [-1, 64, 32, 32]             128
              ReLU-3           [-1, 64, 32, 32]               0
            Conv2d-4           [-1, 64, 32, 32]          36,928
       BatchNorm2d-5           [-1, 64, 32, 32]             128
              ReLU-6           [-1, 64, 32, 32]               0
         MaxPool2d-7           [-1, 64, 16, 16]               0
            Conv2d-8          [-1, 128, 16, 16]          73,856
       BatchNorm2d-9          [-1, 128, 16, 16]             256
             ReLU-10          [-1, 128, 16, 16]               0
           Conv2d-11          [-1, 128, 16, 16]         147,584
      BatchNorm2d-12          [-1, 128, 16, 16]             256
             ReLU-13          [-1, 128, 16, 16]               0
        MaxPool2d-14            [-1, 12

In [5]:
# Train baseline model
# `model` is expected to exist already (it is created in an earlier cell).

optimizer = torch.optim.SGD(model.parameters(), lr=1e-1, momentum=0.9, weight_decay=5e-4)
criterion = nn.CrossEntropyLoss()
# NOTE(review): T_max=200 while the scheduler is stepped only `num_epochs` (50)
# times below, so just the first quarter of the annealing schedule is used --
# confirm this is intended.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200)

num_epochs = 50
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = model.to(device)

# Per-epoch metrics; the lists in 'train' and 'valid' are index-aligned with 'epoch'.
history = {
    'epoch': [],
    'train': {'loss': [], 'accuracy': []},
    'valid': {'loss': [], 'accuracy': []},
}

for epoch in range(num_epochs):
    print(f'{epoch+1} / {num_epochs}')
    # Training pass: the optimizer is handed to run_one_epoch.
    train_loss, train_acc = run_one_epoch(model, trainloader, criterion=criterion, optimizer=optimizer, device=device)
    print(f'\tTrain | Loss: {train_loss:.2e} Accuracy: {train_acc:.2%}')
    # Validation pass: no optimizer is passed (presumably evaluation only --
    # verify against utils.run.run_one_epoch).
    valid_loss, valid_acc = run_one_epoch(model, validloader, criterion=criterion, device=device)
    # Label fixed: these are validation metrics, previously printed as "Train".
    print(f'\tValid | Loss: {valid_loss:.2e} Accuracy: {valid_acc:.2%}')
    scheduler.step()

    # Note: the stored epoch index is 0-based, while the printed one is 1-based.
    history['epoch'].append(epoch)
    history['train']['loss'].append(train_loss)
    history['train']['accuracy'].append(train_acc)
    history['valid']['loss'].append(valid_loss)
    history['valid']['accuracy'].append(valid_acc)
    

1 / 50
	Train | Loss: 2.90e+00 Accuracy: 10.91%
	Train | Loss: 2.32e+00 Accuracy: 10.38%
2 / 50
	Train | Loss: 2.31e+00 Accuracy: 11.34%
	Train | Loss: 2.29e+00 Accuracy: 10.58%
3 / 50
	Train | Loss: 2.26e+00 Accuracy: 14.52%
	Train | Loss: 2.15e+00 Accuracy: 19.22%
4 / 50
	Train | Loss: 2.08e+00 Accuracy: 20.55%
	Train | Loss: 1.99e+00 Accuracy: 22.12%
5 / 50
	Train | Loss: 1.94e+00 Accuracy: 23.45%
	Train | Loss: 2.19e+00 Accuracy: 17.23%
6 / 50
	Train | Loss: 1.87e+00 Accuracy: 25.68%
	Train | Loss: 1.88e+00 Accuracy: 26.17%
7 / 50
	Train | Loss: 1.81e+00 Accuracy: 28.17%
	Train | Loss: 1.80e+00 Accuracy: 27.95%
8 / 50
	Train | Loss: 1.75e+00 Accuracy: 30.98%
	Train | Loss: 1.94e+00 Accuracy: 28.54%
9 / 50
	Train | Loss: 1.66e+00 Accuracy: 34.27%
	Train | Loss: 1.68e+00 Accuracy: 33.25%
10 / 50
	Train | Loss: 1.56e+00 Accuracy: 39.19%
	Train | Loss: 1.63e+00 Accuracy: 36.71%
11 / 50
	Train | Loss: 1.46e+00 Accuracy: 43.96%
	Train | Loss: 1.53e+00 Accuracy: 42.65%
12 / 50
	Train | Lo

In [6]:
# Train model with no activation
# Every activation slot is filled with nn.Identity via `act_cls`.
model_no_act = VGG('VGG16', act_cls=nn.Identity)

optimizer = torch.optim.SGD(model_no_act.parameters(), lr=1e-1, momentum=0.9, weight_decay=5e-4)
criterion = nn.CrossEntropyLoss()
# NOTE(review): T_max=200 while the scheduler is stepped only `num_epochs` (50)
# times below, so just the first quarter of the annealing schedule is used --
# confirm this is intended.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200)

num_epochs = 50
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model_no_act = model_no_act.to(device)

# Per-epoch metrics; the lists in 'train' and 'valid' are index-aligned with 'epoch'.
history_no_act = {
    'epoch': [],
    'train': {'loss': [], 'accuracy': []},
    'valid': {'loss': [], 'accuracy': []},
}

for epoch in range(num_epochs):
    print(f'{epoch+1} / {num_epochs}')
    # Training pass: the optimizer is handed to run_one_epoch.
    train_loss, train_acc = run_one_epoch(model_no_act, trainloader, criterion=criterion, optimizer=optimizer, device=device)
    print(f'\tTrain | Loss: {train_loss:.2e} Accuracy: {train_acc:.2%}')
    # Validation pass: no optimizer is passed (presumably evaluation only --
    # verify against utils.run.run_one_epoch).
    valid_loss, valid_acc = run_one_epoch(model_no_act, validloader, criterion=criterion, device=device)
    # Label fixed: these are validation metrics, previously printed as "Train".
    print(f'\tValid | Loss: {valid_loss:.2e} Accuracy: {valid_acc:.2%}')
    scheduler.step()

    # Note: the stored epoch index is 0-based, while the printed one is 1-based.
    history_no_act['epoch'].append(epoch)
    history_no_act['train']['loss'].append(train_loss)
    history_no_act['train']['accuracy'].append(train_acc)
    history_no_act['valid']['loss'].append(valid_loss)
    history_no_act['valid']['accuracy'].append(valid_acc)

1 / 50
	Train | Loss: 4.07e+00 Accuracy: 15.46%
	Train | Loss: 2.08e+00 Accuracy: 22.61%
2 / 50
	Train | Loss: 2.03e+00 Accuracy: 23.07%
	Train | Loss: 1.98e+00 Accuracy: 22.55%
3 / 50
	Train | Loss: 1.86e+00 Accuracy: 28.26%
	Train | Loss: 1.85e+00 Accuracy: 31.15%
4 / 50
	Train | Loss: 1.74e+00 Accuracy: 34.00%
	Train | Loss: 1.77e+00 Accuracy: 34.08%
5 / 50
	Train | Loss: 1.64e+00 Accuracy: 37.73%
	Train | Loss: 1.79e+00 Accuracy: 33.86%
6 / 50
	Train | Loss: 1.58e+00 Accuracy: 40.47%
	Train | Loss: 1.87e+00 Accuracy: 31.94%
7 / 50
	Train | Loss: 1.50e+00 Accuracy: 43.33%
	Train | Loss: 1.55e+00 Accuracy: 42.05%
8 / 50
	Train | Loss: 1.46e+00 Accuracy: 45.29%
	Train | Loss: 1.63e+00 Accuracy: 40.40%
9 / 50
	Train | Loss: 1.40e+00 Accuracy: 48.31%
	Train | Loss: 1.48e+00 Accuracy: 46.85%
10 / 50
	Train | Loss: 1.35e+00 Accuracy: 50.55%
	Train | Loss: 1.47e+00 Accuracy: 45.77%
11 / 50
	Train | Loss: 1.29e+00 Accuracy: 52.96%
	Train | Loss: 1.29e+00 Accuracy: 53.87%
12 / 50
	Train | Lo

In [13]:
# Train model with no pooling, but with ReLU activation
# NOTE(review): pool='strided' presumably swaps the pooling layers for strided
# convolutions -- confirm against models.vgg.VGG.

model_no_pool = VGG('VGG16', pool='strided')

optimizer = torch.optim.SGD(model_no_pool.parameters(), lr=1e-1, momentum=0.9, weight_decay=5e-4)
criterion = nn.CrossEntropyLoss()
# NOTE(review): T_max=200 while the scheduler is stepped only `num_epochs` (50)
# times below, so just the first quarter of the annealing schedule is used --
# confirm this is intended.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200)

num_epochs = 50
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model_no_pool = model_no_pool.to(device)

# Per-epoch metrics; the lists in 'train' and 'valid' are index-aligned with 'epoch'.
history_no_pool = {
    'epoch': [],
    'train': {'loss': [], 'accuracy': []},
    'valid': {'loss': [], 'accuracy': []},
}

for epoch in range(num_epochs):
    print(f'{epoch+1} / {num_epochs}')
    # Training pass: the optimizer is handed to run_one_epoch.
    train_loss, train_acc = run_one_epoch(model_no_pool, trainloader, criterion=criterion, optimizer=optimizer, device=device)
    print(f'\tTrain | Loss: {train_loss:.2e} Accuracy: {train_acc:.2%}')
    # Validation pass: no optimizer is passed (presumably evaluation only --
    # verify against utils.run.run_one_epoch).
    valid_loss, valid_acc = run_one_epoch(model_no_pool, validloader, criterion=criterion, device=device)
    # Label fixed: these are validation metrics, previously printed as "Train".
    print(f'\tValid | Loss: {valid_loss:.2e} Accuracy: {valid_acc:.2%}')
    scheduler.step()

    # Note: the stored epoch index is 0-based, while the printed one is 1-based.
    history_no_pool['epoch'].append(epoch)
    history_no_pool['train']['loss'].append(train_loss)
    history_no_pool['train']['accuracy'].append(train_acc)
    history_no_pool['valid']['loss'].append(valid_loss)
    history_no_pool['valid']['accuracy'].append(valid_acc)

1 / 50
	Train | Loss: 1.86e+00 Accuracy: 31.64%
	Train | Loss: 2.08e+00 Accuracy: 40.61%
2 / 50
	Train | Loss: 1.55e+00 Accuracy: 43.00%
	Train | Loss: 1.47e+00 Accuracy: 45.65%
3 / 50
	Train | Loss: 1.41e+00 Accuracy: 48.62%
	Train | Loss: 1.57e+00 Accuracy: 46.52%
4 / 50
	Train | Loss: 1.31e+00 Accuracy: 52.44%
	Train | Loss: 1.22e+00 Accuracy: 56.11%
5 / 50
	Train | Loss: 1.19e+00 Accuracy: 57.23%
	Train | Loss: 1.11e+00 Accuracy: 60.31%
6 / 50
	Train | Loss: 1.11e+00 Accuracy: 60.29%
	Train | Loss: 1.04e+00 Accuracy: 63.25%
7 / 50
	Train | Loss: 1.05e+00 Accuracy: 62.39%
	Train | Loss: 1.01e+00 Accuracy: 64.11%
8 / 50
	Train | Loss: 1.00e+00 Accuracy: 64.46%
	Train | Loss: 9.54e-01 Accuracy: 66.56%
9 / 50
	Train | Loss: 9.56e-01 Accuracy: 66.41%
	Train | Loss: 9.81e-01 Accuracy: 66.12%
10 / 50
	Train | Loss: 9.15e-01 Accuracy: 67.80%
	Train | Loss: 8.92e-01 Accuracy: 69.50%
11 / 50
	Train | Loss: 8.83e-01 Accuracy: 69.00%
	Train | Loss: 9.14e-01 Accuracy: 68.00%
12 / 50
	Train | Lo

In [14]:
# Train model with no pooling and no activation
# Every activation slot is filled with nn.Identity via `act_cls`.
# NOTE(review): pool='strided' presumably swaps the pooling layers for strided
# convolutions -- confirm against models.vgg.VGG.

model_no_act_no_pool = VGG('VGG16', pool='strided', act_cls=nn.Identity)

optimizer = torch.optim.SGD(model_no_act_no_pool.parameters(), lr=1e-1, momentum=0.9, weight_decay=5e-4)
criterion = nn.CrossEntropyLoss()
# NOTE(review): T_max=200 while the scheduler is stepped only `num_epochs` (50)
# times below, so just the first quarter of the annealing schedule is used --
# confirm this is intended.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200)

num_epochs = 50
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model_no_act_no_pool = model_no_act_no_pool.to(device)

# Per-epoch metrics; the lists in 'train' and 'valid' are index-aligned with 'epoch'.
history_no_act_no_pool = {
    'epoch': [],
    'train': {'loss': [], 'accuracy': []},
    'valid': {'loss': [], 'accuracy': []},
}

for epoch in range(num_epochs):
    print(f'{epoch+1} / {num_epochs}')
    # Training pass: the optimizer is handed to run_one_epoch.
    train_loss, train_acc = run_one_epoch(model_no_act_no_pool, trainloader, criterion=criterion, optimizer=optimizer, device=device)
    print(f'\tTrain | Loss: {train_loss:.2e} Accuracy: {train_acc:.2%}')
    # Validation pass: no optimizer is passed (presumably evaluation only --
    # verify against utils.run.run_one_epoch).
    valid_loss, valid_acc = run_one_epoch(model_no_act_no_pool, validloader, criterion=criterion, device=device)
    # Label fixed: these are validation metrics, previously printed as "Train".
    print(f'\tValid | Loss: {valid_loss:.2e} Accuracy: {valid_acc:.2%}')
    scheduler.step()

    # Note: the stored epoch index is 0-based, while the printed one is 1-based.
    history_no_act_no_pool['epoch'].append(epoch)
    history_no_act_no_pool['train']['loss'].append(train_loss)
    history_no_act_no_pool['train']['accuracy'].append(train_acc)
    history_no_act_no_pool['valid']['loss'].append(valid_loss)
    history_no_act_no_pool['valid']['accuracy'].append(valid_acc)

1 / 50
	Train | Loss: 3.21e+00 Accuracy: 16.88%
	Train | Loss: 2.09e+00 Accuracy: 22.68%
2 / 50
	Train | Loss: 2.13e+00 Accuracy: 21.31%
	Train | Loss: 2.05e+00 Accuracy: 25.12%
3 / 50
	Train | Loss: 2.04e+00 Accuracy: 24.74%
	Train | Loss: 1.99e+00 Accuracy: 26.98%
4 / 50
	Train | Loss: 2.02e+00 Accuracy: 25.61%
	Train | Loss: 1.94e+00 Accuracy: 28.30%
5 / 50
	Train | Loss: 2.00e+00 Accuracy: 26.01%
	Train | Loss: 1.92e+00 Accuracy: 28.94%
6 / 50
	Train | Loss: 1.98e+00 Accuracy: 26.99%
	Train | Loss: 1.92e+00 Accuracy: 28.95%
7 / 50
	Train | Loss: 1.97e+00 Accuracy: 27.75%
	Train | Loss: 1.91e+00 Accuracy: 30.14%
8 / 50
	Train | Loss: 1.96e+00 Accuracy: 28.03%
	Train | Loss: 1.92e+00 Accuracy: 28.35%
9 / 50
	Train | Loss: 1.96e+00 Accuracy: 27.85%
	Train | Loss: 1.91e+00 Accuracy: 29.36%
10 / 50
	Train | Loss: 1.95e+00 Accuracy: 28.87%
	Train | Loss: 1.89e+00 Accuracy: 32.63%
11 / 50
	Train | Loss: 1.93e+00 Accuracy: 30.26%
	Train | Loss: 1.85e+00 Accuracy: 33.57%
12 / 50
	Train | Lo

In [30]:
# Train model with no pooling but with quantization function as activation
import math

# Because we use a straight-through estimator (STE) for the gradient, we need to define a custom autograd Function.

def calibrate(X, qmin, qmax):
    """Compute a symmetric quantization scale from the current values of ``X``.

    The scale is chosen so that the interval ``[-max|X|, +max|X|]`` spans
    ``qmax - qmin`` integer steps. No zero-point/offset is computed
    (symmetric, signed scheme only).

    Args:
        X: Tensor whose largest absolute value defines the quantized range.
        qmin: Lowest integer quantization level.
        qmax: Highest integer quantization level.

    Returns:
        The scale as a Python float. When ``X`` is empty or entirely zero
        there is no range to calibrate on, so a neutral scale of ``1.0`` is
        returned; this keeps a subsequent ``X / scale`` finite instead of
        producing NaNs from ``0 / 0``.
    """
    if X.numel() == 0:
        # .max() is undefined on an empty tensor; nothing to calibrate on.
        return 1.0

    xmax = X.abs().max().item()  # Symmetric, signed
    if xmax == 0.0:
        # All-zero input would otherwise yield scale == 0 and a division by zero.
        return 1.0

    scale = xmax * 2 / (qmax - qmin)
    return scale

class _QuantDequantDynamic(torch.autograd.Function):
    """Quantize-then-dequantize op whose backward pass is a straight-through estimator.

    The scale is recomputed from the input on every forward call, and nothing
    is stored on ``ctx`` for the backward pass.
    """

    @staticmethod
    def forward(ctx, X, qmin, qmax):
        """Snap ``X`` onto the integer grid ``[qmin, qmax]`` and map it back to floats.

        Args:
            ctx: Autograd context (unused).
            X: Input tensor.
            qmin: Lowest integer quantization level.
            qmax: Highest integer quantization level.

        Returns:
            A tensor shaped like ``X`` whose values are integer multiples of
            the calibrated scale.
        """
        step = calibrate(X, qmin, qmax)
        levels = torch.clamp(torch.round(X / step), qmin, qmax)
        # Emulation mode: return the dequantized values rather than the integer levels.
        return levels * step

    @staticmethod
    def backward(ctx, dLdy):
        """Pass the incoming gradient through unchanged (STE).

        ``qmin`` and ``qmax`` are not differentiable inputs, so their
        gradient slots are ``None``.
        """
        grad_X = dLdy
        return grad_X, None, None


class QdQ(nn.Module):
    """Quantize-dequantize module usable as a drop-in activation layer.

    Symmetric, signed scheme: the integer range is the signed ``num_bits``
    range ``[-2**(num_bits - 1), 2**(num_bits - 1) - 1]`` (e.g. ``[-128, 127]``
    for 8 bits). The quantize/dequantize step itself is delegated to
    ``_QuantDequantDynamic``.

    Args:
        num_bits: Bit width of the emulated integer type; must be >= 1.
        offset: Reserved for asymmetric quantization; must be falsy for now.
    """

    def __init__(self, num_bits=8, offset=False):
        super().__init__()
        assert not offset  # Ignoring for now
        if num_bits < 1:
            raise ValueError(f'num_bits must be >= 1, got {num_bits}')
        self.qmin = -(1 << (num_bits - 1))
        # The largest signed value is one less than |qmin|; using -qmin here
        # would give 2**num_bits + 1 levels, which does not fit in num_bits.
        self.qmax = (1 << (num_bits - 1)) - 1
        self.offset = offset
        self._qdq_fn = _QuantDequantDynamic.apply

    def forward(self, X, *args, **kwargs):
        """Quantize-dequantize ``X``; any extra arguments are accepted and ignored."""
        return self._qdq_fn(X, self.qmin, self.qmax)

# The QdQ (quantize-dequantize) module is used as the activation class.
# NOTE(review): pool='strided' presumably swaps the pooling layers for strided
# convolutions -- confirm against models.vgg.VGG.
model_quad_act_no_pool = VGG('VGG16', pool='strided', act_cls=QdQ)

optimizer = torch.optim.SGD(model_quad_act_no_pool.parameters(), lr=1e-1, momentum=0.9, weight_decay=5e-4)
criterion = nn.CrossEntropyLoss()
# NOTE(review): T_max=200 while the scheduler is stepped only `num_epochs` (50)
# times below, so just the first quarter of the annealing schedule is used --
# confirm this is intended.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200)

num_epochs = 50
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model_quad_act_no_pool = model_quad_act_no_pool.to(device)

# Per-epoch metrics; the lists in 'train' and 'valid' are index-aligned with 'epoch'.
history_quad_act_no_pool = {
    'epoch': [],
    'train': {'loss': [], 'accuracy': []},
    'valid': {'loss': [], 'accuracy': []},
}

for epoch in range(num_epochs):
    print(f'{epoch+1} / {num_epochs}')
    # Training pass: the optimizer is handed to run_one_epoch.
    train_loss, train_acc = run_one_epoch(model_quad_act_no_pool, trainloader, criterion=criterion, optimizer=optimizer, device=device)
    print(f'\tTrain | Loss: {train_loss:.2e} Accuracy: {train_acc:.2%}')
    # Validation pass: no optimizer is passed (presumably evaluation only --
    # verify against utils.run.run_one_epoch).
    valid_loss, valid_acc = run_one_epoch(model_quad_act_no_pool, validloader, criterion=criterion, device=device)
    # Label fixed: these are validation metrics, previously printed as "Train".
    print(f'\tValid | Loss: {valid_loss:.2e} Accuracy: {valid_acc:.2%}')
    scheduler.step()

    # Note: the stored epoch index is 0-based, while the printed one is 1-based.
    history_quad_act_no_pool['epoch'].append(epoch)
    history_quad_act_no_pool['train']['loss'].append(train_loss)
    history_quad_act_no_pool['train']['accuracy'].append(train_acc)
    history_quad_act_no_pool['valid']['loss'].append(valid_loss)
    history_quad_act_no_pool['valid']['accuracy'].append(valid_acc)

1 / 50
	Train | Loss: 3.80e+00 Accuracy: 16.55%
	Train | Loss: 2.10e+00 Accuracy: 19.91%
2 / 50
	Train | Loss: 2.07e+00 Accuracy: 21.64%
	Train | Loss: 2.01e+00 Accuracy: 24.35%
3 / 50
	Train | Loss: 2.04e+00 Accuracy: 23.83%
	Train | Loss: 1.98e+00 Accuracy: 25.35%
4 / 50
	Train | Loss: 1.99e+00 Accuracy: 25.80%
	Train | Loss: 1.92e+00 Accuracy: 28.85%
5 / 50
	Train | Loss: 1.98e+00 Accuracy: 27.04%
	Train | Loss: 1.94e+00 Accuracy: 29.21%
6 / 50
	Train | Loss: 1.97e+00 Accuracy: 27.16%
	Train | Loss: 1.93e+00 Accuracy: 28.25%
7 / 50
	Train | Loss: 1.96e+00 Accuracy: 27.42%
	Train | Loss: 1.91e+00 Accuracy: 29.72%
8 / 50
	Train | Loss: 1.96e+00 Accuracy: 27.76%
	Train | Loss: 1.91e+00 Accuracy: 29.57%
9 / 50
	Train | Loss: 1.96e+00 Accuracy: 27.84%
	Train | Loss: 1.91e+00 Accuracy: 29.34%
10 / 50
	Train | Loss: 1.95e+00 Accuracy: 28.22%
	Train | Loss: 1.90e+00 Accuracy: 30.87%
11 / 50
	Train | Loss: 1.95e+00 Accuracy: 28.64%
	Train | Loss: 1.89e+00 Accuracy: 31.04%
12 / 50
	Train | Lo