# **Embedded AI workshop**
## **Static Quantization**
### *Mohammad Ali Zamani*

*Senior Machine Learning Scientist at HITeC e.V.*

homepage: [zamani.ai](https://zamani.ai/)




In this tutorial, you will learn how to use the `torch.ao.quantization` package to
quantize your neural networks.

For more information:

[1- PyTorch Quantization](https://pytorch.org/docs/stable/quantization.html)

[2- Practical Quantization in PyTorch](https://pytorch.org/blog/quantization-in-practice/)





Setup
======
We install some packages and import the necessary libraries.

In [None]:
# Install the third-party packages for this notebook; all pip output is
# discarded by the redirect to /dev/null.
# NOTE(review): gwpy is not imported in any visible cell — confirm it is still needed.
!pip install torch-pruning > /dev/null 2>&1
!pip install -q gwpy > /dev/null 2>&1

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import torch.optim as optim
import os
import torch_pruning as tp
from torch.ao.quantization import QuantStub, DeQuantStub
from torch.ao.quantization import QConfig
from torch.ao.quantization.observer import HistogramObserver, PerChannelMinMaxObserver

from tqdm import tqdm
import matplotlib.pyplot as plt

Creating a model, transform, and dataloader
==============

In this tutorial, we use the
[LeNet](http://yann.lecun.com/exdb/publis/pdf/lecun-98.pdf) architecture
from LeCun et al., 1998.


In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

class Net(nn.Module):
    """LeNet-style CNN prepared for eager-mode static quantization.

    The network takes single-channel images. With 28x28 inputs the two
    ``conv(5x5) -> ReLU -> max-pool(2)`` stages produce 20 feature maps of
    size 4x4, i.e. the 320 features expected by ``fc1``.

    Every ReLU is applied through its registered ``nn.ReLU`` module
    (``relu1``/``relu2``/``relu3``) directly after the layer it belongs to.
    This keeps ``forward`` consistent with module fusion such as
    ``fuse_modules(model, [['conv1', 'relu1'], ['conv2', 'relu2']])``: the
    fused pair replaces exactly the ops that ``forward`` executes. ReLU and
    max-pooling commute, so applying ReLU before pooling yields the same
    values as applying it afterwards.
    """

    def __init__(self):
        super().__init__()
        # Converts tensors from floating point to quantized in the converted model.
        self.quant = QuantStub()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.relu1 = nn.ReLU()
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.relu2 = nn.ReLU()
        self.fc1 = nn.Linear(320, 50)
        self.relu3 = nn.ReLU()
        self.fc2 = nn.Linear(50, 10)
        # Converts tensors from quantized back to floating point.
        self.dequant = DeQuantStub()

    def forward(self, x):
        """Return the class scores (10 per sample) for a batch of images ``x``."""
        # Entry point of the quantized region.
        x = self.quant(x)

        x = F.max_pool2d(self.relu1(self.conv1(x)), 2)
        x = F.max_pool2d(self.relu2(self.conv2(x)), 2)
        # Keep the batch dimension and flatten everything else.
        x = torch.flatten(x, 1)
        x = self.relu3(self.fc1(x))
        x = self.fc2(x)

        # Exit point of the quantized region.
        x = self.dequant(x)
        return x

We create transforms with the usual data augmentation suitable for the MNIST dataset.

In [None]:
# Training pipeline: convert to a tensor, apply random augmentations
# (rotation, inversion, erasing), then normalize with mean 0.5 and std 0.5.
# Optional extra augmentations that are currently disabled:
#   transforms.RandomAffine(degrees=10, translate=(0.1, 0.1), scale=(0.9, 1.1), shear=10)
#   transforms.RandomPerspective(distortion_scale=0.5, p=0.5)
#   transforms.RandomResizedCrop(size=(28, 28), scale=(0.8, 1.0), ratio=(0.9, 1.1))
train_transform = transforms.Compose(
    transforms=[
        transforms.ToTensor(),
        transforms.RandomRotation(degrees=20),
        transforms.RandomInvert(p=0.5),
        transforms.RandomErasing(
            p=0.2,
            scale=(0.02, 0.1),
            ratio=(0.3, 3.3),
            value=0,
        ),
        transforms.Normalize(mean=(0.5,), std=(0.5,)),
    ]
)

# Test pipeline: no augmentation, only tensor conversion and the same normalization.
test_transform = transforms.Compose(
    transforms=[
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5,), std=(0.5,)),
    ]
)

We use the following lines to download the dataset and create the data loader.

In [None]:
# for hiding the output of the download
%%capture

# Load MNIST training and test sets
trainset = torchvision.datasets.MNIST(root='.', train=True, download=True, transform=train_transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=32, shuffle=True)

testset = torchvision.datasets.MNIST(root='.', train=False, download=True, transform=test_transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=32, shuffle=False)

Visualizing and reports
====
We selected some images from the test set for visualizing and evaluating the prediction. The `get_model_size` function measures how much space the model needs. The `print_model_test_report` function creates a report for the actual network architecture, measurements, accuracy, and sample images.

In [None]:
# Test-set positions of the images shown in the demo plots.
# NOTE(review): presumably chosen so that position k shows the digit k
# (visualize() labels them that way) — confirm against the dataset labels.
demo_indices = [3, 2, 1, 18, 4, 23, 11, 17, 61, 9]

# Batch of the demo images on the selected device, with a channel axis
# inserted at dimension 1.
demo_inputs = (
    torch.vstack([testset[idx][0] for idx in demo_indices])
    .unsqueeze(1)
    .to(device=device)
)

# (position in the demo batch, index in the test set) pairs.
demos = list(enumerate(demo_indices))

def demo_image(digit):
    """Return the demo image at position ``digit`` as a NumPy array.

    The leading dimension of the selected image is squeezed out (when it has
    size 1) and the data is copied to the CPU before conversion.
    """
    image = demo_inputs[digit]
    image = image.squeeze(0)
    return image.cpu().numpy()

def visualize(model, device):
    """Plot the demo images in one row together with the model's predictions.

    Args:
        model: callable model; it is applied to the whole demo batch at once.
        device: device the demo batch is moved to before the forward pass.

    Each subplot title shows the position of the image in the demo list
    (labelled "digit") and the class predicted by ``model``.
    """
    # Pure inference: run the forward pass without building an autograd graph,
    # and do it before creating the figure so a failing model leaves no empty plot.
    with torch.no_grad():
        predictions = model(demo_inputs.to(device)).argmax(dim=1)

    plt.figure(figsize=(15, 6))
    # One column per demo image instead of a hard-coded count.
    columns = len(demos)
    for digit, _ in demos:
        plt.subplot(1, columns, digit + 1)
        plt.imshow(demo_image(digit))
        plt.title(f"digit: {digit}\npred: {int(predictions[digit])}")
        plt.axis('off')
    plt.show()

In [None]:
def get_model_size(model):
    """Return the size of the model's serialized ``state_dict`` in kilobytes.

    The state dict is saved with ``torch.save`` and the resulting file size in
    bytes is divided by 1000.

    The file is written into a private temporary directory rather than the
    current working directory, so an existing ``tmp.pt`` of the user is never
    overwritten or deleted, and the file is removed even if saving fails.
    """
    import tempfile

    with tempfile.TemporaryDirectory() as tmp_dir:
        # NOTE(review): the base name "tmp.pt" is kept on purpose so the
        # measured size stays comparable with earlier runs — torch.save may
        # embed the file's base name in the archive; confirm if this matters.
        path = os.path.join(tmp_dir, "tmp.pt")
        torch.save(model.state_dict(), path)
        return os.path.getsize(path) / 1e3

In [None]:
def print_model_test_report(msg, model, example_inputs, testloader, device='cuda'):
    """Print an evaluation report for ``model`` and plot its demo predictions.

    The report consists of the heading ``msg``, the model's printed
    architecture, its serialized size in KB and its test accuracy in percent.

    Args:
        msg: heading printed above the report.
        model: model to evaluate, print, measure and visualize.
        example_inputs: accepted for interface compatibility; not used here.
        testloader: loader handed to ``evaluate_model``.
        device: device used for evaluation and visualization.
    """
    accuracy = evaluate_model(model, testloader, device)

    print("\n%s:\n" % msg)
    print(model)

    size_kb = get_model_size(model)
    print("\nSize: %.2f KB,  Test_Acc:  %.2f%% \n" % (size_kb, accuracy))

    visualize(model, device=device)


Training and evaluating
====
The `evaluate_model` function measures the accuracy on the test set. The `training_loop` function is the main PyTorch training loop.

In [None]:
def evaluate_model(model, testloader, device="cuda"):
    """Return the classification accuracy of ``model`` in percent.

    Args:
        model: model to evaluate; it is switched to evaluation mode and left
            in that mode.
        testloader: iterable yielding ``(images, labels)`` batches.
        device: device each batch is moved to before the forward pass.

    Returns:
        ``100 * correct / total`` over all samples, or ``0.0`` when
        ``testloader`` yields no samples (instead of dividing by zero).
    """
    model.eval()  # Set the model to evaluation mode
    correct = 0
    total = 0

    with torch.no_grad():  # No gradients needed for evaluation
        for images, labels in testloader:
            images, labels = images.to(device), labels.to(device)
            # The predicted class is the index of the largest score per sample.
            predicted = model(images).argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    if total == 0:
        return 0.0
    return 100 * correct / total

In [None]:
def training_loop(model, trainloader, iter, epochs=1):
    """Train ``model`` on ``trainloader`` for ``epochs`` passes over the data.

    The function relies on the module-level ``optimizer``, ``criterion`` and
    ``device`` objects, which must be defined before it is called.

    Args:
        model: network to train; it is put into training mode for each epoch.
        trainloader: loader yielding ``(inputs, labels)`` batches; must
            support ``len()``.
        iter: unused; kept so existing positional calls keep working.
        epochs: number of passes over ``trainloader`` (default 1, the
            previously hard-coded value).
    """
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0

        # Initialize tqdm progress bar
        progress_bar = tqdm(enumerate(trainloader, 0), total=len(trainloader), desc=f"training Epoch {epoch+1}")

        try:
            for i, (inputs, labels) in progress_bar:
                # Move the batch to the globally selected device
                inputs, labels = inputs.to(device), labels.to(device)

                # Zero the parameter gradients
                optimizer.zero_grad()

                # Forward + backward + optimize
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                running_loss += loss.item()
                if i % 10 == 9:    # report the mean loss of the last 10 mini-batches
                    progress_bar.set_postfix(loss=running_loss / 10)
                    running_loss = 0.0
        finally:
            # Close the bar even when a training step raises.
            progress_bar.close()


Creating model, optimizer and loss function
====
In the following cell, the model, optimizer and loss function are created.

In [None]:
# Build the float (FP32) network and place it on the selected device.
model_fp32 = Net()
model_fp32 = model_fp32.to(device=device)

# Cross-entropy loss and SGD with momentum over the FP32 model's parameters.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(
    model_fp32.parameters(),
    lr=0.001,
    momentum=0.9,
)

Quantization
====


In [None]:
# Dummy input of shape (1, 1, 28, 28); it is only handed to the report helper.
example_inputs = torch.randn(1, 1, 28, 28).to(device=device)
# Train the float model first (the third argument is not used by training_loop),
# then report its architecture, size and accuracy as the baseline.
training_loop(model_fp32, trainloader, 0)
print_model_test_report("before quantization", model_fp32, example_inputs, testloader, device=device)

# separate after quantization with one line
print("\n"+ "="*160 + "\n")

# model must be set to eval for fusion to work
model_fp32.eval()

# Define the new QConfig with quant_min and quant_max:
# - activations: histogram-based observer restricted to the integer range [0, 127]
# - weights: per-channel min/max observer with dtype qint8 and an affine scheme
activation_observer = HistogramObserver.with_args(quant_min=0, quant_max=127)
weight_observer = PerChannelMinMaxObserver.with_args(dtype=torch.qint8, qscheme=torch.per_channel_affine)
qconfig = QConfig(activation=activation_observer, weight=weight_observer)
# Attach the configuration to the model.
model_fp32.qconfig = qconfig


# Fuse the activations to preceding layers, where applicable.
# This needs to be done manually depending on the model architecture.
# Common fusions include `conv + relu` and `conv + batchnorm + relu`
# Here the pairs (conv1, relu1) and (conv2, relu2) are fused.
model_fp32_fused = torch.ao.quantization.fuse_modules(model_fp32, [['conv1', 'relu1'], ['conv2', 'relu2']])

# Prepare the model for static quantization. This inserts observers in
# the model that will observe activation tensors during calibration.
model_fp32_prepared = torch.ao.quantization.prepare(model_fp32_fused)

# Run calibration data through the model.
# The calibration loop is left commented out on purpose (see the assignment
# at the end of this cell): as written, the prepared model is converted
# without any data having passed through its observers.
# with torch.no_grad():
#     for input_fp32 , _ in trainloader:  # Use your trainloader or a dedicated calibration DataLoader
#         input_fp32  = input_fp32.to(device)
#         model_fp32_prepared(input_fp32 )

# Convert the observed model to a quantized model. This does several things:
# quantizes the weights, computes and stores the scale and zero-point value to
# be used with each activation tensor, and replaces key operators with quantized
# implementations.
# The prepared model is moved to the CPU before conversion, and the quantized
# model is evaluated on the CPU below.
model_int8 = torch.ao.quantization.convert(model_fp32_prepared.to('cpu'))

print_model_test_report("after quantization", model_int8, example_inputs, testloader, device='cpu')

# Assignment:
#
# 1- Uncomment the calibration part, then compare the accuracy, scales, and zero points before and after calibration.
# 2- Try a different quantization scheme (qscheme),
#   e.g., torch.per_channel_affine or torch.per_channel_symmetric
