<a href="https://colab.research.google.com/github/wileyw/DeepLearningDemos/blob/master/Quantization/Quantization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import glob
from itertools import chain
import os
import random
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from PIL import Image
from sklearn.model_selection import train_test_split
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets, transforms
from tqdm.notebook import tqdm


print(f"Torch: {torch.__version__}")

# Training settings
batch_size = 64
epochs = 20
lr = 3e-5
gamma = 0.7
seed = 42

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(seed)

device = 'cuda'

# Necessary to setup quantization
qconfig = torch.quantization.get_default_qat_qconfig('qnnpack')
torch.backends.quantized.engine = 'qnnpack'

Torch: 1.9.0+cu111


In [2]:
!pip install tqdm boto3 requests regex sentencepiece sacremoses

Collecting boto3
  Downloading boto3-1.20.3-py3-none-any.whl (131 kB)
[K     |████████████████████████████████| 131 kB 5.4 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 31.0 MB/s 
Collecting botocore<1.24.0,>=1.23.3
  Downloading botocore-1.23.3-py3-none-any.whl (8.1 MB)
[K     |████████████████████████████████| 8.1 MB 21.5 MB/s 
[?25hCollecting s3transfer<0.6.0,>=0.5.0
  Downloading s3transfer-0.5.0-py3-none-any.whl (79 kB)
[K     |████████████████████████████████| 79 kB 7.6 MB/s 
[?25hCollecting jmespath<1.0.0,>=0.7.1
  Downloading jmespath-0.10.0-py2.py3-none-any.whl (24 kB)
Collecting urllib3<1.27,>=1.25.4
  Downloading urllib3-1.26.7-py2.py3-none-any.whl (138 kB)
[K     |████████████████████████████████| 138 kB 44.7 MB/s 
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 44.6 MB/s 

# Load Data & Augmentations

Download dogs vs cats dataset and input your kaggle key information

In [3]:
!pip install opendatasets
import opendatasets as od
od.download("https://www.kaggle.com/c/dogs-vs-cats-redux-kernels-edition/data")

Skipping, found downloaded files in "./dogs-vs-cats-redux-kernels-edition" (use force=True to force download)


In [4]:
!ls dogs-vs-cats-redux-kernels-edition/

sample_submission.csv  test.zip  train.zip


In [5]:
os.makedirs('data', exist_ok=True)
train_dir = 'data/train'
test_dir = 'data/test'

with zipfile.ZipFile('dogs-vs-cats-redux-kernels-edition/train.zip') as train_zip:
    train_zip.extractall('data')
    
with zipfile.ZipFile('dogs-vs-cats-redux-kernels-edition/test.zip') as test_zip:
    test_zip.extractall('data')

train_list = glob.glob(os.path.join(train_dir,'*.jpg'))
test_list = glob.glob(os.path.join(test_dir, '*.jpg'))

print(f"Train Data: {len(train_list)}")
print(f"Test Data: {len(test_list)}")

labels = [path.split('/')[-1].split('.')[0] for path in train_list]

Train Data: 25000
Test Data: 12500


In [6]:
train_list, valid_list = train_test_split(train_list, 
                                          test_size=0.2,
                                          stratify=labels,
                                          random_state=seed)



print(f"Train Data: {len(train_list)}")
print(f"Validation Data: {len(valid_list)}")
print(f"Test Data: {len(test_list)}")

Train Data: 20000
Validation Data: 5000
Test Data: 12500


In [7]:
train_transforms = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
    ]
)

val_transforms = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
    ]
)


test_transforms = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
    ]
)

In [8]:
class CatsDogsDataset(Dataset):
    def __init__(self, file_list, transform=None):
        self.file_list = file_list
        self.transform = transform

    def __len__(self):
        self.filelength = len(self.file_list)
        return self.filelength

    def __getitem__(self, idx):
        img_path = self.file_list[idx]
        img = Image.open(img_path)
        img_transformed = self.transform(img)

        label = img_path.split("/")[-1].split(".")[0]
        label = 1 if label == "dog" else 0

        return img_transformed, label

train_data = CatsDogsDataset(train_list, transform=train_transforms)
valid_data = CatsDogsDataset(valid_list, transform=test_transforms)
test_data = CatsDogsDataset(test_list, transform=test_transforms)

train_loader = DataLoader(dataset = train_data, batch_size=batch_size, shuffle=True )
valid_loader = DataLoader(dataset = valid_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset = test_data, batch_size=batch_size, shuffle=True)

print(len(train_data), len(train_loader))
print(len(valid_data), len(valid_loader))

20000 313
5000 79


# Load Model

In [9]:
"""
model = torchvision.models.resnet18(pretrained=True)
# or any of these variants
model = torchvision.models.resnet34(pretrained=True)
model = torchvision.models.resnet50(pretrained=True)
model = torchvision.models.resnet101(pretrained=True)
model = torchvision.models.resnet152(pretrained=True)
"""
model = torchvision.models.resnet101(pretrained=True)

model = model.train()  # Set model to training mode.
# model = model.eval()

In [10]:
print(model)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

# Setup Quantization

Quantization Implementations:

*   Fine-tune as-is & post-training static quantization
*   Quantization aware training
*   [Dynamic quantization
](https://pytorch.org/tutorials/recipes/recipes/dynamic_quantization.html)


## 1. Dynamic Quantization

In [11]:
float_model = model

dq_model_fc = torch.quantization.quantize_dynamic(float_model, {torch.nn.Linear}, dtype=torch.qint8)
dq_model_conv2d = torch.quantization.quantize_dynamic(float_model, {torch.nn.Conv2d}, dtype=torch.qint8)
dq_model_bn = torch.quantization.quantize_dynamic(float_model, {torch.nn.BatchNorm2d}, dtype=torch.qint8)
dq_model = torch.quantization.quantize_dynamic(float_model, dtype=torch.qint8)

In [12]:
def print_size_of_model(model, label=""):
    torch.save(model.state_dict(), "temp.p")
    size=os.path.getsize("temp.p")
    print("model: ",label,' \t','Size (KB):', size/1e3)
    os.remove('temp.p')
    return size

f=print_size_of_model(float_model,"fp32")
print()

print('Quantize fc layer')
q1=print_size_of_model(dq_model_fc,"int8")
print("{0:.2f} times smaller".format(f/q1))
print()

print('Quantize convolution layer')
q2=print_size_of_model(dq_model_conv2d,"int8")
print("{0:.2f} times smaller".format(f/q2))
print()

print('Quantize batch norm layer')
q3=print_size_of_model(dq_model_bn,"int8")
print("{0:.2f} times smaller".format(f/q3))
print()

print('Quantize all layers')
q4=print_size_of_model(dq_model,"int8")
print("{0:.2f} times smaller".format(f/q4))
print()

model:  fp32  	 Size (KB): 178814.045

Quantize fc layer
model:  int8  	 Size (KB): 172670.809
1.04 times smaller

Quantize convolution layer
model:  int8  	 Size (KB): 178814.045
1.00 times smaller

Quantize batch norm layer
model:  int8  	 Size (KB): 178814.045
1.00 times smaller

Quantize all layers
model:  int8  	 Size (KB): 172670.809
1.04 times smaller



https://pytorch.org/docs/stable/quantization.html#torch.quantization.quantize_dynamic

"This is the simplest to apply form of quantization where the weights are quantized ahead of time but the activations are dynamically quantized during inference. This is used for situations where the model execution time is dominated by loading weights from memory rather than computing the matrix multiplications. This is true for for LSTM and Transformer type models with small batch size."

In [13]:
# Let's try with transformers
!pip install transformers



In [14]:
from transformers import BertModel

bert_model_float = BertModel.from_pretrained('bert-base-uncased')
bert_model_dq = torch.quantization.quantize_dynamic(bert_model_float, dtype=torch.qint8)

f=print_size_of_model(bert_model_float, "fp32")
q=print_size_of_model(bert_model_dq, "int8")
print("{0:.2f} times smaller".format(f/q))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


model:  fp32  	 Size (KB): 438007.537
model:  int8  	 Size (KB): 181488.333
2.41 times smaller


**TODO**
1. Finetune model to cat v. dog problem
2. Use training data as representative dataset for post-training static quantization on finetuned model & evaluate against finetuned model
3. Use training data to finetune Quantization Aware Training model

Create model for cat v. dog -> finetuning -> Post-training static quantization


 -> quantization aware training

# Finetune model for post-training static quantization

In [15]:
model_fp32 = torchvision.models.resnet18(pretrained=True)
class CatDogFinetuneClassifier(nn.Module):
    def __init__(self, cnn):
        super(CatDogFinetuneClassifier, self).__init__()
        self.cnn = cnn
        self.fc = nn.Linear(in_features=512, out_features=2, bias=True)

    def forward(self, img):
        x = self.cnn(img).flatten(start_dim=1)
        return self.fc(x)

model_ptsq_finetune = CatDogFinetuneClassifier(torch.nn.Sequential(*list(model_fp32.children())[:-1]))
model_ptsq_finetune = model_ptsq_finetune.to('cuda')

In [16]:
# Training settings
batch_size = 64
epochs = 10
lr = 1e-3
gamma = 0.7

In [17]:
# loss function
criterion = nn.CrossEntropyLoss()
# optimizer
optimizer = optim.Adam(model.parameters(), lr=lr)
# scheduler
scheduler = StepLR(optimizer, step_size=1, gamma=gamma)

for epoch in range(epochs):
    epoch_loss = 0
    epoch_accuracy = 0

    for data, label in tqdm(train_loader):
        data = data.to(device)
        label = label.to(device)

        output = model_ptsq_finetune(data)
        loss = criterion(output, label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        acc = (output.argmax(dim=1) == label).float().mean()
        epoch_accuracy += acc / len(train_loader)
        epoch_loss += loss / len(train_loader)

    with torch.no_grad():
        epoch_val_accuracy = 0
        epoch_val_loss = 0
        for data, label in valid_loader:
            data = data.to(device)
            label = label.to(device)

            val_output = model_ptsq_finetune(data)
            val_loss = criterion(val_output, label)

            acc = (val_output.argmax(dim=1) == label).float().mean()
            epoch_val_accuracy += acc / len(valid_loader)
            epoch_val_loss += val_loss / len(valid_loader)

    print(
        f"Epoch : {epoch+1} - loss : {epoch_loss:.4f} - acc: {epoch_accuracy:.4f} - val_loss : {epoch_val_loss:.4f} - val_acc: {epoch_val_accuracy:.4f}\n"
    )

  0%|          | 0/313 [00:00<?, ?it/s]

Epoch : 1 - loss : 0.5945 - acc: 0.6972 - val_loss : 0.6021 - val_acc: 0.6915



  0%|          | 0/313 [00:00<?, ?it/s]

Epoch : 2 - loss : 0.5985 - acc: 0.6929 - val_loss : 0.5977 - val_acc: 0.6960



  0%|          | 0/313 [00:00<?, ?it/s]

Epoch : 3 - loss : 0.5975 - acc: 0.6918 - val_loss : 0.6022 - val_acc: 0.6871



  0%|          | 0/313 [00:00<?, ?it/s]

Epoch : 4 - loss : 0.5955 - acc: 0.6961 - val_loss : 0.5974 - val_acc: 0.6934



  0%|          | 0/313 [00:00<?, ?it/s]

Epoch : 5 - loss : 0.5966 - acc: 0.6982 - val_loss : 0.6009 - val_acc: 0.6972



  0%|          | 0/313 [00:00<?, ?it/s]

Epoch : 6 - loss : 0.5970 - acc: 0.6969 - val_loss : 0.5973 - val_acc: 0.7008



  0%|          | 0/313 [00:00<?, ?it/s]

Epoch : 7 - loss : 0.5952 - acc: 0.6958 - val_loss : 0.5964 - val_acc: 0.6970



  0%|          | 0/313 [00:00<?, ?it/s]

Epoch : 8 - loss : 0.5955 - acc: 0.6965 - val_loss : 0.6000 - val_acc: 0.6893



  0%|          | 0/313 [00:00<?, ?it/s]

Epoch : 9 - loss : 0.5962 - acc: 0.6950 - val_loss : 0.5972 - val_acc: 0.7013



  0%|          | 0/313 [00:00<?, ?it/s]

Epoch : 10 - loss : 0.5942 - acc: 0.7019 - val_loss : 0.6011 - val_acc: 0.6889



## 2. Post-training Static Quantization

In [19]:
model_fp32 = model_ptsq_finetune.to('cpu')  # move model to cpu
model_fp32.eval()
model_fp32.qconfig = torch.quantization.get_default_qconfig('fbgemm')
model_fp32_fused = torch.quantization.fuse_modules(model_fp32, [['conv1', 'bn1', 'relu']])
model_fp32_prepared = torch.quantization.prepare(model_fp32_fused)

# TODO: Run representative dataset here.
# input_fp32 = torch.randn(4, 1, 4, 4)
# model_fp32_prepared(input_fp32)
input_fp32 = test_loader
for data, label in tqdm(test_loader):
    model_fp32_prepared(data)

model_int8 = torch.quantization.convert(model_fp32_prepared)

f=print_size_of_model(model_fp32, "fp32")
q=print_size_of_model(model_int8, "int8")
print("{0:.2f} times smaller".format(f/q))

AssertionError: ignored

In [None]:
#(TODO): use validation dataset? to evaluate classifier

## 3. Quantization Aware Training


In [None]:
float_model.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm') 
# TODO: When writing post, explore other qconfigs (fbgemm == server inference, qnnpack == mobile, what else?)
# Also talk about symmetic vs assymetric quantization, etc.
float_model_fused = torch.quantization.fuse_modules(float_model,
    [['conv1', 'bn1', 'relu']])
float_model_prepared = torch.quantization.prepare_qat(float_model_fused)
model_int8 = torch.quantization.convert(float_model_prepared)

f=print_size_of_model(float_model,"fp32")
q=print_size_of_model(model_int8,"int8")
print("{0:.2f} times smaller".format(f/q))

  reduce_range will be deprecated in a future release of PyTorch."
  Returning default scale and zero point "


model:  fp32  	 Size (KB): 178814.045
model:  int8  	 Size (KB): 46699.104
3.83 times smaller


In [None]:
print(model_int8)

ResNet(
  (conv1): QuantizedConvReLU2d(3, 64, kernel_size=(7, 7), stride=(2, 2), scale=1.0, zero_point=0, padding=(3, 3))
  (bn1): Identity()
  (relu): Identity()
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): QuantizedConv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), scale=1.0, zero_point=0)
      (bn1): QuantizedBatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): QuantizedConv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), scale=1.0, zero_point=0, padding=(1, 1))
      (bn2): QuantizedBatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): QuantizedConv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), scale=1.0, zero_point=0)
      (bn3): QuantizedBatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Quant