# AutoQ


## Imports and Settings

Import NNCF and all auxiliary packages from your Python code.

> **NOTE**: All NNCF logging messages below ERROR level (INFO and WARNING) are disabled to simplify the tutorial. For production use, it is recommended to enable logging by removing `set_log_level(logging.ERROR)`.


In [1]:
import sys
import time
import warnings
from pathlib import Path
import logging

import torch
import nncf  # Important - should be imported directly after torch.

import torch.nn as nn
import torch.optim
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR10
import torchvision.transforms as transforms

from nncf.common.utils.logger import set_log_level
set_log_level(logging.ERROR)  # Disables all NNCF info and warning messages.

from nncf import NNCFConfig
from nncf.torch import register_default_init_args
from nncf.torch import create_compressed_model

from openvino.runtime import Core

from model_utils import MobileNetV2

sys.path.append("../utils")
from notebook_utils import download_file

# Fix the PyTorch random seed.
torch.manual_seed(0)
# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using %s device." % device)

# Working directories (relative to the current directory) and the base name
# shared by every model file created below.
MODEL_DIR = Path("model")
OUTPUT_DIR = Path("output")
DATA_DIR = Path("data")
BASE_MODEL_NAME = "mobilenet-V2"

OUTPUT_DIR.mkdir(exist_ok=True)
MODEL_DIR.mkdir(exist_ok=True)
DATA_DIR.mkdir(exist_ok=True)

# Path for the pretrained fp32 model weights
fp32_pth_path = Path(MODEL_DIR / (BASE_MODEL_NAME + "_fp32")).with_suffix(".pth")
# The ONNX files reuse the corresponding .pth path with a different suffix.
fp32_onnx_path = fp32_pth_path.with_suffix(".onnx")
quantized_pth_path = Path(OUTPUT_DIR / (BASE_MODEL_NAME + '_quantized')).with_suffix(".pth")
quantized_onnx_path = quantized_pth_path.with_suffix(".onnx")
# Download the FP32 checkpoint only when it is not already present locally.
if not fp32_pth_path.exists():
    # NOTE(review): this host name looks internal; confirm it is reachable
    # from the environment where the notebook is executed.
    fp32_pth_url = "http://hsw1.jf.intel.com/share/bootstrapNAS/checkpoints/cifar10/mobilenet_v2.pt"
    download_file(fp32_pth_url, directory=MODEL_DIR, filename=fp32_pth_path.name)



Using cuda device.


## Datasets


In [2]:
DATASET_DIR = DATA_DIR / "cifar10"

image_size = 32
# Per-channel mean and standard deviation applied to every image tensor.
normalize = transforms.Normalize(mean=(0.4914, 0.4822, 0.4465), std=(0.2471, 0.2435, 0.2616))
# Validation images are only converted to tensors and normalized.
val_transform = transforms.Compose([transforms.ToTensor(), normalize])
# Training images additionally get random crop / flip augmentation.
train_transform = transforms.Compose([
    transforms.RandomCrop(image_size, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.ToTensor(),
    normalize
])

# The training split triggers the download; the validation split reuses the files.
train_dataset = CIFAR10(DATASET_DIR, train=True, transform=train_transform, download=True)
val_dataset = CIFAR10(DATASET_DIR, train=False, transform=val_transform, download=False)

batch_size = 64
batch_size_val = 2000
workers = 4
# Pin host memory only when batches are moved to a non-CPU device.
# `device` is a torch.device object; comparing it with the string 'cpu' is
# never equal, so the device *type* must be inspected instead.
pin_memory = device.type != 'cpu'

val_loader = DataLoader(val_dataset, batch_size=batch_size_val, shuffle=False,
                        num_workers=workers, pin_memory=pin_memory, drop_last=False)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                          num_workers=workers, pin_memory=pin_memory, drop_last=True)

Files already downloaded and verified


## Helpers


In [3]:
class AverageMeter(object):
    """Track the most recent value of a metric and its running weighted mean."""

    def __init__(self, name, fmt=":f"):
        # `fmt` is a format-spec suffix (e.g. ":2.3f") applied to val and avg in __str__.
        self.name, self.fmt = name, fmt
        self.reset()

    def reset(self):
        """Zero every accumulated statistic."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the running mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

    def __str__(self):
        # Build a template such as "{name} {val:f} ({avg:f})" and fill it
        # from the instance attributes.
        template = "".join(["{name} {val", self.fmt, "} ({avg", self.fmt, "})"])
        return template.format(**vars(self))


class ProgressMeter(object):
    """Print one tab-separated progress line: a batch counter followed by meters."""

    def __init__(self, num_batches, meters, prefix=""):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        """Print the prefix, the "[batch/total]" counter and every meter's string form."""
        columns = [self.prefix + self.batch_fmtstr.format(batch)]
        columns.extend(str(meter) for meter in self.meters)
        print("\t".join(columns))

    def _get_batch_fmtstr(self, num_batches):
        # Pad the running batch index to the width of the total so lines align.
        width = len(str(num_batches // 1))
        field = "{:" + str(width) + "d}"
        total = field.format(num_batches)
        return "[" + field + "/" + total + "]"


def accuracy(output, target, topk=(1,)):
    """Return the top-k accuracy, in percent, for every k listed in ``topk``.

    The result is a list with one single-element tensor per requested k.
    """
    with torch.no_grad():
        largest_k = max(topk)
        num_samples = target.size(0)

        # Indices of the `largest_k` best scores per sample, transposed so
        # that row r holds the rank-r prediction of every sample.
        top_indices = output.topk(largest_k, dim=1, largest=True, sorted=True)[1].t()
        hits = top_indices.eq(target.view(1, -1).expand_as(top_indices))

        percent_per_hit = 100.0 / num_samples
        return [
            hits[:k].reshape(-1).float().sum(0, keepdim=True).mul_(percent_per_hit)
            for k in topk
        ]

## Validation Function


In [4]:
def validate(model, val_loader, criterion, verbose=False):
    """Evaluate ``model`` on ``val_loader`` and return aggregate metrics.

    Args:
        model: Network to evaluate; it is switched to eval mode.
        val_loader: Iterable yielding ``(images, target)`` batches.
        criterion: Loss callable invoked as ``criterion(output, target)``.
        verbose: When true, print a one-line summary after the pass.

    Returns:
        Tuple ``(top1_avg, top5_avg, loss_avg)``; each value is averaged over
        all evaluated samples (batches are weighted by their size).
    """
    batch_time = AverageMeter("Time", ":3.3f")
    losses = AverageMeter("Loss", ":2.3f")
    top1 = AverageMeter("Acc@1", ":2.2f")
    top5 = AverageMeter("Acc@5", ":2.2f")

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for images, target in val_loader:
            images = images.to(device)
            target = target.to(device)

            # compute output
            output = model(images)
            loss = criterion(output, target)

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1[0], images.size(0))
            top5.update(acc5[0], images.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

        if verbose:
            print(" * Test Loss {losses.avg:.3f} Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}".format(losses=losses, top1=top1, top5=top5))
    # Return the averaged loss (not just the last batch's loss) so that all
    # three returned values describe the whole evaluation pass.
    return top1.avg, top5.avg, losses.avg

## Train Function


In [5]:
def train_epoch(model, train_loader, criterion, optimizer, epoch, compression_ctrl):
    """Train ``model`` for one epoch and periodically print running metrics.

    Args:
        model: Network to train; it is switched to train mode.
        train_loader: Sized iterable yielding ``(images, target)`` batches.
        criterion: Loss callable invoked as ``criterion(output, target)``.
        optimizer: Optimizer whose ``zero_grad``/``step`` drive the update.
        epoch: Epoch index, used only in the progress prefix.
        compression_ctrl: NNCF compression controller; its scheduler is
            stepped once per epoch and once per training iteration.
    """
    print_frequency = 50
    num_batches = len(train_loader)

    batch_time = AverageMeter("Time", ":3.3f")
    losses = AverageMeter("Loss", ":2.3f")
    top1 = AverageMeter("Acc@1", ":2.2f")
    top5 = AverageMeter("Acc@5", ":2.2f")
    progress = ProgressMeter(
        num_batches, [batch_time, losses, top1, top5], prefix="Epoch:[{}]".format(epoch)
    )

    # switch to train mode
    model.train()

    # Notify the compression scheduler that a new epoch starts; it was
    # previously fetched but never stepped.
    compression_scheduler = compression_ctrl.scheduler
    compression_scheduler.epoch_step()

    end = time.time()
    for i, (images, target) in enumerate(train_loader):
        # Advance the compression scheduler before each training iteration.
        compression_scheduler.step()

        images = images.to(device)
        target = target.to(device)

        # compute output
        output = model(images)
        loss = criterion(output, target)

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        top1.update(acc1[0], images.size(0))
        top5.update(acc5[0], images.size(0))

        # compute gradient and do opt step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # Print every `print_frequency` batches and on the final batch.
        if i % print_frequency == 0 or i == num_batches - 1:
            progress.display(i)

## Float32 Model Preparation

In [6]:
# Instantiate the network and load the pretrained FP32 weights (mapped to the
# CPU first, then moved to the selected device).
model = MobileNetV2()
model.load_state_dict(torch.load(fp32_pth_path, map_location="cpu"))
model.to(device)

criterion = nn.CrossEntropyLoss()
# Baseline accuracy of the FP32 model on the validation loader.
acc1, acc5, _ = validate(model, val_loader, criterion) 
print('FP32 model accuracy: acc1: %.2f acc5: %.2f' % (acc1, acc5))

# Export the FP32 model to ONNX using one random input of the image size.
dummy_input = torch.randn(1, 3, image_size, image_size).to(device)
torch.onnx.export(model, dummy_input, fp32_onnx_path)
print("FP32 ONNX model was exported to %s." % fp32_onnx_path)

FP32 model accuracy: acc1: 93.91 acc5: 99.83
FP32 ONNX model was exported to model/mobilenet-V2_fp32.onnx.


## AutoQ


In [7]:
warnings.filterwarnings("ignore") # Avoid warnings in torchvision and pandas.

autoq_iter_number = 20 # The number of search episodes by AutoQ.
autoq_compression_ratio = 0.20 # Target quantized model size relative to FP32 model, 0.25 for uniform int8 quantization and 0.125 for uniform int4.
eval_subset_ratio = 1.0 # Evaluation ratio of the subset for AutoQ.

# NNCF configuration: quantization whose precision initializer is AutoQ.
config = {
    "model": BASE_MODEL_NAME,
    "dataset": "cifar10",
    "input_info": {
        "sample_size": [1, 3, image_size, image_size]
    },
    "target_device": "VPU",
    "compression": {
        "algorithm": "quantization",
        "initializer": {
            "batchnorm_adaptation": {
                "num_bn_adaptation_samples": 512
            },
            "range": {
                "type": "mean_min_max",
                "num_init_samples": 512
            },
            # The three tunables defined above are passed to AutoQ here.
            # NOTE(review): "bits" presumably lists the candidate bit-widths
            # AutoQ may choose from - confirm against the NNCF config schema.
            "precision": {
                "type": "autoq",
                "bits": [2, 4, 8],
                "iter_number": autoq_iter_number,
                "compression_ratio": autoq_compression_ratio,
                "eval_subset_ratio": eval_subset_ratio,
            },
        },
    },
}

# AutoQ evaluation function to decide the best policy.
def autoq_eval_fn(model, eval_loader):
    """Validate ``model`` on ``eval_loader``, print its top-1 accuracy and return it."""
    top1_acc, _top5_acc, _loss = validate(model, eval_loader, criterion)
    print('Trial evaluation acc1: %.2f.' % top1_acc)
    return top1_acc


# Turn the plain dict into an NNCFConfig, then register the data loaders,
# criterion, AutoQ evaluation callback and device with it.
nncf_config = NNCFConfig.from_dict(config)
nncf_config = register_default_init_args(
    nncf_config,
    train_loader=train_loader,
    criterion=criterion,
    val_loader=val_loader,
    autoq_eval_fn=autoq_eval_fn,
    device=device,
)

print('Start AutoQ.')
# Returns the compression controller together with the compressed model;
# `model` is rebound to the compressed model from here on.
compression_ctrl, model = create_compressed_model(model, nncf_config)
print(compression_ctrl.statistics().to_str())

Start AutoQ.
Trial evaluation acc1: 93.91.
Trial evaluation acc1: 10.00.
Trial evaluation acc1: 10.05.
Trial evaluation acc1: 10.00.
Trial evaluation acc1: 10.00.
Trial evaluation acc1: 10.00.
Trial evaluation acc1: 9.84.
Trial evaluation acc1: 10.00.
Trial evaluation acc1: 10.00.
Trial evaluation acc1: 10.00.
Trial evaluation acc1: 10.00.
Trial evaluation acc1: 10.02.
Trial evaluation acc1: 10.00.
Trial evaluation acc1: 9.81.
Trial evaluation acc1: 10.00.
Trial evaluation acc1: 40.74.
Trial evaluation acc1: 10.00.
Trial evaluation acc1: 10.00.
Trial evaluation acc1: 10.00.
Trial evaluation acc1: 10.00.
Trial evaluation acc1: 32.01.
Statistics of the quantization algorithm:
+--------------------------------+-------+
|        Statistic's name        | Value |
| Ratio of enabled quantizations | 100   |
+--------------------------------+-------+

Statistics of the quantization share:
+----------------------------------+--------------------+
|         Statistic's name         |       Value

## Finetune the Quantized Model


In [12]:
n_epochs = 1
lr = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# Start training
best_acc1 = 0.0
for epoch in range(n_epochs):
    # Train for one epoch
    train_epoch(model, train_loader, criterion, optimizer, epoch, compression_ctrl)

    # Validation
    acc1, acc5, _ = validate(model, val_loader, criterion)
    print('Epoch %d, Test acc1: %.2f acc5: %.2f' % (epoch, acc1, acc5))

    # Save a checkpoint only when this epoch improves on the best top-1
    # accuracy so far; the best value must be recorded, otherwise every
    # later epoch would overwrite the checkpoint regardless of its accuracy.
    if acc1 > best_acc1:
        best_acc1 = acc1
        torch.save(model.state_dict(), quantized_pth_path)

Epoch:[0][  0/781]	Time 0.566 (0.566)	Loss 0.895 (0.895)	Acc@1 71.88 (71.88)	Acc@5 96.88 (96.88)
Epoch:[0][ 50/781]	Time 0.139 (0.162)	Loss 0.738 (0.749)	Acc@1 76.56 (74.57)	Acc@5 100.00 (98.19)
Epoch:[0][100/781]	Time 0.235 (0.160)	Loss 0.655 (0.733)	Acc@1 76.56 (74.68)	Acc@5 98.44 (98.33)
Epoch:[0][150/781]	Time 0.131 (0.162)	Loss 0.578 (0.726)	Acc@1 81.25 (75.19)	Acc@5 98.44 (98.46)
Epoch:[0][200/781]	Time 0.129 (0.156)	Loss 0.609 (0.716)	Acc@1 79.69 (75.67)	Acc@5 98.44 (98.47)
Epoch:[0][250/781]	Time 0.133 (0.155)	Loss 0.688 (0.707)	Acc@1 75.00 (75.89)	Acc@5 100.00 (98.46)
Epoch:[0][300/781]	Time 0.170 (0.157)	Loss 0.624 (0.695)	Acc@1 79.69 (76.37)	Acc@5 100.00 (98.54)
Epoch:[0][350/781]	Time 0.131 (0.157)	Loss 0.841 (0.690)	Acc@1 73.44 (76.62)	Acc@5 98.44 (98.58)
Epoch:[0][400/781]	Time 0.132 (0.155)	Loss 0.702 (0.683)	Acc@1 76.56 (76.77)	Acc@5 100.00 (98.63)
Epoch:[0][450/781]	Time 0.147 (0.154)	Loss 0.661 (0.678)	Acc@1 76.56 (76.82)	Acc@5 100.00 (98.65)
Epoch:[0][500/781]	Time 0

## Export Quantized Model to ONNX

In [13]:
# Reload the checkpoint written by the fine-tuning loop before exporting.
model.load_state_dict(torch.load(quantized_pth_path, map_location='cpu'))
# Export the compressed model to ONNX through the compression controller.
compression_ctrl.export_model(quantized_onnx_path)
print(f"Quantized ONNX model exported to {quantized_onnx_path}.")

Quantized ONNX model exported to output/mobilenet-V2_quantized.onnx.


## Benchmark Model Performance

Convert the FP32 and quantized models to OpenVINO Intermediate Representation (IR).

In [14]:
# The IR .xml paths mirror the ONNX paths with a different suffix.
fp32_ir_path = fp32_onnx_path.with_suffix('.xml')
quantized_ir_path = quantized_onnx_path.with_suffix('.xml')

# Run the `mo` command (IPython shell escape) only when the IR is missing.
if not fp32_ir_path.exists():
    !mo --input_model $fp32_onnx_path --output_dir $MODEL_DIR

if not quantized_ir_path.exists():
    !mo --input_model $quantized_onnx_path --output_dir $OUTPUT_DIR

Measure the inference performance of the `FP32` and `quantized` models using the [Benchmark Tool](https://docs.openvino.ai/latest/openvino_inference_engine_tools_benchmark_tool_README.html), the inference performance measurement tool in OpenVINO.

> **NOTE**: This notebook runs `benchmark_app` for 15 seconds to give a quick indication of performance. For more accurate measurements, see the `benchmark_app` documentation.

In [15]:
def parse_benchmark_output(benchmark_output):
    """Print the lines of ``benchmark_output`` that are not log or detail lines.

    Lines that are empty, start with "[" or start with two spaces are dropped;
    the remaining lines are printed one per line.
    """
    kept_lines = []
    for line in benchmark_output:
        if line == "" or line.startswith(r"[") or line.startswith("  "):
            continue
        kept_lines.append(line)
    print(*kept_lines, sep='\n')


# Benchmark the FP32 IR on the CPU device with the async API for 15 seconds
# (IPython shell escape; the captured output lines are filtered and printed).
print('Benchmark FP32 model (IR)')
benchmark_output = ! benchmark_app -m $fp32_ir_path -d CPU -api async -t 15
parse_benchmark_output(benchmark_output)

# Same benchmark settings for the quantized IR.
print('Benchmark quantized model (IR)')
benchmark_output = ! benchmark_app -m $quantized_ir_path -d CPU -api async -t 15
parse_benchmark_output(benchmark_output)

# Query the full device name of the CPU for reference.
ie = Core()
ie.get_property("CPU", "FULL_DEVICE_NAME")

Benchmark FP32 model (IR)
Count:          88968 iterations
Duration:       15002.47 ms
Latency:
Throughput: 5930.22 FPS
Benchmark quantized model (IR)
Count:          73208 iterations
Duration:       15001.85 ms
Latency:
Throughput: 4879.93 FPS


'Intel(R) Xeon(R) Gold 5218 CPU @ 2.30GHz'