<a href="https://colab.research.google.com/github/verneh/transformers/blob/main/vision.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Vision Transformer

In [1]:
!pip install timm pandas requests

Collecting timm
  Downloading timm-0.5.4-py3-none-any.whl (431 kB)
[?25l[K     |▊                               | 10 kB 18.3 MB/s eta 0:00:01[K     |█▌                              | 20 kB 23.5 MB/s eta 0:00:01[K     |██▎                             | 30 kB 11.5 MB/s eta 0:00:01[K     |███                             | 40 kB 4.6 MB/s eta 0:00:01[K     |███▉                            | 51 kB 4.6 MB/s eta 0:00:01[K     |████▋                           | 61 kB 5.5 MB/s eta 0:00:01[K     |█████▎                          | 71 kB 5.6 MB/s eta 0:00:01[K     |██████                          | 81 kB 5.6 MB/s eta 0:00:01[K     |██████▉                         | 92 kB 6.3 MB/s eta 0:00:01[K     |███████▋                        | 102 kB 5.3 MB/s eta 0:00:01[K     |████████▍                       | 112 kB 5.3 MB/s eta 0:00:01[K     |█████████▏                      | 122 kB 5.3 MB/s eta 0:00:01[K     |█████████▉                      | 133 kB 5.3 MB/s eta 0:00:01[K     |

# Classifying images with a DeiT vision transformer

In [16]:
from PIL import Image
import torch
import timm
import requests
import torchvision.transforms as transforms
from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD

import warnings
warnings.filterwarnings("ignore")

print(torch.__version__)
# The upstream tutorial targets 1.8.0; the recorded run of this notebook used 1.10.0+cu111.


# Load the pretrained DeiT-base checkpoint (patch 16, 224x224 input) through torch.hub
# and switch it to evaluation mode before running inference.
model = torch.hub.load('facebookresearch/deit:main', 'deit_base_patch16_224', pretrained=True)
model.eval()

# Preprocessing: resize -> center crop to 224 -> tensor -> ImageNet mean/std normalisation.
# InterpolationMode.BICUBIC is the enum equivalent of the deprecated integer code 3.
transform = transforms.Compose([
    transforms.Resize(256, interpolation=transforms.InterpolationMode.BICUBIC),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD),
])

IMAGE_URL = "https://cdn.livekindly.co/wp-content/uploads/2018/04/dominos-garlic-bread-logo-e1524900209427.jpg"

# Fail fast on a hung connection or an HTTP error page instead of handing
# non-image bytes to PIL.
response = requests.get(IMAGE_URL, stream=True, timeout=30)
response.raise_for_status()

# Force three channels: Normalize() is given 3-channel statistics, so a
# grayscale/palette/RGBA download would otherwise break preprocessing.
pil_image = Image.open(response.raw).convert("RGB")

# Add a leading batch dimension; `img` is reused by the later cells.
img = transform(pil_image)[None,]

# Pure inference: no autograd graph is needed.
with torch.no_grad():
    out = model(img)
clsidx = torch.argmax(out)
print(clsidx.item())

1.10.0+cu111


Downloading: "https://github.com/facebookresearch/deit/archive/main.zip" to /root/.cache/torch/hub/main.zip
Downloading: "https://dl.fbaipublicfiles.com/deit/deit_base_patch16_224-b5f2ef4d.pth" to /root/.cache/torch/hub/checkpoints/deit_base_patch16_224-b5f2ef4d.pth


  0%|          | 0.00/330M [00:00<?, ?B/s]

930


The predicted ImageNet class index, 930, corresponds to "French loaf", so the model classifies the garlic-bread image as a French loaf.

# Scripting it for Mobile

In [17]:
# Compile the eager model to TorchScript, then serialise the scripted module to disk.
scripted_model = torch.jit.script(model)
torch.jit.save(scripted_model, "fbvit_scripted.pt")

# Quantizing it

To reduce the trained model size significantly while
keeping the inference accuracy about the same, quantization can be
applied to the model. 

Dynamic quantization, used below, works best for LSTM and Transformer models.

In [18]:
# Quantization backend: 'fbgemm' targets server inference, 'qnnpack' targets mobile.
# 'fbgemm' is kept here because switching to 'qnnpack' gave much worse inference
# speed for the quantized model in this notebook.
backend = "fbgemm"
torch.backends.quantized.engine = backend
model.qconfig = torch.quantization.get_default_qconfig(backend)

# Dynamically quantize the Linear layers to int8.
quantized_model = torch.quantization.quantize_dynamic(
    model,
    qconfig_spec={torch.nn.Linear},
    dtype=torch.qint8,
)

# Script the quantized model and write it to disk.
scripted_quantized_model = torch.jit.script(quantized_model)
torch.jit.save(scripted_quantized_model, "fbdeit_scripted_quantized.pt")

The scripted, quantized model should produce the same prediction as the original model (class index 930).

In [19]:
# Classify the same input batch with the scripted, quantized model and
# print the index of the highest-scoring class.
out = scripted_quantized_model(img)
clsidx = out.argmax()
print(clsidx.item())

930


# Optimizing it

In [20]:
from torch.utils.mobile_optimizer import optimize_for_mobile

# Run the mobile optimizer over the scripted, quantized model and save the result.
optimized_scripted_quantized_model = optimize_for_mobile(scripted_quantized_model)
torch.jit.save(
    optimized_scripted_quantized_model,
    "fbdeit_optimized_scripted_quantized.pt",
)

In [21]:
# Classify the same input batch with the mobile-optimized model and
# print the index of the highest-scoring class.
out = optimized_scripted_quantized_model(img)
clsidx = out.argmax()
print(clsidx.item())


930


# Lite version of the model.

In [22]:
# Export the optimized model in the lite-interpreter (.ptl) format.
optimized_scripted_quantized_model._save_for_lite_interpreter("fbdeit_optimized_scripted_quantized_lite.ptl")
# Read the exported file back so it can be called and timed in the comparison below.
# NOTE(review): this goes through torch.jit.load, i.e. the regular TorchScript
# loader -- confirm whether the "lite model" timing actually exercises the
# lite interpreter runtime.
ptl = torch.jit.load("fbdeit_optimized_scripted_quantized_lite.ptl")

# Compare inference speed

In [23]:
def _profile_forward(net, batch):
    """Run a single forward pass of ``net`` on ``batch`` under the autograd profiler.

    CUDA profiling is disabled (``use_cuda=False``).

    Returns:
        A ``(profiler, output)`` tuple: the finished profiler context and the
        value returned by ``net(batch)``.
    """
    with torch.autograd.profiler.profile(use_cuda=False) as prof:
        result = net(batch)
    return prof, result


# One profiled forward pass per model variant, in the same order as before.
# prof1..prof5 are consumed by the summary-table cell.
prof1, out = _profile_forward(model, img)
prof2, out = _profile_forward(scripted_model, img)
prof3, out = _profile_forward(scripted_quantized_model, img)
prof4, out = _profile_forward(optimized_scripted_quantized_model, img)
prof5, out = _profile_forward(ptl, img)

# self_cpu_time_total is divided by 1000 and reported with an "ms" suffix.
for template, run in (
    ("original model: {:.2f}ms", prof1),
    ("scripted model: {:.2f}ms", prof2),
    ("scripted & quantized model: {:.2f}ms", prof3),
    ("scripted & quantized & optimized model: {:.2f}ms", prof4),
    ("lite model: {:.2f}ms", prof5),
):
    print(template.format(run.self_cpu_time_total/1000))

original model: 752.68ms
scripted model: 818.04ms
scripted & quantized model: 584.41ms
scripted & quantized & optimized model: 577.38ms
lite model: 582.17ms


In [24]:
import pandas as pd
import numpy as np


def _timing_rows(labels, runs):
    """Build one ``[label, inference time, reduction]`` row per profiled run.

    The first run is the baseline: its reduction is reported as the literal
    "0%", and every other run's reduction is its CPU-time saving relative to
    that baseline, formatted as a percentage with two decimals.
    """
    baseline = runs[0].self_cpu_time_total
    rows = []
    for position, (label, run) in enumerate(zip(labels, runs)):
        elapsed = run.self_cpu_time_total
        if position == 0:
            reduction = "0%"
        else:
            reduction = "{:.2f}%".format((baseline-elapsed)/baseline*100)
        rows.append([label, "{:.2f}ms".format(elapsed/1000), reduction])
    return rows


# Assemble the comparison table in a single DataFrame construction.
df = pd.DataFrame(
    _timing_rows(
        ['original model', 'scripted model', 'scripted & quantized model',
         'scripted & quantized & optimized model', 'lite model'],
        [prof1, prof2, prof3, prof4, prof5],
    ),
    columns=['Model', 'Inference Time', 'Reduction'],
)

print(df)

                                    Model Inference Time Reduction
0                          original model       752.68ms        0%
1                          scripted model       818.04ms    -8.68%
2              scripted & quantized model       584.41ms    22.36%
3  scripted & quantized & optimized model       577.38ms    23.29%
4                              lite model       582.17ms    22.65%
