# Benchmarking ViT with OpenVINO

Pytorch source code and models from: https://github.com/lukemelas/PyTorch-Pretrained-ViT

In [14]:
import torch

In [1]:
from pytorch_pretrained_vit import ViT

In [10]:
from torchvision import transforms as T

In [12]:
from PIL import Image

In [13]:
img = T.Compose([
    T.Resize((384, 384)), 
    T.ToTensor(),
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])(Image.open('./assets/cat_dog.jpeg')).unsqueeze(0)
print(img.shape) # torch.Size([1, 3, 384, 384])

torch.Size([1, 3, 384, 384])


In [15]:
model.eval()
with torch.no_grad():
    outputs = model(img)

## Convert to OpenVINO model

In [None]:
from openvino.tools import mo

In [29]:
import openvino as ov
print(ov.__version__)

2023.2.0-12538-e7c1344d3c3


In [6]:
from pathlib import Path

### Model Base-patch16

In [72]:
model_id = 'b_16'

In [83]:
model = ViT('B_16_imagenet1k', pretrained=True)

Loaded pretrained weights.


In [34]:
from openvino import Core
core = Core()
ov_model_read = core.read_model(ov_vit_path)
ov_model = core.compile_model(ov_model_read)
results = ov_model.infer_new_request({"x": img})

In [54]:
y = results.to_tuple()

In [None]:
input_size = 224
inputs = torch.randn([1,3,input_size,input_size])

In [94]:
ov_vit_path = Path(f"./models/vit_{model_id}_384.xml")
if not ov_vit_path.exists():
    ov_model = mo.convert_model(model, example_input=img, compress_to_fp16=True)
    ov.save_model(ov_model, ov_vit_path)
else:
    print(f"{ov_vit_path} exists")

  shape[shape.index(-1)] = int(x.size(-1) / -np.prod(shape))


In [27]:
!benchmark_app -m ./models/vit_b_16.xml -data_shape "x[1,3,384,384]" -hint latency -d GPU

[Step 1/11] Parsing and validating input arguments
[ INFO ] Parsing input parameters
[Step 2/11] Loading OpenVINO Runtime
[ INFO ] OpenVINO:
[ INFO ] Build ................................. 2023.2.0-12538-e7c1344d3c3
[ INFO ] 
[ INFO ] Device info:
[ INFO ] GPU
[ INFO ] Build ................................. 2023.2.0-12538-e7c1344d3c3
[ INFO ] 
[ INFO ] 
[Step 3/11] Setting device configuration
[Step 4/11] Reading model files
[ INFO ] Loading model files
[ INFO ] Read model took 25.40 ms
[ INFO ] Original model I/O parameters:
[ INFO ] Model inputs:
[ INFO ]     x (node: x) : f32 / [...] / [?,?,?,?]
[ INFO ] Model outputs:
[ INFO ]     ***NO_NAME*** (node: __module.fc/aten::linear/Add) : f32 / [...] / [?,1000]
[Step 5/11] Resizing model to match image sizes and given batch
[ INFO ] Model batch size: 1
[Step 6/11] Configuring input of the model
[ INFO ] Model inputs:
[ INFO ]     x (node: x) : f32 / [...] / [?,?,?,?]
[ INFO ] Model outputs:
[ INFO ]     ***NO_NAME*** (node: __module.fc

### Model Base-patch32

In [64]:
model_id = 'b_32'

In [31]:
model = ViT('B_32_imagenet1k', pretrained=True)

Downloading: "https://github.com/lukemelas/PyTorch-Pretrained-ViT/releases/download/0.0.2/B_32_imagenet1k.pth" to /home/wayne/.cache/torch/hub/checkpoints/B_32_imagenet1k.pth
100%|████████████████████████████████████████████████████████████████████████████████████████████| 337M/337M [01:04<00:00, 5.50MB/s]


Loaded pretrained weights.


In [65]:
ov_vit_path = Path(f"./models/vit_{model_id}.xml")
if not ov_vit_path.exists():
    ov_model = mo.convert_model(model, example_input=img, compress_to_fp16=True)
    ov.save_model(ov_model, ov_vit_path)
else:
    print(f"{ov_vit_path} already exists")

  shape[shape.index(-1)] = int(x.size(-1) / -np.prod(shape))


In [66]:
!benchmark_app -m ./models/vit_b_32.xml -data_shape "x[1,3,384,384]" -hint latency -d GPU

[Step 1/11] Parsing and validating input arguments
[ INFO ] Parsing input parameters
[Step 2/11] Loading OpenVINO Runtime
[ INFO ] OpenVINO:
[ INFO ] Build ................................. 2023.2.0-12538-e7c1344d3c3
[ INFO ] 
[ INFO ] Device info:
[ INFO ] GPU
[ INFO ] Build ................................. 2023.2.0-12538-e7c1344d3c3
[ INFO ] 
[ INFO ] 
[Step 3/11] Setting device configuration
[Step 4/11] Reading model files
[ INFO ] Loading model files
[ INFO ] Read model took 25.40 ms
[ INFO ] Original model I/O parameters:
[ INFO ] Model inputs:
[ INFO ]     x (node: x) : f32 / [...] / [?,?,?,?]
[ INFO ] Model outputs:
[ INFO ]     ***NO_NAME*** (node: __module.fc/aten::linear/Add) : f32 / [...] / [?,1000]
[Step 5/11] Resizing model to match image sizes and given batch
[ INFO ] Model batch size: 1
[Step 6/11] Configuring input of the model
[ INFO ] Model inputs:
[ INFO ]     x (node: x) : f32 / [...] / [?,?,?,?]
[ INFO ] Model outputs:
[ INFO ]     ***NO_NAME*** (node: __module.fc

### Model large-patch16

In [68]:
model_id = 'l_16'

In [67]:
model = ViT('L_16_imagenet1k', pretrained=True)

Downloading: "https://github.com/lukemelas/PyTorch-Pretrained-ViT/releases/download/0.0.2/L_16_imagenet1k.pth" to /home/wayne/.cache/torch/hub/checkpoints/L_16_imagenet1k.pth
100%|██████████████████████████████████████████████████████████████████████████████████████████| 1.14G/1.14G [01:40<00:00, 12.2MB/s]


Loaded pretrained weights.


In [69]:
ov_vit_path = Path(f"./models/vit_{model_id}.xml")
if not ov_vit_path.exists():
    ov_model = mo.convert_model(model, example_input=img, compress_to_fp16=True)
    ov.save_model(ov_model, ov_vit_path)
else:
    print(f"{ov_vit_path} already exists")

  shape[shape.index(-1)] = int(x.size(-1) / -np.prod(shape))


In [70]:
!benchmark_app -m ./models/vit_l_16.xml -data_shape "x[1,3,384,384]" -hint latency -d GPU

[Step 1/11] Parsing and validating input arguments
[ INFO ] Parsing input parameters
[Step 2/11] Loading OpenVINO Runtime
[ INFO ] OpenVINO:
[ INFO ] Build ................................. 2023.2.0-12538-e7c1344d3c3
[ INFO ] 
[ INFO ] Device info:
[ INFO ] GPU
[ INFO ] Build ................................. 2023.2.0-12538-e7c1344d3c3
[ INFO ] 
[ INFO ] 
[Step 3/11] Setting device configuration
[Step 4/11] Reading model files
[ INFO ] Loading model files
[ INFO ] Read model took 45.34 ms
[ INFO ] Original model I/O parameters:
[ INFO ] Model inputs:
[ INFO ]     x (node: x) : f32 / [...] / [?,?,?,?]
[ INFO ] Model outputs:
[ INFO ]     ***NO_NAME*** (node: __module.fc/aten::linear/Add) : f32 / [...] / [?,1000]
[Step 5/11] Resizing model to match image sizes and given batch
[ INFO ] Model batch size: 1
[Step 6/11] Configuring input of the model
[ INFO ] Model inputs:
[ INFO ]     x (node: x) : f32 / [...] / [?,?,?,?]
[ INFO ] Model outputs:
[ INFO ]     ***NO_NAME*** (node: __module.fc

## Huggingface Model