# Import

In [None]:
from optimum.onnxruntime import ORTModelForFeatureExtraction, ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig
from onnxruntime import InferenceSession
from onnxruntime.quantization import QuantType
from transformers import AutoFeatureExtractor
import torch

from PIL import Image
import time

# Convert Onnx

## Vit google

In [None]:
# convert to onnx
vit_gg_onnx = ORTModelForFeatureExtraction.from_pretrained('google/vit-base-patch16-224-in21k', export=True)
processor_vit_gg_onnx = AutoFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')

In [None]:
# quantization
quantizer = ORTQuantizer.from_pretrained(vit_gg_onnx)
dqconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False)
dqconfig.weights_dtype = QuantType.QUInt8

In [87]:
# save model onnx
save_directory = "/home/music/Desktop/measure_model/models/"
vit_gg_onnx.save_pretrained(save_directory+"vit_gg_onnx")

In [None]:
# save model onnx quantized
model_quantized_path = quantizer.quantize(
    save_dir=save_directory+"vit_gg_onnx_quantize",
    quantization_config=dqconfig,
)

# test onnx

In [4]:
img = Image.open("/home/music/Desktop/measure_model/data/image_net/n01514668_cock.JPEG")

## ViT google

In [None]:
# vit google onnx
vit_gg_onnx_path = save_directory+"vit_gg_onnx/model.onnx"
vit_gg_onnx = InferenceSession(vit_gg_onnx_path, providers=['CUDAExecutionProvider'])
processor_vit_gg_onxx = AutoFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')

In [100]:
inputs = processor_vit_gg_onxx(images=img.convert("RGB"), return_tensors="np")
st = time.time()
outputs = vit_gg_onnx.run(output_names=["last_hidden_state"], input_feed=dict(inputs))
delta = time.time() - st
print(f"runtime : {delta*1000} ms")

runtime : 287.0745658874512 ms


In [None]:
# vit google onnx quantized
vit_gg_onnx_quantized_path = "/home/music/Desktop/measure_model/models/vit_gg_onnx_quantize"
vit_gg_onnx_quantized_model_path = vit_gg_onnx_quantized_path+"/model_quantized.onnx"
vit_gg_onnx_quantized = InferenceSession(vit_gg_onnx_quantized_model_path, providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
processor_vit_gg_onxx = AutoFeatureExtractor.from_pretrained(vit_gg_onnx_quantized_path)

In [8]:
inputs = processor_vit_gg_onxx(images=img.convert("RGB"), return_tensors="np")
st = time.time()
outputs = vit_gg_onnx_quantized.run(output_names=["last_hidden_state"], input_feed=dict(inputs))
delta = time.time() - st
print(f"runtime : {delta*1000} ms")

runtime : 264.401912689209 ms


In [109]:
# Original ViT google
from transformers import ViTImageProcessor, ViTModel
vit_gg = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
processor_vit_gg = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224-in21k')
vit_gg.eval().to('cpu')
inputs = processor_vit_gg(images=img, return_tensors="pt").to('cpu')

In [129]:
start_time_torch = time.time()
outputs = vit_gg(**inputs)
delta_time_torch = time.time() - start_time_torch
print("runtime :", delta_time_torch*1000, "ms")

runtime : 351.8397808074951 ms
