# Benchmarking ViT with IPEX

Pytorch source code and models from: https://github.com/lukemelas/PyTorch-Pretrained-ViT

In [15]:
import numpy as np
import torch
import intel_extension_for_pytorch as ipex

from pytorch_pretrained_vit import ViT

In [19]:
from utils import measure_latency, measure_throught

In [16]:
def measure_latency(model, input, niter=100):
    # INIT LOGGERS
    starter, ender = torch.xpu.Event(enable_timing=True), torch.xpu.Event(enable_timing=True)
    timings = np.zeros((niter,1))
    #GPU-WARM-UP
    for _ in range(10):
        _ = model(input)
    # MEASURE PERFORMANCE
    with torch.no_grad():
        for i in range(niter):
            starter.record()
            _ = model(input)
            ender.record()
            # WAIT FOR GPU SYNC
            torch.xpu.synchronize()
            curr_time = starter.elapsed_time(ender)
            timings[i] = curr_time
    
    mean = np.sum(timings) / niter
    std = np.std(timings)
    return mean, std

In [17]:
def measure_throught(model, input, batch_size=1, niter=100):
    total_time = 0
    with torch.no_grad():
        for i in range(niter):
            starter, ender = torch.xpu.Event(enable_timing=True), torch.xpu.Event(enable_timing=True)
            starter.record()
            _ = model(input)
            ender.record()
            torch.xpu.synchronize()
            curr_time = starter.elapsed_time(ender) / 1000
            total_time += curr_time
    throughput = (niter * batch_size) / total_time
    return throughput

In [2]:
from PIL import Image
from torchvision import transforms as T

img = T.Compose([
    T.Resize((384, 384)), 
    T.ToTensor(),
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])(Image.open('./assets/cat_dog.jpeg')).unsqueeze(0)
#print(img.shape) # torch.Size([1, 3, 384, 384])

In [7]:
device = torch.device('xpu')

In [8]:
input_size = 384
input_data = torch.randn([1,3,input_size,input_size]).to(device)

In [9]:
bs = 128
batch_data = torch.randn([bs,3,input_size,input_size]).to(device)

### Model Base-patch16

In [4]:
model = ViT('B_16_imagenet1k', pretrained=True)

Loaded pretrained weights.


In [5]:
_ = model.eval()
model = model.to(device)
model = ipex.optimize(model)



In [7]:
img = img.to(device)

In [8]:
results = model(img)

In [54]:
y = results.to_tuple()

In [6]:
%timeit _ = model(input_data)

14.2 ms ± 754 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
s = torch.xpu.Event(enable_timing=True)

In [8]:
measure_latency(model, input_data)

(22.9734481048584, 0.44333826704895934)

In [9]:
measure_throught(model, input)

42.553386870452805

In [None]:
measure_throught(model, input, batch_size=bs)

#### float16

In [5]:
model = model.half().eval()
model = model.to(device)
model = ipex.optimize(model)



In [6]:
input_size = 384
input_data = torch.randn([1,3,input_size,input_size], dtype=torch.float16).to(device)

In [8]:
%timeit _ = model(input_data)

12.8 ms ± 75.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [7]:
measure_latency(model, input)

(13.805777060190836, 0.5359536506847442)

In [8]:
bs = 64
batch = torch.randn([bs,3,input_size,input_size], dtype=torch.float16).to(device)

In [9]:
measure_throught(model, batch, batch_size=bs)

-35.2996321036514

### Model Base-patch32

In [11]:
model = ViT('B_32_imagenet1k', pretrained=True)

Loaded pretrained weights.


In [14]:
_ = model.eval()
model = model.to(device)
model = ipex.optimize(model)

In [15]:
%timeit _ = model(input)

12.8 ms ± 355 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
measure_latency(model, input)

(12.99974452972412, 0.3156719000938009)

### Model large-patch16

In [9]:
model = ViT('L_16_imagenet1k', pretrained=True)

Loaded pretrained weights.


In [10]:
_ = model.eval()
model = model.to(device)
model = ipex.optimize(model)



In [13]:
%timeit _ = model(input_data)

40.2 ms ± 4.47 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [20]:
measure_latency(model, input_data, niter=100)

(72.87622802734376, 0.4488054516503352)

#### float16

In [21]:
model = model.half()
#model = model.to(device)
#model = ipex.optimize(model)

In [22]:
#input_size = 384
#input_data = torch.randn([1,3,input_size,input_size], dtype=torch.float16).to(device)
data = input_data.to(torch.float16)

In [23]:
%timeit _ = model(data)

28.7 ms ± 710 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [25]:
measure_latency(model, data)

(30.921906077067057, 0.26349938907625875)

In [26]:
bs = 32
batch = torch.randn([bs,3,input_size,input_size], dtype=torch.float16).to(device)

In [None]:
measure_throught(model, batch, batch_size=bs)

### Model large-patch32

In [10]:
model = ViT('L_32_imagenet1k', pretrained=True)

Downloading: "https://github.com/lukemelas/PyTorch-Pretrained-ViT/releases/download/0.0.2/L_32_imagenet1k.pth" to /home/wayne/.cache/torch/hub/checkpoints/L_32_imagenet1k.pth
100%|█████████████████████████████████████████████████████████████████████████| 1.14G/1.14G [01:38<00:00, 12.4MB/s]


Loaded pretrained weights.
