In [1]:
!pip install transformers onnxruntime onnx



In [2]:
!nvidia-smi

Thu Nov 20 16:54:19 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   62C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

# Imports

In [3]:
from transformers import AutoTokenizer, AutoModel
import torch
import time
import numpy as np
import torch.nn as nn
import os
from typing import Literal
import onnxruntime as ort
import torch.onnx


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [1]:
import torch
import torch
import torch.onnx

  from .autonotebook import tqdm as notebook_tqdm


# 1. Evaluation mode

In [4]:
model = AutoModel.from_pretrained("sentence-transformers/multi-qa-mpnet-base-cos-v1")
tokenizer = AutoTokenizer.from_pretrained(
    "sentence-transformers/multi-qa-mpnet-base-cos-v1"
)

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [5]:
input_texts = [
    "Transformers are amazing for natural language processing tasks.",
    "ArithmeticError is a built-in exception in Python.",
    "Short texts work well too!",
]
inputs = tokenizer(input_texts, return_tensors="pt", truncation=True, padding=True)

In [6]:
tokenizer.special_tokens_map

{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '[UNK]',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>'}

In [7]:
# Checking special ids and tokens
special_ids = tokenizer.all_special_ids
for idx in special_ids:
    print(f"ID: {idx}, Token: {tokenizer.convert_ids_to_tokens(idx)}")

ID: 0, Token: <s>
ID: 2, Token: </s>
ID: 104, Token: [UNK]
ID: 1, Token: <pad>
ID: 30526, Token: <mask>


In [8]:
inputs
# Just checking if special tokens were added
# 0 - bos_token
# 1 - pad_token
# 2 - eos_token

{'input_ids': tensor([[    0, 19085,  2028,  6433,  2009,  3023,  2657,  6368,  8522,  1016,
             2,     1,     1,     1],
        [    0, 20208,  2125, 29169,  2007,  1041,  2332,  1015,  2003,  6457,
          2003, 18754,  1016,     2],
        [    0,  2464,  6985,  2151,  2096,  2209,  1003,     2,     1,     1,
             1,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]])}

In [9]:
inputs["input_ids"].shape

torch.Size([3, 14])

In [6]:
def compare_basic_pytorch_time(model, inputs, device, n_iters=100):
    model.to(device)
    inputs.to(device)

    time_lst = []
    for _ in range(n_iters):
        start = time.time()
        _ = model(**inputs)
        end = time.time()
        time_lst.append(end - start)

    avg_time = np.mean(time_lst)
    std_time = np.std(time_lst)
    print(
        f"(default) Average inference time over {n_iters} iterations: {avg_time:.6f} seconds, std: {std_time:.6f} seconds"
    )
    return avg_time, std_time


def compare_pytorch_model_eval(
    model, inputs, device, compile_model: bool = False, n_iters=100
):
    model.to(device)
    inputs.to(device)
    time_lst = []
    model.eval()

    if compile_model:
        start = time.time()
        model = torch.compile(model)
        _ = model(**inputs)  # Warm-up after compilation
        end = time.time()
        print(f"Model compilation time and warm-up: {end - start:.6f} seconds")

    for _ in range(n_iters):
        start = time.time()
        _ = model(**inputs)
        end = time.time()
        time_lst.append(end - start)

    avg_time = np.mean(time_lst)
    std_time = np.std(time_lst)
    text = "(eval mode + compiled)" if compile_model else "(eval mode)"
    print(
        f"{text} Average inference time over {n_iters} iterations: {avg_time:.6f} seconds, std: {std_time:.6f} seconds"
    )
    return avg_time, std_time


def compare_pytorch_model_eval_no_grad(
    model, inputs, device, compile_model: bool = False, n_iters=100
):
    model.to(device)
    inputs.to(device)
    time_lst = []
    model.eval()

    if compile_model:
        start = time.time()
        model = torch.compile(model)
        _ = model(**inputs)  # Warm-up after compilation
        end = time.time()
        print(f"Model compilation time and warm-up: {end - start:.6f} seconds")

    with torch.no_grad():
        for _ in range(n_iters):
            start = time.time()
            _ = model(**inputs)
            end = time.time()
            time_lst.append(end - start)

    avg_time = np.mean(time_lst)
    std_time = np.std(time_lst)
    print_text = (
        "(eval mode + no_grad + compiled)" if compile_model else "(eval mode + no_grad)"
    )
    print(
        f"{print_text} Average inference time over {n_iters} iterations: {avg_time:.6f} seconds, std: {std_time:.6f} seconds"
    )
    return avg_time, std_time


def compare_pytorch_model_eval_inference_mode(
    model, inputs, device, compile_model: bool = False, n_iters=100
):
    model.to(device)
    inputs.to(device)
    time_lst = []
    model.eval()

    if compile_model:
        start = time.time()
        model = torch.compile(model)
        _ = model(**inputs)  # Warm-up after compilation
        end = time.time()
        print(f"Model compilation time and warm-up: {end - start:.6f} seconds")

    with torch.inference_mode():
        for _ in range(n_iters):
            start = time.time()
            _ = model(**inputs)
            end = time.time()
            time_lst.append(end - start)
    avg_time = np.mean(time_lst)
    std_time = np.std(time_lst)
    print_text = (
        "(eval mode + inference_mode + compiled)"
        if compile_model
        else "(eval mode + inference_mode)"
    )
    print(
        f"{print_text} Average inference time over {n_iters} iterations: {avg_time:.6f} seconds, std: {std_time:.6f} seconds",
        end="\n\n",
    )
    return avg_time, std_time

In [11]:
def compare_different_inference_modes(model, inputs, device, n_iters=100):
    print("Comparing different inference modes:")
    basic_results = compare_basic_pytorch_time(model, inputs, device, n_iters=n_iters)
    eval_results = compare_pytorch_model_eval(
        model, inputs, device, compile_model=False, n_iters=n_iters
    )
    eval_no_grad_results = compare_pytorch_model_eval_no_grad(
        model, inputs, device, compile_model=False, n_iters=n_iters
    )
    eval_inference_mode_results = compare_pytorch_model_eval_inference_mode(
        model, inputs, device, compile_model=False, n_iters=n_iters
    )

    eval_speedup = basic_results[0] / eval_results[0]
    eval_no_grad_speedup = basic_results[0] / eval_no_grad_results[0]
    eval_inference_mode_speedup = basic_results[0] / eval_inference_mode_results[0]

    print(f"Speedup with eval mode: {eval_speedup:.2f}x")
    print(f"Speedup with eval mode + no_grad: {eval_no_grad_speedup:.2f}x")
    print(
        f"Speedup with eval mode + inference_mode: {eval_inference_mode_speedup:.2f}x"
    )

In [12]:
compare_different_inference_modes(model, inputs, device)

Comparing different inference modes:
(default) Average inference time over 100 iterations: 0.011900 seconds, std: 0.011750 seconds
(eval mode) Average inference time over 100 iterations: 0.010750 seconds, std: 0.001262 seconds
(eval mode + no_grad) Average inference time over 100 iterations: 0.008669 seconds, std: 0.000985 seconds
(eval mode + inference_mode) Average inference time over 100 iterations: 0.007724 seconds, std: 0.000973 seconds

Speedup with eval mode: 1.11x
Speedup with eval mode + no_grad: 1.37x
Speedup with eval mode + inference_mode: 1.54x


Each successive inference mode improved execution speed. This shows that with just two simple changes — calling model.eval() and using torch.inference_mode() — we can nearly halve inference time. We should always run models in evaluation and inference mode when serving them because it's very easy change in code and gave us a lot if we think that we can save e.g half of our cost spend on VM


# 2. PyTorch model compilation <a id="pytorch-model-compilation"></a>

In [5]:
def compare_pytorch_compile_speedup(model, inputs, device, n_iters=100):
    print("Comparing PyTorch compilation speedup:")
    basic_results = compare_basic_pytorch_time(model, inputs, device, n_iters=n_iters)
    eval_results = compare_pytorch_model_eval(
        model, inputs, device, compile_model=True, n_iters=n_iters
    )
    eval_no_grad_results = compare_pytorch_model_eval_no_grad(
        model, inputs, device, compile_model=True, n_iters=n_iters
    )
    eval_inference_mode_results = compare_pytorch_model_eval_inference_mode(
        model, inputs, device, compile_model=True, n_iters=n_iters
    )

    eval_speedup = basic_results[0] / eval_results[0]
    eval_no_grad_speedup = basic_results[0] / eval_no_grad_results[0]
    eval_inference_mode_speedup = basic_results[0] / eval_inference_mode_results[0]

    print(f"Speedup with eval mode + compilation: {eval_speedup:.2f}x")
    print(
        f"Speedup with eval mode + no_grad + compilation: {eval_no_grad_speedup:.2f}x"
    )
    print(
        f"Speedup with eval mode + inference_mode + compilation: {eval_inference_mode_speedup:.2f}x"
    )

In [None]:
# First run
compare_pytorch_compile_speedup(model, inputs, device)

Comparing PyTorch compilation speedup:
(default) Average inference time over 100 iterations: 0.010302 seconds, std: 0.001018 seconds
Model compilation time and warm-up: 15.941382 seconds
(eval mode + compiled) Average inference time over 100 iterations: 0.006355 seconds, std: 0.001495 seconds
Model compilation time and warm-up: 0.006354 seconds
(eval mode + no_grad + compiled) Average inference time over 100 iterations: 0.024031 seconds, std: 0.180440 seconds
Model compilation time and warm-up: 0.006768 seconds
(eval mode + inference_mode + compiled) Average inference time over 100 iterations: 0.023997 seconds, std: 0.179867 seconds

Speedup with eval mode + compilation: 1.62x
Speedup with eval mode + no_grad + compilation: 0.43x
Speedup with eval mode + inference_mode + compilation: 0.43x


In [None]:
# Second run
compare_pytorch_compile_speedup(model, inputs, device)

Comparing PyTorch compilation speedup:
(default) Average inference time over 100 iterations: 0.010652 seconds, std: 0.001046 seconds
Model compilation time and warm-up: 0.007268 seconds
(eval mode + compiled) Average inference time over 100 iterations: 0.005664 seconds, std: 0.000649 seconds
Model compilation time and warm-up: 0.006289 seconds
(eval mode + no_grad + compiled) Average inference time over 100 iterations: 0.005666 seconds, std: 0.000471 seconds
Model compilation time and warm-up: 0.006843 seconds
(eval mode + inference_mode + compiled) Average inference time over 100 iterations: 0.005778 seconds, std: 0.000270 seconds

Speedup with eval mode + compilation: 1.88x
Speedup with eval mode + no_grad + compilation: 1.88x
Speedup with eval mode + inference_mode + compilation: 1.84x


Using `model.compile()` yields even larger speedups—roughly 2×—compared to the ~1.5× observed for eval mode + inference mode. With compilation, the differences between other optimization methods diminish, and I no longer see consistent improvements when stacking them. This may be because `model.compile()` applies such strong optimizations that contexts like `torch.no_grad()` or `torch.inference_mode()` have less impact.

Compilation and warm-up take a noticeable amount of time (≈15 seconds), but this cost is incurred only once; the compiled model can then be reused (cached) for serving. On the first run, I observed a speedup only for the eval mode + compilation configuration, while other combinations were slower. I initially thought this was a mistake, but I couldn't find issues in my code. On the second run, timings were consistent. I lack a clear explanation for the initial anomaly, especially since the explicit warm-up time was excluded from the inference measurements.

# 3. Quantization

In [8]:
# Making sure that model runs on cpu in the exercise 3
device = "cpu"
model = AutoModel.from_pretrained("sentence-transformers/multi-qa-mpnet-base-cos-v1")
model.to(device)
inputs.to(device)

{'input_ids': tensor([[    0, 19085,  2028,  6433,  2009,  3023,  2657,  6368,  8522,  1016,
             2,     1,     1,     1],
        [    0, 20208,  2125, 29169,  2007,  1041,  2332,  1015,  2003,  6457,
          2003, 18754,  1016,     2],
        [    0,  2464,  6985,  2151,  2096,  2209,  1003,     2,     1,     1,
             1,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]])}

In [9]:
qunatized_model = torch.ao.quantization.quantize_dynamic(
    model, dtype=torch.qint8, qconfig_spec={nn.Linear}
)

For migrations of users: 
1. Eager mode quantization (torch.ao.quantization.quantize, torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode quantize_ API instead 
2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx,torch.ao.quantization.quantize_fx.convert_fx, please migrate to use torchao pt2e quantization API instead (prepare_pt2e, convert_pt2e) 
3. pt2e quantization has been migrated to torchao (https://github.com/pytorch/ao/tree/main/torchao/quantization/pt2e) 
see https://github.com/pytorch/ao/issues/2259 for more details
  qunatized_model = torch.ao.quantization.quantize_dynamic(model, dtype=torch.qint8, qconfig_spec={nn.Linear})


In [38]:
model

MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0-11): 12 x MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): Linear(in_features=768, out_features=768, bias=True)
            (k): Linear(in_features=768, out_features=768, bias=True)
            (v): Linear(in_features=768, out_features=768, bias=True)
            (o): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (intermediate): MPNetIntermediate(
          (dense): Linear(in_

In [39]:
qunatized_model

MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0-11): 12 x MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
            (k): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
            (v): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
            (o): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
            (dropout): Dropout(p=0.1, inplace=F

As we can see, in quantized model there are `dtype=torch.qint8` types in weights

In [10]:
fp_32_path = "fp32_model.pth"
quantized_model_path = "quantized_model.pth"
torch.save(model.state_dict(), fp_32_path)
torch.save(qunatized_model.state_dict(), quantized_model_path)

In [11]:
fp32_model_mb = os.path.getsize(fp_32_path) / (1024 * 1024)
quantized_model_mb = os.path.getsize(quantized_model_path) / (1024 * 1024)
print(f"Disk ussage of fp32 model: {fp32_model_mb:.2f} MB")
print(f"Disk ussage of quantized model: {quantized_model_mb:.2f} MB")
print(f"Compression ratio: {fp32_model_mb / quantized_model_mb:.2f}x")

Disk ussage of fp32 model: 417.73 MB
Disk ussage of quantized model: 173.10 MB
Compression ratio: 2.41x


The FP32 model took 417 MB on disk, while the quantized model took 173 MB, which is about a 2.4× compression ratio. This confirms that weight quantization reduces model size. The lab description states that model size can be reduced by up to 4×, but it depends on the model type, so I think the results in my case are fine.

In [12]:
def compare_inference_times(model, quantized_model, inputs, device, n_iters=100):
    print("Checking initial model inference time:")
    original_model_results = compare_pytorch_model_eval_inference_mode(
        model, inputs, device, compile_model=False, n_iters=n_iters
    )

    print("\n\n")
    print("Checking quantized model inference time:")
    quantized_model_results = compare_pytorch_model_eval_inference_mode(
        quantized_model, inputs, device, n_iters=n_iters
    )

    speed_up_ratio = original_model_results[0] / quantized_model_results[0]

    print(f"Speedup with quantized model: {speed_up_ratio:.2f}x")

In [13]:
compare_inference_times(model, qunatized_model, inputs, device)

Checking initial model inference time:
(eval mode + inference_mode) Average inference time over 100 iterations: 0.129314 seconds, std: 0.018578 seconds




Checking quantized model inference time:
(eval mode + inference_mode) Average inference time over 100 iterations: 0.059912 seconds, std: 0.004888 seconds

Speedup with quantized model: 2.16x


Inference time comparison shows that the quantized model is ~2× faster than the initial model, which aligns with the laboratory description claiming a 1.5×–2× speed-up ratio. I evaluated only one mode—using `model.eval()` with `torch.inference_mode()` without checking other combination that were examined previously. However, I expect the results to be similar across different inference modes.

Quantization appears to be a useful option for speeding up models in production. However, we need to keep in mind that it is a lossy compression technique. Compressing model weights can lead to reduced accuracy, so we must decide what trade-off is acceptable: a larger model with higher quality but slower inference, or a smaller, faster model with potentially lower quality. If the accuracy drop is small or acceptable for the application, quantization is a great choice.

# 4. GPU optimization strategies

In [12]:
model = AutoModel.from_pretrained("sentence-transformers/multi-qa-mpnet-base-cos-v1")
tokenizer = AutoTokenizer.from_pretrained(
    "sentence-transformers/multi-qa-mpnet-base-cos-v1"
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


In [15]:
short_text = "MLOps is great!"
medium_text = "Transformers are amazing for natural language processing tasks."
long_text = "MLOps is one of the best subject in 9t semester. It's not an easy subject, but it provides valuable knowledge about deploying and maintaining machine learning models in production environments. I really enjoy that subcject anf highly recommend it to others interested in the field of machine learning and operations."
short_inputs = tokenizer(short_text, return_tensors="pt", truncation=True, padding=True)
medium_inputs = tokenizer(
    medium_text, return_tensors="pt", truncation=True, padding=True
)
long_inputs = tokenizer(long_text, return_tensors="pt", truncation=True, padding=True)

In [None]:
def test_compile_mode_inference(
    model,
    inputs,
    device,
    compile_mode: Literal["default", "max-autotune", "max-autotune-no-cudagraphs"],
    n_iters=100,
):
    model.to(device)
    inputs_gpu = {k: v.to(device) for k, v in inputs.items()}
    time_lst = []
    model.eval()

    start = time.time()
    compiled_model = torch.compile(model, mode=compile_mode)  # compilation
    _ = compiled_model(**inputs_gpu)  # Warm-up after compilation
    end = time.time()
    print(
        f"Model compilation time and warm-up ({compile_mode}): {end - start:.6f} seconds"
    )

    with torch.inference_mode():
        for _ in range(n_iters):
            start = time.time()
            _ = compiled_model(**inputs_gpu)
            end = time.time()
            time_lst.append(end - start)

    avg_time = np.mean(time_lst)
    std_time = np.std(time_lst)
    print(
        f"({compile_mode}) Average inference time over {n_iters} iterations: {avg_time:.6f} seconds, std: {std_time:.6f} seconds"
    )
    return avg_time, std_time


def test_different_compile_modes(model, inputs, device, n_iters=100):
    default_results = test_compile_mode_inference(
        model, inputs, device, compile_mode="default", n_iters=n_iters
    )
    autotune_results = test_compile_mode_inference(
        model, inputs, device, compile_mode="max-autotune", n_iters=n_iters
    )
    autotune_no_cudagraphs_results = test_compile_mode_inference(
        model,
        inputs,
        device,
        compile_mode="max-autotune-no-cudagraphs",
        n_iters=n_iters,
    )

    autotune_speedup = default_results[0] / autotune_results[0]
    autotune_no_cudagraphs_speedup = (
        default_results[0] / autotune_no_cudagraphs_results[0]
    )
    print(f"Speedup with max-autotune: {autotune_speedup:.2f}x")
    print(
        f"Speedup with max-autotune-no-cudagraphs: {autotune_no_cudagraphs_speedup:.2f}x"
    )

In [None]:
test_different_compile_modes(model, short_inputs, device, n_iters=1000)

Model compilation time and warm-up (default): 41.452659 seconds
(default) Average inference time over 1000 iterations: 0.090628 seconds, std: 0.403689 seconds


AUTOTUNE bmm(12x7x7, 12x7x64)
strides: [49, 7, 1], [64, 768, 1]
dtypes: torch.float32, torch.float32
  cpp_CppMicroGemmFP32Vec_0 0.0019 ms 100.0% 
  bmm 0.0223 ms 8.6% 
SingleProcess AUTOTUNE benchmarking takes 0.2543 seconds and 2.4037 seconds precompiling for 2 choices


Model compilation time and warm-up (max-autotune): 24.445202 seconds
(max-autotune) Average inference time over 1000 iterations: 0.092841 seconds, std: 0.461298 seconds
Model compilation time and warm-up (max-autotune-no-cudagraphs): 19.472042 seconds
(max-autotune-no-cudagraphs) Average inference time over 1000 iterations: 0.093017 seconds, std: 0.470914 seconds
Speedup with max-autotune: 0.98x
Speedup with max-autotune-no-cudagraphs: 0.97x


In [18]:
test_different_compile_modes(model, medium_inputs, device, n_iters=1000)

Model compilation time and warm-up (default): 49.261932 seconds


W1119 17:02:42.729000 14556 torch/_dynamo/convert_frame.py:1016] [0/8] torch._dynamo hit config.recompile_limit (8)
W1119 17:02:42.729000 14556 torch/_dynamo/convert_frame.py:1016] [0/8]    function: 'forward' (/usr/local/lib/python3.12/dist-packages/transformers/models/mpnet/modeling_mpnet.py:449)
W1119 17:02:42.729000 14556 torch/_dynamo/convert_frame.py:1016] [0/8]    last reason: 0/7: GLOBAL_STATE changed: grad_mode 
W1119 17:02:42.729000 14556 torch/_dynamo/convert_frame.py:1016] [0/8] To log all recompilation reasons, use TORCH_LOGS="recompiles".
W1119 17:02:42.729000 14556 torch/_dynamo/convert_frame.py:1016] [0/8] To diagnose recompilation issues, see https://pytorch.org/docs/main/torch.compiler_troubleshooting.html.


(default) Average inference time over 1000 iterations: 0.115173 seconds, std: 0.619407 seconds
Model compilation time and warm-up (max-autotune): 0.120194 seconds
(max-autotune) Average inference time over 1000 iterations: 0.102635 seconds, std: 0.007542 seconds
Model compilation time and warm-up (max-autotune-no-cudagraphs): 0.104730 seconds
(max-autotune-no-cudagraphs) Average inference time over 1000 iterations: 0.103895 seconds, std: 0.009855 seconds
Speedup with max-autotune: 1.12x
Speedup with max-autotune-no-cudagraphs: 1.11x


In [19]:
test_different_compile_modes(model, long_inputs, device, n_iters=1000)

Model compilation time and warm-up (default): 0.215998 seconds
(default) Average inference time over 1000 iterations: 0.170848 seconds, std: 0.023272 seconds
Model compilation time and warm-up (max-autotune): 0.159622 seconds
(max-autotune) Average inference time over 1000 iterations: 0.167672 seconds, std: 0.023845 seconds
Model compilation time and warm-up (max-autotune-no-cudagraphs): 0.157727 seconds
(max-autotune-no-cudagraphs) Average inference time over 1000 iterations: 0.168254 seconds, std: 0.023500 seconds
Speedup with max-autotune: 1.02x
Speedup with max-autotune-no-cudagraphs: 1.02x


When comparing the `max-autotune` and `max-autotune-no-cudagraphs` compilation modes with the `default` mode, performance on short input text was slightly worse than the normal mode. This is strange, but it might be because I couldn't find the `pin_memory` parameter in the transformers tokenizer, even though the lab description mentioned it was worth using.

For medium input text lengths, there were some gains from using `max-autotune` and `max-autotune-no-cudagraphs`, with speedups of 1.12x and 1.11x respectively. For long input texts, the situation was similar to that of short inputs: results were very close to the `default` mode, though currently slightly better (1.02x speedup).

I expected more consistency in the patterns of results and a greater speedup. I also expected that for long sentences, `max-autotune-no-cudagraphs` would perform better because it should allow the model to handle dynamic inputs without recompilation. Maybe something is wrong with my setup, or the test should be performed with batched inputs to better utilize the GPU.

# 5. Changing numerical precision

In [6]:
capability = torch.cuda.get_device_capability()
print(f"CUDA device capability: {capability}")

# Tensor Cores are available on NVidia GPUs with CUDA >= 7 (e.g. Volta, Turing, Ampere, Hopper)
if capability >= (7, 0):
    print("Tensor Cores available: fast float16 supported.")
else:
    print("Tensor Cores not available: float16 may be slow or unsupported.")

# tensor cores are supported

CUDA device capability: (7, 5)
Tensor Cores available: fast float16 supported.


In [7]:
def compare_inference_time_different_precissions(model, inputs, device, n_iters=100):
    print("Comparing inference time with different precisions:")

    # FP32
    model.to(device)
    device_fp32_inputs = {k: v.to(device) for k, v in inputs.items()}
    time_lst_fp32 = []
    model.eval()
    with torch.inference_mode():
        for _ in range(n_iters):
            start = time.time()
            _ = model(**device_fp32_inputs)
            end = time.time()
            time_lst_fp32.append(end - start)
    avg_time_fp32 = np.mean(time_lst_fp32)
    std_time_fp32 = np.std(time_lst_fp32)
    print(
        f"(FP32) Average inference time over {n_iters} iterations: {avg_time_fp32:.6f} seconds, std: {std_time_fp32:.6f} seconds"
    )

    # FP16
    model_fp16 = model.half().to(device.type)
    # NOTE
    # For input I don't perform .half() as in input i have indexs_ids and attention_mask. Those value have to be intgerers
    # because otherwise embedding layer from transformer will throw error
    # it's like we want to retrieve embedding for index 5, but with .half() we converted it to 5.0 float and model doesn't handle that
    # same for attention mask, most likely it would be converted to bool_maks, and it also can throw error
    device_fp16_inputs = {k: v.to(device) for k, v in inputs.items()}
    time_lst_fp16 = []
    model_fp16.eval()
    with torch.inference_mode():
        for _ in range(n_iters):
            start = time.time()
            _ = model_fp16(**device_fp16_inputs)
            end = time.time()
            time_lst_fp16.append(end - start)
    avg_time_fp16 = np.mean(time_lst_fp16)
    std_time_fp16 = np.std(time_lst_fp16)
    print(
        f"(FP16) Average inference time over {n_iters} iterations: {avg_time_fp16:.6f} seconds, std: {std_time_fp16:.6f} seconds"
    )

    device_autocast_inputs = {k: v.to(device) for k, v in inputs.items()}
    time_lst_autocast = []
    model.to(device)
    model.eval()
    with torch.inference_mode():
        with torch.amp.autocast(device_type=device.type, enabled=True):
            for _ in range(n_iters):
                start = time.time()
                _ = model(**device_autocast_inputs)
                end = time.time()
                time_lst_autocast.append(end - start)
    avg_time_autocast = np.mean(time_lst_autocast)
    std_time_autocast = np.std(time_lst_autocast)
    print(
        f"(Autocast) Average inference time over {n_iters} iterations: {avg_time_autocast:.6f} seconds, std: {std_time_autocast:.6f} seconds"
    )

    speedup_fp_16 = avg_time_fp32 / avg_time_fp16
    speedup_autocast = avg_time_fp32 / avg_time_autocast
    print(f"Speedup with FP16: {speedup_fp_16:.2f}x")
    print(f"Speedup with Autocast: {speedup_autocast:.2f}x")

In [8]:
compare_inference_time_different_precissions(model, inputs, device, n_iters=100)

Comparing inference time with different precisions:
(FP32) Average inference time over 100 iterations: 0.012918 seconds, std: 0.049498 seconds
(FP16) Average inference time over 100 iterations: 0.009761 seconds, std: 0.018628 seconds
(Autocast) Average inference time over 100 iterations: 0.009600 seconds, std: 0.001223 seconds
Speedup with FP16: 1.32x
Speedup with Autocast: 1.35x


Both half precision and torch Autocast speed up inference by approximately 1.35x. In practice, I prefer `Autocast` mode because it seems slightly faster and safer, as it lets PyTorch handle precision casting automatically.

I also encountered a tricky scenario when using the `.half()` method. I couldn't apply it to my inputs because they were token IDs and attention masks. Since these values must remain integers, mapping them to fp16 threw an error in my model. Therefore, we must carefully consider input types and model requirements when changing numerical precision.

# 6. ONNX

In [86]:
# loading model and tokenizer to make sure there are prepared and no overwritten
model = AutoModel.from_pretrained("sentence-transformers/multi-qa-mpnet-base-cos-v1")
tokenizer = AutoTokenizer.from_pretrained(
    "sentence-transformers/multi-qa-mpnet-base-cos-v1"
)

# exporting model to onnx
device = "cpu"
# Put the model in eval mode and move to CPU
model_cpu = model.eval().cpu()

# Example input for tracking (for onnx export)
sample_input = tokenizer(
    "This is a sample input text for ONNX export.",
    padding=True,
    truncation=True,
    return_tensors="pt",
)

# Export to ONNX format
torch.onnx.export(
    model_cpu,
    (sample_input["input_ids"], sample_input["attention_mask"]),
    "model.onnx",
    opset_version=17,
    input_names=["input_ids", "attention_mask"],
    output_names=["output"],
    dynamic_axes={
        "input_ids": {0: "batch_size", 1: "sequence_length"},
        "attention_mask": {0: "batch_size", 1: "sequence_length"},
        "output": {0: "batch_size"},
    },
)

  torch.onnx.export(


In [87]:
# Make sure that onnx model is saved
!ls 

model.onnx  sample_data


In [88]:
# Load the model
ort_session = ort.InferenceSession("model.onnx")

# Prepare input data
sample_input = tokenizer(
    "This is a sample input text for ONNX inference.",
    padding=True,
    truncation=True,
    return_tensors="np",
)


# Create input dictionary, in same format as during export
inputs_onnx = {
    "input_ids": sample_input["input_ids"],
    "attention_mask": sample_input["attention_mask"],
}

# Run inference
outputs_onnx = ort_session.run(None, inputs_onnx)
print(outputs_onnx)

[array([[[ 0.10015495, -0.24121654, -0.07321592, ...,  0.07796387,
         -0.21866202, -0.2590373 ],
        [ 0.14760065, -0.13897762, -0.14370751, ..., -0.01106242,
         -0.2662107 , -0.12384814],
        [ 0.08314157, -0.223754  , -0.14671417, ..., -0.09309337,
         -0.1952658 , -0.19207902],
        ...,
        [-0.0013935 ,  0.2468841 ,  0.02294915, ...,  0.04206365,
         -0.30782154, -0.23525402],
        [ 0.09198561, -0.18079105,  0.00285578, ...,  0.04924215,
         -0.17577925, -0.2860438 ],
        [ 0.0876145 , -0.20475739, -0.05056114, ...,  0.11682078,
         -0.2020796 , -0.2533966 ]]], dtype=float32), array([[ 1.52873486e-01, -5.95023558e-02, -6.43299222e-02,
         8.02019760e-02, -4.88571338e-02, -2.81683151e-02,
         5.04173413e-02,  8.66596587e-03, -5.80217317e-02,
        -1.00995872e-04,  3.33862961e-03,  3.56380381e-02,
         1.18111536e-01, -7.24004731e-02,  4.40806411e-02,
         1.08006254e-01, -1.30637512e-01,  5.13480194e-02,
  

In [89]:
# Saving optimized onxx model for offline optimization
sess_options = ort.SessionOptions()

# Explicitly set optimization level to ALL (it's setup by default but just to play with it)
sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

# Save the optimized model to this path
sess_options.optimized_model_filepath = "model_optimized.onnx"

# Create InferenceSession, which will perform offline optimization and save the optimized model
# explicitly setup cpu provider as we are testing cpu in that example
ort.InferenceSession("model.onnx", sess_options, providers=["CPUExecutionProvider"])

<onnxruntime.capi.onnxruntime_inference_collection.InferenceSession at 0x7b763eb5ca10>

In [90]:
# Make sure that optimized model is saved
!ls

model.onnx  model_optimized.onnx  sample_data


In [13]:
def measure_onnx_coldsart(
    onnx_model_path: str, inputs, optimization_mode: Literal["online", "offline"]
):
    start = time.time()
    sess_options = ort.SessionOptions()

    if optimization_mode == "online":
        # enable for online mode
        sess_options.graph_optimization_level = (
            ort.GraphOptimizationLevel.ORT_ENABLE_ALL
        )
    else:
        # disable for offline mode
        sess_options.graph_optimization_level = (
            ort.GraphOptimizationLevel.ORT_DISABLE_ALL
        )

    # explicitly setup cpu provider as we are testing cpu in that example
    _ = ort.InferenceSession(
        onnx_model_path, sess_options=sess_options, providers=["CPUExecutionProvider"]
    )

    end = time.time()
    print(
        f"ONNX Runtime coldstart ({optimization_mode} optimization): {end - start:.6f} seconds"
    )

In [92]:
measure_onnx_coldsart("model.onnx", sample_input, optimization_mode="online")
measure_onnx_coldsart("model_optimized.onnx", sample_input, optimization_mode="offline")

ONNX Runtime coldstart (online optimization): 0.978385 seconds
ONNX Runtime coldstart (offline optimization): 0.589654 seconds


As we can observe, cold start for online mode is larger as in that case each time we create `ort.InferenceSession` object we apply optimization on onnx model. There is also some trick I found in description that, for exporting model to `onnx` we use `pt` type tensors but later with inference we are using `np` type tensor. I believe that happens because `onnx` format is framework/language agnostic and we don't want to use pytorch `pt` tensor types (or even we can't).

In [98]:
def measure_onnx_inference_time(
    onnx_model_path,
    inputs,
    optimization_mode: Literal["online", "offline"],
    n_iters=100,
):
    sess_options = ort.SessionOptions()

    if optimization_mode == "online":
        # enable for online mode
        sess_options.graph_optimization_level = (
            ort.GraphOptimizationLevel.ORT_ENABLE_ALL
        )
    else:
        # disable for offline mode
        sess_options.graph_optimization_level = (
            ort.GraphOptimizationLevel.ORT_DISABLE_ALL
        )

    # Create InferenceSession
    # explicitly setup cpu provider as we are testing cpu in that example
    session_cpu = ort.InferenceSession(
        onnx_model_path, sess_options=sess_options, providers=["CPUExecutionProvider"]
    )

    # Prepare inputs for ONNX Runtime
    inputs_onnx = {
        "input_ids": sample_input["input_ids"],
        "attention_mask": sample_input["attention_mask"],
    }

    # Measure inference time
    time_lst = []
    for _ in range(n_iters):
        start = time.time()
        _ = session_cpu.run(None, inputs_onnx)
        end = time.time()
        time_lst.append(end - start)

    avg_time = np.mean(time_lst)
    std_time = np.std(time_lst)
    print(
        f"({optimization_mode}) Average ONNX inference time over {n_iters} iterations: {avg_time:.6f} seconds, std: {std_time:.6f} seconds"
    )
    return avg_time, std_time


def compare_onnx_modes_inference_times(inputs, n_iters=100):
    online_results = measure_onnx_inference_time(
        "model.onnx", inputs, optimization_mode="online", n_iters=n_iters
    )
    offline_results = measure_onnx_inference_time(
        "model_optimized.onnx", inputs, optimization_mode="offline", n_iters=n_iters
    )

    speedup_offline = online_results[0] / offline_results[0]
    print(f"Speedup with offline optimized ONNX model: {speedup_offline:.2f}x")

In [99]:
compare_onnx_modes_inference_times(sample_input, n_iters=1000)

(online) Average ONNX inference time over 1000 iterations: 0.041703 seconds, std: 0.003944 seconds
(offline) Average ONNX inference time over 1000 iterations: 0.041459 seconds, std: 0.003448 seconds
Speedup with offline optimized ONNX model: 1.01x


There are no differences in inference time between `online` mode and `offline` mode. This happens because the difference only occurs during the cold start when the `ort.InferenceSession` object is created. After that, the inference times are the same because both models are equally optimized after the cold start, regardless of whether `online` or `offline` mode was used. In my benchmark, I measure only the inference time without the cold start, as the cold start was measured previously. This explains why no difference appears in this benchmark - the performance difference only shows up during the cold start phase, which was measured separately.

In [10]:
device = "cpu"
model = AutoModel.from_pretrained(
    "sentence-transformers/multi-qa-mpnet-base-cos-v1"
).to(device)
tokenizer = AutoTokenizer.from_pretrained(
    "sentence-transformers/multi-qa-mpnet-base-cos-v1"
)
inputs = tokenizer(
    ["test text for checking outputs"],
    return_tensors="pt",
    truncation=True,
    padding=True,
).to(device)

In [None]:
# saving pytorch compiled model. Onnx ofline optimized model will be used from above execution
model.eval()
compiled_model = torch.compile(model)
with torch.inference_mode():
    # warmup
    _ = compiled_model(**inputs)
torch.save(compiled_model.state_dict(), "compiled_model.pth")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

![image.png](assets/image_sizes.png)

The Docker image for ONNX is lighter, as we don’t need PyTorch for pure ONNX inference.

In [24]:
!docker run --rm pytorch-model  

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Average inference time over 100 runs: 0.033357372283935545 seconds
Stddev of inference time over 100 runs: 0.00849718876790392 seconds


In [27]:
!docker run --rm onnx-model     

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
Average inference time over 100 runs: 0.015625667572021485 seconds
Stddev of inference time over 100 runs: 0.0034382682264247942 seconds


For the Docker test, I used:

- an ONNX model with offline optimization

- a PyTorch compiled model (I didn’t use the quantized version to keep the same model size and weight precision)

The ONNX configuration was faster: average inference time was 0.015 seconds vs. 0.03 seconds for pytorch. I think it is preferable for production to use ONNX because it’s a framework-agnostic approach, allows us to save resources by not explicitly adding PyTorch to the Docker image, and it’s faster.

The ONNX configuration can be found in `Dockerfile.onnx` and `serve_onnx_model.py`.
The PyTorch configuration can be found in `Dockerfile.pytorch` and `serve_pytorch_model.py`.

Please note that the absolute execution times in this case may differ from the previous tests, because the earlier tests were performed in a Google Colab environment, while the Docker-based tests were carried out in a local environment.