In [None]:
!pip install torch numpy pandas onnxruntime-gpu onnx transformers optimum evaluate datasets --quiet

In [None]:
# Print the GPU, driver and CUDA version visible to this runtime.
!nvidia-smi

Thu Aug 10 12:40:30 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   62C    P8    11W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import timeit
from copy import deepcopy
from pathlib import Path

import pandas as pd
import torch
from datasets import load_dataset
from evaluate import evaluator
from onnxruntime import InferenceSession
from onnxruntime.transformers.optimizer import optimize_model
from optimum.onnxruntime import ORTModelForSequenceClassification, ORTModelForFeatureExtraction
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import ZeroShotClassificationPipeline

In [None]:
# Hugging Face Hub id of the checkpoint that is loaded, exported to ONNX and
# benchmarked in the cells below.
model_base_id = "amberoad/bert-multilingual-passage-reranking-msmarco"

In [None]:
# Device that the PyTorch model and its input tensors are moved to.
device = "cuda"

In [None]:
# Fetch the tokenizer and the PyTorch checkpoint, move the model to `device`,
# and keep a local copy of the weights for the ONNX export to read from disk.
model_path = Path("models/bert")
tokenizer = AutoTokenizer.from_pretrained(model_base_id)
model_base = AutoModelForSequenceClassification.from_pretrained(model_base_id)
model_base = model_base.to(device)
model_base.save_pretrained(model_path)

In [None]:
# Export the locally saved checkpoint to ONNX and store the result on disk.
# `export=True` replaces the deprecated `from_transformers=True` flag, and the
# source directory is taken from `model_path` instead of repeating the literal.
# NOTE(review): a feature-extraction wrapper is used on a sequence-classification
# checkpoint, so presumably the classification head is not exported — confirm
# this is intended.
model_onnx_path = Path("models/bert_onnx_pruned")
model_onnx = ORTModelForFeatureExtraction.from_pretrained(str(model_path), export=True)
model_onnx.save_pretrained(model_onnx_path)

The argument `from_transformers` is deprecated, and will be removed in optimum 2.0.  Use `export` instead
Framework not specified. Using pt to export to ONNX.
Using framework PyTorch: 2.0.1+cu118
Overriding 1 configuration item(s)
	- use_cache -> False


verbose: False, log level: Level.ERROR



In [None]:
# Run onnxruntime's transformer optimizer (BERT model type, GPU target) on the
# exported graph and write the result next to the original file.
optimized_model = optimize_model(
    input=str(model_onnx_path / "model.onnx"),
    model_type="bert",
    use_gpu=True,
)
optimized_onnx_path = str(model_onnx_path / "optimized.onnx")
optimized_model.save_model_to_file(optimized_onnx_path)

In [None]:
# Convert a deep copy to float16 so that `optimized_model` itself is left
# unconverted, then save the half-precision graph under its own file name.
optimized_fp16_model = deepcopy(optimized_model)
optimized_fp16_model.convert_float_to_float16()
optimized_fp16_model_path = str(model_onnx_path / "optimized_fp16.onnx")
optimized_fp16_model.save_model_to_file(optimized_fp16_model_path)

In [None]:
# One onnxruntime session per saved variant (exported, optimized, optimized
# fp16), each created with the CUDA execution provider requested.
providers = ["CUDAExecutionProvider"]
sess, optimized_sess, optimized_fp16_sess = (
    InferenceSession(str(model_onnx_path / file_name), providers=providers)
    for file_name in ("model.onnx", "optimized.onnx", "optimized_fp16.onnx")
)

In [None]:
import numpy as np

# A single (question, passage) pair tokenized twice: as torch tensors moved to
# `device` for the PyTorch model, and as a plain dict of numpy arrays that is
# passed to the onnxruntime sessions as `input_feed`.
question = "Who was Jim Henson?"
text = "Jim Henson was a nice puppet"
inputs_base = tokenizer(question, text, return_tensors="pt").to(device)
inputs_onnx = {
    key: value
    for key, value in tokenizer(question, text, return_tensors="np").items()
}

In [None]:
def benchmark(f, name="", warmup=10, number=100):
  """Time a zero-argument callable and print its mean latency in milliseconds.

  Args:
    f: Callable invoked with no arguments.
    name: Label printed in front of the timing.
    warmup: Number of untimed calls made before measuring (default 10).
    number: Number of timed calls averaged into the reported figure (default 100).

  Raises:
    ValueError: If `number` is not positive or `warmup` is negative.

  Note:
    Only the wall-clock time spent inside `f()` is measured; a callable that
    launches asynchronous work must wait for it before returning.
  """
  if number <= 0:
    raise ValueError("number must be a positive integer")
  if warmup < 0:
    raise ValueError("warmup must be non-negative")
  # Warm-up calls keep one-off setup costs out of the measurement.
  for _ in range(warmup):
    f()
  seconds_per_iter = timeit.timeit(f, number=number) / number
  print(f"{name}:", f"{seconds_per_iter*1000:.3f} ms")

In [None]:
# %% Test inference times for all variants
def _pytorch_forward():
  # Run without autograd bookkeeping and wait for the queued CUDA kernels, so
  # the timing covers the complete forward pass. CUDA launches are asynchronous;
  # without the synchronize the timer can stop before the GPU has finished.
  with torch.inference_mode():
    model_base(**inputs_base)
  if torch.cuda.is_available():
    torch.cuda.synchronize()

benchmark(_pytorch_forward, "Pytorch")
benchmark(lambda: sess.run(None, input_feed=inputs_onnx), "Pruned ONNX")
benchmark(lambda: optimized_sess.run(None, input_feed=inputs_onnx), "Pruned ONNX optimized")
benchmark(lambda: optimized_fp16_sess.run(None, input_feed=inputs_onnx), "Pruned ONNX optimized fp16")

Pytorch: 13.378 ms
Pruned ONNX: 4.000 ms
Pruned ONNX optimized: 2.725 ms
Pruned ONNX optimized fp16: 1.385 ms


In [None]:
#@title PUSH TO HUB
from huggingface_hub import notebook_login
notebook_login() # requires a Hugging Face access token

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Reload the fp16 ONNX model through optimum and publish it, together with the
# tokenizer, to one Hub repository.
ID = "yuraz28"  # Hub namespace; previously used below without being defined (NameError)
NAME = model_base_id.split('/')[1]+'-onnx-fe-optimized-fp16'
# Build the repository id once so the tokenizer and model cannot diverge.
repo_id = ID + "/" + NAME

tokenizer = AutoTokenizer.from_pretrained(model_base_id)
onnx_model = ORTModelForFeatureExtraction.from_pretrained("models/bert_onnx_pruned/", file_name="optimized_fp16.onnx",use_io_binding=True).to(device)
onnx_model.save_pretrained("a_local_path_for_convert_onnx_model" )

tokenizer.push_to_hub(repo_id)
onnx_model.push_to_hub("a_local_path_for_convert_onnx_model" , repository_id=repo_id, use_auth_token= True)

The ONNX file optimized_fp16.onnx is not a regular name used in optimum.onnxruntime, the ORTModel might not behave as expected.


optimized_fp16.onnx:   0%|          | 0.00/344M [00:00<?, ?B/s]