In [1]:
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import AutoTokenizer
from pathlib import Path

# Directory that receives the exported ONNX model and the tokenizer files;
# later cells also load the optimized and quantized models from here.
onnx_path = Path("onnx")

# Hub id of the checkpoint that this notebook exports to ONNX.
model_name = "syke9p3/bert-multilabel-tagalog-hate-speech-classifier"


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Load the checkpoint and convert it to ONNX while loading.
# `export=True` replaces the deprecated `from_transformers=True`, which the
# library reports will be removed in optimum 2.0.
# The checkpoint id comes from `model_name` so the model and its tokenizer are
# guaranteed to be loaded from the same repository.
model = ORTModelForSequenceClassification.from_pretrained(model_name, export=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)

The argument `from_transformers` is deprecated, and will be removed in optimum 2.0.  Use `export` instead
Framework not specified. Using pt to export the model.
Using the export variant default. Available variants are:
    - default: The default ONNX variant.

***** Exporting submodel 1/1: BertForSequenceClassification *****
Using framework PyTorch: 2.3.1+cu121
Overriding 1 configuration item(s)
	- use_cache -> False


In [3]:
# save onnx checkpoint and tokenizer
# Both go into the same directory (`onnx_path`) so that later cells can reload
# model files from it. The tokenizer call is the last expression, so its return
# value (the tuple of written file paths) is what the cell displays.
model.save_pretrained(onnx_path)
tokenizer.save_pretrained(onnx_path)

('onnx/tokenizer_config.json',
 'onnx/special_tokens_map.json',
 'onnx/vocab.txt',
 'onnx/added_tokens.json',
 'onnx/tokenizer.json')

In [4]:
from transformers import pipeline

# Baseline run: wrap the exported ONNX model in a text-classification pipeline.
# top_k=None asks for a score for every label rather than only the best one.
classifier = pipeline(
    task="text-classification",
    model=model,
    tokenizer=tokenizer,
    top_k=None,
)
classifier("Parang gago naman tong katolikong arabo")

[[{'label': 'Religion', 'score': 0.8994330167770386},
  {'label': 'Race', 'score': 0.8859398365020752},
  {'label': 'Age', 'score': 0.04573800414800644},
  {'label': 'Physical', 'score': 0.04536525532603264},
  {'label': 'Gender', 'score': 0.03270665556192398},
  {'label': 'Others', 'score': 0.006826996803283691}]]

In [5]:
from optimum.onnxruntime import ORTOptimizer
from optimum.onnxruntime.configuration import OptimizationConfig

# create ORTOptimizer and define optimization configuration
optimizer = ORTOptimizer.from_pretrained(model)
# NOTE: at this level ONNX Runtime warns (see the cell log) that the serialized
# model may contain hardware-specific optimizations and should only be used in
# the environment it was optimized in.
optimization_config = OptimizationConfig(optimization_level=99) # enable all optimizations
 
# apply the optimization configuration to the model
# The optimized graph is written into `onnx_path`, alongside the exported model;
# a later cell loads it as "model_optimized.onnx". The call returns the save
# directory, which is what the cell displays.
optimizer.optimize(
    save_dir=onnx_path,
    optimization_config=optimization_config,
)

Optimizing model...
[0;93m2024-07-17 15:02:59.811315473 [W:onnxruntime:, inference_session.cc:1978 Initialize] Serializing optimized model with Graph Optimization level greater than ORT_ENABLE_EXTENDED and the NchwcTransformer enabled. The generated model may contain hardware specific optimizations, and should only be used in the same environment the model was optimized in.[m
Configuration saved in onnx/ort_config.json
Optimized model saved at: onnx (external data format: False; saved all tensor to one file: True)


PosixPath('onnx')

In [5]:
# Reload the graph written by the optimizer; `file_name` picks it out of the
# ONNX files stored in `onnx_path`.
optimized_model = ORTModelForSequenceClassification.from_pretrained(
    onnx_path,
    file_name="model_optimized.onnx",
)

# Same pipeline settings and same sample sentence as the baseline run, so the
# two outputs can be compared score by score.
optimized_clf = pipeline(
    task="text-classification",
    model=optimized_model,
    tokenizer=tokenizer,
    top_k=None,
)
optimized_clf("Parang gago naman tong katolikong arabo")

[[{'label': 'Religion', 'score': 0.8994330167770386},
  {'label': 'Race', 'score': 0.88593989610672},
  {'label': 'Age', 'score': 0.045737992972135544},
  {'label': 'Physical', 'score': 0.04536525532603264},
  {'label': 'Gender', 'score': 0.03270664066076279},
  {'label': 'Others', 'score': 0.006826999597251415}]]

In [6]:
from optimum.onnxruntime import ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig

# create ORTQuantizer and define quantization configuration
# NOTE(review): the quantizer is built from `model` (the plain export), not from
# `optimized_model` - confirm that quantizing the unoptimized graph is intended.
dynamic_quantizer = ORTQuantizer.from_pretrained(model)
# is_static=False requests dynamic quantization and per_channel=False disables
# channel-wise quantization (the cell log reports "channel-wise: False").
dqconfig = AutoQuantizationConfig.avx2(is_static=False, per_channel=False)

# apply the quantization configuration to the model
# Saving into `onnx_path` also writes ort_config.json there again (see the log),
# the same path the optimization step already wrote its configuration to.
model_quantized_path = dynamic_quantizer.quantize(
    save_dir=onnx_path,
    quantization_config=dqconfig,
)

Creating dynamic quantizer: QOperator (mode: IntegerOps, schema: u8/u8, channel-wise: False)
Quantizing model...
Saving quantized model at: onnx (external data format: False)
Configuration saved in onnx/ort_config.json


In [8]:
import os

# Compare the on-disk footprint of the optimized and the quantized ONNX files.
# Byte counts are divided by 2**20 (= 1024 * 1024) and reported as "MB".
size = os.path.getsize(onnx_path / "model_optimized.onnx") / 2**20
# Despite its name, this holds a file size in MB, not a model object.
quantized_model = os.path.getsize(onnx_path / "model_quantized.onnx") / 2**20

print(
    f"Model file size: {size:.2f} MB",
    f"Quantized Model file size: {quantized_model:.2f} MB",
    sep="\n",
)

Model file size: 481.00 MB
Quantized Model file size: 121.10 MB


In [10]:
# Load the quantized graph from the shared output directory.
model_quantized = ORTModelForSequenceClassification.from_pretrained(
    onnx_path,
    file_name="model_quantized.onnx",
)

# Build the pipeline with the same tokenizer and top_k=None setting used for the
# other models, so every label's score is returned.
q8_clf = pipeline(
    task="text-classification",
    model=model_quantized,
    tokenizer=tokenizer,
    top_k=None,
)


In [13]:
# Smoke-test the quantized pipeline on a sample sentence; the call is the cell's
# last expression, so the per-label scores it returns are displayed.
q8_clf("Putangina naman netong mga arabong bakla")


[[{'label': 'Gender', 'score': 0.9606606364250183},
  {'label': 'Race', 'score': 0.6761333346366882},
  {'label': 'Religion', 'score': 0.08153875172138214},
  {'label': 'Physical', 'score': 0.044747352600097656},
  {'label': 'Age', 'score': 0.033716924488544464},
  {'label': 'Others', 'score': 0.011751459911465645}]]