# Experiment Notebook
Load the `.onnx` file and verify its embeddings without the ML-Commons API, to determine whether the problem lies in the ML-Commons API or in the `.onnx` file itself.

In [1]:
import os
import sys

# Make the directory three levels above the working directory importable.
sys.path.append(os.path.abspath('../../..'))

In [2]:
import warnings
# Silence noisy warnings so the cell outputs stay readable.
# Category-based filters: drop every DeprecationWarning / FutureWarning.
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
# Message-based filters: `message` is a regular expression matched against the
# start of the warning text.
warnings.filterwarnings("ignore", message="Unverified HTTPS request")
# NOTE(review): warning texts normally do not start with the category name
# ("TracerWarning: ..."), so this filter may never match — confirm it has any effect.
warnings.filterwarnings("ignore", message="TracerWarning: torch.tensor")
warnings.filterwarnings("ignore", message="using SSL with verify_certs=False is insecure.")

import opensearch_py_ml as oml
from opensearchpy import OpenSearch
from opensearch_py_ml.ml_models import SentenceTransformerModel
# ML-Commons client; used further down to request embeddings from the OpenSearch cluster
from opensearch_py_ml.ml_commons import MLCommonClient

  from .autonotebook import tqdm as notebook_tqdm


In [34]:
# URL of the OpenSearch cluster this notebook talks to; default for get_os_client()
CLUSTER_URL = 'https://localhost:9200'

In [35]:
def get_os_client(cluster_url=CLUSTER_URL,
                  username='admin',
                  password='admin'):
    '''
    Create an OpenSearch client for the given cluster.

    The client is handed ``(username, password)`` as ``http_auth`` and is
    created with ``verify_certs=False``.

    :param cluster_url: URL of the cluster to connect to, e.g. https://localhost:9200
    :param username: user name passed as the first element of ``http_auth``
    :param password: password passed as the second element of ``http_auth``
    :return: OpenSearch client
    '''
    connection_options = {
        'hosts': [cluster_url],
        'http_auth': (username, password),
        'verify_certs': False,
    }
    return OpenSearch(**connection_options)

In [36]:
# Build the OpenSearch client with get_os_client's default URL and credentials
client = get_os_client()

# Wrap the OpenSearch client in an ML-Commons client
ml_client = MLCommonClient(client)



## Trace the Model in Onnx Using save_as_onnx
See `opensearch_py_ml/ml_models/sentencetransformermodel.py`

In [6]:
# Model under test and the folder that holds its exported files.
model_id = "sentence-transformers/distiluse-base-multilingual-cased-v1"
folder_path = 'sentence-transformers-onxx/distiluse-base-multilingual-cased-v1'
# The .onnx file is named after the last component of the model id.
model_name = model_id.rsplit("/", 1)[-1] + ".onnx"
model_path = os.path.join(folder_path, "onnx", model_name)

In [7]:
# Case I: Initiate SentenceTransformerModel and Call save_as_onnx

# pre_trained_model = SentenceTransformerModel(model_id=model_id, folder_path=folder_path, overwrite=True)
# model_path_onnx = pre_trained_model.save_as_onnx(model_id=model_id)

In [8]:
# Case II: Repeat what save_as_onnx function does

# from transformers.convert_graph_to_onnx import convert
# from pathlib import Path

# model = SentenceTransformer(model_id)
# folder_path='sentence-transformers-onxx/distiluse-base-multilingual-cased-v1'

# model_name = str(model_id.split("/")[-1] + ".onnx")

# model_path = os.path.join(folder_path, "onnx", model_name)
        
# convert(
#     framework="pt",
#     model=model_id,
#     output=Path(model_path),
#     opset=15,
# )

In [9]:
# Case III: Already run demo_tracing_model_torch_script_onnx_dense notebook 

# Skip to next step since we already have .onnx at model_path

## Load the ONNX Model to Check Our .onnx File

In [10]:
import onnx

# Load the exported .onnx file found at model_path
onnx_model = onnx.load(model_path)

# Check that the model is well formed
onnx.checker.check_model(onnx_model)

In [11]:
# Print a human readable representation of the graph
# print(onnx.helper.printable_graph(onnx_model.graph))

## Verify Embeddings

In [12]:
import onnxruntime as ort

# Open an ONNX Runtime inference session on the exported file at model_path
ort_session = ort.InferenceSession(model_path)

In [13]:
from transformers import AutoTokenizer

# Sentences of different lengths, so the batch has to be padded.
input_sentences = [
    "first sentence",
    "second sentence",
    "very very long random sentence for testing",
]

# Tokenize with the tokenizer published alongside the model under test.
autotokenizer = AutoTokenizer.from_pretrained(model_id)
auto_features = autotokenizer(
    input_sentences,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

In [14]:
# Display the tokenizer's repr (class, vocabulary size, max length, special tokens)
autotokenizer

DistilBertTokenizerFast(name_or_path='sentence-transformers/distiluse-base-multilingual-cased-v1', vocab_size=119547, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [15]:
# Display the tokenized batch: the padded `input_ids` and matching `attention_mask`
auto_features

{'input_ids': tensor([[  101, 10422, 49219,   102,     0,     0,     0,     0,     0],
        [  101, 11132, 49219,   102,     0,     0,     0,     0,     0],
        [  101, 12558, 12558, 11695, 61952, 49219, 10142, 38306,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [43]:
def to_numpy(tensor):
    """Return `tensor` as a NumPy array on the CPU.

    Tensors that track gradients are detached first; other tensors are
    converted directly.
    """
    if tensor.requires_grad:
        tensor = tensor.detach()
    return tensor.cpu().numpy()

# Convert every tokenizer output tensor to a NumPy array, keyed by its input name.
ort_inputs = {
    input_name: tensor.cpu().detach().numpy()
    for input_name, tensor in auto_features.items()
}
# Compute the ONNX Runtime output prediction (None is passed for the output names).
ort_outs = ort_session.run(None, ort_inputs)

# Wrong Output Shape

In [44]:
# Number of output tensors returned by the session
len(ort_outs)

1

In [45]:
# Length of the first axis of the only output: one entry per input sentence
len(ort_outs[0])

3

In [46]:
# Shape of the entry for the first sentence: a 2-D array (one 768-wide row per
# token position of the padded batch), not a single sentence vector
ort_outs[0][0].shape

(9, 768)

In [21]:
import numpy as np

from sentence_transformers import SentenceTransformer

original_pre_trained_model = SentenceTransformer(model_id) # From Huggingface
# Reference embeddings from the original (non-ONNX) model, as a list with one
# array per input sentence
original_embedding_data = list(
    original_pre_trained_model.encode(input_sentences, convert_to_numpy=True)
)

In [22]:
# One slice of the first ONNX output per input sentence.
embedding_data_onnx = [
    ort_outs[0][sentence_idx] for sentence_idx, _ in enumerate(input_sentences)
]

In [23]:
# Compare the reference embedding of each sentence with the ONNX output.
# assert_allclose returns None and raises AssertionError on a mismatch, so it is
# called directly instead of being wrapped in print() (which only printed "None"
# for every sentence that passed).
for i in range(len(input_sentences)):
    print(i)
    np.testing.assert_allclose(original_embedding_data[i], embedding_data_onnx[i], rtol=1e-03, atol=1e-05)

0


AssertionError: 
Not equal to tolerance rtol=0.001, atol=1e-05

(shapes (512,), (9, 768) mismatch)
 x: array([ 1.097567e-02,  6.483248e-02, -4.571173e-02,  9.350104e-02,
       -2.485733e-02, -3.051357e-02,  8.830560e-03,  1.258769e-02,
        8.662871e-03, -4.904142e-02,  5.009779e-04, -6.247674e-03,...
 y: array([[-9.318674e-02,  4.319992e-02,  1.432451e-01, ..., -1.102899e-01,
         9.420902e-02, -3.694843e-03],
       [-1.011766e-01,  2.413086e-02,  1.626917e-01, ..., -7.624748e-02,...

## More Info: convert function
https://github.com/huggingface/transformers/blob/main/src/transformers/convert_graph_to_onnx.py#L387

In [24]:
# def convert(
#     framework: str,
#     model: str,
#     output: Path,
#     opset: int,
#     tokenizer: Optional[str] = None,
#     use_external_format: bool = False,
#     pipeline_name: str = "feature-extraction",
#     **model_kwargs,
# ):
#     """
#     Convert the pipeline object to the ONNX Intermediate Representation (IR) format

#     Args:
#         framework: The framework the pipeline is backed by ("pt" or "tf")
#         model: The name of the model to load for the pipeline
#         output: The path where the ONNX graph will be stored
#         opset: The actual version of the ONNX operator set to use
#         tokenizer: The name of the model to load for the pipeline, default to the model's name if not provided
#         use_external_format:
#             Split the model definition from its parameters to allow model bigger than 2GB (PyTorch only)
#         pipeline_name: The kind of pipeline to instantiate (ner, question-answering, etc.)
#         model_kwargs: Keyword arguments to be forwarded to the model constructor

#     Returns:

#     """
#     warnings.warn(
#         "The `transformers.convert_graph_to_onnx` package is deprecated and will be removed in version 5 of"
#         " Transformers",
#         FutureWarning,
#     )
#     print(f"ONNX opset version set to: {opset}")

#     # Load the pipeline
#     nlp = load_graph_from_args(pipeline_name, framework, model, tokenizer, **model_kwargs)

#     if not output.parent.exists():
#         print(f"Creating folder {output.parent}")
#         makedirs(output.parent.as_posix())
#     elif len(listdir(output.parent.as_posix())) > 0:
#         raise Exception(f"Folder {output.parent.as_posix()} is not empty, aborting conversion")

#     # Export the graph
#     if framework == "pt":
#         convert_pytorch(nlp, opset, output, use_external_format)
#     else:
#         convert_tensorflow(nlp, opset, output)

## More Info: Output

In [25]:
# Declared graph outputs: a single tensor `output_0` with dims (batch, sequence, 768)
onnx_model.graph.output

[name: "output_0"
type {
  tensor_type {
    elem_type: 1
    shape {
      dim {
        dim_param: "batch"
      }
      dim {
        dim_param: "sequence"
      }
      dim {
        dim_value: 768
      }
    }
  }
}
]

In [26]:
# Declared graph inputs: `input_ids` and `attention_mask`, each with dims (batch, sequence)
onnx_model.graph.input

[name: "input_ids"
type {
  tensor_type {
    elem_type: 7
    shape {
      dim {
        dim_param: "batch"
      }
      dim {
        dim_param: "sequence"
      }
    }
  }
}
, name: "attention_mask"
type {
  tensor_type {
    elem_type: 7
    shape {
      dim {
        dim_param: "batch"
      }
      dim {
        dim_param: "sequence"
      }
    }
  }
}
]

## More Info: Modules (convert function calls load_graph_from_args)

In [31]:
from transformers.convert_graph_to_onnx import load_graph_from_args
# Load a "feature-extraction" PyTorch pipeline for the model, the way `convert` does
# with its default pipeline_name; tokenizer=None falls back to the model's own name.
nlp = load_graph_from_args("feature-extraction", "pt", model_id, None)

Loading pipeline (model: sentence-transformers/distiluse-base-multilingual-cased-v1, tokenizer: sentence-transformers/distiluse-base-multilingual-cased-v1)


In [32]:
# NOTE: `modules` is referenced here, not called, so the cell displays the bound
# method's repr — which embeds the repr of the pipeline's underlying model.
nlp.model.modules

<bound method Module.modules of DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(119547, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0): TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias

In [37]:
# https://huggingface.co/docs/transformers/serialization
# https://github.com/oborchers/sentence-transformers/blob/master/examples/onnx_inference/onnx_inference.ipynb
# https://github.com/UKPLab/sentence-transformers/pull/668

## Compare with Embedding from ML-Commons API

In [39]:
# "WpL35okBqzTvNQeA6Vit" is a hard-coded model id — presumably the ONNX model already
# registered/deployed in the target cluster; TODO confirm and replace for other clusters.
ml_commons_embedding_output_onnx = ml_client.generate_embedding("WpL35okBqzTvNQeA6Vit", input_sentences)
# Pull the embedding data for each sentence out of the response's inference results
ml_commons_embedding_data_onnx = [
            ml_commons_embedding_output_onnx["inference_results"][i]["output"][0]["data"]
            for i in range(len(input_sentences))
        ]

In [47]:
# Compare the ML-Commons embedding of each sentence with the local ONNX Runtime output.
# assert_allclose returns None and raises AssertionError on a mismatch, so it is
# called directly instead of being wrapped in print() (which only printed "None"
# for every sentence that passed).
for i in range(len(input_sentences)):
    print(i)
    np.testing.assert_allclose(ml_commons_embedding_data_onnx[i], embedding_data_onnx[i], rtol=1e-03, atol=1e-05)

0


AssertionError: 
Not equal to tolerance rtol=0.001, atol=1e-05

(shapes (768,), (9, 768) mismatch)
 x: array([-8.156287e-02,  3.827157e-02,  1.071574e-01, -1.202194e-03,
       -5.243819e-02, -4.310886e-02, -1.101768e-01, -1.975697e-03,
       -5.380076e-03,  9.543222e-02, -2.704400e-02, -6.535825e-02,...
 y: array([[-9.318674e-02,  4.319992e-02,  1.432451e-01, ..., -1.102899e-01,
         9.420902e-02, -3.694843e-03],
       [-1.011766e-01,  2.413086e-02,  1.626917e-01, ..., -7.624748e-02,...

In [None]:
# https://neuml.hashnode.dev/export-and-run-models-with-onnx
# https://colab.research.google.com/github/neuml/txtai/blob/master/examples/18_Export_and_run_models_with_ONNX.ipynb