# Experiment Notebook
Load .onnx and Verify Embedding without ML-Commons API to see if the problem is with ML-Commons API or the .onnx file itself

In [1]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join('../../..')))

In [2]:
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings("ignore", message="Unverified HTTPS request")
warnings.filterwarnings("ignore", message="TracerWarning: torch.tensor")
warnings.filterwarnings("ignore", message="using SSL with verify_certs=False is insecure.")

import opensearch_py_ml as oml
from opensearchpy import OpenSearch
from opensearch_py_ml.ml_models import SentenceTransformerModel
# import mlcommon to later register the model to OpenSearch Cluster
from opensearch_py_ml.ml_commons import MLCommonClient

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
CLUSTER_URL = 'https://localhost:9200'

In [4]:
def get_os_client(cluster_url = CLUSTER_URL,
                  username='admin',
                  password='admin'):
    '''
    Get OpenSearch client
    :param cluster_url: cluster URL like https://ml-te-netwo-1s12ba42br23v-ff1736fa7db98ff2.elb.us-west-2.amazonaws.com:443
    :return: OpenSearch client
    '''
    client = OpenSearch(
        hosts=[cluster_url],
        http_auth=(username, password),
        verify_certs=False
    )
    return client 

In [5]:
client = get_os_client()

# Connect to ml_common client with OpenSearch client
ml_client = MLCommonClient(client)



## Trace the Model in Onnx Using save_as_onnx
See `opensearch_py_ml/ml_models/sentencetransformermodel.py`

In [6]:
model_id = "sentence-transformers/multi-qa-mpnet-base-cos-v1"
folder_path='sentence-transformers-onxx/multi-qa-mpnet-base-cos-v1'
model_name = str(model_id.split("/")[-1] + ".onnx")
model_path = os.path.join(folder_path, "onnx", model_name)

In [8]:
# Case I: Initiate SentenceTransformerModel and Call save_as_onnx

pre_trained_model = SentenceTransformerModel(model_id=model_id, folder_path=folder_path, overwrite=True)
model_path_onnx = pre_trained_model.save_as_onnx(model_id=model_id)

Downloading (…)e891a/.gitattributes: 100%|███████████████████████████████████████████████████████████████████████| 737/737 [00:00<00:00, 121kB/s]
Downloading (…)_Pooling/config.json: 100%|██████████████████████████████████████████████████████████████████████| 190/190 [00:00<00:00, 32.1kB/s]
Downloading (…)92a80e891a/README.md: 100%|██████████████████████████████████████████████████████████████████| 9.19k/9.19k [00:00<00:00, 4.56MB/s]
Downloading (…)a80e891a/config.json: 100%|███████████████████████████████████████████████████████████████████████| 571/571 [00:00<00:00, 316kB/s]
Downloading (…)ce_transformers.json: 100%|██████████████████████████████████████████████████████████████████████| 116/116 [00:00<00:00, 66.8kB/s]
Downloading (…)91a/data_config.json: 100%|██████████████████████████████████████████████████████████████████| 25.5k/25.5k [00:00<00:00, 12.5MB/s]
Downloading pytorch_model.bin: 100%|███████████████████████████████████████████████████████████████████████████| 438M/438M [

ONNX opset version set to: 15
Loading pipeline (model: sentence-transformers/multi-qa-mpnet-base-cos-v1, tokenizer: sentence-transformers/multi-qa-mpnet-base-cos-v1)


Downloading (…)lve/main/config.json: 100%|███████████████████████████████████████████████████████████████████████| 571/571 [00:00<00:00, 332kB/s]
Downloading pytorch_model.bin: 100%|███████████████████████████████████████████████████████████████████████████| 438M/438M [00:01<00:00, 302MB/s]
Downloading (…)okenizer_config.json: 100%|███████████████████████████████████████████████████████████████████████| 363/363 [00:00<00:00, 203kB/s]
Downloading (…)solve/main/vocab.txt: 100%|████████████████████████████████████████████████████████████████████| 232k/232k [00:00<00:00, 3.16MB/s]
Downloading (…)/main/tokenizer.json: 100%|████████████████████████████████████████████████████████████████████| 466k/466k [00:00<00:00, 6.32MB/s]
Downloading (…)cial_tokens_map.json: 100%|███████████████████████████████████████████████████████████████████████| 239/239 [00:00<00:00, 133kB/s]


Creating folder sentence-transformers-onxx/multi-qa-mpnet-base-cos-v1/onnx
Using framework PyTorch: 1.13.1+cu117
Found input input_ids with shape: {0: 'batch', 1: 'sequence'}
Found input attention_mask with shape: {0: 'batch', 1: 'sequence'}
Found output output_0 with shape: {0: 'batch', 1: 'sequence'}
Found output output_1 with shape: {0: 'batch'}
Ensuring inputs are in correct order
position_ids is not present in the generated input list.
Generated inputs order: ['input_ids', 'attention_mask']
model file is saved to  sentence-transformers-onxx/multi-qa-mpnet-base-cos-v1/onnx/multi-qa-mpnet-base-cos-v1.onnx
zip file is saved to  sentence-transformers-onxx/multi-qa-mpnet-base-cos-v1/multi-qa-mpnet-base-cos-v1.zip 



In [8]:
# Case II: Repeat what save_as_onnx function does

# from transformers.convert_graph_to_onnx import convert
# from pathlib import Path

# model = SentenceTransformer(model_id)
# folder_path='sentence-transformers-onxx/distiluse-base-multilingual-cased-v1'

# model_name = str(model_id.split("/")[-1] + ".onnx")

# model_path = os.path.join(folder_path, "onnx", model_name)
        
# convert(
#     framework="pt",
#     model=model_id,
#     output=Path(model_path),
#     opset=15,
# )

In [9]:
# Case III: Already run demo_tracing_model_torch_script_onnx_dense notebook 

# Skip to next step since we already have .onnx at model_path

## Load Onnx Model to Check Our .onnx file

In [9]:
import onnx

onnx_model = onnx.load(model_path)

# Check that the model is well formed
onnx.checker.check_model(onnx_model)

In [10]:
# Print a human readable representation of the graph
# print(onnx.helper.printable_graph(onnx_model.graph))

## Verify Embedidngs

In [11]:
import onnxruntime as ort

ort_session = ort.InferenceSession(model_path)

In [12]:
from transformers import AutoTokenizer

input_sentences = ["first sentence", "second sentence", "very very long random sentence for testing"]
autotokenizer = AutoTokenizer.from_pretrained(model_id)
auto_features = autotokenizer(
            input_sentences, return_tensors="pt", padding=True, truncation=True
        )

In [13]:
autotokenizer

MPNetTokenizerFast(name_or_path='sentence-transformers/multi-qa-mpnet-base-cos-v1', vocab_size=30527, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '[UNK]', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)}, clean_up_tokenization_spaces=True)

In [14]:
auto_features

{'input_ids': tensor([[   0, 2038, 6255,    2,    1,    1,    1,    1,    1],
        [   0, 2121, 6255,    2,    1,    1,    1,    1,    1],
        [   0, 2204, 2204, 2150, 6725, 6255, 2009, 5608,    2]]), 'attention_mask': tensor([[1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [15]:
def to_numpy(tensor):
    return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()

# compute ONNX Runtime output prediction
ort_inputs = {k: v.cpu().detach().numpy() for k, v in auto_features.items()}
ort_outs = ort_session.run(None, ort_inputs)

In [35]:
ort_inputs

{'input_ids': array([[   0, 2038, 6255,    2,    1,    1,    1,    1,    1],
        [   0, 2121, 6255,    2,    1,    1,    1,    1,    1],
        [   0, 2204, 2204, 2150, 6725, 6255, 2009, 5608,    2]]),
 'attention_mask': array([[1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1]])}

# Wrong Output Shape

In [16]:
len(ort_outs)

2

In [17]:
len(ort_outs[0])

3

In [18]:
ort_outs[0][0].shape

(9, 768)

In [19]:
len(ort_outs[1])

3

In [20]:
ort_outs[1][0].shape

(768,)

In [21]:
import numpy as np

from sentence_transformers import SentenceTransformer

original_pre_trained_model = SentenceTransformer(model_id) # From Huggingface
original_embedding_data = list(
    original_pre_trained_model.encode(input_sentences, convert_to_numpy=True)
)

In [22]:
embedding_data_onnx = [
            ort_outs[1][i]
            for i in range(len(input_sentences))
        ]

In [23]:
for i in range(len(input_sentences)):
    print(i)
    print(np.testing.assert_allclose(original_embedding_data[i], embedding_data_onnx[i], rtol=1e-03, atol=1e-05))

0


AssertionError: 
Not equal to tolerance rtol=0.001, atol=1e-05

Mismatched elements: 768 / 768 (100%)
Max absolute difference: 0.26189557
Max relative difference: 238.45244
 x: array([ 0.084509, -0.025992, -0.011085,  0.018524, -0.006226,  0.0201  ,
       -0.028214, -0.038654, -0.023896,  0.022964,  0.077998, -0.024981,
        0.014886,  0.003473,  0.013484,  0.00244 ,  0.016318,  0.024735,...
 y: array([ 9.101760e-02,  4.704750e-02, -1.231821e-01, -8.055858e-03,
       -5.737207e-02,  9.479519e-02,  1.622807e-02, -2.144495e-02,
        6.364862e-02, -6.209429e-02,  1.010141e-02,  1.183928e-01,...

## More Info: convert function
https://github.com/huggingface/transformers/blob/main/src/transformers/convert_graph_to_onnx.py#L387

In [24]:
# def convert(
#     framework: str,
#     model: str,
#     output: Path,
#     opset: int,
#     tokenizer: Optional[str] = None,
#     use_external_format: bool = False,
#     pipeline_name: str = "feature-extraction",
#     **model_kwargs,
# ):
#     """
#     Convert the pipeline object to the ONNX Intermediate Representation (IR) format

#     Args:
#         framework: The framework the pipeline is backed by ("pt" or "tf")
#         model: The name of the model to load for the pipeline
#         output: The path where the ONNX graph will be stored
#         opset: The actual version of the ONNX operator set to use
#         tokenizer: The name of the model to load for the pipeline, default to the model's name if not provided
#         use_external_format:
#             Split the model definition from its parameters to allow model bigger than 2GB (PyTorch only)
#         pipeline_name: The kind of pipeline to instantiate (ner, question-answering, etc.)
#         model_kwargs: Keyword arguments to be forwarded to the model constructor

#     Returns:

#     """
#     warnings.warn(
#         "The `transformers.convert_graph_to_onnx` package is deprecated and will be removed in version 5 of"
#         " Transformers",
#         FutureWarning,
#     )
#     print(f"ONNX opset version set to: {opset}")

#     # Load the pipeline
#     nlp = load_graph_from_args(pipeline_name, framework, model, tokenizer, **model_kwargs)

#     if not output.parent.exists():
#         print(f"Creating folder {output.parent}")
#         makedirs(output.parent.as_posix())
#     elif len(listdir(output.parent.as_posix())) > 0:
#         raise Exception(f"Folder {output.parent.as_posix()} is not empty, aborting conversion")

#     # Export the graph
#     if framework == "pt":
#         convert_pytorch(nlp, opset, output, use_external_format)
#     else:
#         convert_tensorflow(nlp, opset, output)

## More Info: Output

In [25]:
onnx_model.graph.output

[name: "output_0"
type {
  tensor_type {
    elem_type: 1
    shape {
      dim {
        dim_param: "batch"
      }
      dim {
        dim_param: "sequence"
      }
      dim {
        dim_value: 768
      }
    }
  }
}
, name: "output_1"
type {
  tensor_type {
    elem_type: 1
    shape {
      dim {
        dim_param: "batch"
      }
      dim {
        dim_value: 768
      }
    }
  }
}
]

In [26]:
onnx_model.graph.input

[name: "input_ids"
type {
  tensor_type {
    elem_type: 7
    shape {
      dim {
        dim_param: "batch"
      }
      dim {
        dim_param: "sequence"
      }
    }
  }
}
, name: "attention_mask"
type {
  tensor_type {
    elem_type: 7
    shape {
      dim {
        dim_param: "batch"
      }
      dim {
        dim_param: "sequence"
      }
    }
  }
}
]

## More Info: Modules (convert function calls load_graph_from_args)

In [27]:
from transformers.convert_graph_to_onnx import load_graph_from_args
nlp = load_graph_from_args("feature-extraction", "pt", model_id, None)

Loading pipeline (model: sentence-transformers/multi-qa-mpnet-base-cos-v1, tokenizer: sentence-transformers/multi-qa-mpnet-base-cos-v1)


In [28]:
nlp.model.modules

<bound method Module.modules of MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0): MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): Linear(in_features=768, out_features=768, bias=True)
            (k): Linear(in_features=768, out_features=768, bias=True)
            (v): Linear(in_features=768, out_features=768, bias=True)
            (o): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (intermediate): MPNetIntermediate(
     

In [29]:
# https://huggingface.co/docs/transformers/serialization
# https://github.com/oborchers/sentence-transformers/blob/master/examples/onnx_inference/onnx_inference.ipynb
# https://github.com/UKPLab/sentence-transformers/pull/668

## Compare with Embedding from ML-Commons API

In [32]:
pre_trained_model.make_model_config_json(model_format='ONNX')
model_config_path_onnx = 'sentence-transformers-onxx/multi-qa-mpnet-base-cos-v1/ml-commons_model_config.json'
ml_client.register_model(model_path_onnx, model_config_path_onnx, isVerbose=True)

ml-commons_model_config.json file is saved at :  sentence-transformers-onxx/multi-qa-mpnet-base-cos-v1/ml-commons_model_config.json
Total number of chunks 44
Sha1 value of the model file:  6bc4472985de4ddbbc219e77c9d02d84c79aa00a3beb0000b7e73eecafe44518
Model meta data was created successfully. Model Id:  XJIi54kBqzTvNQeAz1gs
uploading chunk 1 of 44
Model id: {'status': 'Uploaded'}
uploading chunk 2 of 44
Model id: {'status': 'Uploaded'}
uploading chunk 3 of 44
Model id: {'status': 'Uploaded'}
uploading chunk 4 of 44
Model id: {'status': 'Uploaded'}
uploading chunk 5 of 44
Model id: {'status': 'Uploaded'}
uploading chunk 6 of 44
Model id: {'status': 'Uploaded'}
uploading chunk 7 of 44
Model id: {'status': 'Uploaded'}
uploading chunk 8 of 44
Model id: {'status': 'Uploaded'}
uploading chunk 9 of 44
Model id: {'status': 'Uploaded'}
uploading chunk 10 of 44
Model id: {'status': 'Uploaded'}
uploading chunk 11 of 44
Model id: {'status': 'Uploaded'}
uploading chunk 12 of 44
Model id: {'status

'XJIi54kBqzTvNQeAz1gs'

In [33]:
ml_commons_embedding_output_onnx = ml_client.generate_embedding("XJIi54kBqzTvNQeAz1gs", input_sentences)
ml_commons_embedding_data_onnx = [
            ml_commons_embedding_output_onnx["inference_results"][i]["output"][0]["data"]
            for i in range(len(input_sentences))
        ]

In [34]:
for i in range(len(input_sentences)):
    print(i)
    print(np.testing.assert_allclose(ml_commons_embedding_data_onnx[i], embedding_data_onnx[i], rtol=1e-03, atol=1e-05))

0


AssertionError: 
Not equal to tolerance rtol=0.001, atol=1e-05

Mismatched elements: 768 / 768 (100%)
Max absolute difference: 0.26189553
Max relative difference: 238.45251557
 x: array([ 0.084509, -0.025992, -0.011085,  0.018524, -0.006226,  0.020099,
       -0.028215, -0.038654, -0.023896,  0.022964,  0.077998, -0.024981,
        0.014886,  0.003473,  0.013484,  0.00244 ,  0.016318,  0.024735,...
 y: array([ 9.101760e-02,  4.704750e-02, -1.231821e-01, -8.055858e-03,
       -5.737207e-02,  9.479519e-02,  1.622807e-02, -2.144495e-02,
        6.364862e-02, -6.209429e-02,  1.010141e-02,  1.183928e-01,...

In [None]:
# https://github.com/huggingface/notebooks/blob/main/examples/onnx-export.ipynb