# Experiment Notebook
Load .onnx and Verify Embedding without ML-Commons API to see if the problem is with ML-Commons API or the .onnx file itself

Reference: https://github.com/SidJain1412/sentence-transformers/blob/master/examples/onnx/onnx_example.ipynb

In [1]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join('../../..')))

In [2]:
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings("ignore", message="Unverified HTTPS request")
warnings.filterwarnings("ignore", message="TracerWarning: torch.tensor")
warnings.filterwarnings("ignore", message="using SSL with verify_certs=False is insecure.")

import opensearch_py_ml as oml
from opensearchpy import OpenSearch
from opensearch_py_ml.ml_models import SentenceTransformerModel
# import mlcommon to later register the model to OpenSearch Cluster
from opensearch_py_ml.ml_commons import MLCommonClient

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
CLUSTER_URL = 'https://localhost:9200'

In [4]:
def get_os_client(cluster_url = CLUSTER_URL,
                  username='admin',
                  password='admin'):
    '''
    Get OpenSearch client
    :param cluster_url: cluster URL like https://ml-te-netwo-1s12ba42br23v-ff1736fa7db98ff2.elb.us-west-2.amazonaws.com:443
    :return: OpenSearch client
    '''
    client = OpenSearch(
        hosts=[cluster_url],
        http_auth=(username, password),
        verify_certs=False
    )
    return client 

In [5]:
client = get_os_client()

# Connect to ml_common client with OpenSearch client
ml_client = MLCommonClient(client)



## Trace the Model in Onnx Using save_as_onnx
See `opensearch_py_ml/ml_models/sentencetransformermodel.py`

In [6]:
model_id = "sentence-transformers/distiluse-base-multilingual-cased-v2"
folder_path='sentence-transformers-onxx/distiluse-base-multilingual-cased-v2'
model_name = str(model_id.split("/")[-1] + ".onnx")
model_path = os.path.join(folder_path, "onnx", model_name)

In [12]:
## Case I: Initiate SentenceTransformerModel and Call save_as_onnx

# pre_trained_model = SentenceTransformerModel(model_id=model_id, folder_path=folder_path, overwrite=True)
# model_path_onnx = pre_trained_model.save_as_onnx(model_id=model_id)

Downloading (…)6015c/.gitattributes: 100%|██████| 690/690 [00:00<00:00, 112kB/s]
Downloading (…)_Pooling/config.json: 100%|█████| 190/190 [00:00<00:00, 32.3kB/s]
Downloading (…)/2_Dense/config.json: 100%|█████| 114/114 [00:00<00:00, 69.7kB/s]
Downloading pytorch_model.bin: 100%|███████| 1.58M/1.58M [00:00<00:00, 29.0MB/s]
Downloading (…)ff6066015c/README.md: 100%|█| 2.38k/2.38k [00:00<00:00, 1.54MB/s]
Downloading (…)6066015c/config.json: 100%|██████| 610/610 [00:00<00:00, 367kB/s]
Downloading (…)ce_transformers.json: 100%|█████| 122/122 [00:00<00:00, 75.2kB/s]
Downloading pytorch_model.bin: 100%|██████████| 539M/539M [00:01<00:00, 293MB/s]
Downloading (…)nce_bert_config.json: 100%|███| 53.0/53.0 [00:00<00:00, 10.1kB/s]
Downloading (…)cial_tokens_map.json: 100%|█████| 112/112 [00:00<00:00, 65.9kB/s]
Downloading (…)6015c/tokenizer.json: 100%|█| 1.96M/1.96M [00:00<00:00, 26.8MB/s]
Downloading (…)okenizer_config.json: 100%|██████| 531/531 [00:00<00:00, 332kB/s]
Downloading (…)ff6066015c/vo

ONNX opset version set to: 15
Loading pipeline (model: sentence-transformers/distiluse-base-multilingual-cased-v2, tokenizer: sentence-transformers/distiluse-base-multilingual-cased-v2)


Downloading (…)lve/main/config.json: 100%|██████| 610/610 [00:00<00:00, 372kB/s]
Downloading pytorch_model.bin: 100%|██████████| 539M/539M [00:01<00:00, 316MB/s]
Downloading (…)okenizer_config.json: 100%|██████| 531/531 [00:00<00:00, 317kB/s]
Downloading (…)solve/main/vocab.txt: 100%|███| 996k/996k [00:00<00:00, 96.3MB/s]
Downloading (…)/main/tokenizer.json: 100%|█| 1.96M/1.96M [00:00<00:00, 8.59MB/s]
Downloading (…)cial_tokens_map.json: 100%|█████| 112/112 [00:00<00:00, 71.5kB/s]
  mask, torch.tensor(torch.finfo(scores.dtype).min)


Creating folder sentence-transformers-onxx/distiluse-base-multilingual-cased-v2/onnx
Using framework PyTorch: 1.13.1+cu117
Found input input_ids with shape: {0: 'batch', 1: 'sequence'}
Found input attention_mask with shape: {0: 'batch', 1: 'sequence'}
Found output output_0 with shape: {0: 'batch', 1: 'sequence'}
Found output output_1 with shape: {0: 'batch', 1: 'sequence'}
Found output output_2 with shape: {0: 'batch', 1: 'sequence'}
Found output output_3 with shape: {0: 'batch', 1: 'sequence'}
Found output output_4 with shape: {0: 'batch', 1: 'sequence'}
Found output output_5 with shape: {0: 'batch', 1: 'sequence'}
Found output output_6 with shape: {0: 'batch', 1: 'sequence'}
Found output output_7 with shape: {0: 'batch', 1: 'sequence'}
Ensuring inputs are in correct order
head_mask is not present in the generated input list.
Generated inputs order: ['input_ids', 'attention_mask']
model file is saved to  sentence-transformers-onxx/distiluse-base-multilingual-cased-v2/onnx/distiluse-ba

In [13]:
## Case II: Repeat what save_as_onnx function does

# from transformers.convert_graph_to_onnx import convert
# from pathlib import Path

# model = SentenceTransformer(model_id)
# folder_path='sentence-transformers-onxx/distiluse-base-multilingual-cased-v1'

# model_name = str(model_id.split("/")[-1] + ".onnx")

# model_path = os.path.join(folder_path, "onnx", model_name)
        
# convert(
#     framework="pt",
#     model=model_id,
#     output=Path(model_path),
#     opset=15,
# )

In [14]:
## Case III: Already run demo_tracing_model_torch_script_onnx_dense notebook 

# Skip to next step since we already have .onnx at model_path

## Creating an ONNX Inference Session

In [15]:
from os import environ
from psutil import cpu_count

# Constants from the performance optimization available in onnxruntime
# It needs to be done before importing onnxruntime
environ["OMP_NUM_THREADS"] = str(cpu_count(logical=True))
environ["OMP_WAIT_POLICY"] = 'ACTIVE'

from onnxruntime import InferenceSession, SessionOptions, get_all_providers

In [16]:
ort_session = InferenceSession(model_path, providers=["CPUExecutionProvider"])

## Initialize pooling function to convert model sequence outputs to pooled outputs

In [17]:
import torch

# def cls_pooling(model_output, attention_mask):
#     return model_output[0][:,0]

def mean_pooling(model_output, attention_mask):
    model_output = torch.from_numpy(model_output[0])
    token_embeddings = model_output #First element of model_output contains all token embeddings
    attention_mask = torch.from_numpy(attention_mask)
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask #, input_mask_expanded, sum_mask

## Initialize tokenizer

In [18]:
from transformers import AutoTokenizer

input_sentences = ["first sentence", "second sentence", "very very long random sentence for testing"]
autotokenizer = AutoTokenizer.from_pretrained(model_id)
auto_features = autotokenizer(
            input_sentences, return_tensors="pt", padding=True, truncation=True
        )

In [19]:
autotokenizer

DistilBertTokenizerFast(name_or_path='sentence-transformers/distiluse-base-multilingual-cased-v2', vocab_size=119547, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [20]:
auto_features

{'input_ids': tensor([[  101, 10422, 49219,   102,     0,     0,     0,     0,     0],
        [  101, 11132, 49219,   102,     0,     0,     0,     0,     0],
        [  101, 12558, 12558, 11695, 61952, 49219, 10142, 38306,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [21]:
def to_numpy(tensor):
    return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()

# compute ONNX Runtime output prediction
ort_inputs = {k: v.cpu().detach().numpy() for k, v in auto_features.items()}

In [22]:
ort_inputs

{'input_ids': array([[  101, 10422, 49219,   102,     0,     0,     0,     0,     0],
        [  101, 11132, 49219,   102,     0,     0,     0,     0,     0],
        [  101, 12558, 12558, 11695, 61952, 49219, 10142, 38306,   102]]),
 'attention_mask': array([[1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1]])}

# Get model embedding outputs

In [23]:
ort_outs = ort_session.run(None, ort_inputs)

In [24]:
len(ort_outs)

8

In [25]:
len(ort_outs[0])

3

In [37]:
ort_outs[0].shape

(3, 9, 768)

In [27]:
intermediate_embeddings = mean_pooling(ort_outs, ort_inputs['attention_mask'])

In [28]:
intermediate_embeddings

tensor([[-0.1320,  0.0142,  0.1433,  ..., -0.0988,  0.0680, -0.0085],
        [-0.0686,  0.0620,  0.0606,  ..., -0.0303,  0.0438, -0.0155],
        [-0.0180, -0.0203,  0.0263,  ...,  0.0434,  0.0379,  0.0609]])

In [29]:
len(intermediate_embeddings)

3

In [30]:
intermediate_embeddings[1].shape

torch.Size([768])

In [60]:
intermediate_embeddings[0]

tensor([-1.3203e-01,  1.4189e-02,  1.4332e-01,  2.1492e-02, -5.9820e-02,
        -8.6129e-02, -1.8077e-01, -2.0854e-02, -8.5716e-03,  9.3090e-02,
        -1.1219e-01, -1.0462e-01,  1.0459e-01,  1.0192e-01, -7.2701e-02,
         3.8279e-02,  8.7345e-02,  1.0205e-01, -2.1158e-02,  3.7020e-02,
        -7.8975e-02,  5.7269e-02, -9.3101e-02,  7.7821e-02,  1.2747e-01,
        -6.2213e-02, -1.9867e-02,  1.1089e-02,  4.6685e-02, -9.2737e-02,
         5.6247e-02, -8.1862e-02,  4.7888e-02,  5.7801e-03,  2.1205e-02,
        -6.7854e-03, -2.9559e-02,  7.7260e-02,  1.5944e-02,  1.1960e-01,
        -5.7907e-03, -3.2759e-02,  6.6717e-02,  9.7194e-02, -3.3541e-02,
         9.4459e-02, -1.6428e-02,  4.2112e-02, -2.7776e-02, -1.1537e-01,
         1.5754e-02,  3.9559e-03, -4.7661e-02,  7.5460e-02, -8.5082e-02,
        -3.3376e-02, -3.6117e-03,  1.0537e-01, -7.6681e-02,  3.6040e-02,
        -5.4378e-02, -7.9316e-03, -8.1346e-02,  5.9338e-03,  7.3633e-02,
         4.7098e-02,  5.8577e-02,  1.1896e-02,  2.1

In [31]:
from torch import nn
def my_dense_layer(intermediate_embeddings, in_features, out_features, bias, activation_func):
    linear_func = nn.Linear(in_features, out_features, bias=bias)
    sentence_embedding = activation_func(linear_func(intermediate_embeddings))
    return sentence_embedding

In [32]:
activation_func = nn.Tanh()

In [33]:
# {"in_features": 768, "out_features": 512, "bias": true, "activation_function": "torch.nn.modules.activation.Tanh"}
sentence_embeddings = my_dense_layer(intermediate_embeddings, 768, 512, True, activation_func)

In [34]:
sentence_embeddings.shape

torch.Size([3, 512])

In [38]:
len(sentence_embeddings)

3

## Verify Embedidngs

In [39]:
import numpy as np

from sentence_transformers import SentenceTransformer

original_pre_trained_model = SentenceTransformer(model_id) # From Huggingface
original_embedding_data = list(
    original_pre_trained_model.encode(input_sentences, convert_to_numpy=True)
)

In [40]:
embedding_data_onnx = [
            sentence_embeddings[i].cpu().detach().numpy()
            for i in range(len(input_sentences))
        ]

In [41]:
for i in range(len(input_sentences)):
    print(i)
    print(np.testing.assert_allclose(original_embedding_data[i], embedding_data_onnx[i], rtol=1e-03, atol=1e-05))

0


AssertionError: 
Not equal to tolerance rtol=0.001, atol=1e-05

Mismatched elements: 512 / 512 (100%)
Max absolute difference: 0.23659697
Max relative difference: 360.91922
 x: array([ 2.924565e-02,  6.141233e-02, -4.720755e-02,  7.542610e-02,
       -1.127940e-02, -2.926224e-02, -9.203134e-04,  7.731898e-03,
        9.389913e-03, -5.170760e-02,  1.561495e-02, -1.805861e-02,...
 y: array([-7.064316e-02, -8.215571e-03, -7.865465e-02,  9.385258e-02,
       -3.384552e-02, -2.568348e-02, -1.472403e-02,  1.271165e-02,
       -1.272434e-01,  2.143593e-02, -3.877740e-02, -1.124453e-01,...

In [42]:
intermediate_embeddings.shape

torch.Size([3, 768])

In [43]:
from sentence_transformers.models import Dense

In [44]:
dense_layer = Dense(768, 512)

In [45]:
feature_out = {'sentence_embedding':intermediate_embeddings}
dense_layer.forward(feature_out)

{'sentence_embedding': tensor([[ 0.0488,  0.0400, -0.0102,  ...,  0.0047, -0.0151,  0.0428],
         [ 0.0909, -0.0186, -0.0149,  ...,  0.0123, -0.0201,  0.0643],
         [ 0.1176,  0.0234, -0.0471,  ..., -0.0152,  0.0131,  0.0464]],
        grad_fn=<TanhBackward0>)}

In [46]:
feature_out

{'sentence_embedding': tensor([[ 0.0488,  0.0400, -0.0102,  ...,  0.0047, -0.0151,  0.0428],
         [ 0.0909, -0.0186, -0.0149,  ...,  0.0123, -0.0201,  0.0643],
         [ 0.1176,  0.0234, -0.0471,  ..., -0.0152,  0.0131,  0.0464]],
        grad_fn=<TanhBackward0>)}

In [47]:
embedding_data_onnx_dense = [
            feature_out['sentence_embedding'][i].cpu().detach().numpy()
            for i in range(len(input_sentences))
        ]

In [48]:
for i in range(len(input_sentences)):
    print(i)
    print(np.testing.assert_allclose(embedding_data_onnx_dense[i], embedding_data_onnx[i], rtol=1e-03, atol=1e-05))

0


AssertionError: 
Not equal to tolerance rtol=0.001, atol=1e-05

Mismatched elements: 511 / 512 (99.8%)
Max absolute difference: 0.2221277
Max relative difference: 516.7122
 x: array([ 4.880410e-02,  3.999550e-02, -1.015490e-02, -4.416445e-02,
       -6.098824e-02,  2.730919e-02, -2.198596e-02, -5.392851e-02,
       -1.930884e-02, -1.697238e-02,  5.076087e-02,  1.768776e-02,...
 y: array([-7.064316e-02, -8.215571e-03, -7.865465e-02,  9.385258e-02,
       -3.384552e-02, -2.568348e-02, -1.472403e-02,  1.271165e-02,
       -1.272434e-01,  2.143593e-02, -3.877740e-02, -1.124453e-01,...

In [49]:
for i in range(len(input_sentences)):
    print(i)
    print(np.testing.assert_allclose(embedding_data_onnx_dense[i], original_embedding_data[i], rtol=1e-03, atol=1e-05))

0


AssertionError: 
Not equal to tolerance rtol=0.001, atol=1e-05

Mismatched elements: 512 / 512 (100%)
Max absolute difference: 0.24952845
Max relative difference: 1191.9329
 x: array([ 4.880410e-02,  3.999550e-02, -1.015490e-02, -4.416445e-02,
       -6.098824e-02,  2.730919e-02, -2.198596e-02, -5.392851e-02,
       -1.930884e-02, -1.697238e-02,  5.076087e-02,  1.768776e-02,...
 y: array([ 2.924565e-02,  6.141233e-02, -4.720755e-02,  7.542610e-02,
       -1.127940e-02, -2.926224e-02, -9.203134e-04,  7.731898e-03,
        9.389913e-03, -5.170760e-02,  1.561495e-02, -1.805861e-02,...

In [50]:
ort_inputs

{'input_ids': array([[  101, 10422, 49219,   102,     0,     0,     0,     0,     0],
        [  101, 11132, 49219,   102,     0,     0,     0,     0,     0],
        [  101, 12558, 12558, 11695, 61952, 49219, 10142, 38306,   102]]),
 'attention_mask': array([[1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [51]:
test_features = {
    'input_ids':  torch.from_numpy(ort_inputs['input_ids']),
    'attention_mask':  torch.from_numpy(ort_inputs['attention_mask']),
    'token_embeddings': torch.from_numpy(ort_outs[0])
}

In [52]:
from sentence_transformers.models import Pooling
pooling_layer = Pooling(768)

In [53]:
pooling_layer.forward(test_features)

{'input_ids': tensor([[  101, 10422, 49219,   102,     0,     0,     0,     0,     0],
         [  101, 11132, 49219,   102,     0,     0,     0,     0,     0],
         [  101, 12558, 12558, 11695, 61952, 49219, 10142, 38306,   102]]),
 'attention_mask': tensor([[1, 1, 1, 1, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 'token_embeddings': tensor([[[-1.8313e-01,  1.6266e-02,  1.4183e-01,  ..., -1.4128e-01,
            1.0959e-01, -3.3882e-02],
          [-2.0680e-01, -7.3514e-04,  2.7081e-01,  ..., -1.0606e-01,
            6.6462e-02,  3.1641e-02],
          [-1.2847e-01,  5.6789e-02,  1.0754e-01,  ..., -4.9264e-02,
            1.1284e-01, -5.2882e-02],
          ...,
          [-1.4783e-01, -4.6210e-02,  1.8090e-02,  ..., -1.6317e-01,
            1.6244e-01,  6.5027e-02],
          [-4.3051e-03,  5.6201e-02,  2.1051e-02,  ..., -1.8682e-01,
            1.6163e-01,  5.4055e-02],
          [-6.2831e-02,  2.9120e-02, -1.0469e-03,  ..., -1.

In [54]:
dense_layer.forward(test_features)

{'input_ids': tensor([[  101, 10422, 49219,   102,     0,     0,     0,     0,     0],
         [  101, 11132, 49219,   102,     0,     0,     0,     0,     0],
         [  101, 12558, 12558, 11695, 61952, 49219, 10142, 38306,   102]]),
 'attention_mask': tensor([[1, 1, 1, 1, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 'token_embeddings': tensor([[[-1.8313e-01,  1.6266e-02,  1.4183e-01,  ..., -1.4128e-01,
            1.0959e-01, -3.3882e-02],
          [-2.0680e-01, -7.3514e-04,  2.7081e-01,  ..., -1.0606e-01,
            6.6462e-02,  3.1641e-02],
          [-1.2847e-01,  5.6789e-02,  1.0754e-01,  ..., -4.9264e-02,
            1.1284e-01, -5.2882e-02],
          ...,
          [-1.4783e-01, -4.6210e-02,  1.8090e-02,  ..., -1.6317e-01,
            1.6244e-01,  6.5027e-02],
          [-4.3051e-03,  5.6201e-02,  2.1051e-02,  ..., -1.8682e-01,
            1.6163e-01,  5.4055e-02],
          [-6.2831e-02,  2.9120e-02, -1.0469e-03,  ..., -1.

In [55]:
np.testing.assert_allclose(original_embedding_data, test_features['sentence_embedding'].cpu().detach().numpy(), rtol=1e-03, atol=1e-05)

AssertionError: 
Not equal to tolerance rtol=0.001, atol=1e-05

Mismatched elements: 1536 / 1536 (100%)
Max absolute difference: 0.24952845
Max relative difference: 1938.1483
 x: array([[ 0.029246,  0.061412, -0.047208, ..., -0.039461,  0.016063,
        -0.03649 ],
       [ 0.004959,  0.037416, -0.027066, ..., -0.030502,  0.031936,...
 y: array([[ 0.048804,  0.039995, -0.010155, ...,  0.00474 , -0.015072,
         0.042837],
       [ 0.090853, -0.018568, -0.014919, ...,  0.012304, -0.020059,...