# Experiment Notebook
Load the exported `.onnx` model and verify its embeddings directly, without going through the ML-Commons API, to determine whether the problem lies in the ML-Commons API or in the `.onnx` file itself.

Reference: https://github.com/SidJain1412/sentence-transformers/blob/master/examples/onnx/onnx_example.ipynb

In [1]:
import os
import sys
# Put the directory three levels above the working directory on the import path
# (presumably so the local opensearch_py_ml checkout can be imported -- confirm).
sys.path.append(os.path.abspath('../../..'))

In [2]:
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings("ignore", message="Unverified HTTPS request")
warnings.filterwarnings("ignore", message="TracerWarning: torch.tensor")
warnings.filterwarnings("ignore", message="using SSL with verify_certs=False is insecure.")

import opensearch_py_ml as oml
from opensearchpy import OpenSearch
from opensearch_py_ml.ml_models import SentenceTransformerModel
# import mlcommon to later register the model to OpenSearch Cluster
from opensearch_py_ml.ml_commons import MLCommonClient

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Endpoint of the OpenSearch cluster; used as the default `cluster_url` of get_os_client.
CLUSTER_URL = "https://localhost:9200"

In [4]:
def get_os_client(cluster_url = CLUSTER_URL,
                  username='admin',
                  password='admin'):
    '''
    Build an OpenSearch client for a single cluster endpoint.

    Certificate verification is switched off (verify_certs=False) on the
    returned client.

    :param cluster_url: cluster URL, e.g. https://localhost:9200 or a
        load-balancer address ending in :443
    :param username: user name for HTTP basic authentication
    :param password: password for HTTP basic authentication
    :return: OpenSearch client
    '''
    credentials = (username, password)
    return OpenSearch(
        hosts=[cluster_url],
        http_auth=credentials,
        verify_certs=False,
    )

In [5]:
# Create the OpenSearch client with get_os_client's default URL and credentials
client = get_os_client()

# Wrap the OpenSearch client in an ML Commons client
ml_client = MLCommonClient(client)



## Trace the Model in ONNX Using `save_as_onnx`
See `opensearch_py_ml/ml_models/sentencetransformermodel.py`

In [6]:
# Model identifier and the local folder that holds its ONNX export.
model_id = "sentence-transformers/msmarco-distilbert-base-tas-b"
folder_path = 'sentence-transformers-onxx/msmarco-distilbert-base-tas-b'

# The file is named after the last segment of the model ID and lives in the "onnx" subfolder.
model_name = model_id.rpartition("/")[2] + ".onnx"
model_path = os.path.join(folder_path, "onnx", model_name)

In [7]:
# Case I: Initiate SentenceTransformerModel and Call save_as_onnx

# pre_trained_model = SentenceTransformerModel(model_id=model_id, folder_path=folder_path, overwrite=True)
# model_path_onnx = pre_trained_model.save_as_onnx(model_id=model_id)

ONNX opset version set to: 15
Loading pipeline (model: sentence-transformers/msmarco-distilbert-base-tas-b, tokenizer: sentence-transformers/msmarco-distilbert-base-tas-b)
Creating folder sentence-transformers-onxx/msmarco-distilbert-base-tas-b/onnx
Using framework PyTorch: 1.13.1+cu117
Found input input_ids with shape: {0: 'batch', 1: 'sequence'}
Found input attention_mask with shape: {0: 'batch', 1: 'sequence'}
Found output output_0 with shape: {0: 'batch', 1: 'sequence'}
Ensuring inputs are in correct order
head_mask is not present in the generated input list.
Generated inputs order: ['input_ids', 'attention_mask']


  mask, torch.tensor(torch.finfo(scores.dtype).min)


model file is saved to  sentence-transformers-onxx/msmarco-distilbert-base-tas-b/onnx/msmarco-distilbert-base-tas-b.onnx
zip file is saved to  sentence-transformers-onxx/msmarco-distilbert-base-tas-b/msmarco-distilbert-base-tas-b.zip 



In [8]:
# Case II: Repeat what save_as_onnx function does

# from transformers.convert_graph_to_onnx import convert
# from pathlib import Path

# model = SentenceTransformer(model_id)
# folder_path='sentence-transformers-onxx/distiluse-base-multilingual-cased-v1'

# model_name = str(model_id.split("/")[-1] + ".onnx")

# model_path = os.path.join(folder_path, "onnx", model_name)
        
# convert(
#     framework="pt",
#     model=model_id,
#     output=Path(model_path),
#     opset=15,
# )

In [9]:
# Case III: Already run demo_tracing_model_torch_script_onnx_dense notebook 

# Skip to next step since we already have .onnx at model_path

## Creating an ONNX Inference Session

In [10]:
from os import environ
from psutil import cpu_count

# OpenMP environment settings used as a performance optimization for onnxruntime.
# They must be set before onnxruntime is imported, which is why that import
# comes after these assignments rather than at the top of the cell.
# Thread count: the number of logical CPUs reported by psutil.
environ["OMP_NUM_THREADS"] = str(cpu_count(logical=True))
# Wait policy: 'ACTIVE'.
environ["OMP_WAIT_POLICY"] = 'ACTIVE'

from onnxruntime import InferenceSession, SessionOptions, get_all_providers

In [11]:
# Open an inference session on the exported .onnx file, restricted to the CPU execution provider.
ort_session = InferenceSession(model_path, providers=["CPUExecutionProvider"])

## Initialize pooling function to convert model sequence outputs to pooled outputs

In [38]:
# import torch

# def cls_pooling(model_output, attention_mask):
#     return model_output[0][:,0]

from sentence_transformers.models import Pooling
# CLS-token pooling with mean pooling switched off. 768 is the embedding width handed
# to Pooling; it agrees with the (3, 9, 768) token-embedding shape shown further down.
pooling_layer = Pooling(768, pooling_mode_cls_token=True, pooling_mode_mean_tokens=False)

## Initialize tokenizer

In [14]:
from transformers import AutoTokenizer

# Sample sentences of different lengths, so the batch needs padding.
input_sentences = [
    "first sentence",
    "second sentence",
    "very very long random sentence for testing",
]

# Tokenizer that belongs to the model being verified.
autotokenizer = AutoTokenizer.from_pretrained(model_id)

# Tokenize the whole batch into padded, truncated PyTorch tensors.
auto_features = autotokenizer(
    input_sentences,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

In [15]:
autotokenizer

DistilBertTokenizerFast(name_or_path='sentence-transformers/msmarco-distilbert-base-tas-b', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [16]:
auto_features

{'input_ids': tensor([[ 101, 2034, 6251,  102,    0,    0,    0,    0,    0],
        [ 101, 2117, 6251,  102,    0,    0,    0,    0,    0],
        [ 101, 2200, 2200, 2146, 6721, 6251, 2005, 5604,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [17]:
def to_numpy(tensor):
    """Return the contents of a torch tensor as a NumPy array on the CPU.

    A tensor that tracks gradients is detached before the conversion;
    any other tensor is converted as-is.
    """
    if tensor.requires_grad:
        tensor = tensor.detach()
    return tensor.cpu().numpy()

# Build the ONNX Runtime input feed: one NumPy array per tokenizer output,
# keyed by input name. The prediction itself is computed later by ort_session.run.
# Uses the to_numpy helper defined above instead of repeating the conversion inline.
ort_inputs = {name: to_numpy(tensor) for name, tensor in auto_features.items()}

In [18]:
ort_inputs

{'input_ids': array([[ 101, 2034, 6251,  102,    0,    0,    0,    0,    0],
        [ 101, 2117, 6251,  102,    0,    0,    0,    0,    0],
        [ 101, 2200, 2200, 2146, 6721, 6251, 2005, 5604,  102]]),
 'attention_mask': array([[1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1]])}

## Get model embedding outputs

In [19]:
# Run the ONNX session on the NumPy inputs; passing None as the first argument
# requests all model outputs (the list has a single entry here, see below).
ort_outs = ort_session.run(None, ort_inputs)

In [20]:
len(ort_outs)

1

In [21]:
len(ort_outs[0])

3

In [22]:
ort_outs[0].shape

(3, 9, 768)

In [39]:
import torch
# Hand the ONNX token embeddings and the attention mask to the pooling layer
# as torch tensors, under the keys 'token_embeddings' and 'attention_mask'.
features = {
    'token_embeddings':  torch.from_numpy(ort_outs[0]),
    'attention_mask': torch.from_numpy(ort_inputs['attention_mask'])
}
# The next cell reads features['sentence_embedding'], a key that is not set above,
# so this call is relied on to add it to `features` in place.
# NOTE(review): as the last expression of the cell it also echoes the whole dict.
pooling_layer.forward(features)
# sentence_embeddings = cls_pooling(ort_outs, ort_inputs['attention_mask'])

{'token_embeddings': tensor([[[ 0.2192, -0.2669, -0.2511,  ...,  0.0324, -0.3444, -0.0308],
          [-0.1659, -0.2614, -0.5352,  ...,  0.2587, -0.1672, -0.2693],
          [ 0.3557, -0.1637, -0.1883,  ...,  0.0900, -0.5482, -0.4730],
          ...,
          [ 0.3443, -0.3381, -0.1119,  ..., -0.0030, -0.3614, -0.1020],
          [ 0.3628, -0.3531, -0.0856,  ..., -0.0109, -0.3690, -0.0990],
          [ 0.3631, -0.3556, -0.1083,  ..., -0.0015, -0.3709, -0.0968]],
 
         [[-0.0738, -0.4391, -0.0976,  ...,  0.0442, -0.1356, -0.3168],
          [ 0.1501, -0.7653, -0.1249,  ...,  0.0637,  0.1336, -0.4225],
          [ 0.1538, -0.2468,  0.0779,  ...,  0.1835, -0.3914, -0.5732],
          ...,
          [ 0.0661, -0.4476,  0.0596,  ...,  0.0086, -0.2125, -0.3663],
          [ 0.0722, -0.4625,  0.0890,  ...,  0.0094, -0.2128, -0.3687],
          [ 0.0680, -0.4786,  0.0622,  ...,  0.0096, -0.2001, -0.3937]],
 
         [[ 0.4373, -0.6149, -0.1181,  ...,  0.2216, -0.3373, -0.0765],
        

In [40]:
# Pooled per-sentence vectors, read from the key the pooling call added to `features`.
sentence_embeddings = features['sentence_embedding']

In [41]:
sentence_embeddings.shape

torch.Size([3, 768])

## Verify Embeddings

In [42]:
import numpy as np

from sentence_transformers import SentenceTransformer

# Reference embeddings: load the same model ID through SentenceTransformer
# (from Hugging Face) and encode the same sentences, one NumPy vector per sentence.
original_pre_trained_model = SentenceTransformer(model_id)
original_embedding_data = [
    *original_pre_trained_model.encode(input_sentences, convert_to_numpy=True)
]

In [43]:
# One ONNX-derived embedding per input sentence, in input order.
embedding_data_onnx = [
    sentence_embeddings[position] for position, _ in enumerate(input_sentences)
]

In [44]:
# Compare each reference embedding with its ONNX counterpart.
# assert_allclose raises AssertionError on a mismatch and returns None otherwise,
# so its return value is not printed; a line is printed only after the check passes.
for i, sentence in enumerate(input_sentences):
    np.testing.assert_allclose(
        original_embedding_data[i], embedding_data_onnx[i], rtol=1e-03, atol=1e-05
    )
    print(f"{i}: ONNX embedding matches the original model for {sentence!r}")

0
None
1
None
2
None
