In [1]:
#!pip install opensearch-py opensearch-py-ml

import os
import sys

# Put the directory three levels above the current working directory on the
# module search path so that packages located there can be imported below.
sys.path.append(os.path.abspath('../../..'))

In [2]:
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings("ignore", message="Unverified HTTPS request")
warnings.filterwarnings("ignore", message="TracerWarning: torch.tensor")
warnings.filterwarnings("ignore", message="using SSL with verify_certs=False is insecure.")

import opensearch_py_ml as oml
from opensearchpy import OpenSearch
from opensearch_py_ml.ml_models import SentenceTransformerModel
# MLCommonClient is used later to register the model with the OpenSearch cluster
from opensearch_py_ml.ml_commons import MLCommonClient

In [3]:
# URL of the OpenSearch cluster this notebook talks to (default for get_os_client).
CLUSTER_URL = "http://localhost:9200"

In [4]:
def get_os_client(cluster_url = CLUSTER_URL,
                  username='admin',
                  password='admin'):
    """
    Build an OpenSearch client for the given cluster.

    :param cluster_url: cluster URL, e.g. https://my-cluster.example.com:443
        (defaults to CLUSTER_URL)
    :param username: user name, passed to the client as part of ``http_auth``
    :param password: password, passed to the client as part of ``http_auth``
    :return: OpenSearch client
    """
    # Certificate verification is switched off (verify_certs=False).
    connection_options = dict(
        hosts=[cluster_url],
        http_auth=(username, password),
        verify_certs=False,
    )
    return OpenSearch(**connection_options)

In [5]:
# Create the OpenSearch client using get_os_client's default URL and credentials
client = get_os_client()

# Connect to ml_common client with OpenSearch client
ml_client = MLCommonClient(client)

In [8]:
def fill_null_truncation_field(
    save_json_folder_path: str,
    max_length: int,
) -> None:
    """
    Make sure ``tokenizer.json`` in the given folder has a ``truncation`` section.

    When the ``truncation`` key is missing or ``null``, it is filled with a
    right-sided, longest-first truncation config and the file is rewritten in
    place. A file that already has a non-null ``truncation`` value is left
    untouched.

    :param save_json_folder_path: folder that contains ``tokenizer.json``
    :param max_length: value stored as ``max_length`` of the truncation config
    :return: None
    :raises OSError: if ``tokenizer.json`` cannot be opened
    """
    tokenizer_file_path = os.path.join(save_json_folder_path, "tokenizer.json")
    # Read and write explicitly as UTF-8 instead of relying on the platform's
    # default encoding, which is not UTF-8 everywhere.
    with open(tokenizer_file_path, encoding="utf-8") as user_file:
        parsed_json = json.load(user_file)

    if "truncation" in parsed_json and parsed_json["truncation"] is not None:
        # Already configured: nothing to do.
        return

    parsed_json["truncation"] = {
        "direction": "Right",
        "max_length": max_length,
        "strategy": "LongestFirst",
        "stride": 0,
    }
    with open(tokenizer_file_path, "w", encoding="utf-8") as file:
        json.dump(parsed_json, file, indent=2)
                
def trace_cross_encoder(
    model_id,
    input_examples,
    folder_path
):
    """
    Trace a sequence-classification (cross-encoder) model to TorchScript and
    zip it together with its ``tokenizer.json``.

    Files written into ``folder_path`` (``<name>`` is the last ``/``-separated
    part of ``model_id``): the model and tokenizer files from
    ``save_pretrained``, the traced model ``<name>.pt`` and the archive
    ``<name>.zip`` containing ``<name>.pt`` and ``tokenizer.json``.
    The tokenized example features and the output paths are printed.

    :param model_id: model identifier handed to ``from_pretrained``
    :param input_examples: example inputs for the tokenizer; they only drive
        the tracing run
    :param folder_path: folder that receives all output files
    :return: None
    :raises ValueError: if the tokenizer has no usable ``model_max_length`` and
        the model config has no ``max_position_embeddings`` to fall back on
    """
    # sentence_transformers.CrossEncoder is not traced directly: doing so failed
    # with "Could not get name of python class object", hence the plain
    # transformers model.
    model = AutoModelForSequenceClassification.from_pretrained(model_id)
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    base_name = model_id.split("/")[-1]
    model_name = base_name + ".pt"
    model_path = os.path.join(folder_path, model_name)
    zip_file_path = os.path.join(folder_path, base_name + ".zip")

    # This constant equals int(1e30); it is treated here as "no real maximum
    # length configured for the tokenizer".
    if tokenizer.model_max_length == 1000000000000000019884624838656:
        # The transformers model has no get_max_seq_length() (that is a
        # SentenceTransformer method), so take the limit from the model config.
        max_positions = getattr(model.config, "max_position_embeddings", None)
        if max_positions is None:
            raise ValueError(
                "model_max_length is not defined by the tokenizer and the model "
                "config has no max_position_embeddings to fall back on"
            )
        tokenizer.model_max_length = max_positions
        print(
            f"The model_max_length is not properly defined in tokenizer_config.json. Setting it to be {tokenizer.model_max_length}"
        )

    model.save_pretrained(folder_path)
    tokenizer.save_pretrained(folder_path)
    fill_null_truncation_field(folder_path, tokenizer.model_max_length)

    device = torch.device("cpu")
    cpu_model = model.to(device)
    features = tokenizer(
        input_examples, return_tensors="pt", padding=True, truncation=True
    ).to(device)

    print(features)

    # Example inputs are bound POSITIONALLY to the model's forward(). For the
    # BERT-style model used in this notebook the order is
    # (input_ids, attention_mask, token_type_ids), so the mask has to come
    # second -- otherwise mask and token types are swapped while tracing.
    compiled_model = torch.jit.trace(
        cpu_model,
        (
            features["input_ids"],
            features["attention_mask"],
            features["token_type_ids"],
        ),
        strict=False,
    )
    torch.jit.save(compiled_model, model_path)
    print("model file is saved to ", model_path)

    # zip model file along with tokenizer.json as output
    with ZipFile(str(zip_file_path), "w") as zipObj:
        zipObj.write(model_path, arcname=model_name)
        zipObj.write(
            os.path.join(folder_path, "tokenizer.json"),
            arcname="tokenizer.json",
        )
    print("zip file is saved to ", zip_file_path, "\n")

In [9]:
import torch
from sentence_transformers import CrossEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import json
from zipfile import ZipFile

# Model to trace and a few dummy text pairs that only serve as example inputs
# for the tracing run.
model_id = "cross-encoder/ms-marco-MiniLM-L-12-v2"
input_examples = [
    ('Query', 'Paragraph1'),
    ('Query', 'Paragraph2'),
    ('Query', 'Paragraph3'),
]
# The model id doubles as the output folder, so the files are written below
# ./cross-encoder/ms-marco-MiniLM-L-12-v2/
trace_cross_encoder(model_id, input_examples, folder_path=model_id)

{'input_ids': tensor([[  101, 23032,   102, 20423,  2487,   102],
        [  101, 23032,   102, 20423,  2475,   102],
        [  101, 23032,   102, 20423,  2509,   102]]), 'token_type_ids': tensor([[0, 0, 0, 1, 1, 1],
        [0, 0, 0, 1, 1, 1],
        [0, 0, 0, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1]])}
model file is saved to  cross-encoder/ms-marco-MiniLM-L-12-v2/ms-marco-MiniLM-L-12-v2.pt
zip file is saved to  cross-encoder/ms-marco-MiniLM-L-12-v2/ms-marco-MiniLM-L-12-v2.zip 



In [10]:
# Load the TorchScript file that trace_cross_encoder wrote in the previous step
loaded_model = torch.jit.load("cross-encoder/ms-marco-MiniLM-L-12-v2/ms-marco-MiniLM-L-12-v2.pt")

In [11]:
# Tokenizer for model_id, used to build the test inputs below
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [12]:
# Two text pairs for a sanity check: a sentence paired with itself and a pair
# of unrelated sentences.
test_sentences = [
    (
        'How many people live in Berlin?',
        'How many people live in Berlin?',
    ),
    (
        'Berlin has a population of 3,520,031 registered inhabitants in an area of 891.82 square kilometers.',
        'New York City is famous for the Metropolitan Museum of Art.',
    ),
]
# Tokenize both pairs into one padded/truncated batch of PyTorch tensors
test_features = tokenizer(
    test_sentences,
    return_tensors="pt",
    padding=True,
    truncation=True,
)
test_features

{'input_ids': tensor([[  101,  2129,  2116,  2111,  2444,  1999,  4068,  1029,   102,  2129,
          2116,  2111,  2444,  1999,  4068,  1029,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  4068,  2038,  1037,  2313,  1997,  1017,  1010, 19611,  1010,
          6021,  2487,  5068,  4864,  1999,  2019,  2181,  1997,  6486,  2487,
          1012,  6445,  2675,  7338,  1012,   102,  2047,  2259,  2103,  2003,
          3297,  2005,  1996,  4956,  2688,  1997,  2396,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [13]:
# Score the test pairs with the reloaded TorchScript model; the tokenizer
# outputs are passed as keyword arguments
pt_prediction = loaded_model(**test_features)
pt_prediction

{'logits': tensor([[ 4.7880],
         [-9.6987]], grad_fn=<AddmmBackward0>)}

In [14]:
# Reference: score the same pairs with sentence-transformers' CrossEncoder
from sentence_transformers import CrossEncoder
model = CrossEncoder(model_id)
# NOTE(review): despite its name, this holds the result of predict() for the
# test pairs (one value per pair in the recorded output), not embeddings
original_embedding = model.predict(test_sentences)
original_embedding

array([ 4.7880187, -9.698673 ], dtype=float32)

In [15]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Second reference: run the plain transformers sequence-classification model
# on the test pairs and print its logits.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id)
model.eval()

features = tokenizer(
    test_sentences,
    return_tensors="pt",
    padding=True,
    truncation=True,
)

# No gradients are needed for inference
with torch.no_grad():
    scores = model(**features).logits
print(scores)

tensor([[ 4.7880],
        [-9.6987]])


In [28]:
def trace_cross_encoder_onnx(
    model_id,
    folder_path,
    opset=15,
    pipeline_name="feature-extraction",
):
    """
    Export a model to ONNX and zip it together with its ``tokenizer.json``.

    Files written (``<name>`` is the last ``/``-separated part of
    ``model_id``): the model and tokenizer files from ``save_pretrained`` in
    ``folder_path``, the ONNX model ``<folder_path>/onnx/<name>.onnx`` and the
    archive ``<folder_path>/<name>.zip`` containing ``<name>.onnx`` and
    ``tokenizer.json``. The output paths are printed.

    NOTE(review): with the default ``pipeline_name`` the recorded output of this
    notebook reports that the classifier weights are not used (a ``BertModel``
    is loaded), i.e. the exported graph has no classification head. Passing
    ``pipeline_name="sentiment-analysis"`` presumably exports the
    sequence-classification model instead -- TODO confirm. The default is kept
    so existing callers see unchanged behavior.

    :param model_id: model identifier handed to ``from_pretrained`` and to
        ``convert``
    :param folder_path: folder that receives all output files
    :param opset: ONNX opset version passed to ``convert``
    :param pipeline_name: pipeline whose model ``convert`` exports
    :return: None
    :raises ValueError: if the tokenizer has no usable ``model_max_length`` and
        the model config has no ``max_position_embeddings`` to fall back on
    """
    # sentence_transformers.CrossEncoder is not used directly: it failed with
    # "Could not get name of python class object".
    model = AutoModelForSequenceClassification.from_pretrained(model_id)
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    base_name = model_id.split("/")[-1]
    model_name = base_name + ".onnx"
    # The model goes into its own "onnx" sub-folder.
    # NOTE(review): convert() appears to refuse a non-empty output folder, which
    # would make a second run fail -- confirm.
    model_path = os.path.join(folder_path, "onnx", model_name)
    zip_file_path = os.path.join(folder_path, base_name + ".zip")

    # This constant equals int(1e30); it is treated here as "no real maximum
    # length configured for the tokenizer".
    if tokenizer.model_max_length == 1000000000000000019884624838656:
        # The transformers model has no get_max_seq_length() (that is a
        # SentenceTransformer method), so take the limit from the model config.
        max_positions = getattr(model.config, "max_position_embeddings", None)
        if max_positions is None:
            raise ValueError(
                "model_max_length is not defined by the tokenizer and the model "
                "config has no max_position_embeddings to fall back on"
            )
        tokenizer.model_max_length = max_positions
        print(
            f"The model_max_length is not properly defined in tokenizer_config.json. Setting it to be {tokenizer.model_max_length}"
        )

    model.save_pretrained(folder_path)
    tokenizer.save_pretrained(folder_path)
    fill_null_truncation_field(folder_path, tokenizer.model_max_length)

    convert(
        framework="pt",
        model=model_id,
        output=Path(model_path),
        opset=opset,
        pipeline_name=pipeline_name,
    )
    print("model file is saved to ", model_path)

    # zip model file along with tokenizer.json as output
    with ZipFile(str(zip_file_path), "w") as zipObj:
        zipObj.write(model_path, arcname=model_name)
        zipObj.write(
            os.path.join(folder_path, "tokenizer.json"),
            arcname="tokenizer.json",
        )
    print("zip file is saved to ", zip_file_path, "\n")

In [29]:
from transformers.convert_graph_to_onnx import convert
from pathlib import Path
# Export to ONNX with 'onnx' as the output folder; the model file itself ends
# up in onnx/onnx/ because the function adds an "onnx" sub-folder
trace_cross_encoder_onnx(model_id, 'onnx')

ONNX opset version set to: 15
Loading pipeline (model: cross-encoder/ms-marco-MiniLM-L-12-v2, tokenizer: cross-encoder/ms-marco-MiniLM-L-12-v2)


Some weights of the model checkpoint at cross-encoder/ms-marco-MiniLM-L-12-v2 were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Creating folder onnx/onnx
Using framework PyTorch: 2.0.1
Found input input_ids with shape: {0: 'batch', 1: 'sequence'}
Found input token_type_ids with shape: {0: 'batch', 1: 'sequence'}
Found input attention_mask with shape: {0: 'batch', 1: 'sequence'}
Found output output_0 with shape: {0: 'batch', 1: 'sequence'}
Found output output_1 with shape: {0: 'batch'}
Ensuring inputs are in correct order
position_ids is not present in the generated input list.
Generated inputs order: ['input_ids', 'attention_mask', 'token_type_ids']
verbose: False, log level: Level.ERROR

model file is saved to  onnx/onnx/ms-marco-MiniLM-L-12-v2.onnx
zip file is saved to  onnx/ms-marco-MiniLM-L-12-v2.zip 



In [30]:
# Display the tokenized test pairs that are fed to ONNX Runtime below
features

{'input_ids': tensor([[  101,  2129,  2116,  2111,  2444,  1999,  4068,  1029,   102,  2129,
          2116,  2111,  2444,  1999,  4068,  1029,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  4068,  2038,  1037,  2313,  1997,  1017,  1010, 19611,  1010,
          6021,  2487,  5068,  4864,  1999,  2019,  2181,  1997,  6486,  2487,
          1012,  6445,  2675,  7338,  1012,   102,  2047,  2259,  2103,  2003,
          3297,  2005,  1996,  4956,  2688,  1997,  2396,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [35]:
from os import environ
from psutil import cpu_count
from onnxruntime import InferenceSession, SessionOptions, get_all_providers

# OpenMP settings: thread count = number of logical CPUs, wait policy ACTIVE.
# NOTE(review): these are set after onnxruntime has already been imported in
# this cell -- confirm that they still take effect.
environ["OMP_NUM_THREADS"] = str(cpu_count(logical=True))
environ["OMP_WAIT_POLICY"] = 'ACTIVE'

# Open the exported ONNX model with the CPU execution provider only
ort_session = InferenceSession("onnx/onnx/ms-marco-MiniLM-L-12-v2.onnx", providers=["CPUExecutionProvider"])

def to_numpy(tensor):
    """
    Convert a torch tensor to a NumPy array on the CPU.

    A tensor that requires grad is detached first, because such a tensor
    cannot be converted to NumPy directly.

    :param tensor: torch tensor to convert
    :return: NumPy array with the tensor's values
    """
    if tensor.requires_grad:
        tensor = tensor.detach()
    return tensor.cpu().numpy()

# Turn every tokenizer output into a NumPy array keyed by its input name,
# then run the ONNX session; None as first argument asks for all outputs.
ort_inputs = {name: to_numpy(tensor) for name, tensor in features.items()}
ort_outs = ort_session.run(None, ort_inputs)

In [36]:
# Display the NumPy inputs that were passed to the ONNX session
ort_inputs

{'input_ids': array([[  101,  2129,  2116,  2111,  2444,  1999,  4068,  1029,   102,
          2129,  2116,  2111,  2444,  1999,  4068,  1029,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [  101,  4068,  2038,  1037,  2313,  1997,  1017,  1010, 19611,
          1010,  6021,  2487,  5068,  4864,  1999,  2019,  2181,  1997,
          6486,  2487,  1012,  6445,  2675,  7338,  1012,   102,  2047,
          2259,  2103,  2003,  3297,  2005,  1996,  4956,  2688,  1997,
          2396,  1012,   102]]),
 'token_type_ids': array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 'attention_mask': array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [42]:
# Shape of the input_ids array passed to the ONNX session
ort_inputs['input_ids'].shape

(2, 39)

In [37]:
# Number of outputs returned by the ONNX session and the shape of the first one
print(len(ort_outs))
print(ort_outs[0].shape)

2
(2, 39, 384)


In [47]:
# First row of the first batch item of the first ONNX output
ort_outs[0][0][0,:]

0.002038736