In [24]:
import json
import os
import re
import torch
from mdutils.fileutils import MarkDownFile
from zipfile import ZipFile
from sentence_transformers import SentenceTransformer
from sentence_transformers.models import Normalize, Pooling, Transformer

In [4]:
# 1. Run the three lines below
# Hugging Face style model identifier ("<organisation>/<model name>").
model_id = "hkunlp/instructor-large"
# Load the model; "cache_folder" is passed as the cache directory, which is
# where step 2 expects to find the pooling config that must be edited.
model = SentenceTransformer(model_id, cache_folder="cache_folder")
# Directory used below for the traced model, the zip archive and the config JSON.
folder_path = "sentence-transformer-torchscript"

In [12]:
# 2. Open cache_folder/hkunlp_instructor-large/1_Pooling/config.json
# and remove the entries "pooling_mode_weightedmean_tokens": false and "pooling_mode_lasttoken": false
# so that the file reads as follows:
# {
#   "word_embedding_dimension": 768,
#   "pooling_mode_cls_token": false,
#   "pooling_mode_mean_tokens": true,
#   "pooling_mode_max_tokens": false,
#   "pooling_mode_mean_sqrt_len_tokens": false
# }

# 3. Run the function below so that you have model_zip_file saved at sentence-transformer-torchscript/instructor-large.zip
def save_as_pt(
    model,
    model_id,
    traced_folder,
    sentences: [str],
) -> str:
    """
    Trace the model to TorchScript and bundle it with its tokenizer.json in a zip file.

    :param model: model object to trace; it must expose ``tokenizer``,
        ``get_max_seq_length()``, ``save()`` and ``to()``
    :param model_id: model identifier; the part after the last "/" names the output files
    :type model_id: string
    :param traced_folder: folder that receives the saved model files, the .pt file and the zip
    :type traced_folder: string
    :param sentences: example sentences that are tokenized and used as tracing input
    :type sentences: List of string
    :return: path of the zip file that was written
    :rtype: string
    """
    base_name = model_id.split("/")[-1]
    pt_file_name = str(base_name + ".pt")
    pt_file_path = os.path.join(traced_folder, pt_file_name)
    archive_path = os.path.join(traced_folder, str(base_name + ".zip"))

    # Cap the tokenizer's limit at the model's own maximum sequence length.
    sequence_limit = model.get_max_seq_length()
    if model.tokenizer.model_max_length > sequence_limit:
        model.tokenizer.model_max_length = sequence_limit
        print(
            f"The model_max_length is not properly defined in tokenizer_config.json. Setting it to be {model.tokenizer.model_max_length}"
        )

    # Saving the model into traced_folder is what puts tokenizer.json there.
    model.save(traced_folder)

    # Tracing is done on the CPU: move the model and the tokenized example batch there.
    cpu = torch.device("cpu")
    model_on_cpu = model.to(cpu)
    encoded = model_on_cpu.tokenizer(
        sentences, return_tensors="pt", padding=True, truncation=True
    ).to(cpu)
    example_inputs = {
        "input_ids": encoded["input_ids"],
        "attention_mask": encoded["attention_mask"],
    }

    traced = torch.jit.trace(model_on_cpu, example_inputs, strict=False)
    torch.jit.save(traced, pt_file_path)
    print("model file is saved to ", pt_file_path)

    # The archive holds exactly two members: the traced model and tokenizer.json.
    archive_members = (
        (pt_file_path, str(pt_file_name)),
        (os.path.join(traced_folder, "tokenizer.json"), "tokenizer.json"),
    )
    with ZipFile(str(archive_path), "w") as archive:
        for source_path, member_name in archive_members:
            archive.write(source_path, arcname=member_name)
    print("zip file is saved to ", archive_path, "\n")
    return archive_path

# Trace the loaded model using two short example sentences as tracing input.
# The value shown below the cell is the returned zip file path.
save_as_pt(model=model, model_id=model_id, traced_folder=folder_path, sentences=["for example providing a small sentence", "we can add multiple sentences"])

model file is saved to  sentence-transformer-torchscript/instructor-large.pt
zip file is saved to  sentence-transformer-torchscript/instructor-large.zip 



'sentence-transformer-torchscript/instructor-large.zip'

In [25]:
# 4. Run the function below so that you have model_config_json_file saved at sentence-transformer-torchscript/ml-commons_model_config.json
def _get_model_description_from_readme_file(model_id, readme_file_path) -> str:
    """
    Extract a one-line model description from a README.md file.

    The description is the first line of text between the model's top-level
    heading ("# <model_id>" or, failing that, "# <model name>") and the next
    heading. Parenthesised text (link targets), square brackets and asterisks
    are removed, and anything from " For an introduction to" onwards is dropped.

    :param model_id: model identifier, e.g. "hkunlp/instructor-large"
    :type model_id: string
    :param readme_file_path: path of the README.md file to read
    :type readme_file_path: string
    :return: cleaned description text
    :rtype: string
    :raises AssertionError: if the heading or the heading that follows it cannot be found
    """
    readme_data = MarkDownFile.read_file(readme_file_path)

    # Find the description section: try the full model id as heading first,
    # then fall back to the bare model name.
    start_str = f"\n# {model_id}"
    start = readme_data.find(start_str)
    if start == -1:
        # [-1] (as used for the file names in save_as_pt) also works for ids
        # without a "/", where [1] would raise IndexError.
        model_name = model_id.split("/")[-1]
        start_str = f"\n# {model_name}"
        start = readme_data.find(start_str)

    # Only look for the closing heading when the opening one was found.
    end = -1
    if start != -1:
        end = readme_data.find("\n#", start + len(start_str))

    # If we cannot find the scope of description section, raise error.
    # Raised explicitly (not via `assert`) so the check survives `python -O`.
    if start == -1 or end == -1:
        raise AssertionError("Cannot find description in README.md file")

    # Parse out the description section and keep its first line only
    description = readme_data[start + len(start_str) + 1 : end].strip()
    description = description.split("\n")[0]

    # Remove hyperlink and reformat text
    description = re.sub(r"\(.*?\)", "", description)
    description = re.sub(r"[\[\]]", "", description)
    description = re.sub(r"\*", "", description)

    # Remove unnecessary part if exists (i.e. " For an introduction to ...")
    # (Found in https://huggingface.co/sentence-transformers/multi-qa-mpnet-base-dot-v1/blob/main/README.md)
    unnecessary_part = description.find(" For an introduction to")
    if unnecessary_part != -1:
        description = description[:unnecessary_part]

    return description

def _generate_default_model_description(embedding_dimension) -> str:
    """
    Generate default model description of the model based on embedding_dimension

    ::param embedding_dimension: Embedding dimension of the model.
    :type embedding_dimension: int
    :return: Description of the model
    :rtype: string
    """
    print(
            "Using default description from embedding_dimension instead (You can overwrite this by specifying description parameter in make_model_config_json function"
    )
    description = f"This is a sentence-transformers model: It maps sentences & paragraphs to a {embedding_dimension} dimensional dense vector space."
    return description
    
def make_model_config_json(
    model,
    model_id,
    folder_path,
    version_number: str = 1,
    model_format: str = "TORCH_SCRIPT",
    embedding_dimension: int = None,
    pooling_mode: str = None,
    normalize_result: bool = None,
    description: str = None,
    all_config: str = None,
    model_type: str = None,
) -> str:
    """
    Write ``ml-commons_model_config.json`` into ``folder_path`` and return its path.

    Any of model_type, embedding_dimension, pooling_mode and normalize_result
    left as None is derived from ``model``; a missing description is scraped
    from ``folder_path/README.md`` (with a generated default as fallback); a
    missing all_config is loaded from ``folder_path/config.json``.

    :param model: model object; only consulted when one of model_type,
        embedding_dimension, pooling_mode or normalize_result is None. It must
        then provide ``get_sentence_embedding_dimension()`` and ``_modules``.
    :param model_id: model identifier, stored as "name" in the config
    :type model_id: string
    :param folder_path: folder holding config.json / README.md and receiving the output file
    :type folder_path: string
    :param version_number: stored as "version" (note: the default is the integer 1)
    :param model_format: stored as "model_format"
    :type model_format: string
    :param embedding_dimension: embedding dimension; derived from model if None
    :type embedding_dimension: int
    :param pooling_mode: pooling mode; derived from the model's Pooling module if None
    :type pooling_mode: string
    :param normalize_result: whether results are normalized; if None it is True
        when the model contains a Normalize module and False otherwise
    :type normalize_result: bool
    :param description: model description; derived as explained above if None
    :type description: string
    :param all_config: configuration that is passed through ``json.dumps`` into
        "all_config"; loaded from config.json if None
    :param model_type: model type; if None, the lower-cased class name of the
        Transformer module's ``auto_model`` with a trailing "model" removed
    :type model_type: string
    :return: path of the written ml-commons_model_config.json file
    :rtype: string
    :raises Exception: if reading the values from ``model`` fails, or if
        all_config is None and config.json does not exist in folder_path
    """
    config_json_file_path = os.path.join(folder_path, "config.json")
    model_name = model_id

    if (
        model_type is None
        or embedding_dimension is None
        or pooling_mode is None
        or normalize_result is None
    ):
        try:
            if embedding_dimension is None:
                embedding_dimension = model.get_sentence_embedding_dimension()

            for module in model._modules.values():
                if model_type is None and isinstance(module, Transformer):
                    model_type = module.auto_model.__class__.__name__.lower()
                    # Remove the literal "model" suffix only. str.rstrip("model")
                    # strips any trailing run of the characters m/o/d/e/l and
                    # would, for example, turn "xlmmodel" into "xl".
                    if model_type.endswith("model"):
                        model_type = model_type[: -len("model")]
                elif pooling_mode is None and isinstance(module, Pooling):
                    pooling_mode = module.get_pooling_mode_str().upper()
                elif normalize_result is None and isinstance(module, Normalize):
                    normalize_result = True
            # No Normalize module was seen: results are not normalized.
            if normalize_result is None:
                normalize_result = False
        except Exception as e:
            # Chain the original exception so its traceback is not lost.
            raise Exception(
                f"Raised exception while getting model data from pre-trained hugging-face model object: {e}"
            ) from e

    if description is None:
        readme_file_path = os.path.join(folder_path, "README.md")
        if os.path.exists(readme_file_path):
            try:
                description = _get_model_description_from_readme_file(
                    model_id,
                    readme_file_path
                )
            except Exception as e:
                # Best effort: a README that cannot be parsed must not abort the export.
                print(f"Cannot scrape model description from README.md file: {e}")
                description = _generate_default_model_description(
                    embedding_dimension
                )
        else:
            print("Cannot find README.md file to scrape model description")
            description = _generate_default_model_description(
                embedding_dimension
            )

    if all_config is None:
        if not os.path.exists(config_json_file_path):
            raise Exception(
                "Cannot find config.json in "
                + config_json_file_path
                + ". Please check the config.json file in the path."
            )
        try:
            with open(config_json_file_path) as f:
                all_config = json.load(f)
        except IOError:
            # Deliberately non-fatal: all_config stays None and is serialized as "null".
            print(
                "Cannot open in config.json file at ",
                config_json_file_path,
                ". Please check the config.json ",
                "file in the path.",
            )

    model_config_content = {
        "name": model_name,
        "version": version_number,
        "description": description,
        "model_format": model_format,
        "model_task_type": "TEXT_EMBEDDING",
        "model_config": {
            "model_type": model_type,
            "embedding_dimension": embedding_dimension,
            "framework_type": "sentence_transformers",
            "pooling_mode": pooling_mode,
            "normalize_result": normalize_result,
            "all_config": json.dumps(all_config),
        },
    }

    model_config_file_path = os.path.join(
        folder_path, "ml-commons_model_config.json"
    )
    # os.makedirs("") raises, so only create the directory when there is one
    # (folder_path may be empty, meaning the current working directory).
    output_dir = os.path.dirname(model_config_file_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(model_config_file_path, "w") as file:
        json.dump(model_config_content, file)
    print(
        "ml-commons_model_config.json file is saved at : ", model_config_file_path
    )

    return model_config_file_path

# Only the model, its id, the folder and the format are given here; every other
# value is left at None so the function derives it.
# The value shown below the cell is the returned config file path.
make_model_config_json(model=model, model_id=model_id, folder_path=folder_path, model_format='TORCH_SCRIPT')

ml-commons_model_config.json file is saved at :  sentence-transformer-torchscript/ml-commons_model_config.json


'sentence-transformer-torchscript/ml-commons_model_config.json'

In [None]:
# 5. Use 'sentence-transformer-torchscript/instructor-large.zip' and 'sentence-transformer-torchscript/ml-commons_model_config.json' to register the model to OpenSearch cluster

In [32]:
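# # NOTE(review): this cell is a fully commented-out, all-in-one copy of steps 1-5 above;
# # it does nothing when run as-is and is kept for reference only.
# import json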
# import os
# import re
# import torch
# from mdutils.fileutils import MarkDownFile
# from zipfile import ZipFile
# from sentence_transformers import SentenceTransformer
# from sentence_transformers.models import Normalize, Pooling, Transformer


# # 1. Run the three lines below
# model_id = "hkunlp/instructor-large"
# model = SentenceTransformer(model_id, cache_folder="cache_folder")
# folder_path = "sentence-transformer-torchscript"


# # 2. Go to cache_folder/hkunlp_instructor-large/1_Pooling/config.json 
# # and remove "pooling_mode_weightedmean_tokens": false" & "pooling_mode_lasttoken": false
# # as follows:
# # {
# #   "word_embedding_dimension": 768,
# #   "pooling_mode_cls_token": false,
# #   "pooling_mode_mean_tokens": true,
# #   "pooling_mode_max_tokens": false,
# #   "pooling_mode_mean_sqrt_len_tokens": false
# # }


# # 3. Run the function below so that you have model_zip_file saved at sentence-transformer-torchscript/instructor-large.zip
# def save_as_pt(
#     model,
#     model_id,
#     traced_folder,
#     sentences: [str],
# ) -> str:
#     model_name = str(model_id.split("/")[-1] + ".pt")

#     model_path = os.path.join(traced_folder, model_name)
#     save_json_folder_path = traced_folder
#     model_output_path = traced_folder
#     zip_file_name = str(model_id.split("/")[-1] + ".zip")
#     zip_file_path = os.path.join(model_output_path, zip_file_name)

#     if model.tokenizer.model_max_length > model.get_max_seq_length():
#         model.tokenizer.model_max_length = model.get_max_seq_length()
#         print(
#             f"The model_max_length is not properly defined in tokenizer_config.json. Setting it to be {model.tokenizer.model_max_length}"
#         )

#     # save tokenizer.json in save_json_folder_name
#     model.save(save_json_folder_path)


#     # convert to pt format will need to be in cpu,
#     # set the device to cpu, convert its input_ids and attention_mask in cpu and save as .pt format
#     device = torch.device("cpu")
#     cpu_model = model.to(device)
#     features = cpu_model.tokenizer(
#         sentences, return_tensors="pt", padding=True, truncation=True
#     ).to(device)

#     compiled_model = torch.jit.trace(
#         cpu_model,
#         (
#             {
#                 "input_ids": features["input_ids"],
#                 "attention_mask": features["attention_mask"],
#             }
#         ),
#         strict=False,
#     )
#     torch.jit.save(compiled_model, model_path)
#     print("model file is saved to ", model_path)

#     # zip model file along with tokenizer.json as output
#     with ZipFile(str(zip_file_path), "w") as zipObj:
#         zipObj.write(
#             model_path,
#             arcname=str(model_name),
#         )
#         zipObj.write(
#             os.path.join(save_json_folder_path, "tokenizer.json"),
#             arcname="tokenizer.json",
#         )
#     print("zip file is saved to ", zip_file_path, "\n")
#     return zip_file_path

# save_as_pt(model=model, model_id=model_id, traced_folder=folder_path, sentences=["for example providing a small sentence", "we can add multiple sentences"])


# # 4. Run the function below so that you have model_config_json_file saved at sentence-transformer-torchscript/ml-commons_model_config.json
# def _get_model_description_from_readme_file(model_id, readme_file_path) -> str:
#     readme_data = MarkDownFile.read_file(readme_file_path)

#     # Find the description section
#     start_str = f"\n# {model_id}"
#     start = readme_data.find(start_str)
#     if start == -1:
#         model_name = model_id.split("/")[1]
#         start_str = f"\n# {model_name}"
#         start = readme_data.find(start_str)
#     end = readme_data.find("\n#", start + len(start_str))

#     # If we cannot find the scope of description section, raise error.
#     if start == -1 or end == -1:
#         assert False, "Cannot find description in README.md file"

#     # Parse out the description section
#     description = readme_data[start + len(start_str) + 1 : end].strip()
#     description = description.split("\n")[0]

#     # Remove hyperlink and reformat text
#     description = re.sub(r"\(.*?\)", "", description)
#     description = re.sub(r"[\[\]]", "", description)
#     description = re.sub(r"\*", "", description)

#     # Remove unnecessary part if exists (i.e. " For an introduction to ...")
#     # (Found in https://huggingface.co/sentence-transformers/multi-qa-mpnet-base-dot-v1/blob/main/README.md)
#     unnecessary_part = description.find(" For an introduction to")
#     if unnecessary_part != -1:
#         description = description[:unnecessary_part]

#     return description

# def _generate_default_model_description(embedding_dimension) -> str:
#     """
#     Generate default model description of the model based on embedding_dimension

#     ::param embedding_dimension: Embedding dimension of the model.
#     :type embedding_dimension: int
#     :return: Description of the model
#     :rtype: string
#     """
#     print(
#             "Using default description from embedding_dimension instead (You can overwrite this by specifying description parameter in make_model_config_json function"
#     )
#     description = f"This is a sentence-transformers model: It maps sentences & paragraphs to a {embedding_dimension} dimensional dense vector space."
#     return description
    
# def make_model_config_json(
#     model,
#     model_id,
#     folder_path,
#     version_number: str = 1,
#     model_format: str = "TORCH_SCRIPT",
#     embedding_dimension: int = None,
#     pooling_mode: str = None,
#     normalize_result: bool = None,
#     description: str = None,
#     all_config: str = None,
#     model_type: str = None,
# ) -> str:
#     config_json_file_path = os.path.join(folder_path, "config.json")
#     model_name = model_id

#     if (
#         model_type is None
#         or embedding_dimension is None
#         or pooling_mode is None
#         or normalize_result is None
#     ):
#         try:
#             if embedding_dimension is None:
#                 embedding_dimension = model.get_sentence_embedding_dimension()

#             for str_idx, module in model._modules.items():
#                 if model_type is None and isinstance(module, Transformer):
#                     model_type = module.auto_model.__class__.__name__
#                     model_type = model_type.lower().rstrip("model")
#                 elif pooling_mode is None and isinstance(module, Pooling):
#                     pooling_mode = module.get_pooling_mode_str().upper()
#                 elif normalize_result is None and isinstance(module, Normalize):
#                         normalize_result = True
#             if normalize_result is None:
#                 normalize_result = False
#         except Exception as e:
#             raise Exception(
#                 f"Raised exception while getting model data from pre-trained hugging-face model object: {e}"
#             )

#     if description is None:
#         readme_file_path = os.path.join(folder_path, "README.md")
#         if os.path.exists(readme_file_path):
#             try:
#                 description = _get_model_description_from_readme_file(
#                     model_id,
#                     readme_file_path
#                 )
#             except Exception as e:
#                 print(f"Cannot scrape model description from README.md file: {e}")
#                 description = _generate_default_model_description(
#                     embedding_dimension
#                 )
#         else:
#             print("Cannot find README.md file to scrape model description")
#             description = _generate_default_model_description(
#                 embedding_dimension
#             )

#     if all_config is None:
#         if not os.path.exists(config_json_file_path):
#             raise Exception(
#                 str(
#                     "Cannot find config.json in"
#                     + config_json_file_path
#                     + ". Please check the config.son file in the path."
#                 )
#             )
#         try:
#             with open(config_json_file_path) as f:
#                 config_content = json.load(f)
#                 if all_config is None:
#                     all_config = config_content
#         except IOError:
#             print(
#                 "Cannot open in config.json file at ",
#                 config_json_file_path,
#                 ". Please check the config.json ",
#                 "file in the path.",
#             )

#     model_config_content = {
#         "name": model_name,
#         "version": version_number,
#         "description": description,
#         "model_format": model_format,
#         "model_task_type": "TEXT_EMBEDDING",
#         "model_config": {
#             "model_type": model_type,
#             "embedding_dimension": embedding_dimension,
#             "framework_type": "sentence_transformers",
#             "pooling_mode": pooling_mode,
#             "normalize_result": normalize_result,
#             "all_config": json.dumps(all_config),
#         },
#     }


#     model_config_file_path = os.path.join(
#         folder_path, "ml-commons_model_config.json"
#     )
#     os.makedirs(os.path.dirname(model_config_file_path), exist_ok=True)
#     with open(model_config_file_path, "w") as file:
#         json.dump(model_config_content, file)
#     print(
#         "ml-commons_model_config.json file is saved at : ", model_config_file_path
#     )

#     return model_config_file_path

# make_model_config_json(model=model, model_id=model_id, folder_path=folder_path, model_format='TORCH_SCRIPT')


# # 5. Use 'sentence-transformer-torchscript/instructor-large.zip' and 'sentence-transformer-torchscript/ml-commons_model_config.json' to register the model to OpenSearch cluster

model file is saved to  sentence-transformer-torchscript/instructor-large.pt
zip file is saved to  sentence-transformer-torchscript/instructor-large.zip 

ml-commons_model_config.json file is saved at :  sentence-transformer-torchscript/ml-commons_model_config.json


'sentence-transformer-torchscript/ml-commons_model_config.json'

In [26]:
#!pip install opensearch-py opensearch-py-ml

import os
import sys
# Add the directory three levels above the working directory to the import path
# (presumably the repository root, so the local opensearch_py_ml can be imported — TODO confirm).
sys.path.append(os.path.abspath(os.path.join('../../..')))

import warnings
# Silence warnings that would otherwise clutter the cell outputs: two whole
# categories, plus individual warnings selected by their message text.
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings("ignore", message="Unverified HTTPS request")
warnings.filterwarnings("ignore", message="TracerWarning: torch.tensor")
warnings.filterwarnings("ignore", message="using SSL with verify_certs=False is insecure.")

import opensearch_py_ml as oml
from opensearchpy import OpenSearch
from opensearch_py_ml.ml_models import SentenceTransformerModel
# import mlcommon to later register the model to OpenSearch Cluster
from opensearch_py_ml.ml_commons import MLCommonClient

# Default cluster address; used as the default value of get_os_client's cluster_url.
CLUSTER_URL = 'https://localhost:9200'

def get_os_client(cluster_url = CLUSTER_URL,
                  username='admin',
                  password='admin'):
    '''
    Build an OpenSearch client for one cluster.

    :param cluster_url: cluster URL like https://ml-te-netwo-1s12ba42br23v-ff1736fa7db98ff2.elb.us-west-2.amazonaws.com:443
    :param username: user name, passed together with password as the http_auth pair
    :param password: password, passed together with username as the http_auth pair
    :return: OpenSearch client
    '''
    connection_options = {
        "hosts": [cluster_url],
        "http_auth": (username, password),
        # Certificate verification is switched off for this client.
        "verify_certs": False,
    }
    return OpenSearch(**connection_options)

# Create the client with the defaults: CLUSTER_URL and the default username/password.
client = get_os_client()

# Connect to ml_common client with OpenSearch client
ml_client = MLCommonClient(client)



In [29]:
# Register the zip archive and the config JSON produced in steps 3 and 4.
# The output recorded below lists the chunk uploads and ends with the model id,
# which the next cell passes to generate_embedding.
ml_client.register_model("sentence-transformer-torchscript/instructor-large.zip", "sentence-transformer-torchscript/ml-commons_model_config.json", isVerbose=True)

Total number of chunks 135
Sha1 value of the model file:  41f576f7235aebd1b9ead05366b3fcb4e6c642ef225c9fbcb1d465356a24c53b
Model meta data was created successfully. Model Id:  OJ-mbIoBx1PaKKd27EH_
uploading chunk 1 of 135
Model id: {'status': 'Uploaded'}
uploading chunk 2 of 135
Model id: {'status': 'Uploaded'}
uploading chunk 3 of 135
Model id: {'status': 'Uploaded'}
uploading chunk 4 of 135
Model id: {'status': 'Uploaded'}
uploading chunk 5 of 135
Model id: {'status': 'Uploaded'}
uploading chunk 6 of 135
Model id: {'status': 'Uploaded'}
uploading chunk 7 of 135
Model id: {'status': 'Uploaded'}
uploading chunk 8 of 135
Model id: {'status': 'Uploaded'}
uploading chunk 9 of 135
Model id: {'status': 'Uploaded'}
uploading chunk 10 of 135
Model id: {'status': 'Uploaded'}
uploading chunk 11 of 135
Model id: {'status': 'Uploaded'}
uploading chunk 12 of 135
Model id: {'status': 'Uploaded'}
uploading chunk 13 of 135
Model id: {'status': 'Uploaded'}
uploading chunk 14 of 135
Model id: {'status'

Task ID: OZ-nbIoBx1PaKKd2qkFw
Model deployed successfully


'OJ-mbIoBx1PaKKd27EH_'

In [30]:
import numpy as np

# Sentences embedded both by the cluster (here) and by the local model (next cell).
input_sentences = ["first sentence", "second sentence"]

# Generated embedding from torchScript
# The id below is the one shown in the register_model output above; it is
# specific to that run, so replace it with the id from your own registration.

embedding_output_torch = ml_client.generate_embedding("OJ-mbIoBx1PaKKd27EH_", input_sentences)

In [31]:
# Reference embeddings: encode the same sentences locally with the model loaded in step 1.
original_pre_trained_model = model
original_embedding_data = list(
    original_pre_trained_model.encode(input_sentences, convert_to_numpy=True)
)
        
# Compare each local embedding with the corresponding one returned by the cluster,
# within rtol=1e-03 / atol=1e-05. A mismatch raises; as the recorded output shows,
# a passing comparison prints "None" after the sentence index.
for i in range(len(input_sentences)):
    print(i)
    print(np.testing.assert_allclose(original_embedding_data[i], embedding_output_torch['inference_results'][i]['output'][0]['data'], rtol=1e-03, atol=1e-05))

0
None
1
None
