## SBERT endpoint (This notebook is for a reference. We don't need to create it again.)
We are using the approach suggested in [Creating document embeddings with Hugging Face's Transformers & Amazon SageMaker](https://www.philschmid.de/custom-inference-huggingface-sagemaker).

In [1]:
%env TOKENIZERS_PARALLELISM=false

env: TOKENIZERS_PARALLELISM=false


In [None]:
!pip install transformers sentence_transformers

In [None]:
!pip install -U sagemaker

In [5]:
!git config --global user.email "ram.senth@berkeley.edu"
!git config --global user.name "Ram S"

In [46]:
# Choose transformer model for embeddings
import transformers
from transformers import AutoTokenizer, AutoModel
import os
import sagemaker
import time
from sentence_transformers import SentenceTransformer

print(f'transformers.__version__: {transformers.__version__}')

#Defining default bucket for SageMaker pretrained model hosting
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

# S3_ARTIFACT = 's3://sagemaker-us-west-2-571667364805/sentence_transformer/model.tar.gz'
S3_ARTIFACT = 's3://project-langbot-models/cc-model-inference.tar.gz'
MODEL_NAME = 'multi-qa-MiniLM-L6-cos-v1'
ENDPOINT_NAME = 'sm-cc-aws'

transformers.__version__: 4.35.2
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


## Download Model Files

In [None]:
# Install git-lfs. This is required to download model files from HF
!sudo yum update -y
!sudo yum install amazon-linux-extras
!sudo amazon-linux-extras install epel -y
!sudo yum update -y
!sudo yum install git-lfs -y

In [13]:
# Download model files
!git clone https://huggingface.co/sentence-transformers/{MODEL_NAME}
!cd {MODEL_NAME} && git lfs install && git lfs pull

Cloning into 'multi-qa-MiniLM-L6-cos-v1'...
remote: Enumerating objects: 46, done.[K
remote: Total 46 (delta 0), reused 0 (delta 0), pack-reused 46[K
Unpacking objects: 100% (46/46), 310.04 KiB | 3.30 MiB/s, done.
Filtering content: 100% (2/2), 173.47 MiB | 83.47 MiB/s, done.
Updated git hooks.
Git LFS initialized.


In [31]:
# Test the downloaded model
model = SentenceTransformer(f'{MODEL_NAME}/')
model.encode("Hola como estas")[0:5]

array([ 0.01638701,  0.04483588, -0.06652745,  0.00947834, -0.02653527],
      dtype=float32)

In [17]:
!ls {MODEL_NAME}/

1_Pooling			   sentence_bert_config.json
code				   special_tokens_map.json
config.json			   tf_model.h5
config_sentence_transformers.json  tokenizer_config.json
data_config.json		   tokenizer.json
modules.json			   train_script.py
pytorch_model.bin		   vocab.txt
README.md


## Create Custom Inference Script

In [16]:
# Create the folder for script
!mkdir {MODEL_NAME}/code

In [95]:
# %%writefile transformer/code/inference.py
# from sentence_transformers import SentenceTransformer
# import json
# import logging

# logger = logging.getLogger(__name__)
# logger.setLevel(logging.INFO)


# def model_fn(model_dir):
#     # implement custom code to load the model
#     logger.info(f"==== model_fn:: Loading model from model_dir={model_dir}")
#     model = SentenceTransformer(model_dir)
#     logger.info(f"==== model_fn:: Done loading model from model_dir={model_dir}")
#     return model

# def input_fn(input_data, content_type):
#     # decode the input data  (e.g. JSON string -> dict)
#     logger.info(f"==== input_fn:: encoding {input_data}")
#     input_data = json.loads(input_data)
#     return input_data['query']

# def predict_fn(data, model):
#     # call your custom model with the data
#     return model(data)



Overwriting transformer/code/inference.py


In [34]:
%%writefile multi-qa-MiniLM-L6-cos-v1/code/inference.py
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
import logging

#This version does not need SentenceTransformer.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Helper: Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

def model_fn(model_dir):
  # Load model from HuggingFace Hub
  tokenizer = AutoTokenizer.from_pretrained(model_dir)
  model = AutoModel.from_pretrained(model_dir)
  return model, tokenizer

def predict_fn(data, model_and_tokenizer):
    # destruct model and tokenizer
    model, tokenizer = model_and_tokenizer

    # Tokenize sentences
    sentences = data.pop("inputs", data)
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Perform pooling
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # Normalize embeddings
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

    # return list, which will be json serializable
    return sentence_embeddings[0].tolist()

Overwriting multi-qa-MiniLM-L6-cos-v1/code/inference.py


## Upload Model Artifact to S3

In [35]:
!cd {MODEL_NAME} && tar -czf ../model.tar.gz *
!aws s3 cp ./model.tar.gz {S3_ARTIFACT}

upload: ./model.tar.gz to s3://project-langbot-models/cc-model-inference.tar.gz


## Depoly Model

In [47]:
from sagemaker.huggingface.model import HuggingFaceModel

huggingface_model = HuggingFaceModel(
    # model_data=S3_ARTIFACT,  # path to your trained SageMaker model
    model_data=S3_ARTIFACT,
    role=role,                                            # IAM role with permissions to create an endpoint
    transformers_version="4.6.1",                           # Transformers version used
    pytorch_version="1.7.1",                                # PyTorch version used
    py_version='py36',                                    # Python version used
)

# deploy model to SageMaker Inference
predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.large",
    endpoint_name=ENDPOINT_NAME)


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
----!

## Test Endpoint

In [39]:
print(predictor.predict(data={
  "inputs": "Hola como estas",
})[0:5])

[0.01638699509203434, 0.0448358878493309, -0.06652743369340897, 0.009478328749537468, -0.026535259559750557]


In [40]:
import json
import boto3
session = boto3.Session()
region = 'us-west-2'
sm_runtime = session.client("sagemaker-runtime", region_name=region)
content_type = "application/json"

def test(text):
    # specify "Inputs"
    data = {
       "inputs": text #"Sí, tengo algo de tiempos hoy."
    }
    response = sm_runtime.invoke_endpoint(
        EndpointName = ENDPOINT_NAME,
        ContentType = content_type,
        Body=json.dumps(data)
    )
    print(response)
    print(response["Body"].read())
    

In [41]:
test("hola como estas")

{'ResponseMetadata': {'RequestId': 'f0540f22-9a5e-4b54-814d-715296674498', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'f0540f22-9a5e-4b54-814d-715296674498', 'x-amzn-invoked-production-variant': 'AllTraffic', 'date': 'Tue, 05 Dec 2023 02:05:46 GMT', 'content-type': 'application/json', 'content-length': '8052', 'connection': 'keep-alive'}, 'RetryAttempts': 0}, 'ContentType': 'application/json', 'InvokedProductionVariant': 'AllTraffic', 'Body': <botocore.response.StreamingBody object at 0x7ff16d532e90>}
b'[0.01638699509203434,0.0448358878493309,-0.06652743369340897,0.009478328749537468,-0.026535259559750557,-0.018551960587501526,0.07422944158315659,-0.10322673618793488,-0.017511803656816483,-0.07184222340583801,0.08230551332235336,0.0003614563902374357,-0.024639848619699478,0.03719811141490936,0.14361363649368286,-0.01519789919257164,0.08132601529359818,0.053825560957193375,0.03190746530890465,-0.020510395988821983,0.06528374552726746,-0.01873570866882801,-0.0430560857057

In [21]:
# delete endpoint
predictor.delete_endpoint()

NameError: name 'predictor' is not defined

In [42]:
def test_st(text):
    model = SentenceTransformer('transformer')
    return model.encode(text)

In [43]:
test_st('hola como estas')

array([ 1.63870137e-02,  4.48358841e-02, -6.65274486e-02,  9.47833713e-03,
       -2.65352689e-02, -1.85519978e-02,  7.42293969e-02, -1.03226796e-01,
       -1.75118279e-02, -7.18422532e-02,  8.23055282e-02,  3.61408864e-04,
       -2.46398598e-02,  3.71981226e-02,  1.43613651e-01, -1.51979141e-02,
        8.13260451e-02,  5.38255461e-02,  3.19074616e-02, -2.05103997e-02,
        6.52837604e-02, -1.87356919e-02, -4.30561006e-02,  2.82524377e-02,
       -1.66849475e-02,  4.81762625e-02,  5.72583489e-02, -1.16925035e-02,
       -5.54189831e-03, -5.30852228e-02, -2.94697490e-02, -7.28598936e-03,
        3.71430814e-03, -5.89620136e-02,  6.24467582e-02, -1.60234030e-02,
        5.22793923e-03, -9.16677266e-02, -2.59019490e-02, -1.09608191e-04,
       -8.86333957e-02, -3.30093093e-02, -4.05986188e-03, -2.77562323e-03,
        3.96670178e-02, -7.00709298e-02, -3.38905528e-02,  8.83476436e-03,
        8.27529281e-03, -7.00346455e-02, -6.59077838e-02, -2.64828186e-02,
        1.20611368e-02,  