### Import necessary library

In [12]:
import time
import json
import base64
import requests

import boto3
import sagemaker

### Sagemaker setup

In [13]:
sm_client = boto3.client(service_name="sagemaker")
runtime = boto3.client(service_name="sagemaker-runtime")
account = boto3.client("sts").get_caller_identity().get("Account")

sess = sagemaker.Session()
boto_session = boto3.session.Session()
region = boto_session.region_name

role = f"arn:aws:iam::{account}:role/service-role/AmazonSageMakerServiceCatalogProductsUseRole"

### Download CLIP-ViT-B-32 model

In [6]:
from huggingface_hub import snapshot_download

snapshot_download(repo_id="sentence-transformers/clip-ViT-B-32", local_dir="./CLIP-ViT-B-32")

  from .autonotebook import tqdm as notebook_tqdm
Fetching 11 files: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 80.26it/s]


'C:\\Users\\tochi\\OneDrive\\Documents\\Upwork_Job\\clip-embedding-api\\CLIP-ViT-B-32'

### Package and Upload CLIP-ViT-B-32 model to s3 bucket

In [3]:
%cd  CLIP-ViT-B-32
!tar zcvf model.tar.gz *

C:\Users\tochi\OneDrive\Documents\Upwork_Job\clip-embedding-api\CLIP-ViT-B-32


a .cache
a .gitattributes
a 0_CLIPModel
a code
a config_sentence_transformers.json
a model.tar.gz
a modules.json
a README.md
a code/.ipynb_checkpoints
a code/inference.py
a code/requirements.txt
a code/.ipynb_checkpoints/inference-checkpoint.py
a code/.ipynb_checkpoints/requirements-checkpoint.txt
a 0_CLIPModel/config.json
a 0_CLIPModel/merges.txt
a 0_CLIPModel/preprocessor_config.json
a 0_CLIPModel/pytorch_model.bin
a 0_CLIPModel/special_tokens_map.json
a 0_CLIPModel/tokenizer_config.json
a 0_CLIPModel/vocab.json
a .cache/huggingface
a .cache/huggingface/.gitignore
a .cache/huggingface/download
a .cache/huggingface/download/.gitattributes.metadata
a .cache/huggingface/download/0_CLIPModel
a .cache/huggingface/download/config_sentence_transformers.json.metadata
a .cache/huggingface/download/modules.json.metadata
a .cache/huggingface/download/README.md.metadata
a .cache/huggingface/download/0_CLIPModel/config.json.metadata
a .cache/huggingface/download/0_CLIPModel/merges.txt.metadata
a 

In [4]:
bucket = "huggingface-clip-models"
model_artifact = sagemaker.Session().upload_data("model.tar.gz", bucket=bucket, key_prefix="CLIP-ViT-B-32")
print("S3 Model Path:", model_artifact)

S3 Model Path: s3://huggingface-clip-models/CLIP-ViT-B-32/model.tar.gz


### Create the Model 

In [35]:
# Model artifact in S3 
model_data = "s3://huggingface-clip-models/CLIP-ViT-B-32/model.tar.gz"

# Generate unique name for the model 
current_time = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
model_name = "normalized-b32-model-" + current_time

# Hugging Face DLC container URI (for HF + PyTorch)
container_image_uri = "763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-inference:1.13.1-transformers4.26.0-cpu-py39-ubuntu20.04"

# Create the model
create_model_response = sm_client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=role,
    PrimaryContainer={
        "Image": container_image_uri,
        "Mode": "SingleModel",
        "ModelDataUrl": model_data,
        "Environment": {
            "HF_TASK": "feature-extraction"
        }
    }
)

print("Created Model:", create_model_response["ModelArn"])


Created Model: arn:aws:sagemaker:us-east-1:289669704200:model/normalized-b32-model-2025-06-13-00-45-00


### Create an Endpoint Configuration with Serverless Inference

In [36]:
# Generate unique name for the endpoint config name
current_time = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
endpoint_config_name = "normalized-b32-model-endpoint-config-" + current_time

create_endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[
        {
            "VariantName": "AllTrafficVariant",
            "ModelName": model_name,
            "ServerlessConfig": {
                "MemorySizeInMB": 5120,        
                "MaxConcurrency": 5,
                "ProvisionedConcurrency": 1
            }
        }
    ]
)

print("Created Endpoint Config:", create_endpoint_config_response["EndpointConfigArn"])


Created Endpoint Config: arn:aws:sagemaker:us-east-1:289669704200:endpoint-config/normalized-b32-model-endpoint-config-2025-06-13-00-45-10


### Create or Update the Endpoint

In [37]:
endpoint_name = "normalized-b32-model"

# Check if endpoint exists
try:
    sm_client.describe_endpoint(EndpointName=endpoint_name)
    # If it exists, update
    response = sm_client.update_endpoint(
        EndpointName=endpoint_name,
        EndpointConfigName=endpoint_config_name
    )
    print("Updated Endpoint:", response["EndpointArn"])
    
except sm_client.exceptions.ClientError as e:
    if "Could not find endpoint" in str(e):
        # Create the endpoint
        response = sm_client.create_endpoint(
            EndpointName=endpoint_name,
            EndpointConfigName=endpoint_config_name
        )
        print("Created Endpoint:", response["EndpointArn"])
    else:
        raise e


Updated Endpoint: arn:aws:sagemaker:us-east-1:289669704200:endpoint/normalized-b32-model


### Wait for the endpoint to be in service

In [38]:
print("Waiting for endpoint to be in service...")

while True:
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
    status = resp["EndpointStatus"]
    print("Endpoint status:", status)
    if status == "InService":
        print("✅ Endpoint is ready!")
        break
    elif status == "Failed":
        raise Exception("Endpoint creation failed")
    time.sleep(10)

Waiting for endpoint to be in service...
Endpoint status: Updating
Endpoint status: Updating
Endpoint status: Updating
Endpoint status: Updating
Endpoint status: Updating
Endpoint status: Updating
Endpoint status: Updating
Endpoint status: Updating
Endpoint status: Updating
Endpoint status: Updating
Endpoint status: Updating
Endpoint status: Updating
Endpoint status: Updating
Endpoint status: Updating
Endpoint status: Updating
Endpoint status: Updating
Endpoint status: Updating
Endpoint status: Updating
Endpoint status: Updating
Endpoint status: Updating
Endpoint status: Updating
Endpoint status: Updating
Endpoint status: Updating
Endpoint status: Updating
Endpoint status: InService
✅ Endpoint is ready!


### Test deployed model

In [39]:
%cd  ..

C:\Users\tochi\OneDrive\Documents\Upwork_Job


In [40]:
%cd  clip-embedding-api

C:\Users\tochi\OneDrive\Documents\Upwork_Job\clip-embedding-api


In [41]:
with open("aws_login.png", "rb") as image_file:
    encoded_image = base64.b64encode(image_file.read()).decode('utf-8')


input_data = {
    "inputs": {
        "image": encoded_image 
    }
}

# Convert to JSON string
payload = json.dumps(input_data)

# Invoke the endpoint
response = runtime.invoke_endpoint(
    EndpointName="normalized-b32-model",
    ContentType="application/json",
    Body=payload
)

# Read the raw response body (bytes -> string)
response = response["Body"].read().decode()

# Print response
print("Raw response from SageMaker endpoint:")
print(response)

Raw response from SageMaker endpoint:
{"object": "list", "data": [{"object": "embedding", "index": 0, "embedding": [0.0066422351736321, 0.004895574351542614, -0.018706036851959033, 0.004662472853023341, 0.04510598618566244, -0.013993906502128825, -9.325842974767581e-05, -0.00012819223574102892, 0.009970531988663314, 0.011693910556755336, 0.022426834809983958, 0.012249128650215647, -0.035945628896813527, 0.00021407623818882568, -0.029337305160487985, 0.04781863083922653, 0.009619789789457416, 0.013744609590783474, 0.025819747195330277, 0.00017121583952888717, 0.0868025640456482, 0.03205825530144295, -0.002890173008613393, -0.009452101193794495, 0.0038173492489438424, 0.022394205287845154, -0.0223957611288003, 0.03160075753651388, -0.0007961750070793659, 0.023058468851576756, -0.003921940729050102, 0.045612086006306805, -0.02794219896841739, 0.03558561707772098, 0.10422789646745112, -0.02490249123850219, -0.009725057096965634, -0.005157083607834415, -0.00835640020113253, -0.2080916398755

### Test APIgateway Endpoint

In [76]:
# Track execution time
start_time = time.time()

with open("aws_login.png", "rb") as image_file:
    encoded_image = base64.b64encode(image_file.read()).decode('utf-8')
    
endpoint_url = "https://bzljzzqrv4.execute-api.us-east-1.amazonaws.com/prod/b32model"

payload = {
    "image": encoded_image,
    "model": "B-32 model"
}

headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {bearer_token}"
}

response = requests.post(endpoint_url, headers=headers, data=json.dumps(payload))

end_time = time.time()
duration = end_time - start_time
print(f"Execution time: {duration:.2f} seconds")

# Print response
print("Status Code:", response.status_code)
print(json.dumps(response.json(), indent=2)) 

Execution time: 7.57 seconds
Status Code: 200
{
  "object": "list",
  "data": [
    {
      "object": "embedding",
      "index": 0,
      "embedding": [
        0.0066422351736321,
        0.004895574351542614,
        -0.018706036851959033,
        0.004662472853023341,
        0.04510598618566244,
        -0.013993906502128825,
        -9.325842974767581e-05,
        -0.00012819223574102892,
        0.009970531988663314,
        0.011693910556755336,
        0.022426834809983958,
        0.012249128650215647,
        -0.035945628896813527,
        0.00021407623818882568,
        -0.029337305160487985,
        0.04781863083922653,
        0.009619789789457416,
        0.013744609590783474,
        0.025819747195330277,
        0.00017121583952888717,
        0.0868025640456482,
        0.03205825530144295,
        -0.002890173008613393,
        -0.009452101193794495,
        0.0038173492489438424,
        0.022394205287845154,
        -0.0223957611288003,
        0.03160075753651388,

### Add Autoscale Action for weekdays and weekends 

In [80]:
### Register scalable target

endpoint_name = "normalized-b32-model"
variant_name = "AllTrafficVariant" 
min_capacity = 1
max_capacity = 5

client = boto3.client("application-autoscaling") # Initialize the client

response = client.register_scalable_target(
    ServiceNamespace="sagemaker",
    ResourceId=f"endpoint/{endpoint_name}/variant/{variant_name}",
    ScalableDimension="sagemaker:variant:DesiredProvisionedConcurrency",
    MinCapacity=min_capacity,
    MaxCapacity=max_capacity
)

print("Scalable target registered:")
print(response)

Scalable target registered:
{'ScalableTargetARN': 'arn:aws:application-autoscaling:us-east-1:289669704200:scalable-target/056me5d0af673fa947ecba212074e5cd96ae', 'ResponseMetadata': {'RequestId': '5bdc342f-4edd-4c3c-96a8-4194b783a19d', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '5bdc342f-4edd-4c3c-96a8-4194b783a19d', 'content-type': 'application/x-amz-json-1.1', 'content-length': '131', 'date': 'Fri, 13 Jun 2025 16:18:32 GMT'}, 'RetryAttempts': 0}}


In [78]:
### setup the scheduled action using cron

endpoint_name = "normalized-b32-model"
variant_name = "AllTrafficVariant"
resource_id = f"endpoint/{endpoint_name}/variant/{variant_name}"
client = boto3.client('application-autoscaling')

# Desired min and max concurrency during "up" times (change as needed)
desired_min = 1
desired_max = 5

# Minimal concurrency during "down" times (change as needed, must be > 0)
minimal_concurrency = 1

# 1. Weekday evening scale down at 6 PM (Mon-Fri)
client.put_scheduled_action(
    ServiceNamespace='sagemaker',
    ResourceId=resource_id,
    ScalableDimension='sagemaker:variant:DesiredProvisionedConcurrency',
    ScheduledActionName='WeekdayEveningScaleDown',
    Schedule='cron(30 23 ? * MON-FRI *)',  # 6 PM UTC Mon-Fri
    ScalableTargetAction={
        'MinCapacity': minimal_concurrency,  # Set a minimal non-zero value
        'MaxCapacity': minimal_concurrency   # Set a minimal non-zero value
    }
)

# 2. Weekday morning scale up at 6 AM (Mon-Fri)
client.put_scheduled_action(
    ServiceNamespace='sagemaker',
    ResourceId=resource_id,
    ScalableDimension='sagemaker:variant:DesiredProvisionedConcurrency',
    ScheduledActionName='WeekdayMorningScaleUp',
    Schedule='cron(0 11 ? * MON-FRI *)', 
    ScalableTargetAction={
        'MinCapacity': desired_min,
        'MaxCapacity': desired_max
    }
)
print("Scheduled scaling actions created successfully.")

Scheduled scaling actions created successfully.


In [63]:
#verify it was added successfully
response = client.describe_scheduled_actions(
    ServiceNamespace='sagemaker',
    ResourceId=resource_id,
    ScalableDimension='sagemaker:variant:DesiredProvisionedConcurrency'
)

for action in response['ScheduledActions']:
    print(f"{action['ScheduledActionName']}: {action['Schedule']}")

WeekdayEveningScaleDown: cron(30 23 ? * MON-FRI *)
WeekendScaleDown: cron(0 11 ? * SAT *)
WeekdayMorningScaleUp: cron(0 11 ? * MON-FRI *)


In [90]:
### confirm that the triggering was successful
response = client.describe_scaling_activities(
    ServiceNamespace='sagemaker',
    ResourceId='endpoint/normalized-b32-model/variant/AllTrafficVariant',
    ScalableDimension='sagemaker:variant:DesiredProvisionedConcurrency'
)
response

{'ScalingActivities': [{'ActivityId': '85af6796-7bc3-432c-8232-42780f5b8bda',
   'ServiceNamespace': 'sagemaker',
   'ResourceId': 'endpoint/normalized-b32-model/variant/AllTrafficVariant',
   'ScalableDimension': 'sagemaker:variant:DesiredProvisionedConcurrency',
   'Description': 'Setting min capacity to 1 and max capacity to 5',
   'Cause': 'scheduled action name WeekdayMorningScaleUp was triggered',
   'StartTime': datetime.datetime(2025, 6, 16, 6, 0, 42, 699000, tzinfo=tzlocal()),
   'EndTime': datetime.datetime(2025, 6, 16, 6, 0, 42, 706000, tzinfo=tzlocal()),
   'StatusCode': 'Successful',
   'StatusMessage': 'Successfully set min capacity to 1 and max capacity to 5'},
  {'ActivityId': 'd049914a-c5a1-4aae-8743-40b8a2189e62',
   'ServiceNamespace': 'sagemaker',
   'ResourceId': 'endpoint/normalized-b32-model/variant/AllTrafficVariant',
   'ScalableDimension': 'sagemaker:variant:DesiredProvisionedConcurrency',
   'Description': 'Setting min capacity to 1 and max capacity to 1',
  

In [92]:
#verify if you are having the expected provisioned concurrency.
sm_client.describe_endpoint(EndpointName="normalized-b32-model")

{'EndpointName': 'normalized-b32-model',
 'EndpointArn': 'arn:aws:sagemaker:us-east-1:289669704200:endpoint/normalized-b32-model',
 'EndpointConfigName': 'normalized-b32-model-endpoint-config-2025-06-13-00-45-10',
 'ProductionVariants': [{'VariantName': 'AllTrafficVariant',
   'DeployedImages': [{'SpecifiedImage': '763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-inference:1.13.1-transformers4.26.0-cpu-py39-ubuntu20.04',
     'ResolvedImage': '763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-inference@sha256:a2e83f13bbb51b944215023d8109074c6bcc1832b87af53f20157872ce956941',
     'ResolutionTime': datetime.datetime(2025, 6, 17, 18, 30, 30, 285000, tzinfo=tzlocal())}],
   'CurrentWeight': 1.0,
   'DesiredWeight': 1.0,
   'CurrentInstanceCount': 0,
   'CurrentServerlessConfig': {'MemorySizeInMB': 5120,
    'MaxConcurrency': 5,
    'ProvisionedConcurrency': 1}}],
 'EndpointStatus': 'InService',
 'CreationTime': datetime.datetime(2025, 6, 4, 10, 47, 3, 625000

In [59]:
#List of scheduled action names to delete
# scheduled_actions = [
#     'WeekendScaleDown',
#     'WeekdayMorningScaleUp',
#     'WeekdayEveningScaleDown'
# ]

# for action_name in scheduled_actions:
#     response = client.delete_scheduled_action(
#         ServiceNamespace='sagemaker',
#         ScheduledActionName=action_name,
#         ResourceId=resource_id,
#         ScalableDimension='sagemaker:variant:DesiredProvisionedConcurrency'
#     )
#     print(f"Deleted scheduled action: {action_name}")

Deleted scheduled action: RunNow
Deleted scheduled action: WeekdayEveningScaleDown


###  Perform Load testing on the Endpoint

In [83]:
# Simulate load
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

with open("aws_login.png", "rb") as image_file:
    encoded_image = base64.b64encode(image_file.read()).decode('utf-8')

input_data = {
    "inputs": {
        "image": encoded_image 
    }
}

# Convert to JSON string
payload = json.dumps(input_data)

# Invoke the endpoint
def invoke_endpoint():
    response = runtime.invoke_endpoint(
        EndpointName="normalized-b32-model",
        ContentType="application/json",
        Body=payload
    )
    print("Raw response:", response)  
    return response.get('ResponseMetadata', {}).get('HTTPStatusCode', 'NoStatusCode')
    
def simulate_load(concurrent_requests=5, duration_sec=60):
    print(f"Starting load test: {concurrent_requests} requests every second for {duration_sec} seconds")
    start_time = time.time()
    while time.time() - start_time < duration_sec:
        with ThreadPoolExecutor(max_workers=concurrent_requests) as executor:
            futures = [executor.submit(invoke_endpoint) for _ in range(concurrent_requests)]
            for future in as_completed(futures):
                try:
                    status = future.result()
                    print(f"Response status: {status}")
                except Exception as e:
                    print(f"Request failed: {e}")
        time.sleep(1)

simulate_load(concurrent_requests=5, duration_sec=15)

Starting load test: 5 requests every second for 15 seconds
Raw response: {'ResponseMetadata': {'RequestId': '37d6eec9-4dee-4e5d-8e40-eb01597aaded', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '37d6eec9-4dee-4e5d-8e40-eb01597aaded', 'x-amzn-invoked-production-variant': 'AllTrafficVariant', 'date': 'Fri, 13 Jun 2025 17:20:39 GMT', 'content-type': 'application/json', 'content-length': '11531', 'connection': 'keep-alive'}, 'RetryAttempts': 0}, 'ContentType': 'application/json', 'InvokedProductionVariant': 'AllTrafficVariant', 'Body': <botocore.response.StreamingBody object at 0x0000023B8C039030>}
Response status: 200
Raw response: {'ResponseMetadata': {'RequestId': '0423f3b4-59ca-4f15-bbe4-b56ac5aa9d41', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '0423f3b4-59ca-4f15-bbe4-b56ac5aa9d41', 'x-amzn-invoked-production-variant': 'AllTrafficVariant', 'date': 'Fri, 13 Jun 2025 17:20:39 GMT', 'content-type': 'application/json', 'content-length': '11531', 'connection': 