### Import necessary library

In [1]:
import time
import json
import base64
import requests

import boto3
import sagemaker



sagemaker.config INFO - Not applying SDK defaults from location: C:\ProgramData\sagemaker\sagemaker\config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: C:\Users\tochi\AppData\Local\sagemaker\sagemaker\config.yaml


### Sagemaker setup

In [15]:
sm_client = boto3.client(service_name="sagemaker")
runtime = boto3.client(service_name="sagemaker-runtime")
account = boto3.client("sts").get_caller_identity().get("Account")

sess = sagemaker.Session()
boto_session = boto3.session.Session()
region = boto_session.region_name

role = f"arn:aws:iam::{account}:role/service-role/AmazonSageMakerServiceCatalogProductsUseRole"

### Download CLIP-ViT-L-14 model

In [3]:
from huggingface_hub import snapshot_download

snapshot_download(repo_id="sentence-transformers/clip-ViT-L-14", local_dir="./CLIP-ViT-L-14")

  from .autonotebook import tqdm as notebook_tqdm
Fetching 12 files: 100%|████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 305.20it/s]


'C:\\Users\\tochi\\OneDrive\\Documents\\Upwork_Job\\clip-embedding-api\\CLIP-ViT-L-14'

### Package and Upload CLIP-ViT-L-14 model to s3 bucket

In [14]:
%cd  CLIP-ViT-L-14
!tar zcvf model.tar.gz *

C:\Users\tochi\OneDrive\Documents\Upwork_Job\clip-embedding-api\CLIP-ViT-L-14


a .cache
a .gitattributes
a 0_CLIPModel
a code
a config_sentence_transformers.json
a model.tar.gz
a modules.json
a README.md
a code/.ipynb_checkpoints
a code/inference.py
a code/requirements.txt
a code/.ipynb_checkpoints/inference-checkpoint.py
a code/.ipynb_checkpoints/requirements-checkpoint.txt
a 0_CLIPModel/config.json
a 0_CLIPModel/merges.txt
a 0_CLIPModel/preprocessor_config.json
a 0_CLIPModel/pytorch_model.bin
a 0_CLIPModel/special_tokens_map.json
a 0_CLIPModel/tokenizer.json
a 0_CLIPModel/tokenizer_config.json
a 0_CLIPModel/vocab.json
a .cache/huggingface
a .cache/huggingface/.gitignore
a .cache/huggingface/download
a .cache/huggingface/download/.gitattributes.metadata
a .cache/huggingface/download/0_CLIPModel
a .cache/huggingface/download/config_sentence_transformers.json.metadata
a .cache/huggingface/download/modules.json.metadata
a .cache/huggingface/download/README.md.metadata
a .cache/huggingface/download/0_CLIPModel/config.json.metadata
a .cache/huggingface/download/0_CLI

In [34]:
bucket = "huggingface-clip-models"
model_artifact = sess.upload_data("model.tar.gz", bucket=bucket, key_prefix="CLIP-ViT-L-14")
print("S3 Model Path:", model_artifact)

S3 Model Path: s3://huggingface-clip-models/CLIP-ViT-L-14/model.tar.gz


### Create the Model 

In [24]:
# Model artifact in S3 
model_data = "s3://huggingface-clip-models/CLIP-ViT-L-14/model.tar.gz"

# Generate unique name for the model 
current_time = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
model_name = "normalized-l14-model-" + current_time

# Hugging Face DLC container URI (for HF + PyTorch)
container_image_uri = "763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-inference:1.13.1-transformers4.26.0-cpu-py39-ubuntu20.04"

# Create the model
create_model_response = sm_client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=role,
    PrimaryContainer={
        "Image": container_image_uri,
        "Mode": "SingleModel",
        "ModelDataUrl": model_data,
        "Environment": {
            "HF_TASK": "feature-extraction"
        }
    }
)

print("Created Model:", create_model_response["ModelArn"])


Created Model: arn:aws:sagemaker:us-east-1:289669704200:model/normalized-l14-model-2025-06-13-01-24-37


### Create an Endpoint Configuration with Serverless Inference

In [25]:
# Generate unique name for the endpoint config name
current_time = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
endpoint_config_name = "normalized-l14-model-endpoint-config-" + current_time

create_endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[
        {
            "VariantName": "AllTrafficVariant",
            "ModelName": model_name,
            "ServerlessConfig": {
                "MemorySizeInMB": 5120,        
                "MaxConcurrency": 5,
                "ProvisionedConcurrency": 1
            }
        }
    ]
)

print("Created Endpoint Config:", create_endpoint_config_response["EndpointConfigArn"])


Created Endpoint Config: arn:aws:sagemaker:us-east-1:289669704200:endpoint-config/normalized-l14-model-endpoint-config-2025-06-13-01-24-46


### Create or Update the Endpoint

In [26]:
endpoint_name = "normalized-l14-model"

# Check if endpoint exists
try:
    sm_client.describe_endpoint(EndpointName=endpoint_name)
    # If it exists, update
    response = sm_client.update_endpoint(
        EndpointName=endpoint_name,
        EndpointConfigName=endpoint_config_name
    )
    print("Updated Endpoint:", response["EndpointArn"])
    
except sm_client.exceptions.ClientError as e:
    if "Could not find endpoint" in str(e):
        # Create the endpoint
        response = sm_client.create_endpoint(
            EndpointName=endpoint_name,
            EndpointConfigName=endpoint_config_name
        )
        print("Created Endpoint:", response["EndpointArn"])
    else:
        raise e


Updated Endpoint: arn:aws:sagemaker:us-east-1:289669704200:endpoint/normalized-l14-model


### Wait for the endpoint to be in service

In [27]:
print("Waiting for endpoint to be in service...")

while True:
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
    status = resp["EndpointStatus"]
    print("Endpoint status:", status)
    if status == "InService":
        print("✅ Endpoint is ready!")
        break
    elif status == "Failed":
        raise Exception("Endpoint creation failed")
    time.sleep(10)

Waiting for endpoint to be in service...
Endpoint status: Updating
Endpoint status: Updating
Endpoint status: Updating
Endpoint status: Updating
Endpoint status: Updating
Endpoint status: Updating
Endpoint status: Updating
Endpoint status: Updating
Endpoint status: Updating
Endpoint status: Updating
Endpoint status: Updating
Endpoint status: Updating
Endpoint status: Updating
Endpoint status: Updating
Endpoint status: Updating
Endpoint status: Updating
Endpoint status: Updating
Endpoint status: Updating
Endpoint status: Updating
Endpoint status: Updating
Endpoint status: Updating
Endpoint status: Updating
Endpoint status: Updating
Endpoint status: InService
✅ Endpoint is ready!


### Test deployed model

In [20]:
%cd  ..

C:\Users\tochi\OneDrive\Documents\Upwork_Job\clip-embedding-api


In [28]:
with open("aws_login.png", "rb") as image_file:
    encoded_image = base64.b64encode(image_file.read()).decode('utf-8')


input_data = {
    "inputs": {
        "image": encoded_image 
    }
}

# Convert to JSON string
payload = json.dumps(input_data)

# Invoke the endpoint
response = runtime.invoke_endpoint(
    EndpointName="normalized-l14-model",
    ContentType="application/json",
    Body=payload
)

# Read the raw response body (bytes -> string)
response = response["Body"].read().decode()

# Print response
print("Raw response from SageMaker endpoint:")
print(response)

Raw response from SageMaker endpoint:
[0.37861865758895874, -0.2048639953136444, 0.22263170778751373, -0.6245788931846619, -0.055607572197914124, -0.06229861080646515, -0.5927261710166931, 0.009563378989696503, -0.2391338348388672, -0.10730738937854767, 0.057211458683013916, 0.11848915368318558, -0.09497398883104324, 0.2676980495452881, 0.19893492758274078, 0.32281601428985596, -0.2885391414165497, -0.35819971561431885, -0.4916241765022278, 0.026667796075344086, 0.015242338180541992, 0.2755463421344757, 0.4547341763973236, 0.16531586647033691, -0.778900146484375, 0.08384381979703903, -0.4893878698348999, 0.1691056489944458, -0.253076434135437, 0.17060652375221252, -0.6237554550170898, 0.029045745730400085, 0.009264398366212845, -0.3716703951358795, 0.4490283131599426, -0.567813515663147, -0.19414737820625305, 0.3332390785217285, 0.639544665813446, 0.4740416407585144, 0.3455004096031189, -0.005832865834236145, -0.14018262922763824, 0.6296952962875366, -0.31594616174697876, 0.50306242704

### Test APIgateway Model Endpoint

In [38]:
# Track execution time
start_time = time.time()

with open("aws_login.png", "rb") as image_file:
    encoded_image = base64.b64encode(image_file.read()).decode('utf-8')
    
endpoint_url = "https://bwcckx57bf.execute-api.us-east-1.amazonaws.com/prod/l14model"

payload = {
    "image": encoded_image,
    "model": "L-14 model"
}

headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {bearer_token}"
}

response = requests.post(endpoint_url, headers=headers, data=json.dumps(payload))

end_time = time.time()
duration = end_time - start_time
print(f"Execution time: {duration:.2f} seconds")

# Print response
print("Status Code:", response.status_code)
print(json.dumps(response.json(), indent=2))     

Execution time: 11.82 seconds
Status Code: 200
{
  "object": "list",
  "data": [
    {
      "object": "embedding",
      "index": 0,
      "embedding": [
        0.022251607663153736,
        -0.01203995935396903,
        0.013084176692749729,
        -0.03670681359004742,
        -0.0032680847994337307,
        -0.003661320481279414,
        -0.03483481319471944,
        0.0005620445610575209,
        -0.014054014943966806,
        -0.006306508884200177,
        0.00336234600945952,
        0.006963666758756848,
        -0.005581668772296518,
        0.015732748112851982,
        0.011691505081280858,
        0.01897205843764626,
        -0.01695758949426778,
        -0.021051576242067112,
        -0.02889299846687648,
        0.0015672796984922074,
        0.0008957998298930627,
        0.016193996189300005,
        0.026724954730587124,
        0.00971569606373123,
        -0.045776350744868084,
        0.0049275431775718205,
        -0.02876156960678608,
        0.0099384234760361

### Add Autoscale Action for weekdays and weekends

In [39]:
### Register scalable target
endpoint_name = "normalized-l14-model"
variant_name = "AllTrafficVariant" 
min_capacity = 1
max_capacity = 5

client = boto3.client("application-autoscaling") # Initialize the client

response = client.register_scalable_target(
    ServiceNamespace="sagemaker",
    ResourceId=f"endpoint/{endpoint_name}/variant/{variant_name}",
    ScalableDimension="sagemaker:variant:DesiredProvisionedConcurrency",
    MinCapacity=min_capacity,
    MaxCapacity=max_capacity
)

print("Scalable target registered:")
print(response)

Scalable target registered:
{'ScalableTargetARN': 'arn:aws:application-autoscaling:us-east-1:289669704200:scalable-target/056m6d9664704da14dc4b02f32982239ea1d', 'ResponseMetadata': {'RequestId': '7a2b691e-5e93-4040-a9e7-56ac1903ba41', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '7a2b691e-5e93-4040-a9e7-56ac1903ba41', 'content-type': 'application/x-amz-json-1.1', 'content-length': '131', 'date': 'Fri, 13 Jun 2025 16:50:07 GMT'}, 'RetryAttempts': 0}}


In [41]:
### setup the scheduled action using cron

endpoint_name = "normalized-l14-model"
variant_name = "AllTrafficVariant"
resource_id = f"endpoint/{endpoint_name}/variant/{variant_name}"
client = boto3.client('application-autoscaling')

# Desired min and max concurrency during "up" times (change as needed)
desired_min = 1
desired_max = 5

# Minimal concurrency during "down" times (change as needed, must be > 0)
minimal_concurrency = 1

# 1. Weekday evening scale down at 6 PM (Mon-Fri)
client.put_scheduled_action(
    ServiceNamespace='sagemaker',
    ResourceId=resource_id,
    ScalableDimension='sagemaker:variant:DesiredProvisionedConcurrency',
    ScheduledActionName='WeekdayEveningScaleDown',
    Schedule='cron(30 23 ? * MON-FRI *)',  # 6 PM UTC Mon-Fri
    ScalableTargetAction={
        'MinCapacity': minimal_concurrency,  # Set a minimal non-zero value
        'MaxCapacity': minimal_concurrency   # Set a minimal non-zero value
    }
)

# 2. Weekday morning scale up at 6 AM (Mon-Fri)
client.put_scheduled_action(
    ServiceNamespace='sagemaker',
    ResourceId=resource_id,
    ScalableDimension='sagemaker:variant:DesiredProvisionedConcurrency',
    ScheduledActionName='WeekdayMorningScaleUp',
    Schedule='cron(0 11 ? * MON-FRI *)', 
    ScalableTargetAction={
        'MinCapacity': desired_min,
        'MaxCapacity': desired_max
    }
)

# 3. Weekend scale down (start Sat 00:00 UTC)
client.put_scheduled_action(
    ServiceNamespace='sagemaker',
    ResourceId=resource_id,
    ScalableDimension='sagemaker:variant:DesiredProvisionedConcurrency',
    ScheduledActionName='WeekendScaleDown',
    Schedule='cron(0 11 ? * SAT *)', 
    ScalableTargetAction={
        'MinCapacity': minimal_concurrency,  # Set a minimal non-zero value
        'MaxCapacity': minimal_concurrency   # Set a minimal non-zero value
    }
)
print("Scheduled scaling actions created successfully.")

Scheduled scaling actions created successfully.


In [42]:
#verify it was added successfully
response = client.describe_scheduled_actions(
    ServiceNamespace='sagemaker',
    ResourceId=resource_id,
    ScalableDimension='sagemaker:variant:DesiredProvisionedConcurrency'
)

for action in response['ScheduledActions']:
    print(f"{action['ScheduledActionName']}: {action['Schedule']}")

WeekendScaleDown: cron(0 11 ? * SAT *)
WeekdayEveningScaleDown: cron(30 23 ? * MON-FRI *)
WeekdayMorningScaleUp: cron(0 11 ? * MON-FRI *)


In [44]:
### confirm that the triggering was successful
response = client.describe_scaling_activities(
    ServiceNamespace='sagemaker',
    ResourceId='endpoint/normalized-l14-model/variant/AllTrafficVariant',
    ScalableDimension='sagemaker:variant:DesiredProvisionedConcurrency'
)
response

{'ScalingActivities': [{'ActivityId': 'ed980672-0e86-42bb-b1fd-9f44affbc8e6',
   'ServiceNamespace': 'sagemaker',
   'ResourceId': 'endpoint/normalized-l14-model/variant/AllTrafficVariant',
   'ScalableDimension': 'sagemaker:variant:DesiredProvisionedConcurrency',
   'Description': 'Setting desired provisioned concurrency to 2.',
   'Cause': 'monitor alarm TargetTracking-endpoint/normalized-l14-model/variant/AllTrafficVariant-AlarmLow-8822e036-1bbb-4124-a104-3d7ff90592a3 in state ALARM triggered policy SageMakerServerlessEndpointProvisionedConcurrencyScalingPolicy',
   'StartTime': datetime.datetime(2025, 6, 13, 2, 7, 38, 138000, tzinfo=tzlocal()),
   'EndTime': datetime.datetime(2025, 6, 13, 2, 8, 14, 216000, tzinfo=tzlocal()),
   'StatusCode': 'Successful',
   'StatusMessage': 'Successfully set desired provisioned concurrency to 2. Change successfully fulfilled by sagemaker.'},
  {'ActivityId': '516005a8-2627-40b9-abfd-3c26b4e13f85',
   'ServiceNamespace': 'sagemaker',
   'ResourceId

In [50]:
#verify if you are having the expected provisioned concurrency.
sm_client.describe_endpoint(EndpointName="normalized-l14-model")

{'EndpointName': 'normalized-l14-model',
 'EndpointArn': 'arn:aws:sagemaker:us-east-1:289669704200:endpoint/normalized-l14-model',
 'EndpointConfigName': 'normalized-l14-model-endpoint-config-2025-06-13-01-24-46',
 'ProductionVariants': [{'VariantName': 'AllTrafficVariant',
   'DeployedImages': [{'SpecifiedImage': '763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-inference:1.13.1-transformers4.26.0-cpu-py39-ubuntu20.04',
     'ResolvedImage': '763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-inference@sha256:a2e83f13bbb51b944215023d8109074c6bcc1832b87af53f20157872ce956941',
     'ResolutionTime': datetime.datetime(2025, 6, 13, 2, 7, 39, 712000, tzinfo=tzlocal())}],
   'CurrentWeight': 1.0,
   'DesiredWeight': 1.0,
   'CurrentInstanceCount': 0,
   'CurrentServerlessConfig': {'MemorySizeInMB': 5120,
    'MaxConcurrency': 5,
    'ProvisionedConcurrency': 2}}],
 'EndpointStatus': 'InService',
 'CreationTime': datetime.datetime(2025, 6, 5, 10, 20, 54, 149000,

In [47]:
# #List of scheduled action names to delete
# scheduled_actions = [
#     'WeekendScaleDown',
#     'WeekdayMorningScaleUp',
#     'WeekdayEveningScaleDown'
# ]

# for action_name in scheduled_actions:
#     response = client.delete_scheduled_action(
#         ServiceNamespace='sagemaker',
#         ScheduledActionName=action_name,
#         ResourceId=resource_id,
#         ScalableDimension='sagemaker:variant:DesiredProvisionedConcurrency'
#     )
#     print(f"Deleted scheduled action: {action_name}")

### Perform Load testing on the Endpoint

In [49]:
# Simulate load
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

with open("aws_login.png", "rb") as image_file:
    encoded_image = base64.b64encode(image_file.read()).decode('utf-8')

input_data = {
    "inputs": {
        "image": encoded_image 
    }
}

# Convert to JSON string
payload = json.dumps(input_data)

# Invoke the endpoint
def invoke_endpoint():
    response = runtime.invoke_endpoint(
        EndpointName="normalized-b32-model",
        ContentType="application/json",
        Body=payload
    )
    print("Raw response:", response)  
    return response.get('ResponseMetadata', {}).get('HTTPStatusCode', 'NoStatusCode')
    
def simulate_load(concurrent_requests=5, duration_sec=60):
    print(f"Starting load test: {concurrent_requests} requests every second for {duration_sec} seconds")
    start_time = time.time()
    while time.time() - start_time < duration_sec:
        with ThreadPoolExecutor(max_workers=concurrent_requests) as executor:
            futures = [executor.submit(invoke_endpoint) for _ in range(concurrent_requests)]
            for future in as_completed(futures):
                try:
                    status = future.result()
                    print(f"Response status: {status}")
                except Exception as e:
                    print(f"Request failed: {e}")
        time.sleep(1)

simulate_load(concurrent_requests=5, duration_sec=15)

Starting load test: 5 requests every second for 15 seconds
Raw response: {'ResponseMetadata': {'RequestId': '343ee26b-6b7a-48ef-8bda-079d7a49c0d0', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '343ee26b-6b7a-48ef-8bda-079d7a49c0d0', 'x-amzn-invoked-production-variant': 'AllTrafficVariant', 'date': 'Fri, 13 Jun 2025 17:20:48 GMT', 'content-type': 'application/json', 'content-length': '11531', 'connection': 'keep-alive'}, 'RetryAttempts': 0}, 'ContentType': 'application/json', 'InvokedProductionVariant': 'AllTrafficVariant', 'Body': <botocore.response.StreamingBody object at 0x0000020D8D5944C0>}
Response status: 200
Raw response: {'ResponseMetadata': {'RequestId': '7bee5cd1-c77f-415a-9bc8-4b1792fb2daa', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '7bee5cd1-c77f-415a-9bc8-4b1792fb2daa', 'x-amzn-invoked-production-variant': 'AllTrafficVariant', 'date': 'Fri, 13 Jun 2025 17:20:48 GMT', 'content-type': 'application/json', 'content-length': '11531', 'connection': 