# SageMaker Multi-Model Endpoint with SKLearn Example 

In this notebook we configure a SageMaker Multi-Model Endpoint (MME) to serve the set of different scikit-learn models that were trained and packaged in the preceding notebooks. The MME endpoint uses Deep Java Library (DJL) Serving as the backend on a CPU-based instance.

Read in essential static variables used across notebooks from the store. These values are set in notebook 00.

In [None]:
# Restore the variables persisted with %store by earlier notebooks.
# NOTE(review): later cells use names that are never assigned in this notebook
# (mme_artifacts, content_type, lr_model_reference_name, etc_model_reference_name,
# moc_model_reference_name) - presumably they come from this restore; confirm.
%store -r

### Imports

In [None]:
import sagemaker
import boto3
import json
import time
from time import gmtime, strftime

# IAM execution role that SageMaker assumes on behalf of the model/endpoint.
role = sagemaker.get_execution_role()

# SageMaker session used to look up region and account information.
sess = sagemaker.session.Session()

# Use the session's public region attribute rather than the private `_region_name`,
# which is an implementation detail of the SDK and may change between releases.
region = sess.boto_region_name
account_id = sess.account_id()

# Low-level boto3 clients: S3, the SageMaker control plane (models, endpoint
# configs, endpoints) and the SageMaker runtime (endpoint invocation).
s3_client = boto3.client("s3")
sm_client = boto3.client("sagemaker")
smr_client = boto3.client("sagemaker-runtime")

In [None]:
# DJL inference container (CPU build) in the ECR registry for the current region.
# Replace this with your own ECR image URI if your region needs a different one.
inference_image_uri = (
    "763104351884.dkr.ecr." + region + ".amazonaws.com/djl-inference:0.29.0-cpu-full"
)

In [None]:
# Suffix the model name with a UTC timestamp (second resolution).
creation_stamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
mme_model_name = f"sklearn-djl-mme-{creation_stamp}"
print(f"Model name: {mme_model_name}")

In [None]:
# Step 1: Register the model with SageMaker: give it a name and describe its primary container.
# Documentation: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker/client/create_model.html

# The container runs in "MultiModel" mode and takes its model data location from
# mme_artifacts (not assigned in this notebook - presumably restored via %store; verify).
primary_container = {
    "Image": inference_image_uri,
    "Mode": "MultiModel",
    "ModelDataUrl": mme_artifacts,
}

create_model_response = sm_client.create_model(
    ModelName=mme_model_name,
    ExecutionRoleArn=role,
    PrimaryContainer=primary_container,
)
model_arn = create_model_response["ModelArn"]

print("Created Model:", model_arn)

### MME Endpoint Config Creation

In [None]:
# Step 2: Create the endpoint configuration
# Documentation: https://boto3.amazonaws.com/v1/documentation/api/1.35.9/reference/services/sagemaker/client/create_endpoint_config.html

epc_stamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
mme_epc_name = f"sklearn-djl-mme-epc-{epc_stamp}"

# Single production variant backed by the multi-model created above.
sklearn_variant = {
    "VariantName": "sklearnvariant",
    "ModelName": mme_model_name,
    "InstanceType": "ml.m4.2xlarge",
    "InitialInstanceCount": 1,  # use 2 or more for Production
}

endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=mme_epc_name,
    ProductionVariants=[sklearn_variant],
)
print(f"Endpoint Configuration Arn: {endpoint_config_response['EndpointConfigArn']}")

### MME Endpoint Creation

In [None]:
# Step 3: Create the endpoint from the endpoint configuration
# Documentation: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker/client/create_endpoint.html

endpoint_stamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
mme_endpoint_name = f"sklearn-djl-ep-mme-{endpoint_stamp}"

create_endpoint_response = sm_client.create_endpoint(
    EndpointName=mme_endpoint_name,
    EndpointConfigName=mme_epc_name,
)
print(f"Endpoint Arn: {create_endpoint_response['EndpointArn']}")

In [None]:
# Optional Step 4: Monitor creation
#
# Poll the endpoint every 15 seconds while it reports "Creating", printing each
# observed status. The sleep happens *before* the next poll, so the loop exits
# as soon as a terminal status is seen instead of waiting one extra interval.

describe_endpoint_response = sm_client.describe_endpoint(EndpointName=mme_endpoint_name)
while describe_endpoint_response["EndpointStatus"] == "Creating":
    print(describe_endpoint_response["EndpointStatus"])
    time.sleep(15)
    describe_endpoint_response = sm_client.describe_endpoint(EndpointName=mme_endpoint_name)

endpoint_status = describe_endpoint_response["EndpointStatus"]
print(endpoint_status)
print(describe_endpoint_response)

# Fail loudly here if the endpoint did not come up; otherwise the inference
# cells below would fail later with a less obvious error.
if endpoint_status != "InService":
    failure_reason = describe_endpoint_response.get("FailureReason", "no failure reason reported")
    raise RuntimeError(
        f"Endpoint {mme_endpoint_name} ended in status {endpoint_status}: {failure_reason}"
    )

# Test model inference

### Example inference with model example #1 - Sklearn Linear Regression

In [None]:
# JSON payload holding one sample with a single feature value.
request_body = json.dumps([[0.5]])

In [None]:
# Send the request to the multi-model endpoint; TargetModel selects which
# hosted model (here the linear regression one) handles it.
response = smr_client.invoke_endpoint(
    EndpointName=mme_endpoint_name,
    ContentType=content_type,
    TargetModel=lr_model_reference_name,
    Body=request_body,
)

# Read the response body, decode it to text and parse it as JSON.
raw_body = response["Body"].read()
result = json.loads(raw_body.decode())
print(result)

### Example inference with model example #2 - Sklearn Extra Tree Classification

In [None]:
# Build a synthetic dataset
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# 1000 samples, 5 features (4 informative, 1 redundant), 4 classes, fixed seed.
X, y = make_classification(
    n_samples=1000,
    n_features=5,
    n_informative=4,
    n_redundant=1,
    n_classes=4,
    random_state=42,
)

# Train/test sets: hold out 30% of the samples, with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=20250108)

In [None]:
# Use the first two test rows as the inference payload, serialized as JSON.
test_data = X_test[:2]
print("test data shape: {}".format(test_data.shape))
request_body = json.dumps(test_data.tolist())
print("request_body: {}".format(request_body))

In [None]:
# Send the request to the multi-model endpoint; TargetModel selects which
# hosted model (here the extra tree classifier) handles it.
response = smr_client.invoke_endpoint(
    EndpointName=mme_endpoint_name,
    ContentType=content_type,
    TargetModel=etc_model_reference_name,
    Body=request_body,
)

# Read the response body, decode it to text and parse it as JSON.
raw_body = response["Body"].read()
result = json.loads(raw_body.decode())
print(result)

### Example inference with model example #3 - Sklearn Multi Output Classification

In [None]:
# Build a synthetic dataset
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import train_test_split

# 1000 samples, 8 features, 3 classes, n_labels=2, fixed seed.
X, y = make_multilabel_classification(
    n_samples=1000,
    n_features=8,
    n_classes=3,
    n_labels=2,
    random_state=42,
)

# Split the data into training and testing sets (20% held out for testing).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Use the first two test rows as the inference payload, serialized as JSON.
test_data = X_test[:2]
print("test data shape: {}".format(test_data.shape))
request_body = json.dumps(test_data.tolist())
print("request_body: {}".format(request_body))

In [None]:
# Send the request to the multi-model endpoint; TargetModel selects which
# hosted model (here the multi-output classifier) handles it.
response = smr_client.invoke_endpoint(
    EndpointName=mme_endpoint_name,
    ContentType=content_type,
    TargetModel=moc_model_reference_name,
    Body=request_body,
)

# Read the response body, decode it to text and parse it as JSON.
raw_body = response["Body"].read()
result = json.loads(raw_body.decode())
print(result)

Take a quick look at model/request latency by timing ten consecutive invocations of the same model.

In [None]:
%%time

for i in range(10):
    response = smr_client.invoke_endpoint(
    EndpointName=mme_endpoint_name,
    ContentType=content_type,
    TargetModel = moc_model_reference_name,
    Body=request_body)

### Cleanup

In [None]:
# Remove every resource this notebook created, not just the endpoint:
# deleting only the endpoint leaves its endpoint configuration and the
# model definition behind in the account.
sm_client.delete_endpoint(EndpointName=mme_endpoint_name)
sm_client.delete_endpoint_config(EndpointConfigName=mme_epc_name)
sm_client.delete_model(ModelName=mme_model_name)