In [None]:
import sagemaker
from sagemaker import image_uris
from sagemaker.model import Model
from datetime import datetime
import boto3
import os

In [None]:
# --- 1. CONFIGURATION (Replace these placeholders) ---
# Every "INSERT ..." string in this cell is a placeholder: swap each one for a
# real value before running the cells that follow.

# Region the container image is looked up in.
AWS_REGION = "INSERT AWS REGION"

# ARN of the SageMaker execution role the model is created with.
IAM_ROLE_ARN = "INSERT IAM ID FOR SAGEMAKER EXECUTION ROLE"

# S3 location of the model folder (a clone of the Hugging Face repository).
S3_MODEL_PATH = "INSERT PATH TO MODEL IN S3"

# Private-VPC networking handed to the model: subnets and security groups.
VPC_CONFIG = dict(
    Subnets=[
        'INSERT APPROPRIATE SUBNET ID',
        'ADDITIONAL SUBNET IDS IF NEEDED',
    ],
    SecurityGroupIds=['INSERT SECUTIY GROUP ID'],
)

# Container release and the GPU instance type that hosts it.
LMI_VERSION = "0.30.0"
INSTANCE_TYPE = "ml.g6.4xlarge"

# The name carries a minute-resolution local timestamp so repeated deployments
# get distinct names; the same value is reused as the SageMaker model name.
ENDPOINT_NAME = "gemma2-9b-vllm-lmcache-prod-" + datetime.now().strftime("%Y%m%d%H%M")

In [None]:
# --- 2. RETRIEVE THE LMI CONTAINER IMAGE URI ---
# Resolve the image URI of the "djl-lmi" framework (the LMI container, which
# bundles vLLM) for the configured region and the pinned LMI release.
lmi_image_uri = image_uris.retrieve(version=LMI_VERSION, region=AWS_REGION, framework="djl-lmi")


In [None]:
# --- 3. DEFINE VLLM AND LMCACHE SETTINGS (The "env" Dictionary) ---
# Environment variables passed to the container via Model(env=...); they are
# the container's startup configuration. Every value is a string.
# NOTE(review): despite the heading, no LMCache-specific option is set here;
# the only caching feature enabled is vLLM's prefix caching. Confirm intent.

environment_vars = {
    # 1. MODEL LOADING AND OPTIMIZATION
    "HF_MODEL_ID": S3_MODEL_PATH,         # Model location (the S3 folder configured above)
    "OPTION_ROLLING_BATCH": "vllm",       # CRITICAL: activates the vLLM scheduler (continuous batching)
    "OPTION_QUANTIZE": "awq_marlin",      # 4-bit AWQ kernels; assumes the checkpoint is AWQ-quantized
    "OPTION_DTYPE": "fp16",               # Data type for model processing
    # Max context length served per request. This is a deployment choice, not
    # the model's limit. NOTE(review): Gemma 2 advertises a longer context;
    # confirm 4096 is the intended cap.
    "OPTION_MAX_MODEL_LEN": "4096",
    "OPTION_TENSOR_PARALLEL_DEGREE": "1",  # One GPU on ml.g6.4xlarge; change to match the instance's GPU count

    # 2. ENGINE
    "OPTION_ENGINE": "Python",

    # 3. PREFIX CACHING (requires the vLLM backend)
    "OPTION_ENABLE_PREFIX_CACHING": "true",
    "OPTION_BLOCK_SIZE": "32",            # Block size for KV cache
    "OPTION_MAX_NUM_SEQS": "256",         # Max concurrent sequences

    # 4. MEMORY MANAGEMENT
    "OPTION_GPU_MEMORY_UTILIZATION": "0.9",  # Use 90% of GPU memory
    "OPTION_ENFORCE_EAGER": "false",         # Allow CUDA graphs

    # 5. PERFORMANCE TUNING
    "OPTION_USE_V2_BLOCK_MANAGER": "true",   # Use newer block manager
    # CPU swap space in GiB, taken from host RAM for each GPU. vLLM aborts
    # start-up ("Too large swap space") when this exceeds 70% of host memory,
    # so a value like 48 cannot work on the 64 GiB ml.g6.4xlarge. Keep it well
    # below that limit and re-size it if INSTANCE_TYPE changes.
    "OPTION_SWAP_SPACE": "16",
}

In [None]:
# --- 4. CREATE AND DEPLOY THE SAGEMAKER ENDPOINT ---

print(f"LMI Container URI: {lmi_image_uri}")
print(f"Deploying model to Endpoint: {ENDPOINT_NAME}")

# Create the Model Blueprint. Nothing is provisioned on AWS by this call; it
# only bundles the container image, role, env settings and networking.
lmi_model = Model(
    image_uri=lmi_image_uri,
    role=IAM_ROLE_ARN,
    env=environment_vars,
    name=ENDPOINT_NAME,                     # The model reuses the endpoint name
    # Without a predictor class the generic Model.deploy() returns None, which
    # would leave the notebook with no handle to invoke the endpoint.
    predictor_cls=sagemaker.Predictor,
    vpc_config=VPC_CONFIG,                  # Attaches the endpoint to your private network
    # Isolation stays off so the container keeps outbound network access
    # (DNS; presumably also needed to fetch the model from S3 - confirm).
    enable_network_isolation=False,
)

In [None]:
# --- DEPLOY THE MODEL ---
# This step actually creates the endpoint on AWS and blocks until it is up
# (expect several minutes; the original estimate is ~5-10).
# NOTE: deploy() hands back a predictor object only when the Model was built
# with a `predictor_cls`; otherwise `predictor` is None.
predictor = lmi_model.deploy(
    initial_instance_count=1,
    instance_type=INSTANCE_TYPE,
    endpoint_name=ENDPOINT_NAME,
    wait=True,  # IMPORTANT: Script must pause here until deployment finishes
    container_startup_health_check_timeout=900,  # Seconds allowed for the container to become healthy
)

print("-" * 50)
# \u2705 is the check-mark emoji; the escape keeps the message intact even if
# the file is re-saved with a different encoding (it was previously mangled).
print(f"\u2705 Deployment successful! Endpoint Name: {ENDPOINT_NAME}")