# Model Deployment & Inference with AWS SageMaker

**Project:** AAI-540 Machine Learning Operations - Final Team Project  
**Context:** Continuation of notebook 05 - Hyperparameter Tuning  
**Objective:** Deploy the optimized Isolation Forest model to AWS SageMaker for production inference

---

## Table of Contents
1. [Setup & Configuration](#setup)
2. [Model Card Creation](#model-card)
3. [Model Packaging for SageMaker](#packaging)
4. [Model Registry & Versioning](#registry)
5. [Endpoint Deployment](#deployment)
6. [Inference Testing](#inference)
7. [Performance Monitoring](#monitoring)
8. [Summary & Cleanup](#summary)

---

## 1. Setup & Configuration

Load dependencies and configure AWS SageMaker.

In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
import pickle
import json
import joblib
import boto3
import sagemaker
import tarfile
import sys
sys.path.append('..')
from datetime import datetime
from pathlib import Path

from sagemaker import get_execution_role
from sagemaker.sklearn import SKLearnModel
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

from utils.visualizations import ModelVisualizer

# Plot styling shared by the figures in this notebook.
# The matplotlib style sheet is applied before the seaborn palette; keep this
# order (presumably so the style sheet does not override the palette — confirm).
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

# Project plotting helper; used later for the latency distribution figure.
model_viz = ModelVisualizer()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [2]:
# --- AWS SageMaker configuration ---
# Low-level boto3 clients used for direct S3 / SageMaker API calls.
s3_client = boto3.client('s3')
sagemaker_client = boto3.client('sagemaker')

# Session-derived settings: default bucket, region and execution role.
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
region = sagemaker_session.boto_region_name
role = get_execution_role()

# Key prefix under which this project's objects are stored in the bucket.
s3_prefix = 'cms-anomaly-detection'

print(f"Region: {region} | Bucket: {bucket}")

Region: us-east-1 | Bucket: sagemaker-us-east-1-864106638709


In [3]:
# Load stored variables from previous notebooks
%store -r optimized_model
%store -r optimal_params
%store -r scaler
%store -r df

if 'optimized_model' not in dir() or optimized_model is None:
    raise NameError("Missing required variable 'optimized_model'. Run notebook 05 first.")
    
print(f"Loaded: {type(optimized_model).__name__}, {type(scaler).__name__}, dataset {df.shape}")

Loaded: IsolationForest, RobustScaler, dataset (997362, 31)


## 2. Model Card Creation

Create a comprehensive model card documenting model details, performance, and intended use.

In [4]:
# Assemble the model card section by section, then persist it as JSON.
card_details = {
    "name": "CMS Open Payments Anomaly Detector",
    "version": "1.0",
    "type": "Isolation Forest",
    "framework": "scikit-learn",
    "created_date": datetime.now().strftime('%Y-%m-%d'),
    "created_by": "AAI-540 Team",
}

card_intended_use = {
    "primary_use": "Detect anomalous healthcare payment patterns",
    "out_of_scope": "Not intended for automated decision-making without human review",
}

# Hyperparameters come from the tuning results; each falls back to a fixed
# default when the key is absent from optimal_params.
card_parameters = {
    "contamination": optimal_params.get('contamination', 0.05),
    "n_estimators": optimal_params.get('n_estimators', 200),
    "max_samples": optimal_params.get('max_samples', 'auto'),
    "max_features": optimal_params.get('max_features', 1.0),
}

card_training_data = {
    "dataset": "CMS Open Payments 2024",
    "preprocessing": "RobustScaler, IQR outlier clipping, median imputation",
}

card_optimization = {
    "method": "Grid Search and Randomized Search with 3-fold CV",
    "metric": "Anomaly score separation",
}

model_card = {
    "model_details": card_details,
    "intended_use": card_intended_use,
    "model_parameters": card_parameters,
    "training_data": card_training_data,
    "optimization": card_optimization,
}

# Write the card next to the notebook; it is copied into the model artifacts later.
with open('model_card.json', 'w') as card_file:
    json.dump(model_card, card_file, indent=2)

print("Model card created")

Model card created


## 3. Model Packaging for SageMaker

Package the model and dependencies for SageMaker deployment.

In [5]:
# Source of the SageMaker entry-point module. It defines the four serving
# hooks (model_fn, input_fn, predict_fn, output_fn) and is kept as a string so
# the notebook can materialize it as inference.py below.
inference_script = '''import os
import json
import joblib
import numpy as np
import pandas as pd

def model_fn(model_dir):
    """Load model and scaler from model directory"""
    model_path = os.path.join(model_dir, 'model.joblib')
    scaler_path = os.path.join(model_dir, 'scaler.joblib')
    
    model = joblib.load(model_path)
    scaler = joblib.load(scaler_path)
    
    return {'model': model, 'scaler': scaler}

def input_fn(request_body, request_content_type):
    """Parse input data"""
    if request_content_type == 'application/json':
        data = json.loads(request_body)
        return pd.DataFrame(data)
    else:
        raise ValueError(f'Unsupported content type: {request_content_type}')

def predict_fn(input_data, model_artifacts):
    """Generate predictions"""
    model = model_artifacts['model']
    scaler = model_artifacts['scaler']
    
    # Scale input data
    scaled_data = scaler.transform(input_data)
    
    # Generate predictions and scores
    predictions = model.predict(scaled_data)
    scores = model.decision_function(scaled_data)
    
    # Convert predictions: -1 (anomaly) to 1, 1 (normal) to 0
    anomaly_labels = (predictions == -1).astype(int).tolist()
    
    return {
        'predictions': anomaly_labels,
        'anomaly_scores': scores.tolist()
    }

def output_fn(prediction, response_content_type):
    """Format output"""
    if response_content_type == 'application/json':
        return json.dumps(prediction)
    else:
        raise ValueError(f'Unsupported content type: {response_content_type}')
'''

# Materialize the script in the working directory, where the SKLearnModel
# entry_point='inference.py' setting refers to it.
Path('inference.py').write_text(inference_script)

print("Inference script created")

Inference script created


In [6]:
# Stage everything that goes into the model archive in one local directory.
model_dir = Path('model_artifacts')
model_dir.mkdir(exist_ok=True)

# Serialize the fitted estimator and the scaler with joblib, under the file
# names that inference.py's model_fn loads.
for artifact, filename in ((optimized_model, 'model.joblib'), (scaler, 'scaler.joblib')):
    joblib.dump(artifact, model_dir / filename)

# Ship the model card alongside the binaries.
import shutil
shutil.copy('model_card.json', model_dir / 'model_card.json')

print(f"Model artifacts prepared in {model_dir}/")


Model artifacts prepared in model_artifacts/


In [7]:
# Bundle the staged artifacts into a gzip-compressed tarball for SageMaker.
model_archive = 'model.tar.gz'

with tarfile.open(model_archive, mode='w:gz') as archive:
    # arcname='.' stores the directory's contents relative to the archive root
    # instead of under a 'model_artifacts/' folder.
    archive.add(model_dir, arcname='.')

archive_kb = Path(model_archive).stat().st_size / 1024
print(f"Model archived: {model_archive} ({archive_kb:.0f} KB)")

Model archived: model.tar.gz (822 KB)


In [8]:
# Destination of the archive: <s3_prefix>/models/<archive name> in the bucket.
model_s3_key = '/'.join([s3_prefix, 'models', model_archive])
model_s3_uri = f"s3://{bucket}/{model_s3_key}"

# Copy the local tarball to S3.
s3_client.upload_file(Filename=model_archive, Bucket=bucket, Key=model_s3_key)

print(f"Model uploaded to S3: {model_s3_uri}")

Model uploaded to S3: s3://sagemaker-us-east-1-864106638709/cms-anomaly-detection/models/model.tar.gz


## 4. Model Registry & Versioning

Register model in SageMaker Model Registry for versioning and lifecycle management.

In [9]:
# Create the Model Package Group that versions of this model are registered
# under. The cell is meant to be re-runnable: an already-existing group is
# reported, any other failure is re-raised.
model_package_group_name = 'cms-anomaly-detection-models'

try:
    sagemaker_client.create_model_package_group(
        ModelPackageGroupName=model_package_group_name,
        ModelPackageGroupDescription='CMS Open Payments Anomaly Detection Models'
    )
except sagemaker_client.exceptions.ClientError as err:
    # ResourceInUse is a ClientError subclass, so it is still handled here.
    # A duplicate group may instead arrive as a generic ClientError whose
    # message says the group already exists, so inspect both code and message.
    error_info = err.response.get('Error', {})
    already_exists = (
        error_info.get('Code') == 'ResourceInUse'
        or 'already exists' in error_info.get('Message', '').lower()
    )
    if not already_exists:
        raise
    print(f"Model Package Group exists: {model_package_group_name}")
else:
    print(f"Model Package Group created: {model_package_group_name}")

Model Package Group created: cms-anomaly-detection-models


In [10]:
# Wrap the uploaded archive in a SageMaker scikit-learn model whose request
# handling is defined by inference.py.
from sagemaker.sklearn import SKLearnModel

# Timestamp suffix distinguishes the model names of separate runs.
model_timestamp = datetime.now().strftime('%Y%m%d-%H%M%S')

sklearn_model = SKLearnModel(
    name=f"cms-anomaly-model-{model_timestamp}",
    model_data=model_s3_uri,
    entry_point='inference.py',
    role=role,
    sagemaker_session=sagemaker_session,
    # NOTE(review): confirm this container version is offered by the installed
    # SDK and can unpickle a model saved by the notebook kernel's scikit-learn.
    framework_version='1.4-1',
    py_version='py3',
)

print(f"SKLearn Model: {sklearn_model.name}")

SKLearn Model: cms-anomaly-model-20260215-051015


In [11]:
# Resolve the scikit-learn serving image for this region / framework version.
# NOTE(review): confirm '1.4-1' is a version the installed SDK can resolve.
from sagemaker import image_uris

sklearn_image_uri = image_uris.retrieve(
    framework='sklearn',
    version='1.4-1',
    py_version='py3',
    region=region,
    instance_type='ml.m5.large',
)

# Container definition recorded in the registry: serving image + model archive.
# NOTE(review): no entry point or environment is attached here — confirm that
# a model deployed from the registry can locate the inference handlers.
registry_container = {
    "Image": sklearn_image_uri,
    "ModelDataUrl": model_s3_uri,
}

# Register a new version in the group; it starts out awaiting manual approval.
model_package_description = "Optimized Isolation Forest model for CMS anomaly detection"

model_package_input_dict = dict(
    ModelPackageGroupName=model_package_group_name,
    ModelPackageDescription=model_package_description,
    ModelApprovalStatus="PendingManualApproval",
    InferenceSpecification={
        "Containers": [registry_container],
        "SupportedContentTypes": ["application/json"],
        "SupportedResponseMIMETypes": ["application/json"],
    },
)

model_package_response = sagemaker_client.create_model_package(**model_package_input_dict)
model_package_arn = model_package_response["ModelPackageArn"]

print(f"Model registered with ARN: {model_package_arn}")

In [12]:
# Report the scikit-learn version installed in this notebook kernel.
# NOTE(review): the serving container is pinned to framework_version '1.4-1'
# elsewhere in this notebook — confirm that a model pickled with the version
# printed here loads in that container.
import sklearn
print(sklearn.__version__)

1.7.2


In [None]:
# Mark the registered model version as Approved in the Model Registry.
approval_request = {
    "ModelPackageArn": model_package_arn,
    "ModelApprovalStatus": "Approved",
}
sagemaker_client.update_model_package(**approval_request)

print("Model approved for deployment")

## 5. Endpoint Deployment

Deploy model to SageMaker real-time inference endpoint.

In [None]:
# Timestamped name so that separate deployments get distinct endpoint names.
deploy_stamp = datetime.now().strftime('%Y%m%d-%H%M%S')
endpoint_name = f"cms-anomaly-endpoint-{deploy_stamp}"

print(f"Deploying to endpoint: {endpoint_name}")

# One ml.m5.large instance; requests and responses are exchanged as JSON.
predictor = sklearn_model.deploy(
    endpoint_name=endpoint_name,
    instance_type='ml.m5.large',
    initial_instance_count=1,
    deserializer=JSONDeserializer(),
    serializer=JSONSerializer(),
)

print(f"Endpoint deployed: {endpoint_name}")

In [None]:
# Ask the SageMaker API for the endpoint description and report its status.
endpoint_description = sagemaker_client.describe_endpoint(EndpointName=endpoint_name)
endpoint_status = endpoint_description['EndpointStatus']
print(f"Endpoint Status: {endpoint_status}")

## 6. Inference Testing

Test the deployed endpoint with sample data and evaluate inference performance.

In [None]:
# Draw a fixed 100-row sample of the dataset to exercise the endpoint.
test_sample = df.sample(n=100, random_state=42)

# Rebuild the numeric feature list: start from all numeric columns, then drop
# explicit identifier columns and any column whose name contains an
# identifier-like fragment.
# NOTE(review): this list is re-derived here by name heuristics — confirm it
# matches the columns (and order) the scaler/model were fitted on.
numeric_cols = test_sample.select_dtypes(include=[np.number]).columns.tolist()
cols_to_exclude = [
    'EventTime', 'covered_recipient_profile_id', 'index',
    'teaching_hospital_id', 'covered_recipient_npi',
    'recipient_zip_code', 'recipient_province', 'recipient_postal_code'
]
excluded_fragments = ('_id', '_code', '_province', '_postal')

numeric_features = []
for col in numeric_cols:
    if col in cols_to_exclude:
        continue
    col_lower = col.lower()
    if any(fragment in col_lower for fragment in excluded_fragments):
        continue
    numeric_features.append(col)

# Work on a copy and fill gaps with the per-column median of this sample.
# NOTE(review): the medians come from the 100-row sample, not the training
# data — confirm that is acceptable for this test.
X_test_sample = test_sample[numeric_features].copy()
sample_medians = X_test_sample.median()
X_test_sample = X_test_sample.fillna(sample_medians)

print(f"Test sample prepared: {X_test_sample.shape}")

In [None]:
# Send one record to the endpoint and time the round trip.
single_record = X_test_sample.iloc[0:1].to_dict('records')

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
response = predictor.predict(single_record)
inference_time = time.perf_counter() - start_time

print(f"Single prediction: {inference_time*1000:.2f} ms")
print(f"Response: {response}")

In [None]:
# Send the whole test sample in one request and time the round trip.
batch_records = X_test_sample.to_dict('records')

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
batch_response = predictor.predict(batch_records)
batch_inference_time = time.perf_counter() - start_time

# Derive per-record latency and throughput from the single batch timing.
n_batch_records = len(batch_records)
ms_per_record = (batch_inference_time / n_batch_records) * 1000
records_per_sec = n_batch_records / batch_inference_time

print(
    f"Batch ({n_batch_records} records): {batch_inference_time:.2f}s | "
    f"{ms_per_record:.2f} ms/record | {records_per_sec:.2f} records/sec"
)

In [None]:
# Unpack the two lists returned by the endpoint for the batch request.
anomaly_scores = batch_response['anomaly_scores']
predictions = batch_response['predictions']

# Predictions are 0/1 flags (1 = anomaly), so their sum is the anomaly count.
n_scored = len(predictions)
anomaly_count = sum(predictions)
anomaly_rate = (anomaly_count / n_scored) * 100

# Summary statistics of the raw decision-function scores.
score_mean = np.mean(anomaly_scores)
score_std = np.std(anomaly_scores)
score_min = np.min(anomaly_scores)
score_max = np.max(anomaly_scores)

print(f"Results: {anomaly_count}/{n_scored} anomalies ({anomaly_rate:.2f}%)")
print(f"Scores: mean={score_mean:.4f}, std={score_std:.4f}, range=[{score_min:.4f}, {score_max:.4f}]")

In [None]:
# Performance testing - measure the latency distribution of single-record
# requests against the endpoint.
latencies = []  # per-request round-trip time in milliseconds
n_tests = 50

for _ in range(n_tests):
    # A random record per request (unseeded, so runs are not reproducible).
    test_record = X_test_sample.sample(n=1).to_dict('records')
    # perf_counter() is monotonic and high-resolution, unlike time.time(),
    # which can jump when the system clock is adjusted.
    start = time.perf_counter()
    predictor.predict(test_record)
    latencies.append((time.perf_counter() - start) * 1000)

# Summary statistics consumed by the latency plot and the deployment summary.
latency_stats = {
    'Mean': np.mean(latencies),
    'Median': np.median(latencies),
    'P95': np.percentile(latencies, 95),
    'P99': np.percentile(latencies, 99),
    'Min': np.min(latencies),
    'Max': np.max(latencies)
}

print(f"Latency ({n_tests} tests): Mean={latency_stats['Mean']:.2f}ms, P95={latency_stats['P95']:.2f}ms, P99={latency_stats['P99']:.2f}ms")

In [None]:
# Visualize latency distribution
# Passes the raw per-request latencies (ms) and their summary statistics to the
# project's plotting helper, then renders the figure.
# NOTE(review): the helper's return value is assumed to be a matplotlib figure — confirm.
fig = model_viz.plot_latency_distribution(latencies, latency_stats)
plt.show()

## 7. Performance Monitoring

Set up monitoring and logging for the deployed endpoint.

In [None]:
# Enable data capture for monitoring: record all endpoint traffic
# (sampling_percentage=100) under the project's data-capture prefix in S3.
from sagemaker.model_monitor import DataCaptureConfig

data_capture_uri = f"s3://{bucket}/{s3_prefix}/data-capture"

data_capture_config = DataCaptureConfig(
    enable_capture=True,
    sampling_percentage=100,
    destination_s3_uri=data_capture_uri
)

# Building the config object alone changes nothing on the endpoint. Apply it
# to the live endpoint so that capture is really on before reporting it.
# This updates the endpoint configuration and can take several minutes.
predictor.update_data_capture_config(data_capture_config=data_capture_config)

print(f"Data capture enabled: {data_capture_uri}")

In [None]:
# Create a CloudWatch client and list the endpoint metrics of interest.
# Only the client is created here; no alarms or dashboards are set up in this cell.
cloudwatch = boto3.client('cloudwatch')
endpoint_metrics = (
    'ModelLatency',
    'ModelInvocations',
    'ModelInvocation4XXErrors',
    'ModelInvocation5XXErrors',
)
print(f"CloudWatch monitoring: {', '.join(endpoint_metrics)}")

In [None]:
# Deployment summary: one (component, value) pair per row, combining fixed
# deployment settings with the measurements taken above.
summary_rows = [
    ('Model Type', 'Isolation Forest'),
    ('Framework', 'scikit-learn 1.4-1'),
    ('Endpoint Name', endpoint_name),
    ('Instance Type', 'ml.m5.large'),
    ('Instance Count', '1'),
    ('Model S3 URI', model_s3_uri),
    ('Model Package ARN', model_package_arn),
    ('Average Latency (ms)', f"{latency_stats['Mean']:.2f}"),
    ('P95 Latency (ms)', f"{latency_stats['P95']:.2f}"),
    # Throughput comes from the single timed batch request.
    ('Throughput (records/sec)', f"{len(batch_records)/batch_inference_time:.2f}"),
]

deployment_summary = pd.DataFrame(summary_rows, columns=['Component', 'Value'])

print("Deployment Summary:")
display(deployment_summary)

## 8. Summary & Cleanup

Save deployment configuration and provide cleanup instructions.

In [None]:
# Collect the identifiers and settings needed to find this deployment again.
deployment_config = dict(
    endpoint_name=endpoint_name,
    model_package_arn=model_package_arn,
    model_s3_uri=model_s3_uri,
    region=region,
    bucket=bucket,
    deployment_date=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    instance_type='ml.m5.large',
    instance_count=1,
    framework='scikit-learn',
    framework_version='1.4-1',
)

# Persist the configuration as JSON next to the notebook.
with open('deployment_config.json', 'w') as config_file:
    json.dump(deployment_config, config_file, indent=2)

print("Deployment configuration saved")

In [None]:
# Store variables for downstream notebooks
%store endpoint_name
%store model_package_arn
%store predictor
%store deployment_config

print(f"Stored: endpoint_name, model_package_arn, predictor, deployment_config")