In [31]:
!pip install --upgrade -q boto3 sagemaker huggingface_hub transformers

In [32]:
%%writefile inference.py

import os
import torch
from flask import Flask, request, jsonify
from transformers import AutoModelForCausalLM, AutoTokenizer

# Model source settings: a local directory is tried first, a Hugging Face model ID is the fallback
MODEL_DIR = os.getenv("MODEL_PATH", "/opt/ml/model")
MODEL_ID = os.getenv("MODEL_ID", "gpt2")  # used when MODEL_DIR is missing or empty

# Flask application object that exposes the /ping and /invocations routes
app = Flask(__name__)

# Load models at startup - using a function to handle errors better
def load_model():
    """Load the causal LM and its tokenizer.

    MODEL_DIR is used when it exists and contains at least one entry;
    otherwise the model named by MODEL_ID is loaded from Hugging Face.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        Exception: Any loading error is printed and then re-raised unchanged.
    """
    try:
        print(f"Attempting to load model from local path: {MODEL_DIR}")
        use_local = bool(os.path.exists(MODEL_DIR) and os.listdir(MODEL_DIR))
        if not use_local:
            print(f"Local model not found. Loading model from Hugging Face: {MODEL_ID}")

        # Both branches load the same way; only the source differs
        source = MODEL_DIR if use_local else MODEL_ID
        loaded_model = AutoModelForCausalLM.from_pretrained(source)
        loaded_tokenizer = AutoTokenizer.from_pretrained(source)

        if use_local:
            print("Model loaded successfully from local path")
        else:
            print(f"Model {MODEL_ID} loaded successfully from Hugging Face")

        return loaded_model, loaded_tokenizer
    except Exception as e:
        print(f"Error loading model: {str(e)}")
        raise

# Load model when the script starts: this runs at import time, so the module-level
# `model` and `tokenizer` used by invoke() exist before any request is handled
model, tokenizer = load_model()

@app.route("/ping", methods=["GET"])
def ping():
    """Health check endpoint required by SageMaker; always answers 200."""
    health = {"status": "healthy"}
    return jsonify(health), 200

@app.route("/invocations", methods=["POST"])
def invoke():
    """Generate a text completion for a JSON request.

    Request body: a JSON object with a non-empty string ``"input"`` and an
    optional integer ``"max_new_tokens"`` (1..512). When ``max_new_tokens``
    is omitted, the model's default generation length applies.

    Returns:
        ``{"response": <text>}`` with HTTP 200 on success,
        ``{"error": <message>}`` with HTTP 400 for an invalid request, or
        ``{"error": <message>}`` with HTTP 500 if generation fails.
    """
    max_new_tokens_limit = 512  # upper bound so one request cannot run unbounded

    # force=True ignores the Content-Type header and silent=True returns None
    # instead of raising, so malformed bodies become a 400 rather than a 500.
    data = request.get_json(force=True, silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object"}), 400

    input_text = data.get("input", "")
    if not isinstance(input_text, str) or not input_text.strip():
        return jsonify({"error": "'input' must be a non-empty string"}), 400

    generate_kwargs = {}
    max_new_tokens = data.get("max_new_tokens")
    if max_new_tokens is not None:
        # bool is a subclass of int, so it is rejected explicitly
        is_valid = (
            isinstance(max_new_tokens, int)
            and not isinstance(max_new_tokens, bool)
            and 1 <= max_new_tokens <= max_new_tokens_limit
        )
        if not is_valid:
            message = f"'max_new_tokens' must be an integer between 1 and {max_new_tokens_limit}"
            return jsonify({"error": message}), 400
        generate_kwargs["max_new_tokens"] = max_new_tokens

    try:
        print(f"Received input: {input_text}")

        # Generate response using the model
        inputs = tokenizer(input_text, return_tensors="pt")
        outputs = model.generate(**inputs, **generate_kwargs)
        # Drop markers such as the end-of-text token from the returned string
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)

        print(f"Generated response: {response}")
        return jsonify({"response": response})
    except Exception as e:
        print(f"Error during inference: {str(e)}")
        return jsonify({"error": str(e)}), 500

if __name__ == "__main__":
    # Direct execution starts Flask's built-in server; under gunicorn this branch is not taken
    app.run(port=8080, host="0.0.0.0")

Overwriting inference.py


In [33]:
%%writefile requirements.txt

# Model runtime: inference.py imports torch and transformers
torch>=1.10.0
transformers>=4.18.0
# HTTP layer: the Flask app in inference.py, served by gunicorn from the image's `serve` script
flask>=2.0.0
gunicorn>=20.1.0
# NOTE(review): numpy is not imported by inference.py directly; presumably kept for torch/transformers, confirm
numpy>=1.20.0

Overwriting requirements.txt


In [34]:
%%writefile Dockerfile

# Inference image: Flask app (inference.py) served by gunicorn on port 8080.
# NOTE(review): python:3.8 no longer receives upstream updates; consider a maintained tag after testing.
FROM python:3.8

WORKDIR /app

# Send print() output to the container logs immediately instead of buffering it
ENV PYTHONUNBUFFERED=1

# Install dependencies before copying the code so this layer stays cached
# when only inference.py changes
COPY requirements.txt /app/
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY inference.py /app/

# Create the model directory that SageMaker expects
RUN mkdir -p /opt/ml/model

# Create a serve script that SageMaker expects.
# printf is used because echo's handling of "\n" differs between shells;
# exec replaces the wrapper shell so gunicorn receives container signals directly.
RUN printf '#!/bin/bash\ncd /app && exec gunicorn --bind 0.0.0.0:8080 inference:app\n' > /usr/local/bin/serve && \
    chmod +x /usr/local/bin/serve

# Environment variables read by inference.py
ENV MODEL_PATH="/opt/ml/model"
ENV MODEL_ID="gpt2"

# Make sure serve is in PATH
ENV PATH="/usr/local/bin:${PATH}"

# Expose the port
EXPOSE 8080

# Command to run when container starts
CMD ["serve"]

Overwriting Dockerfile


In [35]:
import boto3
import sagemaker
import subprocess

def run_command(args, stdin_text=None):
    """Run an external command, echo its output in the cell, and fail on error.

    Args:
        args: Command and arguments as a list (no shell is involved).
        stdin_text: Optional text written to the command's standard input.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero status.
    """
    proc = subprocess.Popen(
        args,
        stdin=subprocess.PIPE if stdin_text is not None else None,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
    )
    with proc:
        if stdin_text is not None:
            proc.stdin.write(stdin_text)
            proc.stdin.close()
        # Stream line by line so long builds and pushes show progress
        for line in proc.stdout:
            print(line, end="")
    if proc.returncode != 0:
        raise subprocess.CalledProcessError(proc.returncode, args)


# Get AWS account ID and region
account_id = boto3.client("sts").get_caller_identity()["Account"]
region = boto3.Session().region_name
if region is None:
    # Without a region the registry host below would contain the text "None"
    raise RuntimeError(
        "No AWS region is configured; set AWS_DEFAULT_REGION or configure a default region."
    )
repository_name = "llm-autogen-app"
print(account_id)
print(region)
print(repository_name)

# Registry host (used for docker login) and full repository URL (used for tag/push)
registry = f"{account_id}.dkr.ecr.{region}.amazonaws.com"
ecr_uri = f"{registry}/{repository_name}"

print(ecr_uri)

# Login to AWS ECR. The token is captured and piped to docker, never printed.
login = subprocess.run(
    ["aws", "ecr", "get-login-password", "--region", region],
    capture_output=True,
    text=True,
)
if login.returncode != 0:
    print(login.stderr, end="")
    login.check_returncode()
run_command(
    ["docker", "login", "--username", "AWS", "--password-stdin", registry],
    stdin_text=login.stdout,
)

# Create the repository if it does not exist
ecr_client = boto3.client("ecr", region_name=region)
try:
    ecr_client.create_repository(repositoryName=repository_name)
    print(f"✅ Created repository: {repository_name}")
except ecr_client.exceptions.RepositoryAlreadyExistsException:
    print(f"✅ Repository {repository_name} already exists.")

# Build, tag, and push the Docker image; any failing step raises, so the
# success message below is only printed when the push really happened
run_command(["docker", "build", "-t", repository_name, "."])
run_command(["docker", "tag", f"{repository_name}:latest", f"{ecr_uri}:latest"])
run_command(["docker", "push", f"{ecr_uri}:latest"])

print(f"✅ Docker image pushed to: {ecr_uri}")

311141549115
us-east-1
llm-autogen-app
311141549115.dkr.ecr.us-east-1.amazonaws.com/llm-autogen-app
https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded
✅ Repository llm-autogen-app already exists.
[1A[1B[0G[?25l[+] Building 0.0s (0/0)  docker:default
[1A[0G[?25l[+] Building 0.0s (0/1)                                          docker:default
[1A[0G[?25l[+] Building 0.2s (2/2)                                          docker:default
[34m => [internal] load build definition from Dockerfile                       0.0s
[0m[34m => => transferring dockerfile: 784B                                       0.0s
[0m[34m => [internal] load metadata for docker.io/library/python:3.8              0.1s
[1A[1A[1A[1A[0G[?25l[+] Building 0.3s (13/13) FINISHED                               docker:default
[34m => [internal] load build definition from Dockerfile                       0.0s
[0m[34m => => transferring dockerfile: 784B               

In [52]:
# List listening TCP/UDP sockets (numeric addresses) and keep only lines mentioning 8080
!netstat -tuln | grep 8080

tcp        0      0 0.0.0.0:8080            0.0.0.0:*               LISTEN     
tcp6       0      0 :::8080                 :::*                    LISTEN     


In [54]:
# Optional local smoke test (uncomment to run): maps host port 8082 to the container's port 8080
# !docker run -p 8082:8080 {ecr_uri}:latest

In [None]:
from sagemaker.model import Model
from sagemaker.serverless import ServerlessInferenceConfig

# Define the SageMaker Model backed by the image pushed to ECR
model = Model(
    image_uri=f"{ecr_uri}:latest",
    role=sagemaker.get_execution_role(),
)

# Deploy the model as a serverless endpoint
serverless_config = ServerlessInferenceConfig(
    memory_size_in_mb=2048,
    max_concurrency=5,
)

# NOTE: per the SageMaker SDK, Model.deploy() returns a Predictor only when the
# Model was given a predictor_cls; none is set here, so `predictor` is expected to be None.
predictor = model.deploy(
    serverless_inference_config=serverless_config
)

# Keep the generated endpoint name so it can be used to invoke the endpoint
# without copying it from the console
endpoint_name = model.endpoint_name

print("✅ SageMaker Serverless Endpoint Deployed!")
print(f"Endpoint name: {endpoint_name}")

------------------------------!✅ SageMaker Serverless Endpoint Deployed!


In [50]:
import json
import boto3
import sagemaker
from sagemaker.predictor import Predictor
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

# Name of the deployed endpoint that both test functions invoke
endpoint_name = "llm-autogen-app-2025-02-24-20-45-43-050"  # Replace with your actual endpoint name

# Low-level SageMaker runtime client, used by test_endpoint_with_boto3()
sagemaker_runtime = boto3.client('sagemaker-runtime')

# Method 1: Using the SageMaker Predictor class
def test_endpoint_with_predictor(prompt="What is LLM?"):
    """Invoke the endpoint through the SageMaker SDK ``Predictor`` wrapper.

    Args:
        prompt: Text sent as the ``"input"`` field of the JSON request.

    Returns:
        The deserialized JSON response, or ``None`` if the call failed
        (the error is printed instead of raised).
    """
    try:
        # Initialize the predictor with JSON (de)serialization on both sides
        predictor = Predictor(
            endpoint_name=endpoint_name,
            sagemaker_session=sagemaker.Session(),
            serializer=JSONSerializer(),
            deserializer=JSONDeserializer()
        )

        # The serializer encodes the payload, so pass a dict rather than a JSON string
        input_data = {"input": prompt}

        # Make prediction
        response = predictor.predict(input_data)
        print("Response using Predictor:")
        print(response)
        return response
    except Exception as e:
        print(f"Error with Predictor method: {str(e)}")
        return None

# Method 2: Using the boto3 sagemaker-runtime client directly
def test_endpoint_with_boto3(prompt="What is LLM?"):
    """Invoke the endpoint through the boto3 ``sagemaker-runtime`` client.

    Args:
        prompt: Text sent as the ``"input"`` field of the JSON request.

    Returns:
        The parsed JSON response, or ``None`` if the call failed
        (the error is printed instead of raised).
    """
    try:
        # Prepare input data
        input_data = {"input": prompt}

        # Invoke the endpoint; the body must already be serialized here
        response = sagemaker_runtime.invoke_endpoint(
            EndpointName=endpoint_name,
            ContentType='application/json',
            Body=json.dumps(input_data)
        )

        # The response body is a stream: read it, decode it, then parse the JSON
        result = json.loads(response['Body'].read().decode())
        print("Response using boto3:")
        print(result)
        return result
    except Exception as e:
        print(f"Error with boto3 method: {str(e)}")
        return None

# Run each invocation method once against the endpoint
if __name__ == "__main__":
    print("Testing SageMaker endpoint using two different methods...")
    for run_test in (test_endpoint_with_predictor, test_endpoint_with_boto3):
        run_test()

Testing SageMaker endpoint using two different methods...
Response using Predictor:
{'response': 'What is LLM?\n\nLLM is a programming language that is designed to be used in'}
Response using boto3:
{'response': 'What is LLM?\n\nLLM is a programming language that is designed to be used in'}
