# 简化版 Triton 模型部署与测试

本笔记本演示如何将一个简化版的、基于 PyTorch 的模型部署到 SageMaker 上；该模型仅执行基本的 torch 张量操作，无需模型微调。

## 0. 配置sagemaker，获取 account id 等

In [1]:
# !pip install sagemaker boto3 --break-system-packages

In [2]:
import boto3
import sagemaker
from sagemaker import Model, image_uris, serializers, deserializers

# IAM role ARN that is passed as `role=` when the Model is created later on.
role = 'arn:aws:iam::310850127430:role/service-role/AmazonSageMaker-ExecutionRole-20240425T101325'
sess = sagemaker.session.Session()  # SageMaker session for interacting with different AWS APIs
# Use the public `boto_region_name` attribute instead of the private `_region_name`.
region = sess.boto_region_name  # region name the session is configured for
account_id = sess.account_id()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ubuntu/.config/sagemaker/config.yaml


## 1. 构建和上传Docker镜像到ECR

我们需要先构建Docker镜像并将其上传到ECR（Elastic Container Registry）。这里有两种方式：

### 1.1 使用提供的脚本（推荐）

In [3]:
# Export the account ID and region as environment variables
# (presumably read by build_and_push.sh — confirm against the script)
%env ACCOUNT={account_id}
%env REGION={region}

# Build the image and push it using the build_and_push.sh script
!chmod +x ./build_and_push.sh
!./build_and_push.sh sagemaker-endpoint/whisper-triton-byoc:latest

env: ACCOUNT=310850127430
env: REGION=us-west-2
310850127430.dkr.ecr.us-west-2.amazonaws.com/sagemaker-endpoint/whisper-triton-byoc:latest



https://docs.docker.com/go/credential-store/

Login Succeeded
[1A[1B[0G[?25l[+] Building 0.0s (0/1)                                          docker:default
 => [internal] load build definition from Dockerfile.server                0.0s
[?25h[1A[1A[0G[?25l[+] Building 0.2s (1/2)                                          docker:default
[34m => [internal] load build definition from Dockerfile.server                0.0s
[0m[34m => => transferring dockerfile: 621B                                       0.0s
[0m => [internal] load metadata for nvcr.io/nvidia/tritonserver:24.10-py3     0.1s
[?25h[1A[1A[1A[1A[0G[?25l[+] Building 0.3s (1/2)                                          docker:default
[34m => [internal] load build definition from Dockerfile.server                0.0s
[0m[34m => => transferring dockerfile: 621B                                       0.0s
[0m => [internal] load metadata for nvcr.io/nvidia/tritonserver:24.10-py3     0.3s
[?25h[1A[1A[1A[1A[0G

### 1.2 手动构建和上传（步骤详解）

In [4]:
# # 1. Define the image name and the ECR repository
# DOCKER_IMAGE = "sagemaker-endpoint/whisper-triton-byoc:latest"
# REPO_NAMESPACE = "sagemaker-endpoint/whisper-triton-byoc"
# TAG = "latest"

# # 2. Create the ECR repository (if it does not exist yet)
# !aws ecr describe-repositories --repository-names "{REPO_NAMESPACE}" > /dev/null 2>&1 || \
# aws ecr create-repository --repository-name "{REPO_NAMESPACE}"

# # 3. Log Docker in to ECR
# !aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com

# # 4. Build the Docker image
# !docker build . -f Dockerfile.server -t {DOCKER_IMAGE}

# # 5. Tag the image and push it to ECR
# REPO_NAME = f"{account_id}.dkr.ecr.{region}.amazonaws.com/{REPO_NAMESPACE}:{TAG}"
# !docker tag {DOCKER_IMAGE} {REPO_NAME}
# !docker push {REPO_NAME}

# # Set the CONTAINER variable used by the later steps
# CONTAINER = REPO_NAME

## 2. 准备模型数据

In [5]:
# 确保先删除之前的打包文件
!rm -f model_data.tar.gz

# 显示部署配置内容
!cat model_data/deploy_config.sh

# 打包模型数据
!tar czvf model_data.tar.gz model_data/ --exclude=model_data/.ipynb_checkpoints --exclude=model_data/__pycache__

# 上传到S3
s3_code_prefix = f"whisper_deploy_codes"
bucket = sess.default_bucket()
code_artifact = sess.upload_data("model_data.tar.gz", bucket, s3_code_prefix)
print(f"S3 Code or Model tar ball uploaded to --- > {code_artifact}")

# deploy_config.sh

#注意 s3 路径最后加上 / 
S3_PATH="s3://a-web-uw2/test_triton/"
model_data/
model_data/ssh_helper_start.py
model_data/run_server.py
model_data/model_repo_whisper_trtllm/
model_data/model_repo_whisper_trtllm/whisper/
model_data/model_repo_whisper_trtllm/whisper/config.pbtxt
model_data/model_repo_whisper_trtllm/whisper/1/
model_data/model_repo_whisper_trtllm/whisper/1/__pycache__/
model_data/model_repo_whisper_trtllm/whisper/1/__pycache__/model.cpython-310.pyc
model_data/model_repo_whisper_trtllm/whisper/1/model.py
model_data/start_triton_and_client.sh
model_data/download_model_from_s3.py
model_data/deploy_config.sh
model_data/whisper_api.py
tar: The following options were used after non-option arguments.  These options are positional and affect only arguments that follow them.  Please, rearrange them properly.
tar: --exclude ‘model_data/.ipynb_checkpoints’ has no effect
tar: --exclude ‘model_data/__pycache__’ has no effect
tar: Exiting with failure status due to previous errors

S3 Code or Model tar ball uploaded to --- > s3://sagemaker-us-west-2-310850127430/whisper_deploy_codes/model_data.tar.gz


## 3. 使用 SSH-helper 进行调试（可选）

Since we deploy the model with the BYOC (Bring Your Own Container) method, we can first deploy and debug the code with SSH Helper once the initial code is ready. After debugging succeeds, we can deploy it with the regular method.

In [6]:
# %pip install sagemaker_ssh_helper==2.2.0

In [7]:
from sagemaker_ssh_helper.wrapper import SSHModelWrapper
# Build the ECR image URI from the session's account ID and region instead of
# hard-coding them, so it stays consistent with the ACCOUNT/REGION values
# exported for build_and_push.sh when the notebook runs in another account or region.
CONTAINER = f"{account_id}.dkr.ecr.{region}.amazonaws.com/sagemaker-endpoint/whisper-triton-byoc:latest"

# The SSH helper's dependency directory is passed via `dependencies`.
model = Model(
    image_uri=CONTAINER,
    model_data=code_artifact,
    role=role,
    dependencies=[SSHModelWrapper.dependency_dir()],
)

In [8]:
from sagemaker_ssh_helper.wrapper import SSHModelWrapper
from time import gmtime, strftime
from sagemaker import Predictor

# Instance type for the debugging endpoint (alternative kept for reference)
instance_type = "ml.g4dn.xlarge"
# instance_type = "ml.p4d.24xlarge"

# Endpoint name derived from a fixed base via sagemaker.utils.name_from_base
endpoint_name = sagemaker.utils.name_from_base("whisper-trt-triton-sshelper")

# Wrap the model with the SSH helper before it is deployed
ssh_wrapper = SSHModelWrapper.create(model, connection_wait_time_seconds=0)

# wait=False: do not block in this cell; the endpoint status is polled separately
predictor = model.deploy(
    endpoint_name=endpoint_name,
    instance_type=instance_type,
    initial_instance_count=1,
    wait=False,
)

In [9]:
# Wait until the endpoint is no longer in the "Creating" state
import time

sm_client = boto3.client("sagemaker")

# Query the status right away, then once every 60 seconds for as long as the
# endpoint reports "Creating"; every observed status is printed.
while True:
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
    status = resp["EndpointStatus"]
    print("Status: " + status)
    if status != "Creating":
        break
    time.sleep(60)

# Final summary: ARN and the last status seen
print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: InService
Arn: arn:aws:sagemaker:us-west-2:310850127430:endpoint/whisper-trt-triton-sshelper-2025-11-01-03-01-10-009
Status: InService


In [10]:
# Ask the SSH helper wrapper for instance IDs; one can then be used with:
#   aws ssm start-session --target <Your_instance_ids>
# NOTE(review): timeout_in_sec=0 presumably returns without waiting, which may
# be why an empty list is printed — confirm against the sagemaker_ssh_helper docs.
instance_ids = ssh_wrapper.get_instance_ids(timeout_in_sec=0)
print(instance_ids)

[]


## 4. 正式部署

In [11]:
# Regular deployment without the SSH helper (kept commented out).
# model = Model(
#     model_data=code_artifact,
#     image_uri=CONTAINER,
#     role=role,
# )

# # Deploy the model to an endpoint
# endpoint_name = sagemaker.utils.name_from_base("simplified-whisper-torch-model")
# print(f"endpoint_name: {endpoint_name}")
# predictor = model.deploy(
#     initial_instance_count=1,
#     instance_type='ml.g5.4xlarge',
#     endpoint_name=endpoint_name,
# )

## 5. 推理调用测试

In [12]:
%pip install pydub

Note: you may need to restart the kernel to use updated packages.


In [13]:
import boto3
import json
import base64
import os
import io
from pydub import AudioSegment

# To target an already-deployed endpoint, set its name explicitly instead, e.g.:
# endpoint_name = 'whisper-trt-triton-sshelper-2025-10-30-09-52-49-729'
# Self-assignment: keeps the endpoint_name already defined earlier in the
# notebook (raises NameError if it has not been defined yet).
endpoint_name = endpoint_name
def encode_audio(audio_file_path):
    """Load a WAV file, downmix stereo to one channel, and base64-encode it.

    Args:
        audio_file_path: Path of the WAV file to read.

    Returns:
        The re-exported WAV bytes as a base64 string.
    """
    segment = AudioSegment.from_wav(audio_file_path)

    # Only two-channel input is converted; any other channel count is left as is.
    if segment.channels == 2:
        print("检测到双通道音频，正在转换为单通道...")
        segment = segment.set_channels(1)

    # Export into an in-memory buffer rather than a file on disk.
    wav_buffer = io.BytesIO()
    segment.export(wav_buffer, format="wav")

    # getvalue() returns the whole buffer regardless of the current position.
    encoded = base64.b64encode(wav_buffer.getvalue())
    return encoded.decode('utf-8')

def invoke_sagemaker_endpoint(runtime_client, endpoint_name, audio_data, repetition_penalty=1.0, whisper_prompt=""):
    """Send one JSON request to a SageMaker endpoint and return the parsed reply.

    Args:
        runtime_client: Object exposing ``invoke_endpoint`` (callers here pass
            a boto3 ``sagemaker-runtime`` client).
        endpoint_name: Name of the endpoint to invoke.
        audio_data: Value sent under the ``audio_data`` key (callers here pass
            the base64 string produced by ``encode_audio``).
        repetition_penalty: Value sent under the ``repetition_penalty`` key.
        whisper_prompt: Value sent under the ``whisper_prompt`` key.

    Returns:
        The response body, decoded and parsed from JSON.
    """
    request_body = json.dumps({
        "whisper_prompt": whisper_prompt,
        "audio_data": audio_data,
        "repetition_penalty": repetition_penalty,
    })

    reply = runtime_client.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType='application/json',
        Body=request_body,
    )

    # The body is a readable stream of bytes; decode, then parse as JSON.
    raw_bytes = reply['Body'].read()
    return json.loads(raw_bytes.decode())

def process_audio(audio_path, endpoint_name, whisper_prompt="", repetition_penalty=1.0):
    """Encode an audio file and send it to a SageMaker endpoint.

    Args:
        audio_path: Path of the WAV file to send.
        endpoint_name: Name of the SageMaker endpoint to invoke.
        whisper_prompt: Optional prompt forwarded with the request.
        repetition_penalty: Repetition penalty forwarded with the request.
            Defaults to 1.0, the value that was previously hard-coded.

    Returns:
        The parsed JSON response returned by ``invoke_sagemaker_endpoint``.
    """
    # Read and encode the audio file
    print("Reading and encoding audio file...")
    audio_data = encode_audio(audio_path)

    # Create a SageMaker runtime client
    runtime_client = boto3.client('sagemaker-runtime')

    # Invoke the SageMaker endpoint
    print(f"Invoking SageMaker endpoint: {endpoint_name}")
    return invoke_sagemaker_endpoint(
        runtime_client,
        endpoint_name,
        audio_data,
        repetition_penalty=repetition_penalty,
        whisper_prompt=whisper_prompt,
    )



In [14]:
# !pip install soundfile

In [15]:
# Helper that creates a test audio file
def create_dummy_audio(output_path="audio.wav", duration=3.0, sample_rate=16000):
    """Write a synthetic WAV file (uniform noise plus a 440 Hz tone) for testing.

    Args:
        output_path: Where the WAV file is written.
        duration: Length of the signal in seconds.
        sample_rate: Samples per second.

    Returns:
        ``output_path``, unchanged.
    """
    import numpy as np
    import soundfile as sf

    n_samples = int(duration * sample_rate)

    # Uniform random noise drawn from [-0.1, 0.1)
    noise = np.random.uniform(-0.1, 0.1, size=n_samples)

    # 440 Hz sine wave with amplitude 0.3; endpoint excluded from the time axis
    timeline = np.linspace(0, duration, n_samples, False)
    tone = 0.3 * np.sin(2 * np.pi * 440 * timeline)

    # Mix, then scale so the largest absolute sample equals 1.0
    mixed = noise + tone
    mixed = mixed / np.max(np.abs(mixed))

    # Save as WAV file
    sf.write(output_path, mixed, sample_rate)
    print(f"Created dummy audio file at {output_path}")
    return output_path

# Create the test audio file in the current directory
audio_path = create_dummy_audio("./audio.wav")

Created dummy audio file at ./audio.wav


In [16]:
%%time
# Send the test audio file to the endpoint and print the parsed response.
audio_path = "./audio.wav"
# Self-assignment: keeps the endpoint_name already defined earlier in the notebook
endpoint_name = endpoint_name
whisper_prompt = ""  # Optional: add a prompt if needed

# Call the function
result = process_audio(audio_path, endpoint_name, whisper_prompt)

# Print the result
print("Processing result:")
print(result)

Reading and encoding audio file...
Invoking SageMaker endpoint: whisper-trt-triton-sshelper-2025-11-01-03-01-10-009


Processing result:
{'code': 0, 'message': 'Success', 'transcribe_text': "Transcription (TEXT_PREFIX: '', WAV shape: (1, 240000), rep_penalty: 1.0)"}
CPU times: user 47 ms, sys: 1.14 ms, total: 48.1 ms
Wall time: 404 ms


## 6. 清理资源

In [17]:
# Delete the endpoint, the endpoint config, and the model to avoid extra charges.
# The endpoint config is deleted under the same name as the endpoint.
sess.delete_endpoint(endpoint_name)
sess.delete_endpoint_config(endpoint_name)
sess.delete_model(model.name)
print("Resources cleaned up.")

Resources cleaned up.
