# Deploy T4Rec model on Triton-based Vertex endpoint

In [1]:
import os
# Make only the GPU with index 0 visible to CUDA-based libraries in this kernel.
os.environ["CUDA_VISIBLE_DEVICES"]="0"

## Setup

### get project vars

In [353]:
GCP_PROJECTS = !gcloud config get-value project
PROJECT_ID = GCP_PROJECTS[0]
PROJECT_NUM = !gcloud projects list --filter="$PROJECT_ID" --format="value(PROJECT_NUMBER)"
PROJECT_NUM = PROJECT_NUM[0]
LOCATION = 'us-central1'
REGION = "us-central1"

# VERTEX_SA = '934903580331-compute@developer.gserviceaccount.com'
VERTEX_SA = 'jt-vertex-sa@hybrid-vertex.iam.gserviceaccount.com'

print(f"PROJECT_ID: {PROJECT_ID}")
print(f"PROJECT_NUM: {PROJECT_NUM}")
print(f"LOCATION: {LOCATION}")
print(f"REGION: {REGION}")
print(f"VERTEX_SA: {VERTEX_SA}")

PROJECT_ID: hybrid-vertex
PROJECT_NUM: 934903580331
LOCATION: us-central1
REGION: us-central1
VERTEX_SA: jt-vertex-sa@hybrid-vertex.iam.gserviceaccount.com


### get workspace vars

In [354]:
# INPUT_DATA_DIR = os.environ.get("INPUT_DATA_DIR", "/workspace/data")
# OUTPUT_DIR = os.environ.get("OUTPUT_DIR", f"{INPUT_DATA_DIR}/sessions_by_day")
# model_path= os.environ.get("model_path", f"{INPUT_DATA_DIR}/saved_model")

# Local workspace layout: every artifact directory lives under <workspace>/<data>.
REPO_WORKSPACE = 'workspace'
DATA_DIR = 'data'

INPUT_DATA_DIR = '/'.join([REPO_WORKSPACE, DATA_DIR])
TRANSFORMED_WORKFLOW = INPUT_DATA_DIR + '/processed_nvt'
OUTPUT_DIR = INPUT_DATA_DIR + '/sessions_by_day'
MODEL_PATH = INPUT_DATA_DIR + '/saved_model'
ENSEMBLE_MODEL_PATH = INPUT_DATA_DIR + '/models'

# Echo the resolved paths, one per line.
print(
    f"INPUT_DATA_DIR: {INPUT_DATA_DIR}",
    f"TRANSFORMED_WORKFLOW: {TRANSFORMED_WORKFLOW}",
    f"OUTPUT_DIR: {OUTPUT_DIR}",
    f"MODEL_PATH: {MODEL_PATH}",
    f"ENSEMBLE_MODEL_PATH: {ENSEMBLE_MODEL_PATH}",
    sep="\n",
)

INPUT_DATA_DIR: workspace/data
TRANSFORMED_WORKFLOW: workspace/data/processed_nvt
OUTPUT_DIR: workspace/data/sessions_by_day
MODEL_PATH: workspace/data/saved_model
ENSEMBLE_MODEL_PATH: workspace/data/models


In [355]:
!tree $ENSEMBLE_MODEL_PATH

[01;34mworkspace/data/models[00m
├── [01;34m0_predictpytorchtriton[00m
│   ├── [01;34m1[00m
│   │   └── model.pt
│   └── config.pbtxt
└── [01;34mensemble_model[00m
    ├── [01;34m1[00m
    └── config.pbtxt

4 directories, 3 files


### set deployment version

In [356]:
# Suffix used in the GCS bucket name and the endpoint display name.
VERSION='jvt02'
# Version folder for the artifacts in GCS; also part of the model display name.
MODEL_VERSION='v03'

### create GCS bucket

In [357]:
# Bucket that stores the model artifacts; its name is derived from VERSION.
BUCKET_NAME=f'merlin-transformers4rec-{VERSION}'
BUCKET_URI=f'gs://{BUCKET_NAME}'

print(f"BUCKET_URI: {BUCKET_URI}")

BUCKET_URI: gs://merlin-transformers4rec-jvt02


In [7]:
# ! gcloud storage buckets create $BUCKET_URI --location=$REGION --project=$PROJECT_ID

Creating gs://merlin-transformers4rec-jvt02/...


### copy artifact repo to GCS

In [359]:
# ! gcloud storage cp -r ./$REPO_WORKSPACE $BUCKET_URI/$MODEL_VERSION/

In [360]:
! gcloud storage ls $BUCKET_URI/$MODEL_VERSION/workspace/data

gs://merlin-transformers4rec-jvt02/v03/workspace/data/models/
gs://merlin-transformers4rec-jvt02/v03/workspace/data/processed_nvt/
gs://merlin-transformers4rec-jvt02/v03/workspace/data/saved_model/
gs://merlin-transformers4rec-jvt02/v03/workspace/data/sessions_by_day/
gs://merlin-transformers4rec-jvt02/v03/workspace/data/workflow_etl/


In [361]:
# The workspace is copied to GCS under <bucket>/<MODEL_VERSION>/ with its local
# layout intact, so the GCS paths are derived from the local path variables
# rather than repeating the 'workspace/data/...' literals.
MODEL_ARTIFACTS_REPO_GCS = f"{BUCKET_URI}/{MODEL_VERSION}/{ENSEMBLE_MODEL_PATH}"
WORKFLOW_REPO_GCS = f"{BUCKET_URI}/{MODEL_VERSION}/{INPUT_DATA_DIR}/workflow_etl"

print(f"MODEL_ARTIFACTS_REPO_GCS: {MODEL_ARTIFACTS_REPO_GCS}")
print(f"WORKFLOW_REPO_GCS: {WORKFLOW_REPO_GCS}")

MODEL_ARTIFACTS_REPO_GCS: gs://merlin-transformers4rec-jvt02/v03/workspace/data/models
WORKFLOW_REPO_GCS: gs://merlin-transformers4rec-jvt02/v03/workspace/data/workflow_etl


# Build Serving Image

### define model, endpoint, and serving image names

In [362]:
# Display names for the Vertex AI model, deployed model and endpoint.
MODEL_NAME = "0_predictpytorchtriton"
MODEL_DISPLAY_NAME = "-".join(["triton", MODEL_NAME, MODEL_VERSION])
DEPLOYED_MODEL_DISPLAY_NAME = "deployed-" + MODEL_DISPLAY_NAME
ENDPOINT_DISPLAY_NAME = "-".join(["endpoint", MODEL_NAME, VERSION])

# Serving image: the image is named after the model display name, and
# DOCKERNAME is the suffix of the Dockerfile (Dockerfile.<DOCKERNAME>).
IMAGE_NAME = MODEL_DISPLAY_NAME
IMAGE_URI = "/".join(["gcr.io", PROJECT_ID, IMAGE_NAME])
DOCKERNAME = 'servet4rec-tr'

print(
    f"MODEL_DISPLAY_NAME: {MODEL_DISPLAY_NAME}",
    f"DEPLOYED_MODEL_DISPLAY_NAME: {DEPLOYED_MODEL_DISPLAY_NAME}",
    f"ENDPOINT_DISPLAY_NAME: {ENDPOINT_DISPLAY_NAME}",
    f"IMAGE_URI: {IMAGE_URI}",
    f"DOCKERNAME: {DOCKERNAME}",
    sep="\n",
)

MODEL_DISPLAY_NAME: triton-0_predictpytorchtriton-v03
DEPLOYED_MODEL_DISPLAY_NAME: deployed-triton-0_predictpytorchtriton-v03
ENDPOINT_DISPLAY_NAME: endpoint-0_predictpytorchtriton-jvt02
IMAGE_URI: gcr.io/hybrid-vertex/triton-0_predictpytorchtriton-v03
DOCKERNAME: servet4rec-tr


### create serving dir

In [363]:
REPO_DOCKER_PATH_PREFIX = 'src'
SERVING_SUB_DIR = 'serving'

! rm -rf $REPO_DOCKER_PATH_PREFIX

!mkdir $REPO_DOCKER_PATH_PREFIX
!mkdir $REPO_DOCKER_PATH_PREFIX/$SERVING_SUB_DIR

In [364]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/cloudbuild.yaml

# Cloud Build config: build the serving image from Dockerfile.$_DOCKERNAME
# using $_FILE_LOCATION as the build context, and tag it $_IMAGE_URI.
# $_IMAGE_URI, $_FILE_LOCATION and $_DOCKERNAME are supplied as substitutions
# on the `gcloud builds submit` command line.
steps:
- name: 'gcr.io/cloud-builders/docker'
  args: ['build', '-t', '$_IMAGE_URI', '$_FILE_LOCATION', '-f', '$_FILE_LOCATION/Dockerfile.$_DOCKERNAME']
# Images listed here are pushed to the registry after the build steps succeed.
images:
- '$_IMAGE_URI'

Writing src/cloudbuild.yaml


In [365]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/{SERVING_SUB_DIR}/entrypoint.sh
#!/bin/bash
# Copyright 2021 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Container entrypoint: copy the model repository from GCS to local disk and
# start Triton Inference Server on it.
#
# Usage: entrypoint.sh [MODEL_REPOSITORY]
#   MODEL_REPOSITORY  local directory the repository is copied into
#                     (default: /model).
#
# Environment:
#   AIP_STORAGE_URI   GCS URI of the model artifacts (required); the script
#                     exits with status 1 when it is unset or empty.

# Set up a global error handler
err_handler() {
    echo "Error on line: $1"
    echo "Caused by: $2"
    echo "That returned exit status: $3"
    echo "Aborting..."
    exit "$3"
}

trap 'err_handler "$LINENO" "$BASH_COMMAND" "$?"' ERR


if [ -z "${AIP_STORAGE_URI}" ]
  then
    echo 'AIP_STORAGE_URI not set. Exiting ....'
    exit 1
fi

# The first positional argument overrides the default repository location.
MODEL_REPOSITORY="${1:-/model}"

echo "Copying model ensemble from ${AIP_STORAGE_URI} to ${MODEL_REPOSITORY}"
# -p: do not fail (and trip the ERR trap) when the directory already exists.
mkdir -p "${MODEL_REPOSITORY}"
# The wildcard is quoted so that gsutil, not the shell, expands it.
gsutil -m cp -r "${AIP_STORAGE_URI}/*" "${MODEL_REPOSITORY}"

# gsutil does not copy empty dirs so create a version folder for the ensemble.
# Every directory whose name contains "ens" is handled, so zero or several
# matches no longer abort the script.
for ensemble_dir in "${MODEL_REPOSITORY}"/*ens*/; do
    # An unmatched glob is left as the literal pattern; skip it.
    [ -d "${ensemble_dir}" ] || continue
    mkdir -p "${ensemble_dir}1"
done
export LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH}

echo "Starting Triton Server"
# exec replaces this shell with tritonserver so that it receives container
# signals (e.g. SIGTERM on shutdown) directly.
# NOTE(review): shm-default-byte-size is set for the "pytorch" backend here,
# while the notebook's commented-out upload args set it for "python" - confirm
# which backend is intended.
LD_PRELOAD=/usr/local/lib/libarrow.so exec tritonserver \
    --model-repository="${MODEL_REPOSITORY}" \
    --backend-config=pytorch,shm-default-byte-size=67108864 \
    --vertex-ai-default-model=0_predictpytorchtriton \
    --strict-model-config=true

Writing src/serving/entrypoint.sh


In [366]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/Dockerfile.{DOCKERNAME}

# Serving image: Triton Inference Server plus the Python requirements and the
# Google Cloud SDK (gsutil), which entrypoint.sh uses to download the model
# repository when the container starts.
FROM nvcr.io/nvidia/tritonserver:23.01-py3
# AS triton
# FROM nvcr.io/nvidia/merlin/merlin-pytorch:22.12

EXPOSE 8000
EXPOSE 8001
EXPOSE 8002

WORKDIR /src

# Copy only the requirements first so the dependency layers below stay cached
# when just the entrypoint script changes.
COPY serving/requirements.txt serving/

# --no-cache-dir keeps pip's download cache out of the image.
RUN pip3 install --no-cache-dir -U pip
RUN pip3 install --no-cache-dir -r serving/requirements.txt
# RUN pip3 install google-cloud-aiplatform
# RUN pip3 install transformers
# RUN pip3 install transformers4rec[pytorch,nvtabular,dataloader]

# Install the Google Cloud SDK; the apt package lists are removed in the same
# layer so they do not add to the image size.
RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" \
        | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list \
    && curl https://packages.cloud.google.com/apt/doc/apt-key.gpg \
        | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - \
    && apt-get update -y \
    && apt-get install -y google-cloud-sdk \
    && rm -rf /var/lib/apt/lists/*

# Copies the serving code to the docker image.
COPY serving/* serving/

COPY serving/entrypoint.sh ./
RUN chmod +x entrypoint.sh

ENTRYPOINT ["./entrypoint.sh"]

Writing src/Dockerfile.servet4rec-tr


In [367]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/{SERVING_SUB_DIR}/requirements.txt
# Python dependencies installed into the serving image.
google-cloud-aiplatform==1.21.0
# NOTE(review): transformers4rec is not pinned, so a rebuild may pull a
# different release - consider pinning the version that was tested.
transformers4rec[pytorch,nvtabular,dataloader]
google-api-core==2.10.2

Writing src/serving/requirements.txt


In [368]:
# Cloud Build settings: worker machine type and the build-context directory.
MACHINE_TYPE = 'e2-highcpu-32'
FILE_LOCATION = './' + REPO_DOCKER_PATH_PREFIX

print(
    f"MACHINE_TYPE: {MACHINE_TYPE}",
    f"FILE_LOCATION: {FILE_LOCATION}",
    sep="\n",
)

MACHINE_TYPE: e2-highcpu-32
FILE_LOCATION: ./src


### gcloud ignore

In [369]:
! gcloud config set gcloudignore/enabled true

E0302 20:14:29.750621465   21009 backup_poller.cc:134]       Run client channel backup poller: {"created":"@1677788069.750435173","description":"pollset_work","file":"src/core/lib/iomgr/ev_epollex_linux.cc","file_line":320,"referenced_errors":[{"created":"@1677788069.750417091","description":"Bad file descriptor","errno":9,"file":"src/core/lib/iomgr/ev_epollex_linux.cc","file_line":950,"os_error":"Bad file descriptor","syscall":"epoll_wait"}]}
Updated property [gcloudignore/enabled].


In [374]:
%%writefile .gcloudignore
.gcloudignore
/categories/
/tmp/
/workspace/
/torch_ensemble/
/workspace_v3/
/local_model_artifacts/
/workspace_v2/
/testing_entry/
*.ipynb
*.parquet
.git
.github
.ipynb_checkpoints/*
*__pycache__
*cpython-37.pyc
data_utils.py
t4rec_payload.json
single_t4rec_payload.json
credentials.json
instances.json
payload_ensemble.json

Overwriting .gcloudignore


In [375]:
!gcloud meta list-files-for-upload

src/cloudbuild.yaml
src/Dockerfile.servet4rec-tr
src/serving/entrypoint.sh
src/serving/requirements.txt


## submit image to Cloud Build

* run the following either in a notebook terminal or in a notebook cell

In [376]:
# DOCKERNAME=servet4rec-tr
# IMAGE_URI=gcr.io/hybrid-vertex/triton-0_predictpytorchtriton-v03
# FILE_LOCATION=./src
# MACHINE_TYPE=e2-highcpu-32

# gcloud builds submit --config src/cloudbuild.yaml \
#     --substitutions _DOCKERNAME=$DOCKERNAME,_IMAGE_URI=$IMAGE_URI,_FILE_LOCATION=$FILE_LOCATION \
#     --timeout=2h \
#     --machine-type=$MACHINE_TYPE

# Create Vertex AI Model resource

* see [Triton protocol](https://github.com/kserve/kserve/blob/master/docs/predict-api/v2/required_api.md#inference)

## credentials

If you run into IAM issues from the notebook instance, run these in a notebook terminal...

#### [1] application_default_credentials
* creates a credentials file, e.g. `[/root/.config/gcloud/application_default_credentials.json]`


In [422]:
# !gcloud auth application-default login

#### [2] set service account

In [423]:
# Make the service account the active gcloud account (run in a terminal if needed).
# VERTEX_SA = 'jt-vertex-sa@hybrid-vertex.iam.gserviceaccount.com'
# !gcloud config set account $VERTEX_SA

#### [3] grant SA permissions

In [424]:
# !gcloud projects add-iam-policy-binding $PROJECT_ID \
#     --member=serviceAccount:$VERTEX_SA \
#     --role=roles/storage.objectViewer

#### get credentials config for SDK

In [47]:
# CREDENTIALS_FILE = "./credentials.json"

# !gcloud iam service-accounts keys create $CREDENTIALS_FILE \
#     --iam-account=$VERTEX_SA

created key [90365f8890abbc778a4299ac9998342dea811f4d] of type [json] as [./credentials.json] for [jt-vertex-sa@hybrid-vertex.iam.gserviceaccount.com]


In [378]:
from google.oauth2 import service_account

# Load the service-account key (credentials.json in the working directory)
# that the Vertex AI SDK calls below authenticate with.
credentials = service_account.Credentials.from_service_account_file(
    filename='credentials.json',
)

### triton credentials

* TODO
* see [model_repository user guide](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_repository.md#cloud-storage-with-environment-variables) re: GCS environment variables and `TRITON_CLOUD_CREDENTIAL_PATH`

In [425]:
# ! gcloud storage cp credentials.json $BUCKET_URI/$MODEL_VERSION/

In [426]:
# ! gcloud storage cp /root/.config/gcloud/application_default_credentials.json $BUCKET_URI/$MODEL_VERSION/

In [None]:
# %%writefile {REPO_DOCKER_PATH_PREFIX}/cloud_credential.json
# {
#   "gs": {
#     "": "PATH_TO_GOOGLE_APPLICATION_CREDENTIALS",
#     "gs://gcs-bucket-002": "PATH_TO_GOOGLE_APPLICATION_CREDENTIALS_2"
#   },

In [None]:
# gs://merlin-transformers4rec-jvt01/v02/credentials.json

## upload model to Vertex Model Registry

In [379]:
from google.cloud import aiplatform as vertex_ai

# Set the SDK-wide defaults (project, region, credentials) for later calls.
vertex_ai.init(project=PROJECT_ID, location=REGION, credentials=credentials)

In [380]:
# Echo the values that the model upload below depends on.
print(
    f"MODEL_DISPLAY_NAME = {MODEL_DISPLAY_NAME}",
    f"IMAGE_URI = {IMAGE_URI}",
    f"MODEL_ARTIFACTS_REPO_GCS = {MODEL_ARTIFACTS_REPO_GCS}",
    f"MODEL_NAME = {MODEL_NAME}",
    sep="\n",
)

MODEL_DISPLAY_NAME = triton-0_predictpytorchtriton-v03
IMAGE_URI = gcr.io/hybrid-vertex/triton-0_predictpytorchtriton-v03
MODEL_ARTIFACTS_REPO_GCS = gs://merlin-transformers4rec-jvt02/v03/workspace/data/models
MODEL_NAME = 0_predictpytorchtriton


In [381]:
# Arguments passed to the serving container. entrypoint.sh reads the first one
# as the local directory into which it copies the model repository.
SERVING_CONTAINER_ARGS = ['/models']

# Register the Triton model repository in the Vertex AI Model Registry.
# Optional settings left at their defaults here:
#   serving_container_health_route, serving_container_predict_route,
#   serving_container_ports, serving_container_environment_variables, labels
model = vertex_ai.Model.upload(
    display_name=MODEL_DISPLAY_NAME,
    artifact_uri=MODEL_ARTIFACTS_REPO_GCS,
    serving_container_image_uri=IMAGE_URI,
    serving_container_args=SERVING_CONTAINER_ARGS,
    credentials=credentials,
    sync=True,
)

# To reuse a model that was uploaded earlier instead of uploading again:
# model = vertex_ai.Model('projects/934903580331/locations/us-central1/models/2389777527854858240@1')

model.resource_name

Creating Model
Create Model backing LRO: projects/934903580331/locations/us-central1/models/4362072689666424832/operations/676066404016848896
Model created. Resource name: projects/934903580331/locations/us-central1/models/4362072689666424832@1
To use this Model in another session:
model = aiplatform.Model('projects/934903580331/locations/us-central1/models/4362072689666424832@1')


'projects/934903580331/locations/us-central1/models/4362072689666424832'

## Create Model endpoint

In [415]:
# Create the Vertex AI endpoint the model will be deployed to.
endpoint = vertex_ai.Endpoint.create(
    location=REGION,
    project=PROJECT_ID,
    display_name=ENDPOINT_DISPLAY_NAME + '-v2',
    # enable_request_response_logging=True
)

# To reuse an existing endpoint instead of creating a new one:
# endpoint = vertex_ai.Endpoint('projects/934903580331/locations/us-central1/endpoints/4659549958607732736')

endpoint.resource_name

Creating Endpoint
Create Endpoint backing LRO: projects/934903580331/locations/us-central1/endpoints/2776763839390154752/operations/3898391937400438784
Endpoint created. Resource name: projects/934903580331/locations/us-central1/endpoints/2776763839390154752
To use this Endpoint in another session:
endpoint = aiplatform.Endpoint('projects/934903580331/locations/us-central1/endpoints/2776763839390154752')


'projects/934903580331/locations/us-central1/endpoints/2776763839390154752'

## deploy model to endpoint

In [416]:
# Serving hardware, replica bounds and traffic split for this deployment.
machine_type = "n1-highmem-16"
accelerator_type = "NVIDIA_TESLA_T4"
accelerator_count = 1
min_replica_count = 1
max_replica_count = 1
traffic_percentage = 100

# Deploy the uploaded model to the endpoint, running as VERTEX_SA; sync=True
# blocks until the deployment operation has finished.
model.deploy(
    endpoint=endpoint,
    deployed_model_display_name=DEPLOYED_MODEL_DISPLAY_NAME + '-v2',
    service_account=VERTEX_SA,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    min_replica_count=min_replica_count,
    max_replica_count=max_replica_count,
    traffic_percentage=traffic_percentage,
    sync=True,
    # deploy_request_timeout=1800
)


Deploying model to Endpoint : projects/934903580331/locations/us-central1/endpoints/2776763839390154752
Deploy Endpoint model backing LRO: projects/934903580331/locations/us-central1/endpoints/2776763839390154752/operations/5472400007166427136
Endpoint model deployed. Resource name: projects/934903580331/locations/us-central1/endpoints/2776763839390154752


<google.cloud.aiplatform.models.Endpoint object at 0x7fe0b0108220> 
resource name: projects/934903580331/locations/us-central1/endpoints/2776763839390154752

# clean-up helpers

In [None]:
# Uncomment to tear down the resources created above.
# force=True undeploys any models still deployed before deleting the endpoint.
# endpoint.delete(force=True)
# model.delete()