In [20]:
import os
from time import time
import re
import shutil
import glob
import warnings

import numpy as np
import tritonclient.http as httpclient
from tritonclient.utils import *

### prepare repo

In [21]:
# Root directory that holds the Triton model repository for this demo.
BASE_DIR = "/hps_demo"


def repo_check(repo_path):
    """Recreate ``repo_path`` as an empty directory.

    If a directory already exists at ``repo_path`` it is removed together with
    everything inside it; afterwards the path (including any missing parent
    directories) is created again.

    Args:
        repo_path: Directory path to reset.
    """
    already_there = os.path.isdir(repo_path)
    if already_there:
        # Drop stale contents so every run starts from a clean directory.
        shutil.rmtree(repo_path)
    os.makedirs(repo_path)

# Every model gets its own directory under BASE_DIR plus a version
# subdirectory named "1".
hps_embedding_repo = os.path.join(BASE_DIR, "hps_embedding")
hps_embedding_version = os.path.join(hps_embedding_repo, "1")

tf_dense_model_repo = os.path.join(BASE_DIR, "tf_dense_model")
tf_dense_model_version = os.path.join(tf_dense_model_repo, "1")

ensemble_model_repo = os.path.join(BASE_DIR, "ensemble_model")
ensemble_model_version = os.path.join(ensemble_model_repo, "1")

# Reset each model directory first and its version directory right after it,
# so the version directory is created inside the freshly emptied parent.
for _fresh_dir in (
    hps_embedding_repo,
    hps_embedding_version,
    tf_dense_model_repo,
    tf_dense_model_version,
    ensemble_model_repo,
    ensemble_model_version,
):
    repo_check(_fresh_dir)

In [22]:
!tree /hps_demo

[01;34m/hps_demo[00m
├── [01;34mensemble_model[00m
│   └── [01;34m1[00m
├── [01;34mhps_embedding[00m
│   └── [01;34m1[00m
└── [01;34mtf_dense_model[00m
    └── [01;34m1[00m

6 directories, 0 files


### "hps_embedding" backend configuration 

In [23]:
%%writefile $hps_embedding_repo/config.pbtxt
# Triton model configuration for the "hps_embedding" model, served by the
# "hps" backend.
name: "hps_embedding"
backend: "hps"
max_batch_size:1024,
input [
  {
    # INT64 keys; dims of -1 declare a variable-length dimension.
    name: "KEYS"
    data_type: TYPE_INT64
    dims: [ -1 ]
  },
  {
    # INT32 counts; presumably the number of keys per embedding table --
    # TODO confirm against the HPS backend documentation.
    name: "NUMKEYS"
    data_type: TYPE_INT32
    dims: [ -1 ]
  }
]
output [
  {
    # FP32 output of variable length.
    name: "OUTPUT0"
    data_type: TYPE_FP32
    dims: [ -1 ]
  }
]
# Serve only version 1 of the model.
version_policy: {
        specific:{versions: 1}
},
# A single model instance placed on GPU 0.
instance_group [
  {
    count: 1
    kind : KIND_GPU
    gpus:[0]
  }
]

Writing /hps_demo/hps_embedding/config.pbtxt


In [24]:
%%writefile $BASE_DIR/hps_embedding.json
{
    "supportlonglong": true,
    "models": [{
        "model": "hps_embedding",
        "sparse_files": ["/hps_demo/hps_embedding/1/naive_dnn_sparse.model"],
        "num_of_worker_buffer_in_pool": 3,
        "embedding_table_names":["sparse_embedding1"],
        "embedding_vecsize_per_table": [16],
        "maxnum_catfeature_query_per_table_per_sample": [3],
        "default_value_for_each_table": [1.0],
        "deployed_device_list": [0],
        "max_batch_size": 65536,
        "cache_refresh_percentage_per_iteration": 0.2,
        "hit_rate_threshold": 1.0,
        "gpucacheper": 1.0,
        "gpucache": true
        }
    ]
}

Writing /hps_demo/hps_embedding.json


In [25]:
!cp -r ./naive_dnn_sparse.model $hps_embedding_version/

### "tf_dense_model" backend configuration 

In [42]:
%%writefile $tf_dense_model_repo/config.pbtxt
# Triton model configuration for the "tf_dense_model" TensorFlow SavedModel.
name: "tf_dense_model"
platform: "tensorflow_savedmodel"
max_batch_size:16,
input [
  {
    # 48 FP32 values per sample (the batch dimension is not listed in dims).
    name: "args_0"
    data_type: TYPE_FP32
    dims: [48]
  }
]
output [
  {
    # One FP32 value per sample.
    name: "fc_3"
    data_type: TYPE_FP32
    dims: [1]
  }
]
# Serve only version 1 of the model.
version_policy: {
        specific:{versions: 1}
},
# A single model instance placed on GPU 0.
instance_group [
  {
    count: 1
    kind : KIND_GPU
    gpus:[0]
  }
]

Overwriting /hps_demo/tf_dense_model/config.pbtxt


In [27]:
!cp -r ./naive_dnn_dense.model $tf_dense_model_version/model.savedmodel

In [28]:
!tree $BASE_DIR

[01;34m/hps_demo[00m
├── [01;34mensemble_model[00m
│   └── [01;34m1[00m
├── [01;34mhps_embedding[00m
│   ├── [01;34m1[00m
│   │   └── [01;34mnaive_dnn_sparse.model[00m
│   │       ├── emb_vector
│   │       └── key
│   └── config.pbtxt
├── hps_embedding.json
└── [01;34mtf_dense_model[00m
    ├── [01;34m1[00m
    │   └── [01;34mmodel.savedmodel[00m
    │       ├── [01;34massets[00m
    │       ├── keras_metadata.pb
    │       ├── saved_model.pb
    │       └── [01;34mvariables[00m
    │           ├── variables.data-00000-of-00001
    │           └── variables.index
    └── config.pbtxt

10 directories, 9 files


In [None]:
# NOTE: this is a shell command, not Python -- run it in a terminal, not in a
# notebook cell. It targets /hps_infer/embedding and the model "hps_dense",
# neither of which is created by this notebook (which uses /hps_demo); see the
# "start triton server" section for the commands that match this notebook.
# tritonserver --model-repository=/hps_infer/embedding/  --model-control-mode=explicit --backend-config=tensorflow,version=2 --load-model=hps_dense

In [None]:
# NOTE: this is a shell command, not Python -- run it in a terminal, not in a
# notebook cell. It targets /workspace/workspace/hps_infer/embedding and the
# model "hps_dense", neither of which is created by this notebook (which uses
# /hps_demo); see the "start triton server" section for matching commands.
# tritonserver --model-repository=/workspace/workspace/hps_infer/embedding/  --model-control-mode=explicit --backend-config=tensorflow,version=2 --load-model=hps_dense

In [None]:
# NOTE: this is a shell command, not Python -- run it in a terminal, not in a
# notebook cell. It targets /hps_infer/embedding and the model
# "hps_triton_ensemble", neither of which is created by this notebook (which
# uses /hps_demo); see the "start triton server" section for matching commands.
# tritonserver --model-repository=/hps_infer/embedding/ --load-model=hps_triton_ensemble --backend-directory=/usr/local/hugectr/backends --backend-config=hps,ps=/hps_infer/embedding/hps_triton_ensemble.json

In [None]:
# NOTE: this is a shell command, not Python -- run it in a terminal, not in a
# notebook cell. It targets /hps_infer/embedding and the model "hps_dense",
# neither of which is created by this notebook (which uses /hps_demo); see the
# "start triton server" section for the commands that match this notebook.
# tritonserver --model-repository=/hps_infer/embedding/ --load-model=hps_dense --backend-config=tensorflow,version=2 --model-control-mode=explicit

# start triton server

### hps_embedding

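Run one of the following commands in a terminal. The first uses Triton's default ports (HTTP 8000, gRPC 8001, metrics 8002):

`tritonserver --model-repository=/hps_demo/ --load-model=hps_embedding --backend-directory=/usr/local/hugectr/backends --backend-config=hps,ps=/hps_demo/hps_embedding.json --model-control-mode=explicit`

The second uses custom ports so that this server can run next to another Triton instance on the default ports. The client cells below connect to `localhost:8010`, so this is the variant they expect:

`tritonserver --model-repository=/hps_demo/ --load-model=hps_embedding --backend-directory=/usr/local/hugectr/backends --backend-config=hps,ps=/hps_demo/hps_embedding.json --model-control-mode=explicit --grpc-port=8009 --http-port=8010 --metrics-port=8011`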

In [78]:
!curl -v localhost:8010/v2/health/ready

*   Trying 127.0.0.1:8010...
* TCP_NODELAY set
* Connected to localhost (127.0.0.1) port 8010 (#0)
> GET /v2/health/ready HTTP/1.1
> Host: localhost:8010
> User-Agent: curl/7.68.0
> Accept: */*
> 
* Mark bundle as not supporting multiuse
< HTTP/1.1 200 OK
< Content-Length: 0
< Content-Type: text/plain
< 
* Connection #0 to host localhost left intact


In [73]:
# Connect to the Triton instance serving hps_embedding (HTTP port 8010).
# httpclient (tritonclient.http, imported at the top of the notebook) is used
# instead of the deprecated tritonhttpclient alias module.
try:
    triton_client = httpclient.InferenceServerClient(url="localhost:8010", verbose=True)
    print("client created.")
except Exception as e:
    print("channel creation failed: " + str(e))
    # Re-raise: without a client the liveness check below cannot work and
    # would otherwise fail with a misleading NameError (or hit a stale client).
    raise

triton_client.is_server_live()

client created.
GET /v2/health/live, headers None
<HTTPSocketPoolResponse status=200 headers={'content-length': '0', 'content-type': 'text/plain'}>


True

In [74]:
# List every model found in the model repository; in the response only the
# loaded model carries "version" and "state" fields.
triton_client.get_model_repository_index()

POST /v2/repository/index, headers None

<HTTPSocketPoolResponse status=200 headers={'content-type': 'application/json', 'content-length': '108'}>
bytearray(b'[{"name":"ensemble_model"},{"name":"hps_embedding","version":"1","state":"READY"},{"name":"tf_dense_model"}]')


[{'name': 'ensemble_model'},
 {'name': 'hps_embedding', 'version': '1', 'state': 'READY'},
 {'name': 'tf_dense_model'}]

In [79]:
# Ask the server to load the model by name (the server is started with
# --model-control-mode=explicit). model_name is reused by the inference cell.
model_name = "hps_embedding"
triton_client.load_model(model_name=model_name)

POST /v2/repository/models/hps_embedding/load, headers None
{}
<HTTPSocketPoolResponse status=200 headers={'content-type': 'application/json', 'content-length': '0'}>
Loaded model 'hps_embedding'


In [80]:
# Request tensors for hps_embedding: one row holding the keys 0, 1, 2 and a
# matching count of 3 (presumably the number of keys for the single embedding
# table -- confirm against the HPS backend documentation).
# Both dtypes are spelled out so they always match the TYPE_INT64 / TYPE_INT32
# inputs declared in config.pbtxt; NumPy's default integer width is platform
# dependent, and set_data_from_numpy rejects a mismatching dtype.
key_tensor = np.array([[0, 1, 2]], dtype=np.int64)
nkey_tensor = np.array([[3]], dtype=np.int32)

inputs = [
    httpclient.InferInput("KEYS",
                          key_tensor.shape,
                          np_to_triton_dtype(np.int64)),
    httpclient.InferInput("NUMKEYS",
                          nkey_tensor.shape,
                          np_to_triton_dtype(np.int32)),
]
inputs[0].set_data_from_numpy(key_tensor)
inputs[1].set_data_from_numpy(nkey_tensor)

In [81]:
# Request only the "OUTPUT0" tensor from the model.
outputs = [
    httpclient.InferRequestedOutput("OUTPUT0")  # binary_data argument left at its default
]

In [83]:
# Run one inference request against hps_embedding through a short-lived
# client; the context manager closes the connection afterwards.
with httpclient.InferenceServerClient("localhost:8010") as client:
    response = client.infer(model_name, inputs, outputs=outputs)

    # Raw response metadata plus the decoded output tensor.
    result = response.get_response()
    embedding_values = response.as_numpy("OUTPUT0")

    print(result)
    print(embedding_values.shape)
    print("Prediction Result:")
    print(embedding_values)

{'model_name': 'hps_embedding', 'model_version': '1', 'parameters': {'NumSample': 1, 'DeviceID': 0}, 'outputs': [{'name': 'OUTPUT0', 'datatype': 'FP32', 'shape': [48], 'parameters': {'binary_data_size': 192}}]}
(48,)
Prediction Result:
[0.20868957 0.20868957 0.20868957 0.20868957 0.20868957 0.20868957
 0.20868957 0.20868957 0.20868957 0.20868957 0.20868957 0.20868957
 0.20868957 0.20868957 0.20868957 0.20868957 0.25324407 0.25324407
 0.25324407 0.25324407 0.25324407 0.25324407 0.25324407 0.25324407
 0.25324407 0.25324407 0.25324407 0.25324407 0.25324407 0.25324407
 0.25324407 0.25324407 0.24849838 0.24849838 0.24849838 0.24849838
 0.24849838 0.24849838 0.24849838 0.24849838 0.24849838 0.24849838
 0.24849838 0.24849838 0.24849838 0.24849838 0.24849838 0.24849838]


### tf_dense_model

This step requires the merlin-inference container; otherwise, build the Triton Inference Server (TIS) TensorFlow backend first.

`tritonserver --model-repository=/hps_demo/ --load-model=tf_dense_model --backend-config=tensorflow,version=2 --model-control-mode=explicit`

In [84]:
!curl -v localhost:8000/v2/health/ready

*   Trying 127.0.0.1:8000...
* TCP_NODELAY set
* Connected to localhost (127.0.0.1) port 8000 (#0)
> GET /v2/health/ready HTTP/1.1
> Host: localhost:8000
> User-Agent: curl/7.68.0
> Accept: */*
> 
* Mark bundle as not supporting multiuse
< HTTP/1.1 200 OK
< Content-Length: 0
< Content-Type: text/plain
< 
* Connection #0 to host localhost left intact


In [85]:
# Connect to the Triton instance serving tf_dense_model (HTTP port 8000).
# httpclient (tritonclient.http, imported at the top of the notebook) is used
# instead of the deprecated tritonhttpclient alias module.
try:
    triton_client = httpclient.InferenceServerClient(url="localhost:8000", verbose=True)
    print("client created.")
except Exception as e:
    print("channel creation failed: " + str(e))
    # Re-raise: without a client the liveness check below cannot work and
    # would otherwise silently use the client created for the other server.
    raise

triton_client.is_server_live()

client created.
GET /v2/health/live, headers None
<HTTPSocketPoolResponse status=200 headers={'content-length': '0', 'content-type': 'text/plain'}>


True

In [86]:
# List every model found in the model repository; in the response only the
# loaded model carries "version" and "state" fields.
triton_client.get_model_repository_index()

POST /v2/repository/index, headers None

<HTTPSocketPoolResponse status=200 headers={'content-type': 'application/json', 'content-length': '108'}>
bytearray(b'[{"name":"ensemble_model"},{"name":"hps_embedding"},{"name":"tf_dense_model","version":"1","state":"READY"}]')


[{'name': 'ensemble_model'},
 {'name': 'hps_embedding'},
 {'name': 'tf_dense_model', 'version': '1', 'state': 'READY'}]

In [87]:
# Ask the server to load the model by name (the server is started with
# --model-control-mode=explicit). model_name is reused by the inference cell.
model_name = "tf_dense_model"
triton_client.load_model(model_name=model_name)

POST /v2/repository/models/tf_dense_model/load, headers None
{}
<HTTPSocketPoolResponse status=200 headers={'content-type': 'application/json', 'content-length': '0'}>
Loaded model 'tf_dense_model'


In [88]:
# One full batch for tf_dense_model: 16 rows (its max_batch_size) of 48 FP32
# values each (the "args_0" dims in config.pbtxt), drawn uniformly from [0, 1).
# A seeded generator keeps the request reproducible across kernel restarts.
rng = np.random.default_rng(0)
input_vector = rng.random((16, 48), dtype=np.float32)

inputs = [
    httpclient.InferInput("args_0",
                          input_vector.shape,
                          np_to_triton_dtype(np.float32)),
]
inputs[0].set_data_from_numpy(input_vector)

In [89]:
# Request only the "fc_3" tensor from the model.
outputs = [
    httpclient.InferRequestedOutput("fc_3")  # binary_data argument left at its default
]

In [90]:
# Run one inference request against tf_dense_model through a short-lived
# client; the context manager closes the connection afterwards.
with httpclient.InferenceServerClient("localhost:8000") as client:
    response = client.infer(model_name, inputs, outputs=outputs)

    # Raw response metadata plus the decoded output tensor.
    result = response.get_response()
    dense_scores = response.as_numpy("fc_3")

    print(result)
    print(dense_scores.shape)
    print("Prediction Result:")
    print(dense_scores)

{'model_name': 'tf_dense_model', 'model_version': '1', 'outputs': [{'name': 'fc_3', 'datatype': 'FP32', 'shape': [16, 1], 'parameters': {'binary_data_size': 64}}]}
(16, 1)
Prediction Result:
[[5942.154 ]
 [5562.44  ]
 [5896.9277]
 [6393.83  ]
 [5681.9185]
 [5549.136 ]
 [5952.219 ]
 [6012.1567]
 [5906.5796]
 [6557.614 ]
 [5916.146 ]
 [5254.8657]
 [5245.139 ]
 [5265.368 ]
 [6218.0913]
 [5348.871 ]]


In [None]:
# Ensemble model configuration template (Triton config.pbtxt syntax).
# NOTE(review): the model name, "MODEL1"/"MODEL2" and the tensor names look
# like placeholders; no model with those names is defined in this notebook.
# NOTE(review): there is no %%writefile header, so running this as a code cell
# does not create a config.pbtxt -- confirm where this is meant to be written.
name: "ensemble_name"
platform: "ensemble"
max_batch_size: 1
input [
  {
    name: "INPUT"
    data_type: TYPE_FP32
    # NOTE(review): dims of 0 look like placeholders -- confirm the real shape.
    dims: [ 0, 0, 0 ]
  }
]
output [
  {
    name: "OUTPUT"
    data_type: TYPE_FP32
    # NOTE(review): dims of 0 look like placeholders -- confirm the real shape.
    dims: [ 0, 0 ]
  }
]

# Two steps; each maps a model tensor name (key) to an ensemble tensor name (value).
# NOTE(review): both steps read "INPUT" and write "OUTPUT" -- confirm the
# intended data flow between the two models.
ensemble_scheduling {
  step [
    {
      model_name: "MODEL1"
      model_version: -1
      input_map {
        key: "INPUT"
        value: "INPUT"
      }
      output_map {
        key: "OUTPUT"
        value: "OUTPUT"
      }
    },
    {
      model_name: "MODEL2"
      model_version: -1
      input_map {
        key: "INPUT"
        value: "INPUT"
      }
      output_map {
        key: "OUTPUT"
        value: "OUTPUT"
      }
    }
  ]
}