# Overview
This notebook, **02_model_inference.ipynb**, covers the following tasks:
  * Configure three backends in Triton format
  * Deploy to inference with Triton ensemble mode
  * Validate deployed ensemble model with dummy dataset

In [110]:
import os
import shutil
import numpy as np
import tritonhttpclient
import tritonclient.http as httpclient
from tritonclient.utils import *

## Configure 3 backends in Triton format
The 3 backends are:
* "hps_embedding" backend, HPS Triton backend for embedding lookup serving
* "tf_reshape_dense_model" backend, Tensorflow Triton backend for dense model serving
* "ensemble_model" backend, integrates the above two backends and serves as one ensemble service
![HPS_Triton_overview](./pic/hps_triton_overview.jpg)

In [114]:
# Settings shared with the mock-request cell: number of categorical slots per sample.
args = {"slot_num": 3}

### Prepare Triton Inference Server directories

In [100]:
# Root folder under which every model directory in this notebook is created.
BASE_DIR = "/hps_demo"

def repo_check(repo_path):
    """Reset ``repo_path`` to a fresh, empty directory.

    If a directory already exists at ``repo_path`` it is removed together
    with everything inside it; afterwards the directory (and any missing
    parent directories) is created again.
    """
    stale_tree_present = os.path.isdir(repo_path)
    if stale_tree_present:
        # Wipe leftovers from a previous run so the folder starts out empty.
        shutil.rmtree(repo_path)
    os.makedirs(repo_path)

# Each model gets its own folder under BASE_DIR plus a "1" version sub-folder.
hps_embedding_repo = os.path.join(BASE_DIR, "hps_embedding")
hps_embedding_version = os.path.join(hps_embedding_repo, "1")

tf_reshape_dense_model_repo = os.path.join(BASE_DIR, "tf_reshape_dense_model")
tf_reshape_dense_model_version = os.path.join(tf_reshape_dense_model_repo, "1")

ensemble_model_repo = os.path.join(BASE_DIR, "ensemble_model")
ensemble_model_version = os.path.join(ensemble_model_repo, "1")

# Reset every model folder, then its version folder (parent before child,
# because resetting the parent removes whatever was inside it).
for _model_repo, _model_version in (
    (hps_embedding_repo, hps_embedding_version),
    (tf_reshape_dense_model_repo, tf_reshape_dense_model_version),
    (ensemble_model_repo, ensemble_model_version),
):
    repo_check(_model_repo)
    repo_check(_model_version)

In [102]:
# check created repository 
!tree /hps_demo

[01;34m/hps_demo[00m
├── [01;34mensemble_model[00m
│   └── [01;34m1[00m
├── [01;34mhps_embedding[00m
│   └── [01;34m1[00m
└── [01;34mtf_reshape_dense_model[00m
    └── [01;34m1[00m

6 directories, 0 files


### Configure "hps_embedding" HPS backend
For more references of HPS backend building, please check [Hierarchical Parameter Server Demo](../../samples/Hierarchical_Parameter_Server_Deployment.ipynb).

In [56]:
%%writefile $hps_embedding_repo/config.pbtxt
# Triton model configuration for the "hps_embedding" model, served by the "hps" backend.
name: "hps_embedding"
backend: "hps"
max_batch_size:1024
# Inputs: INT64 lookup keys and an INT32 key count, both declared with a variable-length dimension.
# NOTE(review): the exact tensor layout the HPS backend expects is not visible here — confirm against the HPS backend docs.
input [
  {
    name: "KEYS"
    data_type: TYPE_INT64
    dims: [ -1 ]
  },
  {
    name: "NUMKEYS"
    data_type: TYPE_INT32
    dims: [ -1 ]
  }
]
# Output: FP32 tensor with a variable-length dimension (the ensemble config wires it into the dense model).
output [
  {
    name: "OUTPUT0"
    data_type: TYPE_FP32
    dims: [ -1 ]
  }
]
# Serve only version 1 of this model.
version_policy: {
        specific:{versions: 1}
},
# A single model instance, placed on GPU 0.
instance_group [
  {
    count: 1
    kind : KIND_GPU
    gpus:[0]
  }
]

Writing /hps_demo/hps_embedding/config.pbtxt


Generate the HPS configuration for deploying embedding tables

In [103]:
%%writefile $hps_embedding_repo/hps_embedding.json
{
    "supportlonglong": true,
    "models": [
        {
            "model": "hps_embedding",
            "sparse_files": [
                "/hps_demo/hps_embedding/1/naive_dnn_sparse.model"
            ],
            "num_of_worker_buffer_in_pool": 3,
            "embedding_table_names": [
                "sparse_embedding1"
            ],
            "embedding_vecsize_per_table": [16],
            "maxnum_catfeature_query_per_table_per_sample": [3],
            "default_value_for_each_table": [1.0],
            "deployed_device_list": [0],
            "max_batch_size": 65536,
            "cache_refresh_percentage_per_iteration": 0.2,
            "hit_rate_threshold": 1.0,
            "gpucacheper": 1.0,
            "gpucache": true
        }
    ]
}

Writing /hps_demo/hps_embedding/hps_embedding.json


In [105]:
# copy sparse(HPS) model to target folder
!cp -r ./naive_dnn_sparse.model $hps_embedding_version/

### Configure "tf_reshape_dense_model" Tensorflow backend 
**Note**: For the Triton TensorFlow backend, **platform** must be set to either `tensorflow_graphdef` or `tensorflow_savedmodel`. Optionally, **backend** can be set to `tensorflow` [[link](https://github.com/triton-inference-server/backend#backends)].

In [61]:
%%writefile $tf_reshape_dense_model_repo/config.pbtxt
# Triton model configuration for the "tf_reshape_dense_model" model, loaded as a TensorFlow SavedModel.
name: "tf_reshape_dense_model"
platform: "tensorflow_savedmodel"
# NOTE(review): max_batch_size is 0 while a dynamic_batching block is set at the bottom — confirm the dynamic batcher has any effect for this model.
max_batch_size:0
# Input: FP32 tensor with one variable-length dimension.
input [
  {
    name: "input_1"
    data_type: TYPE_FP32
    dims: [-1]
  }
]
# Output: FP32 tensor of shape [-1, 1].
output [
  {
    name: "output_1"
    data_type: TYPE_FP32
    dims: [-1,1]
  }
]
# Serve only version 1 of this model.
version_policy: {
        specific:{versions: 1}
},
# A single model instance, placed on GPU 0.
instance_group [
  {
    count: 1
    kind : KIND_GPU
    gpus:[0]
  }
]
dynamic_batching {
    max_queue_delay_microseconds: 100
}

Overwriting /hps_demo/tf_reshape_dense_model/config.pbtxt


In [106]:
# copy reshaped TF dense model to target folder
!cp -r ./naive_dnn_reshape_dense.model $tf_reshape_dense_model_version/model.savedmodel

### Configure "ensemble_model" Triton backend
**Note**: in each step block, the "**key**" of every `input_map`/`output_map` entry must match the tensor name declared in the corresponding HPS or TF model configuration above.

In [107]:
%%writefile $ensemble_model_repo/config.pbtxt
# Triton ensemble configuration: chains "hps_embedding" and "tf_reshape_dense_model" into one service.
name: "ensemble_model"
platform: "ensemble"
max_batch_size: 16
# Inputs the client sends to the ensemble; they are forwarded to the first step below.
input [
  {
    name: "EMB_KEY"
    data_type: TYPE_INT64
    dims: [ -1 ]
  },
  {
    name: "EMB_N_KEY"
    data_type: TYPE_INT32
    dims: [ -1 ]
  }
]
# Output returned to the client; produced by the second step below.
output [
  {
    name: "DENSE_OUTPUT"
    data_type: TYPE_FP32
    dims: [1]
  }
]
ensemble_scheduling {
  step [
    # Step 1: embedding lookup. In every map, "key" is the step model's own tensor
    # name and "value" is the ensemble-level tensor name it is connected to.
    {
      model_name: "hps_embedding"
      # NOTE(review): -1 presumably selects the latest available version — confirm in the Triton ensemble docs.
      model_version: -1
      input_map {
        key: "KEYS"
        value: "EMB_KEY"
      }
      input_map {
        key: "NUMKEYS"
        value: "EMB_N_KEY"
      }
      output_map {
        key: "OUTPUT0"
        value: "LOOKUP_VECTORS"
      }
    },
    # Step 2: dense model. Consumes the intermediate "LOOKUP_VECTORS" tensor from
    # step 1 and produces the ensemble output "DENSE_OUTPUT".
    {
      model_name: "tf_reshape_dense_model"
      model_version: -1
      input_map {
        key: "input_1"
        value: "LOOKUP_VECTORS"
      }
      output_map {
        key: "output_1"
        value: "DENSE_OUTPUT"
      }
    }
  ]
}

Writing /hps_demo/ensemble_model/config.pbtxt


### Check the generated directory and configurations

In [108]:
!tree $BASE_DIR

[01;34m/hps_demo[00m
├── [01;34mensemble_model[00m
│   ├── [01;34m1[00m
│   └── config.pbtxt
├── [01;34mhps_embedding[00m
│   ├── [01;34m1[00m
│   │   └── [01;34mnaive_dnn_sparse.model[00m
│   │       ├── emb_vector
│   │       └── key
│   └── hps_embedding.json
└── [01;34mtf_reshape_dense_model[00m
    └── [01;34m1[00m
        └── [01;34mmodel.savedmodel[00m
            ├── [01;34massets[00m
            ├── keras_metadata.pb
            ├── saved_model.pb
            └── [01;34mvariables[00m
                ├── variables.data-00000-of-00001
                └── variables.index

10 directories, 8 files


## Start Triton Inference Server, load 3 backends

We assume you have already checked your **tritonserver** version and confirmed that you can run the `tritonserver` command inside your Docker container.

For this tutorial, the command to start Triton will be
> **tritonserver --model-repository=/hps_demo/ --backend-config=hps,ps=/hps_demo/hps_embedding/hps_embedding.json --backend-config=tensorflow,version=2**

A few tips when using Tritonserver:
* The Tritonserver `--backend-directory` parameter accepts only one directory, so we suggest linking the compiled HPS backend (`libtriton_hps.so`) into Triton's default backend directory first. For instance, `ln -s /usr/local/hugectr/backends/hps/ /opt/tritonserver/backends/`

* The TensorFlow backend `libtriton_tensorflow2.so` needs to be compiled first and then copied to Triton's default backend directory

* Tritonserver supports multiple `--backend-config` settings; e.g., this tutorial configures two backends (HPS and TF)

* By default, Tritonserver uses `--http-port=8000 --grpc-port=8001 --metrics-port=8002`; change these if there is a port conflict

* There is no need to set `--model-control-mode=explicit` for Triton ensemble mode

If tritonserver started successfully, you should see a log similar to the following:
```
...
+------------------+------+
| Repository Agent | Path |
+------------------+------+
+------------------+------+

I0809 10:24:19.230206 53894 server.cc:583] 
+------------+-----------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------+
| Backend    | Path                                                            | Config                                                                                                               |
+------------+-----------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------+
| tensorflow | /opt/tritonserver/backends/tensorflow2/libtriton_tensorflow2.so | {"cmdline":{"auto-complete-config":"false","backend-directory":"/opt/tritonserver/backends","min-compute-capability" |
|            |                                                                 | :"6.000000","version":"2","default-max-batch-size":"4"}}                                                             |
| hps        | /opt/tritonserver/backends/hps/libtriton_hps.so                 | {"cmdline":{"auto-complete-config":"false","backend-directory":"/opt/tritonserver/backends","min-compute-capability" |
|            |                                                                 | :"6.000000","ps":"/hps_demo/hps_embedding.json","default-max-batch-size":"4"}}                                       |
+------------+-----------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------+

I0809 10:24:19.230310 53894 server.cc:626] 
+------------------------+---------+--------+
| Model                  | Version | Status |
+------------------------+---------+--------+
| ensemble_model         | 1       | READY  |
| hps_embedding          | 1       | READY  |
| tf_reshape_dense_model | 1       | READY  |
+------------------------+---------+--------+
...
```

## Validate deployed ensemble model with dummy dataset
### Step.1 Check Tritonserver health
**Note**: if you are using the default Tritonserver settings, the HTTP port is `8000`.

In [109]:
!curl -v localhost:8000/v2/health/ready

*   Trying 127.0.0.1:8000...
* TCP_NODELAY set
* Connected to localhost (127.0.0.1) port 8000 (#0)
> GET /v2/health/ready HTTP/1.1
> Host: localhost:8000
> User-Agent: curl/7.68.0
> Accept: */*
> 
* Mark bundle as not supporting multiuse
< HTTP/1.1 200 OK
< Content-Length: 0
< Content-Type: text/plain
< 
* Connection #0 to host localhost left intact


In [111]:
# Create a verbose HTTP client for the local Triton server and check liveness.
# Uses the `tritonclient.http` module (imported as `httpclient`) rather than the
# deprecated top-level `tritonhttpclient` shim.
try:
    triton_client = httpclient.InferenceServerClient(url="localhost:8000", verbose=True)
except Exception as e:
    print("channel creation failed: " + str(e))
    # Re-raise: without a client the liveness call below would fail with a
    # NameError, or silently reuse a stale client left over in the kernel.
    raise
else:
    print("client created.")

triton_client.is_server_live()

client created.
GET /v2/health/live, headers None
<HTTPSocketPoolResponse status=200 headers={'content-length': '0', 'content-type': 'text/plain'}>


True

### Step.2 Check loaded backends

In [112]:
# Ask the server for its model repository index; the returned list is this cell's output.
triton_client.get_model_repository_index()

POST /v2/repository/index, headers None

<HTTPSocketPoolResponse status=200 headers={'content-type': 'application/json', 'content-length': '176'}>
bytearray(b'[{"name":"ensemble_model","version":"1","state":"READY"},{"name":"hps_embedding","version":"1","state":"READY"},{"name":"tf_reshape_dense_model","version":"1","state":"READY"}]')


[{'name': 'ensemble_model', 'version': '1', 'state': 'READY'},
 {'name': 'hps_embedding', 'version': '1', 'state': 'READY'},
 {'name': 'tf_reshape_dense_model', 'version': '1', 'state': 'READY'}]

### Step.3 Prepare mock request

In [129]:
# generate mock requests based on model training settings
batch_size = 1
# One random key (drawn from 1..9) per slot for every sample in the batch.
key_tensor  = np.random.randint(1,10,(batch_size,args["slot_num"])).astype(np.int64)
# Number of keys per sample. Derived from slot_num instead of a hard-coded 3 so
# it always matches the width of key_tensor.
nkey_tensor = np.full((batch_size, 1), args["slot_num"]).astype(np.int32)
print("Input key tensor is \n{}, \nnumber of key tensor is \n{}".format(key_tensor, nkey_tensor))

# Input names match the ensemble model's declared inputs ("EMB_KEY" / "EMB_N_KEY").
inputs = [
    httpclient.InferInput("EMB_KEY", 
                          key_tensor.shape,
                          np_to_triton_dtype(np.int64)),
    httpclient.InferInput("EMB_N_KEY", 
                          nkey_tensor.shape,
                          np_to_triton_dtype(np.int32)),
]
inputs[0].set_data_from_numpy(key_tensor)
inputs[1].set_data_from_numpy(nkey_tensor)

# Request only the ensemble's final output tensor.
outputs = [
    httpclient.InferRequestedOutput("DENSE_OUTPUT")
]

Input key tensor is 
[[4 1 2]], 
number of key tensor is 
[[3]]


### Step.4 Send request to Triton server

In [130]:
model_name = "ensemble_model"

# Open an HTTP client for this single request; it is closed when the block exits.
with httpclient.InferenceServerClient("localhost:8000") as client:
    response = client.infer(model_name=model_name, inputs=inputs, outputs=outputs)
    result = response.get_response()
    prediction = response.as_numpy("DENSE_OUTPUT")

    print("Prediction result is {}".format(prediction))
    print("Response details:\n{}".format(result))

Prediction result is [[1918.4756]]
Response details:
{'model_name': 'ensemble_model', 'model_version': '1', 'parameters': {'sequence_id': 0, 'sequence_start': False, 'sequence_end': False}, 'outputs': [{'name': 'DENSE_OUTPUT', 'datatype': 'FP32', 'shape': [1, 1], 'parameters': {'binary_data_size': 4}}]}
