In [1]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# 1. Overview

This notebook is a tutorial on running inference with a WDL (Wide & Deep) model trained with HugeCTR, and on collecting inference benchmarks with the Triton performance analyzer tool.

1. Overview
2. Generate the WDL Deployment Configuration
3. Deploy WDL on Triton Server
4. Prepare Inference Request
5. Inference Benchmark by Triton Performance Tool

# 2. Generate the WDL Deployment Configuration

## 2.1 Generate related model folders

In [1]:
# define some data folder to store the model related files
# Standard Libraries
import os
from time import time
import re
import shutil
import glob
import warnings

# Layout of the Triton model repository: <BASE_DIR>/model/wdl/1
BASE_DIR = "/wdl_infer"
model_folder = os.path.join(BASE_DIR, "model")
wdl_model_repo = os.path.join(model_folder, "wdl")
wdl_version = os.path.join(wdl_model_repo, "1")

# Start from a clean repository so that re-running this cell is idempotent.
# Removing model_folder also removes wdl_model_repo and wdl_version, which
# live underneath it, so no separate cleanup is needed for them.
if os.path.isdir(model_folder):
    shutil.rmtree(model_folder)

# os.makedirs creates every missing parent (model_folder, wdl_model_repo)
# on the way to the version directory.
os.makedirs(wdl_version)

## 2.2 Copy WDL model files and configuration to model repository

In [2]:
# Copy the trained WDL artifacts from /wdl_train into the model version directory:
# the two sparse model directories, the dense model file and the network JSON.
!cp -r /wdl_train/wdl0_sparse_20000.model $wdl_version/
!cp -r /wdl_train/wdl1_sparse_20000.model $wdl_version/
!cp  /wdl_train/wdl_dense_20000.model $wdl_version/
!cp /wdl_train/wdl.json $wdl_version/
# List the version directory to confirm that everything was copied.
!ls -l $wdl_version

total 4098
-rw-r--r-- 1 root root    3731 Nov 30 06:01 wdl.json
drwxr-xr-x 2 root root    4096 Nov 30 06:01 wdl0_sparse_20000.model
drwxr-xr-x 2 root root    4096 Nov 30 06:01 wdl1_sparse_20000.model
-rw-r--r-- 1 root root 5963780 Nov 30 06:01 wdl_dense_20000.model


## 2.3 Generate the Triton configuration for deploying WDL

In [103]:
%%writefile $wdl_model_repo/config.pbtxt
# Triton model configuration for the "wdl" model, served by the "hugectr" backend.
name: "wdl"
backend: "hugectr"
# Same batch limit as "max_batch_size" in the ps.json model entry.
# NOTE(review): the trailing comma after 64 is inconsistent with the other scalar
# fields; the protobuf text format appears to accept it as an optional separator — confirm.
max_batch_size:64,
# Three variable-length inputs, filled by the client script wdl2predict.py:
#   DES       - float32 values taken from the continuous columns
#   CATCOLUMN - int64 keys taken from the categorical columns
#   ROWINDEX  - int32 row pointers
input [
  {
    name: "DES"
    data_type: TYPE_FP32
    dims: [ -1 ]
  },
  {
    name: "CATCOLUMN"
    data_type: TYPE_INT64
    dims: [ -1 ]
  },
  {
    name: "ROWINDEX"
    data_type: TYPE_INT32
    dims: [ -1 ]
  }
]
# Single variable-length float32 output holding the predictions.
output [
  {
    name: "OUTPUT0"
    data_type: TYPE_FP32
    dims: [ -1 ]
  }
]
# One model instance placed on GPU 0.
instance_group [
  {
    count: 1
    kind : KIND_GPU
    gpus:[0]
  }
]

# Backend parameters. "config" points at the network JSON copied into version
# directory 1; gpucache, hit_rate_threshold and gpucacheper carry the same values
# as the "wdl" model entry in ps.json.
parameters [
  {
    key: "config"
    value: { string_value: "/wdl_infer/model/wdl/1/wdl.json" }
  },
  {
    key: "gpucache"
    value: { string_value: "true" }
  },
  {
    key: "hit_rate_threshold"
    value: { string_value: "0.9" }
  },
  {
    key: "gpucacheper"
    value: { string_value: "0.5" }
  },
  {
    key: "label_dim"
    value: { string_value: "1" }
  }
]

Overwriting /wdl_infer/model/wdl/config.pbtxt


## 2.4 Configure a RocksDB directory for localized storage
Make sure the RocksDB directory has read and write permissions, because it stores the model's embedding tables. Since the RocksDB folder is created outside the container, mount that folder to /wdl_infer/rocksdb and set the same RocksDB path in ps.json in the next step.

## 2.5 Generate the HugeCTR Backend parameter server configuration for deploying WDL

In [5]:
%%writefile /wdl_infer/model/ps.json
{
    "supportlonglong": true,
    "volatile_db": {
        "type": "disabled",
        "address": "127.0.0.1:7000,127.0.0.1:7001,127.0.0.1:7002",
        "user_name": "default",
        "password": "",
        "num_partitions": 8,
        "refresh_time_after_fetch": false,
        "max_get_batch_size": 100000,
        "max_set_batch_size": 100000,
        "overflow_policy": "evict_oldest",
        "overflow_margin": 10000000,
        "overflow_resolution_target": 0.8,
        "initial_cache_rate": 0.1,
        "cache_missed_embeddings": false,
        "update_filters": [ "^hps_.+$" ]
    },
    "persistent_db": {
        "type": "rocksdb",
        "path": "/wdl_infer/rocksdb",
        "num_threads": 16,
        "read_only": false,
        "max_get_batch_size": 10000,
        "max_set_batch_size": 10000,
        "update_filters": [ "^hps_.+$" ]
    },
    "update_source": {
        "type": "kafka",
        "brokers": "127.0.0.1:9092",
        "receive_buffer_size": 262144,
        "poll_timeout_ms": 500,
        "max_batch_size": 8192,
        "failure_backoff_ms": 50,
        "max_commit_interval": 32
    },
    "models":[
        {
            "model":"wdl",
            "sparse_files":["/wdl_infer/model/wdl/1/wdl0_sparse_20000.model", "/wdl_infer/model/wdl/1/wdl1_sparse_20000.model"],
            "dense_file":"/wdl_infer/model/wdl/1/wdl_dense_20000.model",
            "network_file":"/wdl_infer/model/wdl/1/wdl.json",
            "num_of_worker_buffer_in_pool": 4,
            "num_of_refresher_buffer_in_pool": 1,
            "deployed_device_list":[0],
            "max_batch_size":64,
            "default_value_for_each_table":[0.0,0.0],
            "hit_rate_threshold":0.9,
            "gpucacheper":0.5,
            "gpucache":true,
            "cache_refresh_percentage_per_iteration": 0.2,
            "maxnum_des_feature_per_sample": 13,
            "maxnum_catfeature_query_per_table_per_sample" : [2,26],
            "embedding_vecsize_per_table" : [1,15],
            "slot_num":28
        }
    ]  
}

Overwriting /wdl_infer/model/ps.json


In [6]:
!ls -l $wdl_model_repo
!ls -l $wdl_version

total 1
drwxr-xr-x 4 root root 4096 Nov 30 06:01 1
-rw-r--r-- 1 root root 1174 Nov 30 06:03 config.pbtxt
total 5858
-rw-r--r-- 1 root root    3731 Nov 30 06:01 wdl.json
drwxr-xr-x 2 root root    4096 Nov 30 06:01 wdl0_sparse_20000.model
drwxr-xr-x 2 root root    4096 Nov 30 06:01 wdl1_sparse_20000.model
-rw-r--r-- 1 root root 5963780 Nov 30 06:01 wdl_dense_20000.model


# 3. Deploy WDL on Triton Server

At this stage, you should have already launched the Triton Inference Server with the command shown below.

In this tutorial, we will deploy the Wide&Deep model to a single A100(32GB).

Note: `Since background processes are not supported by Jupyter, please launch the Triton Server independently in the background, using the following command.`

In [None]:
# Start Triton in explicit model-control mode so that only the "wdl" model is loaded,
# using the HugeCTR backend and the parameter-server configuration in ps.json.
# Run this from a separate terminal: the server keeps running in the foreground.
!tritonserver --model-repository=/wdl_infer/model/ --load-model=wdl \
    --model-control-mode=explicit \
    --backend-directory=/usr/local/hugectr/backends \
    --backend-config=hugectr,ps=/wdl_infer/model/ps.json

### 3.1 Check that the Triton server is ready and the Wide&Deep model was deployed successfully

In [3]:
!curl -v localhost:8000/v2/health/ready

*   Trying 127.0.0.1:8000...
* TCP_NODELAY set
* Connected to localhost (127.0.0.1) port 8000 (#0)
> GET /v2/health/ready HTTP/1.1

> Host: localhost:8000

> User-Agent: curl/7.68.0

> Accept: */*

> 

* Mark bundle as not supporting multiuse
< HTTP/1.1 200 OK

< Content-Length: 0

< Content-Type: text/plain

< 

* Connection #0 to host localhost left intact


# 4. Prepare Inference Request

### 4.1 Read validation data

In [13]:
!ls -l /wdl_train/val

total 645762
-rw-r--r-- 1 root root        32 Nov 29 05:27 _file_list.txt
-rw-r--r-- 1 root root   8554464 Nov 29 05:27 _hugectr.keyset
-rw-r--r-- 1 root root     22726 Nov 29 05:27 _metadata
-rw-r--r-- 1 root root      1509 Nov 29 05:27 _metadata.json
-rw-r--r-- 1 root root 142825257 Nov 29 05:27 part_0.parquet
-rw-r--r-- 1 root root     21459 Nov 29 05:27 schema.pbtxt
drwxr-xr-x 2 root root      4096 Nov 29 05:26 temp-parquet-after-conversion
-rw-r--r-- 1 root root 509766965 Nov 29 03:50 test.txt


In [1]:
import pandas as pd
# Load the validation split (Parquet) that the inference samples are taken from.
df = pd.read_parquet("/wdl_train/val/part_0.parquet")

In [2]:
df.head()

Unnamed: 0,I1,I2,I3,I4,I5,I6,I7,I8,I9,I10,...,C18,C19,C20,C21,C22,C23,C24,C25,C26,label
0,-0.054112,0.796048,1.873822,-0.122997,-0.023286,1.580263,0.743047,-0.258333,1.119755,1.386036,...,0,1,4,2,2,2,0,51,2,5
1,-0.059432,-0.537957,-0.594327,-0.152827,-0.224758,0.893091,-0.064249,2.592743,-0.470833,1.386036,...,3,8,1,0,875,0,537,196,7,7
2,-0.055886,-0.160306,0.478781,-0.157301,-0.224758,-0.206385,-0.064249,-0.276593,0.179862,-0.470383,...,0,1,4,2,2,2,0,245,2,5
3,-0.054112,-0.548824,-0.057773,-0.148352,-0.224758,1.442828,-0.064249,-0.067914,1.915049,3.242455,...,1,2,2,1,1,1,1,577,1,2
4,-0.045245,-0.483618,-0.487016,-0.157301,-0.224758,-0.206385,-0.064249,-0.279201,-0.687732,-0.470383,...,2,1,8,0,5387,0,4678,798,1,3


In [3]:
# Save the first 10 validation rows (with header, without the index) as the CSV
# that the client script wdl2predict.py reads.
df.head(10).to_csv('/wdl_infer/infer_test.csv', sep=',', index=False,header=True)

## 4.2 Follow the Triton requirements to generate inference requests

In [6]:
%%writefile '/wdl_infer/wdl2predict.py'
from tritonclient.utils import *
import tritonclient.http  as httpclient
import numpy as np
import pandas as pd
import sys

model_name = 'wdl'
CATEGORICAL_COLUMNS=["C1_C2","C3_C4"]+["C" + str(x) for x in range(1, 27)]
CONTINUOUS_COLUMNS=["I" + str(x) for x in range(1, 14)]
LABEL_COLUMNS = ['label']
emb_size_array = [278018, 415262,249058, 19561, 14212, 6890, 18592, 4, 6356, 1254, 52, 226170, 80508, 72308, 11, 2169, 7597, 61, 4, 923, 15, 249619, 168974, 243480, 68212, 9169, 75, 34]
shift = np.insert(np.cumsum(emb_size_array), 0, 0)[:-1]
test_df=pd.read_csv("/wdl_infer/infer_test.csv",sep=',')



with httpclient.InferenceServerClient("localhost:8000") as client:
    dense_features = np.array([list(test_df[CONTINUOUS_COLUMNS].values.flatten())],dtype='float32')
    embedding_columns = np.array([list((test_df[CATEGORICAL_COLUMNS]+shift).values.flatten())],dtype='int64')
    row_ptrs = np.array([list(range(0,21))+list(range(0,261))],dtype='int32')
    
    inputs = [
        httpclient.InferInput("DES", dense_features.shape,
                              np_to_triton_dtype(dense_features.dtype)),
        httpclient.InferInput("CATCOLUMN", embedding_columns.shape,
                              np_to_triton_dtype(embedding_columns.dtype)),
        httpclient.InferInput("ROWINDEX", row_ptrs.shape,
                              np_to_triton_dtype(row_ptrs.dtype)),

    ]

    inputs[0].set_data_from_numpy(dense_features)
    inputs[1].set_data_from_numpy(embedding_columns)
    inputs[2].set_data_from_numpy(row_ptrs)
    outputs = [
        httpclient.InferRequestedOutput("OUTPUT0")
    ]

    response = client.infer(model_name,
                            inputs,
                            request_id=str(1),
                            outputs=outputs)

    result = response.get_response()
    print(result)
    print("Prediction Result:")
    print(response.as_numpy("OUTPUT0"))



Overwriting /wdl_infer/wdl2predict.py


## 4.3 Send requests to Triton Server

In [2]:
!python3 /wdl_infer/wdl2predict.py

{'id': '1', 'model_name': 'wdl', 'model_version': '1', 'parameters': {'NumSample': 10, 'DeviceID': 0}, 'outputs': [{'name': 'OUTPUT0', 'datatype': 'FP32', 'shape': [10], 'parameters': {'binary_data_size': 40}}]}
Prediction Result:
[0.03392845 0.02259001 0.00255735 0.00028795 0.00226125 0.02724345
 0.00389859 0.00180469 0.03567842 0.00802261]
