In [1]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# 1. Overview

This notebook is a tutorial on running inference with a WDL (Wide & Deep) model trained with HugeCTR and served by the Triton Inference Server. It also shows how to collect inference benchmarks with the Triton Performance Analyzer tool.

1. Overview
2. Generate the WDL deployment Configuration
3. Load Models on the Triton Server
4. Prepare Inference Input Data 
5. Inference Benchmark by Triton Performance Tool

# 2. Generate the WDL Deployment Configuration

## 2.1 Generate related model folders

In [4]:
# Define the folders that store the model-related files
# Standard Libraries
import os
from time import time
import re
import shutil
import glob
import warnings

# Layout of the Triton model repository: <BASE_DIR>/model/wdl/1
BASE_DIR = "/wdl_infer"
model_folder = os.path.join(BASE_DIR, "model")
wdl_model_repo = os.path.join(model_folder, "wdl")
wdl_version = os.path.join(wdl_model_repo, "1")

# Rebuild the tree from scratch, parent first. Any existing folder is deleted
# (together with everything inside it) before being recreated, so re-running
# this cell always starts from an empty model repository.
for folder in (model_folder, wdl_model_repo, wdl_version):
    if os.path.isdir(folder):
        shutil.rmtree(folder)
    os.makedirs(folder)

## 2.2 Copy WDL model files and configuration to model repository

In [115]:
# Copy the trained WDL artifacts from /wdl_train into the version folder of the
# model repository ($wdl_version is the Python variable defined earlier).
# The two sparse embedding models are directories, hence `cp -r`; the dense
# model and the network JSON are plain files.
!cp -r /wdl_train/wdl0_sparse_20000.model $wdl_version/
!cp -r /wdl_train/wdl1_sparse_20000.model $wdl_version/
!cp  /wdl_train/wdl_dense_20000.model $wdl_version/
!cp /wdl_train/wdl.json $wdl_version/
# List the version folder to verify that all four artifacts were copied.
!ls -l $wdl_version

total 5840
-rw-r--r-- 1 root root    3158 Jul  6 07:17 wdl.json
drwxr-xr-x 2 root root    4096 Jul  6 07:17 wdl0_sparse_20000.model
drwxr-xr-x 2 root root    4096 Jul  6 07:17 wdl1_sparse_20000.model
-rw-r--r-- 1 root root 5963780 Jul  6 07:17 wdl_dense_20000.model


## 2.3 Generate the Triton configuration for deploying WDL

In [116]:
%%writefile $wdl_model_repo/config.pbtxt
# Triton model configuration for the "wdl" model, served by the "hugectr" backend.
name: "wdl"
backend: "hugectr"
# NOTE(review): ps.json in this notebook sets "max_batch_size" to "1024" for the
# same model, while this file uses 64 -- confirm which value is intended.
max_batch_size:64,
# Inputs sent by the client script: dense features (DES), categorical keys
# (CATCOLUMN) and row pointers (ROWINDEX). All are variable-length (-1).
input [
   {
    name: "DES"
    data_type: TYPE_FP32
    dims: [ -1 ]
  },
  {
    name: "CATCOLUMN"
    data_type: TYPE_INT64
    dims: [ -1 ]
  },
  {
    name: "ROWINDEX"
    data_type: TYPE_INT32
    dims: [ -1 ]
  }
]
# Single variable-length FP32 output holding the predictions.
output [
  {
    name: "OUTPUT0"
    data_type: TYPE_FP32
    dims: [ -1 ]
  }
]
# One model instance, placed on GPU 0.
instance_group [
  {
    count: 1
    kind : KIND_GPU
    gpus:[0]
  }
]

# Backend parameters, all passed as strings.
parameters [
  {
  # Path to the network JSON copied into the version folder.
  key: "config"
  value: { string_value: "/wdl_infer/model/wdl/1/wdl.json" }
  },
  {
  key: "gpucache"
  value: { string_value: "true" }
  },
  {
  # NOTE(review): ps.json in this notebook uses "0.9" for the same setting -- confirm.
  key: "hit_rate_threshold"
  value: { string_value: "0.8" }
  },
  {
  key: "gpucacheper"
  value: { string_value: "0.5" }
  },
  {
  key: "label_dim"
  value: { string_value: "1" }
  },
  {
  key: "slots"
  value: { string_value: "28" }
  },
  {
  # 28 = the 26 columns C1..C26 plus the 2 crossed columns (C1_C2, C3_C4)
  # that the client script sends.
  key: "cat_feature_num"
  value: { string_value: "28" }
  },
 {
  # 13 = the continuous columns I1..I13 that the client script sends.
  key: "des_feature_num"
  value: { string_value: "13" }
  },
  {
  key: "max_nnz"
  value: { string_value: "2" }
  },
  {
  # NOTE(review): verify this against the embedding sizes declared in wdl.json.
  key: "embedding_vector_size"
  value: { string_value: "128" }
  },
  {
  # Keys are 64-bit; CATCOLUMN above is declared as TYPE_INT64.
  key: "embeddingkey_long_type"
  value: { string_value: "true" }
  }
]

Overwriting /wdl_infer/model/wdl/config.pbtxt


## 2.4 Configure a RocksDB directory for localized storage
Make sure the RocksDB directory has read and write permissions, because it is used to store the model's embedding tables. Since the RocksDB folder was created outside the container, make sure to mount the correct folder to /wdl_infer/rocksdb and to set the matching RocksDB path in ps.json in the next step.

## 2.5 Generate the HugeCTR Backend parameter server configuration for deploying WDL

In [45]:
%%writefile /wdl_infer/model/ps.json
{
    "supportlonglong": true,
    "db_type": "hierarchy",
    "redis_ip": "127.0.0.1:7000,127.0.0.1:7001,127.0.0.1:7002",
    "rocksdb_path": "/wdl_infer/rocksdb",
    "cache_size_percentage_redis": "0.1",
    "models": [
        {
            "model": "wdl",
            "sparse_files": [
                "/wdl_infer/model/wdl/1/wdl0_sparse_20000.model",
                "/wdl_infer/model/wdl/1/wdl1_sparse_20000.model"
            ],
            "dense_file": "/wdl_infer/model/wdl/1/wdl_dense_20000.model",
            "network_file": "/wdl_infer/model/wdl/1/wdl.json",
            "num_of_worker_buffer_in_pool": "4",
            "deployed_device_list": ["0"],
            "max_batch_size": "1024",
            "default_value_for_each_table": ["0.0", "0.0"],
            "hit_rate_threshold": "0.9",
            "gpucacheper": "0.5",
            "gpucache": "true"
        }
    ]
}

Overwriting /wdl_infer/model/ps.json


In [5]:
# List the model repository and its version folder to confirm that
# config.pbtxt and the copied model files are in place.
!ls  -l $wdl_model_repo
!ls -l $wdl_version

total 8
drwxr-xr-x 4 root root 4096 Jul  6 07:16 1
-rw-r--r-- 1 root root 1174 Jul  6 07:17 config.pbtxt
total 5840
-rwxrwxrwx 1 root root    3590 Jul 29 07:49 wdl.json
drwxr-xr-x 2 root root    4096 Jul  6 07:17 wdl0_sparse_20000.model
drwxr-xr-x 2 root root    4096 Jul  6 07:17 wdl1_sparse_20000.model
-rw-r--r-- 1 root root 5963780 Jul  6 07:17 wdl_dense_20000.model


# 3. Deploy WDL on Triton Server

At this stage, you should have already launched the Triton Inference Server with the following command:

In this tutorial, we will deploy the Wide&Deep model to a single A100 (40GB) GPU.

`Because Jupyter does not support background processes, please launch the Triton server separately (for example, in a terminal) using the following command.`

In [11]:
# Start Triton on the /wdl_infer/model/ repository in explicit model-control
# mode, loading only the "wdl" model. The HugeCTR backend is looked up in
# /usr/local/hugectr/backends and receives ps.json as its "ps" backend setting.
# The server runs in the foreground, so start it outside the notebook as the
# note above advises.
!tritonserver --model-repository=/wdl_infer/model/ --load-model=wdl \
    --model-control-mode=explicit \
    --backend-directory=/usr/local/hugectr/backends \
    --backend-config=hugectr,ps=/wdl_infer/model/ps.json

tritonserver: /opt/conda/lib/libcurl.so.4: no version information available (required by /opt/tritonserver/bin/../lib/libtritonserver.so)
I0914 10:45:56.023240 2543 metrics.cc:228] Collecting metrics for GPU 0: NVIDIA A100-PCIE-40GB
I0914 10:45:56.037039 2543 metrics.cc:228] Collecting metrics for GPU 1: NVIDIA A10
I0914 10:45:56.049695 2543 metrics.cc:228] Collecting metrics for GPU 2: NVIDIA A30
I0914 10:45:56.061967 2543 metrics.cc:228] Collecting metrics for GPU 3: Tesla T4
I0914 10:45:56.664522 2543 pinned_memory_manager.cc:206] Pinned memory pool is created at '0x7f904e000000' with size 268435456
I0914 10:45:56.685953 2543 cuda_memory_manager.cc:103] CUDA memory pool is created on device 0 with size 67108864
I0914 10:45:56.685989 2543 cuda_memory_manager.cc:103] CUDA memory pool is created on device 1 with size 67108864
I0914 10:45:56.686009 2543 cuda_memory_manager.cc:103] CUDA memory pool is created on device 2 with size 67108864
I0914 10:45:56.686027 2543 cuda_memory_manager.c

I0914 10:46:05.925549 2543 hugectr.cc:817] ******Creating Embedding Cache for model wdl successfully
I0914 10:46:05.925885 2543 hugectr.cc:1242] TRITONBACKEND_ModelInstanceInitialize: wdl_0 (device 2)
I0914 10:46:05.925903 2543 hugectr.cc:932] Triton Model Instance Initialization on device 2
I0914 10:46:05.925916 2543 hugectr.cc:943] Dense Feature buffer allocation: 
I0914 10:46:06.528513 2543 hugectr.cc:949] Categorical Feature buffer allocation: 
I0914 10:46:06.645175 2543 hugectr.cc:964] Categorical Row Index buffer allocation: 
I0914 10:46:06.645260 2543 hugectr.cc:970] Predict result buffer allocation: 
I0914 10:46:06.645307 2543 hugectr.cc:1265] ******Loading HugeCTR Model***** 
I0914 10:46:06.645327 2543 hugectr.cc:988] The model origin json configuration file path is: /wdl_infer/model/wdl/1/wdl.json
[14d10h46m06s][HUGECTR][INFO]: Global seed is 1088844895
[14d10h46m06s][HUGECTR][INFO]: Device to NUMA mapping:
  GPU 2 ->  node 1

[14d10h46m22s][HUGECTR][INFO]: Peer-to-peer acces

## 3.1 Check the Triton server status to confirm that the Wide&Deep model was deployed successfully

In [50]:
# Query Triton's HTTP readiness endpoint on port 8000; an "HTTP/1.1 200 OK"
# response indicates that the server is ready to accept requests.
!curl -v localhost:8000/v2/health/ready

curl: /opt/conda/lib/libcurl.so.4: no version information available (required by curl)
*   Trying 127.0.0.1:8000...
* Connected to localhost (127.0.0.1) port 8000 (#0)
> GET /v2/health/ready HTTP/1.1

> Host: localhost:8000

> User-Agent: curl/7.68.0

> Accept: */*

> 

* Mark bundle as not supporting multiuse
< HTTP/1.1 200 OK

< Content-Length: 0

< Content-Type: text/plain

< 

* Connection #0 to host localhost left intact


# 4. Prepare Inference Request

## 4.1 Read validation data

In [35]:
# List the validation data directory to find the parquet file to load.
!ls -l /wdl_train/val

total 637376
-rw-r--r-- 1 root root 142856977 Jul  5 05:44 0.110d099942694a5cbf1b71eb73e10f27.parquet
-rw-r--r-- 1 root root        51 Jul  6 07:02 _file_list.txt
-rw-r--r-- 1 root root     27701 Jul  5 05:44 _metadata
-rw-r--r-- 1 root root      1537 Jul  5 05:44 _metadata.json
drwxr-xr-x 2 root root      4096 Jul  5 05:42 temp-parquet-after-conversion
-rw-r--r-- 1 1025 1025 509766965 Jul  5 04:45 test.txt


In [36]:
import pandas as pd

# Load the validation parquet file into a DataFrame.
val_parquet_path = "/wdl_train/val/0.110d099942694a5cbf1b71eb73e10f27.parquet"
df = pd.read_parquet(val_parquet_path)

In [37]:
# Preview the first five rows of the validation data.
df.head()

Unnamed: 0,I1,I2,I3,I4,I5,I6,I7,I8,I9,I10,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,0.061161,0.974006,-0.594327,-0.157301,-0.224758,0.618222,-0.064249,-0.28181,-0.760031,1.386036,...,2,666,1,33722,24373,91481,62242,7673,44,28
1,-0.061206,-0.437431,0.156849,-0.146861,-0.193763,0.893091,-0.064249,0.286841,-0.109336,3.242455,...,1,666,10,0,97438,0,21446,4472,56,19
2,0.043427,-0.4646,-0.379705,-0.120014,0.054203,-0.206385,-0.064249,-0.093999,-0.543133,-0.470383,...,1,575,10,0,46601,0,12090,540,10,17
3,-0.059432,-0.273058,-0.487016,-0.143878,-0.193763,-0.206385,-0.064249,-0.279201,-0.109336,-0.470383,...,0,351,10,125237,4329,238309,0,8488,56,22
4,-0.048792,-0.418412,0.693403,0.300589,-0.193763,-0.206385,-0.064249,-0.28181,0.902856,-0.470383,...,0,575,7,69747,76381,207280,0,444,73,22


In [38]:
# Save the first 10 validation rows as a tab-separated file (header row kept,
# index dropped); the client script reads this file to build its request.
sample_rows = df.head(10)
sample_rows.to_csv(
    '/wdl_infer/infer_test.txt',
    sep='\t',
    index=False,
    header=True,
)

## 4.2 Follow the Triton requirements to generate inference requests

In [48]:
%%writefile '/wdl_infer/wdl2predict.py'
from tritonclient.utils import *
import tritonclient.http  as httpclient
import numpy as np
import pandas as pd
import sys

model_name = 'wdl'
CATEGORICAL_COLUMNS=["C" + str(x) for x in range(1, 27)]+["C1_C2","C3_C4"]
CONTINUOUS_COLUMNS=["I" + str(x) for x in range(1, 14)]
LABEL_COLUMNS = ['label']
emb_size_array = [249058, 19561, 14212, 6890, 18592, 4, 6356, 1254, 52, 226170, 80508, 72308, 11, 2169, 7597, 61, 4, 923, 15, 249619, 168974, 243480, 68212, 9169, 75, 34, 278018, 415262]
shift = np.insert(np.cumsum(emb_size_array), 0, 0)[:-1]
test_df=pd.read_csv("/wdl_infer/infer_test.txt",sep='\t')



with httpclient.InferenceServerClient("localhost:8000") as client:
    dense_features = np.array([list(test_df[CONTINUOUS_COLUMNS].values.flatten())],dtype='float32')
    embedding_columns = np.array([list((test_df[CATEGORICAL_COLUMNS]+shift).values.flatten())],dtype='int64')
    row_ptrs = np.array([list(range(0,21))+list(range(0,261))],dtype='int32')
    
    inputs = [
        httpclient.InferInput("DES", dense_features.shape,
                              np_to_triton_dtype(dense_features.dtype)),
        httpclient.InferInput("CATCOLUMN", embedding_columns.shape,
                              np_to_triton_dtype(embedding_columns.dtype)),
        httpclient.InferInput("ROWINDEX", row_ptrs.shape,
                              np_to_triton_dtype(row_ptrs.dtype)),

    ]

    inputs[0].set_data_from_numpy(dense_features)
    inputs[1].set_data_from_numpy(embedding_columns)
    inputs[2].set_data_from_numpy(row_ptrs)
    outputs = [
        httpclient.InferRequestedOutput("OUTPUT0")
    ]

    response = client.infer(model_name,
                            inputs,
                            request_id=str(1),
                            outputs=outputs)

    result = response.get_response()
    print(result)
    print("Prediction Result:")
    print(response.as_numpy("OUTPUT0"))



Overwriting /wdl_infer/wdl2predict.py


## 4.3 Send requests to Triton Server

In [49]:
# Run the client script /wdl_infer/wdl2predict.py to send the request to the
# running Triton server and print the predictions.
!python3 /wdl_infer/wdl2predict.py

{'id': '1', 'model_name': 'wdl', 'model_version': '1', 'parameters': {'NumSample': 10, 'DeviceID': 2}, 'outputs': [{'name': 'OUTPUT0', 'datatype': 'FP32', 'shape': [10], 'parameters': {'binary_data_size': 40}}]}
Prediction Result:
[0.01995986 0.02527472 0.01790315 0.00693272 0.0233907  0.0227473
 0.05989734 0.01598154 0.00582242 0.01423134]
