In [1]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# 1. Overview

This notebook is a tutorial on how to use the Hierarchical Parameter Server (HPS) backend to look up embedding keys for an inference service. As the embedding files, we again use the two embedding tables of the WDL (Wide & Deep) model trained with HugeCTR.

1. Overview
2. Generate the HPS deployment Configuration
3. Load Embedding tables on the Triton Server
4. Prepare Embedding keys as Input Data for looking up
5. Looking up embedding keys from HPS Backend Instance

# 2. Generate the HPS Deployment Configuration

## 2.1 Generate HPS backend and embedding folders

In [1]:
# define some data folder to store the model related files
# Standard Libraries
import os
from time import time
import re
import shutil
import glob
import warnings

# Directory layout created by this cell:
#   /hps_infer/embedding/            -> directory passed to Triton as the model repository
#   /hps_infer/embedding/hps_wdl/    -> directory of the "hps_wdl" model (holds config.pbtxt)
#   /hps_infer/embedding/hps_wdl/1/  -> version directory that receives the model files
BASE_DIR = "/hps_infer"
embedding_folder = os.path.join(BASE_DIR, "embedding")
wdl_embedding_repo = os.path.join(embedding_folder, "hps_wdl")
wdl_version = os.path.join(wdl_embedding_repo, "1")

# Start from a clean tree so that re-running the notebook leaves no stale files behind.
# Removing the top-level folder also removes the nested model/version directories,
# so no separate existence checks are needed for them.
if os.path.isdir(embedding_folder):
    shutil.rmtree(embedding_folder)

# os.makedirs creates all missing intermediate directories in one call.
os.makedirs(wdl_version)

In [4]:
!cp -r /workspace/data/wdl_models/wdl0_sparse_20000.model $wdl_version/
!cp -r /workspace/data/wdl_models/wdl1_sparse_20000.model $wdl_version/
!cp /workspace/data/wdl_models/wdl_dense_20000.model $wdl_version/
!cp /workspace/data/wdl_models/wdl.json $wdl_version/


total 5840
-rwxr-xr-x 1 root root    3590 Jun 29 07:54 wdl.json
drwxr-xr-x 2 root root    4096 Jun 29 07:54 wdl0_sparse_20000.model
drwxr-xr-x 2 root root    4096 Jun 29 07:54 wdl1_sparse_20000.model
-rw-r--r-- 1 root root 5963780 Jun 29 07:54 wdl_dense_20000.model


In [9]:
!tree $wdl_version

[01;34m/hps_infer/embedding/hps_wdl/1[00m
├── [01;32mwdl.json[00m
├── [01;34mwdl0_sparse_20000.model[00m
│   ├── emb_vector
│   ├── key
│   ├── wdl0_sparse_20000.model.key
│   └── wdl0_sparse_20000.model.vec
├── [01;34mwdl1_sparse_20000.model[00m
│   ├── emb_vector
│   ├── key
│   ├── wdl1_sparse_20000.model.key
│   └── wdl1_sparse_20000.model.vec
└── wdl_dense_20000.model

2 directories, 10 files


## 2.2 Copy embedding tables of wdl model to embedding repository

In [10]:
# NOTE(review): these commands copy the *_sparse_2000.model directories from /wdl_train,
# whereas hps.json in this notebook points at the *_sparse_20000.model directories that
# an earlier cell copies from /workspace/data/wdl_models. This cell looks like a leftover
# from an older run -- confirm which snapshot is intended before relying on it.
!cp -r /wdl_train/wdl0_sparse_2000.model $wdl_version/
!cp -r /wdl_train/wdl1_sparse_2000.model $wdl_version/
# List the version directory to check what was copied.
!ls -l $wdl_version

total 8
drwxr-xr-x 2 root root 4096 Apr  1 09:09 wdl0_sparse_2000.model
drwxr-xr-x 2 root root 4096 Apr  1 09:09 wdl1_sparse_2000.model


## 2.3 Generate the Triton model configuration (config.pbtxt) for the HPS backend

In [10]:
%%writefile $wdl_embedding_repo/config.pbtxt
# Triton model configuration for the "hps_wdl" embedding-lookup model served by the HPS backend.
name: "hps_wdl"
backend: "hps"
max_batch_size: 1024
input [
  {
    # Embedding keys to look up, concatenated table by table.
    name: "KEYS"
    data_type: TYPE_INT64
    dims: [ -1 ]
  },
  {
    # Number of keys in KEYS that belong to each embedding table.
    name: "NUMKEYS"
    data_type: TYPE_INT32
    dims: [ -1 ]
  }
]
output [
  {
    # Looked-up embedding vectors, returned as one flat FP32 tensor.
    name: "OUTPUT0"
    data_type: TYPE_FP32
    dims: [ -1 ]
  }
]
# Serve only version 1 of the model.
version_policy: {
  specific: { versions: 1 }
}
# One model instance on GPU 0.
instance_group [
  {
    count: 1
    kind: KIND_GPU
    gpus: [ 0 ]
  }
]


Writing /hps_infer/embedding/hps_wdl/config.pbtxt


## 2.4 Configure hashmap for the localized embedding table storage
In this case, we only use the local hash map for demonstration. If you need to use a distributed Redis cluster and RocksDB for hierarchical embedding table storage, please refer to the detailed introduction [here](../docs/hierarchical_parameter_server.md#configuration), and add the corresponding configuration to the hps.json below.

## 2.5 Generate the HPS configuration for deploying embedding tables

In [13]:
%%writefile /hps_infer/embedding/hps.json
{
  "supportlonglong": true,
  "volatile_db": {
    "type": "hash_map",
    "user_name": "default",
    "num_partitions": 8,
    "max_get_batch_size": 100000,
    "max_set_batch_size": 100000,
    "overflow_policy": "evict_oldest",
    "overflow_margin": 10000000,
    "overflow_resolution_target": 0.8,
    "initial_cache_rate": 1.0
  },
  "persistent_db": {
    "type": "disabled"
  },
  "models": [
    {
      "model": "hps_wdl",
      "sparse_files": [
        "/hps_infer/embedding/hps_wdl/1/wdl0_sparse_20000.model",
        "/hps_infer/embedding/hps_wdl/1/wdl1_sparse_20000.model"
      ],
      "num_of_worker_buffer_in_pool": 3,
      "embedding_table_names": ["embedding_table1", "embedding_table2"],
      "embedding_vecsize_per_table": [1, 16],
      "maxnum_catfeature_query_per_table_per_sample": [2, 26],
      "default_value_for_each_table": [0.0, 0.0],
      "deployed_device_list": [0],
      "max_batch_size": 1024,
      "cache_refresh_percentage_per_iteration": 0.2,
      "hit_rate_threshold": 0.9,
      "gpucacheper": 0.5,
      "gpucache": true
    }
  ]
}


Overwriting /hps_infer/embedding/hps.json


In [12]:
!ls -l $wdl_embedding_repo
!ls -l $wdl_version

total 8
drwxr-xr-x 4 root root 4096 Jun 29 07:54 1
-rw-r--r-- 1 root root  408 Jun 29 08:02 config.pbtxt
total 5840
-rwxr-xr-x 1 root root    3590 Jun 29 07:54 wdl.json
drwxr-xr-x 2 root root    4096 Jun 29 07:54 wdl0_sparse_20000.model
drwxr-xr-x 2 root root    4096 Jun 29 07:54 wdl1_sparse_20000.model
-rw-r--r-- 1 root root 5963780 Jun 29 07:54 wdl_dense_20000.model


# 3. Deploy HPS Backend on Triton Server

At this stage, you should have already launched the Triton Inference Server with the command shown below.

In this tutorial, we deploy the Wide&Deep model to a single A100 (32GB).

Note: `Since background processes are not supported by Jupyter, please launch the Triton Server independently in the background, using the following command.`

In [None]:
!tritonserver --model-repository=/hps_infer/embedding/ --load-model=hps_wdl \
    --model-control-mode=explicit \
    --backend-directory=/usr/local/hugectr/backends \
    --backend-config=hps,ps=/hps_infer/embedding/hps.json
!tritonserver --model-repository=/hps_infer/embedding/ --load-model=hps_wdl --model-control-mode=explicit --backend-directory=/usr/local/hugectr/backends --backend-config=hps,ps=/hps_infer/embedding/hps.json

### 3.1 Check that the Triton server is ready and the two embedding tables were deployed successfully

In [18]:
# Query the server's readiness endpoint; -v prints the request and response headers
# so the HTTP status line of the reply is visible in the cell output.
!curl -v localhost:8000/v2/health/ready

*   Trying 127.0.0.1:8000...
* TCP_NODELAY set
* Connected to localhost (127.0.0.1) port 8000 (#0)
> GET /v2/health/ready HTTP/1.1
> Host: localhost:8000
> User-Agent: curl/7.68.0
> Accept: */*
> 
* Mark bundle as not supporting multiuse
< HTTP/1.1 200 OK
< Content-Length: 0
< Content-Type: text/plain
< 
* Connection #0 to host localhost left intact


# 4. Prepare Inference Request

### 4.1 Read validation data

In [13]:
# List the validation data directory; part_0.parquet is the file read in the next cell.
!ls -l /wdl_train/val

total 645762
-rw-r--r-- 1 root root        32 Nov 29 05:27 _file_list.txt
-rw-r--r-- 1 root root   8554464 Nov 29 05:27 _hugectr.keyset
-rw-r--r-- 1 root root     22726 Nov 29 05:27 _metadata
-rw-r--r-- 1 root root      1509 Nov 29 05:27 _metadata.json
-rw-r--r-- 1 root root 142825257 Nov 29 05:27 part_0.parquet
-rw-r--r-- 1 root root     21459 Nov 29 05:27 schema.pbtxt
drwxr-xr-x 2 root root      4096 Nov 29 05:26 temp-parquet-after-conversion
-rw-r--r-- 1 root root 509766965 Nov 29 03:50 test.txt


In [23]:
import pandas as pd

# Load the validation parquet part into a DataFrame for building inference requests.
df = pd.read_parquet(path="/wdl_train/val/part_0.parquet")

In [24]:
# Preview the first five rows of the validation data.
df.head(n=5)

Unnamed: 0,I1,I2,I3,I4,I5,I6,I7,I8,I9,I10,...,C18,C19,C20,C21,C22,C23,C24,C25,C26,label
0,-0.055886,-0.548824,-0.272394,-0.157301,-0.224758,-0.206385,-0.064249,0.096421,-0.543133,-0.470383,...,1,1,3856,4891,4119,143,50,1,1,0.0
1,-0.059432,-0.380376,-0.272394,5.629719,-0.224758,-0.206385,-0.064249,-0.279201,-0.253935,-0.470383,...,2,1,2,2,2,0,327,2,1,0.0
2,-0.059432,-0.539315,-0.594327,-0.142386,-0.193763,-0.206385,-0.064249,-0.023569,-0.687732,-0.470383,...,1,1,0,2439,41980,349,3549,6,1,1.0
3,-0.059432,-0.463242,-0.594327,-0.097641,-0.209261,-0.206385,-0.064249,-0.219206,-0.687732,-0.470383,...,1,1,4024,3677,4287,565,306,4,1,0.0
4,0.022145,-0.509429,-0.379705,-0.151335,-0.162767,-0.206385,-0.064249,-0.28181,-0.470833,-0.470383,...,2,3,40847,3862,41562,1066,132,2,1,0.0


In [25]:
# Save the first 10 rows as a comma-separated file (header row kept, index column dropped);
# the client script reads this file to build its request.
df.iloc[:10].to_csv("/hps_infer/infer_test.csv", index=False)

## 4.2 Follow the Triton requirements to generate inference requests

In [31]:
%%writefile '/hps_infer/hps2predict.py'
from tritonclient.utils import *
import tritonclient.http  as httpclient
import numpy as np
import pandas as pd
import sys

model_name = 'hps_wdl'
batch_szie = 10
emb1 = ["C1_C2","C3_C4"]
emb2 = ["C" + str(x) for x in range(1, 27)]
CATEGORICAL_COLUMNS= emb1 + emb2
CONTINUOUS_COLUMNS=["I" + str(x) for x in range(1, 14)]
LABEL_COLUMNS = ['label']
#This is the bias（offset）added the preprocessed training data, 
#which is added to inference data ensure that the embedded key of inference and training are in the same range, 
#and have nothing to do with the lookup logic of HPS
emb_size_array = [278018, 415262,249058, 19561, 14212, 6890, 18592, 4, 6356, 1254, 52, 226170, 80508, 72308, 11, 2169, 7597, 61, 4, 923, 15, 249619, 168974, 243480, 68212, 9169, 75, 34]
shift = np.insert(np.cumsum(emb_size_array), 0, 0)[:-1]
test_df=pd.read_csv("/hps_infer/infer_test.csv",sep=',').head(batch_szie)



with httpclient.InferenceServerClient("localhost:8000") as client:
    dense_features = np.array([list(test_df[CONTINUOUS_COLUMNS].values.flatten())],dtype='float32')
    input  = test_df[CATEGORICAL_COLUMNS]+shift
    #Input format of "KEYS"= [keys of embedding table1, keys of embedding table2,...]
    #Input format of "NUMKEYS"= [ the number of keys in embedding table1 for looking up, the number of keys in embedding table2 for looking up,...]
    embedding_columns = np.array([list(input[emb1].values.flatten())+list(input[emb2].values.flatten())],dtype='int64')

    row_ptrs = np.array([[batch_szie*2,batch_szie*26]],dtype='int32')

    inputs = [
        httpclient.InferInput("KEYS", embedding_columns.shape,
                              np_to_triton_dtype(embedding_columns.dtype)),
        httpclient.InferInput("NUMKEYS", row_ptrs.shape,
                              np_to_triton_dtype(row_ptrs.dtype)),

    ]

    inputs[0].set_data_from_numpy(embedding_columns)
    inputs[1].set_data_from_numpy(row_ptrs)
    outputs = [
        httpclient.InferRequestedOutput("OUTPUT0")
    ]

    outputs = [
        httpclient.InferRequestedOutput("OUTPUT0")
    ]

    response = client.infer(model_name,
                            inputs,
                            request_id=str(1),
                            outputs=outputs)

    result = response.get_response()
    print(result)
    print("The embedding vecotor shape is(batchsize*2*embedding1_vector_size + batchsize*26*embedding2_vector_size):")
    print(response.as_numpy("OUTPUT0").shape)
    print("Prediction Result:")
    print(response.as_numpy("OUTPUT0"))


Overwriting /hps_infer/hps2predict.py


## 4.3 Send requests to HPS Backend

In [28]:
# Run the client script written by the previous cell; it prints the raw response
# metadata, the shape of OUTPUT0 and the returned values.
!python3 /hps_infer/hps2predict.py

{'id': '1', 'model_name': 'hps_wdl', 'model_version': '1', 'parameters': {'NumSample': 10, 'DeviceID': 0}, 'outputs': [{'name': 'OUTPUT0', 'datatype': 'FP32', 'shape': [4180], 'parameters': {'binary_data_size': 16720}}]}
The embedding vecotor shape is(batchsize*2*embedding1_vector_size + batchsize*26*embedding2_vector_size):
(4180,)
Prediction Result:
[-0.05476952 -0.08410043 -0.00947467 ... -0.00037787  0.02533177
  0.00137331]
