In [1]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# 1 Overview
This notebook is a tutorial on how to take the standard DLRM model trained in the HugeCTR_DLRM_Training notebook
and deploy the saved model to Triton Inference Server. We then collect inference benchmarks with the Triton Performance Analyzer tool.

1. [Overview](#1)
2. [Generate the DLRM Deployment Configuration](#2)
3. [Load Models on Triton Server](#3)
4. [Prepare Inference Input Data](#4) 
5. [Inference Benchmark by Triton Performance Tool](#5) 

# 2. Generate the DLRM Deployment Configuration

## 2.1 Generate related model folders

In [1]:
# define some data folder to store the model related files
# Standard Libraries
import os
from time import time
import re
import shutil
import glob
import warnings

BASE_DIR = "/dlrm_infer"
model_folder  = os.path.join(BASE_DIR, "model")
dlrm_model_repo= os.path.join(model_folder, "dlrm")
dlrm_version =os.path.join(dlrm_model_repo, "1")

# Start from a clean model repository. WARNING: this deletes everything that a
# previous run left under model_folder, including copied model files.
if os.path.isdir(model_folder):
    shutil.rmtree(model_folder)

# makedirs creates every missing parent, so a single call builds
# model_folder, dlrm_model_repo and dlrm_version. Because model_folder was
# just removed, none of the nested folders can already exist at this point.
os.makedirs(dlrm_version)


### 2.2 Copy DLRM model files to model repository

In [2]:
# Copy the trained artifacts into the model version directory: the sparse
# embedding directory (hence -r), the dense weights file and the network JSON.
! cp -r /dlrm_train/dlrm0_sparse_20000.model $dlrm_version/
! cp /dlrm_train/dlrm_dense_20000.model $dlrm_version/
! cp /dlrm_train/dlrm.json $dlrm_version/
# List the version directory to confirm the copies are in place.
!ls -l $dlrm_version

total 8193
-rw-r--r-- 1 root root    3706 Nov 29 07:25 dlrm.json
drwxr-xr-x 2 root root    4096 Nov 29 07:25 dlrm0_sparse_20000.model
-rw-r--r-- 1 root root 9479684 Nov 29 07:25 dlrm_dense_20000.model


### 2.3 Generate the Triton configuration for deploying DLRM 

In [3]:
%%writefile $dlrm_model_repo/config.pbtxt
# Triton model configuration for the "dlrm" model, served through the "hugectr" backend.
name: "dlrm"
backend: "hugectr"
max_batch_size:64,
# Three variable-length inputs. The client script in this notebook fills DES with
# the dense features, CATCOLUMN with the shifted categorical keys and ROWINDEX
# with the row offsets.
input [
   {
    name: "DES"
    data_type: TYPE_FP32
    dims: [ -1 ]
  },
  {
    name: "CATCOLUMN"
    data_type: TYPE_INT64
    dims: [ -1 ]
  },
  {
    name: "ROWINDEX"
    data_type: TYPE_INT32
    dims: [ -1 ]
  }
]
# Single variable-length FP32 output holding the predictions.
output [
  {
    name: "OUTPUT0"
    data_type: TYPE_FP32
    dims: [ -1 ]
  }
]
# One model instance, placed on GPU 0.
instance_group [
  {
    count: 1
    kind : KIND_GPU
    gpus:[0]
  }
]

# Backend parameters, all passed as strings.
# NOTE(review): hit_rate_threshold is "0.8" here but 0.9 in the ps.json written
# by this notebook -- confirm which value is intended and keep the two in sync.
parameters [
  {
    key: "config"
    value: { string_value: "/dlrm_infer/model/dlrm/1/dlrm.json" }
  },
  {
    key: "gpucache"
    value: { string_value: "true" }
  },
  {
    key: "hit_rate_threshold"
    value: { string_value: "0.8" }
  },
  {
    key: "gpucacheper"
    value: { string_value: "0.5" }
  },
  {
    key: "label_dim"
    value: { string_value: "1" }
  },
  {
    key: "slots"
    value: { string_value: "26" }
  },
  {
    key: "cat_feature_num"
    value: { string_value: "26" }
  },
  {
    key: "des_feature_num"
    value: { string_value: "13" }
  },
  {
    key: "max_nnz"
    value: { string_value: "2" }
  },
  {
    key: "embedding_vector_size"
    value: { string_value: "128" }
  },
  {
    key: "embeddingkey_long_type"
    value: { string_value: "true" }
  }
]

Writing /dlrm_infer/model/dlrm/config.pbtxt


### 2.4 Generate the HugeCTR Backend parameter server configuration for deploying DLRM

In [4]:
%%writefile $model_folder/ps.json
{
    "supportlonglong":true,
    "db_type": "local",
    "models":[
        {
            "model":"dlrm",
            "sparse_files":["/dlrm_infer/model/dlrm/1/dlrm0_sparse_20000.model"],
            "dense_file":"/dlrm_infer/model/dlrm/1/dlrm_dense_20000.model",
            "network_file":"/dlrm_infer/model/dlrm/1/dlrm.json",
            "num_of_worker_buffer_in_pool": 4,
            "num_of_refresher_buffer_in_pool":1,
            "deployed_device_list":[0],
            "max_batch_size":64,
            "default_value_for_each_table":[0.0,0.0],
            "hit_rate_threshold":0.9,
            "gpucacheper":0.5,
            "gpucache":true,
            "cache_refresh_percentage_per_iteration":0.2,
            "maxnum_des_feature_per_sample": 13,
            "maxnum_catfeature_query_per_table_per_sample":[26],
            "embedding_vecsize_per_table":[128],
            "slot_num":26
        }
    ]  
}

Writing /dlrm_infer/model/ps.json


In [5]:
# Check the final layout: the model files in the version directory, and the
# version directory plus config.pbtxt in the model repository.
!ls -l $dlrm_version
!ls -l $dlrm_model_repo

total 9281
-rw-r--r-- 1 root root    3706 Nov 29 07:25 dlrm.json
drwxr-xr-x 2 root root    4096 Nov 29 07:25 dlrm0_sparse_20000.model
-rw-r--r-- 1 root root 9479684 Nov 29 07:25 dlrm_dense_20000.model
total 1
drwxr-xr-x 3 root root 4096 Nov 29 07:25 1
-rw-r--r-- 1 root root 1177 Nov 29 07:25 config.pbtxt


## 3. Deploy DLRM on Triton Server
At this stage, you should have already launched the Triton Inference Server container using the `docker run` command shown below.

In this tutorial, we will deploy the DLRM to a single V100(32GB)

docker run --gpus=all -it -v /dlrm_infer/:/dlrm_infer -v /dlrm_train/:/dlrm_train --net=host nvcr.io/nvidia/merlin/merlin-inference:22.06 /bin/bash

After you enter into the container you can launch triton server with the command below:

tritonserver --model-repository=/dlrm_infer/model/ --load-model=dlrm \
    --model-control-mode=explicit \
    --backend-directory=/usr/local/hugectr/backends \
    --backend-config=hugectr,ps=/dlrm_infer/model/ps.json
    
Note: The model-repository path is /dlrm_infer/model/. The path for the dlrm model network json file is /dlrm_infer/model/dlrm/1/dlrm.json. The path for the parameter server configuration file is /dlrm_infer/model/ps.json.

## 4. Prepare Inference Input Data 

### 4.1 Read validation data

In [7]:
# List the contents of the validation data directory.
!ls -l /dlrm_train/val

total 702242
-rw-r--r-- 1 root root        33 Nov 29 06:42 _file_list.txt
-rw-r--r-- 1 root root  81092112 Nov 29 06:42 _hugectr.keyset
-rw-r--r-- 1 root root     21528 Nov 29 06:42 _metadata
-rw-r--r-- 1 root root      1437 Nov 29 06:42 _metadata.json
-rw-r--r-- 1 root root 128131055 Nov 29 06:42 part_0.parquet
-rw-r--r-- 1 root root     19945 Nov 29 06:42 schema.pbtxt
drwxr-xr-x 2 root root      4096 Nov 29 06:41 temp-parquet-after-conversion
-rw-r--r-- 1 root root 509766965 Nov 29 06:39 test.txt


In [9]:
import pandas as pd
# Load the validation parquet part into a DataFrame using the pyarrow engine.
df=pd.read_parquet('/dlrm_train/val/part_0.parquet',engine='pyarrow')

In [14]:
# Preview the first rows to check the categorical (C*), dense (I*) and label columns.
df.head()

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,...,I5,I6,I7,I8,I9,I10,I11,I12,I13,label
0,669260,28,473,18,2,1,157,4,3,19900,...,-0.209261,-0.206385,-0.064249,-0.28181,0.035263,-0.470383,-0.261958,-0.17375,-0.262248,0.0
1,1,1,31,2,20,1,51,3,1,1,...,-0.209261,-0.206385,-0.064249,-0.258333,-0.760031,-0.470383,-0.261958,-0.19454,-0.262248,0.0
2,2,25,148,52,2,1,101,6,5,2,...,0.240178,0.205918,-0.064249,-0.276593,2.204247,1.386036,0.690729,-0.277389,0.046789,0.0
3,177,61,377,2,1,1,402,17,1,209,...,0.023207,0.068484,-0.064249,-0.276593,1.987348,1.386036,0.055604,-0.289136,-0.306396,0.0
4,2,3,30,4,2,1,8,1,2,2,...,1.526501,-0.206385,0.339399,-0.28181,0.39676,-0.470383,2.596102,-0.25714,0.399973,0.0


In [16]:
# Export the first 200,000 rows as a comma-separated CSV with a header row and no index column.
# NOTE(review): the file is written relative to the current working directory, while
# dlrm2predict.py later reads /dlrm_infer/infer_test.csv -- this presumably assumes the
# notebook runs from /dlrm_infer; confirm.
df.head(200000).to_csv('infer_test.csv', sep=',', index=False,header=True)

### 4.2 Follow the Triton requirements to generate input data with json format for performance test

In [18]:
%%writefile ./criteo2predict.py
import argparse
import sys
import numpy as np
import pandas as pd
import json
import pickle

def parse_config(src_config):
    """Load the input-format description used to lay out an inference request.

    Args:
        src_config: Path to a JSON file containing the keys ``"dense"``,
            ``"categorical"`` and ``"slot_size"``. ``"slot_size"`` is a list of
            per-slot sizes whose sum must equal ``"categorical"``.

    Returns:
        The tuple ``(dense_dim, categorical_dim, slot_size)`` read from the file.

    Raises:
        ValueError: If the file cannot be opened or parsed, a required key is
            missing, or ``categorical`` does not equal the sum of ``slot_size``.
    """
    try:
        with open(src_config, 'r') as data_json:
            j_data = json.load(data_json)
        dense_dim = j_data["dense"]
        categorical_dim = j_data["categorical"]
        slot_size = j_data["slot_size"]
        # Explicit check instead of `assert`, which is removed under `python -O`.
        consistent = bool(categorical_dim == np.sum(slot_size))
    except (OSError, ValueError, KeyError, TypeError) as err:
        # json.JSONDecodeError is a ValueError subclass. Fail loudly here rather
        # than returning None and letting the caller crash on tuple unpacking.
        raise ValueError(
            "Invalid data configuration file! (%s: %s)" % (src_config, err)
        ) from err
    if not consistent:
        raise ValueError(
            "Invalid data configuration file! (%s: 'categorical' must equal "
            "the sum of 'slot_size')" % src_config
        )
    return dense_dim, categorical_dim, slot_size

def convert(src_csv, src_config, dst, batch_size,segmentation):
    """Write the first ``batch_size`` CSV rows as one JSON inference request.

    The output file has a single entry under ``"data"`` with three flat arrays:
    ``"DES"`` (dense features, row by row), ``"CATCOLUMN"`` (categorical values
    shifted by a per-column offset) and ``"ROWINDEX"`` (running offsets built
    from ``slot_size``, ``batch_size * len(slot_size) + 1`` entries).

    Args:
        src_csv: Path to a comma-separated CSV with columns ``I1..I<dense>`` and
            ``C1..C<categorical>``.
        src_config: Path to the JSON input-format file read by ``parse_config``.
        dst: Path of the JSON file to write (overwritten if present).
        batch_size: Number of leading CSV rows to pack into the request.
        segmentation: Unused; the CSV is always read with ``','`` as separator.
            Kept so existing callers and the command line keep working.

    Raises:
        ValueError: If the configured number of categorical features does not
            match the built-in offset table, or the CSV has fewer than
            ``batch_size`` rows.
    """
    dense_dim, categorical_dim, slot_size = parse_config(src_config)
    # Per-column sizes used to shift every categorical column into its own key
    # range (presumably the cardinalities of C1..C26 from preprocessing -- TODO confirm).
    slot_size_array=[4976199, 25419, 14705, 7112, 19283, 4, 6391, 1282, 60, 3289052, 282487, 138210, 11, 2203, 8901, 67, 4, 948, 15, 5577159, 1385790, 4348882, 178673, 10023, 88, 34]
    if categorical_dim != len(slot_size_array):
        # The offsets below are applied column-wise, so the counts must agree;
        # otherwise keys would silently be shifted by the wrong offset.
        raise ValueError(
            "config declares %d categorical features but the offset table has %d entries"
            % (categorical_dim, len(slot_size_array))
        )
    # Exclusive prefix sum: offset[k] is the total size of columns 0..k-1.
    offset = np.insert(np.cumsum(slot_size_array), 0, 0)[:-1]

    df = pd.read_csv(src_csv,  sep=',', nrows=batch_size)
    if len(df) < batch_size:
        raise ValueError(
            "%s contains only %d rows but batch_size is %d" % (src_csv, len(df), batch_size)
        )

    dense_columns = ['I'+str(i+1) for i in range(dense_dim)]
    categorical_columns = ['C'+str(i+1) for i in range(categorical_dim)]
    # Flatten row by row so each sample's features stay contiguous.
    dense_values = df[dense_columns].values.reshape(-1).tolist()
    categorical_keys = (df[categorical_columns].values + offset).reshape(-1).tolist()

    # Running offsets: one increment per slot, repeated for every sample.
    row_ptrs = [0]
    for _ in range(batch_size):
        for slot_nnz in slot_size:
            row_ptrs.append(row_ptrs[-1] + slot_nnz)

    with open(dst, 'w') as dst_txt:
        dst_txt.write("{\n\"data\":[\n{\n")
        dst_txt.write("\"DES\":")
        dst_txt.write(json.dumps(dense_values))
        dst_txt.write(",\n\"CATCOLUMN\":")
        dst_txt.write(json.dumps(categorical_keys))
        dst_txt.write(",\n\"ROWINDEX\":")
        dst_txt.write(json.dumps(row_ptrs))
        dst_txt.write("\n}\n]\n}")

if __name__ == '__main__':
    # Command-line entry point: declare the options, parse them, and hand the
    # values straight to convert().
    cli_parser = argparse.ArgumentParser(description='Convert Preprocessed Criteo Data to Inference Format')
    option_specs = (
        ('--src_csv_path', dict(type=str, required=True)),
        ('--src_config_path', dict(type=str, required=True)),
        ('--dst_path', dict(type=str, required=True)),
        ('--batch_size', dict(type=int, default=128)),
        ('--segmentation', dict(type=str, default=' ')),
    )
    for flag, options in option_specs:
        cli_parser.add_argument(flag, **options)
    cli_args = cli_parser.parse_args()
    convert(
        cli_args.src_csv_path,
        cli_args.src_config_path,
        cli_args.dst_path,
        cli_args.batch_size,
        cli_args.segmentation,
    )


Overwriting ./criteo2predict.py


### 4.3 Define Inference Input Data Format

In [19]:
%%writefile ./dlrm_input_format.json
{
    "dense": 13,
    "categorical": 26,
    "slot_size": [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
}

Writing ./dlrm_input_format.json


### 4.4 Generate the input json data with batch size=1

In [20]:
# Batch size of the generated request; it is also used as the output file name (./1.json here).
batchsize=1
# Convert the first `batchsize` rows of infer_test.csv into the JSON request file.
!python3 criteo2predict.py --src_csv_path=./infer_test.csv --src_config_path=dlrm_input_format.json --dst_path ./$batchsize".json" --batch_size=$batchsize --segmentation=','

### 4.5 Check the Triton server status to confirm that DLRM was deployed successfully in Step 3

In [17]:
# Query the server's HTTP readiness endpoint; -v prints the response status line and headers.
!curl -v localhost:8000/v2/health/ready

*   Trying 127.0.0.1:8000...
* TCP_NODELAY set
* Connected to localhost (127.0.0.1) port 8000 (#0)
> GET /v2/health/ready HTTP/1.1

> Host: localhost:8000

> User-Agent: curl/7.68.0

> Accept: */*

> 

* Mark bundle as not supporting multiuse
< HTTP/1.1 200 OK

< Content-Length: 0

< Content-Type: text/plain

< 

* Connection #0 to host localhost left intact


## 5. Get Inference benchmark by Triton Performance Tool 

### 5.1 Get the inference performance for batchsize=1

In [21]:
# Benchmark the dlrm model with the generated 1.json request. Shapes for batch size 1:
# 26 categorical keys, 13 dense features, and 26 + 1 row offsets.
!perf_analyzer -m dlrm -u localhost:8000 --input-data 1.json --shape CATCOLUMN:26 --shape DES:13 --shape ROWINDEX:27

 Successfully read data for 1 stream/streams with 1 step/steps.
*** Measurement Settings ***
  Batch size: 1
  Using "time_windows" mode for stabilization
  Measurement window: 5000 msec
  Using synchronous calls for inference
  Stabilizing using average latency

Request concurrency: 1
  Client: 
    Request count: 7323
    Throughput: 1464.6 infer/sec
    Avg latency: 675 usec (standard deviation 117 usec)
    p50 latency: 651 usec
    p90 latency: 780 usec
    p95 latency: 814 usec
    p99 latency: 907 usec
    Avg HTTP time: 671 usec (send/recv 42 usec + response wait 629 usec)
  Server: 
    Inference count: 8796
    Execution count: 8796
    Successful request count: 8796
    Avg request latency: 445 usec (overhead 1 usec + queue 123 usec + compute input 0 usec + compute infer 321 usec + compute output 0 usec)

Inferences/Second vs. Client Average Batch Latency
Concurrency: 1, throughput: 1464.6 infer/sec, latency 675 usec


## 6. Get Inference result from Triton server

In [25]:
%%writefile /dlrm_infer/dlrm2predict.py
from tritonclient.utils import *
import tritonclient.http  as httpclient
import numpy as np
import pandas as pd
import sys

model_name = 'dlrm'
CATEGORICAL_COLUMNS=["C" + str(x) for x in range(1, 27)]
CONTINUOUS_COLUMNS=["I" + str(x) for x in range(1, 14)]
LABEL_COLUMNS = ['label']
# Number of leading CSV rows sent in the single inference request. Keep it at
# or below the model's max_batch_size (64 in this notebook's config.pbtxt).
NUM_SAMPLES = 10
# Per-column sizes; their exclusive prefix sum shifts every categorical column
# into its own key range.
emb_size_array = [4976199, 25419, 14705, 7112, 19283, 4, 6391, 1282, 60, 3289052, 282487, 138210, 11, 2203, 8901, 67, 4, 948, 15, 5577159, 1385790, 4348882, 178673, 10023, 88, 34]
shift = np.insert(np.cumsum(emb_size_array), 0, 0)[:-1]
# Only the rows that are actually sent are read, not the whole CSV.
test_df=pd.read_csv("/dlrm_infer/infer_test.csv",sep=',',nrows=NUM_SAMPLES)

# Build the three request tensors, each with shape (1, N) and flattened row by row.
num_samples = len(test_df)
dense_features = test_df[CONTINUOUS_COLUMNS].values.reshape(1, -1).astype('float32')
embedding_columns = (test_df[CATEGORICAL_COLUMNS] + shift).values.reshape(1, -1).astype('int64')
# One key per categorical column per sample, so the row offsets are simply
# 0..num_samples * 26 (inclusive); derived from the data instead of hard-coded.
row_ptrs = np.arange(num_samples * len(CATEGORICAL_COLUMNS) + 1, dtype='int32').reshape(1, -1)

with httpclient.InferenceServerClient("localhost:8000") as client:
    inputs = [
        httpclient.InferInput("DES", dense_features.shape,
                              np_to_triton_dtype(dense_features.dtype)),
        httpclient.InferInput("CATCOLUMN", embedding_columns.shape,
                              np_to_triton_dtype(embedding_columns.dtype)),
        httpclient.InferInput("ROWINDEX", row_ptrs.shape,
                              np_to_triton_dtype(row_ptrs.dtype)),

    ]

    inputs[0].set_data_from_numpy(dense_features)
    inputs[1].set_data_from_numpy(embedding_columns)
    inputs[2].set_data_from_numpy(row_ptrs)
    outputs = [
        httpclient.InferRequestedOutput("OUTPUT0")
    ]

    response = client.infer(model_name,
                            inputs,
                            request_id=str(1),
                            outputs=outputs)

    # Print the raw response metadata first, then the decoded predictions.
    result = response.get_response()
    print(result)
    print("Prediction Result:")
    print(response.as_numpy("OUTPUT0"))

Overwriting /dlrm_infer/dlrm2predict.py


In [29]:
# Run the client by the absolute path it was written to, so the cell works
# regardless of the notebook's current working directory.
!python /dlrm_infer/dlrm2predict.py

{'id': '1', 'model_name': 'dlrm', 'model_version': '1', 'parameters': {'NumSample': 10, 'DeviceID': 2}, 'outputs': [{'name': 'OUTPUT0', 'datatype': 'FP32', 'shape': [10], 'parameters': {'binary_data_size': 40}}]}
Prediction Result:
[0.02984182 0.03024833 0.03550119 0.03566186 0.04245038 0.03023028
 0.02834382 0.03364136 0.02965043 0.03000181]
