In [1]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# 1. Overview
In this notebook, we provide a tutorial on how to take the standard DLRM model that was trained in the HugeCTR_DLRM_Training
notebook and deploy the saved model to Triton Inference Server. We then collect inference benchmarks with the Triton Performance Analyzer tool.

1. [Overview](#1)
2. [Generate the DLRM Deployment Configuration](#2)
3. [Deploy DLRM on Triton Server](#3)
4. [Prepare Inference Input Data](#4)
5. [Get Inference Benchmark by Triton Performance Tool](#5)

# 2. Generate the DLRM Deployment Configuration

### 2.1 Generate related model folders

In [1]:
# define some data folder to store the model related files
# Standard Libraries
import os
from time import time
import re
import shutil
import glob
import warnings

# Layout of the Triton model repository used throughout this notebook:
#   /dlrm_infer/model          <- model repository root (ps.json is written here)
#   /dlrm_infer/model/dlrm     <- the "dlrm" model (config.pbtxt is written here)
#   /dlrm_infer/model/dlrm/1   <- version folder (model files are copied here)
BASE_DIR = "/dlrm_infer"
model_folder = os.path.join(BASE_DIR, "model")
dlrm_model_repo = os.path.join(model_folder, "dlrm")
dlrm_version = os.path.join(dlrm_model_repo, "1")

# Start from a clean repository on every run. Removing the root also removes
# the nested model and version folders, so they need no separate checks.
if os.path.isdir(model_folder):
    shutil.rmtree(model_folder)
# makedirs creates the intermediate folders (model/ and model/dlrm/) as well.
os.makedirs(dlrm_version)


### 2.2 Copy DLRM model files to model repository

In [4]:
! cp -r /dlrm_train/dlrm0_sparse_20000.model $dlrm_version/
! cp /dlrm_train/dlrm_dense_20000.model $dlrm_version/
! cp /dlrm_train/dlrm.json $dlrm_version/
!ls -l $dlrm_version

total 10858568
-rw-r--r-- 1 root root 11100190464 Mar 30 03:28 0_sparse_20000.model
-rw-r--r-- 1 root root     9479684 Mar 30 03:28 _dense_20000.model
-rw-r--r-- 1 root root        2887 Jul  6 11:03 dlrm.json
drwxr-xr-x 2 root root        4096 Jul  6 11:02 dlrm0_sparse_20000.model
-rw-r--r-- 1 root root     9479684 Jul  6 11:03 dlrm_dense_20000.model


### 2.3 Generate the Triton configuration for deploying DLRM 

In [32]:
%%writefile $dlrm_model_repo/config.pbtxt
name: "dlrm"
backend: "hugectr"
max_batch_size:1,
input [
   {
    name: "DES"
    data_type: TYPE_FP32
    dims: [ -1 ]
  },
  {
    name: "CATCOLUMN"
    data_type: TYPE_INT64
    dims: [ -1 ]
  },
  {
    name: "ROWINDEX"
    data_type: TYPE_INT32
    dims: [ -1 ]
  }
]
output [
  {
    name: "OUTPUT0"
    data_type: TYPE_FP32
    dims: [ -1 ]
  }
]
instance_group [
  {
    count: 1
    kind : KIND_GPU
    gpus:[2]
  }
]

parameters [
  {
  key: "config"
  value: { string_value: "/dlrm_infer/model/dlrm/1/dlrm.json" }
  },
  {
  key: "gpucache"
  value: { string_value: "true" }
  },
  {
  key: "hit_rate_threshold"
  value: { string_value: "0.8" }
  },
  {
  key: "gpucacheper"
  value: { string_value: "0.5" }
  },
  {
  key: "label_dim"
  value: { string_value: "1" }
  },
  {
  key: "slots"
  value: { string_value: "26" }
  },
  {
  key: "cat_feature_num"
  value: { string_value: "26" }
  },
 {
  key: "des_feature_num"
  value: { string_value: "13" }
  },
  {
  key: "max_nnz"
  value: { string_value: "2" }
  },
  {
  key: "embedding_vector_size"
  value: { string_value: "128" }
  },
  {
  key: "embeddingkey_long_type"
  value: { string_value: "true" }
  }
]

Overwriting /dlrm_infer/model/dlrm/config.pbtxt


### 2.4 Generate the HugeCTR Backend parameter server configuration for deploying DLRM

In [6]:
%%writefile $model_folder/ps.json
{
    "supportlonglong":true,
    "models":[
        {
            "model":"dlrm",
            "sparse_files":["/dlrm_infer/model/dlrm/1/dlrm0_sparse_20000.model"],
            "dense_file":"/dlrm_infer/model/dlrm/1/dlrm_dense_20000.model",
            "network_file":"/dlrm_infer/model/dlrm/1/dlrm.json"
        }
    ]  
}

Writing /dlrm_infer/model/ps.json


In [7]:
!ls -l $dlrm_version
!ls -l $dlrm_model_repo

total 10858568
-rw-r--r-- 1 root root 11100190464 Mar 30 03:28 0_sparse_20000.model
-rw-r--r-- 1 root root     9479684 Mar 30 03:28 _dense_20000.model
-rw-r--r-- 1 root root        2887 Jul  6 11:03 dlrm.json
drwxr-xr-x 2 root root        4096 Jul  6 11:02 dlrm0_sparse_20000.model
-rw-r--r-- 1 root root     9479684 Jul  6 11:03 dlrm_dense_20000.model
total 8
drwxr-xr-x 3 root root 4096 Jul  6 11:02 1
-rw-r--r-- 1 root root 1107 Apr  8 08:04 config.pbtxt


## 3. Deploy DLRM on Triton Server
At this stage, you should have already launched the Triton Inference Server container with the `docker run` command shown below.

In this tutorial, we will deploy the DLRM model to a single V100 (32GB) GPU.

docker run --gpus=all -it -v /dlrm_infer/:/dlrm_infer -v /dlrm_train/:/dlrm_train --net=host nvcr.io/nvidia/merlin/merlin-inference:0.7 /bin/bash

After you enter into the container you can launch triton server with the command below:

tritonserver --model-repository=/dlrm_infer/model/ --load-model=dlrm \
    --model-control-mode=explicit \
    --backend-directory=/usr/local/hugectr/backends \
    --backend-config=hugectr,ps=/dlrm_infer/model/ps.json
    
Note: The model-repository path is /dlrm_infer/model/. The path for the dlrm model network json file is /dlrm_infer/model/dlrm/1/dlrm.json. The path for the parameter server configuration file is /dlrm_infer/model/ps.json.

## 4. Prepare Inference Input Data 

### 4.1 Read validation data

In [8]:
!ls -l /dlrm_train/dlrm/val

total 2938676
-rw-r--r-- 1 root root 704993648 Mar 25 09:13 0.2a6cbfbca91e420cb6f68536656260fb.parquet
-rw-r--r-- 1 root root 704993648 Mar 25 13:31 0.83ab760d4f4b4505a397e9b90247eb4a.parquet
-rw-r--r-- 1 root root 128130255 Apr 12 06:15 0.95a75de478af4decb84e22393d0a5205.parquet
-rw-r--r-- 1 root root 128130255 Apr  7 08:12 0.d7c746376f3743608223bca2cfe4fb8a.parquet
-rw-r--r-- 1 root root 128130255 Mar 31 13:05 0.e611389223d34a0c9f127d8e1ea4cb60.parquet
-rw-r--r-- 1 root root 704993648 Mar 25 13:26 0.fafd8818ec0f4f41a3d3ffe689de91c2.parquet
-rw-r--r-- 1 root root        54 Apr 12 06:15 _file_list.txt
-rw-r--r-- 1 root root     26328 Apr 12 06:15 _metadata
-rw-r--r-- 1 root root      1465 Apr 12 06:15 _metadata.json
drwxr-xr-x 2 root root      4096 Apr 12 06:13 temp-parquet-after-conversion
-rw-r--r-- 1 root root 509766965 Apr 12 06:09 test.txt


In [9]:
import pandas as pd
df=pd.read_parquet('/dlrm_train/dlrm/val/0.83ab760d4f4b4505a397e9b90247eb4a.parquet',engine='pyarrow')

In [10]:
df.head(2)

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,...,I5,I6,I7,I8,I9,I10,I11,I12,I13,label
0,687,75,160,134261,62,10,7542,229,3,89976,...,-0.117298,-0.264598,-0.241559,-0.569952,-0.44111,-0.570183,-0.5111,-0.0848,-0.436591,1.0
1,630,156,46,590512,44,23,10501,2,3,22899,...,0.239154,-0.264598,-0.241559,-0.749732,-0.468817,-0.570183,-0.5111,-0.0848,-0.436591,0.0


In [11]:
# Export the first 200000 validation rows as a tab-separated file with a header row.
# criteo2predict.py reads this file with sep='\t', and the largest batch generated
# later in this notebook (131072 rows) fits within these rows.
df.head(200000).to_csv('infer_test.txt', sep='\t', index=False,header=True)

### 4.2 Follow the Triton requirements to generate input data with json format 

In [12]:
%%writefile ./criteo2predict.py
import argparse
import sys
import numpy as np
import pandas as pd
import json
import pickle

def parse_config(src_config):
    """Load and validate the input-format description file.

    The file is JSON with three keys: ``dense``, ``categorical`` and
    ``slot_size``. ``categorical`` must equal the sum of ``slot_size``.

    Args:
        src_config: Path of the JSON configuration file.

    Returns:
        tuple: ``(dense_dim, categorical_dim, slot_size)`` exactly as stored
        in the file.

    Raises:
        ValueError: If the file cannot be read, is not valid JSON, lacks one
            of the three keys, or ``categorical`` differs from the sum of
            ``slot_size``. The underlying error is chained as ``__cause__``.
    """
    try:
        with open(src_config, 'r') as data_json:
            j_data = json.load(data_json)
        dense_dim = j_data["dense"]
        categorical_dim = j_data["categorical"]
        slot_size = j_data["slot_size"]
        slot_total = np.sum(slot_size)
    except (OSError, ValueError, KeyError, TypeError) as err:
        # Fail here with the real cause instead of returning None and letting
        # the caller crash later while unpacking the result.
        raise ValueError(
            "Invalid data configuration file! (%s: %r)" % (src_config, err)
        ) from err
    # Explicit check rather than `assert`, which is skipped under `python -O`.
    if categorical_dim != slot_total:
        raise ValueError(
            "Invalid data configuration file! (%s: categorical=%r but slot_size sums to %r)"
            % (src_config, categorical_dim, slot_total)
        )
    return dense_dim, categorical_dim, slot_size

def convert(src_csv, src_config, dst, batch_size, segmentation):
    """Write the first ``batch_size`` samples of a tab-separated file as one inference request.

    The output is a JSON document of the form
    ``{"data": [{"DES": [...], "CATCOLUMN": [...], "ROWINDEX": [...]}]}``
    where each list is the flattened data of the whole batch.

    Args:
        src_csv: Path of the tab-separated input file with a header row. It
            must contain the columns ``I1..I<dense>`` and ``C1..C<categorical>``;
            other columns (for example ``label``) are ignored.
        src_config: Path of the input-format JSON file read by ``parse_config``.
        dst: Path of the JSON file to write.
        batch_size: Number of samples (rows) to put in the request.
        segmentation: Unused. Kept so existing callers keep working; the input
            is always parsed as tab-separated.

    Raises:
        ValueError: If the configuration does not describe one categorical
            column per entry of the built-in size table, or if the input file
            holds fewer than ``batch_size`` rows.
        KeyError: If a required ``I*`` / ``C*`` column is missing.
    """
    dense_dim, categorical_dim, slot_size = parse_config(src_config)
    # Per-column sizes; their running sum is the offset added to every key of
    # that column. NOTE(review): presumably the vocabulary sizes of the trained
    # embedding tables - confirm against the training configuration.
    slot_size_array = [4976199, 3289052, 282487, 138210, 11, 2203, 8901, 67, 4, 948, 15, 25419, 5577159, 1385790, 4348882, 178673, 10023, 88, 34, 14705, 7112, 19283, 4, 6391, 1282, 60]
    offset = np.insert(np.cumsum(slot_size_array), 0, 0)[:-1]
    if categorical_dim != len(offset):
        # A different column count would pair keys with the wrong offsets.
        raise ValueError(
            "Configuration declares %r categorical columns but %d are supported"
            % (categorical_dim, len(offset))
        )

    df = pd.read_csv(src_csv, sep='\t', nrows=batch_size)
    if len(df) < batch_size:
        raise ValueError(
            "%s holds only %d rows but batch_size is %d" % (src_csv, len(df), batch_size)
        )

    dense_columns = ['I' + str(i + 1) for i in range(dense_dim)]
    categorical_columns = ['C' + str(i + 1) for i in range(categorical_dim)]

    # Row-major flattening: all features of sample 0, then sample 1, ...
    dense_values = df[dense_columns].values.reshape(-1)
    # Broadcasting adds offset[j] to column j of every sample.
    categorical_keys = (df[categorical_columns].values + offset).reshape(-1)
    # Running total of slot sizes, repeated for each sample, with a leading 0:
    # batch_size * len(slot_size) + 1 entries in total.
    row_ptrs = np.concatenate(([0], np.cumsum(np.tile(slot_size, batch_size))))

    with open(dst, 'w') as dst_txt:
        dst_txt.write("{\n\"data\":[\n{\n")
        dst_txt.write("\"DES\":" + json.dumps(dense_values.tolist()))
        dst_txt.write(",\n\"CATCOLUMN\":" + json.dumps(categorical_keys.tolist()))
        dst_txt.write(",\n\"ROWINDEX\":" + json.dumps(row_ptrs.tolist()))
        dst_txt.write("\n}\n]\n}")

if __name__ == '__main__':
    # Command-line entry point: gather the options and run one conversion.
    cli = argparse.ArgumentParser(description='Convert Preprocessed Criteo Data to Inference Format')
    # The three path options are mandatory strings.
    for path_flag in ('--src_csv_path', '--src_config_path', '--dst_path'):
        cli.add_argument(path_flag, type=str, required=True)
    cli.add_argument('--batch_size', type=int, default=128)
    cli.add_argument('--segmentation', type=str, default=' ')
    options = cli.parse_args()
    convert(
        options.src_csv_path,
        options.src_config_path,
        options.dst_path,
        options.batch_size,
        options.segmentation,
    )


Overwriting ./criteo2predict.py


### 4.3 Define Inference Input Data Format

In [13]:
%%writefile ./dlrm_input_format.json
{
    "dense": 13,
    "categorical": 26,
    "slot_size": [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
}

Overwriting ./dlrm_input_format.json


### 4.4 Generate the input json data with batch size=1

In [14]:
batchsize=1
!python3 criteo2predict.py --src_csv_path=./infer_test.txt --src_config_path=dlrm_input_format.json --dst_path ./$batchsize".json" --batch_size=$batchsize --segmentation=','

### 4.5 Check the Triton server status to confirm that DLRM was deployed successfully in Step 3

In [15]:
!curl -v localhost:8000/v2/health/ready

*   Trying 127.0.0.1:8000...
* TCP_NODELAY set
* Connected to localhost (127.0.0.1) port 8000 (#0)
> GET /v2/health/ready HTTP/1.1

> Host: localhost:8000

> User-Agent: curl/7.68.0

> Accept: */*

> 

* Mark bundle as not supporting multiuse
< HTTP/1.1 200 OK

< Content-Length: 0

< Content-Type: text/plain

< 

* Connection #0 to host localhost left intact


## 5. Get Inference benchmark by Triton Performance Tool 

### 5.1 Get the inference performance for batchsize=1

In [17]:
!perf_analyzer -m dlrm -u localhost:8000 --input-data 1.json --shape CATCOLUMN:26 --shape DES:13 --shape ROWINDEX:27

 Successfully read data for 1 stream/streams with 1 step/steps.
*** Measurement Settings ***
  Batch size: 1
  Measurement window: 5000 msec
  Using synchronous calls for inference
  Stabilizing using average latency

Request concurrency: 1
  Client: 
    Request count: 5552
    Throughput: 1110.4 infer/sec
    Avg latency: 887 usec (standard deviation 263 usec)
    p50 latency: 876 usec
    p90 latency: 918 usec
    p95 latency: 941 usec
    p99 latency: 1043 usec
    Avg HTTP time: 877 usec (send/recv 74 usec + response wait 803 usec)
  Server: 
    Inference count: 6691
    Execution count: 6691
    Successful request count: 6691
    Avg request latency: 573 usec (overhead 1 usec + queue 146 usec + compute input 0 usec + compute infer 426 usec + compute output 0 usec)

Inferences/Second vs. Client Average Batch Latency
Concurrency: 1, throughput: 1110.4 infer/sec, latency 887 usec


### 5.2 Get the inference performance for batchsize=131072 


#### 5.2.1. Modify the max_batch_size from 1 to 131072 in $dlrm_model_repo/config.pbtxt

In [21]:
%%writefile $dlrm_model_repo/config.pbtxt
name: "dlrm"
backend: "hugectr"
max_batch_size:131072,
input [
   {
    name: "DES"
    data_type: TYPE_FP32
    dims: [ -1 ]
  },
  {
    name: "CATCOLUMN"
    data_type: TYPE_INT64
    dims: [ -1 ]
  },
  {
    name: "ROWINDEX"
    data_type: TYPE_INT32
    dims: [ -1 ]
  }
]
output [
  {
    name: "OUTPUT0"
    data_type: TYPE_FP32
    dims: [ -1 ]
  }
]
instance_group [
  {
    count: 1
    kind : KIND_GPU
    gpus:[2]
  }
]

parameters [
  {
  key: "config"
  value: { string_value: "/dlrm_infer/model/dlrm/1/dlrm.json" }
  },
  {
  key: "gpucache"
  value: { string_value: "true" }
  },
  {
  key: "hit_rate_threshold"
  value: { string_value: "0.8" }
  },
  {
  key: "gpucacheper"
  value: { string_value: "0.5" }
  },
  {
  key: "label_dim"
  value: { string_value: "1" }
  },
  {
  key: "slots"
  value: { string_value: "26" }
  },
  {
  key: "cat_feature_num"
  value: { string_value: "26" }
  },
 {
  key: "des_feature_num"
  value: { string_value: "13" }
  },
  {
  key: "max_nnz"
  value: { string_value: "2" }
  },
  {
  key: "embedding_vector_size"
  value: { string_value: "128" }
  },
  {
  key: "embeddingkey_long_type"
  value: { string_value: "true" }
  }
]

Overwriting /dlrm_infer/model/dlrm/config.pbtxt


#### 5.2.2. Relaunch Triton server to reload DLRM according to Step 3

#### 5.2.3. Generate the input json file with batchsize=131072

In [19]:
batchsize=131072
!python3 criteo2predict.py --src_csv_path=./infer_test.txt --src_config_path=dlrm_input_format.json --dst_path ./$batchsize".json" --batch_size=$batchsize --segmentation=','

In [31]:
!perf_analyzer -m dlrm -u localhost:8000 --input-data 131072.json --shape CATCOLUMN:3407872 --shape DES:1703936 --shape ROWINDEX:3407873

 Successfully read data for 1 stream/streams with 1 step/steps.
*** Measurement Settings ***
  Batch size: 1
  Measurement window: 5000 msec
  Using synchronous calls for inference
  Stabilizing using average latency

Request concurrency: 1
  Client: 
    Request count: 27
    Throughput: 5.4 infer/sec
    Avg latency: 191104 usec (standard deviation 1974 usec)
    p50 latency: 190973 usec
    p90 latency: 192496 usec
    p95 latency: 195875 usec
    p99 latency: 197191 usec
    Avg HTTP time: 191181 usec (send/recv 66241 usec + response wait 124940 usec)
  Server: 
    Inference count: 32
    Execution count: 32
    Successful request count: 32
    Avg request latency: 112264 usec (overhead 2 usec + queue 13793 usec + compute input 0 usec + compute infer 98469 usec + compute output 0 usec)

Inferences/Second vs. Client Average Batch Latency
Concurrency: 1, throughput: 5.4 infer/sec, latency 191104 usec


## To collect inference results for other batch sizes, repeat step 5.2 with the new batch size