In [1]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

<a id='section1'></a>

# 1.Overview

In this notebook, we want to provide a tutorial about how to use the Hierarchical Parameter Server(HPS) backend to look up the embedding keys for inference service, and combine with pytorch and TernsorRT Triton backend 

1. <a href='#section1'>Overview</a> 
2. [Generate sythetic datasets to train native Pytorch DNN model and deploy the pytorch model using pytorch triton backend](#section2) 
3. [Separate the trained DNN model graph into two, embedding lookup and dense model graph](#section3)  
    3.1 [Deploy dense part model using pytorch Triton backend](#section3.1)  
    3.2 [Deploy the embedding part using HPS Triton Backend](#section3.2)   
    3.3 [Configure "ensemble_model" Triton backend for Embedding and Dense model](#section3.3) 
4. <a href='#section4'>Use TensorRT to speed up dense model inference and combine with HPS Backend</a>

<a id='#section2'></a>

<a id='section2'></a>

# 2. Train Pytorch DNN Model Based on Sythetic Dataset

In [1]:
import os
import numpy as np
import torch 
import struct

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
args = dict()
# define model training settings
args["gpu_num"] = 4                               # the number of available GPUs
args["num_sample"] = 64                           # the number of training sample
args["iter_num"] = 20                             # the number of training iteration
args["embed_vec_size"] = 32                       # the dimension of embedding vectors
args["global_batch_size"] = 32                 # the globally batchsize for all GPUs
args["max_vocabulary_size"] = 1000              # the num of embeddings in embedding table
args["vocabulary_range_per_slot"] = [[0,1000]]  # the range of embedding keys in embedding table
# define data type
args["np_key_type"]    = np.int64
args["np_vector_type"] = np.float32
args["tf_key_type"]    = torch.int64
args["tf_vector_type"] = torch.float32

# GPU environment configuration for model training
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, range(args["gpu_num"])))

## 2.1 Generate training data

In [10]:

def generate_random_samples(num_samples, vocabulary_range_per_slot, key_dtype = np.int64):
    """
    Data generator
    
    Returns a randomly generated set of values for keys and labels
    """
    keys = list()
    for vocab_range in vocabulary_range_per_slot:
        keys_per_slot = np.random.randint(low=vocab_range[0], 
                                          high=vocab_range[1], 
                                          size=(num_samples, 1), 
                                          dtype=key_dtype)
        keys.append(keys_per_slot)
    keys = np.concatenate(np.array(keys), axis = 1)
    labels = np.random.randint(low=0, high=2, size=(num_samples, 1))
    return keys, labels



## 2.2 Define a Naive Pytorch DNN model

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class MLP(nn.Module):
    def __init__(self,user_num,user_dim,layer=[32,16,8]):
        
        super(MLP, self).__init__()
        self.user_Embedding = nn.Embedding(user_num,user_dim)
        self.mlp = nn.Sequential()
        for id in range(1,len(layer)):
            self.mlp.add_module("Linear_layer_%d" % id, nn.Linear(layer[id-1],layer[id]))
            self.mlp.add_module("Relu_layer_%d" % id, nn.ReLU(inplace=True))
        self.predict =  nn.Sequential(nn.Linear(layer[-1],1),nn.Sigmoid())
    
    def forward(self,x):
        user = self.user_Embedding(x)
        user = self.mlp(user)
        score = self.predict(user)
        return score

model = MLP(1000,32)



  from .autonotebook import tqdm as notebook_tqdm


## 2.3 Model training

In [12]:
from torch.utils.data import DataLoader
keys, labels = generate_random_samples(args["num_sample"], args["vocabulary_range_per_slot"], args["np_key_type"])
x_train = torch.from_numpy(keys)
y_train = torch.from_numpy(labels).float()
x_dataloader = DataLoader(x_train, batch_size=args["global_batch_size"], shuffle=True, num_workers=1, pin_memory=False, drop_last=False)
y_dataloader = DataLoader(y_train, batch_size=args["global_batch_size"], shuffle=True, num_workers=1, pin_memory=False, drop_last=False)

In [13]:
from torch.utils.data import DataLoader

print(torch.__version__)
print(model)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adagrad(model.parameters(), lr=0.001)
for epoch in range(args["iter_num"]):
    iterations_per_epoch = len(x_dataloader)
    x_iterator = iter(x_dataloader)
    y_iterator = iter(y_dataloader)
    for _ in range(iterations_per_epoch):
        optimizer.zero_grad()
        x_train = next(x_iterator)
        y_train = next(y_iterator)
        x, y = x_train, y_train
        preds = model(x).squeeze(1)
        loss = criterion(preds, y)
        loss.backward()
        optimizer.step()
    print(epoch, loss)

1.13.0+cu117
MLP(
  (user_Embedding): Embedding(1000, 32)
  (mlp): Sequential(
    (Linear_layer_1): Linear(in_features=32, out_features=16, bias=True)
    (Relu_layer_1): ReLU(inplace=True)
    (Linear_layer_2): Linear(in_features=16, out_features=8, bias=True)
    (Relu_layer_2): ReLU(inplace=True)
  )
  (predict): Sequential(
    (0): Linear(in_features=8, out_features=1, bias=True)
    (1): Sigmoid()
  )
)
0 tensor(-0., grad_fn=<DivBackward1>)
1 tensor(-0., grad_fn=<DivBackward1>)
2 tensor(-0., grad_fn=<DivBackward1>)
3 tensor(-0., grad_fn=<DivBackward1>)
4 tensor(-0., grad_fn=<DivBackward1>)
5 tensor(-0., grad_fn=<DivBackward1>)
6 tensor(-0., grad_fn=<DivBackward1>)
7 tensor(-0., grad_fn=<DivBackward1>)
8 tensor(-0., grad_fn=<DivBackward1>)
9 tensor(-0., grad_fn=<DivBackward1>)


## 2.4 Print model layers

In [23]:
for param_tensor in model.state_dict():
    print(param_tensor)
    print(param_tensor, "\t", model.state_dict()[param_tensor].size())

user_Embedding.weight
user_Embedding.weight 	 torch.Size([1000, 32])
mlp.Linear_layer_1.weight
mlp.Linear_layer_1.weight 	 torch.Size([16, 32])
mlp.Linear_layer_1.bias
mlp.Linear_layer_1.bias 	 torch.Size([16])
mlp.Linear_layer_2.weight
mlp.Linear_layer_2.weight 	 torch.Size([8, 16])
mlp.Linear_layer_2.bias
mlp.Linear_layer_2.bias 	 torch.Size([8])
predict.0.weight
predict.0.weight 	 torch.Size([1, 8])
predict.0.bias
predict.0.bias 	 torch.Size([1])


## 2.5 Save model file

In [18]:
mkdir -p model/torch_test/0

In [19]:
save_model=torch.jit.script(model)
save_model.save("model/torch_test/0/model.pt")

## 2.6 Deploye the model using Pytorch Triton Backend
Configure "torch_test" model with pytorch backend

In [14]:
%%writefile model/torch_test/config.pbtxt
name: "torch_test"
platform: "pytorch_libtorch"
max_batch_size: 32
input: [
   {
      name: "user_Embedding"
      data_type: TYPE_INT64
      dims: [-1]
   }
]
output: [
   {
      name: "prediction"
      data_type: TYPE_FP32
      dims: [-1]
   }
]

Overwriting model/torch_test/config.pbtxt


In [26]:
# Launch the Triton Server
!tritonserver --model-repository=/hugectr_backend/model/ --load-model=torch_test --model-control-mode=explicit  --allow-gpu-metrics=true

I1110 12:02:36.176673 1599 pinned_memory_manager.cc:240] Pinned memory pool is created at '0x7fbdf4000000' with size 268435456
I1110 12:02:36.181262 1599 cuda_memory_manager.cc:105] CUDA memory pool is created on device 0 with size 67108864
I1110 12:02:36.181271 1599 cuda_memory_manager.cc:105] CUDA memory pool is created on device 1 with size 67108864
I1110 12:02:36.181274 1599 cuda_memory_manager.cc:105] CUDA memory pool is created on device 2 with size 67108864
I1110 12:02:36.181277 1599 cuda_memory_manager.cc:105] CUDA memory pool is created on device 3 with size 67108864
I1110 12:02:36.551023 1599 model_repository_manager.cc:1206] loading: torch_test:0
I1110 12:02:36.952397 1599 libtorch.cc:1917] TRITONBACKEND_Initialize: pytorch
I1110 12:02:36.952420 1599 libtorch.cc:1927] Triton TRITONBACKEND API version: 1.10
I1110 12:02:36.952426 1599 libtorch.cc:1933] 'pytorch' TRITONBACKEND API version: 1.10
I1110 12:02:36.952448 1599 libtorch.cc:1966] TRITONBACKEND_ModelInitialize: torch_te

## 2.6 Send the inference request to Triton Server

In [27]:
from tritonclient.utils import *
import tritonclient.http  as httpclient
import numpy as np
import pandas as pd
import sys

model_name = 'torch_test'




with httpclient.InferenceServerClient("localhost:8000") as client:
    embedding_columns = np.array([[123,456]],dtype='int64')
    
    inputs = [
        httpclient.InferInput("user_Embedding", embedding_columns.shape,
                              np_to_triton_dtype(embedding_columns.dtype)),

    ]

    inputs[0].set_data_from_numpy(embedding_columns)
    outputs = [
        httpclient.InferRequestedOutput("prediction")
    ]

    response = client.infer(model_name,
                            inputs,
                            request_id=str(1),
                            outputs=outputs)

    result = response.get_response()
    print(result)
    print("Prediction Result:")
    print(response.as_numpy("prediction"))



{'id': '1', 'model_name': 'torch_test', 'model_version': '0', 'outputs': [{'name': 'prediction', 'datatype': 'FP32', 'shape': [1, 2, 1], 'parameters': {'binary_data_size': 8}}]}
Prediction Result:
[[[0.41792953]
  [0.43865865]]]


<a id='section3'></a>

# 3 Separate the trained navie DNN model graph into  embedding and dense(MLP)

<a id='section3.1'></a>

## 3.1 Depoly the Dense Model using Triton Backend
### 3.1.1 Define Dense Model

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class OnlyMLP(nn.Module):
    def __init__(self,user_num,user_dim,layer=[32,16,8]):
        
        super(OnlyMLP, self).__init__()
        #self.user_Embedding = nn.Embedding(user_num,user_dim)
        self.mlp = nn.Sequential()
        self.emb_dim = user_dim
        for id in range(1,len(layer)):
            self.mlp.add_module("Linear_layer_%d" % id, nn.Linear(layer[id-1],layer[id]))
            self.mlp.add_module("Relu_layer_%d" % id, nn.ReLU(inplace=True))
        self.predict =  nn.Sequential(nn.Linear(layer[-1],1),nn.Sigmoid())
    
    def forward(self,x):
        #user = self.user_Embedding(x)
        user = x.reshape(-1,self.emb_dim)
        user = self.mlp(user)
        score = self.predict(user)
        return score

dense_model = OnlyMLP(1000,32)

In [9]:
# Print dense model layers
for param_tensor in dense_model.state_dict():
    print(param_tensor)
    print(param_tensor, "\t", model.state_dict()[param_tensor].size())

mlp.Linear_layer_1.weight
mlp.Linear_layer_1.weight 	 torch.Size([16, 32])
mlp.Linear_layer_1.bias
mlp.Linear_layer_1.bias 	 torch.Size([16])
mlp.Linear_layer_2.weight
mlp.Linear_layer_2.weight 	 torch.Size([8, 16])
mlp.Linear_layer_2.bias
mlp.Linear_layer_2.bias 	 torch.Size([8])
predict.0.weight
predict.0.weight 	 torch.Size([1, 8])
predict.0.bias
predict.0.bias 	 torch.Size([1])


### 3.1.2 Load complete pre-trained navie pytorch model( Step 2.5)

In [30]:
pretrain_model = torch.load("model/torch_test/0/model.pt")
# get pre-trained model state dict
pretrain_dict=pretrain_model.state_dict()
# get dense model state dict
new_dict=  dense_model.state_dict()



In [10]:
pretrain_model = torch.load("model/torch_test/0/model.pt")
pretrain_dict=pretrain_model.state_dict()
for param_tensor in pretrain_model.state_dict():
    print(param_tensor)
    print( model.state_dict()[param_tensor].size())

user_Embedding.weight
torch.Size([1000, 32])
mlp.Linear_layer_1.weight
torch.Size([16, 32])
mlp.Linear_layer_1.bias
torch.Size([16])
mlp.Linear_layer_2.weight
torch.Size([8, 16])
mlp.Linear_layer_2.bias
torch.Size([8])
predict.0.weight
torch.Size([1, 8])
predict.0.bias
torch.Size([1])




##3.3 Remove the embedding layer from pre-trained model

In [32]:
pretrain_dict = {k:v for k,v in pretrain_dict.items() if k in new_dict}
new_dict.update(pretrain_dict)
#update dense model
dense_model.load_state_dict(new_dict)

<All keys matched successfully>

### 3.1.3 Save the dense model

In [136]:
mkdir -p model/dense_test/0/

In [34]:
save_model=torch.jit.script(dense_model)
save_model.save("model/dense_test/0/model.pt")

### 3.1.4 Deploye the dense model using Pytorch Triton Backend
Configure "dense_test" model with pytorch backend

In [11]:
%%writefile model/dense_test/config.pbtxt
name: "dense_test"
platform: "pytorch_libtorch"
max_batch_size: 0
input: [
   {
      name: "mlp.Linear_layer_1"
      data_type: TYPE_FP32
      dims: [-1]
   }
]
output: [
   {
      name: "prediction"
      data_type: TYPE_FP32
      dims: [-1]
   }
]

Overwriting model/dense_test/config.pbtxt


In [40]:
!tritonserver --model-repository=/hugectr_backend/model/ --load-model=dense_test --model-control-mode=explicit --allow-gpu-metrics=true

I1110 12:09:30.142266 2190 pinned_memory_manager.cc:240] Pinned memory pool is created at '0x7f8958000000' with size 268435456
I1110 12:09:30.146874 2190 cuda_memory_manager.cc:105] CUDA memory pool is created on device 0 with size 67108864
I1110 12:09:30.146885 2190 cuda_memory_manager.cc:105] CUDA memory pool is created on device 1 with size 67108864
I1110 12:09:30.146888 2190 cuda_memory_manager.cc:105] CUDA memory pool is created on device 2 with size 67108864
I1110 12:09:30.146891 2190 cuda_memory_manager.cc:105] CUDA memory pool is created on device 3 with size 67108864
I1110 12:09:30.528751 2190 model_repository_manager.cc:1206] loading: dense_test:0
I1110 12:09:30.916909 2190 libtorch.cc:1917] TRITONBACKEND_Initialize: pytorch
I1110 12:09:30.916938 2190 libtorch.cc:1927] Triton TRITONBACKEND API version: 1.10
I1110 12:09:30.916961 2190 libtorch.cc:1933] 'pytorch' TRITONBACKEND API version: 1.10
I1110 12:09:30.916984 2190 libtorch.cc:1966] TRITONBACKEND_ModelInitialize: dense_te

In [69]:
#send the inference request to dense model
from tritonclient.utils import *
import tritonclient.http  as httpclient
import numpy as np
import pandas as pd
import sys

model_name = 'dense_test'




with httpclient.InferenceServerClient("localhost:8000") as client:
    embedding_columns = np.array([[np.random.uniform(0.0,1) for i in range(32)]],dtype='float32')
    
    inputs = [
        httpclient.InferInput("mlp.Linear_layer_1", embedding_columns.shape,
                              np_to_triton_dtype(embedding_columns.dtype)),

    ]

    inputs[0].set_data_from_numpy(embedding_columns)
    outputs = [
        httpclient.InferRequestedOutput("prediction")
    ]

    response = client.infer(model_name,
                            inputs,
                            request_id=str(1),
                            outputs=outputs)

    result = response.get_response()
    print(result)
    print("Prediction Result:")
    print(response.as_numpy("prediction"))


{'id': '1', 'model_name': 'dense_test', 'model_version': '0', 'outputs': [{'name': 'prediction', 'datatype': 'FP32', 'shape': [1, 1], 'parameters': {'binary_data_size': 4}}]}
Prediction Result:
[[0.43013266]]


<a id='section3.2'></a>

## 3.2 Deploy the embedding part using HPS Triton Backend
### 3.2.1 Configure HPS backend for Embedding part

In [56]:
mkdir -p model/hps_test/0/hps_sparse.model

In [59]:
%%writefile model/hps_test/config.pbtxt
name: "hps_test"
backend: "hps"
max_batch_size:32,
input [
  {
    name: "KEYS"
    data_type: TYPE_INT64
    dims: [-1]
  },
  {
    name: "NUMKEYS"
    data_type: TYPE_INT32
    dims: [-1]
  }
]
output [
  {
    name: "OUTPUT0"
    data_type: TYPE_FP32
    dims: [ -1 ]
  }
]
version_policy: {
        specific:{versions: 0}
},
instance_group [
  {
    count: 1
    kind : KIND_GPU
    gpus:[0]
  }
]

parameters [

]

Overwriting model/hps_test/config.pbtxt


In [60]:
%%writefile model/hps.json
{
    "models": [{
    "model": "hps_test",
    "sparse_files": ["model/hps_test/0/hps_sparse.model"],
    "num_of_worker_buffer_in_pool": 3,
    "embedding_table_names":["0"],
    "num_of_refresher_buffer_in_pool":1,
    "embedding_vecsize_per_table":[32],
    "num_of_refresher_buffer_in_pool":0,
    "maxnum_catfeature_query_per_table_per_sample":[1],
    "deployed_device_list":[0],
    "max_batch_size":32,
    "default_value_for_each_table":[0.0],
    "cache_refresh_percentage_per_iteration":0,
    "hit_rate_threshold":1.1,
    "gpucacheper":1.0,
    "gpucache":true
    }]
}

Overwriting model/hps.json


### 3.2.2 Conver the torch-format embedding file to HPS-format embedding file

In [19]:
pretrain_model = torch.load("model/torch_test/0/model.pt")
pretrain_model.state_dict().keys()

odict_keys(['user_Embedding.weight', 'mlp.Linear_layer_1.weight', 'mlp.Linear_layer_1.bias', 'mlp.Linear_layer_2.weight', 'mlp.Linear_layer_2.bias', 'predict.0.weight', 'predict.0.bias'])

In [59]:
def convert_to_sparse_model(embeddings_weights, embedding_table_path, embedding_vec_size):
    """
    Convert the lookup part of the model to a format supported by HPS (key-vector pair files),
    the embedding weights of the trained dense model will be reloaded.
    
    Outputs(key-vector pair files) will be saved to defined sparse model path
    """
    
    with open("{}/key".format(embedding_table_path), 'wb') as key_file, \
        open("{}/emb_vector".format(embedding_table_path), 'wb') as vec_file:
        for key in range(embeddings_weights.shape[0]):
            vec = embeddings_weights[key].data.tolist()
            key_struct = struct.pack('q', key)
            vec_struct = struct.pack(str(embedding_vec_size) + "f", *vec)
            key_file.write(key_struct)
            vec_file.write(vec_struct)

In [60]:
convert_to_sparse_model(pretrain_model.state_dict()['user_Embedding.weight'], "model/hps_test/0/hps_sparse.model", 32)

### 3.2.3 Launch Triton Server to verify HPS Backend

In [50]:
!tritonserver --model-repository=/hugectr_backend/model/ --load-model=hps_test --model-control-mode=explicit --backend-directory=/usr/local/hugectr/backends --backend-config=hps,ps=/hugectr_backend/model/hps.json

I1110 12:12:50.076774 2411 pinned_memory_manager.cc:240] Pinned memory pool is created at '0x7f1b28000000' with size 268435456
I1110 12:12:50.081315 2411 cuda_memory_manager.cc:105] CUDA memory pool is created on device 0 with size 67108864
I1110 12:12:50.081324 2411 cuda_memory_manager.cc:105] CUDA memory pool is created on device 1 with size 67108864
I1110 12:12:50.081327 2411 cuda_memory_manager.cc:105] CUDA memory pool is created on device 2 with size 67108864
I1110 12:12:50.081330 2411 cuda_memory_manager.cc:105] CUDA memory pool is created on device 3 with size 67108864
I1110 12:12:50.456462 2411 model_repository_manager.cc:1206] loading: hps_test:0
I1110 12:12:50.474949 2411 hps.cc:61] TRITONBACKEND_Initialize: hps
I1110 12:12:50.474996 2411 hps.cc:68] Triton TRITONBACKEND API version: 1.10
I1110 12:12:50.475005 2411 hps.cc:72] 'hps' TRITONBACKEND API version: 1.9
I1110 12:12:50.475035 2411 hps.cc:95] The Hierarchical Parameter Server Backend Repository location: /usr/local/huge

I1110 12:12:50.756500 2411 metrics.cc:650] Collecting metrics for GPU 0: Tesla V100-SXM2-32GB
I1110 12:12:50.756523 2411 metrics.cc:650] Collecting metrics for GPU 1: Tesla V100-SXM2-32GB
I1110 12:12:50.756530 2411 metrics.cc:650] Collecting metrics for GPU 2: Tesla V100-SXM2-32GB
I1110 12:12:50.756538 2411 metrics.cc:650] Collecting metrics for GPU 3: Tesla V100-SXM2-32GB
I1110 12:12:50.759481 2411 tritonserver.cc:2176] 
+----------------------------------+------------------------------------------+
| Option                           | Value                                    |
+----------------------------------+------------------------------------------+
| server_id                        | triton                                   |
| server_version                   | 2.24.0                                   |
| server_extensions                | classification sequence model_repository |
|                                  |  model_repository(unload_dependents) sch |
|             

In [49]:
# send embedding key to HPS backend
from tritonclient.utils import *
import tritonclient.http  as httpclient
import numpy as np
import pandas as pd
import sys

model_name = 'hps_test'



with httpclient.InferenceServerClient("localhost:8000") as client:

    embedding_columns = np.array(torch.randint(low=0, high=999, size=(1,32)).numpy().tolist(),dtype='int64')
    row_ptrs = np.array([[32]],dtype='int32')

    inputs = [
        httpclient.InferInput("KEYS", embedding_columns.shape,
                              np_to_triton_dtype(embedding_columns.dtype)),
        httpclient.InferInput("NUMKEYS", row_ptrs.shape,
                              np_to_triton_dtype(row_ptrs.dtype)),

    ]

    inputs[0].set_data_from_numpy(embedding_columns)
    inputs[1].set_data_from_numpy(row_ptrs)
    outputs = [
        httpclient.InferRequestedOutput("OUTPUT0")
    ]

    outputs = [
        httpclient.InferRequestedOutput("OUTPUT0")
    ]

    response = client.infer(model_name,
                            inputs,
                            request_id=str(1),
                            outputs=outputs)

    result = response.get_response()
    print(result)
    print("Prediction Result:")
    print(response.as_numpy("OUTPUT0"))
    print(response.as_numpy("OUTPUT0").shape)





{'id': '1', 'model_name': 'hps_test', 'model_version': '0', 'parameters': {'NumSample': 32, 'DeviceID': 0}, 'outputs': [{'name': 'OUTPUT0', 'datatype': 'FP32', 'shape': [1024], 'parameters': {'binary_data_size': 4096}}]}
Prediction Result:
[ 0.42441678 -3.7125523   1.2965899  ...  1.5860947  -0.10153087
 -0.4881016 ]
(1024,)


<a id='section3.3'></a>

## 3.3 Configure "ensemble_model" Triton backend for Embedding and Dense model
### 3.3.1 Configure "ensemble_model" Triton backend for Embedding deployment(HPS backend) and Dense model(Pytorch backend)


In [64]:
mkdir -p model/ensemble_model/

In [21]:
%%writefile model/ensemble_model/config.pbtxt
name: "ensemble_model"
platform: "ensemble"
max_batch_size: 32
input [
  {
    name: "EMB_KEY"
    data_type: TYPE_INT64
    dims: [ -1 ]
  },
  {
    name: "EMB_N_KEY"
    data_type: TYPE_INT32
    dims: [ -1 ]
  }
]
output [
  {
    name: "DENSE_OUTPUT"
    data_type: TYPE_FP32
    dims: [-1]
  }
]
ensemble_scheduling {
  step [
    {
      model_name: "hps_test"
      model_version: -1
      input_map {
        key: "KEYS"
        value: "EMB_KEY"
      }
      input_map {
        key: "NUMKEYS"
        value: "EMB_N_KEY"
      }
      output_map {
        key: "OUTPUT0"
        value: "LOOKUP_VECTORS"
      }
    },
    {
      model_name: "dense_test"
      model_version: -1
      input_map {
        key: "mlp.Linear_layer_1"
        value: "LOOKUP_VECTORS"
      }
      output_map {
        key: "prediction"
        value: "DENSE_OUTPUT"
      }
    }
  ]
}

Overwriting model/ensemble_model/config.pbtxt


### 3.3.2 Launch Triton Server to verify ensemble Backend

In [75]:
!tritonserver --model-repository=/hugectr_backend/model/ --load-model=ensemble_model --model-control-mode=explicit  --allow-gpu-metrics=true --backend-config=hps,ps=/hugectr_backend/model/hps.json

I1110 12:32:25.378918 2862 pinned_memory_manager.cc:240] Pinned memory pool is created at '0x7fbf80000000' with size 268435456
I1110 12:32:25.383488 2862 cuda_memory_manager.cc:105] CUDA memory pool is created on device 0 with size 67108864
I1110 12:32:25.383496 2862 cuda_memory_manager.cc:105] CUDA memory pool is created on device 1 with size 67108864
I1110 12:32:25.383500 2862 cuda_memory_manager.cc:105] CUDA memory pool is created on device 2 with size 67108864
I1110 12:32:25.383502 2862 cuda_memory_manager.cc:105] CUDA memory pool is created on device 3 with size 67108864
I1110 12:32:25.761152 2862 model_repository_manager.cc:1206] loading: dense_test:0
I1110 12:32:25.761386 2862 model_repository_manager.cc:1206] loading: hps_test:0
I1110 12:32:26.162400 2862 libtorch.cc:1917] TRITONBACKEND_Initialize: pytorch
I1110 12:32:26.162424 2862 libtorch.cc:1927] Triton TRITONBACKEND API version: 1.10
I1110 12:32:26.162428 2862 libtorch.cc:1933] 'pytorch' TRITONBACKEND API version: 1.10
I11

I1110 12:32:27.249859 2862 hps.cc:307] TRITONBACKEND_ModelInstanceInitialize: hps_test_0 (device 0)
I1110 12:32:27.249937 2862 model_instance_state.cpp:81] Triton Model Instance Initialization on device 0
I1110 12:32:27.249948 2862 model_instance_state.cpp:91] Categorical Feature buffer allocation: 
I1110 12:32:27.250375 2862 model_instance_state.cpp:99] Number of Categorical Feature per Table buffer allocation: 
I1110 12:32:27.250487 2862 model_instance_state.cpp:109] Look_up result buffer allocation: 
I1110 12:32:27.250570 2862 hps.cc:320] ******Loading HPS ******
I1110 12:32:27.250577 2862 model_instance_state.cpp:140] The model origin json configuration file path is: 
[HCTR][12:32:27.250][INFO][RK0][main]: Creating lookup session for hps_test on device: 0
I1110 12:32:27.250662 2862 model_instance_state.cpp:147] ******Loading HugeCTR lookup session successfully
I1110 12:32:27.250711 2862 libtorch.cc:2010] TRITONBACKEND_ModelInstanceInitialize: dense_test (GPU device 1)
I1110 12:32:2

In [73]:
from tritonclient.utils import *
import tritonclient.http  as httpclient
import numpy as np
import pandas as pd
import sys

model_name = 'ensemble_model'




with httpclient.InferenceServerClient("localhost:8000") as client:
    embedding_columns = np.array([[123]],dtype='int64')
    row_ptrs = np.array([[1]],dtype='int32')

    inputs = [
        httpclient.InferInput("EMB_KEY", embedding_columns.shape,
                              np_to_triton_dtype(embedding_columns.dtype)),
        httpclient.InferInput("EMB_N_KEY", row_ptrs.shape,
                              np_to_triton_dtype(row_ptrs.dtype)),

    ]

    inputs[0].set_data_from_numpy(embedding_columns)
    inputs[1].set_data_from_numpy(row_ptrs)
    outputs = [
        httpclient.InferRequestedOutput("DENSE_OUTPUT")
    ]

    outputs = [
        httpclient.InferRequestedOutput("DENSE_OUTPUT")
    ]

    response = client.infer(model_name,
                            inputs,
                            request_id=str(1),
                            outputs=outputs)

    result = response.get_response()
    print(result)
    print("Prediction Result:")
    print(response.as_numpy("DENSE_OUTPUT"))

{'id': '1', 'model_name': 'ensemble_model', 'model_version': '0', 'parameters': {'sequence_id': 0, 'sequence_start': False, 'sequence_end': False}, 'outputs': [{'name': 'DENSE_OUTPUT', 'datatype': 'FP32', 'shape': [1, 1], 'parameters': {'binary_data_size': 4}}]}
Prediction Result:
[[0.41792953]]


<a id='section4'></a>

# 4 Use TensorRT to speed up dense model inference and combine with HPS Backend

## 4.1 Convert Dense model to Onnx 

In [76]:
import torch
import torch.nn as nn
import torch.nn.functional as F
dense_model = torch.load("model/dense_test/0/model.pt")

In [77]:
import numpy as np
dense_model(torch.from_numpy(np.array([np.random.uniform(0.0,1) for i in range(32)])).float())
dense_model(torch.randn(1,32))

tensor([[0.4337]], grad_fn=<DifferentiableGraphBackward>)

In [78]:

BATCH_SIZE = 32
dummy_input=torch.randn(BATCH_SIZE,32)
import torch.onnx
torch.onnx.export(dense_model, dummy_input,"model/dense_onnx_model.onnx", verbose = True, input_names = ["vectors"], output_names = ["prediction"],dynamic_axes = {'vectors' : {0 : 'BATCH_SIZE'}})

Exported graph: graph(%vectors : Float(*, 32, strides=[32, 1], requires_grad=0, device=cpu),
      %mlp.Linear_layer_1.weight : Float(16, 32, strides=[32, 1], requires_grad=0, device=cpu),
      %mlp.Linear_layer_1.bias : Float(16, strides=[1], requires_grad=0, device=cpu),
      %mlp.Linear_layer_2.weight : Float(8, 16, strides=[16, 1], requires_grad=0, device=cpu),
      %mlp.Linear_layer_2.bias : Float(8, strides=[1], requires_grad=0, device=cpu),
      %predict.0.weight : Float(1, 8, strides=[8, 1], requires_grad=0, device=cpu),
      %predict.0.bias : Float(1, strides=[1], requires_grad=0, device=cpu)):
  %/Constant_output_0 : Long(2, strides=[1], device=cpu) = onnx::Constant[value= -1  32 [ CPULongType{2} ], onnx_name="/Constant"](), scope: OnlyMLP:: # /tmp/ipykernel_128/146538850.py:19:15
  %/Reshape_output_0 : Float(*, *, device=cpu) = onnx::Reshape[allowzero=0, onnx_name="/Reshape"](%vectors, %/Constant_output_0), scope: OnlyMLP:: # /tmp/ipykernel_128/146538850.py:19:15
  %/ml



## 4.2 Conver onnx to tensorrt

In [81]:
!export PATH=/usr/src/tensorrt/bin/:$PATH

In [82]:
!mkdir -p model/dense_trt/0

In [83]:
!trtexec --onnx=model/dense_onnx_model.onnx  --saveEngine=model/dense_trt/0/dense_dynamic.trt --optShapes=vectors:32x32 --minShapes=vectors:1x32 --maxShapes=vectors:32x32

&&&& RUNNING TensorRT.trtexec [TensorRT v8501] # trtexec --onnx=model/dense_onnx_model.onnx --saveEngine=model/dense_trt/0/dense_dynamic.trt --optShapes=vectors:32x32 --minShapes=vectors:1x32 --maxShapes=vectors:32x32
[11/10/2022-12:38:10] [I] === Model Options ===
[11/10/2022-12:38:10] [I] Format: ONNX
[11/10/2022-12:38:10] [I] Model: model/dense_onnx_model.onnx
[11/10/2022-12:38:10] [I] Output:
[11/10/2022-12:38:10] [I] === Build Options ===
[11/10/2022-12:38:10] [I] Max batch: explicit batch
[11/10/2022-12:38:10] [I] Memory Pools: workspace: default, dlaSRAM: default, dlaLocalDRAM: default, dlaGlobalDRAM: default
[11/10/2022-12:38:10] [I] minTiming: 1
[11/10/2022-12:38:10] [I] avgTiming: 8
[11/10/2022-12:38:10] [I] Precision: FP32
[11/10/2022-12:38:10] [I] LayerPrecisions: 
[11/10/2022-12:38:10] [I] Calibration: 
[11/10/2022-12:38:10] [I] Refit: Disabled
[11/10/2022-12:38:10] [I] Sparsity: Disabled
[11/10/2022-12:38:10] [I] Safe mode: Disabled
[11/10/2022-12:38:10] [I] DirectIO mode

[11/10/2022-12:38:21] [I] 
[11/10/2022-12:38:21] [I] === Performance summary ===
[11/10/2022-12:38:21] [I] Throughput: 20532 qps
[11/10/2022-12:38:21] [I] Latency: min = 0.0257568 ms, max = 4.00183 ms, mean = 0.0295817 ms, median = 0.0289307 ms, percentile(90%) = 0.0292969 ms, percentile(95%) = 0.029541 ms, percentile(99%) = 0.0423584 ms
[11/10/2022-12:38:21] [I] Enqueue Time: min = 0.0134277 ms, max = 3.35352 ms, mean = 0.0149384 ms, median = 0.0144043 ms, percentile(90%) = 0.0150146 ms, percentile(95%) = 0.0153809 ms, percentile(99%) = 0.031311 ms
[11/10/2022-12:38:21] [I] H2D Latency: min = 0.00378418 ms, max = 3.95569 ms, mean = 0.00693778 ms, median = 0.0067749 ms, percentile(90%) = 0.00695801 ms, percentile(95%) = 0.00708008 ms, percentile(99%) = 0.00726318 ms
[11/10/2022-12:38:21] [I] GPU Compute Time: min = 0.0166016 ms, max = 3.34741 ms, mean = 0.0174177 ms, median = 0.0170898 ms, percentile(90%) = 0.017334 ms, percentile(95%) = 0.0175781 ms, percentile(99%) = 0.027832 m

### If the tensorrt is not installed in the merlin container, you need to run the following command in the host to get the tensorrt engine of dense model

In [None]:
docker run --runtime=nvidia --cap-add SYS_NICE --gpus=all --net=host -u root -v $(pwd):/hugectr_backend -w /hugectr_backend nvcr.io/nvidia/tensorrt:22.11-py3 trtexec --onnx=/hugectr_backend/model/dense_onnx_model.onnx  --saveEngine=/hugectr_backend/model/dense_trt/0/dense_dynamic.trt --optShapes=vectors:32x32 --minShapes=vectors:1x32 --maxShapes=vectors:32x32

### Create the Tensorrt backend configuration file

In [24]:
%%writefile model/dense_trt/config.pbtxt
platform: "tensorrt_plan"
default_model_filename: "dense_dynamic.trt"
backend: "tensorrt"
max_batch_size: 0

input [
  {
    name: "vectors"
    data_type: TYPE_FP32
    dims: [1024]
    reshape: { shape: [32,32] }
  }
]
output [
  {
      name: "prediction"
      data_type: TYPE_FP32
      dims: [-1, 1]
  }
]

instance_group [
  {
    kind: KIND_GPU
  }
]

Overwriting model/dense_trt/config.pbtxt


# HPS + TensorRT ensemble

In [25]:
%%writefile model/ensemble_model/config.pbtxt
name: "ensemble_model"
platform: "ensemble"
max_batch_size: 32
input [
  {
    name: "EMB_KEY"
    data_type: TYPE_INT64
    dims: [ -1 ]
  },
  {
    name: "EMB_N_KEY"
    data_type: TYPE_INT32
    dims: [ -1 ]
  }
]
output [
  {
    name: "DENSE_OUTPUT"
    data_type: TYPE_FP32
    dims: [-1]
  }
]
ensemble_scheduling {
  step [
    {
      model_name: "hps_test"
      model_version: -1
      input_map {
        key: "KEYS"
        value: "EMB_KEY"
      }
      input_map {
        key: "NUMKEYS"
        value: "EMB_N_KEY"
      }
      output_map {
        key: "OUTPUT0"
        value: "LOOKUP_VECTORS"
      }
    },
    {
      model_name: "dense_trt"
      model_version: -1
      input_map {
        key: "vectors"
        value: "LOOKUP_VECTORS"
      }
      output_map {
        key: "prediction"
        value: "DENSE_OUTPUT"
      }
    }
  ]
}

Overwriting model/ensemble_model/config.pbtxt


In [95]:
!tritonserver --model-repository=/hugectr_backend/model/ --load-model=ensemble_model --model-control-mode=explicit  --allow-gpu-metrics=true --backend-config=hps,ps=/hugectr_backend/hps.json

I1110 12:47:38.008801 3326 pinned_memory_manager.cc:240] Pinned memory pool is created at '0x7f8e1e000000' with size 268435456
I1110 12:47:38.013449 3326 cuda_memory_manager.cc:105] CUDA memory pool is created on device 0 with size 67108864
I1110 12:47:38.013458 3326 cuda_memory_manager.cc:105] CUDA memory pool is created on device 1 with size 67108864
I1110 12:47:38.013461 3326 cuda_memory_manager.cc:105] CUDA memory pool is created on device 2 with size 67108864
I1110 12:47:38.013464 3326 cuda_memory_manager.cc:105] CUDA memory pool is created on device 3 with size 67108864
I1110 12:47:38.399856 3326 model_repository_manager.cc:1206] loading: dense_trt:0
I1110 12:47:38.399985 3326 model_repository_manager.cc:1206] loading: hps_test:0
I1110 12:47:38.425984 3326 tensorrt.cc:5427] TRITONBACKEND_Initialize: tensorrt
I1110 12:47:38.426016 3326 tensorrt.cc:5437] Triton TRITONBACKEND API version: 1.10
I1110 12:47:38.426023 3326 tensorrt.cc:5443] 'tensorrt' TRITONBACKEND API version: 1.10
I1

I1110 12:47:39.261674 3326 logging.cc:49] Loaded engine size: 0 MiB
I1110 12:47:39.267730 3326 logging.cc:49] [MemUsageChange] TensorRT-managed allocation in engine deserialization: CPU +0, GPU +0, now: CPU 0, GPU 0 (MiB)
I1110 12:47:39.270010 3326 logging.cc:49] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 0 (MiB)
W1110 12:47:39.270021 3326 logging.cc:46] CUDA lazy loading is not enabled. Enabling it can significantly reduce device memory usage. See `CUDA_MODULE_LOADING` in https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars
I1110 12:47:39.270519 3326 tensorrt.cc:1541] Created instance dense_trt_0 on GPU 1 with stream priority 0 and optimization profile default[0];
I1110 12:47:39.270736 3326 tensorrt.cc:5587] TRITONBACKEND_ModelInstanceInitialize: dense_trt_0 (GPU device 2)
I1110 12:47:39.485668 3326 logging.cc:49] Loaded engine size: 0 MiB
I1110 12:47:39.491818 3326 logging.cc:49] [MemUsageChange] Te

In [63]:
from tritonclient.utils import *
import tritonclient.http  as httpclient
import numpy as np
import pandas as pd
import sys

model_name = 'ensemble_model'

with httpclient.InferenceServerClient("localhost:8000") as client:
    embedding_columns = np.array(torch.randint(low=0, high=999, size=(1,32)).numpy().tolist(),dtype='int64')
    row_ptrs = np.array([[32]],dtype='int32')

    inputs = [
        httpclient.InferInput("EMB_KEY", embedding_columns.shape,
                              np_to_triton_dtype(embedding_columns.dtype)),
        httpclient.InferInput("EMB_N_KEY", row_ptrs.shape,
                              np_to_triton_dtype(row_ptrs.dtype)),

    ]

    inputs[0].set_data_from_numpy(embedding_columns)
    inputs[1].set_data_from_numpy(row_ptrs)
    outputs = [
        httpclient.InferRequestedOutput("DENSE_OUTPUT")
    ]

    outputs = [
        httpclient.InferRequestedOutput("DENSE_OUTPUT")
    ]

    response = client.infer(model_name,
                            inputs,
                            request_id=str(1),
                            outputs=outputs)

    result = response.get_response()
    print(result)
    print("Prediction Result:")
    print(response.as_numpy("DENSE_OUTPUT"))

{'id': '1', 'model_name': 'ensemble_model', 'model_version': '0', 'parameters': {'sequence_id': 0, 'sequence_start': False, 'sequence_end': False}, 'outputs': [{'name': 'DENSE_OUTPUT', 'datatype': 'FP32', 'shape': [32, 1], 'parameters': {'binary_data_size': 128}}]}
Prediction Result:
[[0.42981684]
 [0.4249443 ]
 [0.42793524]
 [0.4317903 ]
 [0.43148506]
 [0.4266228 ]
 [0.42696753]
 [0.42348012]
 [0.39775002]
 [0.432141  ]
 [0.43840668]
 [0.4015409 ]
 [0.43687832]
 [0.43275222]
 [0.4350475 ]
 [0.42040417]
 [0.4362736 ]
 [0.43107143]
 [0.43180412]
 [0.41212332]
 [0.40849888]
 [0.43725652]
 [0.42471766]
 [0.4306074 ]
 [0.41305465]
 [0.3939272 ]
 [0.43355086]
 [0.40327936]
 [0.42544615]
 [0.39968377]
 [0.4147291 ]
 [0.39775002]]
