In [None]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

<a id='section1'></a>

# 1.Overview

This notebook is a tutorial on how to use the Hierarchical Parameter Server (HPS) Triton backend to look up embedding keys for an inference service, and how to combine it with the PyTorch and TensorRT Triton backends.

1. <a href='#section1'>Overview</a> 
2. [Generate synthetic datasets to train a native PyTorch DNN model and deploy it using the PyTorch Triton backend](#section2) 
3. [Separate the trained DNN model graph into two, embedding lookup and dense model graph](#section3)  
    3.1 [Deploy dense part model using pytorch Triton backend](#section3.1)  
    3.2 [Deploy the embedding part using HPS Triton Backend](#section3.2)   
    3.3 [Configure "ensemble_model" Triton backend for Embedding and Dense model](#section3.3) 
4. <a href='#section4'>Use TensorRT to speed up dense model inference and combine with HPS Backend</a>

<a id='section2'></a>

# 2. Train PyTorch DNN Model Based on Synthetic Dataset

In [1]:
import os
import numpy as np
import torch 
import struct

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Settings shared by the cells below: training configuration and data types.
args = {
    # model training settings
    "gpu_num": 4,                               # number of available GPUs
    "num_sample": 64,                           # number of training samples
    "iter_num": 20,                             # number of training iterations
    "embed_vec_size": 32,                       # dimension of the embedding vectors
    "global_batch_size": 32,                    # global batch size across all GPUs
    "max_vocabulary_size": 1000,                # number of embeddings in the embedding table
    "vocabulary_range_per_slot": [[0, 1000]],   # range of embedding keys in the embedding table
    # data types
    "np_key_type": np.int64,
    "np_vector_type": np.float32,
    "tf_key_type": torch.int64,
    "tf_vector_type": torch.float32,
}

# GPU environment configuration for model training: expose GPUs 0..gpu_num-1
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
    str(gpu_id) for gpu_id in range(args["gpu_num"])
)

## 2.1 Generate training data

In [3]:

def generate_random_samples(num_samples, vocabulary_range_per_slot, key_dtype = np.int64):
    """
    Build a random set of embedding keys and binary labels.

    One key column is drawn per slot, uniformly from that slot's half-open
    range ``[low, high)``.

    Args:
        num_samples: number of rows to generate.
        vocabulary_range_per_slot: sequence of ``[low, high]`` pairs, one per slot.
        key_dtype: integer dtype of the generated keys.

    Returns:
        ``(keys, labels)`` where ``keys`` has shape ``(num_samples, num_slots)``
        and ``labels`` has shape ``(num_samples, 1)`` with values 0 or 1.
    """
    per_slot_columns = [
        np.random.randint(low=slot_range[0],
                          high=slot_range[1],
                          size=(num_samples, 1),
                          dtype=key_dtype)
        for slot_range in vocabulary_range_per_slot
    ]
    # Place the per-slot columns side by side: one column per slot.
    keys = np.concatenate(per_slot_columns, axis=1)
    labels = np.random.randint(low=0, high=2, size=(num_samples, 1))
    return keys, labels



## 2.2 Define a Naive Pytorch DNN model

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class MLP(nn.Module):
    """Embedding lookup followed by a small MLP and a sigmoid scoring head.

    Args:
        user_num: number of rows in the embedding table.
        user_dim: width of each embedding vector; it feeds the first Linear
            layer, whose input width is ``layer[0]``.
        layer: layer widths of the MLP. Consecutive pairs define the Linear
            layers, and the last width feeds the scoring head.
    """
    def __init__(self,user_num,user_dim,layer=[32,16,8]):
        super().__init__()
        self.user_Embedding = nn.Embedding(user_num, user_dim)
        # Sub-modules are registered by name, so the state_dict keys read
        # "mlp.Linear_layer_<i>.weight" / "mlp.Linear_layer_<i>.bias".
        hidden_stack = nn.Sequential()
        for idx, (width_in, width_out) in enumerate(zip(layer[:-1], layer[1:]), start=1):
            hidden_stack.add_module("Linear_layer_%d" % idx, nn.Linear(width_in, width_out))
            hidden_stack.add_module("Relu_layer_%d" % idx, nn.ReLU(inplace=True))
        self.mlp = hidden_stack
        self.predict = nn.Sequential(nn.Linear(layer[-1], 1), nn.Sigmoid())

    def forward(self,x):
        """Look up the embeddings for keys ``x`` and return their sigmoid scores."""
        embedded = self.user_Embedding(x)
        hidden = self.mlp(embedded)
        return self.predict(hidden)

model = MLP(user_num=1000, user_dim=32)



## 2.3 Model training

In [None]:
from torch.utils.data import DataLoader

# Generate the synthetic training set: integer embedding keys and 0/1 labels.
keys, labels = generate_random_samples(args["num_sample"], args["vocabulary_range_per_slot"], args["np_key_type"])
x_train = torch.from_numpy(keys)
y_train = torch.from_numpy(labels).float()

# Keys and labels are served by two separate loaders that the training loop
# advances together. Shuffling each loader independently would pair keys with
# the labels of other samples, so both loaders keep the original order (the
# samples are randomly generated, so shuffling adds nothing here).
x_dataloader = DataLoader(x_train, batch_size=args["global_batch_size"], shuffle=False, num_workers=1, pin_memory=False, drop_last=False)
y_dataloader = DataLoader(y_train, batch_size=args["global_batch_size"], shuffle=False, num_workers=1, pin_memory=False, drop_last=False)

In [None]:
from torch.utils.data import DataLoader

print(torch.__version__)
print(model)
# The model ends in a Sigmoid and emits a single probability per sample, so
# binary cross-entropy is the matching loss. nn.CrossEntropyLoss would apply a
# softmax over that single column, which makes the loss identically zero and
# leaves the model untrained.
criterion = nn.BCELoss()
optimizer = torch.optim.Adagrad(model.parameters(), lr=0.001)
for epoch in range(args["iter_num"]):
    # Both loaders yield the same number of batches; zip advances them together
    # without overwriting the x_train / y_train tensors defined earlier.
    for x, y in zip(x_dataloader, y_dataloader):
        optimizer.zero_grad()
        # model(x) has a trailing singleton axis per key; squeeze(1) drops the
        # key axis so preds lines up with the labels.
        preds = model(x).squeeze(1)
        loss = criterion(preds, y)
        loss.backward()
        optimizer.step()
    # Report the loss of the last batch of this epoch.
    print(epoch, loss.item())

## 2.4 Print model layers

In [None]:
# List every state-dict entry: its name, then the name together with the tensor size.
for layer_name, weights in model.state_dict().items():
    print(layer_name)
    print(layer_name, "\t", weights.size())

## 2.5 Save model file

In [None]:
mkdir -p model/torch_test/0

In [None]:
save_model=torch.jit.script(model)
save_model.save("model/torch_test/0/model.pt")

## 2.6 Deploy the model using the PyTorch Triton Backend
Configure the "torch_test" model with the PyTorch backend

In [None]:
%%writefile model/torch_test/config.pbtxt
name: "torch_test"
platform: "pytorch_libtorch"
max_batch_size: 32
input: [
   {
      name: "user_Embedding"
      data_type: TYPE_INT64
      dims: [-1]
   }
]
output: [
   {
      name: "prediction"
      data_type: TYPE_FP32
      dims: [-1]
   }
]

In [None]:
# Launch the Triton Server
!tritonserver --model-repository=/hugectr_backend/model/ --load-model=torch_test --model-control-mode=explicit  --allow-gpu-metrics=true

## 2.7 Send the inference request to Triton Server

In [None]:
from tritonclient.utils import *
import tritonclient.http  as httpclient
import numpy as np
import pandas as pd
import sys

model_name = 'torch_test'

# Send one request with two embedding keys to the "torch_test" model and print the reply.
with httpclient.InferenceServerClient("localhost:8000") as client:
    embedding_columns = np.array([[123,456]],dtype='int64')

    # Describe the input tensor (name, shape, Triton dtype), then attach the data.
    key_input = httpclient.InferInput(
        "user_Embedding",
        embedding_columns.shape,
        np_to_triton_dtype(embedding_columns.dtype),
    )
    key_input.set_data_from_numpy(embedding_columns)
    inputs = [key_input]

    outputs = [httpclient.InferRequestedOutput("prediction")]

    response = client.infer(model_name, inputs, request_id=str(1), outputs=outputs)

    result = response.get_response()
    print(result)
    print("Prediction Result:")
    print(response.as_numpy("prediction"))



<a id='section3'></a>

# 3 Separate the trained naive DNN model graph into embedding and dense (MLP) parts

<a id='section3.1'></a>

## 3.1 Deploy the Dense Model using Triton Backend
### 3.1.1 Define Dense Model

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class OnlyMLP(nn.Module):
    """Dense part of the model: the MLP and scoring head without the embedding table.

    Args:
        user_num: accepted for signature parity with ``MLP``; not used here.
        user_dim: width of one embedding vector; the input is reshaped to
            rows of this width.
        layer: layer widths of the MLP. Consecutive pairs define the Linear
            layers, and the last width feeds the scoring head.
    """
    def __init__(self,user_num,user_dim,layer=[32,16,8]):
        super().__init__()
        # Sub-module names are kept identical to MLP so the pre-trained
        # "mlp.*" and "predict.*" weights can be loaded by key.
        hidden_stack = nn.Sequential()
        self.mlp = hidden_stack
        self.emb_dim = user_dim
        for idx, (width_in, width_out) in enumerate(zip(layer[:-1], layer[1:]), start=1):
            hidden_stack.add_module("Linear_layer_%d" % idx, nn.Linear(width_in, width_out))
            hidden_stack.add_module("Relu_layer_%d" % idx, nn.ReLU(inplace=True))
        self.predict = nn.Sequential(nn.Linear(layer[-1], 1), nn.Sigmoid())

    def forward(self,x):
        """Score already looked-up embedding vectors ``x``."""
        # Flatten whatever shape arrives into rows of emb_dim values.
        vectors = x.reshape(-1, self.emb_dim)
        hidden = self.mlp(vectors)
        return self.predict(hidden)

dense_model = OnlyMLP(user_num=1000, user_dim=32)

In [None]:
# Print dense model layers: each state-dict name, then the name with its tensor size.
# Sizes are read from dense_model itself rather than from the full `model`.
for param_name, param in dense_model.state_dict().items():
    print(param_name)
    print(param_name, "\t", param.size())

### 3.1.2 Load the complete pre-trained naive PyTorch model (Step 2.5)

In [None]:
# model.pt was written with torch.jit.script(...).save(...), i.e. it is a
# TorchScript archive, so torch.jit.load is the matching loader.
pretrain_model = torch.jit.load("model/torch_test/0/model.pt")
# get pre-trained model state dict
pretrain_dict = pretrain_model.state_dict()
# get dense model state dict
new_dict = dense_model.state_dict()

In [None]:
# model.pt is a TorchScript archive (saved via torch.jit.script(...).save(...)),
# so it is loaded with torch.jit.load.
pretrain_model = torch.jit.load("model/torch_test/0/model.pt")
pretrain_dict = pretrain_model.state_dict()
# Print each pre-trained tensor name with its size, read from the loaded
# model itself rather than from the in-memory `model`.
for param_name, param in pretrain_dict.items():
    print(param_name)
    print(param.size())

#### Remove the embedding layer from the pre-trained model

In [None]:
# Keep only the pre-trained tensors whose names also exist in the dense model
# (this leaves out the embedding table), merge them into the dense state dict,
# and load the result into the dense model.
pretrain_dict = {
    name: tensor
    for name, tensor in pretrain_dict.items()
    if name in new_dict
}
new_dict.update(pretrain_dict)
dense_model.load_state_dict(new_dict)

### 3.1.3 Save the dense model

In [None]:
mkdir -p model/dense_test/0/

In [None]:
save_model=torch.jit.script(dense_model)
save_model.save("model/dense_test/0/model.pt")

### 3.1.4 Deploy the dense model using the PyTorch Triton Backend
Configure the "dense_test" model with the PyTorch backend

In [None]:
%%writefile model/dense_test/config.pbtxt
name: "dense_test"
platform: "pytorch_libtorch"
max_batch_size: 0
input: [
   {
      name: "mlp.Linear_layer_1"
      data_type: TYPE_FP32
      dims: [-1]
   }
]
output: [
   {
      name: "prediction"
      data_type: TYPE_FP32
      dims: [-1]
   }
]

In [None]:
!tritonserver --model-repository=/hugectr_backend/model/ --load-model=dense_test --model-control-mode=explicit --allow-gpu-metrics=true

In [None]:
#send the inference request to dense model
from tritonclient.utils import *
import tritonclient.http  as httpclient
import numpy as np
import pandas as pd
import sys

model_name = 'dense_test'

# Send one random 32-value embedding vector to the "dense_test" model and print the reply.
with httpclient.InferenceServerClient("localhost:8000") as client:
    embedding_columns = np.array([[np.random.uniform(0.0,1) for i in range(32)]],dtype='float32')

    # Describe the input tensor (name, shape, Triton dtype), then attach the data.
    vector_input = httpclient.InferInput(
        "mlp.Linear_layer_1",
        embedding_columns.shape,
        np_to_triton_dtype(embedding_columns.dtype),
    )
    vector_input.set_data_from_numpy(embedding_columns)
    inputs = [vector_input]

    outputs = [httpclient.InferRequestedOutput("prediction")]

    response = client.infer(model_name, inputs, request_id=str(1), outputs=outputs)

    result = response.get_response()
    print(result)
    print("Prediction Result:")
    print(response.as_numpy("prediction"))


<a id='section3.2'></a>

## 3.2 Deploy the embedding part using HPS Triton Backend
### 3.2.1 Configure HPS backend for Embedding part

In [None]:
mkdir -p model/hps_test/0/hps_sparse.model

In [None]:
%%writefile model/hps_test/config.pbtxt
name: "hps_test"
backend: "hps"
max_batch_size:32,
input [
  {
    name: "KEYS"
    data_type: TYPE_INT64
    dims: [-1]
  },
  {
    name: "NUMKEYS"
    data_type: TYPE_INT32
    dims: [-1]
  }
]
output [
  {
    name: "OUTPUT0"
    data_type: TYPE_FP32
    dims: [ -1 ]
  }
]
version_policy: {
        specific:{versions: 0}
},
instance_group [
  {
    count: 1
    kind : KIND_GPU
    gpus:[0]
  }
]

parameters [

]

In [None]:
%%writefile model/hps.json
{
    "models": [{
    "model": "hps_test",
    "sparse_files": ["model/hps_test/0/hps_sparse.model"],
    "num_of_worker_buffer_in_pool": 3,
    "embedding_table_names":["0"],
    "embedding_vecsize_per_table":[32],
    "num_of_refresher_buffer_in_pool":0,
    "maxnum_catfeature_query_per_table_per_sample":[1],
    "deployed_device_list":[0],
    "max_batch_size":32,
    "default_value_for_each_table":[0.0],
    "cache_refresh_percentage_per_iteration":0,
    "hit_rate_threshold":1.1,
    "gpucacheper":1.0,
    "gpucache":true
    }]
}

### 3.2.2 Convert the torch-format embedding file to HPS-format embedding file

In [None]:
# model.pt is a TorchScript archive (saved via torch.jit.script(...).save(...)),
# so it is loaded with torch.jit.load.
pretrain_model = torch.jit.load("model/torch_test/0/model.pt")
# Show the tensor names; 'user_Embedding.weight' is the table converted below.
pretrain_model.state_dict().keys()

In [None]:
def convert_to_sparse_model(embeddings_weights, embedding_table_path, embedding_vec_size):
    """
    Dump an embedding table as the two binary files of a key-vector pair model.

    Row ``i`` of ``embeddings_weights`` is written with key ``i``. The key goes
    to ``<embedding_table_path>/key`` as a struct ``'q'`` value and the row
    goes to ``<embedding_table_path>/emb_vector`` as ``embedding_vec_size``
    struct ``'f'`` values. Both files are overwritten.

    Args:
        embeddings_weights: 2-D table indexed by row; ``shape[0]`` rows are written.
        embedding_table_path: existing directory that receives the two files.
        embedding_vec_size: number of values per row.
    """
    key_path = f"{embedding_table_path}/key"
    vec_path = f"{embedding_table_path}/emb_vector"
    vec_format = str(embedding_vec_size) + "f"

    with open(key_path, 'wb') as key_file, open(vec_path, 'wb') as vec_file:
        for row_idx in range(embeddings_weights.shape[0]):
            row_values = embeddings_weights[row_idx].data.tolist()
            # Pack both records before writing, so a row that fails to pack
            # leaves neither file with a partial entry for it.
            packed_key = struct.pack('q', row_idx)
            packed_row = struct.pack(vec_format, *row_values)
            key_file.write(packed_key)
            vec_file.write(packed_row)

In [None]:
convert_to_sparse_model(pretrain_model.state_dict()['user_Embedding.weight'], "model/hps_test/0/hps_sparse.model", 32)

### 3.2.3 Launch Triton Server to verify HPS Backend

In [None]:
!tritonserver --model-repository=/hugectr_backend/model/ --load-model=hps_test --model-control-mode=explicit --backend-directory=/usr/local/hugectr/backends --backend-config=hps,ps=/hugectr_backend/model/hps.json

In [None]:
# send embedding key to HPS backend
from tritonclient.utils import *
import tritonclient.http  as httpclient
import numpy as np
import pandas as pd
import sys

model_name = 'hps_test'

# Send one lookup request to the "hps_test" model and print the returned vectors.
with httpclient.InferenceServerClient("localhost:8000") as client:

    # One sample holding 32 random embedding keys drawn from [0, 999).
    # Generated with NumPy, which this cell imports itself.
    embedding_columns = np.random.randint(low=0, high=999, size=(1,32), dtype='int64')
    # NUMKEYS input; presumably the number of keys in the sample above (32) - verify against the HPS backend docs.
    row_ptrs = np.array([[32]],dtype='int32')

    inputs = [
        httpclient.InferInput("KEYS", embedding_columns.shape,
                              np_to_triton_dtype(embedding_columns.dtype)),
        httpclient.InferInput("NUMKEYS", row_ptrs.shape,
                              np_to_triton_dtype(row_ptrs.dtype)),
    ]
    inputs[0].set_data_from_numpy(embedding_columns)
    inputs[1].set_data_from_numpy(row_ptrs)

    # Requested output, built once.
    outputs = [
        httpclient.InferRequestedOutput("OUTPUT0")
    ]

    response = client.infer(model_name,
                            inputs,
                            request_id=str(1),
                            outputs=outputs)

    result = response.get_response()
    print(result)
    print("Prediction Result:")
    print(response.as_numpy("OUTPUT0"))
    print(response.as_numpy("OUTPUT0").shape)





<a id='section3.3'></a>

## 3.3 Configure "ensemble_model" Triton backend for Embedding and Dense model
### 3.3.1 Configure "ensemble_model" Triton backend for Embedding deployment(HPS backend) and Dense model(Pytorch backend)


In [None]:
mkdir -p model/ensemble_model/

In [None]:
%%writefile model/ensemble_model/config.pbtxt
name: "ensemble_model"
platform: "ensemble"
max_batch_size: 32
input [
  {
    name: "EMB_KEY"
    data_type: TYPE_INT64
    dims: [ -1 ]
  },
  {
    name: "EMB_N_KEY"
    data_type: TYPE_INT32
    dims: [ -1 ]
  }
]
output [
  {
    name: "DENSE_OUTPUT"
    data_type: TYPE_FP32
    dims: [-1]
  }
]
ensemble_scheduling {
  step [
    {
      model_name: "hps_test"
      model_version: -1
      input_map {
        key: "KEYS"
        value: "EMB_KEY"
      }
      input_map {
        key: "NUMKEYS"
        value: "EMB_N_KEY"
      }
      output_map {
        key: "OUTPUT0"
        value: "LOOKUP_VECTORS"
      }
    },
    {
      model_name: "dense_test"
      model_version: -1
      input_map {
        key: "mlp.Linear_layer_1"
        value: "LOOKUP_VECTORS"
      }
      output_map {
        key: "prediction"
        value: "DENSE_OUTPUT"
      }
    }
  ]
}

### 3.3.2 Launch Triton Server to verify ensemble Backend

In [None]:
!tritonserver --model-repository=/hugectr_backend/model/ --load-model=ensemble_model --model-control-mode=explicit  --allow-gpu-metrics=true --backend-config=hps,ps=/hugectr_backend/model/hps.json

In [None]:
from tritonclient.utils import *
import tritonclient.http  as httpclient
import numpy as np
import pandas as pd
import sys

model_name = 'ensemble_model'

# Send one embedding key through the HPS + dense ensemble and print the prediction.
with httpclient.InferenceServerClient("localhost:8000") as client:
    embedding_columns = np.array([[123]],dtype='int64')
    # EMB_N_KEY input; presumably the number of keys in the sample above (1) - verify against the HPS backend docs.
    row_ptrs = np.array([[1]],dtype='int32')

    inputs = [
        httpclient.InferInput("EMB_KEY", embedding_columns.shape,
                              np_to_triton_dtype(embedding_columns.dtype)),
        httpclient.InferInput("EMB_N_KEY", row_ptrs.shape,
                              np_to_triton_dtype(row_ptrs.dtype)),
    ]
    inputs[0].set_data_from_numpy(embedding_columns)
    inputs[1].set_data_from_numpy(row_ptrs)

    # Requested output, built once.
    outputs = [
        httpclient.InferRequestedOutput("DENSE_OUTPUT")
    ]

    response = client.infer(model_name,
                            inputs,
                            request_id=str(1),
                            outputs=outputs)

    result = response.get_response()
    print(result)
    print("Prediction Result:")
    print(response.as_numpy("DENSE_OUTPUT"))

<a id='section4'></a>

# 4 Use TensorRT to speed up dense model inference and combine with HPS Backend

## 4.1 Convert Dense model to Onnx 

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
# model/dense_test/0/model.pt was saved with torch.jit.script(...).save(...),
# i.e. it is a TorchScript archive, so torch.jit.load is the matching loader.
dense_model = torch.jit.load("model/dense_test/0/model.pt")

In [None]:
import numpy as np
dense_model(torch.from_numpy(np.array([np.random.uniform(0.0,1) for i in range(32)])).float())
dense_model(torch.randn(1,32))

In [None]:

import torch.onnx

# Export the dense model to ONNX. The example input has BATCH_SIZE rows of 32
# values, and axis 0 of the "vectors" input is declared dynamic.
BATCH_SIZE = 32
dummy_input = torch.randn(BATCH_SIZE, 32)
torch.onnx.export(
    dense_model,
    dummy_input,
    "model/dense_onnx_model.onnx",
    verbose=True,
    input_names=["vectors"],
    output_names=["prediction"],
    dynamic_axes={'vectors': {0: 'BATCH_SIZE'}},
)

## 4.2 Convert ONNX to TensorRT

In [None]:
import os

# A "!export ..." line runs in a throw-away subshell, so it cannot change PATH
# for later cells. Update the kernel's own environment instead; subsequent
# "!" commands (such as trtexec) inherit it.
os.environ["PATH"] = "/usr/src/tensorrt/bin/" + os.pathsep + os.environ.get("PATH", "")

In [None]:
!mkdir -p model/dense_trt/0

In [None]:
!trtexec --onnx=model/dense_onnx_model.onnx  --saveEngine=model/dense_trt/0/dense_dynamic.trt --optShapes=vectors:32x32 --minShapes=vectors:1x32 --maxShapes=vectors:32x32

### If the tensorrt is not installed in the merlin container, you need to run the following command in the host to get the tensorrt engine of dense model

In [None]:
docker run --runtime=nvidia --cap-add SYS_NICE --gpus=all --net=host -u root -v $(pwd):/hugectr_backend -w /hugectr_backend nvcr.io/nvidia/tensorrt:22.11-py3 trtexec --onnx=/hugectr_backend/model/dense_onnx_model.onnx  --saveEngine=/hugectr_backend/model/dense_trt/0/dense_dynamic.trt --optShapes=vectors:32x32 --minShapes=vectors:1x32 --maxShapes=vectors:32x32

### Create the Tensorrt backend configuration file

In [None]:
%%writefile model/dense_trt/config.pbtxt
platform: "tensorrt_plan"
default_model_filename: "dense_dynamic.trt"
backend: "tensorrt"
max_batch_size: 0

input [
  {
    name: "vectors"
    data_type: TYPE_FP32
    dims: [1024]
    reshape: { shape: [32,32] }
  }
]
output [
  {
      name: "prediction"
      data_type: TYPE_FP32
      dims: [-1, 1]
  }
]

instance_group [
  {
    kind: KIND_GPU
  }
]

## 4.3 HPS + TensorRT ensemble

In [None]:
%%writefile model/ensemble_model/config.pbtxt
name: "ensemble_model"
platform: "ensemble"
max_batch_size: 32
input [
  {
    name: "EMB_KEY"
    data_type: TYPE_INT64
    dims: [ -1 ]
  },
  {
    name: "EMB_N_KEY"
    data_type: TYPE_INT32
    dims: [ -1 ]
  }
]
output [
  {
    name: "DENSE_OUTPUT"
    data_type: TYPE_FP32
    dims: [-1]
  }
]
ensemble_scheduling {
  step [
    {
      model_name: "hps_test"
      model_version: -1
      input_map {
        key: "KEYS"
        value: "EMB_KEY"
      }
      input_map {
        key: "NUMKEYS"
        value: "EMB_N_KEY"
      }
      output_map {
        key: "OUTPUT0"
        value: "LOOKUP_VECTORS"
      }
    },
    {
      model_name: "dense_trt"
      model_version: -1
      input_map {
        key: "vectors"
        value: "LOOKUP_VECTORS"
      }
      output_map {
        key: "prediction"
        value: "DENSE_OUTPUT"
      }
    }
  ]
}

In [None]:
# Launch Triton with the HPS + TensorRT ensemble. hps.json is written to
# model/hps.json earlier in this notebook, so the HPS backend config points there
# (the same path the earlier launches use).
!tritonserver --model-repository=/hugectr_backend/model/ --load-model=ensemble_model --model-control-mode=explicit  --allow-gpu-metrics=true --backend-config=hps,ps=/hugectr_backend/model/hps.json

In [None]:
from tritonclient.utils import *
import tritonclient.http  as httpclient
import numpy as np
import pandas as pd
import sys

model_name = 'ensemble_model'

# Send 32 embedding keys through the HPS + TensorRT ensemble and print the prediction.
with httpclient.InferenceServerClient("localhost:8000") as client:
    # One sample holding 32 random embedding keys drawn from [0, 999).
    # Generated with NumPy, which this cell imports itself.
    embedding_columns = np.random.randint(low=0, high=999, size=(1,32), dtype='int64')
    # EMB_N_KEY input; presumably the number of keys in the sample above (32) - verify against the HPS backend docs.
    row_ptrs = np.array([[32]],dtype='int32')

    inputs = [
        httpclient.InferInput("EMB_KEY", embedding_columns.shape,
                              np_to_triton_dtype(embedding_columns.dtype)),
        httpclient.InferInput("EMB_N_KEY", row_ptrs.shape,
                              np_to_triton_dtype(row_ptrs.dtype)),
    ]
    inputs[0].set_data_from_numpy(embedding_columns)
    inputs[1].set_data_from_numpy(row_ptrs)

    # Requested output, built once.
    outputs = [
        httpclient.InferRequestedOutput("DENSE_OUTPUT")
    ]

    response = client.infer(model_name,
                            inputs,
                            request_id=str(1),
                            outputs=outputs)

    result = response.get_response()
    print(result)
    print("Prediction Result:")
    print(response.as_numpy("DENSE_OUTPUT"))

In [None]:
import netron
import torch

In [None]:
# NOTE(review): absolute, environment-specific path. The file at this path is
# displayed as a RecursiveScriptModule later in this notebook, i.e. it is a
# TorchScript archive, so it is loaded with torch.jit.load.
# This rebinds `model`, replacing whatever model object was defined before.
model= torch.jit.load("/hugectr/uber_model/EtaFitExperiment/0/model.pt")

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Dict

class MLP(nn.Module):
    # Variant of the model whose forward() takes a dict of key tensors
    # ("embedding" and "embedding1") and returns the sum of the two scores.
    # NOTE(review): this class is defined again, verbatim, in the next cell, and
    # it reuses the name of the MLP from section 2.2, which it shadows once run.
    def __init__(self,user_num,user_dim,layer=[32,16,8]):
        
        super(MLP, self).__init__()
        self.user_Embedding = nn.Embedding(user_num,user_dim)
        self.mlp = nn.Sequential()
        # mlp2 holds a single ReLU and is not called in forward().
        self.mlp2 = nn.Sequential()
        self.mlp2.add_module("Relu_layer_%d" % 0, nn.ReLU(inplace=True))
        # Linear + ReLU pair for each consecutive pair of widths in `layer`.
        for id in range(1,len(layer)):
            self.mlp.add_module("Linear_layer_%d" % id, nn.Linear(layer[id-1],layer[id]))
            self.mlp.add_module("Relu_layer_%d" % id, nn.ReLU(inplace=True))
        # Scoring head: one output followed by a sigmoid.
        self.predict =  nn.Sequential(nn.Linear(layer[-1],1),nn.Sigmoid())
     
    
    def forward(self,x:Dict[str,torch.Tensor]):
        # Both inputs share the same embedding table, MLP and scoring head.
        user = self.user_Embedding(x["embedding"])
        user2 = self.user_Embedding(x["embedding1"])
        user2 = self.mlp(user2)
        user = self.mlp(user)
        score = self.predict(user)
        score2 = self.predict(user2)
        # Sum of two sigmoid outputs.
        return torch.add(score,score2)
    


In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Dict

class MLP(nn.Module):
    """Dict-input variant of the model: scores two key tensors and adds the scores.

    Args:
        user_num: number of rows in the embedding table.
        user_dim: width of each embedding vector.
        layer: layer widths of the MLP. Consecutive pairs define the Linear
            layers, and the last width feeds the scoring head.
    """
    def __init__(self,user_num,user_dim,layer=[32,16,8]):
        super().__init__()
        self.user_Embedding = nn.Embedding(user_num, user_dim)
        self.mlp = nn.Sequential()
        # mlp2 holds a single ReLU and is not called in forward().
        self.mlp2 = nn.Sequential()
        self.mlp2.add_module("Relu_layer_%d" % 0, nn.ReLU(inplace=True))
        for idx, (width_in, width_out) in enumerate(zip(layer[:-1], layer[1:]), start=1):
            self.mlp.add_module("Linear_layer_%d" % idx, nn.Linear(width_in, width_out))
            self.mlp.add_module("Relu_layer_%d" % idx, nn.ReLU(inplace=True))
        self.predict = nn.Sequential(nn.Linear(layer[-1], 1), nn.Sigmoid())

    def forward(self,x:Dict[str,torch.Tensor]):
        """Return the summed scores of ``x["embedding"]`` and ``x["embedding1"]``."""
        first = self.user_Embedding(x["embedding"])
        second = self.user_Embedding(x["embedding1"])
        # Both inputs share the same MLP and scoring head.
        second_hidden = self.mlp(second)
        first_hidden = self.mlp(first)
        first_score = self.predict(first_hidden)
        second_score = self.predict(second_hidden)
        return first_score + second_score

In [5]:
model = MLP(1000,32)

In [6]:
model

MLP(
  (user_Embedding): Embedding(1000, 32)
  (mlp): Sequential(
    (Linear_layer_1): Linear(in_features=32, out_features=16, bias=True)
    (Relu_layer_1): ReLU(inplace=True)
    (Linear_layer_2): Linear(in_features=16, out_features=8, bias=True)
    (Relu_layer_2): ReLU(inplace=True)
  )
  (mlp2): Sequential(
    (Relu_layer_0): ReLU(inplace=True)
  )
  (predict): Sequential(
    (0): Linear(in_features=8, out_features=1, bias=True)
    (1): Sigmoid()
  )
)

In [7]:
from torch.utils.data import DataLoader

# Generate the synthetic training set: integer embedding keys and 0/1 labels.
keys, labels = generate_random_samples(args["num_sample"], args["vocabulary_range_per_slot"], args["np_key_type"])
x_train = torch.from_numpy(keys)
y_train = torch.from_numpy(labels).float()

# Keys and labels are served by two separate loaders that the training loop
# advances together. Shuffling each loader independently would pair keys with
# the labels of other samples, so both loaders keep the original order.
x_dataloader = DataLoader(x_train, batch_size=args["global_batch_size"], shuffle=False, num_workers=1, pin_memory=False, drop_last=False)
y_dataloader = DataLoader(y_train, batch_size=args["global_batch_size"], shuffle=False, num_workers=1, pin_memory=False, drop_last=False)

In [8]:
from torch.utils.data import DataLoader

# Train the dict-input model for args["iter_num"] epochs and print the loss of
# the last batch of each epoch.
print(torch.__version__)
print(model)
# NOTE(review): the losses printed by this cell are all -0., so this criterion
# does not appear to give a training signal for a single-column prediction.
# nn.BCELoss is not a drop-in replacement here because forward() returns the sum
# of two sigmoid scores, which can exceed 1 - TODO confirm the intended loss.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adagrad(model.parameters(), lr=0.001)
for epoch in range(args["iter_num"]):
    iterations_per_epoch = len(x_dataloader)
    x_iterator = iter(x_dataloader)
    y_iterator = iter(y_dataloader)
    for _ in range(iterations_per_epoch):
        optimizer.zero_grad()
        # NOTE: these assignments rebind the global x_train / y_train tensors
        # to the current batch.
        x_train = next(x_iterator)
        y_train = next(y_iterator)
        x, y = x_train, y_train
        # The same key batch is fed to both dict inputs of the model.
        x={"embedding":x_train}
        x["embedding1"]=x_train
        preds = model(x).squeeze(1)
        loss = criterion(preds, y)
        loss.backward()
        optimizer.step()
    print(epoch, loss)

1.13.1+cu117
MLP(
  (user_Embedding): Embedding(1000, 32)
  (mlp): Sequential(
    (Linear_layer_1): Linear(in_features=32, out_features=16, bias=True)
    (Relu_layer_1): ReLU(inplace=True)
    (Linear_layer_2): Linear(in_features=16, out_features=8, bias=True)
    (Relu_layer_2): ReLU(inplace=True)
  )
  (mlp2): Sequential(
    (Relu_layer_0): ReLU(inplace=True)
  )
  (predict): Sequential(
    (0): Linear(in_features=8, out_features=1, bias=True)
    (1): Sigmoid()
  )
)
0 tensor(-0., grad_fn=<DivBackward1>)
1 tensor(-0., grad_fn=<DivBackward1>)
2 tensor(-0., grad_fn=<DivBackward1>)
3 tensor(-0., grad_fn=<DivBackward1>)
4 tensor(-0., grad_fn=<DivBackward1>)
5 tensor(-0., grad_fn=<DivBackward1>)
6 tensor(-0., grad_fn=<DivBackward1>)
7 tensor(-0., grad_fn=<DivBackward1>)
8 tensor(-0., grad_fn=<DivBackward1>)
9 tensor(-0., grad_fn=<DivBackward1>)
10 tensor(-0., grad_fn=<DivBackward1>)
11 tensor(-0., grad_fn=<DivBackward1>)
12 tensor(-0., grad_fn=<DivBackward1>)
13 tensor(-0., grad_fn=<

In [9]:
device = torch.device("cuda")
my_dict: Dict[str, torch.Tensor] ={'embedding': torch.tensor([1],dtype=torch.int,device=device),'embedding1': torch.tensor([1],dtype=torch.int,device=device)}
model.to(device)
model(my_dict)

tensor([[1.0917]], device='cuda:0', grad_fn=<AddBackward0>)

In [19]:
tmm = torch.jit.trace(model,my_dict)

In [20]:
tmm(my_dict)

tensor([[1.0917]], device='cuda:0', grad_fn=<AddBackward0>)

In [21]:
# Attempt to export the traced dict-input model to ONNX.
BATCH_SIZE = 32
# NOTE(review): dummy_input is not passed to the export call below.
dummy_input=torch.randn(BATCH_SIZE,32)
# Example dict input, matching the keys that forward() reads.
x: Dict[str, torch.Tensor] ={'embedding': torch.tensor([1],dtype=torch.int,device=device),'embedding1': torch.tensor([1],dtype=torch.int,device=device)}
import torch.onnx
# NOTE(review): the recorded output of this cell is a RuntimeError
# (UNKNOWN_SCALAR), so this export does not currently succeed. Also,
# dynamic_axes refers to 'vectors', which is not among input_names - TODO confirm
# the intended names.
torch.onnx.export(tmm, (x,{}), "dict_new_onnx_model.onnx", verbose = True, input_names = ["input","embedding"], output_names = ["prediction"],dynamic_axes = {'vectors' : {0 : 'BATCH_SIZE'}})

Torch IR graph at exception: graph(%x : Dict(str, Tensor),
      %user_Embedding.weight : Int(1, strides=[1], requires_grad=0, device=cuda:0),
      %mlp.Linear_layer_1.bias : Float(1000, 32, strides=[32, 1], requires_grad=0, device=cuda:0),
      %mlp.Linear_layer_1.weight : Float(16, strides=[1], requires_grad=0, device=cuda:0),
      %mlp.Linear_layer_2.bias : Float(16, 32, strides=[32, 1], requires_grad=0, device=cuda:0),
      %mlp.Linear_layer_2.weight : Float(8, strides=[1], requires_grad=0, device=cuda:0),
      %predict.0.bias : Float(8, 16, strides=[16, 1], requires_grad=0, device=cuda:0),
      %predict.0.weight : Float(1, strides=[1], requires_grad=0, device=cuda:0)):
  %61 : Long(device=cpu) = prim::Constant[value={1}](), scope: MLP::
  %62 : Bool(device=cpu) = prim::Constant[value={0}](), scope: MLP::/torch.nn.modules.sparse.Embedding::user_Embedding
  %63 : Long(device=cpu) = prim::Constant[value={-1}](), scope: MLP::/torch.nn.modules.sparse.Embedding::user_Embedding
  %



RuntimeError: ScalarType UNKNOWN_SCALAR is an unexpected tensor scalar type

In [20]:
ls -l

total 354126
-rw-r--r-- 1 38458 dip     34783 Dec 14 04:15 01_model_training.ipynb
-rw-r--r-- 1 38458 dip     22935 Dec 14 04:15 02_model_inference_hps_tf_ensemble.ipynb
-rw-r--r-- 1 38458 dip     20214 Dec 14 04:15 03_model_inference_hps_trt_ensemble.ipynb
-rwxrwxrwx 1 root  dip       136 Nov 10 14:21 [0m[01;32m32.json[0m*
-rw-r--r-- 1 root  dip    153761 Feb  1 06:08 Dict_dense_onnx_model.onnx
-rw-r--r-- 1 38458 dip   1031320 Nov 29 07:31 HPS_Pytorch.ipynb
-rwxrwxrwx 1 root  dip     59509 Feb  1 07:07 [01;32mHPS_Pytorch_ensemble_deployment.ipynb[0m*
-rw-r--r-- 1 38458 dip      5441 Dec 14 09:58 README.md
-rw-r--r-- 1 root  dip    153769 Feb  1 06:07 dense_onnx_model.onnx
-rw-r--r-- 1 root  dip    179361 Feb  1 07:08 dict_onnx_model.onnx
-rw-rw-r-- 1 root  dip  72400431 Nov 21 05:01 hps_trt_report.nsys-rep
-rw-r--r-- 1 root  dip 251457536 Nov 21 05:01 hps_trt_report.sqlite
drwxrwxrwx 7 root  dip        10 Nov 29 08:22 [34;42mmodel[0m/
drwxr-xr-x 2 38458 dip       

In [6]:
import torch
device = torch.device("cuda")
# Location of the serialized model, named once so it is easy to repoint.
MODEL_PATH = "/hugectr/uber_model/EtaFitExperiment/0/model.pt"
# The file deserializes to a RecursiveScriptModule (a TorchScript archive), so load it
# with torch.jit.load directly instead of relying on torch.load's implicit dispatch.
user_model = torch.jit.load(MODEL_PATH)



In [4]:
# Re-create the CUDA device handle, then display the loaded module's structure.
device = torch.device("cuda")
user_model

RecursiveScriptModule(
  original_name=TorchGraphScriptWrapper
  (model): RecursiveScriptModule(
    original_name=LightningGraph
    (embeddings): RecursiveScriptModule(
      original_name=TorchGraph
      (emb0_derived_city_id): RecursiveScriptModule(
        original_name=Embedding
        (my_module): RecursiveScriptModule(original_name=Embedding)
      )
      (emb1_derived_minute_of_week): RecursiveScriptModule(
        original_name=Embedding
        (my_module): RecursiveScriptModule(original_name=Embedding)
      )
      (emb2_indexed_derived_geohash_begin_4): RecursiveScriptModule(
        original_name=Embedding
        (my_module): RecursiveScriptModule(original_name=Embedding)
      )
      (emb3_indexed_derived_geohash_end_4): RecursiveScriptModule(
        original_name=Embedding
        (my_module): RecursiveScriptModule(original_name=Embedding)
      )
      (emb4_indexed_derived_geohash_begin_5): RecursiveScriptModule(
        original_name=Embedding
        (my_modu

In [6]:
# Dummy model input: maps every feature name to its own one-element float64
# tensor (value 1) placed on `device`.
user_dict = {
    feature: torch.tensor([1], dtype=torch.float64, device=device)
    for feature in (
        'derived_unmodified_eta_sec',
        'derived_estimated_distance_m',
        'derived_city_id',
        'derived_minute_of_week',
        'indexed_derived_geohash_begin_4',
        'indexed_derived_geohash_end_4',
        'indexed_derived_geohash_begin_5',
        'indexed_derived_geohash_end_5',
        'derived_geohash_begin_6_fh21',
        'derived_geohash_end_6_fh21',
        'derived_geohash_begin_6shift_fh21b',
        'derived_geohash_end_6shift_fh21b',
        'derived_geohash_begin_7_fh18',
        'derived_geohash_end_7_fh18',
        'derived_geohash_od_56_fh21',
        'derived_geohash_od_56shift_fh21b',
        'derived_geohash_od_65_fh21',
        'derived_geohash_od_65shift_fh21b',
        'derived_segment_countrycode_fh12',
        'derived_is_eats_record',
        'derived_map_data_vendor',
        'derived_route_type',
        'derived_waypoint_tasktype',
        'derived_unmodified_eta_sec_after_QuantileDiscretizer_0',
        'derived_estimated_distance_m_after_QuantileDiscretizer_0',
        'derived_haversine_distance_after_QuantileDiscretizer_0',
        'derived_historical_seconds_after_QuantileDiscretizer_0',
        'derived_umm_default_and_limit_seconds_after_QuantileDiscretizer_0',
        'derived_realtime_seconds_after_QuantileDiscretizer_0',
        'derived_historical_seconds_ratio_after_QuantileDiscretizer_0',
        'derived_umm_default_and_limit_seconds_ratio_after_QuantileDiscretizer_0',
        'derived_realtime_seconds_ratio_after_QuantileDiscretizer_0',
    )
}

In [7]:
# Sanity check: run the loaded model once on the dummy feature dict and show its output.
user_model(user_dict)

Output(prediction=tensor([[19.3254]], device='cuda:0', grad_fn=<ReluBackward0>))

In [15]:
# Ordered feature names passed as `input_names` to the ONNX exporter and zipped
# with the positional dummy tensors further down.
input_name = [
    "derived_unmodified_eta_sec",
    "derived_estimated_distance_m",
    "derived_city_id",
    "derived_minute_of_week",
    "indexed_derived_geohash_begin_4",
    "indexed_derived_geohash_end_4",
    "indexed_derived_geohash_begin_5",
    "indexed_derived_geohash_end_5",
    "derived_geohash_begin_6_fh21",
    "derived_geohash_end_6_fh21",
    "derived_geohash_begin_6shift_fh21b",
    "derived_geohash_end_6shift_fh21b",
    "derived_geohash_begin_7_fh18",
    "derived_geohash_end_7_fh18",
    "derived_geohash_od_56_fh21",
    "derived_geohash_od_56shift_fh21b",
    "derived_geohash_od_65_fh21",
    "derived_geohash_od_65shift_fh21b",
    "derived_segment_countrycode_fh12",
    "derived_is_eats_record",
    "derived_map_data_vendor",
    "derived_route_type",
    "derived_waypoint_tasktype",
    "derived_unmodified_eta_sec_after_QuantileDiscretizer_0",
    "derived_estimated_distance_m_after_QuantileDiscretizer_0",
    "derived_haversine_distance_after_QuantileDiscretizer_0",
    "derived_historical_seconds_after_QuantileDiscretizer_0",
    "derived_umm_default_and_limit_seconds_after_QuantileDiscretizer_0",
    "derived_realtime_seconds_after_QuantileDiscretizer_0",
    "derived_historical_seconds_ratio_after_QuantileDiscretizer_0",
    "derived_umm_default_and_limit_seconds_ratio_after_QuantileDiscretizer_0",
    "derived_realtime_seconds_ratio_after_QuantileDiscretizer_0",
]

In [None]:
# Export the loaded model to ONNX using the feature dict as the example input.
# The trailing empty dict in the args tuple stops the exporter from treating
# user_dict itself as keyword arguments.
torch.onnx.export(
    user_model,
    (user_dict, {}),
    "user__onnx_model.onnx",
    verbose=True,
    input_names=input_name,
    output_names=["output_spec"],
)

In [7]:
# 32 independent one-element float64 tensors (value 1) on `device`,
# one for each of the 32 names in input_name.
dummy_input = [
    torch.tensor([1], dtype=torch.float64, device=device)
    for _ in range(32)
]

In [None]:
# Export attempt that passes the positional list of dummy tensors as the example input.
torch.onnx.export(
    user_model,
    dummy_input,
    "user__onnx_model.onnx",
    verbose=True,
    input_names=input_name,
    output_names=["output_spec"],
)

In [21]:

# Pair each feature name with its positional dummy tensor to rebuild a dict-style input.
test = dict(zip(input_name, dummy_input))

In [40]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Dict, List, Tuple

class MOCK(nn.Module):
    """Adapter that feeds the dict-input model from two parallel lists.

    The wrapped model (``self.predict``) is called with one ``Dict[str, Tensor]``.
    This wrapper takes the keys and tensors as separate lists and assembles the
    dict before delegating, so the scripted module has a list-based signature.
    """

    def __init__(self, user_num, user_dim, layer=(32, 16, 8),
                 model_path="/hugectr/uber_model/EtaFitExperiment/0/model.pt"):
        """Load the serialized model this wrapper delegates to.

        Args:
            user_num: Not used by this class; kept so ``MOCK(1000, 32)`` still works.
            user_dim: Not used by this class; kept for the same reason.
            layer: Not used by this class. The default is an immutable tuple so
                no list object is shared between instances.
            model_path: Path of the TorchScript archive to load. Defaults to the
                path that was previously hard-coded.
        """
        super().__init__()
        # The archive is a TorchScript module, so load it with torch.jit.load
        # rather than going through torch.load's implicit dispatch.
        self.predict = torch.jit.load(model_path)

    def forward(self, x: List[str], y: List[torch.Tensor]):
        """Build ``{x[i]: y[i]}`` and return the wrapped model's result for it.

        Args:
            x: Feature names, used as dict keys.
            y: Tensors, matched to ``x`` by position.

        Returns:
            Whatever ``self.predict`` returns for the assembled dict.
        """
        # Explicit annotation keeps the dict type unambiguous under TorchScript
        # and avoids shadowing the builtin ``input``.
        features: Dict[str, torch.Tensor] = {}
        for i in range(len(x)):
            features[x[i]] = y[i]
        return self.predict(features)

In [41]:
# Instantiate the list-input wrapper; the two positional arguments are not used by MOCK.
test_model = MOCK(1000, 32)

In [50]:
# Compile the wrapper (and its forward) to TorchScript.
tmm = torch.jit.script(test_model)


In [67]:
# Run the scripted wrapper on the parallel name/tensor lists and show the prediction.
tmm(input_name, dummy_input)

Output(prediction=tensor([[19.3254]], device='cuda:0', grad_fn=<DifferentiableGraphBackward>))

In [53]:
# Call the original model on the zipped dict for comparison with the wrapper's output.
user_model(test)

Output(prediction=tensor([[19.3254]], device='cuda:0', grad_fn=<DifferentiableGraphBackward>))

In [74]:
# NOTE(review): the recorded run of this cell failed with
# "ValueError: args contained 32 None's after flattening". Presumably the list of
# name strings in args cannot be flattened into tensors; confirm before relying on it.
torch.onnx.export(tmm, (input_name,dummy_input),"user__onnx_model.onnx", verbose = True, input_names = input_name, output_names = ["output_spec"])

ValueError: args contained 32 None's after flattening. When exporting a ScriptModule or ScriptFunction, no args may be None because that breaks type propagation.

In [71]:
# NOTE(review): rebinds dummy_input from the earlier list of 32 tensors to a single
# 32-element random tensor; cells that zip or pass it as a list will see the tensor
# once this has run. No random seed is set here, so the values differ between runs.
dummy_input = torch.randn(32, device="cuda")

In [73]:
# Display the current value bound to dummy_input.
dummy_input

tensor([-0.8503,  0.9634, -0.7635, -0.2634, -0.4005,  0.0717,  0.6939,  0.9627,
         0.2049,  0.1340, -0.2913, -0.3506,  1.2504,  0.8410,  1.4949,  0.2318,
         0.0424,  1.0670,  0.1694, -0.3616, -1.3760,  0.0022, -0.5034,  0.5992,
         1.1692,  0.2213,  0.6310, -1.3881, -0.4218, -0.0618, -1.7155, -1.1066],
       device='cuda:0')