In [1]:
import numpy as np
from cuda import cudart
import torch
from torch import Tensor, nn
import tensorrt as trt

## Generate input and data shape

In [2]:
# Input tensor shape NCHW
nIn, hIn, wIn = 1, 2, 2

# Output tensor shape C
cOut = 2

# Input tensor
data = np.arange(hIn * wIn, dtype=np.float32).reshape(nIn, hIn, wIn)

# fully connected weight
weight = np.ones(cOut * hIn * wIn, dtype=np.float32).reshape(cOut, hIn * wIn)

# fully connected bias
bias = np.zeros(cOut, dtype=np.float32)

print("inputH0 :", data.shape)
print(data)

inputH0 : (1, 2, 2)
[[[0. 1.]
  [2. 3.]]]


## torch silu

In [3]:
class SiLUActivation(nn.Module):
    """
    See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear
    Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function
    Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated
    Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with
    later.
    """

    def forward(self, input: Tensor) -> Tensor:
        return nn.functional.silu(input)
    
    def b_forward(self, input: Tensor) -> Tensor:
        return torch.matmul(input.T, nn.functional.sigmoid(input))


In [4]:
def test_torch(nIn, hIn, wIn, cOut, raw_data, weight, bias):
    data = torch.tensor(raw_data).reshape(-1)
    # model = torch.nn.Linear(hIn * wIn, cOut)

    # # initialize model weights
    # model.weight.data.fill_(1)
    # print(model.weight.data.detach().cpu().numpy())
    # model.bias.data.fill_(0)
    model = SiLUActivation()

    output = model(data)

    return output

## Test torch

In [5]:
torch_output = test_torch(nIn, hIn, wIn, cOut, data, weight, bias)
print("output_torch :", torch_output.shape)
print(torch_output)

output_torch : torch.Size([4])
tensor([0.0000, 0.7311, 1.7616, 2.8577])


## tensorRT SiLU

In [6]:
def trt_create(nIn, hIn, cOut, weight, bias):
    logger = trt.Logger(trt.Logger.ERROR)
    builder = trt.Builder(logger)

    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    config = builder.create_builder_config()

    # input
    inputT0 = network.add_input('inputT0', trt.DataType.FLOAT, (nIn, -1, hIn))

    # dynamic shape optimization
    profile = builder.create_optimization_profile();
    profile.set_shape("inputT0", (nIn, 1, hIn), (nIn, 2, hIn), (nIn, 3, hIn)) 
    config.add_optimization_profile(profile)

    # add fully connected layer
    selu_sigmoid_layer = network.add_activation(inputT0, type=trt.ActivationType.SIGMOID)
    selu_mult_layer = network.add_elementwise(inputT0, selu_sigmoid_layer.get_output(0), op=trt.ElementWiseOperation.PROD)

    # output
    network.mark_output(selu_mult_layer.get_output(0))

    engineString = builder.build_serialized_network(network, config)
    
    return engineString

In [7]:
trt_engineStr = trt_create(nIn, hIn, cOut, weight, bias)

In [12]:
def trt_inference(nIn, hIn, cOut, engineString, raw_data):
    print(engineString)
    print("Runtime")
    logger = trt.Logger(trt.Logger.ERROR)
    engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)
    context = engine.create_execution_context()

    # dynamic shape configure
    print("Set input shape")
    context.set_input_shape("inputT0", (nIn, 2, hIn))
    context.set_binding_shape(0, (nIn, 2, hIn))
    origin_inputshape = context.get_binding_shape(0)

    print("Set input shape completed")

    data = np.array(raw_data)

    _, stream = cudart.cudaStreamCreate()
    print("Reshaping")

    inputH0 = np.ascontiguousarray(data.reshape(-1))
    outputH0 = np.empty(context.get_binding_shape(1), dtype=trt.nptype(engine.get_binding_dtype(1)))
    print("Reshaped")

    # initialize input and output data
    _, inputD0 = cudart.cudaMallocAsync(inputH0.nbytes, stream)
    _, outputD0 = cudart.cudaMallocAsync(outputH0.nbytes, stream)

    # move input to device
    cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)

    # execute
    print("execute")
    context.execute_async_v2([int(inputD0), int(outputD0)], stream)

    # move output back to host
    cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)

    # wait for everything
    cudart.cudaStreamSynchronize(stream)

    cudart.cudaStreamDestroy(stream)
    cudart.cudaFree(inputD0)
    cudart.cudaFree(outputD0)

    return outputH0

In [14]:
trt_output = trt_inference(nIn, hIn, cOut, trt_engineStr, data)
trt_output = trt_output.reshape(-1)
print("output_trt :", trt_output.shape)
print(trt_output)

<tensorrt.tensorrt.IHostMemory object at 0x7f08447a4730>
Runtime
Set input shape
Set input shape completed
Reshaping
Reshaped
execute
output_trt : (4,)
[0.        0.7310586 1.7615942 2.8577223]


  context.set_binding_shape(0, (nIn, 2, hIn))
  origin_inputshape = context.get_binding_shape(0)
  outputH0 = np.empty(context.get_binding_shape(1), dtype=trt.nptype(engine.get_binding_dtype(1)))
  outputH0 = np.empty(context.get_binding_shape(1), dtype=trt.nptype(engine.get_binding_dtype(1)))


In [10]:
! nvidia-smi

Wed Sep 20 06:44:58 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.125.06   Driver Version: 525.125.06   CUDA Version: 12.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-PCIE...  On   | 00000000:3B:00.0 Off |                    0 |
| N/A   51C    P0    42W / 250W |    312MiB / 32768MiB |      0%   E. Process |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-PCIE...  On   | 00000000:5E:00.0 Off |                    0 |
| N/A   35C    P0    25W / 250W |      4MiB / 32768MiB |      0%   E. Process |
|       