In [1]:
from ctypes import cdll, c_char_p
# Load the CUDA runtime through ctypes. The file name is a Windows DLL name
# (presumably the 64-bit CUDA 12 runtime -- confirm against the installed toolkit),
# so this line will not resolve on other platforms or CUDA versions as written.
libcudart = cdll.LoadLibrary('cudart64_12.dll')
# Declare the return type so ctypes hands the error text back as Python bytes
# instead of the default C int.
libcudart.cudaGetErrorString.restype = c_char_p
def cudaSetDevice(device_idx):
    """Select the active CUDA device through the CUDA runtime library.

    Args:
        device_idx: Index of the device passed to ``libcudart.cudaSetDevice``.

    Raises:
        RuntimeError: If the runtime call returns a non-zero status. The message
            carries the text reported by ``cudaGetErrorString``.
    """
    ret = libcudart.cudaSetDevice(device_idx)
    if ret != 0:
        error_string = libcudart.cudaGetErrorString(ret)
        # With restype = c_char_p, ctypes returns bytes (or None for a NULL
        # pointer). Concatenating bytes to str raises TypeError and would hide
        # the real CUDA error, so convert to text first.
        if isinstance(error_string, bytes):
            error_string = error_string.decode("utf-8", errors="replace")
        elif error_string is None:
            error_string = "CUDA error code " + str(ret)
        raise RuntimeError("cudaSetDevice: " + str(error_string))
cudaSetDevice(0) # Not required on a single-GPU machine; kept so the device can be chosen explicitly on multi-GPU systems

In [2]:
from trt_utils import *

In [3]:
from os.path import join
def convert(sourceDir, dstDir, componentName, dataType):
    """Build an engine from one ONNX component and write it next to its siblings.

    Reads ``<sourceDir>/<componentName>.onnx``, hands it to ``build_engine``
    together with ``dataType``, and passes the result to ``save_engine`` with
    the target path ``<dstDir>/<componentName>.trt``.

    Args:
        sourceDir: Directory containing the ``.onnx`` file.
        dstDir: Directory that receives the ``.trt`` file.
        componentName: File stem shared by the input and output files.
        dataType: Precision tag forwarded unchanged to ``build_engine``.
    """
    onnx_path = join(sourceDir, componentName + '.onnx')
    serialized = build_engine(onnx_path, dataType)
    trt_path = join(dstDir, componentName + '.trt')
    save_engine(serialized, trt_path)

In [4]:
# Convert every component of the "seperable" model variant: all fp16 engines
# first, then all fp32 engines, in the same component order for both.
for src_dir, dst_dir, precision in (
    ('../onnx_model/seperable/fp16/', './seperable/fp16/', 'fp16'),
    ('../onnx_model/seperable/fp32/', './seperable/fp32/', 'fp32'),
):
    for component in ('combiner', 'decomposer', 'editor', 'morpher', 'rotator'):
        convert(src_dir, dst_dir, component, precision)

In [5]:
# Convert every component of the "standard" model variant: all fp16 engines
# first, then all fp32 engines, in the same component order for both.
for src_dir, dst_dir, precision in (
    ('../onnx_model/standard/fp16/', './standard/fp16/', 'fp16'),
    ('../onnx_model/standard/fp32/', './standard/fp32/', 'fp32'),
):
    for component in ('combiner', 'decomposer', 'editor', 'morpher', 'rotator'):
        convert(src_dir, dst_dir, component, precision)

In [6]:
import onnxruntime as ort
# Keep the stream object alive in a named variable. Building the handle from a
# temporary `cuda.Stream()` lets Python release that object as soon as `.handle`
# has been read; if `cuda` is pycuda.driver (it arrives via the trt_utils star
# import -- confirm), releasing the object destroys the underlying CUDA stream
# and ONNX Runtime would be given a stale handle.
ort_stream = cuda.Stream()
# ONNX Runtime expects the stream address as a string for "user_compute_stream".
providers = [("CUDAExecutionProvider", {"device_id": 0, #torch.cuda.current_device(),
                                        "user_compute_stream": str(int(ort_stream.handle))})]
# Default session options, shared by every InferenceSession created in testVerify.
sess_options = ort.SessionOptions()
def testVerify(sourceDir, dstDir, componentName, dataType, inputNames, inputShapes):
    if dataType == 'fp16':
        dtype = np.float16  
    elif dataType == 'fp32':
        dtype = np.float32
    else:
        dtype = np.int8

    engine = load_engine(join(dstDir, componentName+'.trt'))
    proc = Processor(engine, len(inputNames))
    ort_sess = ort.InferenceSession(join(sourceDir, componentName+'.onnx'), sess_options=sess_options, providers=providers)

    for i in range(5):
        inputs = [np.random.rand(*shape).astype(dtype) for shape in inputShapes]
        input_dict = {k:v for k,v in zip(inputNames,inputs)}
        
        trt_res = proc.inference(inputs)
        onnx_res = ort_sess.run(None,input_dict)
        for i in range(len(trt_res)):
            print(dataType+" ["+str(i)+"] MSE is: ",((trt_res[i] - onnx_res[i]) ** 2).mean())

In [7]:
# Verify each "seperable" engine against its ONNX source, fp32 first and fp16 second.
seperable_variants = (
    ('../onnx_model/seperable/fp32/', './seperable/fp32/', 'fp32'),
    ('../onnx_model/seperable/fp16/', './seperable/fp16/', 'fp16'),
)

print("Decomposer:")
decomposer_input_names = ['input_image']
decomposer_input_shapes = [(1,4,512,512)]
for onnx_dir, trt_dir, precision in seperable_variants:
    testVerify(onnx_dir, trt_dir, 'decomposer', precision, decomposer_input_names, decomposer_input_shapes)

print("Combiner:")
combiner_input_names = ['input_image','eyebrow_background_layer', "eyebrow_layer", 'eyebrow_pose']
combiner_input_shapes = [(1,4,512,512), (1,4,128,128), (1,4,128,128), (1,12)]
for onnx_dir, trt_dir, precision in seperable_variants:
    testVerify(onnx_dir, trt_dir, 'combiner', precision, combiner_input_names, combiner_input_shapes)

print("Morpher:")
# The fourth input is an intermediate activation, addressed by its ONNX tensor name.
morpher_input_names = ['input_image', 'im_morpher_crop', 'face_pose', '/face_morpher/body/downsample_blocks.3/downsample_blocks.3.3/Relu_output_0']
morpher_input_shapes = [(1,4,512,512), (1,4,192,192), (1,27), (1,512,24,24)]
for onnx_dir, trt_dir, precision in seperable_variants:
    testVerify(onnx_dir, trt_dir, 'morpher', precision, morpher_input_names, morpher_input_shapes)

print("Rotator:")
rotator_input_shapes = [(1,4,256,256), (1,6)]
rotator_input_names = ['face_morphed_half', 'rotation_pose']
for onnx_dir, trt_dir, precision in seperable_variants:
    testVerify(onnx_dir, trt_dir, 'rotator', precision, rotator_input_names, rotator_input_shapes)

print("Editor:")
editor_input_shapes = [(1,4,512,512), (1,4,512,512), (1,2,512,512), (1,6)]
editor_input_names = ['morphed_image', 'rotated_warped_image','rotated_grid_change','rotation_pose']
for onnx_dir, trt_dir, precision in seperable_variants:
    testVerify(onnx_dir, trt_dir, 'editor', precision, editor_input_names, editor_input_shapes)

Decomposer:
fp32 [0] MSE is:  5.5564864e-09
fp32 [1] MSE is:  5.6317884e-09
fp32 [0] MSE is:  1.8512889e-09
fp32 [1] MSE is:  2.6791607e-11
fp32 [0] MSE is:  3.3631946e-09
fp32 [1] MSE is:  9.647969e-10
fp32 [0] MSE is:  5.430127e-09
fp32 [1] MSE is:  9.04328e-09
fp32 [0] MSE is:  2.6379228e-09
fp32 [1] MSE is:  3.1235996e-09
fp16 [0] MSE is:  6e-08
fp16 [1] MSE is:  0.0
fp16 [0] MSE is:  6e-08
fp16 [1] MSE is:  6e-08
fp16 [0] MSE is:  6e-08
fp16 [1] MSE is:  0.0
fp16 [0] MSE is:  6e-08
fp16 [1] MSE is:  2e-07
fp16 [0] MSE is:  6e-08
fp16 [1] MSE is:  0.0
Combiner:
fp32 [0] MSE is:  1.6824368e-06
fp32 [1] MSE is:  1.7882407e-05
fp32 [0] MSE is:  2.1098022e-06
fp32 [1] MSE is:  2.4393868e-05
fp32 [0] MSE is:  1.5645966e-06
fp32 [1] MSE is:  1.786362e-05
fp32 [0] MSE is:  1.0597455e-06
fp32 [1] MSE is:  1.1453707e-05
fp32 [0] MSE is:  1.8161437e-06
fp32 [1] MSE is:  1.8803821e-05
fp16 [0] MSE is:  1.84e-05
fp16 [1] MSE is:  0.000194
fp16 [0] MSE is:  3.064e-05
fp16 [1] MSE is:  0.000307


In [8]:
# Verify each "standard" engine against its ONNX source, fp32 first and fp16 second.
standard_variants = (
    ('../onnx_model/standard/fp32/', './standard/fp32/', 'fp32'),
    ('../onnx_model/standard/fp16/', './standard/fp16/', 'fp16'),
)

print("Decomposer:")
decomposer_input_names = ['input_image']
decomposer_input_shapes = [(1,4,512,512)]
for onnx_dir, trt_dir, precision in standard_variants:
    testVerify(onnx_dir, trt_dir, 'decomposer', precision, decomposer_input_names, decomposer_input_shapes)

print("Combiner:")
combiner_input_names = ['input_image','eyebrow_background_layer', "eyebrow_layer", 'eyebrow_pose']
combiner_input_shapes = [(1,4,512,512), (1,4,128,128), (1,4,128,128), (1,12)]
for onnx_dir, trt_dir, precision in standard_variants:
    testVerify(onnx_dir, trt_dir, 'combiner', precision, combiner_input_names, combiner_input_shapes)

print("Morpher:")
# The fourth input is an intermediate activation, addressed by its ONNX tensor name.
morpher_input_names = ['input_image', 'im_morpher_crop', 'face_pose', '/face_morpher/downsample_blocks.3/downsample_blocks.3.2/Relu_output_0']
morpher_input_shapes = [(1,4,512,512), (1,4,192,192), (1,27), (1,512,24,24)]
for onnx_dir, trt_dir, precision in standard_variants:
    testVerify(onnx_dir, trt_dir, 'morpher', precision, morpher_input_names, morpher_input_shapes)

print("Rotator:")
rotator_input_shapes = [(1,4,256,256), (1,6)]
rotator_input_names = ['face_morphed_half', 'rotation_pose']
for onnx_dir, trt_dir, precision in standard_variants:
    testVerify(onnx_dir, trt_dir, 'rotator', precision, rotator_input_names, rotator_input_shapes)

print("Editor:")
editor_input_shapes = [(1,4,512,512), (1,4,512,512), (1,2,512,512), (1,6)]
editor_input_names = ['morphed_image', 'rotated_warped_image','rotated_grid_change','rotation_pose']
for onnx_dir, trt_dir, precision in standard_variants:
    testVerify(onnx_dir, trt_dir, 'editor', precision, editor_input_names, editor_input_shapes)

Decomposer:
fp32 [0] MSE is:  1.0812705e-10
fp32 [1] MSE is:  1.1684789e-12
fp32 [0] MSE is:  2.3819308e-10
fp32 [1] MSE is:  8.095282e-10
fp32 [0] MSE is:  3.5907755e-12
fp32 [1] MSE is:  2.4184487e-12
fp32 [0] MSE is:  2.499066e-11
fp32 [1] MSE is:  6.815441e-14
fp32 [0] MSE is:  1.7270113e-12
fp32 [1] MSE is:  4.1523317e-15
fp16 [0] MSE is:  0.0
fp16 [1] MSE is:  0.0
fp16 [0] MSE is:  0.0
fp16 [1] MSE is:  0.0
fp16 [0] MSE is:  0.0
fp16 [1] MSE is:  0.0
fp16 [0] MSE is:  1e-07
fp16 [1] MSE is:  0.0
fp16 [0] MSE is:  0.0
fp16 [1] MSE is:  0.0
Combiner:
fp32 [0] MSE is:  2.1398036e-07
fp32 [1] MSE is:  8.430146e-07
fp32 [0] MSE is:  8.6369944e-08
fp32 [1] MSE is:  3.10355e-07
fp32 [0] MSE is:  9.717198e-08
fp32 [1] MSE is:  3.9280366e-07
fp32 [0] MSE is:  9.082552e-08
fp32 [1] MSE is:  4.425158e-07
fp32 [0] MSE is:  3.4650296e-07
fp32 [1] MSE is:  1.5518996e-06
fp16 [0] MSE is:  1.496e-05
fp16 [1] MSE is:  6.37e-05
fp16 [0] MSE is:  1.73e-05
fp16 [1] MSE is:  8.11e-05
fp16 [0] MSE is:

In [10]:
# Load the editor engine in all four variants; 4 is the number of editor inputs.
editor_s32 = Processor(load_engine(join('./standard/fp32/', 'editor.trt')), 4)
editor_s16 = Processor(load_engine(join('./standard/fp16/', 'editor.trt')), 4)
editor_d32 = Processor(load_engine(join('./seperable/fp32/', 'editor.trt')), 4)
editor_d16 = Processor(load_engine(join('./seperable/fp16/', 'editor.trt')), 4)

# Label/processor pairs in the order they are warmed up and then timed.
editor_benchmarks = (
    ('Standard 32', editor_s32),
    ('Standard 16', editor_s16),
    ('seperable 32', editor_d32),
    ('seperable 16', editor_d16),
)

# Warm-up: 100 rounds, each round touching every processor once.
for warmup_round in range(100):
    for label, proc in editor_benchmarks:
        proc.kickoff()

from tqdm import tqdm

# Timed runs: 10000 kickoffs per variant; tqdm reports the iterations per second.
for label, proc in editor_benchmarks:
    print(label)
    for iteration in tqdm(range(10000)):
        proc.kickoff()

Standard 32


100%|████████████████████████████████████████████████████████████████████████████| 10000/10000 [02:25<00:00, 68.87it/s]


Standard 16


100%|███████████████████████████████████████████████████████████████████████████| 10000/10000 [01:13<00:00, 135.89it/s]


seperable 32


100%|███████████████████████████████████████████████████████████████████████████| 10000/10000 [01:26<00:00, 115.97it/s]


seperable 16


100%|███████████████████████████████████████████████████████████████████████████| 10000/10000 [00:56<00:00, 175.73it/s]


In [18]:
def fullbench(model_dir, dtype, iters = 100000):
    """Benchmark the repeated part of the pipeline for one model directory.

    Loads all five component engines from ``model_dir``, warms up the combiner,
    morpher, rotator and editor with 100 kickoffs each, then runs ``iters``
    iterations under a tqdm progress bar. Every iteration kicks off the morpher,
    rotator and editor; the combiner is kicked off only on every 15th iteration.

    Args:
        model_dir: Directory containing ``decomposer.trt``, ``combiner.trt``,
            ``morpher.trt``, ``rotator.trt`` and ``editor.trt``.
        dtype: Accepted for call-site compatibility; not used by this function.
        iters: Number of timed iterations.
    """
    # The second Processor argument is the component's input count. It is kept
    # consistent with the input lists used when each component is verified in
    # this notebook: decomposer 1, combiner 4, morpher 4, rotator 2, editor 4.
    # (Previously the decomposer was given 4 and the editor 2.)
    decomposer_engine = load_engine(join(model_dir, 'decomposer.trt'))
    # The decomposer is loaded but never kicked off below.
    decomposer_proc = Processor(decomposer_engine, 1)

    combiner_engine = load_engine(join(model_dir, 'combiner.trt'))
    combiner_proc = Processor(combiner_engine, 4)

    morpher_engine = load_engine(join(model_dir, 'morpher.trt'))
    morpher_proc = Processor(morpher_engine, 4)

    rotator_engine = load_engine(join(model_dir, 'rotator.trt'))
    rotator_proc = Processor(rotator_engine, 2)

    editor_engine = load_engine(join(model_dir, 'editor.trt'))
    editor_proc = Processor(editor_engine, 4)

    for _warmup in range(100): #preheat
        combiner_proc.kickoff()
        morpher_proc.kickoff()
        rotator_proc.kickoff()
        editor_proc.kickoff()

    for i in tqdm(range(iters)):
        # The combiner runs once per 15 iterations; the other stages run every time.
        if i % 15 == 0:
            combiner_proc.kickoff()
        morpher_proc.kickoff()
        rotator_proc.kickoff()
        editor_proc.kickoff()

In [19]:
# Benchmark the standard fp32 engines for 1000 iterations (fullbench does not use the dtype argument).
fullbench('./standard/fp32/', np.float32, 1000)

100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:30<00:00, 32.41it/s]


In [20]:
# Benchmark the standard fp16 engines for 1000 iterations (fullbench does not use the dtype argument).
fullbench('./standard/fp16/', np.float16, 1000)

100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:13<00:00, 72.38it/s]


In [21]:
# Benchmark the seperable fp32 engines for 1000 iterations (fullbench does not use the dtype argument).
fullbench('./seperable/fp32/', np.float32, 1000)

100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:15<00:00, 62.56it/s]


In [22]:
# Benchmark the seperable fp16 engines for 1000 iterations (fullbench does not use the dtype argument).
fullbench('./seperable/fp16/', np.float16, 1000)

100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:10<00:00, 93.45it/s]
