In [1]:
import torch
import torchvision.transforms as transforms
import torch2trt
from torch2trt import TRTModule
import trt_pose.models
from trt_pose.draw_objects import DrawObjects
from trt_pose.parse_objects import ParseObjects
import trt_pose.coco
import math
import os
import numpy as np
import traitlets
import pickle
import sys
import time
import json


  warn(f"Failed to load image Python extension: {e}")


In [2]:
# Add the parent directory (relative to the working directory) to the
# module search path.
sys.path.append("../")

# Network input resolution (square) and the batch size used both for the
# TensorRT example input and for the benchmarks.
WIDTH = HEIGHT = 224
batch_size = 1

# Checkpoint locations: PyTorch weights in, TensorRT-optimized state dict out.
# NOTE(review): the file names say "244_244" while the input size above is
# 224x224 — presumably a typo baked into the checkpoint name; confirm.
MODEL_WEIGHTS = 'model/hand_pose_resnet18_att_244_244.pt'
OPTIMIZED_MODEL = 'model/hand_pose_resnet18_att_244_244.trt'

device = torch.device('cuda')


In [3]:

# Read the hand-pose category description; its 'keypoints' and 'skeleton'
# entries determine the size of the network heads below.
with open('preprocess/hand_pose.json', 'r') as f:
    hand_pose = json.load(f)

num_parts, num_links = len(hand_pose['keypoints']), len(hand_pose['skeleton'])
topology = trt_pose.coco.coco_category_to_topology(hand_pose)

# Build the network (second argument is twice the number of skeleton links),
# move it to the GPU, switch to eval mode, then restore the trained weights.
model = trt_pose.models.resnet18_baseline_att(num_parts, 2 * num_links)
model = model.cuda().eval()
model.load_state_dict(torch.load(MODEL_WEIGHTS))

# Example input handed to torch2trt for the conversion.
data = torch.zeros(batch_size, 3, HEIGHT, WIDTH).cuda()

# Convert with FP16 enabled and a 1 << 25 byte (32 MiB) workspace limit,
# then persist the optimized module's state dict.
model_trt = torch2trt.torch2trt(
    model,
    [data],
    fp16_mode=True,
    max_workspace_size=1 << 25,
)
torch.save(model_trt.state_dict(), OPTIMIZED_MODEL)

# Reload the saved state dict into a fresh TRTModule; this is the object the
# TensorRT benchmark cell uses.
model_trt = TRTModule()
model_trt.load_state_dict(torch.load(OPTIMIZED_MODEL))




<All keys matched successfully>

In [4]:
# The benchmarking code in this cell was provided by NVIDIA.
# `time` and `numpy` are already imported in the first cell; presumably they
# are repeated here so the cell stays self-contained.
import time
import numpy as np

# Per the PyTorch documentation, setting `cudnn.benchmark = True` makes cuDNN
# benchmark multiple convolution algorithms and select the fastest one.
import torch.backends.cudnn as cudnn
cudnn.benchmark = True


def benchmark(model, device="cuda", input_shape=None, dtype='fp32', nwarmup=50, nruns=100):
    """Time forward passes of ``model`` on random input and print the mean latency.

    Args:
        model: Callable taking a single tensor (e.g. an ``nn.Module`` or a
            ``TRTModule``). It must already live on ``device``.
        device: Device the random input is moved to (string or
            ``torch.device``). CUDA synchronization is only performed when
            this is a CUDA device, so CPU runs work without a GPU.
        input_shape: Shape of the random input tensor. ``None`` (the default)
            resolves at call time to ``(batch_size, 3, HEIGHT, WIDTH)`` from
            the notebook's configuration cell, so edits to those globals are
            picked up without re-running this definition.
        dtype: ``'fp16'`` casts the input to half precision; any other value
            leaves the float32 input unchanged.
        nwarmup: Number of untimed warm-up forward passes.
        nruns: Number of timed forward passes.

    Returns:
        None. Progress is printed every 10 timed iterations, followed by the
        overall average batch time in milliseconds.
    """
    if input_shape is None:
        input_shape = (batch_size, 3, HEIGHT, WIDTH)

    # torch.cuda.synchronize() raises on machines without CUDA, so only call
    # it when the benchmark actually targets a CUDA device.
    on_cuda = torch.device(device).type == "cuda"

    input_data = torch.randn(input_shape).to(device)
    if dtype == 'fp16':
        input_data = input_data.half()

    print("Warm up ...")
    with torch.no_grad():
        for _ in range(nwarmup):
            model(input_data)
    if on_cuda:
        # Drain queued warm-up kernels so they do not leak into the timings.
        torch.cuda.synchronize()

    print("Start timing ...")
    timings = []
    with torch.no_grad():
        for i in range(1, nruns + 1):
            # perf_counter is a monotonic, high-resolution clock intended for
            # measuring short intervals.
            start_time = time.perf_counter()
            model(input_data)
            if on_cuda:
                # CUDA kernels launch asynchronously; wait for completion
                # before stopping the clock.
                torch.cuda.synchronize()
            timings.append(time.perf_counter() - start_time)
            if i % 10 == 0:
                print('Iteration %d/%d, ave batch time %.2f ms' %
                      (i, nruns, np.mean(timings)*1000))

    print('Average batch time: %.2f ms' % (np.mean(timings)*1000))


In [5]:
# CPU baseline: move the PyTorch model to the CPU and time it there.
model = model.to("cpu")
benchmark(model, device="cpu")


Warm up ...
Start timing ...
Iteration 10/100, ave batch time 115.80 ms
Iteration 20/100, ave batch time 116.92 ms
Iteration 30/100, ave batch time 118.53 ms
Iteration 40/100, ave batch time 119.83 ms
Iteration 50/100, ave batch time 119.63 ms
Iteration 60/100, ave batch time 118.68 ms
Iteration 70/100, ave batch time 117.36 ms
Iteration 80/100, ave batch time 116.79 ms
Iteration 90/100, ave batch time 116.66 ms
Iteration 100/100, ave batch time 116.28 ms
Average batch time: 116.28 ms


In [6]:
# CUDA benchmark: move the PyTorch model back to the GPU (the CPU benchmark
# cell left it on the CPU) and time it there.
model = model.to("cuda")
benchmark(model, device="cuda")


Warm up ...
Start timing ...
Iteration 10/100, ave batch time 5.65 ms
Iteration 20/100, ave batch time 5.74 ms
Iteration 30/100, ave batch time 5.74 ms
Iteration 40/100, ave batch time 5.80 ms
Iteration 50/100, ave batch time 5.74 ms
Iteration 60/100, ave batch time 5.73 ms
Iteration 70/100, ave batch time 5.70 ms
Iteration 80/100, ave batch time 5.70 ms
Iteration 90/100, ave batch time 5.67 ms
Iteration 100/100, ave batch time 5.65 ms
Average batch time: 5.65 ms


In [7]:
# TensorRT-optimized model benchmark (the TRTModule reloaded from OPTIMIZED_MODEL).
benchmark(model_trt, device="cuda")


Warm up ...
Start timing ...
Iteration 10/100, ave batch time 1.08 ms
Iteration 20/100, ave batch time 1.10 ms
Iteration 30/100, ave batch time 1.09 ms
Iteration 40/100, ave batch time 1.09 ms
Iteration 50/100, ave batch time 1.09 ms
Iteration 60/100, ave batch time 1.09 ms
Iteration 70/100, ave batch time 1.11 ms
Iteration 80/100, ave batch time 1.10 ms
Iteration 90/100, ave batch time 1.10 ms
Iteration 100/100, ave batch time 1.09 ms
Average batch time: 1.09 ms
