tensorRT in C++ (#47)

* init tensorRT * add makefiles * fix cmake * fix typo * profile * add a slow implementation of gauss smooth * fix cmake * Paf cpp (#61) * WIP * install libopencv-dev * remove python * optimize select_peak * cleanup header * use channel first * build in docker * optimize smooth * fix path * optimize resize_area * download-testdata.sh * fix * optimize max pool * optimize * draw results * pre allocate temp memory * remove unused files * move folder * remove duplicated files * rm unused file * flatten folder * mv * remove unused file * rm * fix cmake * rm * flatten folder * cleanup constants * simplify * fix yaml * create a docker image for building with CUDA * fix * fix missing dependencies * fix * add a cli tool * cleanup * add tf-runner interface * finish OpenposeRunnerImpl * export base model * use in-tree build * fix build * use CHANNEL FIRST! * cleanup debug log * support batch * update * cleanup * merge master * cleanup * update tests * remove debug code * remove bazel * remove debug code * rm * rm * fix test * fix test
tensorlayer · Sep 20, 2018 · ae2686d · ae2686d
1 parent 02888df
commit ae2686d
Show file tree

Hide file tree

Showing 41 changed files with 1,966 additions and 25 deletions.
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1 @@
+docker/*.deb
diff --git a/.gitignore b/.gitignore
@@ -1,12 +1,18 @@
 __pycache__
 .idea
+*.a
 *.avi
+*.dylib
+*.gz
 *.jpg
 *.log
+*.out
 *.png
 *.uff
 /3rdparty
+/bazel-*
 /checkpoints
+/cmake-build
 /coco
 /data
 /models

diff --git a/.travis.yml b/.travis.yml
@@ -1,22 +1,30 @@
 # https://docs.travis-ci.com/user/languages/python/
 language: python
 
+# https://docs.travis-ci.com/user/docker/
+services:
+- docker
+
 python:
-  - '3.6'
-  - '2.7'
+- '3.6'
+- '2.7'
 
 sudo: required
 
 before_install:
-- sudo apt install -y swig
+- docker info
+- sudo apt install -y swig # FIXME: can't install libopencv-dev on travis CI
 - pip install -r requirements.txt
 - pip install pycocotools  # must be installed after cython is installed
 - pip install git+https://github.com/tensorlayer/tensorlayer.git  # TODO: create a latest release for TL
 - ./scripts/install-pafprocess.sh
 - ./scripts/download-test-data.sh
 
 script:
+- make docker-build
 - python ./test_inference.py --path-to-npz='' --images=$(ls data/media/*.jpg | sort | head -n 3 | tr '\n' ',') --base-model=vgg
 - python ./test_inference.py --path-to-npz='' --images=$(ls data/media/*.jpg | sort | head -n 3 | tr '\n' ',') --base-model=vggtiny
 - python ./test_inference.py --path-to-npz='' --images=$(ls data/media/*.jpg | sort | head -n 3 | tr '\n' ',') --base-model=mobilenet
+# TODO: reenable when data_format is supported in BN
+# - python ./test_inference.py --path-to-npz='' --images=$(ls data/media/*.jpg | sort | head -n 3 | tr '\n' ',') --base-model=hao28_experimental
 # TODO: add move tests
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -0,0 +1,14 @@
+# Top-level CMake entry point for the openpose-plus C++ targets.
+cmake_minimum_required(VERSION 3.5)
+project(openpose-plus)
+
+set(CMAKE_CXX_STANDARD 11)
+
+# Targets that are always configured.
+include(src/build.cmake)
+
+# GPU targets are added only when HAVE_CUDA is present in the environment
+# (its value is not inspected).
+if(DEFINED ENV{HAVE_CUDA})
+    include(src/build-gpu.cmake)
+endif()
diff --git a/Makefile b/Makefile
@@ -0,0 +1,36 @@
+# Top-level build driver: configures and builds the C++ targets with CMake,
+# and builds the CPU/GPU builder Docker images.
+#
+# Build settings (BUILD_DIR, NPROC, CMAKE_FLAGS) come from the file named by
+# MAKEFILE, which defaults to Makefile.config.
+MAKEFILE ?= Makefile.config
+include $(MAKEFILE)
+
+# None of these targets name a file they produce; declare them phony so a
+# stray file with the same name cannot make them look up to date.
+.PHONY: default cmake_targets build_with_cmake docker-build docker-build-gpu
+
+default: build_with_cmake
+# default: docker-build-gpu
+
+# Generate the build system in $(BUILD_DIR). `&&` (instead of `;`) ensures
+# cmake never runs in the wrong directory if the cd fails.
+cmake_targets:
+	mkdir -p "$(BUILD_DIR)"
+	cd "$(BUILD_DIR)" && cmake $(CMAKE_FLAGS) "$(CURDIR)"
+
+# Compile everything. $(MAKE) rather than a literal `make` so the sub-make
+# inherits the parent's flags (-n, -k, ...).
+build_with_cmake: cmake_targets
+	$(MAKE) -C "$(BUILD_DIR)" -j $(NPROC)
+
+# Image that builds the CPU-only targets.
+CPU_TAG = openpose-plus:builder
+docker-build:
+	docker build --rm -t $(CPU_TAG) -f docker/Dockerfile.builder-cpu .
+
+# Image that builds the GPU targets; its Dockerfile is based on the
+# tensorrt:snapshot image.
+GPU_TAG = openpose-plus:builder-gpu
+docker-build-gpu:
+	docker build --rm -t $(GPU_TAG) -f docker/Dockerfile.builder-gpu .
diff --git a/Makefile.config b/Makefile.config
@@ -0,0 +1,21 @@
+# Build settings shared by the top-level Makefile (included via $(MAKEFILE)).
+
+# Host OS name as printed by `uname`; computed once instead of on every use.
+UNAME := $(shell uname)
+
+# CMake build tree, kept separate per host OS.
+BUILD_DIR := $(CURDIR)/cmake-build/$(UNAME)
+
+# Parallel job count passed to `make -j`. The assignments are indented with
+# spaces, not a tab: a tab-indented line that follows a rule is read by make
+# as a recipe line rather than as an assignment.
+ifeq ($(UNAME),Darwin)
+  NPROC := $(shell sysctl -n hw.ncpu)
+else
+  NPROC := $(shell nproc)
+endif
+
+# Flags for the cmake configure step.
+CMAKE_FLAGS := \
+    -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \
+    -DCMAKE_BUILD_TYPE=Release
diff --git a/docker/.gitignore b/docker/.gitignore
@@ -0,0 +1 @@
+*.deb
diff --git a/docker/Dockerfile.builder-cpu b/docker/Dockerfile.builder-cpu
@@ -0,0 +1,14 @@
+# Builder image (CPU only): compiles the C++ targets and downloads the network-outputs archive.
+FROM ubuntu:xenial
+
+# make and curl are listed explicitly because later steps invoke them;
+# apt-get is used because apt's command-line interface is not meant for scripts.
+RUN apt-get update && \
+    apt-get install -y g++ make cmake curl libopencv-dev libgflags-dev
+COPY . /openpose-plus
+WORKDIR /openpose-plus
+RUN make build_with_cmake
+# -f makes curl fail on an HTTP error instead of saving the error page.
+RUN curl -fsSLOJ https://github.com/tensorlayer/fast-openpose/files/2378505/network-outputs.gz && \
+    gzip -d network-outputs.gz && \
+    tar -xf network-outputs
diff --git a/docker/Dockerfile.builder-gpu b/docker/Dockerfile.builder-gpu
@@ -0,0 +1,14 @@
+# Builder image (GPU): compiles all C++ targets, including the CUDA/TensorRT one.
+FROM tensorrt:snapshot
+
+# libgflags-dev is required: the runner executables link against gflags.
+# The package index is refreshed in the same layer as the install so a cached
+# base image cannot leave it stale.
+RUN apt-get update && \
+    apt-get install -y g++ make cmake libopencv-dev libgflags-dev
+COPY . /openpose-plus
+WORKDIR /openpose-plus
+
+# The top-level CMakeLists.txt adds the GPU targets when HAVE_CUDA is defined.
+ENV HAVE_CUDA=1
+RUN make build_with_cmake
diff --git a/docker/Dockerfile.tensorrt b/docker/Dockerfile.tensorrt
@@ -0,0 +1,33 @@
+# Base image with CUDA 9.0 and TensorRT development packages on Ubuntu 16.04.
+# The TensorRT repo package (RT_REPO) must be present in the build context.
+FROM ubuntu:xenial
+
+ARG NVIDIA_CUDA_PREFIX=http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64
+ARG NVIDIA_ML_PREFIX=http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64
+
+ARG CUDA_REPO=cuda-repo-ubuntu1604_9.0.176-1_amd64.deb
+ARG ML_REPO=nvidia-machine-learning-repo-ubuntu1604_1.0.0-1_amd64.deb
+ARG RT_REPO=nv-tensorrt-repo-ubuntu1604-cuda9.0-ga-trt4.0.1.6-20180612_1-1_amd64.deb
+
+COPY sources.list.ustc /etc/apt/sources.list
+# The signing key lives next to the CUDA repo, so reuse NVIDIA_CUDA_PREFIX.
+RUN apt-get update && apt-get install -y curl && \
+    apt-key adv --fetch-keys ${NVIDIA_CUDA_PREFIX}/7fa2af80.pub
+
+# -f makes curl fail on an HTTP error instead of saving the error page.
+RUN curl -fsSLOJ ${NVIDIA_CUDA_PREFIX}/${CUDA_REPO} && \
+    curl -fsSLOJ ${NVIDIA_ML_PREFIX}/${ML_REPO} && \
+    dpkg -i ${CUDA_REPO} && \
+    dpkg -i ${ML_REPO}
+
+# Register the local TensorRT repo, refresh the index and install from it in
+# one layer, so the install never runs against a stale cached index.
+COPY ${RT_REPO} /tmp/
+RUN dpkg -i /tmp/${RT_REPO} && \
+    apt-get update && \
+    apt-get install -y \
+        libnvinfer-dev=4.1.2-1+cuda9.0 \
+        cuda-cudart-dev-9-0=9.0.176-1 \
+        cuda-libraries-dev-9-0
+
+RUN apt-get update && apt-get install -y g++ cmake libopencv-dev
diff --git a/docker/Makefile b/docker/Makefile
@@ -0,0 +1,22 @@
+# Builds and runs the TensorRT base image defined in Dockerfile.tensorrt.
+TAG = tensorrt:snapshot
+
+TENSORRT_REPO_DEB = nv-tensorrt-repo-ubuntu1604-cuda9.0-ga-trt4.0.1.6-20180612_1-1_amd64.deb
+
+# Directory the manually downloaded TensorRT repo package is copied from.
+DOWNLOADS_DIR ?= $(HOME)/Downloads
+
+.PHONY: tensorrt-image run
+
+# RT_REPO is passed explicitly so the Dockerfile uses the same package file
+# name as this Makefile.
+tensorrt-image: $(TENSORRT_REPO_DEB)
+	docker build --rm -t $(TAG) --build-arg RT_REPO=$(TENSORRT_REPO_DEB) -f Dockerfile.tensorrt .
+
+# Stage the package in the build context.
+$(TENSORRT_REPO_DEB):
+	cp "$(DOWNLOADS_DIR)/$@" $@
+
+# Open an interactive shell in the image.
+run:
+	docker run --rm -it $(TAG) bash
diff --git a/docker/sources.list.ustc b/docker/sources.list.ustc
@@ -0,0 +1,15 @@
+deb http://mirrors.ustc.edu.cn/ubuntu/ xenial main restricted
+
+deb http://mirrors.ustc.edu.cn/ubuntu/ xenial-updates main restricted
+
+deb http://mirrors.ustc.edu.cn/ubuntu/ xenial universe
+deb http://mirrors.ustc.edu.cn/ubuntu/ xenial-updates universe
+
+deb http://mirrors.ustc.edu.cn/ubuntu/ xenial multiverse
+deb http://mirrors.ustc.edu.cn/ubuntu/ xenial-updates multiverse
+
+deb http://mirrors.ustc.edu.cn/ubuntu/ xenial-backports main restricted universe multiverse
+
+deb http://mirrors.ustc.edu.cn/ubuntu xenial-security main restricted
+deb http://mirrors.ustc.edu.cn/ubuntu xenial-security universe
+deb http://mirrors.ustc.edu.cn/ubuntu xenial-security multiverse
diff --git a/scripts/freeze-graph.sh b/scripts/freeze-graph.sh
@@ -31,7 +31,7 @@ GRAPH_FILE=${CHECKPOINT_DIR}/graph.pb.txt
 CHECKPOINT=${CHECKPOINT_DIR}/saved_checkpoint-0
 OUTPUT_GRAPH=${CHECKPOINT_DIR}/freezed
 
-OUTPUT_NODE_NAMES=image,upsample_size,upsample_heatmat,tensor_peaks,upsample_pafmat
+OUTPUT_NODE_NAMES=image,outputs/conf,outputs/paf
 
 freeze() {
     python3 ${FREEZE_GRAPH_BIN} \
@@ -41,11 +41,11 @@ freeze() {
         --output_node_names ${OUTPUT_NODE_NAMES}
 }
 
-# BASE_MODEL=vgg
-# PATH_TO_NPZ=${HOME}/Downloads/vgg450000.npz
+BASE_MODEL=vgg
+PATH_TO_NPZ=${HOME}/Downloads/vgg450000_no_cpm.npz
 
-BASE_MODEL=mobilenet
-PATH_TO_NPZ=${HOME}/Downloads/mbn28000.npz
+# BASE_MODEL=mobilenet
+# PATH_TO_NPZ=${HOME}/Downloads/mbn28000.npz
 
 measure ./export.py --base-model=${BASE_MODEL} --full=True --path-to-npz=${PATH_TO_NPZ}
 measure freeze
diff --git a/scripts/profile.sh b/scripts/profile.sh
@@ -17,13 +17,12 @@ measure() {
 cd $(dirname $0)/..
 
 export PYTHONUNBUFFERED=1
+# DATA_DIR=$(pwd)/data/media
 
 MODEL_DIR=${HOME}/Downloads
-DATA_DIR=${HOME}/Downloads/new-tests
-IMAGES=$(ls ${DATA_DIR}/*.png | sort | tr '\n' ',')
+DATA_DIR=$HOME/var/data/openpose
 
-# DATA_DIR=$(pwd)/data/media
-# IMAGES=$(ls ${DATA_DIR}/*.jpg | sort | tr '\n' ',')
+IMAGES=$(ls ${DATA_DIR}/examples/media/*.png | sort | tr '\n' ',')
 
 LIMIT=100
 
@@ -54,10 +53,11 @@ profile_model() {
 }
 
 mkdir -p logs
-measure profile_model vggtiny new-models/hao18/pose350000.npz NHWC
+# measure profile_model vggtiny new-models/hao18/pose350000.npz NHWC
 # measure profile_model mobilenet mbn280000.npz NHWC
 # measure profile_model vgg vgg450000_no_cpm.npz NHWC
 # measure profile_model vgg vgg450000_no_cpm.npz NCHW # npz shape, is the same, but inference doesn't work yet
 # measure profile_model hao28_experimental hao28/pose345000.npz NHWC
 
+# measure profile_model hao28_experimental hao28/pose345000.npz NHWC
 measure profile_model hao28_experimental hao28/pose345000.npz NCHW
diff --git a/scripts/run-uff-cpp.sh b/scripts/run-uff-cpp.sh
@@ -0,0 +1,24 @@
+#!/bin/sh
+# Build the C++ targets with GPU support and run uff-runner_main on sample images.
+set -e
+
+# The make invocation and the binary path below are relative to the repository
+# root, so switch there regardless of where the script is called from.
+cd "$(dirname "$0")/.."
+
+export HAVE_CUDA=1
+
+make
+
+D=$HOME/var/data/openpose/126
+
+# IMAGE=$HOME/Downloads/new-tests/cam0_27.png
+IMAGES=$D/cam2_3938.png,$D/cam1_2386.png
+
+# MODEL_FILE=$HOME/Downloads/vgg.uff
+MODEL_FILE=$HOME/Downloads/vggtiny.uff
+
+# Quoted so paths containing spaces reach the binary as single arguments.
+"./cmake-build/$(uname -s)/uff-runner_main" \
+    --model_file="${MODEL_FILE}" \
+    --image_files="${IMAGES}"
diff --git a/scripts/run-uff.sh → scripts/run-uff-py.sh b/scripts/run-uff.sh → scripts/run-uff-py.sh
@@ -2,7 +2,7 @@
 set -e
 
 MODEL_DIR=$HOME/Downloads
-DATA_DIR=$HOME/Downloads/new-tests
+DATA_DIR=$HOME/var/data/openpose
 
 # cam0_27.png
 # cam0_59.png
@@ -41,13 +41,13 @@ test_hao28_model() {
 }
 
 # test_vgg_model \
-#     ./data/media/COCO_val2014_000000000192.jpg \
-#     ${DATA_DIR}/cam0_27.png
+#     ${DATA_DIR}/examples/media/COCO_val2014_000000000192.jpg \
+#     ${DATA_DIR}/new-tests/cam0_27.png
 
-test_vggtiny_model \
-    ./data/media/COCO_val2014_000000000192.jpg \
-    ${DATA_DIR}/cam0_27.png
+# test_vggtiny_model \
+#     ${DATA_DIR}/examples/media/COCO_val2014_000000000192.jpg \
+#     ${DATA_DIR}/new-tests/cam0_27.png
 
-# test_hao28_model \
-#     ./data/media/COCO_val2014_000000000192.jpg \
-#     ${DATA_DIR}/cam0_27.png
+test_hao28_model \
+    ${DATA_DIR}/examples/media/COCO_val2014_000000000192.jpg \
+    ${DATA_DIR}/new-tests/cam0_27.png
diff --git a/src/.gitignore b/src/.gitignore
@@ -0,0 +1 @@
+/pafprocess
diff --git a/src/build-gpu.cmake b/src/build-gpu.cmake
@@ -0,0 +1,20 @@
+# GPU targets; included by the top-level CMakeLists.txt when HAVE_CUDA is set.
+
+# Location of the CUDA target tree. Override with -DCUDA_TARGET_DIR=... for a
+# different CUDA version or install prefix; the default is the previously
+# hard-coded CUDA 9.0 path.
+IF(NOT DEFINED CUDA_TARGET_DIR)
+    SET(CUDA_TARGET_DIR /usr/local/cuda-9.0/targets/x86_64-linux)
+ENDIF()
+
+# FIXME: use TARGET_LINK_DIRECTORIES and TARGET_INCLUDE_DIRECTORIES
+LINK_DIRECTORIES(${CUDA_TARGET_DIR}/lib)
+INCLUDE_DIRECTORIES(${CUDA_TARGET_DIR}/include
+                    ${CUDA_TARGET_DIR}/include/crt)
+
+ADD_EXECUTABLE(uff-runner_main
+               ${CMAKE_CURRENT_LIST_DIR}/uff-runner.cpp
+               ${CMAKE_CURRENT_LIST_DIR}/tracer.cpp
+               ${CMAKE_CURRENT_LIST_DIR}/uff-runner_main.cpp
+               ${CMAKE_CURRENT_LIST_DIR}/cuda_buffer.cpp)
+TARGET_LINK_LIBRARIES(uff-runner_main input_image paf vis gflags nvinfer cudart nvparsers)
diff --git a/src/build.cmake b/src/build.cmake
@@ -0,0 +1,26 @@
+# Libraries shared by the runner executables, plus the fake-runner executable.
+
+# Package names are case-sensitive: OpenCV ships OpenCVConfig.cmake, which a
+# lookup for "opencv" cannot match on a case-sensitive file system.
+FIND_PACKAGE(OpenCV)
+FIND_PACKAGE(gflags)
+
+
+ADD_LIBRARY(input_image ${CMAKE_CURRENT_LIST_DIR}/input.cpp)
+TARGET_LINK_LIBRARIES(input_image opencv_core opencv_imgproc opencv_highgui)
+
+ADD_LIBRARY(paf
+            ${CMAKE_CURRENT_LIST_DIR}/paf.cpp
+            ${CMAKE_CURRENT_LIST_DIR}/post-process.cpp)
+TARGET_LINK_LIBRARIES(paf opencv_core opencv_imgproc opencv_highgui)
+
+ADD_LIBRARY(vis ${CMAKE_CURRENT_LIST_DIR}/vis.cpp)
+TARGET_LINK_LIBRARIES(vis opencv_core opencv_imgproc opencv_highgui)
+
+# Built from uff-runner_main.cpp and fake_uff-runner.cpp; unlike the GPU
+# runner it links no CUDA or TensorRT libraries.
+ADD_EXECUTABLE(fake-runner
+               ${CMAKE_CURRENT_LIST_DIR}/fake_uff-runner.cpp
+               ${CMAKE_CURRENT_LIST_DIR}/tracer.cpp
+               ${CMAKE_CURRENT_LIST_DIR}/uff-runner_main.cpp)
+TARGET_LINK_LIBRARIES(fake-runner input_image paf vis gflags)
diff --git a/src/coco.h b/src/coco.h
@@ -0,0 +1,68 @@
+#pragma once
+#include <utility>
+#include <vector>
+
+// COCO_N_PAIRS matches the number of rows in each table below; the part
+// indices used in COCOPAIRS range over 0 .. COCO_N_PARTS - 1.
+constexpr int COCO_N_PARTS = 18;
+constexpr int COCO_N_PAIRS = 19;
+
+typedef std::pair<int, int> idx_pair_t;
+typedef std::vector<idx_pair_t> coco_pair_list_t;
+
+// One pair of consecutive indices per row, covering 0..37 exactly once.
+// NOTE(review): presumably the two network output channels belonging to the
+// pair in the same row of COCOPAIRS -- confirm against the code using it.
+// NOTE(review): the trailing numbers look like an alternative ordering index
+// shared by both tables -- confirm.
+const coco_pair_list_t COCOPAIRS_NET = {
+    {12, 13},  // 6
+    {20, 21},  // 10
+    {14, 15},  // 7
+    {16, 17},  // 8
+    {22, 23},  // 11
+    {24, 25},  // 12
+    {0, 1},    // 0
+    {2, 3},    // 1
+    {4, 5},    // 2
+    {6, 7},    // 3
+    {8, 9},    // 4
+    {10, 11},  // 5
+    {28, 29},  // 14
+    {30, 31},  // 15
+    {34, 35},  // 17
+    {32, 33},  // 16
+    {36, 37},  // 18
+    {18, 19},  // 9
+    {26, 27},  // 13
+};
+
+// Pairs of part indices, one row per pair. The last two rows are marked `*`;
+// they are the ones is_virtual_pair() reports as virtual.
+const coco_pair_list_t COCOPAIRS = {
+    {1, 2},    // 6
+    {1, 5},    // 10
+    {2, 3},    // 7
+    {3, 4},    // 8
+    {5, 6},    // 11
+    {6, 7},    // 12
+    {1, 8},    // 0
+    {8, 9},    // 1
+    {9, 10},   // 2
+    {1, 11},   // 3
+    {11, 12},  // 4
+    {12, 13},  // 5
+    {1, 0},    // 14
+    {0, 14},   // 15
+    {14, 16},  // 17
+    {0, 15},   // 16
+    {15, 17},  // 18
+    {2, 16},   // * 9
+    {5, 17},   // * 13
+};
+
+// Returns true for row indices above 16, i.e. rows 17 and 18 of the tables.
+inline bool is_virtual_pair(int pair_id)
+{
+    constexpr int last_regular_pair = 16;
+    return last_regular_pair < pair_id;
+}