From 7d384f7ec548dd09225b2feac23ef0eb92167fde Mon Sep 17 00:00:00 2001 From: ganler Date: Fri, 20 Nov 2020 03:38:43 +0800 Subject: [PATCH 1/4] feat: enable pifpaf inference support --- cmake/hyperpose.cmake | 5 + ...ator_api_batched_images_pifpaf.example.cpp | 77 ++ include/hyperpose/hyperpose.hpp | 1 + include/hyperpose/operator/parser/pifpaf.hpp | 23 + src/pifpaf.cpp | 161 +++ src/pifpaf_decoder/aiapp.hpp | 116 ++ src/pifpaf_decoder/image_based.hpp | 140 ++ src/pifpaf_decoder/math_helpers.cpp | 94 ++ src/pifpaf_decoder/math_helpers.hpp | 21 + src/pifpaf_decoder/object_detection.hpp | 48 + .../openpifpaf_postprocessor.cpp | 1162 +++++++++++++++++ .../openpifpaf_postprocessor.hpp | 188 +++ 12 files changed, 2036 insertions(+) create mode 100644 examples/operator_api_batched_images_pifpaf.example.cpp create mode 100644 include/hyperpose/operator/parser/pifpaf.hpp create mode 100644 src/pifpaf.cpp create mode 100644 src/pifpaf_decoder/aiapp.hpp create mode 100644 src/pifpaf_decoder/image_based.hpp create mode 100644 src/pifpaf_decoder/math_helpers.cpp create mode 100644 src/pifpaf_decoder/math_helpers.hpp create mode 100644 src/pifpaf_decoder/object_detection.hpp create mode 100644 src/pifpaf_decoder/openpifpaf_postprocessor.cpp create mode 100644 src/pifpaf_decoder/openpifpaf_postprocessor.hpp diff --git a/cmake/hyperpose.cmake b/cmake/hyperpose.cmake index 6ebb96b5..28131965 100644 --- a/cmake/hyperpose.cmake +++ b/cmake/hyperpose.cmake @@ -5,10 +5,15 @@ set(POSE_LIB_NAME hyperpose) INCLUDE(cmake/cuda.cmake) FIND_PACKAGE(OpenCV REQUIRED) +FILE(GLOB PIFPAF_DECODER + src/pifpaf_decoder/*.cpp) + ADD_LIBRARY( ${POSE_LIB_NAME} # SHARED src/logging.cpp src/tensorrt.cpp + src/pifpaf.cpp + ${PIFPAF_DECODER} src/paf.cpp src/data.cpp src/stream.cpp diff --git a/examples/operator_api_batched_images_pifpaf.example.cpp b/examples/operator_api_batched_images_pifpaf.example.cpp new file mode 100644 index 00000000..a187f1bb --- /dev/null +++ 
b/examples/operator_api_batched_images_pifpaf.example.cpp @@ -0,0 +1,77 @@ +#include "utils.hpp" +#include +#include +#include + +// Model flags +DEFINE_string(model_file, "../data/models/openpifpaf-resnet50.onnx", "Path to the model."); + +DEFINE_bool(logging, false, "Print the logging information or not."); +DEFINE_int32(input_height, 640, "Height of input image."); +DEFINE_int32(input_width, 427, "Width of input image."); + +DEFINE_string(input_folder, "../data/media", "Folder of images to inference."); + +int main(int argc, char** argv) +{ + gflags::ParseCommandLineFlags(&argc, &argv, true); + + // * Collect data into batch. + std::vector batch = glob_images(FLAGS_input_folder); + + if (batch.empty()) { + example_log() << "No input images got. Exiting.\n"; + exit(-1); + } + + example_log() << "Batch shape: [" << batch.size() << ", 3, " << FLAGS_input_height << ", " << FLAGS_input_width << "]\n"; + + // * Create TensorRT engine. + namespace hp = hyperpose; + if (FLAGS_logging) + hp::enable_logging(); + + auto engine = [&] { + using namespace hp::dnn; + constexpr std::string_view onnx_suffix = ".onnx"; + constexpr std::string_view uff_suffix = ".uff"; + + if (std::equal(onnx_suffix.crbegin(), onnx_suffix.crend(), FLAGS_model_file.crbegin())) + return tensorrt(onnx{ FLAGS_model_file }, { FLAGS_input_width, FLAGS_input_height }, batch.size()); + + example_log() << "Your model file's suffix is not [.onnx | .uff]. Your model file path: " << FLAGS_model_file; + example_log() << "Trying to be viewed as a serialized TensorRT model."; + + return tensorrt(tensorrt_serialized{ FLAGS_model_file }, { FLAGS_input_width, FLAGS_input_height }, batch.size()); + }(); + + hp::parser::pifpaf parser{}; + + using clk_t = std::chrono::high_resolution_clock; + auto beg = clk_t::now(); + { + // * TensorRT Inference. 
+ auto feature_map_packets = engine.inference(batch); + for (const auto& packet : feature_map_packets) + for (const auto& feature_map : packet) + example_log() << feature_map << std::endl; + + // * Paf. + std::vector> pose_vectors; + pose_vectors.reserve(feature_map_packets.size()); + for (auto&& packet : feature_map_packets) { + pose_vectors.push_back(parser.process(packet[0], packet[1])); + } + + std::cout << batch.size() << " images got processed. FPS = " + << 1000. * batch.size() / std::chrono::duration(clk_t::now() - beg).count() + << '\n'; + + for (size_t i = 0; i < batch.size(); ++i) { + cv::resize(batch[i], batch[i], { FLAGS_input_width, FLAGS_input_height }); + for (auto&& pose : pose_vectors[i]) + hp::draw_human(batch[i], pose); + cv::imwrite("output_" + std::to_string(i) + ".png", batch[i]); + } + } +} \ No newline at end of file diff --git a/include/hyperpose/hyperpose.hpp b/include/hyperpose/hyperpose.hpp index a851c758..b9df8fcb 100644 --- a/include/hyperpose/hyperpose.hpp +++ b/include/hyperpose/hyperpose.hpp @@ -9,6 +9,7 @@ #include "operator/dnn/tensorrt.hpp" #include "operator/parser/paf.hpp" +#include "operator/parser/pifpaf.hpp" #include "operator/parser/proposal_network.hpp" #include "stream/stream.hpp" \ No newline at end of file diff --git a/include/hyperpose/operator/parser/pifpaf.hpp b/include/hyperpose/operator/parser/pifpaf.hpp new file mode 100644 index 00000000..5da7990c --- /dev/null +++ b/include/hyperpose/operator/parser/pifpaf.hpp @@ -0,0 +1,23 @@ +#pragma once + +#include "paf.hpp" +#include "../../utility/data.hpp" + +namespace hyperpose::parser { + +class pifpaf{ +public: + explicit pifpaf() = default; + std::vector process(const feature_map_t& pif, const feature_map_t& paf); + template + std::vector process(C&& feature_map_containers) + { + // 1@pif, 2@paf. 
+ assert(feature_map_containers.size() == 2); + return process(feature_map_containers[0], feature_map_containers[1]); + } +private: + float m_keypoint_thresh = 0.001f; +}; + +} // namespace hyperpose \ No newline at end of file diff --git a/src/pifpaf.cpp b/src/pifpaf.cpp new file mode 100644 index 00000000..ca4b71f6 --- /dev/null +++ b/src/pifpaf.cpp @@ -0,0 +1,161 @@ +#include +#include "pifpaf_decoder/openpifpaf_postprocessor.hpp" + +namespace hyperpose::parser { + +// TODO: Name ORDER! +std::vector pifpaf::process(const feature_map_t& paf, const feature_map_t& pif) { + // Helpful links (Chinese): + // https://zhuanlan.zhihu.com/p/93896207 + // https://zhuanlan.zhihu.com/p/68073113 + // pif: [17, 5, h, w] => KEY POINTS; + // 5: [conf, dx, dy, b, scale] + // Example: array([ 0.00527313, 0.13620843, -0.32253477, 0.3263721 , 0.90980804], dtype=float32) + // heat map: f(x, y) = \sum_ij conf * N(x, y|ij) + // paf: [19, 9, h, w] => LIMBS; + // 9: [conf, [x1, y1, x2, y2], [b1, b2], [s1, s2]] + // Example: [ 0.00712654, -0.54057586, 5.4075847 , 3.0354404 , 3.1246614 , 1.0621283 , -3.5857565 , 2.6072054 , 3.8406293 ], + // TODO: OPTIMIZE THIS. 
+ + lpdnn::aiapp_impl::OpenPifPafPostprocessor pp; + size_t h = pif.shape()[pif.shape().size() - 2]; + size_t w = pif.shape().back(); + std::vector pif_conf, pif_xy, pif_s, paf_conf, paf_xy1, paf_xy2, paf_b1, paf_b2; + + const auto tensor_sharding_to_vector = [](const feature_map_t& tensor, std::vector& vec, size_t dim2) { + size_t d0 = tensor.shape()[0]; + size_t d1 = tensor.shape()[1]; + size_t h = tensor.shape()[2]; + size_t w = tensor.shape()[3]; + for (int i = 0; i < d0; ++i) { + for (int j = 0; j < h; ++j) { + for (int k = 0; k < w; ++k) { + vec.push_back(tensor.view()[ + i * d1 * w * h + + dim2 * h * w + + j * w + + k + ]); + } + } + } + }; + + const auto tensor_sharding_to_offset_vector = [](const feature_map_t& tensor, std::vector& vec, size_t dimx, size_t dimy) { + size_t d0 = tensor.shape()[0]; + size_t d1 = tensor.shape()[1]; + size_t h = tensor.shape()[2]; + size_t w = tensor.shape()[3]; + for (int i = 0; i < d0; ++i) { + // X first & Then Y + for (int j = 0; j < h; ++j) { + for (int k = 0; k < w; ++k) { + vec.push_back(tensor.view()[ + i * d1 * w * h + + dimx * h * w + + j * w + + k + ]); + } + } + + for (int j = 0; j < h; ++j) { + for (int k = 0; k < w; ++k) { + vec.push_back(tensor.view()[ + i * d1 * w * h + + dimy * h * w + + j * w + + k + ]); + } + } + } + }; + + pif_conf.reserve(17 * h * w); + tensor_sharding_to_vector(pif, pif_conf, 0); + + pif_xy.reserve(17 * 2 * h * w); + tensor_sharding_to_offset_vector(pif, pif_xy, 1, 2); + + pif_s.reserve(17 * h * w); + tensor_sharding_to_vector(pif, pif_s, 4); + + // [19, 9, h, w] -> [conf, p1, p2, b1, b2, ...] 
+ paf_conf.reserve(19 * h * w); + tensor_sharding_to_vector(paf, paf_conf, 0); + + paf_xy1.reserve(2 * 19 * h * w); + tensor_sharding_to_offset_vector(paf, paf_xy1, 1, 2); + + paf_xy2.reserve(2 * 19 * h * w); + tensor_sharding_to_offset_vector(paf, paf_xy2, 3, 4); + + paf_b1.reserve(19 * h * w); + tensor_sharding_to_vector(paf, paf_b1, 5); + + paf_b2.reserve(19 * h * w); + tensor_sharding_to_vector(paf, paf_b2, 6); + + // TODO: RECOVER THE INP{W, H}; + auto apires = pp.postprocess_0_8(640, 427, w, h, + pif_conf.data(), pif_xy.data(), pif_s.data(), + paf_conf.data(), paf_xy1.data(), paf_xy2.data(), paf_b1.data(), paf_b2.data()); + +// std::cout << "Check pif[0]\t" << pif.view()[0] << std::endl; + std::vector ret{}; + ret.reserve(apires.items.size()); +// std::cout << apires.items.size() << "...size\n"; + + /* + * + OpenPifPaf COCO Topology: https://miro.medium.com/max/366/0*KFrFQVj3OoGAtt6o.png +HyperPose: Unified Topology + * + */ + + for (auto&& item : apires.items) { + if (item.landmarks.points.empty()) + continue; + human_t man{}; + man.score = item.confidence; + + auto p2p = [this](const auto& src, auto& dst) { + if (src.confidence > 0.) { + dst.score = 1;// src.confidence; FIXME + dst.x = src.position.x / 10000.; + dst.y = src.position.y / 10000.; +// std::cout << dst.x << ' ' << dst.y << '\n'; + dst.has_value = true; + } + }; + + auto& from = item.landmarks.points; + auto& to = man.parts; + // OpenPifPaf -> HyperPose + p2p(from[0], to[0]); + // ! 
to [1] + constexpr std::array from_index = { + 6, 8, 10, 5, 7, 9, + 12, 14, 16, 11, 13, 15, + 2, 1, 4, 3 + }; + + for (size_t i = 0; i < from_index.size(); ++i) { + p2p(from[from_index[i]], to[i+2]); + } + + if (to[2].has_value && to[5].has_value) { + to[1].x = (to[2].x + to[5].x) / 2;; + to[1].y = (to[2].y + to[5].y) / 2;; + to[1].has_value = true; + to[1].score = (to[2].score + to[5].score) / 2; + } + + ret.push_back(man); + } + + return ret; +} + +} // namespace hyperpose \ No newline at end of file diff --git a/src/pifpaf_decoder/aiapp.hpp b/src/pifpaf_decoder/aiapp.hpp new file mode 100644 index 00000000..1beb0774 --- /dev/null +++ b/src/pifpaf_decoder/aiapp.hpp @@ -0,0 +1,116 @@ +/// +/// Ai-app base interface and types +/// +/// \copyright 2018 NVISO SA. All rights reserved. +/// \license This project is released under the XXXXXX License. +/// + +#pragma once + +#include +#include +#include +#include + +namespace lpdnn { +namespace ai_app { + +/// Aiapp Blob +/// This could be improved to allow referring to existing data +/// thus avoding unneeded data-copy, for example by using shared_ptr. +struct Blob { + /// Data dimensions. Mandatory if the blob represents a tensor. + std::vector dim; + + /// Data. Mandatory if the blob represents a tensor. + std::vector data; + + /// Optional raw representation. + std::vector raw; + + /// Optional CBOR representation when data is structured. + std::vector cbor; + + /// Optional additional information + /// (eg, description of internal representation: "NCHW,8bits,dp3"). + std::string info; +}; + +/// AI-App interface +class Aiapp { + public: + virtual ~Aiapp() {} + + /// @return the ai-class id for this aiapp + virtual const char* class_id() const = 0; + + /// @return the implementation id for this aiapp + virtual const char* impl_id() const = 0; + + /// Initialization options + /// \param cfg: configuration string, typically in JSON format. 
+ /// \return: true if success + virtual bool init(const std::string& cfg) = 0; + + /// Set runtime options for the specified component + /// \param opt: runtime options, typically in JSON format. + /// \param name: subcomponent name + /// \return: true if success + virtual bool set_options(const std::string& opt, + const std::string& name = "") = 0; + + /// Introspection methods + /// \{ + + /// \return: names of all direct subcomponents of the specified component + virtual std::vector components( + const std::string& name = "") const = 0; + + /// \return output(s) of the specified component + virtual std::vector output(const std::string& name = "") const = 0; + + /// \return metrics of the specified component and all its subcomponents + virtual std::string metrics(const std::string& name = "") const = 0; + + /// set end-of-execution at the end of the specified component + /// if name is empty any exit-point previously set is removed + virtual bool set_exit_after(const std::string& name = "") = 0; + + /// \} +}; + +/// AiApp standard processing components +/// Each ai-app can contain other sub-components. +/// Each subcomponent can be identified by a pathname, for example: +/// "preprocessing.normalize" +/// "inference.net1.conv23" +struct Component { + /// Standard component names. Their use is not mandatory but + /// allows an ai-app to be supported by existing tools. + static constexpr char const* preprocessing = "preprocessing"; + static constexpr char const* inference = "inference"; + static constexpr char const* postprocessing = "postprocessing"; + + /// Ai-app interface parameters + static constexpr char const* interface = "interface"; + + /// Name separator in a component pathname string. 
+ /// Component names can't contain the separator except possibly for the leafs + static constexpr char separator = '.'; + + /// Concatenate component names in a component pathname + static std::string join(const std::string& path, const std::string& comp) { + return path + separator + comp; + } +}; + +/// AiApp Metrics +struct Metrics { + /// Standard metrics. All timings are in microseconds. + static constexpr char const* init_time = "init_time"; + static constexpr char const* inference_time = "inference_time"; + static constexpr char const* inference_cpu_time = "inference_cpu_time"; +}; + +} // namespace ai_app +} // namespace lpdnn diff --git a/src/pifpaf_decoder/image_based.hpp b/src/pifpaf_decoder/image_based.hpp new file mode 100644 index 00000000..938cedaa --- /dev/null +++ b/src/pifpaf_decoder/image_based.hpp @@ -0,0 +1,140 @@ +/// +/// Ai-app interface and types for image-based ai-apps +/// +/// \copyright 2018 NVISO SA. All rights reserved. +/// \license This project is released under the XXXXXX License. +/// + +#pragma once + +#include "aiapp.hpp" + +namespace lpdnn::ai_app { + +/// 2-dimensional size +struct Dim2d { + int x; + int y; +}; + +/// Rectangle +struct Rect { + Dim2d origin; + Dim2d size; + + [[nodiscard]] bool empty() const { return size.x <= 0 || size.y <= 0; } +}; + +/// Landmarks +struct Landmark { + Dim2d position; + float confidence; /// Negative value if N/A +}; + +struct Landmarks { + /// Landmark specification identifier + std::string type; + /// Landmark points + std::vector points; +}; + +/// Image representation. +/// The data of a RAW image consists of *y scanlines of *x pixels, +/// with each pixel consisting of N interleaved 8-bit components; the first +/// pixel pointed to is top-left-most in the image. There is no padding between +/// image scanlines or between pixels, regardless of format. The number of +/// components N is 3 for RGB images, 4 for RGBA, 1 for grayscale. 
+/// Support for 8bits RGB format is MANDATORY for all image-processing AiApps. +/// An image can be constructed from a std::vector, or a std::string +/// or raw data pointer and size. When passing rvalues vector or strings, the +/// image will take ownership of the data, otherwise will just keep reference. +class Image { + protected: + /// Contains image data if we have ownership of it + std::vector _image_content; + + public: + /// Image format + enum class Format { + raw_grayscale = 1, /// 8bits grayscale + raw_rgb8 = 3, /// 8bits RGB *MANDATORY* + raw_rgba8 = 4, /// 8bits RGBA + + encoded = 256, /// Standard JPEG/BMP/PNG/TIFF format + + custom = 512 /// Custom format. Use attributes field for more details. + }; + + /// Don't take data ownership. + /// img_dim parameter can be omitted in case of encoded images since + /// this information will be extracted from the image content itself. + Image(Format img_format, const std::vector& data, Dim2d img_dim = {}) + : Image(img_format, data.data(), data.size(), img_dim) {} + + /// Take data ownership + Image(Format img_format, std::vector&& data, Dim2d img_dim = {}) + : _image_content(std::move(data)), + format{img_format}, + dim(img_dim), + data{_image_content.data()}, + data_size{_image_content.size()} {} + + /// Don't take data ownership. + Image(Format img_format, const std::string& data, Dim2d img_dim = {}) + : Image(img_format, (uint8_t*)data.c_str(), data.size(), img_dim) {} + + /// Take data ownership + Image(Format img_format, std::string&& data, Dim2d img_dim = {}) + : Image(img_format, + std::vector((uint8_t*)data.c_str(), + (uint8_t*)data.c_str() + data.size()), + img_dim) { + data.clear(); + } + + /// Don't take data ownership + /// img_data_size is mandatory in case of encoded images. 
+ Image(Format img_format, const uint8_t* img_data, size_t img_data_size, + Dim2d img_dim = {}) + : format{img_format}, + dim(img_dim), + data{img_data}, + data_size{img_data_size} {} + + /// Utility factory methods + static Image encoded(const std::vector& data) { + return Image(Format::encoded, data); + } + + /// Image format + Format format; + + /// Image dimensions (for raw images) + Dim2d dim; + + /// Region of interest inside the image (all if empty) + Rect roi{}; + + /// Custom attributes. + /// This is ai-app specific and allows to specify custom data formats. + std::string attributes; + + /// Pointer to image data (no ownership of the data). + const uint8_t* data; + + /// Size of image data. Mandatory for encoded images. + size_t data_size; + + /// Additional optional information about the image. + /// May be required by some aiapps. + Landmarks landmarks; +}; + +/// Abstract image-based AiApp +class Image_based : virtual public Aiapp { + public: + /// @return supported image formats (ordered by preference) + [[nodiscard]] virtual std::vector image_formats() const = 0; +}; + +} // namespace lpdnn diff --git a/src/pifpaf_decoder/math_helpers.cpp b/src/pifpaf_decoder/math_helpers.cpp new file mode 100644 index 00000000..a571780a --- /dev/null +++ b/src/pifpaf_decoder/math_helpers.cpp @@ -0,0 +1,94 @@ + +#include "math_helpers.hpp" +#include + +#ifdef __APPLE__ +#define MATH_HELPERS_ACCELERATE 1 +#else +#define MATH_HELPERS_ACCELERATE 0 +#endif + +#if MATH_HELPERS_ACCELERATE +#include +#else +#include +#endif + +void vfill(float*x, unsigned long n, float v) { +#if MATH_HELPERS_ACCELERATE + vDSP_vfill(&v, x, 1, n); +#else + // Slow version + for (unsigned long i = 0; i < n; ++i) { + x[i] = v; + } +#endif +} + +void vadd(const float *a, const float *b, float *c, unsigned long n) { +#if MATH_HELPERS_ACCELERATE + vDSP_vadd(a, 1, b, 1, c, 1, n); +#else + // Slow version + for (unsigned long i = 0; i < n; ++i) { + c[i] = a[i] + b[i]; + } +#endif +} + +void 
vexp(float *x, unsigned long n) { +#if MATH_HELPERS_ACCELERATE + int n_ = (int)n; + vvexpf(x, x, &n_); +#else + // Slow version + for (unsigned long i = 0; i < n; ++i) { + x[i] = std::exp(x[i]); + } +#endif +} + +void vmul(const float *a, const float *b, float *c, unsigned long n) { +#if MATH_HELPERS_ACCELERATE + vDSP_vmul(a, 1, b, 1, c, 1, n); +#else + // Slow version + for (unsigned long i = 0; i < n; ++i) { + c[i] = a[i] * b[i]; + } +#endif +} + +void vsmul(const float *a, float b, float *c, unsigned long n) { +#if MATH_HELPERS_ACCELERATE + vDSP_vsmul(a, 1, &b, c, 1, n); +#else + // Slow version + for (unsigned long i = 0; i < n; ++i) { + c[i] = a[i] * b; + } +#endif +} + +float vargmax(const float *x, unsigned long n, int* i) { + assert(n > 0); +#if MATH_HELPERS_ACCELERATE + float maxValue = 0.0f; + vDSP_Length maxIndex = 0; + vDSP_maxvi(x, 1, &maxValue, &maxIndex, n); + *i = (int)maxIndex; + return maxValue; +#else + // Slow version + float maxValue = x[0]; + unsigned long maxIndex = 0; + for (unsigned long i = 1; i < n; ++i) { + if (x[i] > maxValue) { + maxValue = x[i]; + maxIndex = i; + } + } + *i = (int)maxIndex; + return maxValue; +#endif +} diff --git a/src/pifpaf_decoder/math_helpers.hpp b/src/pifpaf_decoder/math_helpers.hpp new file mode 100644 index 00000000..d187bc51 --- /dev/null +++ b/src/pifpaf_decoder/math_helpers.hpp @@ -0,0 +1,21 @@ +#pragma once + +// x[i] = v +void vfill(float*x, unsigned long n, float v); + +// c[i] = a[i] + b[i] +void vadd(const float *a, const float *b, float *c, unsigned long n); + +// x[i] = exp(x[i]) +void vexp(float *x, unsigned long n); + +// c[i] = a[i] * b[i] +void vmul(const float *a, const float *b, float *c, unsigned long n); + +// c[i] = a[i] * b +void vsmul(const float *a, float b, float *c, unsigned long n); + +// out = max(x) +// i = argmax(x) +float vargmax(const float *x, unsigned long n, int* i); + diff --git a/src/pifpaf_decoder/object_detection.hpp b/src/pifpaf_decoder/object_detection.hpp new file mode 
100644 index 00000000..7a7bc673 --- /dev/null +++ b/src/pifpaf_decoder/object_detection.hpp @@ -0,0 +1,48 @@ +/// +/// Ai-app interface for object detection +/// +/// \copyright 2018 NVISO SA. All rights reserved. +/// \license This project is released under the XXXXXX License. +/// + +#pragma once + +#include "image_based.hpp" + +namespace lpdnn::ai_app { + +/// Object detection AiApp +class Object_detection : virtual public Image_based { + public: + struct Result { + struct Item { + float confidence{}; + int class_index{}; + Rect bounding_box{}; + Landmarks landmarks; + }; + + bool success{}; + std::vector items; + }; + + /// Set minimum detectable object size + /// @return true if success + virtual bool set_min_size(Dim2d minSize) = 0; + + /// Set maximum detectable object size + /// @return true if success + virtual bool set_max_size(Dim2d maxSize) = 0; + + /// Perform inference. + virtual Result execute(const Image& input) = 0; + + /// @return Names of classes + virtual std::vector classes() = 0; + + /// @return our aiapp class id + [[nodiscard]] const char* class_id() const override { return ai_class_id; } + static constexpr char const* ai_class_id = "com_bonseyes::object_detection"; +}; + +} // namespace lpdnn diff --git a/src/pifpaf_decoder/openpifpaf_postprocessor.cpp b/src/pifpaf_decoder/openpifpaf_postprocessor.cpp new file mode 100644 index 00000000..acf536fd --- /dev/null +++ b/src/pifpaf_decoder/openpifpaf_postprocessor.cpp @@ -0,0 +1,1162 @@ +#include +#include +#include +#include +#include +#include +#include "openpifpaf_postprocessor.hpp" +#include "math_helpers.hpp" + +namespace lpdnn::aiapp_impl { + +const int OpenPifPafPostprocessor::bones[19][2] = { + {16, 14}, {14, 12}, {17, 15}, {15, 13}, {12, 13}, { 6, 12}, { 7, 13}, + { 6, 7}, { 6, 8}, { 7, 9}, { 8, 10}, { 9, 11}, { 2, 3}, { 1, 2}, + { 1, 3}, { 2, 4}, { 3, 5}, { 4, 6}, { 5, 7}, +}; + +constexpr int C = 17; +constexpr float stride = 8.0f; +constexpr float seedThreshold = 0.2f; +constexpr 
float keypointThreshold = 0.001f; +constexpr float instanceThreshold = 0.2f; + +/* + Creates a (2, h, w) tensor where the first part is: + 0, 1, 2, 3, ..., w-1, + 0, 1, 2, 3, ..., w-1, + 0, 1, 2, 3, ..., w-1, + ... + and the second part is: + 0, 0, 0, 0, ..., 0, + 1, 1, 1, 1, ..., 1, + 2, 2, 2, 2, ..., 2, + ... + Used for normaling the PIFs and PAFs. +*/ +static std::vector makeIndexField(int h, int w) { + std::vector indexField(2 * h * w); + float* ptr = indexField.data(); + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + ptr[ y *w + x] = (float)x; + ptr[(y + h)*w + x] = (float)y; + } + } + return indexField; +} + +static void scalarSquareAddConstant(float* field, + int fieldH, + int fieldW, + const std::vector& x, + const std::vector& y, + const std::vector& width, + const std::vector& v) +{ + // minx_np = np.round(x_np - width_np).astype(np.int) + // minx_np = np.clip(minx_np, 0, field.shape[1] - 1) + std::vector minx(x.size()); + for (size_t i = 0; i < x.size(); ++i) { + minx[i] = std::min(fieldW - 1, std::max(0, (int)std::round(x[i] - width[i]))); + } + + // miny_np = np.round(y_np - width_np).astype(np.int) + // miny_np = np.clip(miny_np, 0, field.shape[0] - 1) + std::vector miny(y.size()); + for (size_t i = 0; i < y.size(); ++i) { + miny[i] = std::min(fieldH - 1, std::max(0, (int)std::round(y[i] - width[i]))); + } + + // maxx_np = np.round(x_np + width_np).astype(np.int) + // maxx_np = np.clip(maxx_np + 1, minx_np + 1, field.shape[1]) + std::vector maxx(x.size()); + for (size_t i = 0; i < x.size(); ++i) { + maxx[i] = std::min(fieldW, std::max(minx[i] + 1, (int)std::round(x[i] + width[i]) + 1)); + } + + // maxy_np = np.round(y_np + width_np).astype(np.int) + // maxy_np = np.clip(maxy_np + 1, miny_np + 1, field.shape[0]) + std::vector maxy(y.size()); + for (size_t i = 0; i < y.size(); ++i) { + maxy[i] = std::min(fieldH, std::max(miny[i] + 1, (int)std::round(y[i] + width[i]) + 1)); + } + + // for i in range(minx.shape[0]): + // for xx in 
range(minx[i], maxx[i]): + // for yy in range(miny[i], maxy[i]): + // field[yy, xx] += v[i] + for (size_t i = 0; i < minx.size(); ++i) { + for (int yy = miny[i]; yy < maxy[i]; ++yy) { + for (int xx = minx[i]; xx < maxx[i]; ++xx) { + field[yy * fieldW + xx] += v[i]; + } + } + } +} + +static void scalarSquareAddGauss(float* field, + int fieldH, + int fieldW, + const std::vector& x, + const std::vector& y, + const std::vector& sigma_, + const std::vector& v, + float truncate = 2.0f) +{ + // sigma_np = np.maximum(1.0, sigma_np) + // width_np = np.maximum(1.0, truncate * sigma_np) + auto sigma = sigma_; + std::vector width(sigma.size()); + for (size_t i = 0; i < sigma.size(); ++i) { + sigma[i] = std::max(1.0f, sigma[i]); + width[i] = std::max(1.0f, truncate * sigma[i]); + } + + // NOTE: The minx, miny, maxx, maxxy code is the same as in scalarSquareAddConstant(). + // Could probably extract that and do it just once. + + // minx_np = np.round(x_np - width_np).astype(np.int) + // minx_np = np.clip(minx_np, 0, field.shape[1] - 1) + std::vector minx(x.size()); + for (size_t i = 0; i < x.size(); ++i) { + minx[i] = std::min(fieldW - 1, std::max(0, (int)std::round(x[i] - width[i]))); + } + + // miny_np = np.round(y_np - width_np).astype(np.int) + // miny_np = np.clip(miny_np, 0, field.shape[0] - 1) + std::vector miny(y.size()); + for (size_t i = 0; i < y.size(); ++i) { + miny[i] = std::min(fieldH - 1, std::max(0, (int)std::round(y[i] - width[i]))); + } + + // maxx_np = np.round(x_np + width_np).astype(np.int) + // maxx_np = np.clip(maxx_np + 1, minx_np + 1, field.shape[1]) + std::vector maxx(x.size()); + for (size_t i = 0; i < x.size(); ++i) { + maxx[i] = std::min(fieldW, std::max(minx[i] + 1, (int)std::round(x[i] + width[i]) + 1)); + } + + // maxy_np = np.round(y_np + width_np).astype(np.int) + // maxy_np = np.clip(maxy_np + 1, miny_np + 1, field.shape[0]) + std::vector maxy(y.size()); + for (size_t i = 0; i < y.size(); ++i) { + maxy[i] = std::min(fieldH, std::max(miny[i] + 
1, (int)std::round(y[i] + width[i]) + 1)); + } + + // for i in range(minx.shape[0]): + // for xx in range(minx[i], maxx[i]): + // deltax = xx - x[i] + // for yy in range(miny[i], maxy[i]): + // deltay = yy - y[i] + // vv = v[i] * np.exp(-0.5 * (deltax**2 + deltay**2) / sigma[i]**2) + // field[yy, xx] += vv + for (size_t i = 0; i < minx.size(); ++i) { + for (int xx = minx[i]; xx < maxx[i]; ++xx) { + float deltax = (float)xx - x[i]; + for (int yy = miny[i]; yy < maxy[i]; ++yy) { + float deltay = (float)yy - y[i]; + float vv = v[i] * std::exp(-0.5f * (deltax*deltax + deltay*deltay) / (sigma[i]*sigma[i])); + field[yy * fieldW + xx] += vv; + } + } + } + + /* + // For debugging + for (int y = 0; y < fieldH; ++y) { + for (int x = 0; x = fieldW) { return; } + if (miny >= fieldH) { return; } + + // field[miny:maxy, minx:maxx] += value + for (auto yy = miny; yy < maxy; ++yy) { + for (auto xx = minx; xx < maxx; ++xx) { + field[yy * fieldW + xx] += value; + } + } +} + +/** + Combines the different PAF outputs into one big (19, 2, 4, h, w) tensor. + + The input tensors have the shape (19, h, w) except for j1/j2Fields, which + are (38, h, w). +*/ +void OpenPifPafPostprocessor::normalizePAF(const float* intensityFields, + const float* j1Fields, + const float* j2Fields, + const float* j1FieldsLogb, + const float* j2FieldsLogb) +{ + float* pafPtr = paf.data(); + + // Strides for the first dimension of the input tensors: + const size_t if_stride_0 = H * W; + const size_t j1f_stride_0 = H * W; + const size_t j1bf_stride_0 = H * W; + const size_t j2f_stride_0 = H * W; + const size_t j2bf_stride_0 = H * W; + + for (int i = 0; i < 19; ++i) { + // Copy the next h*w values from intensityFields. + size_t ifOffset = i * if_stride_0; + size_t outOffset = i * paf_stride_0; + memcpy(pafPtr + outOffset, intensityFields + ifOffset, H * W * sizeof(float)); + + // Copy the next 2 h*w values from j1Fields. 
+ size_t j1fOffset = (i * 2) * j1f_stride_0; + outOffset += paf_stride_2; + memcpy(pafPtr + outOffset, j1Fields + j1fOffset, 2 * H * W * sizeof(float)); + + // Also add the index field to the values from j1Fields. + vadd(indexField.data(), j1Fields + j1fOffset, pafPtr + outOffset, 2 * H * W); + + // Copy the next h*w values from j1FieldsLogb and exponentiate. + size_t j1bfOffset = i * j1bf_stride_0; + outOffset += paf_stride_2 * 2; + memcpy(pafPtr + outOffset, j1FieldsLogb + j1bfOffset, H * W * sizeof(float)); + vexp(pafPtr + outOffset, H * W); + + // Copy the same h*w values from intensityFields again. + outOffset = i * paf_stride_0 + paf_stride_1; + memcpy(pafPtr + outOffset, intensityFields + ifOffset, H * W * sizeof(float)); + + // Copy the next 2 h*w values from j2Fields. + size_t j2fOffset = (i * 2) * j2f_stride_0; + outOffset += paf_stride_2; + memcpy(pafPtr + outOffset, j2Fields + j2fOffset, 2 * H * W * sizeof(float)); + + // Also add the index field to the values from j2Fields. + vadd(indexField.data(), j2Fields + j2fOffset, pafPtr + outOffset, 2 * H * W); + + // Copy the next h*w values from j2FieldsLogb and exponentiate. + size_t j2bfOffset = i * j2bf_stride_0; + outOffset += paf_stride_2 * 2; + memcpy(pafPtr + outOffset, j2FieldsLogb + j2bfOffset, H * W * sizeof(float)); + vexp(pafPtr + outOffset, H * W); + } + + // NOTE: We could do the exponentiation for j1/j2FieldsLogb in the Core ML + // model already. + + /* + // For debugging + for (int y = 0; y < H; ++y) { + printf("%d: ", y); + for (int x = 0; x < W; ++x) { + printf("%f, ", paf[9*paf_stride_0 + 2*paf_stride_1 + 7*paf_stride_2 + y*W + x]); + } + printf("\n"); + } + */ +} + +/** + Combines the different PIF outputs into one big (17, 4, h, w) tensor. + + The input tensors have the shape (17, h, w) except for jointFields, which + is (34, h, w). 
+*/ +void OpenPifPafPostprocessor::normalizePIF(const float* jointIntensityFields, + const float* jointFields, + const float* scaleFields) +{ + float* pifPtr = pif.data(); + + // Strides for the first dimension of the input tensors: + const size_t iif_stride_0 = H * W; + const size_t jf_stride_0 = H * W; + const size_t sf_stride_0 = H * W; + + // The PyTorch code concatenates the following tensors: + // (17, 1, h, w) + // (17, 2, h, w) + // (17, 1, h, w) + // along the 2nd axis into one tensor of shape (17, 4, h, w). But the + // tensors from Core ML have the following shapes: + // (17, h, w) + // (34, h, w) + // (17, h, w) + // Fortunately, (17, 2, ...) has the same memory layout as (34, ...), + // so we can simply do a bunch of memcpy's. + + for (int i = 0; i < 17; ++i) { + // Copy the next h*w values from jointIntensityFields. + size_t jifOffset = i * iif_stride_0; + size_t outOffset = i * pif_stride_0; + memcpy(pifPtr + outOffset, jointIntensityFields + jifOffset, H * W * sizeof(float)); + + // Copy the next 2 h*w values from jointFields. + size_t jfOffset = (i * 2) * jf_stride_0; + outOffset += pif_stride_1; + memcpy(pifPtr + outOffset, jointFields + jfOffset, 2 * H * W * sizeof(float)); + + // Also add the index field to the values from jointFields. + vadd(indexField.data(), jointFields + jfOffset, pifPtr + outOffset, 2 * H * W); + + // Copy the next h*w values from scaleFields. + size_t sfOffset = i * sf_stride_0; + outOffset += pif_stride_1 * 2; + memcpy(pifPtr + outOffset, scaleFields + sfOffset, H * W * sizeof(float)); + } +} + +OpenPifPafPostprocessor::Target_intensity +OpenPifPafPostprocessor::targetIntensities(const std::vector& pif, + float v_th, bool coreOnly) +{ + const float pif_nn = 16.0f; + + const size_t targets_stride_0 = H_hr * W_hr; + const size_t scales_stride_0 = H_hr * W_hr; + const size_t ns_stride_0 = H_hr * W_hr; + + // These tensors need to be emptied out on each frame. 
+ vfill(targetsCoreOnly.data(), targetsCoreOnly.size(), 0.0f); + vfill(targets.data(), targets.size(), 0.0f); + vfill(scales.data(), scales.size(), 0.0f); + vfill(ns.data(), ns.size(), 0.0f); + + std::vector v; + std::vector x; + std::vector y; + std::vector s; + + for (int i = 0; i < C; ++i) { + // Threshold pif[i, ...], which is a (4, h, w) tensor. Copy the values + // that are over the threshold into four vectors: v, x, y, s. Multiply + // x, y, s with the stride. + // + // v, x, y, s = p[:, p[0] > v_th] + // x = x * self.stride + // y = y * self.stride + // s = s * self.stride + v.clear(); + x.clear(); + y.clear(); + s.clear(); + const size_t pifOffset = i * pif_stride_0; + const size_t xOffset = pifOffset + pif_stride_1; + const size_t yOffset = xOffset + pif_stride_1; + const size_t sOffset = yOffset + pif_stride_1; + for (int j = 0; j < H*W; ++j) { + float p = pif[pifOffset + j]; + if (p > v_th) { + v.push_back(p); + x.push_back(pif[xOffset + j] * stride); + y.push_back(pif[yOffset + j] * stride); + s.push_back(pif[sOffset + j] * stride); + } + } + + /* + // For debugging + printf("iteration: %d\n", i); + printf("v:\n"); for (auto n : v) printf("%f, ", n); printf("\n"); + printf("x:\n"); for (auto n : x) printf("%f, ", n); printf("\n"); + printf("y:\n"); for (auto n : y) printf("%f, ", n); printf("\n"); + printf("s:\n"); for (auto n : s) printf("%f, ", n); printf("\n"); + */ + + // Create a high-resolution confidence map for this keypoint. + + // v / pif_nn + std::vector v_over_pif_nn(v.size()); + vsmul(v.data(), 1.0f / pif_nn, v_over_pif_nn.data(), v.size()); + + // The original code computed the "core only" version in a separate step + // but that duplicates a bunch of work, so we do it at the same time. 
+ const auto tco = targetsCoreOnly.data() + i * targets_stride_0; + scalarSquareAddGauss(tco, H_hr, W_hr, x, y, s, v_over_pif_nn, 0.5); + + // s * v + std::vector s_times_v(v.size()); + vmul(s.data(), v.data(), s_times_v.data(), v.size()); + + const auto t = targets.data() + i * targets_stride_0; + const auto scale = scales.data() + i * scales_stride_0; + const auto n = ns.data() + i * ns_stride_0; + scalarSquareAddGauss(t, H_hr, W_hr, x, y, s, v_over_pif_nn); + scalarSquareAddConstant(scale, H_hr, W_hr, x, y, s, s_times_v); + scalarSquareAddConstant(n, H_hr, W_hr, x, y, s, v); + } + + // m = ns > 0 + // scales[m] = scales[m] / ns[m] + for (size_t i = 0; i < scales.size(); ++i) { + const auto d = ns[i]; + if (d > 0) { scales[i] /= d; } + } + + return Target_intensity{ targets, scales, targetsCoreOnly }; +} + +OpenPifPafPostprocessor::Paf_target +OpenPifPafPostprocessor::scorePafTarget(const std::vector& pafvec, + const std::vector& pifhr, + float pifhr_floor, + float score_th) const +{ + std::vector> scored_forward; + std::vector> scored_backward; + + for (int c = 0; c < 19; ++c) { + // The PAF has shape (19, 2, 4, h, w). We're looking at one (2, 4, h, w) + // slice at a time in this loop. 
+ const size_t pafOffset = c * paf_stride_0; + + // scores = np.min(fourds[:, 0], axis=0) + // mask = scores > score_th + // scores = scores[mask] + std::vector scores; + std::vector mask; + for (int i = 0; i < H * W; ++i) { + auto a = pafvec[pafOffset + i]; + auto b = pafvec[pafOffset + paf_stride_1 + i]; + auto score = std::min(a, b); + if (score > score_th) { + scores.push_back(score); + mask.push_back(i); + } + } + + // fourds = fourds[:, :, mask] + const size_t scores_size = scores.size(); + std::vector masked(2 * 4 * scores_size); + for (size_t i = 0; i < mask.size(); ++i) { + const auto m = mask[i]; + masked[i ] = pafvec[pafOffset + m]; + masked[i + scores_size ] = pafvec[pafOffset + paf_stride_2 + m]; + masked[i + scores_size*2] = pafvec[pafOffset + paf_stride_2*2 + m]; + masked[i + scores_size*3] = pafvec[pafOffset + paf_stride_2*3 + m]; + masked[i + scores_size*4] = pafvec[pafOffset + paf_stride_1 + m]; + masked[i + scores_size*5] = pafvec[pafOffset + paf_stride_1 + paf_stride_2 + m]; + masked[i + scores_size*6] = pafvec[pafOffset + paf_stride_1 + paf_stride_2*2 + m]; + masked[i + scores_size*7] = pafvec[pafOffset + paf_stride_1 + paf_stride_2*3 + m]; + } + + std::vector scores_b(scores_size); + if (pifhr_floor < 1.0f) { + // ij_b = np.round(fourds[0, 1:3] * self.stride).astype(np.int) + // ij_b[0] = np.clip(ij_b[0], 0, self._pifhr.shape[2] - 1) + // ij_b[1] = np.clip(ij_b[1], 0, self._pifhr.shape[1] - 1) + std::vector ij_b(2 * scores_size); + for (size_t i = 0; i < scores_size*2; ++i) { + const int v = (int)std::round(masked[scores_size + i] * stride); + ij_b[i] = std::min(std::max(0, v), i < scores_size ? 
W_hr - 1 : H_hr - 1); + } + + // pifhr_b = self._pifhr[j1i, ij_b[1], ij_b[0]] + // scores_b = scores * (pifhr_floor + (1.0 - pifhr_floor) * pifhr_b) + const auto j1i = bones[c][0] - 1; + for (size_t i = 0; i < scores_b.size(); ++i) { + const auto pifhr_b = pifhr[j1i * pifhr_stride_0 + ij_b[scores_size + i] * pifhr_stride_1 + ij_b[i]]; + scores_b[i] = scores[i] * (pifhr_floor + (1.0f - pifhr_floor) * pifhr_b); + } + } else { + scores_b = scores; + } + + // mask_b = scores_b > score_th + std::vector mask_b; + for (int i = 0; i < (int)scores_b.size(); ++i) { + if (scores_b[i] > score_th) { mask_b.push_back(i); } + } + + // scored_backward.append(np.concatenate(( + // np.expand_dims(scores_b[mask_b], 0), + // fourds[1, 1:4][:, mask_b], + // fourds[0, 1:4][:, mask_b], + // ))) + const size_t mask_b_size = mask_b.size(); + std::vector result_b(7 * mask_b_size); + for (size_t i = 0; i < mask_b_size; ++i) { + const auto m = mask_b[i]; + result_b[i ] = scores_b[m]; + result_b[i + mask_b_size ] = masked[scores_size*5 + m]; + result_b[i + mask_b_size*2] = masked[scores_size*6 + m]; + result_b[i + mask_b_size*3] = masked[scores_size*7 + m]; + result_b[i + mask_b_size*4] = masked[scores_size + m]; + result_b[i + mask_b_size*5] = masked[scores_size*2 + m]; + result_b[i + mask_b_size*6] = masked[scores_size*3 + m]; + } + scored_backward.push_back(result_b); + + std::vector scores_f(scores_size); + if (pifhr_floor < 1.0f) { + // ij_f = np.round(fourds[1, 1:3] * self.stride).astype(np.int) + // ij_f[0] = np.clip(ij_f[0], 0, self._pifhr.shape[2] - 1) + // ij_f[1] = np.clip(ij_f[1], 0, self._pifhr.shape[1] - 1) + std::vector ij_f(2 * scores_size); + for (size_t i = 0; i < scores_size*2; ++i) { + const int v = (int)std::round(masked[scores_size*5 + i] * stride); + ij_f[i] = std::min(std::max(0, v), i < scores_size ? 
W_hr - 1 : H_hr - 1); + } + + // pifhr_f = self._pifhr[j2i, ij_f[1], ij_f[0]] + // scores_f = scores * (pifhr_floor + (1.0 - pifhr_floor) * pifhr_f) + const auto j2i = bones[c][1] - 1; + for (size_t i = 0; i < scores_f.size(); ++i) { + const auto pifhr_f = pifhr[j2i * pifhr_stride_0 + ij_f[scores_size + i] * pifhr_stride_1 + ij_f[i]]; + scores_f[i] = scores[i] * (pifhr_floor + (1.0f - pifhr_floor) * pifhr_f); + } + } else { + scores_f = scores; + } + + // mask_f = scores_f > score_th + std::vector mask_f; + for (int i = 0; i < (int)scores_b.size(); ++i) { + if (scores_f[i] > score_th) { mask_f.push_back(i); } + } + + // scored_forward.append(np.concatenate(( + // np.expand_dims(scores_f[mask_f], 0), + // fourds[0, 1:4][:, mask_f], + // fourds[1, 1:4][:, mask_f], + // ))) + const size_t mask_f_size = mask_f.size(); + std::vector result_f(7 * mask_f_size); + for (size_t i = 0; i < mask_f_size; ++i) { + const auto m = mask_f[i]; + result_f[i ] = scores_f[m]; + result_f[i + mask_f_size ] = masked[scores_size + m]; + result_f[i + mask_f_size*2] = masked[scores_size*2 + m]; + result_f[i + mask_f_size*3] = masked[scores_size*3 + m]; + result_f[i + mask_f_size*4] = masked[scores_size*5 + m]; + result_f[i + mask_f_size*5] = masked[scores_size*6 + m]; + result_f[i + mask_f_size*6] = masked[scores_size*7 + m]; + } + scored_forward.push_back(result_f); + + /* + // For debugging + printf("iteration: %d\n", c); + printf("scores:\n"); for (auto n : scores) printf("%f, ", n); printf("\n"); + printf("mask:\n"); for (auto n : mask) printf("%d, ", n); printf("\n"); + printf("masked:\n"); for (auto n : masked) printf("%f, ", n); printf("\n"); + printf("scores_b:\n"); for (auto n : scores_b) printf("%f, ", n); printf("\n"); + printf("scores_f:\n"); for (auto n : scores_f) printf("%f, ", n); printf("\n"); + */ + } + return Paf_target{ scored_forward, scored_backward }; +} + +std::vector +OpenPifPafPostprocessor::pifhrSeeds(const std::vector& pifhrScales, + const std::vector& pifhrCore) 
+{ + std::vector seeds; + + for (int field_i = 0; field_i < 17; ++field_i) { + const size_t pifhrScalesOffset = field_i * pifhr_stride_0; + const size_t pifhrCoreOffset = field_i * pifhr_stride_0; + + // candidates = np.concatenate((index_fields, np.expand_dims(f, 0)), 0) + // mask = f > self.seed_threshold + std::vector mask; + for (int i = 0; i < H_hr * W_hr; ++i) { + const auto value = pifhrCore[pifhrCoreOffset + i]; + if (value > seedThreshold) { mask.push_back(i); } + } + + // candidates = np.moveaxis(candidates[:, mask], 0, -1) + // This is a (count, 3) tensor where count is #elements over threshold. + std::vector masked(mask.size() * 3); + for (size_t i = 0; i < mask.size(); ++i) { + const auto m = mask[i]; + masked[i*3 ] = indexField_hr[m]; + masked[i*3 + 1] = indexField_hr[m + H_hr*W_hr]; + masked[i*3 + 2] = pifhrCore[pifhrCoreOffset + m]; + } + + // occupied = np.zeros(s.shape) + std::vector occupied(H_hr * W_hr, 0.0f); + + std::vector sorted(mask.size()); + std::iota(sorted.begin(), sorted.end(), 0); + std::sort(sorted.begin(), sorted.end(), [masked] (int const& a, int const& b) { + return masked[a*3 + 2] > masked[b*3 + 2]; + }); + + // for c in sorted(candidates, key=lambda c: c[2], reverse=True): + for (auto c : sorted) { + const auto c_0 = masked[c*3]; + const auto c_1 = masked[c*3 + 1]; + const auto c_2 = masked[c*3 + 2]; + + // i, j = int(c[0]), int(c[1]) + const auto i = (int)c_0; + const auto j = (int)c_1; + if (occupied[j*W_hr + i] > 0) { continue; } + + // width = max(4, s[j, i]) + const auto s = pifhrScales[pifhrScalesOffset + j * pifhr_stride_1 + i]; + const auto width = std::max(4.0f, s); + + // scalar_square_add_single(occupied, c[0], c[1], width / 2.0, 1.0) + scalarSquareAddSingle(occupied.data(), H_hr, W_hr, c_0, c_1, width / 2.0f, 1.0f); + + // seeds.append((c[2], field_i, c[0] / self.stride, c[1] / self.stride)) + seeds.emplace_back( c_2, field_i, c_0 / stride, c_1 / stride ); + } + } + + // seeds = list(sorted(seeds, reverse=True)) + 
std::sort(seeds.begin(), seeds.end(), [] (const Pifhr_seed& a, const Pifhr_seed& b) { + const auto ca = std::get<0>(a); + const auto cb = std::get<0>(b); + return ca > cb; + }); + + // if len(seeds) > 500: + // if seeds[500][0] > 0.1: + // seeds = [s for s in seeds if s[0] > 0.1] + // else: + // seeds = seeds[:500] + if (seeds.size() > 500) { + seeds.resize(500); + } + return seeds; +} + +std::vector +OpenPifPafPostprocessor::pafCenter(const std::vector& paf_field, + float x, float y, float sigma) +{ + std::vector mask; + const int paf_stride = (int)paf_field.size() / 7; + for (int i = 0; i < paf_stride; ++i) { + const bool take = (paf_field[ paf_stride + i] > x - sigma * paf_field[3*paf_stride + i]) && + (paf_field[ paf_stride + i] < x + sigma * paf_field[3*paf_stride + i]) && + (paf_field[2*paf_stride + i] > y - sigma * paf_field[3*paf_stride + i]) && + (paf_field[2*paf_stride + i] < y + sigma * paf_field[3*paf_stride + i]); + if (take) { mask.push_back(i); } + } + if (mask.empty()) { return {}; } + + const int mask_size = (int)mask.size(); + const int out_stride = mask_size; + std::vector result(7 * mask_size, 0.0f); + for (int j = 0; j < 7; ++j) { + for (int i = 0; i < mask_size; ++i) { + const int m = mask[i]; + result[j*out_stride + i] = paf_field[j*paf_stride + m]; + } + } + return result; +} + +OpenPifPafPostprocessor::Connection +OpenPifPafPostprocessor::growConnection(float x, float y, + const std::vector& paf_field_) +{ + // # source value + // paf_field = paf_center(paf_field, xy[0], xy[1], sigma=2.0) + // if paf_field.shape[1] == 0: + // return 0, 0, 0 + const auto paf_field = pafCenter(paf_field_, x, y, 2.0f); + if (paf_field.empty()) { return Connection{ 0, 0, 0}; } + + // # source distance + // d = np.linalg.norm(np.expand_dims(xy, 1) - paf_field[1:3], axis=0) + // b_source = paf_field[3] * 3.0 + // # combined value and source distance + // v = paf_field[0] + // scores = np.exp(-1.0 * d / b_source) * v # two-tailed cumulative Laplace + const int 
paf_stride = (int)paf_field.size() / 7; + std::vector scores(paf_stride); + for (int i = 0; i < paf_stride; ++i) { + const auto a = x - paf_field[paf_stride + i]; + const auto b = y - paf_field[paf_stride*2 + i]; + const auto d = std::sqrt(a*a + b*b); + const auto b_source = paf_field[paf_stride*3 + i] * 3.0f; + const auto v = paf_field[i]; + scores[i] = std::exp(-d / b_source) * v; + } + + // return self._target_with_maxscore(paf_field[4:7], scores) + int max_i; + const float score = vargmax(scores.data(), scores.size(), &max_i); + return Connection{ paf_field[paf_stride*4 + max_i], paf_field[paf_stride*5 + max_i], score }; +} + +std::vector OpenPifPafPostprocessor::frontier(Annotation& ann) { + std::vector f; + + for (int connection_i = 0; connection_i < numBones; ++connection_i) { + const auto bone = bones[connection_i]; + const auto j1i = bone[0] - 1; + const auto j2i = bone[1] - 1; + if (ann.keypoints[j1i*3 + 2] > 0.0f && ann.keypoints[j2i*3 + 2] == 0.0f) { + f.emplace_back( ann.keypoints[j1i*3 + 2], connection_i, true, j1i, j2i ); + } + } + + for (int connection_i = 0; connection_i < numBones; ++connection_i) { + const auto bone = bones[connection_i]; + const auto j1i = bone[0] - 1; + const auto j2i = bone[1] - 1; + if (ann.keypoints[j2i*3 + 2] > 0.0f && ann.keypoints[j1i*3 + 2] == 0.0f) { + f.emplace_back( ann.keypoints[j2i*3 + 2], connection_i, false, j1i, j2i ); + } + } + + std::sort(f.begin(), f.end(), [] (const frontier_t& a, const frontier_t& b) { + const auto ca = std::get<0>(a); + const auto cb = std::get<0>(b); + return ca > cb; + }); + + return f; +} + +OpenPifPafPostprocessor::frontier_t OpenPifPafPostprocessor::frontierIter(Annotation& ann) { + while (frontierActive) { + // unblocked_frontier = [f for f in self.frontier() + // if (f[1], f[2]) not in block_frontier] + std::vector unblockedFrontier; + for (auto f : frontier(ann)) { + const auto connection_id = std::get<1>(f); + const auto forward = std::get<2>(f); + if 
(blockFrontier.find(std::tuple{ connection_id, forward }) == blockFrontier.end()) { + unblockedFrontier.push_back(f); + } + } + + /* + // For debugging + printf("unblockedFrontier "); + for (auto n : unblockedFrontier) { + printf("(%f, %d, %s, %d, %d), ", std::get<0>(n), std::get<1>(n), + std::get<2>(n) ? "true" : "false", + std::get<3>(n), std::get<4>(n)); + } + printf("\n"); + */ + + // if not unblocked_frontier: + // break + if (unblockedFrontier.empty()) { + frontierActive = false; + break; + } + + // first = unblocked_frontier[0] + // yield first + // block_frontier.add((first[1], first[2])) + const auto first = unblockedFrontier[0]; + const auto connection_id = std::get<1>(first); + const auto forward = std::get<2>(first); + blockFrontier.insert(std::tuple{ connection_id, forward }); + return first; + } + return {}; +} + +void OpenPifPafPostprocessor::grow(Annotation& ann, + const std::vector>& pafForward, + const std::vector>& pafBackward, + float th) +{ + frontierActive = true; + blockFrontier.clear(); + + while (true) { + const auto f = frontierIter(ann); + if (!frontierActive) { return; } + + const auto i = std::get<1>(f); + const auto forward = std::get<2>(f); + const auto j1i = std::get<3>(f); + const auto j2i = std::get<4>(f); + + // For debugging + //printf("grow: %d %s %d %d\n", i, forward ? 
"true" : "false", j1i, j2i); + + float x, y, v; + std::vector directed_paf_field; + std::vector directed_paf_field_reverse; + if (forward) { + x = ann.keypoints[j1i*3 ]; + y = ann.keypoints[j1i*3 + 1]; + v = ann.keypoints[j1i*3 + 2]; + directed_paf_field = pafForward[i]; + directed_paf_field_reverse = pafBackward[i]; + } else { + x = ann.keypoints[j2i*3 ]; + y = ann.keypoints[j2i*3 + 1]; + v = ann.keypoints[j2i*3 + 2]; + directed_paf_field = pafBackward[i]; + directed_paf_field_reverse = pafForward[i]; + } + + const auto t = growConnection(x, y, directed_paf_field); + const auto new_x = std::get<0>(t); + const auto new_y = std::get<1>(t); + auto new_v = std::get<2>(t); + + if (new_v < th) { continue; } + + // reverse match + if (th >= 0.1) { + const auto t1 = growConnection(new_x, new_y, directed_paf_field_reverse); + const auto reverse_x = std::get<0>(t1); + const auto reverse_y = std::get<1>(t1); + const auto reverse_v = std::get<2>(t1); + if (reverse_v < th) { continue; } + if (std::abs(x - reverse_x) + std::abs(y - reverse_y) > 1.0f) { continue; } + } + + new_v = std::sqrt(new_v * v); // geometric mean + + if (forward) { + if (new_v > ann.keypoints[j2i*3 + 2]) { + ann.keypoints[j2i*3 ] = new_x; + ann.keypoints[j2i*3 + 1] = new_y; + ann.keypoints[j2i*3 + 2] = new_v; + } + } else { + if (new_v > ann.keypoints[j1i*3 + 2]) { + ann.keypoints[j1i*3 ] = new_x; + ann.keypoints[j1i*3 + 1] = new_y; + ann.keypoints[j1i*3 + 2] = new_v; + } + } + } +} + +void OpenPifPafPostprocessor::fillJointScales(Annotation& ann, + const std::vector& scales, + int fieldH, + int fieldW, + float hr_scale) +{ + for (int k = 0; k < numKeypoints; ++k) { + const auto x = ann.keypoints[k*3]; + const auto y = ann.keypoints[k*3 + 1]; + const auto v = ann.keypoints[k*3 + 2]; + if (v == 0) { continue; } + + // i = max(0, min(scale_field.shape[1] - 1, int(round(xyv[0] * hr_scale)))) + // j = max(0, min(scale_field.shape[0] - 1, int(round(xyv[1] * hr_scale)))) + const auto i = std::max(0, 
std::min(fieldW - 1, (int)std::round(x * hr_scale))); + const auto j = std::max(0, std::min(fieldH - 1, (int)std::round(y * hr_scale))); + + // self.joint_scales[xyv_i] = scale_field[j, i] / hr_scale + ann.jointScales[k] = scales[k*pifhr_stride_0 + j*pifhr_stride_1 + i] / hr_scale; + } +} + +std::vector +OpenPifPafPostprocessor::decodeAnnotations(const std::vector& pifhr, + const std::vector& pifhrScales, + const std::vector& pifhrCore, + const std::vector>& pafForward, + const std::vector>& pafBackward) +{ + const auto seeds = pifhrSeeds(pifhrScales, pifhrCore); + + // This is a (17, H_hr, W_hr) tensor. + std::vector occupied(17 * H_hr * W_hr, 0.0f); + + std::vector annotations; + for (auto& seed : seeds) { + const auto v = std::get<0>(seed); + const auto f = std::get<1>(seed); + const auto x = std::get<2>(seed); + const auto y = std::get<3>(seed); + + const auto i = std::min(std::max(0, (int)std::round(x * stride)), W_hr - 1); + const auto j = std::min(std::max(0, (int)std::round(y * stride)), H_hr - 1); + if (occupied[f*H_hr*W_hr + j*W_hr + i] > 0.0f) { continue; } + + Annotation ann(f, x, y, v); + grow(ann, pafForward, pafBackward); + fillJointScales(ann, pifhrScales, H_hr, W_hr, stride); + annotations.push_back(ann); + + for (int i = 0; i < numKeypoints; ++i) { + const auto x = ann.keypoints[i*3]; + const auto y = ann.keypoints[i*3 + 1]; + const auto v = ann.keypoints[i*3 + 2]; + if (v == 0) { continue; } + + const auto width = ann.jointScales[i] * stride; + scalarSquareAddSingle(occupied.data() + i*H_hr*W_hr, H_hr, W_hr, + x * stride, y * stride, width / 2.0f, 1.0f); + } + } + return annotations; +} + +std::vector OpenPifPafPostprocessor::softNMS(std::vector& annotations) { + float maxx = 0.0f; + float maxy = 0.0f; + for (auto& ann : annotations) { + for (int k = 0; k < numKeypoints; ++k) { + auto x = ann.keypoints[k*3]; + auto y = ann.keypoints[k*3 + 1]; + if (x > maxx) { maxx = x; } + if (y > maxy) { maxy = y; } + } + } + + const auto h = (int)(maxy + 1); + 
const auto w = (int)(maxx + 1); + std::vector occupied(17 * h * w, 0.0f); + + std::vector sorted(annotations.size()); + std::iota(sorted.begin(), sorted.end(), 0); + std::sort(sorted.begin(), sorted.end(), [annotations] (int const& a, int const& b) { + return annotations[a].score() > annotations[b].score(); + }); + + for (auto a : sorted) { + Annotation& ann = annotations[a]; + for (int k = 0; k < numKeypoints; ++k) { + const auto x = ann.keypoints[k*3 ]; + const auto y = ann.keypoints[k*3 + 1]; + const auto v = ann.keypoints[k*3 + 2]; + if (v == 0) { continue; } + + const auto i = std::min(std::max(0, (int)std::round(x)), w - 1); + const auto j = std::min(std::max(0, (int)std::round(y)), h - 1); + + if (occupied[k*h*w + j*w + i] > 0.0f) { + ann.keypoints[k*3 + 2] = 0.0f; + } else { + scalarSquareAddSingle(occupied.data() + k*h*w, h, w, x, y, ann.jointScales[k], 1.0f); + } + } + } + + std::vector filtered; + for (auto& ann : annotations) { + for (int k = 0; k < numKeypoints; ++k) { + if (ann.keypoints[k*3 + 2] > 0.0f) { + filtered.push_back(ann); + break; + } + } + } + return filtered; + + // Note: The original code sorts here on the score (descending), but + // we sort again later on so it's a bit quicker if we skip that here. 
+} + +void OpenPifPafPostprocessor::initTensors(int tensorWidth, int tensorHeight) { + H = tensorHeight; + W = tensorWidth; + H_hr = H * (int)stride; + W_hr = W * (int)stride; + + paf_stride_2 = H * W; + paf_stride_1 = 4 * paf_stride_2; + paf_stride_0 = 2 * paf_stride_1; + + pif_stride_1 = H * W; + pif_stride_0 = 4 * pif_stride_1; + + pifhr_stride_1 = W_hr; + pifhr_stride_0 = H_hr * pifhr_stride_1; + + indexField = makeIndexField(H, W); + indexField_hr = makeIndexField(H_hr, W_hr); + paf = std::vector(19 * 2 * 4 * H * W); + pif = std::vector(17 * 4 * H * W); + + const int shape = C * H_hr * W_hr; + targetsCoreOnly = std::vector(shape); + targets = std::vector(shape); + scales = std::vector(shape); + ns = std::vector(shape); +} + +ai_app::Object_detection::Result OpenPifPafPostprocessor::postprocess_0_8( + int inputWidth, int inputHeight, + int tensorWidth, int tensorHeight, + const float* pif_c, + const float* pif_r, + const float* pif_s, + const float* paf_c, + const float* paf_r1, + const float* paf_r2, + const float* paf_b1, + const float* paf_b2) +{ + this->inputWidth = inputWidth; + this->inputHeight = inputHeight; + + // Allocate the intermediate tensors the first time or when the size changes. + if (W != tensorWidth || H != tensorHeight) { + initTensors(tensorWidth, tensorHeight); + } + + normalizePAF(paf_c, paf_r1, paf_r2, paf_b1, paf_b2); + normalizePIF(pif_c, pif_r, pif_s); + + const auto ti = targetIntensities(pif); + const auto pifhr = std::get<0>(ti); + const auto pifhrScales = std::get<1>(ti); + const auto pifhrCore = std::get<2>(ti); + + /* + // For debugging + for (int c = 0; c < 17; ++c) { + for (int y = 0; y < H_hr; ++y) { + for (int x = 0; x < W_hr; ++x) { + printf("%f, ", pifhrCore[c*136*248 + y*248 + x]); + } + } + printf("\n"); + } + */ + + // This returns two lists that each contain 19 tensors of shape (7, ?) + // where the second dimension can vary in size (depends on thresholds). 
+ const auto pt = scorePafTarget(paf, pifhr); + const auto pafForward = std::get<0>(pt); + const auto pafBackward = std::get<1>(pt); + + /* + // For debugging + printf("pafForward:\n"); + for (auto& i : pafForward) { + for (auto j : i) { printf("%f, ", j); } printf("\n"); + } + printf("\npafBackward:\n"); + for (auto i : pafBackward) { + for (auto& j : i) { printf("%f, ", j); } printf("\n"); + } + */ + + auto annotations = decodeAnnotations(pifhr, pifhrScales, pifhrCore, pafForward, pafBackward); + + // Scale to input size + const float output_stride = 8.0f; + for (auto& ann : annotations) { + for (int k = 0; k < numKeypoints; ++k) { + ann.keypoints[k*3 ] *= output_stride; + ann.keypoints[k*3 + 1] *= output_stride; + ann.jointScales[k] *= output_stride; + } + } + + // Non-maximum suppression + if (!annotations.empty()) { + annotations = softNMS(annotations); + } + + // Threshold + std::vector thresholded; + for (auto& ann : annotations) { + for (int k = 0; k < numKeypoints; ++k) { + if (ann.keypoints[k*3 + 2] < keypointThreshold) { + ann.keypoints[k*3 + 2] = 0.0f; + } + } + if (ann.score() >= instanceThreshold) { + thresholded.push_back(ann); + } + } + + std::sort(thresholded.begin(), thresholded.end(), [] (const Annotation& a, const Annotation& b) { + return a.score() > b.score(); + }); + + // Convert to normalized coordinates + for (auto& ann : thresholded) { + for (int k = 0; k < numKeypoints; ++k) { + ann.keypoints[k*3 ] /= inputWidth; + ann.keypoints[k*3 + 1] /= inputHeight; + } + } + + /* + // For debugging + for (auto ann : thresholded) { + printf("Keypoints:\n"); + for (auto k : ann.keypoints) { + printf("%f, ", k); + } + printf("\nJoint scales:\n"); + for (auto k : ann.jointScales) { + printf("%f, ", k); + } + printf("\n"); + } + */ + + ai_app::Object_detection::Result result; + result.success = true; + for (auto& ann : thresholded) { + ai_app::Landmarks landmarks; + landmarks.type = "body_pose_pifpaf"; + + int minx = std::numeric_limits::max(), + miny = 
std::numeric_limits::max(), + maxx = -std::numeric_limits::max(), + maxy = -std::numeric_limits::max(); + + for (int k = 0; k < numKeypoints; ++k) { + const int x = ann.keypoints[k*3 ] * 10000; // FIXME: MAGIC NUMBER. + const int y = ann.keypoints[k*3 + 1] * 10000; + const auto v = ann.keypoints[k*3 + 2]; + + if (v > 0.0f) { + if (x < minx) { minx = x; } + if (x > maxx) { maxx = x; } + if (y < miny) { miny = y; } + if (y > maxy) { maxy = y; } + } + + ai_app::Landmark landmark; + landmark.confidence = v; + landmark.position.x = x; + landmark.position.y = y; + landmarks.points.push_back(landmark); + } + + ai_app::Object_detection::Result::Item item; + item.confidence = ann.score(); + item.class_index = 1; + item.bounding_box.origin.x = minx; + item.bounding_box.origin.y = miny; + item.bounding_box.size.x = maxx - minx; + item.bounding_box.size.y = maxy - miny; + item.landmarks = landmarks; + + result.items.push_back(item); + } + return result; +} + +} \ No newline at end of file diff --git a/src/pifpaf_decoder/openpifpaf_postprocessor.hpp b/src/pifpaf_decoder/openpifpaf_postprocessor.hpp new file mode 100644 index 00000000..9124b1f1 --- /dev/null +++ b/src/pifpaf_decoder/openpifpaf_postprocessor.hpp @@ -0,0 +1,188 @@ +#pragma once + +#include +#include +#include +#include + +#include "object_detection.hpp" + +namespace lpdnn::aiapp_impl { + +/** + Post-processing logic for OpenPifPaf + + \note This object caches the big tensors to save on memory allocations. + This means it's best to make one instance of this class and keep using it. + For the most efficient results, make sure the input tensors are always the + same width and height. + + \note This code is not threadsafe. Don't call it from multiple threads at + the same time. If you must use multiple threads, give each thread its own + instance of this class. + */ +class OpenPifPafPostprocessor +{ +public: + OpenPifPafPostprocessor() : H(0), W(0) { } + + /** + Applies post-processing to OpenPifPaf output. 
+ + \param inpWidth Width of the input tensor in pixels. + \param inpHeight Height of the input tensor in pixels. + \param tensorWidth Width of the neural network's PIF and PAF outputs. + \param tensorHeight Height of the neural network's PIF and PAF outputs. + */ + ai_app::Object_detection::Result postprocess_0_8( + int inpWidth, int inpHeight, int tensorWidth, int tensorHeight, + const float* pif_c, // 17xHxW + const float* pif_r, // 34xHxW + const float* pif_s, // 17xHxW + const float* paf_c, // 19xHxW + const float* paf_r1, // 38xHxW + const float* paf_r2, // 38xHxW + const float* paf_b1, // 19xHxW + const float* paf_b2 // 19xHxW + ); + +public: + static const int numKeypoints = 17; + static const int numBones = 19; + + // Connections between the different keypoint indices. + // Note: these start at 1, not 0! + static const int bones[19][2]; + +private: + struct Annotation { + // Array of `numKeypoints * 3` elements: + // - element `i*3 + 0` is x-coordinate (normalized) + // - element `i*3 + 1` is y-coordinate (normalized) + // - element `i*3 + 2` is confidence score + std::vector keypoints; + + std::vector jointScales; + + Annotation(int j, float x, float y, float v) : keypoints(numKeypoints * 3), + jointScales(numKeypoints) + { + keypoints[j*3 ] = x; + keypoints[j*3 + 1] = y; + keypoints[j*3 + 2] = v; + } + + /** + Overall confidence score for the entire skeleton. + */ + [[nodiscard]] float score() const { + float maxv = 0.0f; + float vv = 0.0f; + for (int k = 0; k < numKeypoints; ++k) { + auto v = keypoints[k*3 + 2]; + maxv = std::max(maxv, v); + vv += v * v; + } + return 0.1f * maxv + 0.9f * vv / (float)numKeypoints; + } + }; + + // 0: confidence of origin + // 1: connection index + // 2: forward? 
+ // 3: joint index 1 (not corrected for forward) + // 4: joint index 2 (not corrected for forward) + typedef std::tuple frontier_t; + typedef std::tuple, std::vector, std::vector> Target_intensity; + typedef std::tuple>, std::vector>> Paf_target; + typedef std::tuple Pifhr_seed; + typedef std::tuple Connection; + +private: + void initTensors(int tensorWidth, int tensorHeight); + + void normalizePAF(const float* intensityFields, + const float* j1Fields, + const float* j2Fields, + const float* j1FieldsLogb, + const float* j2FieldsLogb); + + void normalizePIF(const float* jointIntensityFields, + const float* jointFields, + const float* scaleFields); + + Target_intensity + targetIntensities(const std::vector& pif, + float v_th = 0.1f, + bool coreOnly = false); + + Paf_target + scorePafTarget(const std::vector& pafvec, + const std::vector& pifhr, + float pifhr_floor = 0.01f, + float score_th = 0.1f) const; + + std::vector + pifhrSeeds(const std::vector& pifhrScales, + const std::vector& pifhrCore); + + static std::vector pafCenter(const std::vector& paf_field, + float x, float y, float sigma = 1.0f); + + static Connection + growConnection(float x, float y, const std::vector& paf_field_); + + static std::vector frontier(Annotation& ann); + + frontier_t frontierIter(Annotation& ann); + + void grow(Annotation& ann, + const std::vector>& pafForward, + const std::vector>& pafBackward, + float th = 0.1f); + + void fillJointScales(Annotation& ann, + const std::vector& scales, + int fieldH, + int fieldW, + float hr_scale); + + std::vector + decodeAnnotations(const std::vector& pifhr, + const std::vector& pifhrScales, + const std::vector& pifhrCore, + const std::vector>& pafForward, + const std::vector>& pafBackward); + + std::vector softNMS(std::vector& annotations); + +private: + // Used to normalize the skeleton keypoint coordinates to [0, 1]. + float inputWidth, inputHeight; + + // Tensor dimensions (hr = high-resolution). 
+ int H, W, H_hr, W_hr; + + // Strides for tensor dimensions. + size_t paf_stride_2, paf_stride_1, paf_stride_0; + size_t pif_stride_1, pif_stride_0; + size_t pifhr_stride_1, pifhr_stride_0; + + // Temporary tensors. + std::vector indexField; // 2 x H x W + std::vector indexField_hr; // 2 x H x W + std::vector paf; // 19 x 2 x 4 x H x W + std::vector pif; // 17 x 4 x H x W + + // Filled in by targetIntensities(). + std::vector targetsCoreOnly; + std::vector targets; + std::vector scales; + std::vector ns; + + std::set> blockFrontier; + bool frontierActive; +}; + +} + From f897e945517597c5b88fd0926dced3bed4f5a738 Mon Sep 17 00:00:00 2001 From: ganler Date: Sat, 26 Jun 2021 02:35:36 +0800 Subject: [PATCH 2/4] refact: rm comments --- examples/cli.cpp | 15 ++++++++++----- src/pifpaf.cpp | 3 --- src/pifpaf_decoder/openpifpaf_postprocessor.cpp | 5 ----- 3 files changed, 10 insertions(+), 13 deletions(-) diff --git a/examples/cli.cpp b/examples/cli.cpp index 80a54aa4..c37000ed 100644 --- a/examples/cli.cpp +++ b/examples/cli.cpp @@ -9,13 +9,14 @@ #define kSTREAM "stream" #define kPAF "paf" #define kPPN "ppn" +#define kPIFPAF "pifpaf" // Model Configuration. DEFINE_string(model, "../data/models/TinyVGG-V1-HW=256x384.uff", "Path to the model."); DEFINE_string( post, kPAF, - "Post-processing method. (`" kPAF "` -> [Part Affine Field] or `" kPPN "` -> [Pose Proposal Network])"); + "Post-processing method. 
(`" kPAF "` -> [Part Affine Field] or `" kPPN "` -> [Pose Proposal Network]) or `" kPIFPAF "` -> [Pif Paf]"); DEFINE_int32(w, 384, "Width of input image."); DEFINE_int32(h, 256, "Height of input image."); DEFINE_int32(max_batch_size, 8, "Max batch size for inference engine to execute."); @@ -37,18 +38,19 @@ namespace hp = hyperpose; class parser_variant { public: + using var_t = std::variant; template std::vector process(Container&& feature_map_containers) { return std::visit([&feature_map_containers](auto& arg) { return arg.process(feature_map_containers); }, m_parser); } - parser_variant(std::variant v) + parser_variant(var_t v) : m_parser(std::move(v)) { } private: - std::variant m_parser; + var_t m_parser; }; //parser_variant parser{parser}; @@ -142,14 +144,17 @@ int main(int argc, char** argv) }(); cli_log() << "DNN engine is built.\n"; - auto parser = parser_variant{ [&engine]() -> std::variant { + auto parser = parser_variant{ [&engine]() -> parser_variant::var_t { if (FLAGS_post == kPAF) return hp::parser::paf{}; if (FLAGS_post == kPPN) return hp::parser::pose_proposal(engine.input_size()); - cli_log() << "ERROR: Unknown post-processing flag: `" << FLAGS_post << "`. Use `paf` or `ppn` please.\n"; + if (FLAGS_post == kPIFPAF) + return hp::parser::pifpaf{}; + + cli_log() << "ERROR: Unknown post-processing flag: `" << FLAGS_post << "`. 
Use `paf`, `ppn` or `pifpaf` please.\n"; std::exit(-1); }() }; diff --git a/src/pifpaf.cpp b/src/pifpaf.cpp index ca4b71f6..1482f6bb 100644 --- a/src/pifpaf.cpp +++ b/src/pifpaf.cpp @@ -102,10 +102,8 @@ std::vector pifpaf::process(const feature_map_t& paf, const feature_map pif_conf.data(), pif_xy.data(), pif_s.data(), paf_conf.data(), paf_xy1.data(), paf_xy2.data(), paf_b1.data(), paf_b2.data()); -// std::cout << "Check pif[0]\t" << pif.view()[0] << std::endl; std::vector ret{}; ret.reserve(apires.items.size()); -// std::cout << apires.items.size() << "...size\n"; /* * @@ -125,7 +123,6 @@ HyperPose: Unified Topology dst.score = 1;// src.confidence; FIXME dst.x = src.position.x / 10000.; dst.y = src.position.y / 10000.; -// std::cout << dst.x << ' ' << dst.y << '\n'; dst.has_value = true; } }; diff --git a/src/pifpaf_decoder/openpifpaf_postprocessor.cpp b/src/pifpaf_decoder/openpifpaf_postprocessor.cpp index acf536fd..21049d96 100644 --- a/src/pifpaf_decoder/openpifpaf_postprocessor.cpp +++ b/src/pifpaf_decoder/openpifpaf_postprocessor.cpp @@ -489,11 +489,6 @@ OpenPifPafPostprocessor::scorePafTarget(const std::vector& pafvec, if (scores_b[i] > score_th) { mask_b.push_back(i); } } - // scored_backward.append(np.concatenate(( - // np.expand_dims(scores_b[mask_b], 0), - // fourds[1, 1:4][:, mask_b], - // fourds[0, 1:4][:, mask_b], - // ))) const size_t mask_b_size = mask_b.size(); std::vector result_b(7 * mask_b_size); for (size_t i = 0; i < mask_b_size; ++i) { From 85bfb5f9f96e1d40ee716681f86dc01b5d50ce9e Mon Sep 17 00:00:00 2001 From: ganler Date: Sat, 26 Jun 2021 12:01:41 +0800 Subject: [PATCH 3/4] feat: openpifpaf decoder finalized --- README.md | 1 + examples/cli.cpp | 8 +- examples/gen_serialized_engine.example.cpp | 2 +- ...ator_api_batched_images_pifpaf.example.cpp | 2 +- include/hyperpose/operator/parser/pifpaf.hpp | 13 +- src/human.cpp | 2 +- src/pifpaf.cpp | 111 +- src/pifpaf_decoder/aiapp.hpp | 200 +- src/pifpaf_decoder/image_based.hpp | 270 +-- 
src/pifpaf_decoder/math_helpers.cpp | 103 +- src/pifpaf_decoder/math_helpers.hpp | 17 +- src/pifpaf_decoder/object_detection.hpp | 72 +- .../openpifpaf_postprocessor.cpp | 1937 ++++++++--------- .../openpifpaf_postprocessor.hpp | 266 +-- 14 files changed, 1299 insertions(+), 1705 deletions(-) diff --git a/README.md b/README.md index 2db67570..7806a2d4 100644 --- a/README.md +++ b/README.md @@ -80,6 +80,7 @@ We compare the prediction performance of HyperPose with [OpenPose 1.6](https://g | OpenPose (TinyVGG) | 34.7 MB | 384 x 256 | **124.925 FPS** | N/A | | OpenPose (MobileNet) | 17.9 MB | 432 x 368 | **84.32 FPS** | 8.5 FPS (TF-Pose) | | OpenPose (ResNet18) | 45.0 MB | 432 x 368 | **62.52 FPS** | N/A | +| OpenPifPaf (ResNet50) | 97.6 MB | 97 x 129 | **178.6 FPS** | 35.3 |

diff --git a/examples/cli.cpp b/examples/cli.cpp index c37000ed..dc4dd9f5 100644 --- a/examples/cli.cpp +++ b/examples/cli.cpp @@ -19,7 +19,7 @@ DEFINE_string( "Post-processing method. (`" kPAF "` -> [Part Affine Field] or `" kPPN "` -> [Pose Proposal Network]) or `" kPIFPAF "` -> [Pif Paf]"); DEFINE_int32(w, 384, "Width of input image."); DEFINE_int32(h, 256, "Height of input image."); -DEFINE_int32(max_batch_size, 8, "Max batch size for inference engine to execute."); +DEFINE_int32(max_batch_size, 4, "Max batch size for inference engine to execute."); // Execution Mode DEFINE_bool(imshow, true, "Whether to open an imshow window."); @@ -152,7 +152,7 @@ int main(int argc, char** argv) return hp::parser::pose_proposal(engine.input_size()); if (FLAGS_post == kPIFPAF) - return hp::parser::pifpaf{}; + return hp::parser::pifpaf(engine.input_size().height, engine.input_size().width); cli_log() << "ERROR: Unknown post-processing flag: `" << FLAGS_post << "`. Use `paf`, `ppn` or `pifpaf` please.\n"; std::exit(-1); @@ -184,6 +184,7 @@ int main(int argc, char** argv) if (FLAGS_runtime == kOPERATOR) { if (images.empty()) { // For CAP. + auto beg = clk_t::now(); auto writer = make_writer(); while (cap.isOpened()) { cv::Mat mat; @@ -227,6 +228,9 @@ int main(int argc, char** argv) break; } } + auto inference_time = std::chrono::duration(clk_t::now() - beg).count(); + std::cout << cap.get(cv::CAP_PROP_FRAME_COUNT) << " images got processed in " << inference_time << " ms, FPS = " + << 1000. * cap.get(cv::CAP_PROP_FRAME_COUNT) / inference_time << '\n'; } else { // For Vec. auto beg = clk_t::now(); // * TensorRT Inference. 
diff --git a/examples/gen_serialized_engine.example.cpp b/examples/gen_serialized_engine.example.cpp index f172a930..3f09dd98 100644 --- a/examples/gen_serialized_engine.example.cpp +++ b/examples/gen_serialized_engine.example.cpp @@ -12,7 +12,7 @@ DEFINE_string(output_name_list, "outputs/conf,outputs/paf", "The output node nam DEFINE_int32(input_height, 256, "Height of input image."); DEFINE_int32(input_width, 384, "Width of input image."); -DEFINE_int32(max_batch_size, 32, "The max batch size for the exported serialized model."); +DEFINE_int32(max_batch_size, 1, "The max batch size for the exported serialized model."); DEFINE_string(output_model, "", "Path to output serialized model."); diff --git a/examples/operator_api_batched_images_pifpaf.example.cpp b/examples/operator_api_batched_images_pifpaf.example.cpp index a187f1bb..5f57fa11 100644 --- a/examples/operator_api_batched_images_pifpaf.example.cpp +++ b/examples/operator_api_batched_images_pifpaf.example.cpp @@ -45,7 +45,7 @@ int main(int argc, char** argv) return tensorrt(tensorrt_serialized{ FLAGS_model_file }, { FLAGS_input_width, FLAGS_input_height }, batch.size()); }(); - hp::parser::pifpaf parser{}; + hp::parser::pifpaf parser(engine.input_size().height, engine.input_size().width); using clk_t = std::chrono::high_resolution_clock; auto beg = clk_t::now(); diff --git a/include/hyperpose/operator/parser/pifpaf.hpp b/include/hyperpose/operator/parser/pifpaf.hpp index 5da7990c..41eb83de 100644 --- a/include/hyperpose/operator/parser/pifpaf.hpp +++ b/include/hyperpose/operator/parser/pifpaf.hpp @@ -1,13 +1,16 @@ #pragma once -#include "paf.hpp" #include "../../utility/data.hpp" +#include "paf.hpp" namespace hyperpose::parser { -class pifpaf{ +class pifpaf { public: - explicit pifpaf() = default; + inline explicit pifpaf(int h, int w, float thresh = 0.1) + : m_net_h(h) + , m_net_w(w) + , m_keypoint_thresh(thresh){}; std::vector process(const feature_map_t& pif, const feature_map_t& paf); template 
std::vector process(C&& feature_map_containers) @@ -16,8 +19,10 @@ class pifpaf{ assert(feature_map_containers.size() == 2); return process(feature_map_containers[0], feature_map_containers[1]); } + private: - float m_keypoint_thresh = 0.001f; + int m_net_w, m_net_h; + float m_keypoint_thresh; }; } // namespace hyperpose \ No newline at end of file diff --git a/src/human.cpp b/src/human.cpp index dfc46893..7473a116 100644 --- a/src/human.cpp +++ b/src/human.cpp @@ -7,7 +7,7 @@ namespace hyperpose { void draw_human(cv::Mat& img, const human_t& human) { float n = 1, s = 0, w = 1, e = 0; - for(const auto& p : human.parts) + for (const auto& p : human.parts) if (p.has_value) { n = std::min(n, p.y); s = std::max(s, p.y); diff --git a/src/pifpaf.cpp b/src/pifpaf.cpp index 1482f6bb..4bfc7064 100644 --- a/src/pifpaf.cpp +++ b/src/pifpaf.cpp @@ -1,11 +1,12 @@ -#include #include "pifpaf_decoder/openpifpaf_postprocessor.hpp" +#include namespace hyperpose::parser { // TODO: Name ORDER! -std::vector pifpaf::process(const feature_map_t& paf, const feature_map_t& pif) { - // Helpful links (Chinese): +std::vector pifpaf::process(const feature_map_t& paf, const feature_map_t& pif) +{ + // Helpful links (Chinese):: // https://zhuanlan.zhihu.com/p/93896207 // https://zhuanlan.zhihu.com/p/68073113 // pif: [17, 5, h, w] => KEY POINTS; @@ -18,99 +19,35 @@ std::vector pifpaf::process(const feature_map_t& paf, const feature_map // TODO: OPTIMIZE THIS. 
lpdnn::aiapp_impl::OpenPifPafPostprocessor pp; + pp.keypointThreshold = m_keypoint_thresh; size_t h = pif.shape()[pif.shape().size() - 2]; size_t w = pif.shape().back(); - std::vector pif_conf, pif_xy, pif_s, paf_conf, paf_xy1, paf_xy2, paf_b1, paf_b2; - const auto tensor_sharding_to_vector = [](const feature_map_t& tensor, std::vector& vec, size_t dim2) { - size_t d0 = tensor.shape()[0]; - size_t d1 = tensor.shape()[1]; - size_t h = tensor.shape()[2]; - size_t w = tensor.shape()[3]; - for (int i = 0; i < d0; ++i) { - for (int j = 0; j < h; ++j) { - for (int k = 0; k < w; ++k) { - vec.push_back(tensor.view()[ - i * d1 * w * h + - dim2 * h * w + - j * w + - k - ]); - } - } - } - }; + std::vector pif_vec{}, paf_vec{}; - const auto tensor_sharding_to_offset_vector = [](const feature_map_t& tensor, std::vector& vec, size_t dimx, size_t dimy) { + const auto raw_copy = [](const feature_map_t& tensor, std::vector& vec) { size_t d0 = tensor.shape()[0]; size_t d1 = tensor.shape()[1]; size_t h = tensor.shape()[2]; size_t w = tensor.shape()[3]; - for (int i = 0; i < d0; ++i) { - // X first & Then Y - for (int j = 0; j < h; ++j) { - for (int k = 0; k < w; ++k) { - vec.push_back(tensor.view()[ - i * d1 * w * h + - dimx * h * w + - j * w + - k - ]); - } - } - - for (int j = 0; j < h; ++j) { - for (int k = 0; k < w; ++k) { - vec.push_back(tensor.view()[ - i * d1 * w * h + - dimy * h * w + - j * w + - k - ]); - } - } + const size_t total_size = d0 * d1 * h * w; + vec.reserve(total_size); + for (size_t i = 0; i < total_size; ++i) { + vec.push_back(tensor.view()[i]); } }; - pif_conf.reserve(17 * h * w); - tensor_sharding_to_vector(pif, pif_conf, 0); - - pif_xy.reserve(17 * 2 * h * w); - tensor_sharding_to_offset_vector(pif, pif_xy, 1, 2); - - pif_s.reserve(17 * h * w); - tensor_sharding_to_vector(pif, pif_s, 4); - - // [19, 9, h, w] -> [conf, p1, p2, b1, b2, ...] 
- paf_conf.reserve(19 * h * w); - tensor_sharding_to_vector(paf, paf_conf, 0); - - paf_xy1.reserve(2 * 19 * h * w); - tensor_sharding_to_offset_vector(paf, paf_xy1, 1, 2); - - paf_xy2.reserve(2 * 19 * h * w); - tensor_sharding_to_offset_vector(paf, paf_xy2, 3, 4); - - paf_b1.reserve(19 * h * w); - tensor_sharding_to_vector(paf, paf_b1, 5); - - paf_b2.reserve(19 * h * w); - tensor_sharding_to_vector(paf, paf_b2, 6); + raw_copy(pif, pif_vec); + raw_copy(paf, paf_vec); // TODO: RECOVER THE INP{W, H}; - auto apires = pp.postprocess_0_8(640, 427, w, h, - pif_conf.data(), pif_xy.data(), pif_s.data(), - paf_conf.data(), paf_xy1.data(), paf_xy2.data(), paf_b1.data(), paf_b2.data()); + auto apires = pp.postprocess(m_net_w, m_net_h, w, h, pif_vec, paf_vec); std::vector ret{}; ret.reserve(apires.items.size()); - - /* - * - OpenPifPaf COCO Topology: https://miro.medium.com/max/366/0*KFrFQVj3OoGAtt6o.png -HyperPose: Unified Topology - * - */ + // OpenPifPaf COCO Topology: https://miro.medium.com/max/366/0*KFrFQVj3OoGAtt6o.png + // HyperPose: Unified Topology + // NOTE: This step is to convert pifpaf topology to hyperpose topology. for (auto&& item : apires.items) { if (item.landmarks.points.empty()) @@ -120,9 +57,9 @@ HyperPose: Unified Topology auto p2p = [this](const auto& src, auto& dst) { if (src.confidence > 0.) 
{ - dst.score = 1;// src.confidence; FIXME - dst.x = src.position.x / 10000.; - dst.y = src.position.y / 10000.; + dst.score = 1; // src.confidence; FIXME + dst.x = src.position.x / (float)m_net_w; + dst.y = src.position.y / (float)m_net_h; dst.has_value = true; } }; @@ -139,12 +76,14 @@ HyperPose: Unified Topology }; for (size_t i = 0; i < from_index.size(); ++i) { - p2p(from[from_index[i]], to[i+2]); + p2p(from[from_index[i]], to[i + 2]); } if (to[2].has_value && to[5].has_value) { - to[1].x = (to[2].x + to[5].x) / 2;; - to[1].y = (to[2].y + to[5].y) / 2;; + to[1].x = (to[2].x + to[5].x) / 2; + ; + to[1].y = (to[2].y + to[5].y) / 2; + ; to[1].has_value = true; to[1].score = (to[2].score + to[5].score) / 2; } diff --git a/src/pifpaf_decoder/aiapp.hpp b/src/pifpaf_decoder/aiapp.hpp index 1beb0774..85c75a09 100644 --- a/src/pifpaf_decoder/aiapp.hpp +++ b/src/pifpaf_decoder/aiapp.hpp @@ -15,102 +15,104 @@ namespace lpdnn { namespace ai_app { -/// Aiapp Blob -/// This could be improved to allow referring to existing data -/// thus avoding unneeded data-copy, for example by using shared_ptr. -struct Blob { - /// Data dimensions. Mandatory if the blob represents a tensor. - std::vector dim; - - /// Data. Mandatory if the blob represents a tensor. - std::vector data; - - /// Optional raw representation. - std::vector raw; - - /// Optional CBOR representation when data is structured. - std::vector cbor; - - /// Optional additional information - /// (eg, description of internal representation: "NCHW,8bits,dp3"). - std::string info; -}; - -/// AI-App interface -class Aiapp { - public: - virtual ~Aiapp() {} - - /// @return the ai-class id for this aiapp - virtual const char* class_id() const = 0; - - /// @return the implementation id for this aiapp - virtual const char* impl_id() const = 0; - - /// Initialization options - /// \param cfg: configuration string, typically in JSON format. 
- /// \return: true if success - virtual bool init(const std::string& cfg) = 0; - - /// Set runtime options for the specified component - /// \param opt: runtime options, typically in JSON format. - /// \param name: subcomponent name - /// \return: true if success - virtual bool set_options(const std::string& opt, - const std::string& name = "") = 0; - - /// Introspection methods - /// \{ - - /// \return: names of all direct subcomponents of the specified component - virtual std::vector components( - const std::string& name = "") const = 0; - - /// \return output(s) of the specified component - virtual std::vector output(const std::string& name = "") const = 0; - - /// \return metrics of the specified component and all its subcomponents - virtual std::string metrics(const std::string& name = "") const = 0; - - /// set end-of-execution at the end of the specified component - /// if name is empty any exit-point previously set is removed - virtual bool set_exit_after(const std::string& name = "") = 0; - - /// \} -}; - -/// AiApp standard processing components -/// Each ai-app can contain other sub-components. -/// Each subcomponent can be identified by a pathname, for example: -/// "preprocessing.normalize" -/// "inference.net1.conv23" -struct Component { - /// Standard component names. Their use is not mandatory but - /// allows an ai-app to be supported by existing tools. - static constexpr char const* preprocessing = "preprocessing"; - static constexpr char const* inference = "inference"; - static constexpr char const* postprocessing = "postprocessing"; - - /// Ai-app interface parameters - static constexpr char const* interface = "interface"; - - /// Name separator in a component pathname string. 
- /// Component names can't contain the separator except possibly for the leafs - static constexpr char separator = '.'; - - /// Concatenate component names in a component pathname - static std::string join(const std::string& path, const std::string& comp) { - return path + separator + comp; - } -}; - -/// AiApp Metrics -struct Metrics { - /// Standard metrics. All timings are in microseconds. - static constexpr char const* init_time = "init_time"; - static constexpr char const* inference_time = "inference_time"; - static constexpr char const* inference_cpu_time = "inference_cpu_time"; -}; - -} // namespace ai_app -} // namespace lpdnn + /// Aiapp Blob + /// This could be improved to allow referring to existing data + /// thus avoding unneeded data-copy, for example by using shared_ptr. + struct Blob { + /// Data dimensions. Mandatory if the blob represents a tensor. + std::vector dim; + + /// Data. Mandatory if the blob represents a tensor. + std::vector data; + + /// Optional raw representation. + std::vector raw; + + /// Optional CBOR representation when data is structured. + std::vector cbor; + + /// Optional additional information + /// (eg, description of internal representation: "NCHW,8bits,dp3"). + std::string info; + }; + + /// AI-App interface + class Aiapp { + public: + virtual ~Aiapp() {} + + /// @return the ai-class id for this aiapp + virtual const char* class_id() const = 0; + + /// @return the implementation id for this aiapp + virtual const char* impl_id() const = 0; + + /// Initialization options + /// \param cfg: configuration string, typically in JSON format. + /// \return: true if success + virtual bool init(const std::string& cfg) = 0; + + /// Set runtime options for the specified component + /// \param opt: runtime options, typically in JSON format. 
+ /// \param name: subcomponent name + /// \return: true if success + virtual bool set_options(const std::string& opt, + const std::string& name = "") + = 0; + + /// Introspection methods + /// \{ + + /// \return: names of all direct subcomponents of the specified component + virtual std::vector components( + const std::string& name = "") const = 0; + + /// \return output(s) of the specified component + virtual std::vector output(const std::string& name = "") const = 0; + + /// \return metrics of the specified component and all its subcomponents + virtual std::string metrics(const std::string& name = "") const = 0; + + /// set end-of-execution at the end of the specified component + /// if name is empty any exit-point previously set is removed + virtual bool set_exit_after(const std::string& name = "") = 0; + + /// \} + }; + + /// AiApp standard processing components + /// Each ai-app can contain other sub-components. + /// Each subcomponent can be identified by a pathname, for example: + /// "preprocessing.normalize" + /// "inference.net1.conv23" + struct Component { + /// Standard component names. Their use is not mandatory but + /// allows an ai-app to be supported by existing tools. + static constexpr char const* preprocessing = "preprocessing"; + static constexpr char const* inference = "inference"; + static constexpr char const* postprocessing = "postprocessing"; + + /// Ai-app interface parameters + static constexpr char const* interface = "interface"; + + /// Name separator in a component pathname string. + /// Component names can't contain the separator except possibly for the leafs + static constexpr char separator = '.'; + + /// Concatenate component names in a component pathname + static std::string join(const std::string& path, const std::string& comp) + { + return path + separator + comp; + } + }; + + /// AiApp Metrics + struct Metrics { + /// Standard metrics. All timings are in microseconds. 
+ static constexpr char const* init_time = "init_time"; + static constexpr char const* inference_time = "inference_time"; + static constexpr char const* inference_cpu_time = "inference_cpu_time"; + }; + +} // namespace ai_app +} // namespace lpdnn diff --git a/src/pifpaf_decoder/image_based.hpp b/src/pifpaf_decoder/image_based.hpp index 938cedaa..914e0f7c 100644 --- a/src/pifpaf_decoder/image_based.hpp +++ b/src/pifpaf_decoder/image_based.hpp @@ -9,132 +9,144 @@ #include "aiapp.hpp" -namespace lpdnn::ai_app { - -/// 2-dimensional size -struct Dim2d { - int x; - int y; -}; - -/// Rectangle -struct Rect { - Dim2d origin; - Dim2d size; - - [[nodiscard]] bool empty() const { return size.x <= 0 || size.y <= 0; } -}; - -/// Landmarks -struct Landmark { - Dim2d position; - float confidence; /// Negative value if N/A -}; - -struct Landmarks { - /// Landmark specification identifier - std::string type; - /// Landmark points - std::vector points; -}; - -/// Image representation. -/// The data of a RAW image consists of *y scanlines of *x pixels, -/// with each pixel consisting of N interleaved 8-bit components; the first -/// pixel pointed to is top-left-most in the image. There is no padding between -/// image scanlines or between pixels, regardless of format. The number of -/// components N is 3 for RGB images, 4 for RGBA, 1 for grayscale. -/// Support for 8bits RGB format is MANDATORY for all image-processing AiApps. -/// An image can be constructed from a std::vector, or a std::string -/// or raw data pointer and size. When passing rvalues vector or strings, the -/// image will take ownership of the data, otherwise will just keep reference. 
-class Image { - protected: - /// Contains image data if we have ownership of it - std::vector _image_content; - - public: - /// Image format - enum class Format { - raw_grayscale = 1, /// 8bits grayscale - raw_rgb8 = 3, /// 8bits RGB *MANDATORY* - raw_rgba8 = 4, /// 8bits RGBA - - encoded = 256, /// Standard JPEG/BMP/PNG/TIFF format - - custom = 512 /// Custom format. Use attributes field for more details. - }; - - /// Don't take data ownership. - /// img_dim parameter can be omitted in case of encoded images since - /// this information will be extracted from the image content itself. - Image(Format img_format, const std::vector& data, Dim2d img_dim = {}) - : Image(img_format, data.data(), data.size(), img_dim) {} - - /// Take data ownership - Image(Format img_format, std::vector&& data, Dim2d img_dim = {}) - : _image_content(std::move(data)), - format{img_format}, - dim(img_dim), - data{_image_content.data()}, - data_size{_image_content.size()} {} - - /// Don't take data ownership. - Image(Format img_format, const std::string& data, Dim2d img_dim = {}) - : Image(img_format, (uint8_t*)data.c_str(), data.size(), img_dim) {} - - /// Take data ownership - Image(Format img_format, std::string&& data, Dim2d img_dim = {}) - : Image(img_format, - std::vector((uint8_t*)data.c_str(), - (uint8_t*)data.c_str() + data.size()), - img_dim) { - data.clear(); - } - - /// Don't take data ownership - /// img_data_size is mandatory in case of encoded images. - Image(Format img_format, const uint8_t* img_data, size_t img_data_size, - Dim2d img_dim = {}) - : format{img_format}, - dim(img_dim), - data{img_data}, - data_size{img_data_size} {} - - /// Utility factory methods - static Image encoded(const std::vector& data) { - return Image(Format::encoded, data); - } - - /// Image format - Format format; - - /// Image dimensions (for raw images) - Dim2d dim; - - /// Region of interest inside the image (all if empty) - Rect roi{}; - - /// Custom attributes. 
- /// This is ai-app specific and allows to specify custom data formats. - std::string attributes; - - /// Pointer to image data (no ownership of the data). - const uint8_t* data; - - /// Size of image data. Mandatory for encoded images. - size_t data_size; - - /// Additional optional information about the image. - /// May be required by some aiapps. - Landmarks landmarks; -}; - -/// Abstract image-based AiApp -class Image_based : virtual public Aiapp { - public: - /// @return supported image formats (ordered by preference) - [[nodiscard]] virtual std::vector image_formats() const = 0; -}; - -} // namespace lpdnn +namespace lpdnn { +namespace ai_app { + + /// 2-dimensional size + struct Dim2d { + int x; + int y; + }; + + /// Rectangle + struct Rect { + Dim2d origin; + Dim2d size; + + bool empty() const { return size.x <= 0 || size.y <= 0; } + }; + + /// Landmarks + struct Landmark { + Dim2d position; + float confidence; /// Negative value if N/A + }; + + struct Landmarks { + /// Landmark specification identifier + std::string type; + /// Landmark points + std::vector points; + }; + + /// Image representation. + /// The data of a RAW image consists of *y scanlines of *x pixels, + /// with each pixel consisting of N interleaved 8-bit components; the first + /// pixel pointed to is top-left-most in the image. There is no padding between + /// image scanlines or between pixels, regardless of format. The number of + /// components N is 3 for RGB images, 4 for RGBA, 1 for grayscale. + /// Support for 8bits RGB format is MANDATORY for all image-processing AiApps. + /// An image can be constructed from a std::vector, or a std::string + /// or raw data pointer and size. When passing rvalues vector or strings, the + /// image will take ownership of the data, otherwise will just keep reference. 
+ class Image { + protected: + /// Contains image data if we have ownership of it + std::vector _image_content; + + public: + /// Image format + enum class Format { + raw_grayscale = 1, /// 8bits grayscale + raw_rgb8 = 3, /// 8bits RGB *MANDATORY* + raw_rgba8 = 4, /// 8bits RGBA + + encoded = 256, /// Standard JPEG/BMP/PNG/TIFF format + + custom = 512 /// Custom format. Use attributes field for more details. + }; + + /// Don't take data ownership. + /// img_dim parameter can be omitted in case of encoded images since + /// this information will be extracted from the image content itself. + Image(Format img_format, const std::vector& data, Dim2d img_dim = {}) + : Image(img_format, data.data(), data.size(), img_dim) + { + } + + /// Take data ownership + Image(Format img_format, std::vector&& data, Dim2d img_dim = {}) + : _image_content(std::move(data)) + , format{ img_format } + , dim(img_dim) + , data{ _image_content.data() } + , data_size{ _image_content.size() } + { + } + + /// Don't take data ownership. + Image(Format img_format, const std::string& data, Dim2d img_dim = {}) + : Image(img_format, (uint8_t*)data.c_str(), data.size(), img_dim) + { + } + + /// Take data ownership + Image(Format img_format, std::string&& data, Dim2d img_dim = {}) + : Image(img_format, + std::vector((uint8_t*)data.c_str(), + (uint8_t*)data.c_str() + data.size()), + img_dim) + { + data.clear(); + } + + /// Don't take data ownership + /// img_data_size is mandatory in case of encoded images. 
+ Image(Format img_format, const uint8_t* img_data, size_t img_data_size, + Dim2d img_dim = {}) + : format{ img_format } + , dim(img_dim) + , data{ img_data } + , data_size{ img_data_size } + { + } + + /// Utility factory methods + static Image encoded(const std::vector& data) + { + return Image(Format::encoded, data); + } + + /// Image format + Format format; + + /// Image dimensions (for raw images) + Dim2d dim; + + /// Region of interest inside the image (all if empty) + Rect roi{}; + + /// Custom attributes. + /// This is ai-app specific and allows to specify custom data formats. + std::string attributes; + + /// Pointer to image data (no ownership of the data). + const uint8_t* data; + + /// Size of image data. Mandatory for encoded images. + size_t data_size; + + /// Additional optional information about the image. + /// May be required by some aiapps. + Landmarks landmarks; + }; + + /// Abstract image-based AiApp + class Image_based : virtual public Aiapp { + public: + /// @return supported image formats (ordered by preference) + virtual std::vector image_formats() const = 0; + }; + +} // namespace ai_app +} // namespace lpdnn diff --git a/src/pifpaf_decoder/math_helpers.cpp b/src/pifpaf_decoder/math_helpers.cpp index a571780a..f7634da6 100644 --- a/src/pifpaf_decoder/math_helpers.cpp +++ b/src/pifpaf_decoder/math_helpers.cpp @@ -1,94 +1,25 @@ - #include "math_helpers.hpp" -#include - -#ifdef __APPLE__ -#define MATH_HELPERS_ACCELERATE 1 -#else -#define MATH_HELPERS_ACCELERATE 0 -#endif - -#if MATH_HELPERS_ACCELERATE -#include -#else -#include -#endif -void vfill(float*x, unsigned long n, float v) { -#if MATH_HELPERS_ACCELERATE - vDSP_vfill(&v, x, 1, n); -#else - // Slow version - for (unsigned long i = 0; i < n; ++i) { - x[i] = v; - } -#endif -} - -void vadd(const float *a, const float *b, float *c, unsigned long n) { -#if MATH_HELPERS_ACCELERATE - vDSP_vadd(a, 1, b, 1, c, 1, n); -#else - // Slow version - for (unsigned long i = 0; i < n; ++i) { - c[i] = 
a[i] + b[i]; - } -#endif -} - -void vexp(float *x, unsigned long n) { -#if MATH_HELPERS_ACCELERATE - int n_ = (int)n; - vvexpf(x, x, &n_); -#else - // Slow version - for (unsigned long i = 0; i < n; ++i) { - x[i] = std::exp(x[i]); - } -#endif -} - -void vmul(const float *a, const float *b, float *c, unsigned long n) { -#if MATH_HELPERS_ACCELERATE - vDSP_vmul(a, 1, b, 1, c, 1, n); -#else - // Slow version - for (unsigned long i = 0; i < n; ++i) { - c[i] = a[i] * b[i]; - } -#endif +void vfill(float* x, unsigned long n, float v) +{ + // Slow version + for (unsigned long i = 0; i < n; ++i) { + x[i] = v; + } } -void vsmul(const float *a, float b, float *c, unsigned long n) { -#if MATH_HELPERS_ACCELERATE - vDSP_vsmul(a, 1, &b, c, 1, n); -#else - // Slow version - for (unsigned long i = 0; i < n; ++i) { - c[i] = a[i] * b; - } -#endif +void vmul(const float* a, const float* b, float* c, unsigned long n) +{ + // Slow version + for (unsigned long i = 0; i < n; ++i) { + c[i] = a[i] * b[i]; + } } -float vargmax(const float *x, unsigned long n, int* i) { - assert(n > 0); -#if MATH_HELPERS_ACCELERATE - float maxValue = 0.0f; - vDSP_Length maxIndex = 0; - vDSP_maxvi(x, 1, &maxValue, &maxIndex, n); - *i = (int)maxIndex; - return maxValue; -#else - // Slow version - float maxValue = x[0]; - unsigned long maxIndex = 0; - for (unsigned long i = 1; i < n; ++i) { - if (x[i] > maxValue) { - maxValue = x[i]; - maxIndex = i; +void vsmul(const float* a, float b, float* c, unsigned long n) +{ + // Slow version + for (unsigned long i = 0; i < n; ++i) { + c[i] = a[i] * b; } - } - *i = (int)maxIndex; - return maxValue; -#endif } diff --git a/src/pifpaf_decoder/math_helpers.hpp b/src/pifpaf_decoder/math_helpers.hpp index d187bc51..15dcb087 100644 --- a/src/pifpaf_decoder/math_helpers.hpp +++ b/src/pifpaf_decoder/math_helpers.hpp @@ -1,21 +1,10 @@ #pragma once // x[i] = v -void vfill(float*x, unsigned long n, float v); - -// c[i] = a[i] + b[i] -void vadd(const float *a, const float *b, float *c, 
unsigned long n); - -// x[i] = exp(x[i]) -void vexp(float *x, unsigned long n); +void vfill(float* x, unsigned long n, float v); // c[i] = a[i] * b[i] -void vmul(const float *a, const float *b, float *c, unsigned long n); +void vmul(const float* a, const float* b, float* c, unsigned long n); // c[i] = a[i] * b -void vsmul(const float *a, float b, float *c, unsigned long n); - -// out = max(x) -// i = argmax(x) -float vargmax(const float *x, unsigned long n, int* i); - +void vsmul(const float* a, float b, float* c, unsigned long n); diff --git a/src/pifpaf_decoder/object_detection.hpp b/src/pifpaf_decoder/object_detection.hpp index 7a7bc673..91c8f3c0 100644 --- a/src/pifpaf_decoder/object_detection.hpp +++ b/src/pifpaf_decoder/object_detection.hpp @@ -9,40 +9,42 @@ #include "image_based.hpp" -namespace lpdnn::ai_app { - -/// Object detection AiApp -class Object_detection : virtual public Image_based { - public: - struct Result { - struct Item { - float confidence{}; - int class_index{}; - Rect bounding_box{}; - Landmarks landmarks; +namespace lpdnn { +namespace ai_app { + + /// Object detection AiApp + class Object_detection : virtual public Image_based { + public: + struct Result { + struct Item { + float confidence; + int class_index; + Rect bounding_box; + Landmarks landmarks; + }; + + bool success{}; + std::vector items; + }; + + /// Set minimum detectable object size + /// @return true if success + virtual bool set_min_size(Dim2d minSize) = 0; + + /// Set maximum detectable object size + /// @return true if success + virtual bool set_max_size(Dim2d maxSize) = 0; + + /// Perform inference. 
+ virtual Result execute(const Image& input) = 0; + + /// @return Names of classes + virtual std::vector classes() = 0; + + /// @return our aiapp class id + const char* class_id() const override { return ai_class_id; } + static constexpr char const* ai_class_id = "com_bonseyes::object_detection"; }; - bool success{}; - std::vector items; - }; - - /// Set minimum detectable object size - /// @return true if success - virtual bool set_min_size(Dim2d minSize) = 0; - - /// Set maximum detectable object size - /// @return true if success - virtual bool set_max_size(Dim2d maxSize) = 0; - - /// Perform inference. - virtual Result execute(const Image& input) = 0; - - /// @return Names of classes - virtual std::vector classes() = 0; - - /// @return our aiapp class id - [[nodiscard]] const char* class_id() const override { return ai_class_id; } - static constexpr char const* ai_class_id = "com_bonseyes::object_detection"; -}; - -} // namespace lpdnn +} // namespace ai_app +} // namespace lpdnn diff --git a/src/pifpaf_decoder/openpifpaf_postprocessor.cpp b/src/pifpaf_decoder/openpifpaf_postprocessor.cpp index 21049d96..d0a52617 100644 --- a/src/pifpaf_decoder/openpifpaf_postprocessor.cpp +++ b/src/pifpaf_decoder/openpifpaf_postprocessor.cpp @@ -1,1157 +1,930 @@ -#include +// Heavily modified from openpifpaf/cpp/example. 
+ #include +#include #include +#include +#include +#include +#include +#include #include +#include #include -#include -#include "openpifpaf_postprocessor.hpp" -#include "math_helpers.hpp" - -namespace lpdnn::aiapp_impl { - -const int OpenPifPafPostprocessor::bones[19][2] = { - {16, 14}, {14, 12}, {17, 15}, {15, 13}, {12, 13}, { 6, 12}, { 7, 13}, - { 6, 7}, { 6, 8}, { 7, 9}, { 8, 10}, { 9, 11}, { 2, 3}, { 1, 2}, - { 1, 3}, { 2, 4}, { 3, 5}, { 4, 6}, { 5, 7}, -}; +#include +#include +#include +#include -constexpr int C = 17; -constexpr float stride = 8.0f; -constexpr float seedThreshold = 0.2f; -constexpr float keypointThreshold = 0.001f; -constexpr float instanceThreshold = 0.2f; - -/* - Creates a (2, h, w) tensor where the first part is: - 0, 1, 2, 3, ..., w-1, - 0, 1, 2, 3, ..., w-1, - 0, 1, 2, 3, ..., w-1, - ... - and the second part is: - 0, 0, 0, 0, ..., 0, - 1, 1, 1, 1, ..., 1, - 2, 2, 2, 2, ..., 2, - ... - Used for normaling the PIFs and PAFs. -*/ -static std::vector makeIndexField(int h, int w) { - std::vector indexField(2 * h * w); - float* ptr = indexField.data(); - for (int y = 0; y < h; ++y) { - for (int x = 0; x < w; ++x) { - ptr[ y *w + x] = (float)x; - ptr[(y + h)*w + x] = (float)y; - } - } - return indexField; -} +#include "math_helpers.hpp" +#include "openpifpaf_postprocessor.hpp" -static void scalarSquareAddConstant(float* field, - int fieldH, - int fieldW, - const std::vector& x, - const std::vector& y, - const std::vector& width, - const std::vector& v) -{ - // minx_np = np.round(x_np - width_np).astype(np.int) - // minx_np = np.clip(minx_np, 0, field.shape[1] - 1) - std::vector minx(x.size()); - for (size_t i = 0; i < x.size(); ++i) { - minx[i] = std::min(fieldW - 1, std::max(0, (int)std::round(x[i] - width[i]))); - } +struct Occupancy { + // self.reduction = reduction + // self.min_scale_reduced = min_scale / reduction + constexpr static float reduction = 2.f; + constexpr static float min_scale_reduced = 4.f / reduction; + size_t d0, d1, d2; // 
c h w + std::vector occupancy_view; - // miny_np = np.round(y_np - width_np).astype(np.int) - // miny_np = np.clip(miny_np, 0, field.shape[0] - 1) - std::vector miny(y.size()); - for (size_t i = 0; i < y.size(); ++i) { - miny[i] = std::min(fieldH - 1, std::max(0, (int)std::round(y[i] - width[i]))); + Occupancy(size_t d0_, size_t d1_, size_t d2_) + : d0(d0_) + , d1(d1_) + , d2(d2_) + , occupancy_view(d0_ * d1_ * d2_) + { } - // maxx_np = np.round(x_np + width_np).astype(np.int) - // maxx_np = np.clip(maxx_np + 1, minx_np + 1, field.shape[1]) - std::vector maxx(x.size()); - for (size_t i = 0; i < x.size(); ++i) { - maxx[i] = std::min(fieldW, std::max(minx[i] + 1, (int)std::round(x[i] + width[i]) + 1)); - } + bool fuzz_get(size_t f, float y, float x) + { + if (f >= d0) + return true; - // maxy_np = np.round(y_np + width_np).astype(np.int) - // maxy_np = np.clip(maxy_np + 1, miny_np + 1, field.shape[0]) - std::vector maxy(y.size()); - for (size_t i = 0; i < y.size(); ++i) { - maxy[i] = std::min(fieldH, std::max(miny[i] + 1, (int)std::round(y[i] + width[i]) + 1)); - } + // scalar_nonzero_clipped_with_reduction + float xx = std::min((float)d2 - 1, std::max(0.f, x / reduction)); + float yy = std::min((float)d1 - 1, std::max(0.f, y / reduction)); - // for i in range(minx.shape[0]): - // for xx in range(minx[i], maxx[i]): - // for yy in range(miny[i], maxy[i]): - // field[yy, xx] += v[i] - for (size_t i = 0; i < minx.size(); ++i) { - for (int yy = miny[i]; yy < maxy[i]; ++yy) { - for (int xx = minx[i]; xx < maxx[i]; ++xx) { - field[yy * fieldW + xx] += v[i]; - } - } + return get(f, yy, xx); } -} -static void scalarSquareAddGauss(float* field, - int fieldH, - int fieldW, - const std::vector& x, - const std::vector& y, - const std::vector& sigma_, - const std::vector& v, - float truncate = 2.0f) -{ - // sigma_np = np.maximum(1.0, sigma_np) - // width_np = np.maximum(1.0, truncate * sigma_np) - auto sigma = sigma_; - std::vector width(sigma.size()); - for (size_t i = 0; i < 
sigma.size(); ++i) { - sigma[i] = std::max(1.0f, sigma[i]); - width[i] = std::max(1.0f, truncate * sigma[i]); + bool get(size_t d0_, size_t d1_, size_t d2_) + { + return occupancy_view[(d1 * d2) * d0_ + d2 * d1_ + d2_]; } - // NOTE: The minx, miny, maxx, maxxy code is the same as in scalarSquareAddConstant(). - // Could probably extract that and do it just once. - - // minx_np = np.round(x_np - width_np).astype(np.int) - // minx_np = np.clip(minx_np, 0, field.shape[1] - 1) - std::vector minx(x.size()); - for (size_t i = 0; i < x.size(); ++i) { - minx[i] = std::min(fieldW - 1, std::max(0, (int)std::round(x[i] - width[i]))); - } - - // miny_np = np.round(y_np - width_np).astype(np.int) - // miny_np = np.clip(miny_np, 0, field.shape[0] - 1) - std::vector miny(y.size()); - for (size_t i = 0; i < y.size(); ++i) { - miny[i] = std::min(fieldH - 1, std::max(0, (int)std::round(y[i] - width[i]))); - } - - // maxx_np = np.round(x_np + width_np).astype(np.int) - // maxx_np = np.clip(maxx_np + 1, minx_np + 1, field.shape[1]) - std::vector maxx(x.size()); - for (size_t i = 0; i < x.size(); ++i) { - maxx[i] = std::min(fieldW, std::max(minx[i] + 1, (int)std::round(x[i] + width[i]) + 1)); - } - - // maxy_np = np.round(y_np + width_np).astype(np.int) - // maxy_np = np.clip(maxy_np + 1, miny_np + 1, field.shape[0]) - std::vector maxy(y.size()); - for (size_t i = 0; i < y.size(); ++i) { - maxy[i] = std::min(fieldH, std::max(miny[i] + 1, (int)std::round(y[i] + width[i]) + 1)); + void set(size_t d0_, size_t d1_, size_t d2_) + { + occupancy_view[(d1 * d2) * d0_ + d2 * d1_ + d2_] = 1; } +}; - // for i in range(minx.shape[0]): - // for xx in range(minx[i], maxx[i]): - // deltax = xx - x[i] - // for yy in range(miny[i], maxy[i]): - // deltay = yy - y[i] - // vv = v[i] * np.exp(-0.5 * (deltax**2 + deltay**2) / sigma[i]**2) - // field[yy, xx] += vv - for (size_t i = 0; i < minx.size(); ++i) { - for (int xx = minx[i]; xx < maxx[i]; ++xx) { - float deltax = (float)xx - x[i]; +namespace lpdnn { 
+namespace aiapp_impl { + + constexpr int OpenPifPafPostprocessor::bones[19][2] = { + { 16, 14 }, + { 14, 12 }, + { 17, 15 }, + { 15, 13 }, + { 12, 13 }, + { 6, 12 }, + { 7, 13 }, + { 6, 7 }, + { 6, 8 }, + { 7, 9 }, + { 8, 10 }, + { 9, 11 }, + { 2, 3 }, + { 1, 2 }, + { 1, 3 }, + { 2, 4 }, + { 3, 5 }, + { 4, 6 }, + { 5, 7 }, + }; + + struct to_point { + int field_id; + bool possitve; + }; + + auto BY_SOURCE_MAP = [] { + // print(self.by_source) + // for i in range(17): + // for (end_i), (caf_i, connect) in self.by_source[i].items(): + // data = f'to_point{{{caf_i}, {"true" if connect else "false"}}}' + // print(f'smap[{i}][{end_i}] = {data};') + std::array>, 17> smap; + smap[0][1] = to_point{ 13, true }; + smap[0][2] = to_point{ 14, true }; + smap[1][2] = to_point{ 12, true }; + smap[1][0] = to_point{ 13, false }; + smap[1][3] = to_point{ 15, true }; + smap[2][1] = to_point{ 12, false }; + smap[2][0] = to_point{ 14, false }; + smap[2][4] = to_point{ 16, true }; + smap[3][1] = to_point{ 15, false }; + smap[3][5] = to_point{ 17, true }; + smap[4][2] = to_point{ 16, false }; + smap[4][6] = to_point{ 18, true }; + smap[5][11] = to_point{ 5, true }; + smap[5][6] = to_point{ 7, true }; + smap[5][7] = to_point{ 8, true }; + smap[5][3] = to_point{ 17, false }; + smap[6][12] = to_point{ 6, true }; + smap[6][5] = to_point{ 7, false }; + smap[6][8] = to_point{ 9, true }; + smap[6][4] = to_point{ 18, false }; + smap[7][5] = to_point{ 8, false }; + smap[7][9] = to_point{ 10, true }; + smap[8][6] = to_point{ 9, false }; + smap[8][10] = to_point{ 11, true }; + smap[9][7] = to_point{ 10, false }; + smap[10][8] = to_point{ 11, false }; + smap[11][13] = to_point{ 1, false }; + smap[11][12] = to_point{ 4, true }; + smap[11][5] = to_point{ 5, false }; + smap[12][14] = to_point{ 3, false }; + smap[12][11] = to_point{ 4, false }; + smap[12][6] = to_point{ 6, false }; + smap[13][15] = to_point{ 0, false }; + smap[13][11] = to_point{ 1, true }; + smap[14][16] = to_point{ 2, false }; + 
smap[14][12] = to_point{ 3, true }; + smap[15][13] = to_point{ 0, true }; + smap[16][14] = to_point{ 2, true }; + return smap; + }(); + + static const int C = 17; + static const float STRIDE = 8.0f; + static const float seedThreshold = 0.3f; // 0.5 + //static const float keypointThreshold = 0.15f; + static const float instanceThreshold = 0.2f; + + static void scalarSquareAddConstant(float* field, + int fieldH, + int fieldW, + const std::vector& x, + const std::vector& y, + const std::vector& width, + const std::vector& v) + { + // minx_np = np.round(x_np - width_np).astype(np.int) + // minx_np = np.clip(minx_np, 0, field.shape[1] - 1) + std::vector minx(x.size()); + for (size_t i = 0; i < x.size(); ++i) { + minx[i] = std::min(fieldW - 1, std::max(0, (int)std::round(x[i] - width[i]))); + } + + // miny_np = np.round(y_np - width_np).astype(np.int) + // miny_np = np.clip(miny_np, 0, field.shape[0] - 1) + std::vector miny(y.size()); + for (size_t i = 0; i < y.size(); ++i) { + miny[i] = std::min(fieldH - 1, std::max(0, (int)std::round(y[i] - width[i]))); + } + + // maxx_np = np.round(x_np + width_np).astype(np.int) + // maxx_np = np.clip(maxx_np + 1, minx_np + 1, field.shape[1]) + std::vector maxx(x.size()); + for (size_t i = 0; i < x.size(); ++i) { + maxx[i] = std::min(fieldW, std::max(minx[i] + 1, (int)std::round(x[i] + width[i]) + 1)); + } + + // maxy_np = np.round(y_np + width_np).astype(np.int) + // maxy_np = np.clip(maxy_np + 1, miny_np + 1, field.shape[0]) + std::vector maxy(y.size()); + for (size_t i = 0; i < y.size(); ++i) { + maxy[i] = std::min(fieldH, std::max(miny[i] + 1, (int)std::round(y[i] + width[i]) + 1)); + } + + // for i in range(minx.shape[0]): + // for xx in range(minx[i], maxx[i]): + // for yy in range(miny[i], maxy[i]): + // field[yy, xx] += v[i] + for (size_t i = 0; i < minx.size(); ++i) { for (int yy = miny[i]; yy < maxy[i]; ++yy) { - float deltay = (float)yy - y[i]; - float vv = v[i] * std::exp(-0.5f * (deltax*deltax + deltay*deltay) / 
(sigma[i]*sigma[i])); - field[yy * fieldW + xx] += vv; + for (int xx = minx[i]; xx < maxx[i]; ++xx) { + field[yy * fieldW + xx] += v[i]; + } } } } - /* - // For debugging - for (int y = 0; y < fieldH; ++y) { - for (int x = 0; x = fieldW) { return; } - if (miny >= fieldH) { return; } - - // field[miny:maxy, minx:maxx] += value - for (auto yy = miny; yy < maxy; ++yy) { - for (auto xx = minx; xx < maxx; ++xx) { - field[yy * fieldW + xx] += value; + static void scalarSquareAddGaussWitMax(float* field, + int fieldH, + int fieldW, + const std::vector& x, + const std::vector& y, + const std::vector& sigma_, + const std::vector& v, + float truncate, + float max_val = 1.0f) + { + // // ganler! + // assert(v.size() == x.size() == y.size() == sigma_.size()); + for (size_t i = 0; i < x.size(); ++i) { + float csigma = sigma_[i]; + float truncate_csigma = csigma * truncate; + float cx = x[i]; + float cy = y[i]; + float cv = v[i]; + const auto clip = [](float val, float low, float high) { + return std::max(low, std::min(high, val)); + }; + + // printf("%f, %f, %f, %f, %f\n", cx, cy, csigma, truncate_csigma, max_val); + const int64_t minx = clip(cx - truncate_csigma, 0, fieldW - 1); + const int64_t maxx = clip(cx + truncate_csigma + 1, minx + 1, fieldW); + const int64_t miny = clip(cy - truncate_csigma, 0, fieldH - 1); + const int64_t maxy = clip(cy + truncate_csigma + 1, miny + 1, fieldH); + // std::cout << minx << '\t' << maxx << '\t' << miny << '\t' << maxy << '\n'; + // printf("%lli, %lli, %lli, %lli\n", minx, maxx, miny, maxy); + + for (int64_t xx = minx; xx < maxx; ++xx) { + float deltax2 = (xx - cx) * (xx - cx); + for (int64_t yy = miny; yy < maxy; ++yy) { + float deltay2 = (yy - cy) * (yy - cy); + + if (deltax2 + deltay2 > truncate_csigma * truncate_csigma) { + continue; + } + + const auto approx_exp = [](float x) { + if (x > 2 || x < -2) + return 0.f; + x = 1.f + x / 8; + x *= x; + x *= x; + x *= x; + return x; + }; + float vv = (deltax2 < 0.25 && deltay2 < 0.25) ? 
cv : cv * approx_exp(-0.5 * (deltax2 + deltay2) / (csigma * csigma)); + field[yy * fieldW + xx] += vv; + field[yy * fieldW + xx] = std::min(max_val, field[yy * fieldW + xx]); + } + } } } -} - -/** - Combines the different PAF outputs into one big (19, 2, 4, h, w) tensor. - - The input tensors have the shape (19, h, w) except for j1/j2Fields, which - are (38, h, w). -*/ -void OpenPifPafPostprocessor::normalizePAF(const float* intensityFields, - const float* j1Fields, - const float* j2Fields, - const float* j1FieldsLogb, - const float* j2FieldsLogb) -{ - float* pafPtr = paf.data(); - - // Strides for the first dimension of the input tensors: - const size_t if_stride_0 = H * W; - const size_t j1f_stride_0 = H * W; - const size_t j1bf_stride_0 = H * W; - const size_t j2f_stride_0 = H * W; - const size_t j2bf_stride_0 = H * W; - - for (int i = 0; i < 19; ++i) { - // Copy the next h*w values from intensityFields. - size_t ifOffset = i * if_stride_0; - size_t outOffset = i * paf_stride_0; - memcpy(pafPtr + outOffset, intensityFields + ifOffset, H * W * sizeof(float)); - - // Copy the next 2 h*w values from j1Fields. - size_t j1fOffset = (i * 2) * j1f_stride_0; - outOffset += paf_stride_2; - memcpy(pafPtr + outOffset, j1Fields + j1fOffset, 2 * H * W * sizeof(float)); - - // Also add the index field to the values from j1Fields. - vadd(indexField.data(), j1Fields + j1fOffset, pafPtr + outOffset, 2 * H * W); - - // Copy the next h*w values from j1FieldsLogb and exponentiate. - size_t j1bfOffset = i * j1bf_stride_0; - outOffset += paf_stride_2 * 2; - memcpy(pafPtr + outOffset, j1FieldsLogb + j1bfOffset, H * W * sizeof(float)); - vexp(pafPtr + outOffset, H * W); - - // Copy the same h*w values from intensityFields again. - outOffset = i * paf_stride_0 + paf_stride_1; - memcpy(pafPtr + outOffset, intensityFields + ifOffset, H * W * sizeof(float)); - - // Copy the next 2 h*w values from j2Fields. 
- size_t j2fOffset = (i * 2) * j2f_stride_0; - outOffset += paf_stride_2; - memcpy(pafPtr + outOffset, j2Fields + j2fOffset, 2 * H * W * sizeof(float)); - - // Also add the index field to the values from j2Fields. - vadd(indexField.data(), j2Fields + j2fOffset, pafPtr + outOffset, 2 * H * W); - - // Copy the next h*w values from j2FieldsLogb and exponentiate. - size_t j2bfOffset = i * j2bf_stride_0; - outOffset += paf_stride_2 * 2; - memcpy(pafPtr + outOffset, j2FieldsLogb + j2bfOffset, H * W * sizeof(float)); - vexp(pafPtr + outOffset, H * W); - } - - // NOTE: We could do the exponentiation for j1/j2FieldsLogb in the Core ML - // model already. - /* - // For debugging - for (int y = 0; y < H; ++y) { - printf("%d: ", y); - for (int x = 0; x < W; ++x) { - printf("%f, ", paf[9*paf_stride_0 + 2*paf_stride_1 + 7*paf_stride_2 + y*W + x]); - } - printf("\n"); - } - */ -} - -/** - Combines the different PIF outputs into one big (17, 4, h, w) tensor. - - The input tensors have the shape (17, h, w) except for jointFields, which - is (34, h, w). -*/ -void OpenPifPafPostprocessor::normalizePIF(const float* jointIntensityFields, - const float* jointFields, - const float* scaleFields) -{ - float* pifPtr = pif.data(); - - // Strides for the first dimension of the input tensors: - const size_t iif_stride_0 = H * W; - const size_t jf_stride_0 = H * W; - const size_t sf_stride_0 = H * W; - - // The PyTorch code concatenates the following tensors: - // (17, 1, h, w) - // (17, 2, h, w) - // (17, 1, h, w) - // along the 2nd axis into one tensor of shape (17, 4, h, w). But the - // tensors from Core ML have the following shapes: - // (17, h, w) - // (34, h, w) - // (17, h, w) - // Fortunately, (17, 2, ...) has the same memory layout as (34, ...), - // so we can simply do a bunch of memcpy's. - - for (int i = 0; i < 17; ++i) { - // Copy the next h*w values from jointIntensityFields. 
- size_t jifOffset = i * iif_stride_0; - size_t outOffset = i * pif_stride_0; - memcpy(pifPtr + outOffset, jointIntensityFields + jifOffset, H * W * sizeof(float)); - - // Copy the next 2 h*w values from jointFields. - size_t jfOffset = (i * 2) * jf_stride_0; - outOffset += pif_stride_1; - memcpy(pifPtr + outOffset, jointFields + jfOffset, 2 * H * W * sizeof(float)); - - // Also add the index field to the values from jointFields. - vadd(indexField.data(), jointFields + jfOffset, pifPtr + outOffset, 2 * H * W); - - // Copy the next h*w values from scaleFields. - size_t sfOffset = i * sf_stride_0; - outOffset += pif_stride_1 * 2; - memcpy(pifPtr + outOffset, scaleFields + sfOffset, H * W * sizeof(float)); - } -} - -OpenPifPafPostprocessor::Target_intensity -OpenPifPafPostprocessor::targetIntensities(const std::vector& pif, - float v_th, bool coreOnly) -{ - const float pif_nn = 16.0f; - - const size_t targets_stride_0 = H_hr * W_hr; - const size_t scales_stride_0 = H_hr * W_hr; - const size_t ns_stride_0 = H_hr * W_hr; - - // These tensors need to be emptied out on each frame. - vfill(targetsCoreOnly.data(), targetsCoreOnly.size(), 0.0f); - vfill(targets.data(), targets.size(), 0.0f); - vfill(scales.data(), scales.size(), 0.0f); - vfill(ns.data(), ns.size(), 0.0f); - - std::vector v; - std::vector x; - std::vector y; - std::vector s; - - for (int i = 0; i < C; ++i) { - // Threshold pif[i, ...], which is a (4, h, w) tensor. Copy the values - // that are over the threshold into four vectors: v, x, y, s. Multiply - // x, y, s with the stride. 
- // - // v, x, y, s = p[:, p[0] > v_th] - // x = x * self.stride - // y = y * self.stride - // s = s * self.stride - v.clear(); - x.clear(); - y.clear(); - s.clear(); - const size_t pifOffset = i * pif_stride_0; - const size_t xOffset = pifOffset + pif_stride_1; - const size_t yOffset = xOffset + pif_stride_1; - const size_t sOffset = yOffset + pif_stride_1; - for (int j = 0; j < H*W; ++j) { - float p = pif[pifOffset + j]; - if (p > v_th) { - v.push_back(p); - x.push_back(pif[xOffset + j] * stride); - y.push_back(pif[yOffset + j] * stride); - s.push_back(pif[sOffset + j] * stride); + static void scalarSquareAddSingle(Occupancy& field, + int field_idx, + int fieldH, + int fieldW, + float x, + float y, + float width, + float reduction = 1.0, + float min_scaled_reduced = 0.0) + { + if (reduction != 1.0) { + x /= reduction; + y /= reduction; + width = std::max(min_scaled_reduced, width / reduction); + } + + // minx = max(0, int(round(x - width))) + // miny = max(0, int(round(y - width))) + auto minx = std::min(fieldW - 1, std::max(0, (int)(x - width))); + auto miny = std::min(fieldH - 1, std::max(0, (int)(y - width))); + + // maxx = max(minx + 1, min(field.shape[1], int(round(x + width)) + 1)) + // maxy = max(miny + 1, min(field.shape[0], int(round(y + width)) + 1)) + auto maxx = std::min(fieldW, std::max(minx + 1, std::min(fieldW, (int)(x + width) + 1))); + auto maxy = std::min(fieldH, std::max(miny + 1, std::min(fieldH, (int)(y + width) + 1))); + + // field[miny:maxy, minx:maxx] += value + for (auto yy = miny; yy < maxy; ++yy) { + for (auto xx = minx; xx < maxx; ++xx) { + field.set(field_idx, yy, xx); } } - - /* - // For debugging - printf("iteration: %d\n", i); - printf("v:\n"); for (auto n : v) printf("%f, ", n); printf("\n"); - printf("x:\n"); for (auto n : x) printf("%f, ", n); printf("\n"); - printf("y:\n"); for (auto n : y) printf("%f, ", n); printf("\n"); - printf("s:\n"); for (auto n : s) printf("%f, ", n); printf("\n"); - */ - - // Create a high-resolution 
confidence map for this keypoint. - - // v / pif_nn - std::vector v_over_pif_nn(v.size()); - vsmul(v.data(), 1.0f / pif_nn, v_over_pif_nn.data(), v.size()); - - // The original code computed the "core only" version in a separate step - // but that duplicates a bunch of work, so we do it at the same time. - const auto tco = targetsCoreOnly.data() + i * targets_stride_0; - scalarSquareAddGauss(tco, H_hr, W_hr, x, y, s, v_over_pif_nn, 0.5); - - // s * v - std::vector s_times_v(v.size()); - vmul(s.data(), v.data(), s_times_v.data(), v.size()); - - const auto t = targets.data() + i * targets_stride_0; - const auto scale = scales.data() + i * scales_stride_0; - const auto n = ns.data() + i * ns_stride_0; - scalarSquareAddGauss(t, H_hr, W_hr, x, y, s, v_over_pif_nn); - scalarSquareAddConstant(scale, H_hr, W_hr, x, y, s, s_times_v); - scalarSquareAddConstant(n, H_hr, W_hr, x, y, s, v); - } - - // m = ns > 0 - // scales[m] = scales[m] / ns[m] - for (size_t i = 0; i < scales.size(); ++i) { - const auto d = ns[i]; - if (d > 0) { scales[i] /= d; } } - return Target_intensity{ targets, scales, targetsCoreOnly }; -} - -OpenPifPafPostprocessor::Paf_target -OpenPifPafPostprocessor::scorePafTarget(const std::vector& pafvec, - const std::vector& pifhr, - float pifhr_floor, - float score_th) const -{ - std::vector> scored_forward; - std::vector> scored_backward; - - for (int c = 0; c < 19; ++c) { - // The PAF has shape (19, 2, 4, h, w). We're looking at one (2, 4, h, w) - // slice at a time in this loop. 
- const size_t pafOffset = c * paf_stride_0; - - // scores = np.min(fourds[:, 0], axis=0) - // mask = scores > score_th - // scores = scores[mask] - std::vector scores; - std::vector mask; - for (int i = 0; i < H * W; ++i) { - auto a = pafvec[pafOffset + i]; - auto b = pafvec[pafOffset + paf_stride_1 + i]; - auto score = std::min(a, b); - if (score > score_th) { - scores.push_back(score); - mask.push_back(i); + OpenPifPafPostprocessor::Target_intensity + OpenPifPafPostprocessor::targetIntensities(const std::vector& pif, + float v_th, bool coreOnly) + { + constexpr float PIF_NN = 16.0f; + + const size_t targets_stride_0 = H_hr * W_hr; + const size_t scales_stride_0 = H_hr * W_hr; + const size_t ns_stride_0 = H_hr * W_hr; + + // These tensors need to be emptied out on each frame. + vfill(targetsCoreOnly.data(), targetsCoreOnly.size(), 0.0f); + vfill(targets.data(), targets.size(), 0.0f); + vfill(scales.data(), scales.size(), 0.0f); + vfill(ns.data(), ns.size(), 0.0f); + + std::vector v; + std::vector x; + std::vector y; + std::vector s; + + for (int i = 0; i < C; ++i) { + // Threshold pif[i, ...], which is a (4, h, w) tensor. Copy the values + // that are over the threshold into four vectors: v, x, y, s. Multiply + // x, y, s with the stride. 
+ // + // v, x, y, s = p[:, p[0] > v_th] + // x = x * self.stride + // y = y * self.stride + // s = s * self.stride + v.clear(); + x.clear(); + y.clear(); + s.clear(); + const size_t pifOffset = i * pif_stride_0; + const size_t xOffset = pifOffset + pif_stride_1; + const size_t yOffset = xOffset + pif_stride_1; + const size_t sOffset = yOffset + pif_stride_1 * 2; + for (int j = 0; j < H * W; ++j) { + float p = pif[pifOffset + j]; + if (p > v_th) { + v.push_back(p); + x.push_back(pif[xOffset + j] * STRIDE); + y.push_back(pif[yOffset + j] * STRIDE); + s.push_back(std::max(1., 0.5 * pif[sOffset + j] * STRIDE)); + } } - } - // fourds = fourds[:, :, mask] - const size_t scores_size = scores.size(); - std::vector masked(2 * 4 * scores_size); - for (size_t i = 0; i < mask.size(); ++i) { - const auto m = mask[i]; - masked[i ] = pafvec[pafOffset + m]; - masked[i + scores_size ] = pafvec[pafOffset + paf_stride_2 + m]; - masked[i + scores_size*2] = pafvec[pafOffset + paf_stride_2*2 + m]; - masked[i + scores_size*3] = pafvec[pafOffset + paf_stride_2*3 + m]; - masked[i + scores_size*4] = pafvec[pafOffset + paf_stride_1 + m]; - masked[i + scores_size*5] = pafvec[pafOffset + paf_stride_1 + paf_stride_2 + m]; - masked[i + scores_size*6] = pafvec[pafOffset + paf_stride_1 + paf_stride_2*2 + m]; - masked[i + scores_size*7] = pafvec[pafOffset + paf_stride_1 + paf_stride_2*3 + m]; - } + /* + // For debugging + printf("iteration: %d\n", i); + printf("v:\n"); for (auto n : v) printf("%f, ", n); printf("\n"); + printf("x:\n"); for (auto n : x) printf("%f, ", n); printf("\n"); + printf("y:\n"); for (auto n : y) printf("%f, ", n); printf("\n"); + printf("s:\n"); for (auto n : s) printf("%f, ", n); printf("\n"); + */ - std::vector scores_b(scores_size); - if (pifhr_floor < 1.0f) { - // ij_b = np.round(fourds[0, 1:3] * self.stride).astype(np.int) - // ij_b[0] = np.clip(ij_b[0], 0, self._pifhr.shape[2] - 1) - // ij_b[1] = np.clip(ij_b[1], 0, self._pifhr.shape[1] - 1) - std::vector ij_b(2 * 
scores_size); - for (size_t i = 0; i < scores_size*2; ++i) { - const int v = (int)std::round(masked[scores_size + i] * stride); - ij_b[i] = std::min(std::max(0, v), i < scores_size ? W_hr - 1 : H_hr - 1); + // Create a high-resolution confidence map for this keypoint. + // std::cout << x.size() << '\t'<< y.size() << '\t'<< v.size() << '\t' << s.size() << '\n'; + // v / pif_nn + std::vector v_over_pif_nn(v.size()); + vsmul(v.data(), 1.0f / PIF_NN, v_over_pif_nn.data(), v.size()); + + // The original code computed the "core only" version in a separate step + // but that duplicates a bunch of work, so we do it at the same time. + const auto tco = targetsCoreOnly.data() + i * targets_stride_0; + scalarSquareAddGaussWitMax(tco, H_hr, W_hr, x, y, s, v_over_pif_nn, 1.0f, 1.0f); + + size_t cnt = 0; + for (size_t dd = 0; dd < targets_stride_0; ++dd) { + if (tco[dd] > 0.01) + ++cnt; } - - // pifhr_b = self._pifhr[j1i, ij_b[1], ij_b[0]] - // scores_b = scores * (pifhr_floor + (1.0 - pifhr_floor) * pifhr_b) - const auto j1i = bones[c][0] - 1; - for (size_t i = 0; i < scores_b.size(); ++i) { - const auto pifhr_b = pifhr[j1i * pifhr_stride_0 + ij_b[scores_size + i] * pifhr_stride_1 + ij_b[i]]; - scores_b[i] = scores[i] * (pifhr_floor + (1.0f - pifhr_floor) * pifhr_b); + // std::cout << targets_stride_0 << '\t' << i << '\t'<< cnt << '\t' << tco[0] << '\n'; + + // s * v + std::vector s_times_v(v.size()); + vmul(s.data(), v.data(), s_times_v.data(), v.size()); + + const auto t = targets.data() + i * targets_stride_0; + const auto scale = scales.data() + i * scales_stride_0; + const auto n = ns.data() + i * ns_stride_0; + scalarSquareAddGaussWitMax(t, H_hr, W_hr, x, y, s, v_over_pif_nn, 1.0f); + scalarSquareAddConstant(scale, H_hr, W_hr, x, y, s, s_times_v); + scalarSquareAddConstant(n, H_hr, W_hr, x, y, s, v); + } + + // m = ns > 0 + // scales[m] = scales[m] / ns[m] + for (size_t i = 0; i < scales.size(); ++i) { + const auto d = ns[i]; + if (d > 0) { + scales[i] /= d; } - } else { 
- scores_b = scores; } - - // mask_b = scores_b > score_th - std::vector mask_b; - for (int i = 0; i < (int)scores_b.size(); ++i) { - if (scores_b[i] > score_th) { mask_b.push_back(i); } + return Target_intensity{ targets, scales, targetsCoreOnly }; + } + + std::tuple + OpenPifPafPostprocessor::growConnectionBlend(float x, float y, float s, const std::array, 9>& paf_field) + { + // # source value + // paf_field = paf_center(paf_field, xy[0], xy[1], sigma=2.0) + // if paf_field.shape[1] == 0: + // return 0, 0, 0 + const float sigma = 2.0 * s; + const float sigma2 = 0.25 * s * s; + size_t score_1_i = 0, score_2_i = 0; + float score_1 = 0, score_2 = 0; + + const int paf_stride = paf_field.front().size(); + for (int i = 0; i < paf_stride; ++i) { + if ((paf_field[1][i] < x - sigma) || (paf_field[1][i] > x + sigma) || (paf_field[2][i] < y - sigma) || (paf_field[2][i] > y + sigma)) + continue; + float d2 = (paf_field[1][i] - x) * (paf_field[1][i] - x) + (paf_field[2][i] - y) * (paf_field[2][i] - y); + float score = std::exp(-0.5 * d2 / sigma2) * paf_field[0][i]; + if (score >= score_1) { + score_2_i = score_1_i; + score_2 = score_1; + score_1_i = i; + score_1 = score; + } else if (score > score_2) { + score_2_i = i; + score_2 = score; + } } - const size_t mask_b_size = mask_b.size(); - std::vector result_b(7 * mask_b_size); - for (size_t i = 0; i < mask_b_size; ++i) { - const auto m = mask_b[i]; - result_b[i ] = scores_b[m]; - result_b[i + mask_b_size ] = masked[scores_size*5 + m]; - result_b[i + mask_b_size*2] = masked[scores_size*6 + m]; - result_b[i + mask_b_size*3] = masked[scores_size*7 + m]; - result_b[i + mask_b_size*4] = masked[scores_size + m]; - result_b[i + mask_b_size*5] = masked[scores_size*2 + m]; - result_b[i + mask_b_size*6] = masked[scores_size*3 + m]; - } - scored_backward.push_back(result_b); - - std::vector scores_f(scores_size); - if (pifhr_floor < 1.0f) { - // ij_f = np.round(fourds[1, 1:3] * self.stride).astype(np.int) - // ij_f[0] = 
np.clip(ij_f[0], 0, self._pifhr.shape[2] - 1) - // ij_f[1] = np.clip(ij_f[1], 0, self._pifhr.shape[1] - 1) - std::vector ij_f(2 * scores_size); - for (size_t i = 0; i < scores_size*2; ++i) { - const int v = (int)std::round(masked[scores_size*5 + i] * stride); - ij_f[i] = std::min(std::max(0, v), i < scores_size ? W_hr - 1 : H_hr - 1); + if (score_1 == 0) + return { 0, 0, 0, 0 }; + + auto entry_1 = std::make_tuple(paf_field[3][score_1_i], paf_field[4][score_1_i], paf_field[6][score_1_i], paf_field[8][score_1_i]); + + auto [ex1, ey1, eb1, es1] = entry_1; + if (score_2 < 0.01 || score_2 < 0.5 * score_1) { + return { ex1, ey1, es1, score_1 * 0.5 }; + } + + // blend... + auto entry_2 = std::make_tuple(paf_field[3][score_2_i], paf_field[4][score_2_i], paf_field[6][score_2_i], paf_field[8][score_2_i]); + auto [ex2, ey2, eb2, es2] = entry_2; + + float blend_d2 = (ex1 - ex2) * (ex1 - ex2) + (ey1 - ey2) * (ey1 - ey2); + if (blend_d2 > ((es1 * es1) / 4)) { + return { ex1, ey1, es1, score_1 * 0.5 }; + } + + return { + // xysv + (score_1 * ex1 + score_2 * ex2) / (score_1 + score_2), + (score_1 * ey1 + score_2 * ey2) / (score_1 + score_2), + (score_1 * es1 + score_2 * es2) / (score_1 + score_2), + 0.5 * (score_1 + score_2), + }; + } + + using xysv = std::optional>; + + struct queue_item { // -score, xyv, start_i, end_i + template + queue_item(Args&&... 
args) + : data(std::make_tuple(std::forward(args)...)) + { + } + std::tuple data; + friend bool operator>(const queue_item& l, const queue_item& r) + { + return std::get<0>(l.data) >= std::get<0>(r.data); + } + friend bool operator<(const queue_item& l, const queue_item& r) + { + return std::get<0>(l.data) < std::get<0>(r.data); + } + }; + + void OpenPifPafPostprocessor::grow(Annotation& ann, + const FBContainer& pafForward, + const FBContainer& pafBackward) + { + // frontierActive = true; + // blockFrontier.clear(); + std::set> in_frontier{}; + std::priority_queue, std::greater> frontier; + + const auto add_to_frontier = [&](size_t start_i) { + for (const auto& [end_i, to_p] : BY_SOURCE_MAP[start_i]) { + int caf_i = to_p.field_id; + // std::cout << "----> " << start_i << '\t' << end_i << '\t' << caf_i << '\n'; + if (ann.keypoints[3 * end_i + 2] > 0.0) { + // std::cout << "CONTINUE start_i = " << start_i << '\n'; + continue; + } + // found! + if (in_frontier.cend() != in_frontier.find(std::make_pair(start_i, end_i))) { + // std::cout << "CONTINUE map already got you!\n"; + continue; + } + + float max_possible_score = std::sqrt(ann.keypoints[3 * start_i + 2]); + // std::cout << "put " << start_i << ' ' << end_i << "\tscore = " << max_possible_score << "\n"; + frontier.emplace(-max_possible_score, std::nullopt, start_i, end_i); + in_frontier.emplace(start_i, end_i); } - - // pifhr_f = self._pifhr[j2i, ij_f[1], ij_f[0]] - // scores_f = scores * (pifhr_floor + (1.0 - pifhr_floor) * pifhr_f) - const auto j2i = bones[c][1] - 1; - for (size_t i = 0; i < scores_f.size(); ++i) { - const auto pifhr_f = pifhr[j2i * pifhr_stride_0 + ij_f[scores_size + i] * pifhr_stride_1 + ij_f[i]]; - scores_f[i] = scores[i] * (pifhr_floor + (1.0f - pifhr_floor) * pifhr_f); + }; + + const auto frontier_get = [&]() -> std::optional { + while (!frontier.empty()) { + auto entry = frontier.top(); + frontier.pop(); + + { + auto [_a, _b, start_i, end_i] = entry.data; + // std::cout << "POP " << 
start_i << ' ' << end_i << " has val = " << std::get<1>(entry.data).has_value() << '\n'; + } + + if (std::get<1>(entry.data).has_value()) { + // std::cout << "RETURN \n"; + return entry; + } + + auto [_a, _b, start_i, end_i] = entry.data; + if (ann.keypoints[end_i * 3 + 2] > 0.0) + continue; + + // connection_value(self, ann, caf_scored, start_i, end_i, *, reverse_match=True): + auto new_xysv = [&](size_t start_i, size_t end_i) -> xysv { + const auto& point = BY_SOURCE_MAP[start_i][end_i]; + int caf_i = point.field_id; + bool is_forward = point.possitve; + const auto& caf_f = is_forward ? pafForward[caf_i] : pafBackward[caf_i]; // [19, 9, N] + const auto& caf_b = is_forward ? pafBackward[caf_i] : pafForward[caf_i]; + auto [x, y, v] = std::make_tuple(ann.keypoints[start_i * 3], ann.keypoints[start_i * 3 + 1], ann.keypoints[start_i * 3 + 2]); + float xy_scale_s = std::max(0.f, ann.jointScales[start_i]); + const auto [nx, ny, ns, nv] = growConnectionBlend(x, y, xy_scale_s, caf_f); + // std::cout << "NEW:\t" << nx << '\t' << ny << '\t' << ns << '\t' << nv << '\n'; + + if (nv == 0) + return std::nullopt; + + float keypoint_score = std::sqrt(nv * v); + if (keypoint_score < keypointThreshold) + return std::nullopt; + // Use relative threshold + constexpr float keypoint_threshold_rel = 0.5; + if (keypoint_score < v * keypoint_threshold_rel) + return std::nullopt; + + float xy_scale_t = std::max(0.f, ns); + // if self.reverse_match and reverse_match -> true + const auto [rx, ry, rs, rv] = growConnectionBlend(nx, ny, xy_scale_t, caf_b); + // std::cout << "REVERSE:\t" << rx << '\t' << ry << '\t' << rs << '\t' << rv << '\n'; + if (rs == 0 || std::abs(x - rx) + std::abs(y - ry) > xy_scale_s) + return std::nullopt; + + return std::make_tuple(nx, ny, ns, keypoint_score); + }(start_i, end_i); + + if (std::nullopt == new_xysv) + continue; + + frontier.emplace(-std::get<3>(new_xysv.value()), new_xysv, start_i, end_i); } - } else { - scores_f = scores; - } - - // mask_f = scores_f > 
score_th - std::vector mask_f; - for (int i = 0; i < (int)scores_b.size(); ++i) { - if (scores_f[i] > score_th) { mask_f.push_back(i); } - } + return std::nullopt; + }; - // scored_forward.append(np.concatenate(( - // np.expand_dims(scores_f[mask_f], 0), - // fourds[0, 1:4][:, mask_f], - // fourds[1, 1:4][:, mask_f], - // ))) - const size_t mask_f_size = mask_f.size(); - std::vector result_f(7 * mask_f_size); - for (size_t i = 0; i < mask_f_size; ++i) { - const auto m = mask_f[i]; - result_f[i ] = scores_f[m]; - result_f[i + mask_f_size ] = masked[scores_size + m]; - result_f[i + mask_f_size*2] = masked[scores_size*2 + m]; - result_f[i + mask_f_size*3] = masked[scores_size*3 + m]; - result_f[i + mask_f_size*4] = masked[scores_size*5 + m]; - result_f[i + mask_f_size*5] = masked[scores_size*6 + m]; - result_f[i + mask_f_size*6] = masked[scores_size*7 + m]; + for (size_t joint_i = 0; joint_i < N_PIFPAF_KEYPOINTS; ++joint_i) { + if (ann.keypoints[3 * joint_i + 2] != 0.0) { + // std::cout << "-----joint_i " << joint_i << "\n"; + add_to_frontier(joint_i); + } } - scored_forward.push_back(result_f); - - /* - // For debugging - printf("iteration: %d\n", c); - printf("scores:\n"); for (auto n : scores) printf("%f, ", n); printf("\n"); - printf("mask:\n"); for (auto n : mask) printf("%d, ", n); printf("\n"); - printf("masked:\n"); for (auto n : masked) printf("%f, ", n); printf("\n"); - printf("scores_b:\n"); for (auto n : scores_b) printf("%f, ", n); printf("\n"); - printf("scores_f:\n"); for (auto n : scores_f) printf("%f, ", n); printf("\n"); - */ - } - return Paf_target{ scored_forward, scored_backward }; -} -std::vector -OpenPifPafPostprocessor::pifhrSeeds(const std::vector& pifhrScales, - const std::vector& pifhrCore) -{ - std::vector seeds; - - for (int field_i = 0; field_i < 17; ++field_i) { - const size_t pifhrScalesOffset = field_i * pifhr_stride_0; - const size_t pifhrCoreOffset = field_i * pifhr_stride_0; - - // candidates = np.concatenate((index_fields, 
np.expand_dims(f, 0)), 0) - // mask = f > self.seed_threshold - std::vector mask; - for (int i = 0; i < H_hr * W_hr; ++i) { - const auto value = pifhrCore[pifhrCoreOffset + i]; - if (value > seedThreshold) { mask.push_back(i); } - } + while (true) { + auto entry = frontier_get(); + if (!entry.has_value()) + break; - // candidates = np.moveaxis(candidates[:, mask], 0, -1) - // This is a (count, 3) tensor where count is #elements over threshold. - std::vector masked(mask.size() * 3); - for (size_t i = 0; i < mask.size(); ++i) { - const auto m = mask[i]; - masked[i*3 ] = indexField_hr[m]; - masked[i*3 + 1] = indexField_hr[m + H_hr*W_hr]; - masked[i*3 + 2] = pifhrCore[pifhrCoreOffset + m]; + auto [_, new_xysv, jsi, jti] = entry.value().data; + + // std::cout << "jsi = " << jsi << ", jti = " << jti << ", ann.data[jti, 2] = " << ann.keypoints[jti * 3 + 2] << '\n'; + if (ann.keypoints[jti * 3 + 2] > 0.0) + continue; + + auto [nx, ny, ns, nv] = new_xysv.value(); + ann.keypoints[jti * 3 + 0] = nx; + ann.keypoints[jti * 3 + 1] = ny; + ann.keypoints[jti * 3 + 2] = nv; + ann.jointScales[jti] = ns; + add_to_frontier(jti); + } + } + + std::vector OpenPifPafPostprocessor::softNMS(std::vector& annotations) + { + float maxx = 0.0f; + float maxy = 0.0f; + for (auto& ann : annotations) { + for (int k = 0; k < N_PIFPAF_KEYPOINTS; ++k) { + auto x = ann.keypoints[k * 3]; + auto y = ann.keypoints[k * 3 + 1]; + if (x > maxx) { + maxx = x; + } + if (y > maxy) { + maxy = y; + } + } } - // occupied = np.zeros(s.shape) - std::vector occupied(H_hr * W_hr, 0.0f); + const auto h = (int)(maxy + 1); + const auto w = (int)(maxx + 1); + Occupancy occupied(17, h, w); - std::vector sorted(mask.size()); + std::vector sorted(annotations.size()); std::iota(sorted.begin(), sorted.end(), 0); - std::sort(sorted.begin(), sorted.end(), [masked] (int const& a, int const& b) { - return masked[a*3 + 2] > masked[b*3 + 2]; + std::sort(sorted.begin(), sorted.end(), [annotations](int const& a, int const& b) { + 
return annotations[a].score() > annotations[b].score(); }); - // for c in sorted(candidates, key=lambda c: c[2], reverse=True): - for (auto c : sorted) { - const auto c_0 = masked[c*3]; - const auto c_1 = masked[c*3 + 1]; - const auto c_2 = masked[c*3 + 2]; - - // i, j = int(c[0]), int(c[1]) - const auto i = (int)c_0; - const auto j = (int)c_1; - if (occupied[j*W_hr + i] > 0) { continue; } - - // width = max(4, s[j, i]) - const auto s = pifhrScales[pifhrScalesOffset + j * pifhr_stride_1 + i]; - const auto width = std::max(4.0f, s); - - // scalar_square_add_single(occupied, c[0], c[1], width / 2.0, 1.0) - scalarSquareAddSingle(occupied.data(), H_hr, W_hr, c_0, c_1, width / 2.0f, 1.0f); - - // seeds.append((c[2], field_i, c[0] / self.stride, c[1] / self.stride)) - seeds.emplace_back( c_2, field_i, c_0 / stride, c_1 / stride ); - } - } - - // seeds = list(sorted(seeds, reverse=True)) - std::sort(seeds.begin(), seeds.end(), [] (const Pifhr_seed& a, const Pifhr_seed& b) { - const auto ca = std::get<0>(a); - const auto cb = std::get<0>(b); - return ca > cb; - }); - - // if len(seeds) > 500: - // if seeds[500][0] > 0.1: - // seeds = [s for s in seeds if s[0] > 0.1] - // else: - // seeds = seeds[:500] - if (seeds.size() > 500) { - seeds.resize(500); - } - return seeds; -} - -std::vector -OpenPifPafPostprocessor::pafCenter(const std::vector& paf_field, - float x, float y, float sigma) -{ - std::vector mask; - const int paf_stride = (int)paf_field.size() / 7; - for (int i = 0; i < paf_stride; ++i) { - const bool take = (paf_field[ paf_stride + i] > x - sigma * paf_field[3*paf_stride + i]) && - (paf_field[ paf_stride + i] < x + sigma * paf_field[3*paf_stride + i]) && - (paf_field[2*paf_stride + i] > y - sigma * paf_field[3*paf_stride + i]) && - (paf_field[2*paf_stride + i] < y + sigma * paf_field[3*paf_stride + i]); - if (take) { mask.push_back(i); } - } - if (mask.empty()) { return {}; } - - const int mask_size = (int)mask.size(); - const int out_stride = mask_size; - 
std::vector result(7 * mask_size, 0.0f); - for (int j = 0; j < 7; ++j) { - for (int i = 0; i < mask_size; ++i) { - const int m = mask[i]; - result[j*out_stride + i] = paf_field[j*paf_stride + m]; - } - } - return result; -} - -OpenPifPafPostprocessor::Connection -OpenPifPafPostprocessor::growConnection(float x, float y, - const std::vector& paf_field_) -{ - // # source value - // paf_field = paf_center(paf_field, xy[0], xy[1], sigma=2.0) - // if paf_field.shape[1] == 0: - // return 0, 0, 0 - const auto paf_field = pafCenter(paf_field_, x, y, 2.0f); - if (paf_field.empty()) { return Connection{ 0, 0, 0}; } - - // # source distance - // d = np.linalg.norm(np.expand_dims(xy, 1) - paf_field[1:3], axis=0) - // b_source = paf_field[3] * 3.0 - // # combined value and source distance - // v = paf_field[0] - // scores = np.exp(-1.0 * d / b_source) * v # two-tailed cumulative Laplace - const int paf_stride = (int)paf_field.size() / 7; - std::vector scores(paf_stride); - for (int i = 0; i < paf_stride; ++i) { - const auto a = x - paf_field[paf_stride + i]; - const auto b = y - paf_field[paf_stride*2 + i]; - const auto d = std::sqrt(a*a + b*b); - const auto b_source = paf_field[paf_stride*3 + i] * 3.0f; - const auto v = paf_field[i]; - scores[i] = std::exp(-d / b_source) * v; - } - - // return self._target_with_maxscore(paf_field[4:7], scores) - int max_i; - const float score = vargmax(scores.data(), scores.size(), &max_i); - return Connection{ paf_field[paf_stride*4 + max_i], paf_field[paf_stride*5 + max_i], score }; -} - -std::vector OpenPifPafPostprocessor::frontier(Annotation& ann) { - std::vector f; - - for (int connection_i = 0; connection_i < numBones; ++connection_i) { - const auto bone = bones[connection_i]; - const auto j1i = bone[0] - 1; - const auto j2i = bone[1] - 1; - if (ann.keypoints[j1i*3 + 2] > 0.0f && ann.keypoints[j2i*3 + 2] == 0.0f) { - f.emplace_back( ann.keypoints[j1i*3 + 2], connection_i, true, j1i, j2i ); - } - } - - for (int connection_i = 0; 
connection_i < numBones; ++connection_i) { - const auto bone = bones[connection_i]; - const auto j1i = bone[0] - 1; - const auto j2i = bone[1] - 1; - if (ann.keypoints[j2i*3 + 2] > 0.0f && ann.keypoints[j1i*3 + 2] == 0.0f) { - f.emplace_back( ann.keypoints[j2i*3 + 2], connection_i, false, j1i, j2i ); - } - } - - std::sort(f.begin(), f.end(), [] (const frontier_t& a, const frontier_t& b) { - const auto ca = std::get<0>(a); - const auto cb = std::get<0>(b); - return ca > cb; - }); - - return f; -} - -OpenPifPafPostprocessor::frontier_t OpenPifPafPostprocessor::frontierIter(Annotation& ann) { - while (frontierActive) { - // unblocked_frontier = [f for f in self.frontier() - // if (f[1], f[2]) not in block_frontier] - std::vector unblockedFrontier; - for (auto f : frontier(ann)) { - const auto connection_id = std::get<1>(f); - const auto forward = std::get<2>(f); - if (blockFrontier.find(std::tuple{ connection_id, forward }) == blockFrontier.end()) { - unblockedFrontier.push_back(f); + for (auto a : sorted) { + Annotation& ann = annotations[a]; + for (int k = 0; k < N_PIFPAF_KEYPOINTS; ++k) { + const auto x = ann.keypoints[k * 3]; + const auto y = ann.keypoints[k * 3 + 1]; + const auto v = ann.keypoints[k * 3 + 2]; + if (v == 0) { + continue; + } + + const auto i = std::min(std::max(0, (int)std::round(x)), w - 1); + const auto j = std::min(std::max(0, (int)std::round(y)), h - 1); + + if (occupied.fuzz_get(k, j, i)) { + ann.keypoints[k * 3 + 2] = 0.0f; + } else { + scalarSquareAddSingle(occupied, k, h, w, x, y, ann.jointScales[k]); + } } } - /* - // For debugging - printf("unblockedFrontier "); - for (auto n : unblockedFrontier) { - printf("(%f, %d, %s, %d, %d), ", std::get<0>(n), std::get<1>(n), - std::get<2>(n) ? 
"true" : "false", - std::get<3>(n), std::get<4>(n)); - } - printf("\n"); - */ - - // if not unblocked_frontier: - // break - if (unblockedFrontier.empty()) { - frontierActive = false; - break; - } - - // first = unblocked_frontier[0] - // yield first - // block_frontier.add((first[1], first[2])) - const auto first = unblockedFrontier[0]; - const auto connection_id = std::get<1>(first); - const auto forward = std::get<2>(first); - blockFrontier.insert(std::tuple{ connection_id, forward }); - return first; - } - return {}; -} - -void OpenPifPafPostprocessor::grow(Annotation& ann, - const std::vector>& pafForward, - const std::vector>& pafBackward, - float th) -{ - frontierActive = true; - blockFrontier.clear(); - - while (true) { - const auto f = frontierIter(ann); - if (!frontierActive) { return; } - - const auto i = std::get<1>(f); - const auto forward = std::get<2>(f); - const auto j1i = std::get<3>(f); - const auto j2i = std::get<4>(f); - - // For debugging - //printf("grow: %d %s %d %d\n", i, forward ? 
"true" : "false", j1i, j2i); - - float x, y, v; - std::vector directed_paf_field; - std::vector directed_paf_field_reverse; - if (forward) { - x = ann.keypoints[j1i*3 ]; - y = ann.keypoints[j1i*3 + 1]; - v = ann.keypoints[j1i*3 + 2]; - directed_paf_field = pafForward[i]; - directed_paf_field_reverse = pafBackward[i]; - } else { - x = ann.keypoints[j2i*3 ]; - y = ann.keypoints[j2i*3 + 1]; - v = ann.keypoints[j2i*3 + 2]; - directed_paf_field = pafBackward[i]; - directed_paf_field_reverse = pafForward[i]; - } - - const auto t = growConnection(x, y, directed_paf_field); - const auto new_x = std::get<0>(t); - const auto new_y = std::get<1>(t); - auto new_v = std::get<2>(t); - - if (new_v < th) { continue; } - - // reverse match - if (th >= 0.1) { - const auto t1 = growConnection(new_x, new_y, directed_paf_field_reverse); - const auto reverse_x = std::get<0>(t1); - const auto reverse_y = std::get<1>(t1); - const auto reverse_v = std::get<2>(t1); - if (reverse_v < th) { continue; } - if (std::abs(x - reverse_x) + std::abs(y - reverse_y) > 1.0f) { continue; } - } - - new_v = std::sqrt(new_v * v); // geometric mean - - if (forward) { - if (new_v > ann.keypoints[j2i*3 + 2]) { - ann.keypoints[j2i*3 ] = new_x; - ann.keypoints[j2i*3 + 1] = new_y; - ann.keypoints[j2i*3 + 2] = new_v; - } - } else { - if (new_v > ann.keypoints[j1i*3 + 2]) { - ann.keypoints[j1i*3 ] = new_x; - ann.keypoints[j1i*3 + 1] = new_y; - ann.keypoints[j1i*3 + 2] = new_v; + std::vector filtered; + for (auto& ann : annotations) { + for (int k = 0; k < N_PIFPAF_KEYPOINTS; ++k) { + if (ann.keypoints[k * 3 + 2] > 0.0f) { + filtered.push_back(ann); + break; + } } } - } -} - -void OpenPifPafPostprocessor::fillJointScales(Annotation& ann, - const std::vector& scales, - int fieldH, - int fieldW, - float hr_scale) -{ - for (int k = 0; k < numKeypoints; ++k) { - const auto x = ann.keypoints[k*3]; - const auto y = ann.keypoints[k*3 + 1]; - const auto v = ann.keypoints[k*3 + 2]; - if (v == 0) { continue; } - - // i = 
max(0, min(scale_field.shape[1] - 1, int(round(xyv[0] * hr_scale)))) - // j = max(0, min(scale_field.shape[0] - 1, int(round(xyv[1] * hr_scale)))) - const auto i = std::max(0, std::min(fieldW - 1, (int)std::round(x * hr_scale))); - const auto j = std::max(0, std::min(fieldH - 1, (int)std::round(y * hr_scale))); - - // self.joint_scales[xyv_i] = scale_field[j, i] / hr_scale - ann.jointScales[k] = scales[k*pifhr_stride_0 + j*pifhr_stride_1 + i] / hr_scale; - } -} - -std::vector -OpenPifPafPostprocessor::decodeAnnotations(const std::vector& pifhr, - const std::vector& pifhrScales, - const std::vector& pifhrCore, - const std::vector>& pafForward, - const std::vector>& pafBackward) -{ - const auto seeds = pifhrSeeds(pifhrScales, pifhrCore); - - // This is a (17, H_hr, W_hr) tensor. - std::vector occupied(17 * H_hr * W_hr, 0.0f); - - std::vector annotations; - for (auto& seed : seeds) { - const auto v = std::get<0>(seed); - const auto f = std::get<1>(seed); - const auto x = std::get<2>(seed); - const auto y = std::get<3>(seed); - - const auto i = std::min(std::max(0, (int)std::round(x * stride)), W_hr - 1); - const auto j = std::min(std::max(0, (int)std::round(y * stride)), H_hr - 1); - if (occupied[f*H_hr*W_hr + j*W_hr + i] > 0.0f) { continue; } - - Annotation ann(f, x, y, v); - grow(ann, pafForward, pafBackward); - fillJointScales(ann, pifhrScales, H_hr, W_hr, stride); - annotations.push_back(ann); - - for (int i = 0; i < numKeypoints; ++i) { - const auto x = ann.keypoints[i*3]; - const auto y = ann.keypoints[i*3 + 1]; - const auto v = ann.keypoints[i*3 + 2]; - if (v == 0) { continue; } - - const auto width = ann.jointScales[i] * stride; - scalarSquareAddSingle(occupied.data() + i*H_hr*W_hr, H_hr, W_hr, - x * stride, y * stride, width / 2.0f, 1.0f); - } - } - return annotations; -} - -std::vector OpenPifPafPostprocessor::softNMS(std::vector& annotations) { - float maxx = 0.0f; - float maxy = 0.0f; - for (auto& ann : annotations) { - for (int k = 0; k < numKeypoints; 
++k) { - auto x = ann.keypoints[k*3]; - auto y = ann.keypoints[k*3 + 1]; - if (x > maxx) { maxx = x; } - if (y > maxy) { maxy = y; } + return filtered; + + // Note: The original code sorts here on the score (descending), but + // we sort again later on so it's a bit quicker if we skip that here. + } + + void OpenPifPafPostprocessor::initTensors(int tensorWidth, int tensorHeight) + { + H = tensorHeight; + W = tensorWidth; + H_hr = (H - 1) * (int)STRIDE + 1; + W_hr = (W - 1) * (int)STRIDE + 1; + + pif_stride_1 = H * W; + pif_stride_0 = 5 * pif_stride_1; + + pifhr_stride_1 = W_hr; + pifhr_stride_0 = H_hr * pifhr_stride_1; + + const int shape = C * H_hr * W_hr; + targetsCoreOnly = std::vector(shape); + targets = std::vector(shape); + scales = std::vector(shape); + ns = std::vector(shape); + } + + ai_app::Object_detection::Result OpenPifPafPostprocessor::postprocess( + int inputWidth, int inputHeight, + int tensorWidth, int tensorHeight, + const std::vector& pif, + const std::vector& paf) + { + // Allocate the intermediate tensors the first time or when the size changes. + if (W != tensorWidth || H != tensorHeight) { + initTensors(tensorWidth, tensorHeight); + } + + const auto result_tuple = targetIntensities(pif); + const auto& pifhr = std::get<0>(result_tuple); + const auto& pifhrScales = std::get<1>(result_tuple); + const auto& pifhrCore = std::get<2>(result_tuple); + + // (17, 5, H, W) + // pif: [v, x, y, _, s] + const size_t pif_ch = 5, hw_size = H * W; + const size_t pif_shard_size = pif_ch * hw_size; + + // BEGIN: seeds = utils.CifSeeds(cifhr.accumulated).fill(fields, self.cif_metas) + std::vector> seeds{}; + + const float maxx = W_hr - 0.51, maxy = H_hr - 0.51; + for (size_t field_i = 0; field_i < N_PIFPAF_KEYPOINTS; ++field_i) { + // Search qualified entries. 
+ size_t this_field_offset = field_i * pif_shard_size; + for (size_t hw_index = 0; hw_index < hw_size; ++hw_index) { + size_t vindex = hw_index + this_field_offset; + if (pif[vindex] > seedThreshold) { + float c = pif[vindex], x = pif[vindex + hw_size], y = pif[vindex + 2 * hw_size], s = pif[vindex + 4 * hw_size]; + // scalar_values + if (x < -0.49 || y < -0.49 || x > maxx || y > maxy) { + continue; + } + float v = pifhrCore[field_i * W_hr * H_hr + ((size_t)(y * STRIDE + 0.5) * W_hr) + (size_t)(x * STRIDE + 0.5)]; + // scalar_values :: over. + + v = 0.9 * v + 0.1 * c; + // printf("%f %f, %f, %f, %f\n", v, c, x, y, s); + + // pass or not? + if (v > seedThreshold) { + // ok, you pass. -> seeds -> [x, y, v, s] + seeds.emplace_back(v, field_i, x * STRIDE, y * STRIDE, s * STRIDE); + } + } + } } - } - - const auto h = (int)(maxy + 1); - const auto w = (int)(maxx + 1); - std::vector occupied(17 * h * w, 0.0f); - - std::vector sorted(annotations.size()); - std::iota(sorted.begin(), sorted.end(), 0); - std::sort(sorted.begin(), sorted.end(), [annotations] (int const& a, int const& b) { - return annotations[a].score() > annotations[b].score(); - }); - - for (auto a : sorted) { - Annotation& ann = annotations[a]; - for (int k = 0; k < numKeypoints; ++k) { - const auto x = ann.keypoints[k*3 ]; - const auto y = ann.keypoints[k*3 + 1]; - const auto v = ann.keypoints[k*3 + 2]; - if (v == 0) { continue; } - - const auto i = std::min(std::max(0, (int)std::round(x)), w - 1); - const auto j = std::min(std::max(0, (int)std::round(y)), h - 1); - - if (occupied[k*h*w + j*w + i] > 0.0f) { - ann.keypoints[k*3 + 2] = 0.0f; - } else { - scalarSquareAddSingle(occupied.data() + k*h*w, h, w, x, y, ann.jointScales[k], 1.0f); + // std::cout << seeds.size() << "seeds size\n"; + // END: seeds = utils.CifSeeds(cifhr.accumulated).fill(fields, self.cif_metas) + + // BEGIN: caf_scored = utils.CafScored(cifhr.accumulated).fill(fields, self.caf_metas) + // (19, 9, DYNAMICs) + constexpr size_t paf_ch = 
9; + const size_t paf_shard_size = paf_ch * hw_size; + // (19, 9, H, W)... + FBContainer forward{}, backward{}; + for (size_t field_i = 0; field_i < forward.size(); ++field_i) { + constexpr float PAF_SCORE_THRE = 0.2; + constexpr float CIF_FLOOR = 0.1; + // filter! + for (size_t hw_idx = 0; hw_idx < hw_size; ++hw_idx) { + const size_t paf_conf_idx = hw_idx + field_i * paf_shard_size; + const auto conf = paf[paf_conf_idx]; + if (conf > PAF_SCORE_THRE) { + // values in this line... + std::array this_ch{}; + for (size_t chidx = 0; chidx < this_ch.size(); ++chidx) { + this_ch[chidx] = paf[paf_conf_idx + chidx * hw_size]; + if (chidx != 0) + this_ch[chidx] *= STRIDE; + } + + auto backward_pif_ch = bones[field_i][0] - 1; + auto forward_pif_ch = bones[field_i][1] - 1; + // backward pass. + constexpr std::array BACKWARD_IDX{ 0, 3, 4, 1, 2, 6, 5, 8, 7 }; + constexpr std::array FORWARD_IDX{ 0, 1, 2, 3, 4, 5, 6, 7, 8 }; + + // restore... (yet another filtering...) + // cifhr_t = scalar_values(self.cifhr[joint_t], nine[3], nine[4], default=0.0) + // nine[0] = nine[0] * (self.cif_floor + (1.0 - self.cif_floor) * cifhr_t) + const auto pass = [&this_ch, maxx, maxy, this, field_i, &pifhrCore](const auto& idx_mapping, FBContainer& cont, size_t pif_field_idx) { + float x = this_ch[idx_mapping[3]], y = this_ch[idx_mapping[4]]; + if (!(x < -0.49 || y < -0.49 || x > maxx || y > maxy)) { + // std::cout << field_i << "\tXY = \t"<< x << '\t' << y << '\t' << (size_t)(x + 0.5) << '\t' << (size_t)(y + 0.5) << "\t MAX HW: " << W_hr << ' ' << H_hr << std::endl; + float cifhr_t = pifhrCore[pif_field_idx * W_hr * H_hr + ((size_t)(y + 0.5) * W_hr) + (size_t)(x + 0.5)]; + float new_v = this_ch[0] * (CIF_FLOOR + (1 - CIF_FLOOR) * cifhr_t); + if (new_v > PAF_SCORE_THRE) { + // forward pass. + for (size_t fwd_idx = 0; fwd_idx < cont.front().size(); ++fwd_idx) { + // restore! 
+ cont[field_i][fwd_idx].push_back(this_ch[idx_mapping[fwd_idx]]); + } + cont[field_i][0].back() = new_v; + } + } + }; + + pass(BACKWARD_IDX, backward, backward_pif_ch); + pass(FORWARD_IDX, forward, forward_pif_ch); + } } } - } + // for (const auto& f : forward) { + // std::cout << "(" << f.size() << ", " << f.front().size() << "), "; + // } + // std::cout << '\n'; + // for (const auto& f : backward) { + // std::cout << "(" << f.size() << ", " << f.front().size() << "), "; + // } + // std::cout << '\n'; + // END: caf_scored = utils.CafScored(cifhr.accumulated).fill(fields, self.caf_metas) + std::sort(seeds.begin(), seeds.end(), std::greater{}); + + // occupancy map. + // std::cout << C << ' ' << H_hr << ' ' << W_hr << '\n'; + Occupancy occupied(C, H_hr, W_hr); + std::vector annotations; + for (const auto& [v, f, x, y, s] : seeds) { + if (occupied.fuzz_get(f, y, x)) { + continue; + } - std::vector filtered; - for (auto& ann : annotations) { - for (int k = 0; k < numKeypoints; ++k) { - if (ann.keypoints[k*3 + 2] > 0.0f) { - filtered.push_back(ann); - break; + Annotation ann(f, x, y, v); + ann.jointScales[f] = s; + grow(ann, forward, backward); + annotations.push_back(ann); + + for (int i = 0; i < N_PIFPAF_KEYPOINTS; ++i) { + const auto ax = ann.keypoints[i * 3]; + const auto ay = ann.keypoints[i * 3 + 1]; + const auto av = ann.keypoints[i * 3 + 2]; + if (av == 0) { + continue; + } + + const auto width = ann.jointScales[i]; + scalarSquareAddSingle(occupied, i, H_hr, W_hr, ax, ay, width, Occupancy::reduction, Occupancy::min_scale_reduced); // width is sigma... } } - } - return filtered; - - // Note: The original code sorts here on the score (descending), but - // we sort again later on so it's a bit quicker if we skip that here. 
-} -void OpenPifPafPostprocessor::initTensors(int tensorWidth, int tensorHeight) { - H = tensorHeight; - W = tensorWidth; - H_hr = H * (int)stride; - W_hr = W * (int)stride; + // This returns two lists that each contain 19 tensors of shape (7, ?) + // where the second dimension can vary in size (depends on thresholds). + // const auto pt = scorePafTarget(paf, pifhr); + // const auto pafForward = std::get<0>(pt); + // const auto pafBackward = std::get<1>(pt); - paf_stride_2 = H * W; - paf_stride_1 = 4 * paf_stride_2; - paf_stride_0 = 2 * paf_stride_1; - - pif_stride_1 = H * W; - pif_stride_0 = 4 * pif_stride_1; - - pifhr_stride_1 = W_hr; - pifhr_stride_0 = H_hr * pifhr_stride_1; - - indexField = makeIndexField(H, W); - indexField_hr = makeIndexField(H_hr, W_hr); - paf = std::vector(19 * 2 * 4 * H * W); - pif = std::vector(17 * 4 * H * W); - - const int shape = C * H_hr * W_hr; - targetsCoreOnly = std::vector(shape); - targets = std::vector(shape); - scales = std::vector(shape); - ns = std::vector(shape); -} - -ai_app::Object_detection::Result OpenPifPafPostprocessor::postprocess_0_8( - int inputWidth, int inputHeight, - int tensorWidth, int tensorHeight, - const float* pif_c, - const float* pif_r, - const float* pif_s, - const float* paf_c, - const float* paf_r1, - const float* paf_r2, - const float* paf_b1, - const float* paf_b2) -{ - this->inputWidth = inputWidth; - this->inputHeight = inputHeight; - - // Allocate the intermediate tensors the first time or when the size changes. 
- if (W != tensorWidth || H != tensorHeight) { - initTensors(tensorWidth, tensorHeight); - } - - normalizePAF(paf_c, paf_r1, paf_r2, paf_b1, paf_b2); - normalizePIF(pif_c, pif_r, pif_s); - - const auto ti = targetIntensities(pif); - const auto pifhr = std::get<0>(ti); - const auto pifhrScales = std::get<1>(ti); - const auto pifhrCore = std::get<2>(ti); - - /* - // For debugging - for (int c = 0; c < 17; ++c) { - for (int y = 0; y < H_hr; ++y) { - for (int x = 0; x < W_hr; ++x) { - printf("%f, ", pifhrCore[c*136*248 + y*248 + x]); + /* + // For debugging + printf("pafForward:\n"); + for (auto& i : pafForward) { + for (auto j : i) { printf("%f, ", j); } printf("\n"); + } + printf("\npafBackward:\n"); + for (auto i : pafBackward) { + for (auto& j : i) { printf("%f, ", j); } printf("\n"); + } + */ + + // auto annotations = decodeAnnotations(seeds, pifhr, pifhrScales, pifhrCore, pafForward, pafBackward); + + // Scale to input size + // for (auto& ann : annotations) { + // for (int k = 0; k < N_PIFPAF_KEYPOINTS; ++k) { + // ann.keypoints[k*3 ] *= STRIDE; + // ann.keypoints[k*3 + 1] *= STRIDE; + // std::cout << "--> Scaled: " < thresholded; + for (auto& ann : annotations) { + for (int k = 0; k < N_PIFPAF_KEYPOINTS; ++k) { + if (ann.keypoints[k * 3 + 2] < keypointThreshold) { + ann.keypoints[k * 3 + 2] = 0.0f; + } + } + if (ann.score() >= instanceThreshold) { + thresholded.push_back(ann); + } } - } - printf("\n"); - } - */ - - // This returns two lists that each contain 19 tensors of shape (7, ?) - // where the second dimension can vary in size (depends on thresholds). 
- const auto pt = scorePafTarget(paf, pifhr); - const auto pafForward = std::get<0>(pt); - const auto pafBackward = std::get<1>(pt); - /* - // For debugging - printf("pafForward:\n"); - for (auto& i : pafForward) { - for (auto j : i) { printf("%f, ", j); } printf("\n"); - } - printf("\npafBackward:\n"); - for (auto i : pafBackward) { - for (auto& j : i) { printf("%f, ", j); } printf("\n"); - } - */ + std::sort(thresholded.begin(), thresholded.end(), [](const Annotation& a, const Annotation& b) { + return a.score() > b.score(); + }); - auto annotations = decodeAnnotations(pifhr, pifhrScales, pifhrCore, pafForward, pafBackward); + // // Convert to normalized coordinates + // for (auto& ann : thresholded) { + // for (int k = 0; k < N_PIFPAF_KEYPOINTS; ++k) { + // ann.keypoints[k*3 ] /= inputWidth; + // ann.keypoints[k*3 + 1] /= inputHeight; + // } + // } - // Scale to input size - const float output_stride = 8.0f; - for (auto& ann : annotations) { - for (int k = 0; k < numKeypoints; ++k) { - ann.keypoints[k*3 ] *= output_stride; - ann.keypoints[k*3 + 1] *= output_stride; - ann.jointScales[k] *= output_stride; - } + /* + // For debugging + for (auto ann : thresholded) { + printf("Keypoints:\n"); + for (auto k : ann.keypoints) { + printf("%f, ", k); } - - // Non-maximum suppression - if (!annotations.empty()) { - annotations = softNMS(annotations); + printf("\nJoint scales:\n"); + for (auto k : ann.jointScales) { + printf("%f, ", k); } - - // Threshold - std::vector thresholded; - for (auto& ann : annotations) { - for (int k = 0; k < numKeypoints; ++k) { - if (ann.keypoints[k*3 + 2] < keypointThreshold) { - ann.keypoints[k*3 + 2] = 0.0f; + printf("\n"); + } + */ + + ai_app::Object_detection::Result result; + result.success = true; + for (auto& ann : thresholded) { + ai_app::Landmarks landmarks; + landmarks.type = "body_pose_pifpaf"; + + int minx = std::numeric_limits::max(), + miny = std::numeric_limits::max(), + maxx_ = std::numeric_limits::min(), + maxy_ = 
std::numeric_limits::min(); + + for (int k = 0; k < N_PIFPAF_KEYPOINTS; ++k) { + const int x = ann.keypoints[k * 3]; + const int y = ann.keypoints[k * 3 + 1]; + const auto v = ann.keypoints[k * 3 + 2]; + + if (v > 0.0f) { + if (x < minx) { + minx = x; + } + if (x > maxx_) { + maxx_ = x; + } + if (y < miny) { + miny = y; + } + if (y > maxy_) { + maxy_ = y; + } + } + + ai_app::Landmark landmark{}; + landmark.confidence = v; + landmark.position.x = x; + landmark.position.y = y; + landmarks.points.push_back(landmark); } - } - if (ann.score() >= instanceThreshold) { - thresholded.push_back(ann); - } - } - std::sort(thresholded.begin(), thresholded.end(), [] (const Annotation& a, const Annotation& b) { - return a.score() > b.score(); - }); + ai_app::Object_detection::Result::Item item; + item.confidence = ann.score(); + item.class_index = 1; + item.bounding_box.origin.x = minx; + item.bounding_box.origin.y = miny; + item.bounding_box.size.x = maxx_ - minx; + item.bounding_box.size.y = maxy_ - miny; + item.landmarks = landmarks; - // Convert to normalized coordinates - for (auto& ann : thresholded) { - for (int k = 0; k < numKeypoints; ++k) { - ann.keypoints[k*3 ] /= inputWidth; - ann.keypoints[k*3 + 1] /= inputHeight; + result.items.push_back(item); } + return result; } - /* - // For debugging - for (auto ann : thresholded) { - printf("Keypoints:\n"); - for (auto k : ann.keypoints) { - printf("%f, ", k); - } - printf("\nJoint scales:\n"); - for (auto k : ann.jointScales) { - printf("%f, ", k); - } - printf("\n"); - } - */ - - ai_app::Object_detection::Result result; - result.success = true; - for (auto& ann : thresholded) { - ai_app::Landmarks landmarks; - landmarks.type = "body_pose_pifpaf"; - - int minx = std::numeric_limits::max(), - miny = std::numeric_limits::max(), - maxx = -std::numeric_limits::max(), - maxy = -std::numeric_limits::max(); - - for (int k = 0; k < numKeypoints; ++k) { - const int x = ann.keypoints[k*3 ] * 10000; // FIXME: MAGIC NUMBER. 
- const int y = ann.keypoints[k*3 + 1] * 10000; - const auto v = ann.keypoints[k*3 + 2]; - - if (v > 0.0f) { - if (x < minx) { minx = x; } - if (x > maxx) { maxx = x; } - if (y < miny) { miny = y; } - if (y > maxy) { maxy = y; } - } - - ai_app::Landmark landmark; - landmark.confidence = v; - landmark.position.x = x; - landmark.position.y = y; - landmarks.points.push_back(landmark); - } - - ai_app::Object_detection::Result::Item item; - item.confidence = ann.score(); - item.class_index = 1; - item.bounding_box.origin.x = minx; - item.bounding_box.origin.y = miny; - item.bounding_box.size.x = maxx - minx; - item.bounding_box.size.y = maxy - miny; - item.landmarks = landmarks; - - result.items.push_back(item); - } - return result; } - -} \ No newline at end of file +} diff --git a/src/pifpaf_decoder/openpifpaf_postprocessor.hpp b/src/pifpaf_decoder/openpifpaf_postprocessor.hpp index 9124b1f1..6aa28353 100644 --- a/src/pifpaf_decoder/openpifpaf_postprocessor.hpp +++ b/src/pifpaf_decoder/openpifpaf_postprocessor.hpp @@ -1,15 +1,18 @@ #pragma once -#include +#include +#include #include #include -#include #include "object_detection.hpp" -namespace lpdnn::aiapp_impl { +namespace lpdnn { +namespace aiapp_impl { -/** + using FBContainer = std::array, 9>, 19>; + + /** Post-processing logic for OpenPifPaf \note This object caches the big tensors to save on memory allocations. @@ -21,168 +24,101 @@ namespace lpdnn::aiapp_impl { the same time. If you must use multiple threads, give each thread its own instance of this class. */ -class OpenPifPafPostprocessor -{ -public: - OpenPifPafPostprocessor() : H(0), W(0) { } - - /** - Applies post-processing to OpenPifPaf output. - - \param inpWidth Width of the input tensor in pixels. - \param inpHeight Height of the input tensor in pixels. - \param tensorWidth Width of the neural network's PIF and PAF outputs. - \param tensorHeight Height of the neural network's PIF and PAF outputs. 
- */ - ai_app::Object_detection::Result postprocess_0_8( - int inpWidth, int inpHeight, int tensorWidth, int tensorHeight, - const float* pif_c, // 17xHxW - const float* pif_r, // 34xHxW - const float* pif_s, // 17xHxW - const float* paf_c, // 19xHxW - const float* paf_r1, // 38xHxW - const float* paf_r2, // 38xHxW - const float* paf_b1, // 19xHxW - const float* paf_b2 // 19xHxW - ); - -public: - static const int numKeypoints = 17; - static const int numBones = 19; - - // Connections between the different keypoint indices. - // Note: these start at 1, not 0! - static const int bones[19][2]; - -private: - struct Annotation { - // Array of `numKeypoints * 3` elements: - // - element `i*3 + 0` is x-coordinate (normalized) - // - element `i*3 + 1` is y-coordinate (normalized) - // - element `i*3 + 2` is confidence score - std::vector keypoints; - - std::vector jointScales; - - Annotation(int j, float x, float y, float v) : keypoints(numKeypoints * 3), - jointScales(numKeypoints) - { - keypoints[j*3 ] = x; - keypoints[j*3 + 1] = y; - keypoints[j*3 + 2] = v; - } - - /** + class OpenPifPafPostprocessor { + public: + OpenPifPafPostprocessor() + : H(0) + , W(0) + { + } + + public: + static constexpr int N_PIFPAF_KEYPOINTS = 17; + static constexpr int N_PIFPAF_BONES = 19; + + // Connections between the different keypoint indices. + // Note: these start at 1, not 0! 
+ static const int bones[19][2]; + float keypointThreshold; + + ai_app::Object_detection::Result postprocess( + int inputWidth, int inputHeight, + int tensorWidth, int tensorHeight, + const std::vector& pif, + const std::vector& paf); + + private: + struct Annotation { + // Array of `N_PIFPAF_KEYPOINTS * 3` elements: + // - element `i*3 + 0` is x-coordinate (normalized) + // - element `i*3 + 1` is y-coordinate (normalized) + // - element `i*3 + 2` is confidence score + std::vector keypoints; + + std::vector jointScales; + + Annotation(int j, float x, float y, float v) + : keypoints(N_PIFPAF_KEYPOINTS * 3) + , jointScales(N_PIFPAF_KEYPOINTS) + { + keypoints[j * 3] = x; + keypoints[j * 3 + 1] = y; + keypoints[j * 3 + 2] = v; + } + + /** Overall confidence score for the entire skeleton. */ - [[nodiscard]] float score() const { - float maxv = 0.0f; - float vv = 0.0f; - for (int k = 0; k < numKeypoints; ++k) { - auto v = keypoints[k*3 + 2]; - maxv = std::max(maxv, v); - vv += v * v; - } - return 0.1f * maxv + 0.9f * vv / (float)numKeypoints; - } - }; - - // 0: confidence of origin - // 1: connection index - // 2: forward? 
- // 3: joint index 1 (not corrected for forward) - // 4: joint index 2 (not corrected for forward) - typedef std::tuple frontier_t; - typedef std::tuple, std::vector, std::vector> Target_intensity; - typedef std::tuple>, std::vector>> Paf_target; - typedef std::tuple Pifhr_seed; - typedef std::tuple Connection; - -private: - void initTensors(int tensorWidth, int tensorHeight); - - void normalizePAF(const float* intensityFields, - const float* j1Fields, - const float* j2Fields, - const float* j1FieldsLogb, - const float* j2FieldsLogb); - - void normalizePIF(const float* jointIntensityFields, - const float* jointFields, - const float* scaleFields); - - Target_intensity - targetIntensities(const std::vector& pif, - float v_th = 0.1f, - bool coreOnly = false); - - Paf_target - scorePafTarget(const std::vector& pafvec, - const std::vector& pifhr, - float pifhr_floor = 0.01f, - float score_th = 0.1f) const; - - std::vector - pifhrSeeds(const std::vector& pifhrScales, - const std::vector& pifhrCore); - - static std::vector pafCenter(const std::vector& paf_field, - float x, float y, float sigma = 1.0f); - - static Connection - growConnection(float x, float y, const std::vector& paf_field_); - - static std::vector frontier(Annotation& ann); - - frontier_t frontierIter(Annotation& ann); - - void grow(Annotation& ann, - const std::vector>& pafForward, - const std::vector>& pafBackward, - float th = 0.1f); - - void fillJointScales(Annotation& ann, - const std::vector& scales, - int fieldH, - int fieldW, - float hr_scale); - - std::vector - decodeAnnotations(const std::vector& pifhr, - const std::vector& pifhrScales, - const std::vector& pifhrCore, - const std::vector>& pafForward, - const std::vector>& pafBackward); - - std::vector softNMS(std::vector& annotations); - -private: - // Used to normalize the skeleton keypoint coordinates to [0, 1]. - float inputWidth, inputHeight; - - // Tensor dimensions (hr = high-resolution). 
- int H, W, H_hr, W_hr; - - // Strides for tensor dimensions. - size_t paf_stride_2, paf_stride_1, paf_stride_0; - size_t pif_stride_1, pif_stride_0; - size_t pifhr_stride_1, pifhr_stride_0; - - // Temporary tensors. - std::vector indexField; // 2 x H x W - std::vector indexField_hr; // 2 x H x W - std::vector paf; // 19 x 2 x 4 x H x W - std::vector pif; // 17 x 4 x H x W - - // Filled in by targetIntensities(). - std::vector targetsCoreOnly; - std::vector targets; - std::vector scales; - std::vector ns; - - std::set> blockFrontier; - bool frontierActive; -}; + float score() const + { + float maxv = 0.0f; + float vv = 0.0f; + for (int k = 0; k < N_PIFPAF_KEYPOINTS; ++k) { + auto v = keypoints[k * 3 + 2]; + if (v > maxv) { + maxv = v; + } + vv += v * v; + } + return 0.1f * maxv + 0.9f * vv / (float)N_PIFPAF_KEYPOINTS; + } + }; + + typedef std::tuple, std::vector, std::vector> Target_intensity; + + private: + void initTensors(int tensorWidth, int tensorHeight); + + Target_intensity + targetIntensities(const std::vector& pif, + float v_th = 0.1f, + bool coreOnly = false); + + std::tuple + growConnectionBlend(float x, float y, float s, const std::array, 9>& paf_field_); + + // frontier_t frontierIter(Annotation& ann); + + void grow(Annotation& ann, + const FBContainer& pafForward, + const FBContainer& pafBackward); + + std::vector softNMS(std::vector& annotations); + + private: + // Tensor dimensions (hr = high-resolution). + int H, W, H_hr, W_hr; + + // Strides for tensor dimensions. + size_t pif_stride_1, pif_stride_0; + size_t pifhr_stride_1, pifhr_stride_0; + + // Filled in by targetIntensities(). 
+ std::vector targetsCoreOnly; + std::vector targets; + std::vector scales; + std::vector ns; + }; } - +} From e8f5b1daeb91bb6551ca663c296b7fcf264dfa03 Mon Sep 17 00:00:00 2001 From: ganler Date: Sat, 26 Jun 2021 12:08:17 +0800 Subject: [PATCH 4/4] fix: ci --- cmake/hyperpose.fake.cmake | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cmake/hyperpose.fake.cmake b/cmake/hyperpose.fake.cmake index 07ae7c1c..7aa1ce0c 100644 --- a/cmake/hyperpose.fake.cmake +++ b/cmake/hyperpose.fake.cmake @@ -12,7 +12,11 @@ ADD_LIBRARY( src/stream.cpp src/thread_pool.cpp src/pose_proposal.cpp - src/human.cpp) + src/human.cpp + src/pifpaf.cpp + src/pifpaf_decoder/math_helpers.cpp + src/pifpaf_decoder/openpifpaf_postprocessor.cpp +) TARGET_LINK_LIBRARIES( ${POSE_LIB_NAME}