diff --git a/README.md b/README.md
index 2db67570..7806a2d4 100644
--- a/README.md
+++ b/README.md
@@ -80,6 +80,7 @@ We compare the prediction performance of HyperPose with [OpenPose 1.6](https://g
 | OpenPose (TinyVGG)    | 34.7 MB | 384 x 256 | **124.925 FPS** | N/A               |
 | OpenPose (MobileNet)  | 17.9 MB | 432 x 368 | **84.32 FPS**   | 8.5 FPS (TF-Pose) |
 | OpenPose (ResNet18)   | 45.0 MB | 432 x 368 | **62.52 FPS**   | N/A               |
+| OpenPifPaf (ResNet50) | 97.6 MB | 97 x 129  | **178.6 FPS**   | 35.3 FPS          |
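The FPS figures above come from the timing code this patch adds to examples/cli.cpp (see the hunks further down). A minimal self-contained sketch of that measurement follows; the `<double, std::milli>` duration type is an assumption, consistent with the `ms` label and the `1000. * frames / time` formula used in the patch:

```cpp
#include <chrono>
#include <iostream>

int main()
{
    using clk_t = std::chrono::high_resolution_clock;
    const auto beg = clk_t::now();

    std::size_t frames = 0;
    // ... decode a frame, run inference, post-process, ++frames ...

    // Elapsed wall-clock time in milliseconds since `beg`.
    const double elapsed_ms =
        std::chrono::duration<double, std::milli>(clk_t::now() - beg).count();
    std::cout << frames << " images got processed in " << elapsed_ms
              << " ms, FPS = " << 1000. * frames / elapsed_ms << '\n';
}
```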

diff --git a/cmake/hyperpose.cmake b/cmake/hyperpose.cmake index 6ebb96b5..28131965 100644 --- a/cmake/hyperpose.cmake +++ b/cmake/hyperpose.cmake @@ -5,10 +5,15 @@ set(POSE_LIB_NAME hyperpose) INCLUDE(cmake/cuda.cmake) FIND_PACKAGE(OpenCV REQUIRED) +FILE(GLOB PIFPAF_DECODER + src/pifpaf_decoder/*.cpp) + ADD_LIBRARY( ${POSE_LIB_NAME} # SHARED src/logging.cpp src/tensorrt.cpp + src/pifpaf.cpp + ${PIFPAF_DECODER} src/paf.cpp src/data.cpp src/stream.cpp diff --git a/cmake/hyperpose.fake.cmake b/cmake/hyperpose.fake.cmake index 07ae7c1c..7aa1ce0c 100644 --- a/cmake/hyperpose.fake.cmake +++ b/cmake/hyperpose.fake.cmake @@ -12,7 +12,11 @@ ADD_LIBRARY( src/stream.cpp src/thread_pool.cpp src/pose_proposal.cpp - src/human.cpp) + src/human.cpp + src/pifpaf.cpp + src/pifpaf_decoder/math_helpers.cpp + src/pifpaf_decoder/openpifpaf_postprocessor.cpp +) TARGET_LINK_LIBRARIES( ${POSE_LIB_NAME} diff --git a/examples/cli.cpp b/examples/cli.cpp index 80a54aa4..dc4dd9f5 100644 --- a/examples/cli.cpp +++ b/examples/cli.cpp @@ -9,16 +9,17 @@ #define kSTREAM "stream" #define kPAF "paf" #define kPPN "ppn" +#define kPIFPAF "pifpaf" // Model Configuration. DEFINE_string(model, "../data/models/TinyVGG-V1-HW=256x384.uff", "Path to the model."); DEFINE_string( post, kPAF, - "Post-processing method. (`" kPAF "` -> [Part Affine Field] or `" kPPN "` -> [Pose Proposal Network])"); + "Post-processing method. (`" kPAF "` -> [Part Affine Field] or `" kPPN "` -> [Pose Proposal Network]) or `" kPIFPAF "` -> [Pif Paf]"); DEFINE_int32(w, 384, "Width of input image."); DEFINE_int32(h, 256, "Height of input image."); -DEFINE_int32(max_batch_size, 8, "Max batch size for inference engine to execute."); +DEFINE_int32(max_batch_size, 4, "Max batch size for inference engine to execute."); // Execution Mode DEFINE_bool(imshow, true, "Whether to open an imshow window."); @@ -37,18 +38,19 @@ namespace hp = hyperpose; class parser_variant { public: + using var_t = std::variant; template std::vector process(Container&& feature_map_containers) { return std::visit([&feature_map_containers](auto& arg) { return arg.process(feature_map_containers); }, m_parser); } - parser_variant(std::variant v) + parser_variant(var_t v) : m_parser(std::move(v)) { } private: - std::variant m_parser; + var_t m_parser; }; //parser_variant parser{parser}; @@ -142,14 +144,17 @@ int main(int argc, char** argv) }(); cli_log() << "DNN engine is built.\n"; - auto parser = parser_variant{ [&engine]() -> std::variant { + auto parser = parser_variant{ [&engine]() -> parser_variant::var_t { if (FLAGS_post == kPAF) return hp::parser::paf{}; if (FLAGS_post == kPPN) return hp::parser::pose_proposal(engine.input_size()); - cli_log() << "ERROR: Unknown post-processing flag: `" << FLAGS_post << "`. Use `paf` or `ppn` please.\n"; + if (FLAGS_post == kPIFPAF) + return hp::parser::pifpaf(engine.input_size().height, engine.input_size().width); + + cli_log() << "ERROR: Unknown post-processing flag: `" << FLAGS_post << "`. Use `paf`, `ppn` or `pifpaf` please.\n"; std::exit(-1); }() }; @@ -179,6 +184,7 @@ int main(int argc, char** argv) if (FLAGS_runtime == kOPERATOR) { if (images.empty()) { // For CAP. + auto beg = clk_t::now(); auto writer = make_writer(); while (cap.isOpened()) { cv::Mat mat; @@ -222,6 +228,9 @@ int main(int argc, char** argv) break; } } + auto inference_time = std::chrono::duration(clk_t::now() - beg).count(); + std::cout << cap.get(cv::CAP_PROP_FRAME_COUNT) << " images got processed in " << inference_time << " ms, FPS = " + << 1000. 
* cap.get(cv::CAP_PROP_FRAME_COUNT) / inference_time << '\n'; } else { // For Vec. auto beg = clk_t::now(); // * TensorRT Inference. diff --git a/examples/gen_serialized_engine.example.cpp b/examples/gen_serialized_engine.example.cpp index f172a930..3f09dd98 100644 --- a/examples/gen_serialized_engine.example.cpp +++ b/examples/gen_serialized_engine.example.cpp @@ -12,7 +12,7 @@ DEFINE_string(output_name_list, "outputs/conf,outputs/paf", "The output node nam DEFINE_int32(input_height, 256, "Height of input image."); DEFINE_int32(input_width, 384, "Width of input image."); -DEFINE_int32(max_batch_size, 32, "The max batch size for the exported serialized model."); +DEFINE_int32(max_batch_size, 1, "The max batch size for the exported serialized model."); DEFINE_string(output_model, "", "Path to output serialized model."); diff --git a/examples/operator_api_batched_images_pifpaf.example.cpp b/examples/operator_api_batched_images_pifpaf.example.cpp new file mode 100644 index 00000000..5f57fa11 --- /dev/null +++ b/examples/operator_api_batched_images_pifpaf.example.cpp @@ -0,0 +1,77 @@ +#include "utils.hpp" +#include +#include +#include + +// Model flags +DEFINE_string(model_file, "../data/models/openpifpaf-resnet50.onnx", "Path to the model."); + +DEFINE_bool(logging, false, "Print the logging information or not."); +DEFINE_int32(input_height, 640, "Height of input image."); +DEFINE_int32(input_width, 427, "Width of input image."); + +DEFINE_string(input_folder, "../data/media", "Folder of images to inference."); + +int main(int argc, char** argv) +{ + gflags::ParseCommandLineFlags(&argc, &argv, true); + + // * Collect data into batch. + std::vector batch = glob_images(FLAGS_input_folder); + + if (batch.empty()) { + example_log() << "No input images got. Exiting.\n"; + exit(-1); + } + + example_log() << "Batch shape: [" << batch.size() << ", 3, " << FLAGS_input_height << ", " << FLAGS_input_width << "]\n"; + + // * Create TensorRT engine. + namespace hp = hyperpose; + if (FLAGS_logging) + hp::enable_logging(); + + auto engine = [&] { + using namespace hp::dnn; + constexpr std::string_view onnx_suffix = ".onnx"; + constexpr std::string_view uff_suffix = ".uff"; + + if (std::equal(onnx_suffix.crbegin(), onnx_suffix.crend(), FLAGS_model_file.crbegin())) + return tensorrt(onnx{ FLAGS_model_file }, { FLAGS_input_width, FLAGS_input_height }, batch.size()); + + example_log() << "Your model file's suffix is not [.onnx | .uff]. Your model file path: " << FLAGS_model_file; + example_log() << "Trying to be viewed as a serialized TensorRT model."; + + return tensorrt(tensorrt_serialized{ FLAGS_model_file }, { FLAGS_input_width, FLAGS_input_height }, batch.size()); + }(); + + hp::parser::pifpaf parser(engine.input_size().height, engine.input_size().width); + + using clk_t = std::chrono::high_resolution_clock; + auto beg = clk_t::now(); + { + // * TensorRT Inference. + auto feature_map_packets = engine.inference(batch); + for (const auto& packet : feature_map_packets) + for (const auto& feature_map : packet) + example_log() << feature_map << std::endl; + + // * Paf. + std::vector> pose_vectors; + pose_vectors.reserve(feature_map_packets.size()); + for (auto&& packet : feature_map_packets) { + pose_vectors.push_back(parser.process(packet[0], packet[1])); + } + + std::cout << batch.size() << " images got processed. FPS = " + << 1000. 
* batch.size() / std::chrono::duration(clk_t::now() - beg).count() + << '\n'; + + for (size_t i = 0; i < batch.size(); ++i) { + cv::resize(batch[i], batch[i], { FLAGS_input_width, FLAGS_input_height }); + for (auto&& pose : pose_vectors[i]) + hp::draw_human(batch[i], pose); + cv::imwrite("output_" + std::to_string(i) + ".png", batch[i]); + } + } +} \ No newline at end of file diff --git a/include/hyperpose/hyperpose.hpp b/include/hyperpose/hyperpose.hpp index a851c758..b9df8fcb 100644 --- a/include/hyperpose/hyperpose.hpp +++ b/include/hyperpose/hyperpose.hpp @@ -9,6 +9,7 @@ #include "operator/dnn/tensorrt.hpp" #include "operator/parser/paf.hpp" +#include "operator/parser/pifpaf.hpp" #include "operator/parser/proposal_network.hpp" #include "stream/stream.hpp" \ No newline at end of file diff --git a/include/hyperpose/operator/parser/pifpaf.hpp b/include/hyperpose/operator/parser/pifpaf.hpp new file mode 100644 index 00000000..41eb83de --- /dev/null +++ b/include/hyperpose/operator/parser/pifpaf.hpp @@ -0,0 +1,28 @@ +#pragma once + +#include "../../utility/data.hpp" +#include "paf.hpp" + +namespace hyperpose::parser { + +class pifpaf { +public: + inline explicit pifpaf(int h, int w, float thresh = 0.1) + : m_net_h(h) + , m_net_w(w) + , m_keypoint_thresh(thresh){}; + std::vector process(const feature_map_t& pif, const feature_map_t& paf); + template + std::vector process(C&& feature_map_containers) + { + // 1@pif, 2@paf. + assert(feature_map_containers.size() == 2); + return process(feature_map_containers[0], feature_map_containers[1]); + } + +private: + int m_net_w, m_net_h; + float m_keypoint_thresh; +}; + +} // namespace hyperpose \ No newline at end of file diff --git a/src/human.cpp b/src/human.cpp index dfc46893..7473a116 100644 --- a/src/human.cpp +++ b/src/human.cpp @@ -7,7 +7,7 @@ namespace hyperpose { void draw_human(cv::Mat& img, const human_t& human) { float n = 1, s = 0, w = 1, e = 0; - for(const auto& p : human.parts) + for (const auto& p : human.parts) if (p.has_value) { n = std::min(n, p.y); s = std::max(s, p.y); diff --git a/src/pifpaf.cpp b/src/pifpaf.cpp new file mode 100644 index 00000000..4bfc7064 --- /dev/null +++ b/src/pifpaf.cpp @@ -0,0 +1,97 @@ +#include "pifpaf_decoder/openpifpaf_postprocessor.hpp" +#include + +namespace hyperpose::parser { + +// TODO: Name ORDER! +std::vector pifpaf::process(const feature_map_t& paf, const feature_map_t& pif) +{ + // Helpful links (Chinese):: + // https://zhuanlan.zhihu.com/p/93896207 + // https://zhuanlan.zhihu.com/p/68073113 + // pif: [17, 5, h, w] => KEY POINTS; + // 5: [conf, dx, dy, b, scale] + // Example: array([ 0.00527313, 0.13620843, -0.32253477, 0.3263721 , 0.90980804], dtype=float32) + // heat map: f(x, y) = \sum_ij conf * N(x, y|ij) + // paf: [19, 9, h, w] => LIMBS; + // 9: [conf, [x1, y1, x2, y2], [b1, b2], [s1, s2]] + // Example: [ 0.00712654, -0.54057586, 5.4075847 , 3.0354404 , 3.1246614 , 1.0621283 , -3.5857565 , 2.6072054 , 3.8406293 ], + // TODO: OPTIMIZE THIS. 
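+    // Flattened indexing (illustrative; it mirrors the strides used in
+    // openpifpaf_postprocessor.cpp): for keypoint k and cell (y, x),
+    //   confidence -> pif[k*5*h*w + 0*h*w + y*w + x]
+    //   dx, dy     -> same index + 1*h*w and + 2*h*w
+    //   scale      -> same index + 4*h*w
+    // The paf tensor is laid out the same way, with 9 channels per bone.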
+ + lpdnn::aiapp_impl::OpenPifPafPostprocessor pp; + pp.keypointThreshold = m_keypoint_thresh; + size_t h = pif.shape()[pif.shape().size() - 2]; + size_t w = pif.shape().back(); + + std::vector pif_vec{}, paf_vec{}; + + const auto raw_copy = [](const feature_map_t& tensor, std::vector& vec) { + size_t d0 = tensor.shape()[0]; + size_t d1 = tensor.shape()[1]; + size_t h = tensor.shape()[2]; + size_t w = tensor.shape()[3]; + const size_t total_size = d0 * d1 * h * w; + vec.reserve(total_size); + for (size_t i = 0; i < total_size; ++i) { + vec.push_back(tensor.view()[i]); + } + }; + + raw_copy(pif, pif_vec); + raw_copy(paf, paf_vec); + + // TODO: RECOVER THE INP{W, H}; + auto apires = pp.postprocess(m_net_w, m_net_h, w, h, pif_vec, paf_vec); + + std::vector ret{}; + ret.reserve(apires.items.size()); + // OpenPifPaf COCO Topology: https://miro.medium.com/max/366/0*KFrFQVj3OoGAtt6o.png + // HyperPose: Unified Topology + // NOTE: This step is to convert pifpaf topology to hyperpose topology. + + for (auto&& item : apires.items) { + if (item.landmarks.points.empty()) + continue; + human_t man{}; + man.score = item.confidence; + + auto p2p = [this](const auto& src, auto& dst) { + if (src.confidence > 0.) { + dst.score = 1; // src.confidence; FIXME + dst.x = src.position.x / (float)m_net_w; + dst.y = src.position.y / (float)m_net_h; + dst.has_value = true; + } + }; + + auto& from = item.landmarks.points; + auto& to = man.parts; + // OpenPifPaf -> HyperPose + p2p(from[0], to[0]); + // ! to [1] + constexpr std::array from_index = { + 6, 8, 10, 5, 7, 9, + 12, 14, 16, 11, 13, 15, + 2, 1, 4, 3 + }; + + for (size_t i = 0; i < from_index.size(); ++i) { + p2p(from[from_index[i]], to[i + 2]); + } + + if (to[2].has_value && to[5].has_value) { + to[1].x = (to[2].x + to[5].x) / 2; + ; + to[1].y = (to[2].y + to[5].y) / 2; + ; + to[1].has_value = true; + to[1].score = (to[2].score + to[5].score) / 2; + } + + ret.push_back(man); + } + + return ret; +} + +} // namespace hyperpose \ No newline at end of file diff --git a/src/pifpaf_decoder/aiapp.hpp b/src/pifpaf_decoder/aiapp.hpp new file mode 100644 index 00000000..85c75a09 --- /dev/null +++ b/src/pifpaf_decoder/aiapp.hpp @@ -0,0 +1,118 @@ +/// +/// Ai-app base interface and types +/// +/// \copyright 2018 NVISO SA. All rights reserved. +/// \license This project is released under the XXXXXX License. +/// + +#pragma once + +#include +#include +#include +#include + +namespace lpdnn { +namespace ai_app { + + /// Aiapp Blob + /// This could be improved to allow referring to existing data + /// thus avoding unneeded data-copy, for example by using shared_ptr. + struct Blob { + /// Data dimensions. Mandatory if the blob represents a tensor. + std::vector dim; + + /// Data. Mandatory if the blob represents a tensor. + std::vector data; + + /// Optional raw representation. + std::vector raw; + + /// Optional CBOR representation when data is structured. + std::vector cbor; + + /// Optional additional information + /// (eg, description of internal representation: "NCHW,8bits,dp3"). + std::string info; + }; + + /// AI-App interface + class Aiapp { + public: + virtual ~Aiapp() {} + + /// @return the ai-class id for this aiapp + virtual const char* class_id() const = 0; + + /// @return the implementation id for this aiapp + virtual const char* impl_id() const = 0; + + /// Initialization options + /// \param cfg: configuration string, typically in JSON format. 
+ /// \return: true if success + virtual bool init(const std::string& cfg) = 0; + + /// Set runtime options for the specified component + /// \param opt: runtime options, typically in JSON format. + /// \param name: subcomponent name + /// \return: true if success + virtual bool set_options(const std::string& opt, + const std::string& name = "") + = 0; + + /// Introspection methods + /// \{ + + /// \return: names of all direct subcomponents of the specified component + virtual std::vector components( + const std::string& name = "") const = 0; + + /// \return output(s) of the specified component + virtual std::vector output(const std::string& name = "") const = 0; + + /// \return metrics of the specified component and all its subcomponents + virtual std::string metrics(const std::string& name = "") const = 0; + + /// set end-of-execution at the end of the specified component + /// if name is empty any exit-point previously set is removed + virtual bool set_exit_after(const std::string& name = "") = 0; + + /// \} + }; + + /// AiApp standard processing components + /// Each ai-app can contain other sub-components. + /// Each subcomponent can be identified by a pathname, for example: + /// "preprocessing.normalize" + /// "inference.net1.conv23" + struct Component { + /// Standard component names. Their use is not mandatory but + /// allows an ai-app to be supported by existing tools. + static constexpr char const* preprocessing = "preprocessing"; + static constexpr char const* inference = "inference"; + static constexpr char const* postprocessing = "postprocessing"; + + /// Ai-app interface parameters + static constexpr char const* interface = "interface"; + + /// Name separator in a component pathname string. + /// Component names can't contain the separator except possibly for the leafs + static constexpr char separator = '.'; + + /// Concatenate component names in a component pathname + static std::string join(const std::string& path, const std::string& comp) + { + return path + separator + comp; + } + }; + + /// AiApp Metrics + struct Metrics { + /// Standard metrics. All timings are in microseconds. + static constexpr char const* init_time = "init_time"; + static constexpr char const* inference_time = "inference_time"; + static constexpr char const* inference_cpu_time = "inference_cpu_time"; + }; + +} // namespace ai_app +} // namespace lpdnn diff --git a/src/pifpaf_decoder/image_based.hpp b/src/pifpaf_decoder/image_based.hpp new file mode 100644 index 00000000..914e0f7c --- /dev/null +++ b/src/pifpaf_decoder/image_based.hpp @@ -0,0 +1,152 @@ +/// +/// Ai-app interface and types for image-based ai-apps +/// +/// \copyright 2018 NVISO SA. All rights reserved. +/// \license This project is released under the XXXXXX License. +/// + +#pragma once + +#include "aiapp.hpp" + +namespace lpdnn { +namespace ai_app { + + /// 2-dimensional size + struct Dim2d { + int x; + int y; + }; + + /// Rectangle + struct Rect { + Dim2d origin; + Dim2d size; + + bool empty() const { return size.x <= 0 || size.y <= 0; } + }; + + /// Landmarks + struct Landmark { + Dim2d position; + float confidence; /// Negative value if N/A + }; + + struct Landmarks { + /// Landmark specification identifier + std::string type; + /// Landmark points + std::vector points; + }; + + /// Image representation. + /// The data of a RAW image consists of *y scanlines of *x pixels, + /// with each pixel consisting of N interleaved 8-bit components; the first + /// pixel pointed to is top-left-most in the image. 
There is no padding between + /// image scanlines or between pixels, regardless of format. The number of + /// components N is 3 for RGB images, 4 for RGBA, 1 for grayscale. + /// Support for 8bits RGB format is MANDATORY for all image-processing AiApps. + /// An image can be constructed from a std::vector, or a std::string + /// or raw data pointer and size. When passing rvalues vector or strings, the + /// image will take ownership of the data, otherwise will just keep reference. + class Image { + protected: + /// Contains image data if we have ownership of it + std::vector _image_content; + + public: + /// Image format + enum class Format { + raw_grayscale = 1, /// 8bits grayscale + raw_rgb8 = 3, /// 8bits RGB *MANDATORY* + raw_rgba8 = 4, /// 8bits RGBA + + encoded = 256, /// Standard JPEG/BMP/PNG/TIFF format + + custom = 512 /// Custom format. Use attributes field for more details. + }; + + /// Don't take data ownership. + /// img_dim parameter can be omitted in case of encoded images since + /// this information will be extracted from the image content itself. + Image(Format img_format, const std::vector& data, Dim2d img_dim = {}) + : Image(img_format, data.data(), data.size(), img_dim) + { + } + + /// Take data ownership + Image(Format img_format, std::vector&& data, Dim2d img_dim = {}) + : _image_content(std::move(data)) + , format{ img_format } + , dim(img_dim) + , data{ _image_content.data() } + , data_size{ _image_content.size() } + { + } + + /// Don't take data ownership. + Image(Format img_format, const std::string& data, Dim2d img_dim = {}) + : Image(img_format, (uint8_t*)data.c_str(), data.size(), img_dim) + { + } + + /// Take data ownership + Image(Format img_format, std::string&& data, Dim2d img_dim = {}) + : Image(img_format, + std::vector((uint8_t*)data.c_str(), + (uint8_t*)data.c_str() + data.size()), + img_dim) + { + data.clear(); + } + + /// Don't take data ownership + /// img_data_size is mandatory in case of encoded images. + Image(Format img_format, const uint8_t* img_data, size_t img_data_size, + Dim2d img_dim = {}) + : format{ img_format } + , dim(img_dim) + , data{ img_data } + , data_size{ img_data_size } + { + } + + /// Utility factory methods + static Image encoded(const std::vector& data) + { + return Image(Format::encoded, data); + } + + /// Image format + Format format; + + /// Image dimensions (for raw images) + Dim2d dim; + + /// Region of interest inside the image (all if empty) + Rect roi{}; + + /// Custom attributes. + /// This is ai-app specific and allows to specify custom data formats. + std::string attributes; + + /// Pointer to image data (no ownership of the data). + const uint8_t* data; + + /// Size of image data. Mandatory for encoded images. + size_t data_size; + + /// Additional optional information about the image. + /// May be required by some aiapps. 
+ Landmarks landmarks; + }; + + /// Abstract image-based AiApp + class Image_based : virtual public Aiapp { + public: + /// @return supported image formats (ordered by preference) + virtual std::vector image_formats() const = 0; + }; + +} // namespace ai_app +} // namespace lpdnn diff --git a/src/pifpaf_decoder/math_helpers.cpp b/src/pifpaf_decoder/math_helpers.cpp new file mode 100644 index 00000000..f7634da6 --- /dev/null +++ b/src/pifpaf_decoder/math_helpers.cpp @@ -0,0 +1,25 @@ +#include "math_helpers.hpp" + +void vfill(float* x, unsigned long n, float v) +{ + // Slow version + for (unsigned long i = 0; i < n; ++i) { + x[i] = v; + } +} + +void vmul(const float* a, const float* b, float* c, unsigned long n) +{ + // Slow version + for (unsigned long i = 0; i < n; ++i) { + c[i] = a[i] * b[i]; + } +} + +void vsmul(const float* a, float b, float* c, unsigned long n) +{ + // Slow version + for (unsigned long i = 0; i < n; ++i) { + c[i] = a[i] * b; + } +} diff --git a/src/pifpaf_decoder/math_helpers.hpp b/src/pifpaf_decoder/math_helpers.hpp new file mode 100644 index 00000000..15dcb087 --- /dev/null +++ b/src/pifpaf_decoder/math_helpers.hpp @@ -0,0 +1,10 @@ +#pragma once + +// x[i] = v +void vfill(float* x, unsigned long n, float v); + +// c[i] = a[i] * b[i] +void vmul(const float* a, const float* b, float* c, unsigned long n); + +// c[i] = a[i] * b +void vsmul(const float* a, float b, float* c, unsigned long n); diff --git a/src/pifpaf_decoder/object_detection.hpp b/src/pifpaf_decoder/object_detection.hpp new file mode 100644 index 00000000..91c8f3c0 --- /dev/null +++ b/src/pifpaf_decoder/object_detection.hpp @@ -0,0 +1,50 @@ +/// +/// Ai-app interface for object detection +/// +/// \copyright 2018 NVISO SA. All rights reserved. +/// \license This project is released under the XXXXXX License. +/// + +#pragma once + +#include "image_based.hpp" + +namespace lpdnn { +namespace ai_app { + + /// Object detection AiApp + class Object_detection : virtual public Image_based { + public: + struct Result { + struct Item { + float confidence; + int class_index; + Rect bounding_box; + Landmarks landmarks; + }; + + bool success{}; + std::vector items; + }; + + /// Set minimum detectable object size + /// @return true if success + virtual bool set_min_size(Dim2d minSize) = 0; + + /// Set maximum detectable object size + /// @return true if success + virtual bool set_max_size(Dim2d maxSize) = 0; + + /// Perform inference. + virtual Result execute(const Image& input) = 0; + + /// @return Names of classes + virtual std::vector classes() = 0; + + /// @return our aiapp class id + const char* class_id() const override { return ai_class_id; } + static constexpr char const* ai_class_id = "com_bonseyes::object_detection"; + }; + +} // namespace ai_app +} // namespace lpdnn diff --git a/src/pifpaf_decoder/openpifpaf_postprocessor.cpp b/src/pifpaf_decoder/openpifpaf_postprocessor.cpp new file mode 100644 index 00000000..d0a52617 --- /dev/null +++ b/src/pifpaf_decoder/openpifpaf_postprocessor.cpp @@ -0,0 +1,930 @@ +// Heavily modified from openpifpaf/cpp/example. 
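+// Decoding pipeline, roughly: targetIntensities() accumulates the per-keypoint
+// CIF fields into a high-resolution confidence map; postprocess() then collects
+// seed keypoints above seedThreshold, scores the CAF (limb) fields in forward
+// and backward direction, greedily grows a skeleton from each seed with a
+// priority-queue frontier (grow / growConnectionBlend), marks grown joints in an
+// occupancy map to suppress duplicate detections, and finally thresholds, sorts
+// and converts the surviving annotations into Object_detection::Result items.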
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "math_helpers.hpp" +#include "openpifpaf_postprocessor.hpp" + +struct Occupancy { + // self.reduction = reduction + // self.min_scale_reduced = min_scale / reduction + constexpr static float reduction = 2.f; + constexpr static float min_scale_reduced = 4.f / reduction; + size_t d0, d1, d2; // c h w + std::vector occupancy_view; + + Occupancy(size_t d0_, size_t d1_, size_t d2_) + : d0(d0_) + , d1(d1_) + , d2(d2_) + , occupancy_view(d0_ * d1_ * d2_) + { + } + + bool fuzz_get(size_t f, float y, float x) + { + if (f >= d0) + return true; + + // scalar_nonzero_clipped_with_reduction + float xx = std::min((float)d2 - 1, std::max(0.f, x / reduction)); + float yy = std::min((float)d1 - 1, std::max(0.f, y / reduction)); + + return get(f, yy, xx); + } + + bool get(size_t d0_, size_t d1_, size_t d2_) + { + return occupancy_view[(d1 * d2) * d0_ + d2 * d1_ + d2_]; + } + + void set(size_t d0_, size_t d1_, size_t d2_) + { + occupancy_view[(d1 * d2) * d0_ + d2 * d1_ + d2_] = 1; + } +}; + +namespace lpdnn { +namespace aiapp_impl { + + constexpr int OpenPifPafPostprocessor::bones[19][2] = { + { 16, 14 }, + { 14, 12 }, + { 17, 15 }, + { 15, 13 }, + { 12, 13 }, + { 6, 12 }, + { 7, 13 }, + { 6, 7 }, + { 6, 8 }, + { 7, 9 }, + { 8, 10 }, + { 9, 11 }, + { 2, 3 }, + { 1, 2 }, + { 1, 3 }, + { 2, 4 }, + { 3, 5 }, + { 4, 6 }, + { 5, 7 }, + }; + + struct to_point { + int field_id; + bool possitve; + }; + + auto BY_SOURCE_MAP = [] { + // print(self.by_source) + // for i in range(17): + // for (end_i), (caf_i, connect) in self.by_source[i].items(): + // data = f'to_point{{{caf_i}, {"true" if connect else "false"}}}' + // print(f'smap[{i}][{end_i}] = {data};') + std::array>, 17> smap; + smap[0][1] = to_point{ 13, true }; + smap[0][2] = to_point{ 14, true }; + smap[1][2] = to_point{ 12, true }; + smap[1][0] = to_point{ 13, false }; + smap[1][3] = to_point{ 15, true }; + smap[2][1] = to_point{ 12, false }; + smap[2][0] = to_point{ 14, false }; + smap[2][4] = to_point{ 16, true }; + smap[3][1] = to_point{ 15, false }; + smap[3][5] = to_point{ 17, true }; + smap[4][2] = to_point{ 16, false }; + smap[4][6] = to_point{ 18, true }; + smap[5][11] = to_point{ 5, true }; + smap[5][6] = to_point{ 7, true }; + smap[5][7] = to_point{ 8, true }; + smap[5][3] = to_point{ 17, false }; + smap[6][12] = to_point{ 6, true }; + smap[6][5] = to_point{ 7, false }; + smap[6][8] = to_point{ 9, true }; + smap[6][4] = to_point{ 18, false }; + smap[7][5] = to_point{ 8, false }; + smap[7][9] = to_point{ 10, true }; + smap[8][6] = to_point{ 9, false }; + smap[8][10] = to_point{ 11, true }; + smap[9][7] = to_point{ 10, false }; + smap[10][8] = to_point{ 11, false }; + smap[11][13] = to_point{ 1, false }; + smap[11][12] = to_point{ 4, true }; + smap[11][5] = to_point{ 5, false }; + smap[12][14] = to_point{ 3, false }; + smap[12][11] = to_point{ 4, false }; + smap[12][6] = to_point{ 6, false }; + smap[13][15] = to_point{ 0, false }; + smap[13][11] = to_point{ 1, true }; + smap[14][16] = to_point{ 2, false }; + smap[14][12] = to_point{ 3, true }; + smap[15][13] = to_point{ 0, true }; + smap[16][14] = to_point{ 2, true }; + return smap; + }(); + + static const int C = 17; + static const float STRIDE = 8.0f; + static const float seedThreshold = 0.3f; // 0.5 + //static const float keypointThreshold = 0.15f; + static const float instanceThreshold = 0.2f; + + static void 
scalarSquareAddConstant(float* field, + int fieldH, + int fieldW, + const std::vector& x, + const std::vector& y, + const std::vector& width, + const std::vector& v) + { + // minx_np = np.round(x_np - width_np).astype(np.int) + // minx_np = np.clip(minx_np, 0, field.shape[1] - 1) + std::vector minx(x.size()); + for (size_t i = 0; i < x.size(); ++i) { + minx[i] = std::min(fieldW - 1, std::max(0, (int)std::round(x[i] - width[i]))); + } + + // miny_np = np.round(y_np - width_np).astype(np.int) + // miny_np = np.clip(miny_np, 0, field.shape[0] - 1) + std::vector miny(y.size()); + for (size_t i = 0; i < y.size(); ++i) { + miny[i] = std::min(fieldH - 1, std::max(0, (int)std::round(y[i] - width[i]))); + } + + // maxx_np = np.round(x_np + width_np).astype(np.int) + // maxx_np = np.clip(maxx_np + 1, minx_np + 1, field.shape[1]) + std::vector maxx(x.size()); + for (size_t i = 0; i < x.size(); ++i) { + maxx[i] = std::min(fieldW, std::max(minx[i] + 1, (int)std::round(x[i] + width[i]) + 1)); + } + + // maxy_np = np.round(y_np + width_np).astype(np.int) + // maxy_np = np.clip(maxy_np + 1, miny_np + 1, field.shape[0]) + std::vector maxy(y.size()); + for (size_t i = 0; i < y.size(); ++i) { + maxy[i] = std::min(fieldH, std::max(miny[i] + 1, (int)std::round(y[i] + width[i]) + 1)); + } + + // for i in range(minx.shape[0]): + // for xx in range(minx[i], maxx[i]): + // for yy in range(miny[i], maxy[i]): + // field[yy, xx] += v[i] + for (size_t i = 0; i < minx.size(); ++i) { + for (int yy = miny[i]; yy < maxy[i]; ++yy) { + for (int xx = minx[i]; xx < maxx[i]; ++xx) { + field[yy * fieldW + xx] += v[i]; + } + } + } + } + + static void scalarSquareAddGaussWitMax(float* field, + int fieldH, + int fieldW, + const std::vector& x, + const std::vector& y, + const std::vector& sigma_, + const std::vector& v, + float truncate, + float max_val = 1.0f) + { + // // ganler! + // assert(v.size() == x.size() == y.size() == sigma_.size()); + for (size_t i = 0; i < x.size(); ++i) { + float csigma = sigma_[i]; + float truncate_csigma = csigma * truncate; + float cx = x[i]; + float cy = y[i]; + float cv = v[i]; + const auto clip = [](float val, float low, float high) { + return std::max(low, std::min(high, val)); + }; + + // printf("%f, %f, %f, %f, %f\n", cx, cy, csigma, truncate_csigma, max_val); + const int64_t minx = clip(cx - truncate_csigma, 0, fieldW - 1); + const int64_t maxx = clip(cx + truncate_csigma + 1, minx + 1, fieldW); + const int64_t miny = clip(cy - truncate_csigma, 0, fieldH - 1); + const int64_t maxy = clip(cy + truncate_csigma + 1, miny + 1, fieldH); + // std::cout << minx << '\t' << maxx << '\t' << miny << '\t' << maxy << '\n'; + // printf("%lli, %lli, %lli, %lli\n", minx, maxx, miny, maxy); + + for (int64_t xx = minx; xx < maxx; ++xx) { + float deltax2 = (xx - cx) * (xx - cx); + for (int64_t yy = miny; yy < maxy; ++yy) { + float deltay2 = (yy - cy) * (yy - cy); + + if (deltax2 + deltay2 > truncate_csigma * truncate_csigma) { + continue; + } + + const auto approx_exp = [](float x) { + if (x > 2 || x < -2) + return 0.f; + x = 1.f + x / 8; + x *= x; + x *= x; + x *= x; + return x; + }; + float vv = (deltax2 < 0.25 && deltay2 < 0.25) ? 
cv : cv * approx_exp(-0.5 * (deltax2 + deltay2) / (csigma * csigma)); + field[yy * fieldW + xx] += vv; + field[yy * fieldW + xx] = std::min(max_val, field[yy * fieldW + xx]); + } + } + } + } + + static void scalarSquareAddSingle(Occupancy& field, + int field_idx, + int fieldH, + int fieldW, + float x, + float y, + float width, + float reduction = 1.0, + float min_scaled_reduced = 0.0) + { + if (reduction != 1.0) { + x /= reduction; + y /= reduction; + width = std::max(min_scaled_reduced, width / reduction); + } + + // minx = max(0, int(round(x - width))) + // miny = max(0, int(round(y - width))) + auto minx = std::min(fieldW - 1, std::max(0, (int)(x - width))); + auto miny = std::min(fieldH - 1, std::max(0, (int)(y - width))); + + // maxx = max(minx + 1, min(field.shape[1], int(round(x + width)) + 1)) + // maxy = max(miny + 1, min(field.shape[0], int(round(y + width)) + 1)) + auto maxx = std::min(fieldW, std::max(minx + 1, std::min(fieldW, (int)(x + width) + 1))); + auto maxy = std::min(fieldH, std::max(miny + 1, std::min(fieldH, (int)(y + width) + 1))); + + // field[miny:maxy, minx:maxx] += value + for (auto yy = miny; yy < maxy; ++yy) { + for (auto xx = minx; xx < maxx; ++xx) { + field.set(field_idx, yy, xx); + } + } + } + + OpenPifPafPostprocessor::Target_intensity + OpenPifPafPostprocessor::targetIntensities(const std::vector& pif, + float v_th, bool coreOnly) + { + constexpr float PIF_NN = 16.0f; + + const size_t targets_stride_0 = H_hr * W_hr; + const size_t scales_stride_0 = H_hr * W_hr; + const size_t ns_stride_0 = H_hr * W_hr; + + // These tensors need to be emptied out on each frame. + vfill(targetsCoreOnly.data(), targetsCoreOnly.size(), 0.0f); + vfill(targets.data(), targets.size(), 0.0f); + vfill(scales.data(), scales.size(), 0.0f); + vfill(ns.data(), ns.size(), 0.0f); + + std::vector v; + std::vector x; + std::vector y; + std::vector s; + + for (int i = 0; i < C; ++i) { + // Threshold pif[i, ...], which is a (4, h, w) tensor. Copy the values + // that are over the threshold into four vectors: v, x, y, s. Multiply + // x, y, s with the stride. + // + // v, x, y, s = p[:, p[0] > v_th] + // x = x * self.stride + // y = y * self.stride + // s = s * self.stride + v.clear(); + x.clear(); + y.clear(); + s.clear(); + const size_t pifOffset = i * pif_stride_0; + const size_t xOffset = pifOffset + pif_stride_1; + const size_t yOffset = xOffset + pif_stride_1; + const size_t sOffset = yOffset + pif_stride_1 * 2; + for (int j = 0; j < H * W; ++j) { + float p = pif[pifOffset + j]; + if (p > v_th) { + v.push_back(p); + x.push_back(pif[xOffset + j] * STRIDE); + y.push_back(pif[yOffset + j] * STRIDE); + s.push_back(std::max(1., 0.5 * pif[sOffset + j] * STRIDE)); + } + } + + /* + // For debugging + printf("iteration: %d\n", i); + printf("v:\n"); for (auto n : v) printf("%f, ", n); printf("\n"); + printf("x:\n"); for (auto n : x) printf("%f, ", n); printf("\n"); + printf("y:\n"); for (auto n : y) printf("%f, ", n); printf("\n"); + printf("s:\n"); for (auto n : s) printf("%f, ", n); printf("\n"); + */ + + // Create a high-resolution confidence map for this keypoint. + // std::cout << x.size() << '\t'<< y.size() << '\t'<< v.size() << '\t' << s.size() << '\n'; + // v / pif_nn + std::vector v_over_pif_nn(v.size()); + vsmul(v.data(), 1.0f / PIF_NN, v_over_pif_nn.data(), v.size()); + + // The original code computed the "core only" version in a separate step + // but that duplicates a bunch of work, so we do it at the same time. 
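+            // tco points at the i-th keypoint's (H_hr x W_hr) plane of the
+            // core-only map; the Gaussian accumulation caps each cell at max_val.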
+ const auto tco = targetsCoreOnly.data() + i * targets_stride_0; + scalarSquareAddGaussWitMax(tco, H_hr, W_hr, x, y, s, v_over_pif_nn, 1.0f, 1.0f); + + size_t cnt = 0; + for (size_t dd = 0; dd < targets_stride_0; ++dd) { + if (tco[dd] > 0.01) + ++cnt; + } + // std::cout << targets_stride_0 << '\t' << i << '\t'<< cnt << '\t' << tco[0] << '\n'; + + // s * v + std::vector s_times_v(v.size()); + vmul(s.data(), v.data(), s_times_v.data(), v.size()); + + const auto t = targets.data() + i * targets_stride_0; + const auto scale = scales.data() + i * scales_stride_0; + const auto n = ns.data() + i * ns_stride_0; + scalarSquareAddGaussWitMax(t, H_hr, W_hr, x, y, s, v_over_pif_nn, 1.0f); + scalarSquareAddConstant(scale, H_hr, W_hr, x, y, s, s_times_v); + scalarSquareAddConstant(n, H_hr, W_hr, x, y, s, v); + } + + // m = ns > 0 + // scales[m] = scales[m] / ns[m] + for (size_t i = 0; i < scales.size(); ++i) { + const auto d = ns[i]; + if (d > 0) { + scales[i] /= d; + } + } + return Target_intensity{ targets, scales, targetsCoreOnly }; + } + + std::tuple + OpenPifPafPostprocessor::growConnectionBlend(float x, float y, float s, const std::array, 9>& paf_field) + { + // # source value + // paf_field = paf_center(paf_field, xy[0], xy[1], sigma=2.0) + // if paf_field.shape[1] == 0: + // return 0, 0, 0 + const float sigma = 2.0 * s; + const float sigma2 = 0.25 * s * s; + size_t score_1_i = 0, score_2_i = 0; + float score_1 = 0, score_2 = 0; + + const int paf_stride = paf_field.front().size(); + for (int i = 0; i < paf_stride; ++i) { + if ((paf_field[1][i] < x - sigma) || (paf_field[1][i] > x + sigma) || (paf_field[2][i] < y - sigma) || (paf_field[2][i] > y + sigma)) + continue; + float d2 = (paf_field[1][i] - x) * (paf_field[1][i] - x) + (paf_field[2][i] - y) * (paf_field[2][i] - y); + float score = std::exp(-0.5 * d2 / sigma2) * paf_field[0][i]; + if (score >= score_1) { + score_2_i = score_1_i; + score_2 = score_1; + score_1_i = i; + score_1 = score; + } else if (score > score_2) { + score_2_i = i; + score_2 = score; + } + } + + if (score_1 == 0) + return { 0, 0, 0, 0 }; + + auto entry_1 = std::make_tuple(paf_field[3][score_1_i], paf_field[4][score_1_i], paf_field[6][score_1_i], paf_field[8][score_1_i]); + + auto [ex1, ey1, eb1, es1] = entry_1; + if (score_2 < 0.01 || score_2 < 0.5 * score_1) { + return { ex1, ey1, es1, score_1 * 0.5 }; + } + + // blend... + auto entry_2 = std::make_tuple(paf_field[3][score_2_i], paf_field[4][score_2_i], paf_field[6][score_2_i], paf_field[8][score_2_i]); + auto [ex2, ey2, eb2, es2] = entry_2; + + float blend_d2 = (ex1 - ex2) * (ex1 - ex2) + (ey1 - ey2) * (ey1 - ey2); + if (blend_d2 > ((es1 * es1) / 4)) { + return { ex1, ey1, es1, score_1 * 0.5 }; + } + + return { + // xysv + (score_1 * ex1 + score_2 * ex2) / (score_1 + score_2), + (score_1 * ey1 + score_2 * ey2) / (score_1 + score_2), + (score_1 * es1 + score_2 * es2) / (score_1 + score_2), + 0.5 * (score_1 + score_2), + }; + } + + using xysv = std::optional>; + + struct queue_item { // -score, xyv, start_i, end_i + template + queue_item(Args&&... 
args) + : data(std::make_tuple(std::forward(args)...)) + { + } + std::tuple data; + friend bool operator>(const queue_item& l, const queue_item& r) + { + return std::get<0>(l.data) >= std::get<0>(r.data); + } + friend bool operator<(const queue_item& l, const queue_item& r) + { + return std::get<0>(l.data) < std::get<0>(r.data); + } + }; + + void OpenPifPafPostprocessor::grow(Annotation& ann, + const FBContainer& pafForward, + const FBContainer& pafBackward) + { + // frontierActive = true; + // blockFrontier.clear(); + std::set> in_frontier{}; + std::priority_queue, std::greater> frontier; + + const auto add_to_frontier = [&](size_t start_i) { + for (const auto& [end_i, to_p] : BY_SOURCE_MAP[start_i]) { + int caf_i = to_p.field_id; + // std::cout << "----> " << start_i << '\t' << end_i << '\t' << caf_i << '\n'; + if (ann.keypoints[3 * end_i + 2] > 0.0) { + // std::cout << "CONTINUE start_i = " << start_i << '\n'; + continue; + } + // found! + if (in_frontier.cend() != in_frontier.find(std::make_pair(start_i, end_i))) { + // std::cout << "CONTINUE map already got you!\n"; + continue; + } + + float max_possible_score = std::sqrt(ann.keypoints[3 * start_i + 2]); + // std::cout << "put " << start_i << ' ' << end_i << "\tscore = " << max_possible_score << "\n"; + frontier.emplace(-max_possible_score, std::nullopt, start_i, end_i); + in_frontier.emplace(start_i, end_i); + } + }; + + const auto frontier_get = [&]() -> std::optional { + while (!frontier.empty()) { + auto entry = frontier.top(); + frontier.pop(); + + { + auto [_a, _b, start_i, end_i] = entry.data; + // std::cout << "POP " << start_i << ' ' << end_i << " has val = " << std::get<1>(entry.data).has_value() << '\n'; + } + + if (std::get<1>(entry.data).has_value()) { + // std::cout << "RETURN \n"; + return entry; + } + + auto [_a, _b, start_i, end_i] = entry.data; + if (ann.keypoints[end_i * 3 + 2] > 0.0) + continue; + + // connection_value(self, ann, caf_scored, start_i, end_i, *, reverse_match=True): + auto new_xysv = [&](size_t start_i, size_t end_i) -> xysv { + const auto& point = BY_SOURCE_MAP[start_i][end_i]; + int caf_i = point.field_id; + bool is_forward = point.possitve; + const auto& caf_f = is_forward ? pafForward[caf_i] : pafBackward[caf_i]; // [19, 9, N] + const auto& caf_b = is_forward ? 
pafBackward[caf_i] : pafForward[caf_i]; + auto [x, y, v] = std::make_tuple(ann.keypoints[start_i * 3], ann.keypoints[start_i * 3 + 1], ann.keypoints[start_i * 3 + 2]); + float xy_scale_s = std::max(0.f, ann.jointScales[start_i]); + const auto [nx, ny, ns, nv] = growConnectionBlend(x, y, xy_scale_s, caf_f); + // std::cout << "NEW:\t" << nx << '\t' << ny << '\t' << ns << '\t' << nv << '\n'; + + if (nv == 0) + return std::nullopt; + + float keypoint_score = std::sqrt(nv * v); + if (keypoint_score < keypointThreshold) + return std::nullopt; + // Use relative threashold + constexpr float keypoint_threshold_rel = 0.5; + if (keypoint_score < v * keypoint_threshold_rel) + return std::nullopt; + + float xy_scale_t = std::max(0.f, ns); + // if self.reverse_match and reverse_match -> true + const auto [rx, ry, rs, rv] = growConnectionBlend(nx, ny, xy_scale_t, caf_b); + // std::cout << "REVERSE:\t" << rx << '\t' << ry << '\t' << rs << '\t' << rv << '\n'; + if (rs == 0 || std::abs(x - rx) + std::abs(y - ry) > xy_scale_s) + return std::nullopt; + + return std::make_tuple(nx, ny, ns, keypoint_score); + }(start_i, end_i); + + if (std::nullopt == new_xysv) + continue; + + frontier.emplace(-std::get<3>(new_xysv.value()), new_xysv, start_i, end_i); + } + return std::nullopt; + }; + + for (size_t joint_i = 0; joint_i < N_PIFPAF_KEYPOINTS; ++joint_i) { + if (ann.keypoints[3 * joint_i + 2] != 0.0) { + // std::cout << "-----joint_i " << joint_i << "\n"; + add_to_frontier(joint_i); + } + } + + while (true) { + auto entry = frontier_get(); + if (!entry.has_value()) + break; + + auto [_, new_xysv, jsi, jti] = entry.value().data; + + // std::cout << "jsi = " << jsi << ", jti = " << jti << ", ann.data[jti, 2] = " << ann.keypoints[jti * 3 + 2] << '\n'; + if (ann.keypoints[jti * 3 + 2] > 0.0) + continue; + + auto [nx, ny, ns, nv] = new_xysv.value(); + ann.keypoints[jti * 3 + 0] = nx; + ann.keypoints[jti * 3 + 1] = ny; + ann.keypoints[jti * 3 + 2] = nv; + ann.jointScales[jti] = ns; + add_to_frontier(jti); + } + } + + std::vector OpenPifPafPostprocessor::softNMS(std::vector& annotations) + { + float maxx = 0.0f; + float maxy = 0.0f; + for (auto& ann : annotations) { + for (int k = 0; k < N_PIFPAF_KEYPOINTS; ++k) { + auto x = ann.keypoints[k * 3]; + auto y = ann.keypoints[k * 3 + 1]; + if (x > maxx) { + maxx = x; + } + if (y > maxy) { + maxy = y; + } + } + } + + const auto h = (int)(maxy + 1); + const auto w = (int)(maxx + 1); + Occupancy occupied(17, h, w); + + std::vector sorted(annotations.size()); + std::iota(sorted.begin(), sorted.end(), 0); + std::sort(sorted.begin(), sorted.end(), [annotations](int const& a, int const& b) { + return annotations[a].score() > annotations[b].score(); + }); + + for (auto a : sorted) { + Annotation& ann = annotations[a]; + for (int k = 0; k < N_PIFPAF_KEYPOINTS; ++k) { + const auto x = ann.keypoints[k * 3]; + const auto y = ann.keypoints[k * 3 + 1]; + const auto v = ann.keypoints[k * 3 + 2]; + if (v == 0) { + continue; + } + + const auto i = std::min(std::max(0, (int)std::round(x)), w - 1); + const auto j = std::min(std::max(0, (int)std::round(y)), h - 1); + + if (occupied.fuzz_get(k, j, i)) { + ann.keypoints[k * 3 + 2] = 0.0f; + } else { + scalarSquareAddSingle(occupied, k, h, w, x, y, ann.jointScales[k]); + } + } + } + + std::vector filtered; + for (auto& ann : annotations) { + for (int k = 0; k < N_PIFPAF_KEYPOINTS; ++k) { + if (ann.keypoints[k * 3 + 2] > 0.0f) { + filtered.push_back(ann); + break; + } + } + } + return filtered; + + // Note: The original code sorts here on the score 
(descending), but + // we sort again later on so it's a bit quicker if we skip that here. + } + + void OpenPifPafPostprocessor::initTensors(int tensorWidth, int tensorHeight) + { + H = tensorHeight; + W = tensorWidth; + H_hr = (H - 1) * (int)STRIDE + 1; + W_hr = (W - 1) * (int)STRIDE + 1; + + pif_stride_1 = H * W; + pif_stride_0 = 5 * pif_stride_1; + + pifhr_stride_1 = W_hr; + pifhr_stride_0 = H_hr * pifhr_stride_1; + + const int shape = C * H_hr * W_hr; + targetsCoreOnly = std::vector(shape); + targets = std::vector(shape); + scales = std::vector(shape); + ns = std::vector(shape); + } + + ai_app::Object_detection::Result OpenPifPafPostprocessor::postprocess( + int inputWidth, int inputHeight, + int tensorWidth, int tensorHeight, + const std::vector& pif, + const std::vector& paf) + { + // Allocate the intermediate tensors the first time or when the size changes. + if (W != tensorWidth || H != tensorHeight) { + initTensors(tensorWidth, tensorHeight); + } + + const auto result_tuple = targetIntensities(pif); + const auto& pifhr = std::get<0>(result_tuple); + const auto& pifhrScales = std::get<1>(result_tuple); + const auto& pifhrCore = std::get<2>(result_tuple); + + // (17, 5, H, W) + // pif: [v, x, y, _, s] + const size_t pif_ch = 5, hw_size = H * W; + const size_t pif_shard_size = pif_ch * hw_size; + + // BEGIN: seeds = utils.CifSeeds(cifhr.accumulated).fill(fields, self.cif_metas) + std::vector> seeds{}; + + const float maxx = W_hr - 0.51, maxy = H_hr - 0.51; + for (size_t field_i = 0; field_i < N_PIFPAF_KEYPOINTS; ++field_i) { + // Search qualified entries. + size_t this_field_offset = field_i * pif_shard_size; + for (size_t hw_index = 0; hw_index < hw_size; ++hw_index) { + size_t vindex = hw_index + this_field_offset; + if (pif[vindex] > seedThreshold) { + float c = pif[vindex], x = pif[vindex + hw_size], y = pif[vindex + 2 * hw_size], s = pif[vindex + 4 * hw_size]; + // scalar_values + if (x < -0.49 || y < -0.49 || x > maxx || y > maxy) { + continue; + } + float v = pifhrCore[field_i * W_hr * H_hr + ((size_t)(y * STRIDE + 0.5) * W_hr) + (size_t)(x * STRIDE + 0.5)]; + // scalar_values :: over. + + v = 0.9 * v + 0.1 * c; + // printf("%f %f, %f, %f, %f\n", v, c, x, y, s); + + // pass or not? + if (v > seedThreshold) { + // ok, you pass. -> seeds -> [x, y, v, s] + seeds.emplace_back(v, field_i, x * STRIDE, y * STRIDE, s * STRIDE); + } + } + } + } + // std::cout << seeds.size() << "seeds size\n"; + // END: seeds = utils.CifSeeds(cifhr.accumulated).fill(fields, self.cif_metas) + + // BEGIN: caf_scored = utils.CafScored(cifhr.accumulated).fill(fields, self.caf_metas) + // (19, 9, DYNAMICs) + constexpr size_t paf_ch = 9; + const size_t paf_shard_size = paf_ch * hw_size; + // (19, 9, H, W)... + FBContainer forward{}, backward{}; + for (size_t field_i = 0; field_i < forward.size(); ++field_i) { + constexpr float PAF_SCORE_THRE = 0.2; + constexpr float CIF_FLOOR = 0.1; + // filter! + for (size_t hw_idx = 0; hw_idx < hw_size; ++hw_idx) { + const size_t paf_conf_idx = hw_idx + field_i * paf_shard_size; + const auto conf = paf[paf_conf_idx]; + if (conf > PAF_SCORE_THRE) { + // values in this line... + std::array this_ch{}; + for (size_t chidx = 0; chidx < this_ch.size(); ++chidx) { + this_ch[chidx] = paf[paf_conf_idx + chidx * hw_size]; + if (chidx != 0) + this_ch[chidx] *= STRIDE; + } + + auto backward_pif_ch = bones[field_i][0] - 1; + auto forward_pif_ch = bones[field_i][1] - 1; + // backward pass. 
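+                    // The 9 CAF channels are [conf, x1, y1, x2, y2, b1, b2, s1, s2].
+                    // FORWARD_IDX keeps that order; BACKWARD_IDX swaps the two
+                    // endpoints (x1,y1 <-> x2,y2, b1 <-> b2, s1 <-> s2) so the same
+                    // association can also be scored from the opposite joint.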
+ constexpr std::array BACKWARD_IDX{ 0, 3, 4, 1, 2, 6, 5, 8, 7 }; + constexpr std::array FORWARD_IDX{ 0, 1, 2, 3, 4, 5, 6, 7, 8 }; + + // restore... (yet another filtering...) + // cifhr_t = scalar_values(self.cifhr[joint_t], nine[3], nine[4], default=0.0) + // nine[0] = nine[0] * (self.cif_floor + (1.0 - self.cif_floor) * cifhr_t) + const auto pass = [&this_ch, maxx, maxy, this, field_i, &pifhrCore](const auto& idx_mapping, FBContainer& cont, size_t pif_field_idx) { + float x = this_ch[idx_mapping[3]], y = this_ch[idx_mapping[4]]; + if (!(x < -0.49 || y < -0.49 || x > maxx || y > maxy)) { + // std::cout << field_i << "\tXY = \t"<< x << '\t' << y << '\t' << (size_t)(x + 0.5) << '\t' << (size_t)(y + 0.5) << "\t MAX HW: " << W_hr << ' ' << H_hr << std::endl; + float cifhr_t = pifhrCore[pif_field_idx * W_hr * H_hr + ((size_t)(y + 0.5) * W_hr) + (size_t)(x + 0.5)]; + float new_v = this_ch[0] * (CIF_FLOOR + (1 - CIF_FLOOR) * cifhr_t); + if (new_v > PAF_SCORE_THRE) { + // forward pass. + for (size_t fwd_idx = 0; fwd_idx < cont.front().size(); ++fwd_idx) { + // restore! + cont[field_i][fwd_idx].push_back(this_ch[idx_mapping[fwd_idx]]); + } + cont[field_i][0].back() = new_v; + } + } + }; + + pass(BACKWARD_IDX, backward, backward_pif_ch); + pass(FORWARD_IDX, forward, forward_pif_ch); + } + } + } + // for (const auto& f : forward) { + // std::cout << "(" << f.size() << ", " << f.front().size() << "), "; + // } + // std::cout << '\n'; + // for (const auto& f : backward) { + // std::cout << "(" << f.size() << ", " << f.front().size() << "), "; + // } + // std::cout << '\n'; + // END: caf_scored = utils.CafScored(cifhr.accumulated).fill(fields, self.caf_metas) + std::sort(seeds.begin(), seeds.end(), std::greater{}); + + // occupacy map. + // std::cout << C << ' ' << H_hr << ' ' << W_hr << '\n'; + Occupancy occupied(C, H_hr, W_hr); + std::vector annotations; + for (const auto& [v, f, x, y, s] : seeds) { + if (occupied.fuzz_get(f, y, x)) { + continue; + } + + Annotation ann(f, x, y, v); + ann.jointScales[f] = s; + grow(ann, forward, backward); + annotations.push_back(ann); + + for (int i = 0; i < N_PIFPAF_KEYPOINTS; ++i) { + const auto ax = ann.keypoints[i * 3]; + const auto ay = ann.keypoints[i * 3 + 1]; + const auto av = ann.keypoints[i * 3 + 2]; + if (av == 0) { + continue; + } + + const auto width = ann.jointScales[i]; + scalarSquareAddSingle(occupied, i, H_hr, W_hr, ax, ay, width, Occupancy::reduction, Occupancy::min_scale_reduced); // width is sigma... + } + } + + // This returns two lists that each contain 19 tensors of shape (7, ?) + // where the second dimension can vary in size (depends on thresholds). 
+ // const auto pt = scorePafTarget(paf, pifhr); + // const auto pafForward = std::get<0>(pt); + // const auto pafBackward = std::get<1>(pt); + + /* + // For debugging + printf("pafForward:\n"); + for (auto& i : pafForward) { + for (auto j : i) { printf("%f, ", j); } printf("\n"); + } + printf("\npafBackward:\n"); + for (auto i : pafBackward) { + for (auto& j : i) { printf("%f, ", j); } printf("\n"); + } + */ + + // auto annotations = decodeAnnotations(seeds, pifhr, pifhrScales, pifhrCore, pafForward, pafBackward); + + // Scale to input size + // for (auto& ann : annotations) { + // for (int k = 0; k < N_PIFPAF_KEYPOINTS; ++k) { + // ann.keypoints[k*3 ] *= STRIDE; + // ann.keypoints[k*3 + 1] *= STRIDE; + // std::cout << "--> Scaled: " < thresholded; + for (auto& ann : annotations) { + for (int k = 0; k < N_PIFPAF_KEYPOINTS; ++k) { + if (ann.keypoints[k * 3 + 2] < keypointThreshold) { + ann.keypoints[k * 3 + 2] = 0.0f; + } + } + if (ann.score() >= instanceThreshold) { + thresholded.push_back(ann); + } + } + + std::sort(thresholded.begin(), thresholded.end(), [](const Annotation& a, const Annotation& b) { + return a.score() > b.score(); + }); + + // // Convert to normalized coordinates + // for (auto& ann : thresholded) { + // for (int k = 0; k < N_PIFPAF_KEYPOINTS; ++k) { + // ann.keypoints[k*3 ] /= inputWidth; + // ann.keypoints[k*3 + 1] /= inputHeight; + // } + // } + + /* + // For debugging + for (auto ann : thresholded) { + printf("Keypoints:\n"); + for (auto k : ann.keypoints) { + printf("%f, ", k); + } + printf("\nJoint scales:\n"); + for (auto k : ann.jointScales) { + printf("%f, ", k); + } + printf("\n"); + } + */ + + ai_app::Object_detection::Result result; + result.success = true; + for (auto& ann : thresholded) { + ai_app::Landmarks landmarks; + landmarks.type = "body_pose_pifpaf"; + + int minx = std::numeric_limits::max(), + miny = std::numeric_limits::max(), + maxx_ = std::numeric_limits::min(), + maxy_ = std::numeric_limits::min(); + + for (int k = 0; k < N_PIFPAF_KEYPOINTS; ++k) { + const int x = ann.keypoints[k * 3]; + const int y = ann.keypoints[k * 3 + 1]; + const auto v = ann.keypoints[k * 3 + 2]; + + if (v > 0.0f) { + if (x < minx) { + minx = x; + } + if (x > maxx_) { + maxx_ = x; + } + if (y < miny) { + miny = y; + } + if (y > maxy_) { + maxy_ = y; + } + } + + ai_app::Landmark landmark{}; + landmark.confidence = v; + landmark.position.x = x; + landmark.position.y = y; + landmarks.points.push_back(landmark); + } + + ai_app::Object_detection::Result::Item item; + item.confidence = ann.score(); + item.class_index = 1; + item.bounding_box.origin.x = minx; + item.bounding_box.origin.y = miny; + item.bounding_box.size.x = maxx_ - minx; + item.bounding_box.size.y = maxy_ - miny; + item.landmarks = landmarks; + + result.items.push_back(item); + } + return result; + } + +} +} diff --git a/src/pifpaf_decoder/openpifpaf_postprocessor.hpp b/src/pifpaf_decoder/openpifpaf_postprocessor.hpp new file mode 100644 index 00000000..6aa28353 --- /dev/null +++ b/src/pifpaf_decoder/openpifpaf_postprocessor.hpp @@ -0,0 +1,124 @@ +#pragma once + +#include +#include +#include +#include + +#include "object_detection.hpp" + +namespace lpdnn { +namespace aiapp_impl { + + using FBContainer = std::array, 9>, 19>; + + /** + Post-processing logic for OpenPifPaf + + \note This object caches the big tensors to save on memory allocations. + This means it's best to make one instance of this class and keep using it. 
+ For the most efficient results, make sure the input tensors are always the + same width and height. + + \note This code is not threadsafe. Don't call it from multiple threads at + the same time. If you must use multiple threads, give each thread its own + instance of this class. + */ + class OpenPifPafPostprocessor { + public: + OpenPifPafPostprocessor() + : H(0) + , W(0) + { + } + + public: + static constexpr int N_PIFPAF_KEYPOINTS = 17; + static constexpr int N_PIFPAF_BONES = 19; + + // Connections between the different keypoint indices. + // Note: these start at 1, not 0! + static const int bones[19][2]; + float keypointThreshold; + + ai_app::Object_detection::Result postprocess( + int inputWidth, int inputHeight, + int tensorWidth, int tensorHeight, + const std::vector& pif, + const std::vector& paf); + + private: + struct Annotation { + // Array of `N_PIFPAF_KEYPOINTS * 3` elements: + // - element `i*3 + 0` is x-coordinate (normalized) + // - element `i*3 + 1` is y-coordinate (normalized) + // - element `i*3 + 2` is confidence score + std::vector keypoints; + + std::vector jointScales; + + Annotation(int j, float x, float y, float v) + : keypoints(N_PIFPAF_KEYPOINTS * 3) + , jointScales(N_PIFPAF_KEYPOINTS) + { + keypoints[j * 3] = x; + keypoints[j * 3 + 1] = y; + keypoints[j * 3 + 2] = v; + } + + /** + Overall confidence score for the entire skeleton. + */ + float score() const + { + float maxv = 0.0f; + float vv = 0.0f; + for (int k = 0; k < N_PIFPAF_KEYPOINTS; ++k) { + auto v = keypoints[k * 3 + 2]; + if (v > maxv) { + maxv = v; + } + vv += v * v; + } + return 0.1f * maxv + 0.9f * vv / (float)N_PIFPAF_KEYPOINTS; + } + }; + + typedef std::tuple, std::vector, std::vector> Target_intensity; + + private: + void initTensors(int tensorWidth, int tensorHeight); + + Target_intensity + targetIntensities(const std::vector& pif, + float v_th = 0.1f, + bool coreOnly = false); + + std::tuple + growConnectionBlend(float x, float y, float s, const std::array, 9>& paf_field_); + + // frontier_t frontierIter(Annotation& ann); + + void grow(Annotation& ann, + const FBContainer& pafForward, + const FBContainer& pafBackward); + + std::vector softNMS(std::vector& annotations); + + private: + // Tensor dimensions (hr = high-resolution). + int H, W, H_hr, W_hr; + + // Strides for tensor dimensions. + size_t pif_stride_1, pif_stride_0; + size_t pifhr_stride_1, pifhr_stride_0; + + // Filled in by targetIntensities(). + std::vector targetsCoreOnly; + std::vector targets; + std::vector scales; + std::vector ns; + }; + +} +}
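For quick reference, the operator-API example added above reduces to the following usage sketch of the new `hp::parser::pifpaf` parser. The model path and the 427 x 640 input size are the defaults assumed by that example; the input image path is a hypothetical placeholder.

```cpp
#include <hyperpose/hyperpose.hpp>
#include <opencv2/opencv.hpp>
#include <vector>

int main()
{
    namespace hp = hyperpose;

    // Hypothetical input image; any image under data/media works.
    cv::Mat image = cv::imread("../data/media/example.jpg");
    std::vector<cv::Mat> batch{ image };

    // Build a TensorRT engine from the OpenPifPaf ONNX model (W x H = 427 x 640).
    hp::dnn::tensorrt engine(
        hp::dnn::onnx{ "../data/models/openpifpaf-resnet50.onnx" },
        { 427, 640 }, batch.size());

    // The parser only needs the network input size; use one instance per thread,
    // since the underlying decoder is not thread-safe.
    hp::parser::pifpaf parser(engine.input_size().height, engine.input_size().width);

    // Each packet carries the PIF and PAF feature maps for one input image.
    auto packets = engine.inference(batch);
    for (auto&& packet : packets) {
        auto humans = parser.process(packet[0], packet[1]);
        for (auto&& human : humans)
            hp::draw_human(image, human);
    }
    cv::imwrite("pifpaf_output.png", image);
}
```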