Improve and refactor softmax layer (opencv#24466)

* improve and refactor softmax layer * fix building error * compatible region layer * fix axisStep when disable SIMD * fix dynamic array * try to fix error * use nlanes from VTraits * move axisBias to srcOffset * fix bug caused by axisBias * remove macro * replace #ifdef with #if for CV_SIMD
thewoz · May 29, 2024 · acb0dcf · acb0dcf
1 parent ddb3b20
commit acb0dcf
Show file tree

Hide file tree

Showing 6 changed files with 251 additions and 82 deletions.
diff --git a/modules/dnn/perf/perf_layer.cpp b/modules/dnn/perf/perf_layer.cpp
@@ -758,4 +758,55 @@ INSTANTIATE_TEST_CASE_P(/**/, Layer_FullyConnected, Combine(
     dnnBackendsAndTargets()
 ));
 
+typedef TestBaseWithParam<tuple<std::vector<int>, int, tuple<Backend, Target> > > Layer_Softmax;
+// Times Net::forward() of a one-layer Softmax network on a random CV_32F blob.
+PERF_TEST_P_(Layer_Softmax, softmax_3d) {
+    const auto& param = GetParam();
+    const std::vector<int> inputShape = get<0>(param);
+    const int softmaxAxis = get<1>(param);
+    const int backend = get<0>(get<2>(param));
+    const int target = get<1>(get<2>(param));
+
+    Mat input(inputShape, CV_32FC1);
+    randn(input, Scalar(0.f), Scalar(1.f));  // normally distributed input: mean 0, stddev 1
+
+    LayerParams softmaxParams;
+    softmaxParams.type = "Softmax";
+    softmaxParams.name = "testLayer";
+    softmaxParams.set("axis", softmaxAxis);
+    Net net;
+
+    net.addLayerToPrev(softmaxParams.name, softmaxParams.type, softmaxParams);
+    // warm-up run: binds the input and selects backend/target before the timed loop
+    {
+        net.setInput(input);
+        net.setPreferableBackend(backend);
+        net.setPreferableTarget(target);
+        Mat warmupOut = net.forward();
+    }
+
+    TEST_CYCLE() {
+        Mat timedOut = net.forward();
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
+INSTANTIATE_TEST_CASE_P(/**/, Layer_Softmax, Combine(
+    Values(                // input blob shapes (all 3-D)
+            std::vector<int>({16, 50, 50}),
+            std::vector<int>({16, 197, 197}),
+            std::vector<int>({16, 1024, 1024})
+    ),
+    Values(0, 1, 2),  // softmax axis: each of the three dimensions in turn
+    dnnBackendsAndTargets(/* withInferenceEngine= */ false,
+                          /* withHalide= */          false,
+                          /* withCpuOCV= */          true,
+                          /* withVkCom= */           false,
+                          /* withCUDA= */            false,
+                          /* withNgraph= */          false,
+                          /* withWebnn= */           false,
+                          /* withCann= */            false) // every backend flag except withCpuOCV is off
+));
+
 } // namespace
diff --git a/modules/dnn/src/layers/cpu_kernels/softmax.cpp b/modules/dnn/src/layers/cpu_kernels/softmax.cpp
@@ -0,0 +1,157 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// This file is modified from the ficus (https://github.com/vpisarev/ficus/blob/master/lib/NN/OpNN.fx).
+// Here is the original license:
+/*
+    This file is a part of ficus language project.
+    See ficus/LICENSE for the licensing terms
+*/
+
+#include "../../precomp.hpp"
+#include "softmax.hpp"
+
+namespace cv { namespace dnn {
+
+// Softmax along one axis of a continuous CV_32F blob, restricted to a window of that axis.
+// For every (outer, inner) position, the `axisStep` values of `src` at indices
+// [axisBias, axisBias + axisStep) along `axis` are normalized as exp(x - max) / sum(exp(x - max))
+// and stored at the same positions of `dst`; other elements of `dst` are not written.
+// The SIMD path evaluates exp with a clamped polynomial approximation, the scalar path uses expf.
+//   dst      - continuous output with the same size as src (written through a float pointer)
+//   src      - continuous CV_32F input
+//   axis     - axis to normalize along (passed through normalize_axis())
+//   axisBias - first index along `axis` that belongs to the window
+//   axisStep - number of indices along `axis` in the window
+void softmax(Mat &dst, const Mat &src, int axis, int axisBias, int axisStep){
+    CV_Assert(src.type() == CV_32F);
+    CV_Assert(src.isContinuous() && dst.isContinuous());
+    CV_Assert(src.size == dst.size);
+    axis = normalize_axis(axis, src.dims);
+    // the window must lie inside the axis, otherwise the gather/scatter loops run out of bounds
+    CV_Assert(axisBias >= 0 && axisStep >= 0 && axisBias + axisStep <= src.size[axis]);
+
+    size_t outerSize = src.total(0, axis),
+           innerSize = src.total(axis + 1);
+
+    const float *srcPtr = src.ptr<float>();
+    float *dstPtr = dst.ptr<float>();
+
+    size_t outerStep = src.total(axis);
+    size_t cnStep = src.total(axis + 1);
+
+    // multi-threads: one task per (outer, inner) position
+    size_t totalTasks = outerSize * innerSize;
+    double nstripes = (double) totalTasks / 1024.0;
+
+#if CV_SIMD
+    const int nlanes = VTraits<v_float32>::vlanes();
+    // the number of redundant (padding) lanes in the last vector; equals nlanes when no padding is needed
+    size_t redundantDim = nlanes - axisStep % nlanes;
+    // size the scratch buffer as a whole number of SIMD vectors (at least one): rounding to a
+    // fixed multiple of 8 is too small when nlanes > 8 and the padded tail would overrun it
+    size_t channelAxis = (size_t) std::max((axisStep + nlanes - 1) / nlanes, 1) * nlanes;
+#else
+    // make the channel axis to be multiple of 8
+    size_t channelAxis = (axisStep + 7) & -8;
+#endif
+
+    parallel_for_(Range(0, (int) totalTasks), [&](const Range &range) {
+        // per-invocation scratch holding one gathered window
+        AutoBuffer<float> axisBuf_(channelAxis);
+        float *axisBuf = axisBuf_.data();
+
+        for (size_t i = (size_t) range.start; i < (size_t) range.end; i++) {
+            size_t outerDim = i / innerSize;
+            size_t innerDim = i % innerSize;
+            size_t srcOffset = outerDim * outerStep + innerDim;
+            // copy data from src to buf along axis, since the data may not be continuous
+            for (size_t cnDim = 0; cnDim < axisStep; cnDim++)
+                axisBuf[cnDim] = srcPtr[srcOffset + (cnDim + axisBias) * cnStep];
+
+            float s = 0.f;
+#if CV_SIMD
+            // make the value of the redundant dimension to be -FLT_MAX
+            // so that it cannot win the max reduction below
+            if (redundantDim != nlanes) {
+                for (size_t j = axisStep; j < axisStep + redundantDim; j++)
+                    axisBuf[j] = -FLT_MAX;
+            }
+            // calculate the max value along the axis
+            v_float32 vmax = vx_load(axisBuf);
+            for (size_t cnDim = nlanes; cnDim < axisStep; cnDim += nlanes) {
+                v_float32 val = vx_load(axisBuf + cnDim);
+                vmax = v_max(vmax, val);
+            }
+            float maxVal = v_reduce_max(vmax);
+
+            // calculate the exp value along the axis
+            v_float32 vs = vx_setzero_f32();
+            vmax = vx_setall_f32(maxVal);
+            // initialize vexp constant
+            v_float32 _vexp_lo = vx_setall_f32(-88.3762626647949f);
+            v_float32 _vexp_hi = vx_setall_f32(88.3762626647949f);
+            v_float32 _vexp_half = vx_setall_f32(0.5f);
+            v_float32 _vexp_one = vx_setall_f32(1.f);
+            v_float32 _vexp_LOG2EF = vx_setall_f32(1.44269504088896341f);
+            v_float32 _vexp_C1 = vx_setall_f32(-0.693359375f);
+            v_float32 _vexp_C2 = vx_setall_f32(2.12194440e-4f);
+            v_float32 _vexp_p0 = vx_setall_f32(1.9875691500E-4f);
+            v_float32 _vexp_p1 = vx_setall_f32(1.3981999507E-3f);
+            v_float32 _vexp_p2 = vx_setall_f32(8.3334519073E-3f);
+            v_float32 _vexp_p3 = vx_setall_f32(4.1665795894E-2f);
+            v_float32 _vexp_p4 = vx_setall_f32(1.6666665459E-1f);
+            v_float32 _vexp_p5 = vx_setall_f32(5.0000001201E-1f);
+            // initialize temp vectors for vexp
+            v_float32 val, _vexp_, _vexp_x, _vexp_y, _vexp_z;
+            v_int32 _vexp_mm;
+
+            // calculate and sum all data along axis
+            for (size_t cnDim = 0; cnDim < axisStep; cnDim += nlanes) {
+                val = vx_load(axisBuf + cnDim);
+                val = v_sub(val, vmax);
+
+                // compute vexp of val
+                _vexp_x = v_min(val, _vexp_hi);
+                _vexp_x = v_max(_vexp_x, _vexp_lo);
+                _vexp_ = v_fma(_vexp_x, _vexp_LOG2EF, _vexp_half);
+                _vexp_mm = v_floor(_vexp_);
+                _vexp_ = v_cvt_f32(_vexp_mm);
+                _vexp_mm = v_add(_vexp_mm, vx_setall_s32(0x7f));
+                _vexp_mm = v_shl(_vexp_mm, 23);
+                _vexp_x = v_fma(_vexp_, _vexp_C1, _vexp_x);
+                _vexp_x = v_fma(_vexp_, _vexp_C2, _vexp_x);
+                _vexp_z = v_mul(_vexp_x, _vexp_x);
+                _vexp_y = v_fma(_vexp_x, _vexp_p0, _vexp_p1);
+                _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p2);
+                _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p3);
+                _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p4);
+                _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p5);
+                _vexp_y = v_fma(_vexp_y, _vexp_z, _vexp_x);
+                _vexp_y = v_add(_vexp_y, _vexp_one);
+                val = v_mul(_vexp_y, v_reinterpret_as_f32(_vexp_mm));
+
+                vs = v_add(vs, val);
+                v_store(axisBuf + cnDim, val);
+            }
+
+            s = v_reduce_sum(vs);
+            // subtract the value of the redundant dimension: `val` still holds the last
+            // vector, whose trailing lanes are the exp of the -FLT_MAX padding
+            if (redundantDim != nlanes) {
+                // fixed-size stack scratch, so nothing is heap-allocated (or leaked) per task
+                float _val[VTraits<v_float32>::max_nlanes];
+                v_store(_val, val);
+                for (size_t j = nlanes - redundantDim; j < nlanes; j++)
+                    s -= _val[j];
+            }
+#else
+            float maxVal = axisBuf[0];
+            for (size_t cnDim = 1; cnDim < axisStep; cnDim++) {
+                maxVal = std::max(maxVal, axisBuf[cnDim]);
+            }
+            for (size_t j = 0; j < axisStep; j++) {
+                axisBuf[j] = expf(axisBuf[j] - maxVal);
+                s += axisBuf[j];
+            }
+#endif
+            s = 1.f / s;
+
+            // scatter the normalized values to dst, using the same strided layout as the gather
+            for (size_t cnDim = 0; cnDim < axisStep; cnDim++)
+                dstPtr[srcOffset + (cnDim + axisBias) * cnStep] = axisBuf[cnDim] * s;
+        }
+    }, nstripes);
+}
+
+// Softmax over the full extent of `axis`: forwards to the windowed overload with
+// axisBias = 0 and axisStep = src.size[axis].
+void softmax(Mat &dst, const Mat &src, int axis) {
+    // resolve the axis before using it as an index: src.size[] with a raw negative
+    // axis (e.g. -1) would read outside the size array
+    axis = normalize_axis(axis, src.dims);
+    softmax(dst, src, axis, 0, src.size[axis]);
+}
+
+// Log-softmax along `axis`: fills dst with softmax(src), then overwrites dst with
+// its element-wise logarithm.
+void logSoftmax(Mat &dst, const Mat &src, int axis) {
+    softmax(dst, src, axis);
+    cv::log(dst, dst);
+}
+
+}} // cv::dnn
diff --git a/modules/dnn/src/layers/cpu_kernels/softmax.hpp b/modules/dnn/src/layers/cpu_kernels/softmax.hpp
@@ -0,0 +1,28 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// This file is modified from the ficus (https://github.com/vpisarev/ficus/blob/master/lib/NN/OpNN.fx).
+// Here is the original license:
+/*
+    This file is a part of ficus language project.
+    See ficus/LICENSE for the licensing terms
+*/
+
+#ifndef OPENCV_DNN_SOFTMAX_HPP
+#define OPENCV_DNN_SOFTMAX_HPP
+
+#include "opencv2/core/hal/intrin.hpp"
+#include <opencv2/dnn/shape_utils.hpp>
+
+namespace cv { namespace dnn {
+// Softmax over indices [axisBias, axisBias + axisStep) of `axis`; src is asserted continuous CV_32F, dst continuous and of equal size.
+void softmax(Mat &dst, const Mat &src, int axis, int axisBias, int axisStep);
+// Softmax over the full extent of `axis` (axisBias = 0, axisStep = src.size[axis]).
+void softmax(Mat &dst, const Mat &src, int axis);
+// Runs softmax(dst, src, axis), then replaces dst with its element-wise log.
+void logSoftmax(Mat &dst, const Mat &src, int axis);
+
+}} // cv::dnn
+
+#endif // OPENCV_DNN_SOFTMAX_HPP
diff --git a/modules/dnn/src/layers/region_layer.cpp b/modules/dnn/src/layers/region_layer.cpp
@@ -45,6 +45,7 @@
 #include <opencv2/dnn/shape_utils.hpp>
 #include <opencv2/dnn/all_layers.hpp>
 #include "../nms.inl.hpp"
+#include "cpu_kernels/softmax.hpp"
 
 #ifdef HAVE_OPENCL
 #include "opencl_kernels_dnn.hpp"
@@ -280,10 +281,8 @@ class RegionLayerImpl CV_FINAL : public RegionLayer
                 }
 
                 if (useSoftmax) {  // Yolo v2
-                    for (int i = 0; i < batch_size*rows*cols*anchors; ++i) {
-                        int index = cell_size*i;
-                        softmax_activate(srcData + index + 5, classes, 1, dstData + index + 5);
-                    }
+                    Mat _inpBlob = inpBlob.reshape(0, outBlob.dims, outBlob.size);
+                    softmax(outBlob, _inpBlob, -1, 5, classes);
                 }
                 else if (useLogistic) {  // Yolo v3
                     for (int i = 0; i < batch_size*rows*cols*anchors; ++i){

diff --git a/modules/dnn/src/layers/softmax_layer.cpp b/modules/dnn/src/layers/softmax_layer.cpp
@@ -52,6 +52,7 @@
 #include <algorithm>
 #include <stdlib.h>
 #include <opencv2/core/utils/logger.hpp>
+#include "cpu_kernels/softmax.hpp"
 using std::max;
 
 #ifdef HAVE_OPENCL
@@ -225,89 +226,15 @@ class SoftMaxLayerImpl CV_FINAL : public SoftmaxLayer
         std::vector<Mat> inputs, outputs, internals;
         inputs_arr.getMatVector(inputs);
         outputs_arr.getMatVector(outputs);
-        internals_arr.getMatVector(internals);
 
         const Mat &src = inputs[0];
         Mat &dst = outputs[0];
-
         int axis = normalize_axis(axisRaw, src.dims);
-        size_t outerSize = src.total(0, axis), channels = src.size[axis],
-                innerSize = src.total(axis + 1);
-
-        CV_Assert(src.type() == CV_32F);
-        CV_Assert(src.isContinuous() && dst.isContinuous());
-
-        const float *srcPtr = src.ptr<float>();
-        float *dstPtr = dst.ptr<float>();
-        float *bufPtr = internals[0].ptr<float>();
-
-        size_t outerStep = src.total(axis);
-        size_t cnStep = src.total(axis + 1);
-
-        //compute max along axis
-        for (size_t outerDim = 0; outerDim < outerSize; outerDim++)
-        {
-            size_t srcOffset = outerDim * outerStep;
-            size_t bufOffset = outerDim * cnStep;
-
-            memcpy(bufPtr + bufOffset, srcPtr + srcOffset, innerSize * sizeof(float));
-
-            for (size_t cnDim = 1; cnDim < channels; cnDim++)
-            {
-                for (size_t i = 0; i < innerSize; i++)
-                    bufPtr[bufOffset + i] = std::max(bufPtr[bufOffset + i], srcPtr[srcOffset + cnDim * cnStep + i]);
-            }
-        }
-
-        //subtract max
-        for (size_t outerDim = 0; outerDim < outerSize; outerDim++)
-        {
-            size_t srcOffset = outerDim * outerStep;
-            size_t bufOffset = outerDim * cnStep;
-
-            for (size_t cnDim = 0; cnDim < channels; cnDim++)
-            {
-                const int offset = srcOffset + cnDim * cnStep;
-                for (size_t i = 0; i < innerSize; i++)
-                    dstPtr[offset + i] = srcPtr[offset + i] - bufPtr[bufOffset + i];
-            }
-        }
-
-        cv::exp(dst, dst);
-
-        for (size_t outerDim = 0; outerDim < outerSize; outerDim++)
-        {
-            size_t srcOffset = outerDim * outerStep;
-            size_t bufOffset = outerDim * cnStep;
-
-            //sum exp along axis
-            for (size_t i = 0; i < innerSize; i++)
-                bufPtr[bufOffset + i] = 0.f;
 
-            for (size_t cnDim = 0; cnDim < channels; cnDim++)
-            {
-                const int offset = srcOffset + cnDim * cnStep;
-                for (size_t i = 0; i < innerSize; i++)
-                    bufPtr[bufOffset + i] += dstPtr[offset + i];
-            }
-
-            //divide by computed sum
-            for (size_t cnDim = 0; cnDim < channels; cnDim++)
-            {
-                const int offset = srcOffset + cnDim * cnStep;
-                for (size_t i = 0; i < innerSize; i++)
-                    dstPtr[offset + i] /= bufPtr[bufOffset + i];
-            }
-            if (logSoftMax)
-            {
-                for (size_t cnDim = 0; cnDim < channels; cnDim++)
-                {
-                    const int offset = srcOffset + cnDim * cnStep;
-                    for (size_t i = 0; i < innerSize; i++)
-                        dstPtr[offset + i] = log(dstPtr[offset + i]);
-                }
-            }
-        }
+        if(logSoftMax)
+            logSoftmax(dst, src, axis);
+        else
+            softmax(dst, src, axis);
     }
 
 #ifdef HAVE_CUDA

diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp
@@ -2788,6 +2788,13 @@ void ONNXImporter::parseUpsample(LayerParams& layerParams, const opencv_onnx::No
 void ONNXImporter::parseSoftMax(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
 {
     const std::string& layer_type = node_proto.op_type();
+    int axis;
+    if (layerParams.has("opset") && layerParams.get<int>("opset") > 11) {
+        axis = layerParams.get<int>("axis", -1);
+    } else {
+        axis = layerParams.get<int>("axis", 1);
+    }
+    layerParams.set<int>("axis", axis);
     layerParams.type = "Softmax";
     layerParams.set("log_softmax", layer_type == "LogSoftmax");
     addLayer(layerParams, node_proto);