Merge pull request #1163 from xtensor-stack/bug/1132

JohanMabille · web-flow · commit eb3bacbe1012 · 2025-08-29T17:46:48.000+02:00
generalization of reduce_mul implementation
diff --git a/docs/source/api/reducer_index.rst b/docs/source/api/reducer_index.rst
@@ -38,6 +38,8 @@ Reduction operators
 +---------------------------------------+----------------------------------------------------+
 | :cpp:func:`reduce_min`                | min of the batch elements                          |
 +---------------------------------------+----------------------------------------------------+
+| :cpp:func:`reduce_mul`                | product of the batch elements                      |
++---------------------------------------+----------------------------------------------------+
 | :cpp:func:`haddp`                     | horizontal sum across batches                      |
 +---------------------------------------+----------------------------------------------------+
 
diff --git a/include/xsimd/arch/common/xsimd_common_arithmetic.hpp b/include/xsimd/arch/common/xsimd_common_arithmetic.hpp
@@ -139,20 +139,6 @@ namespace xsimd
             return fma(x, y, select(mask, neg(z), z));
         }
 
-        // hadd
-        template <class A, class T, class /*=typename std::enable_if<std::is_integral<T>::value, void>::type*/>
-        XSIMD_INLINE T hadd(batch<T, A> const& self, requires_arch<common>) noexcept
-        {
-            alignas(A::alignment()) T buffer[batch<T, A>::size];
-            self.store_aligned(buffer);
-            T res = 0;
-            for (T val : buffer)
-            {
-                res += val;
-            }
-            return res;
-        }
-
         // incr
         template <class A, class T>
         XSIMD_INLINE batch<T, A> incr(batch<T, A> const& self, requires_arch<common>) noexcept
diff --git a/include/xsimd/arch/common/xsimd_common_details.hpp b/include/xsimd/arch/common/xsimd_common_details.hpp
@@ -77,6 +77,8 @@ namespace xsimd
     template <class T, class A>
     XSIMD_INLINE T reduce_add(batch<T, A> const&) noexcept;
     template <class T, class A>
+    XSIMD_INLINE T reduce_mul(batch<T, A> const&) noexcept;
+    template <class T, class A>
     XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const&, batch<T, A> const&, batch<T, A> const&) noexcept;
     template <class T, class A>
     XSIMD_INLINE batch<std::complex<T>, A> select(batch_bool<T, A> const&, batch<std::complex<T>, A> const&, batch<std::complex<T>, A> const&) noexcept;
diff --git a/include/xsimd/arch/common/xsimd_common_math.hpp b/include/xsimd/arch/common/xsimd_common_math.hpp
@@ -2103,6 +2103,19 @@ namespace xsimd
             return { reduce_add(self.real()), reduce_add(self.imag()) };
         }
 
+        template <class A, class T, class /*=typename std::enable_if<std::is_scalar<T>::value, void>::type*/>
+        XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<common>) noexcept
+        { // Generic scalar fallback: store the lanes to a local aligned buffer and sum them one by one.
+            alignas(A::alignment()) T buffer[batch<T, A>::size];
+            self.store_aligned(buffer);
+            T res = 0; // additive identity
+            for (T val : buffer)
+            {
+                res += val; // accumulates in lane order, buffer[0] first
+            }
+            return res;
+        }
+
         namespace detail
         {
             template <class T, T N>
@@ -2147,6 +2160,34 @@ namespace xsimd
                                   self, std::integral_constant<unsigned, batch<T, A>::size>());
         }
 
+        // reduce_mul: product of all lanes of a complex batch (generic fallback)
+        template <class A, class T>
+        XSIMD_INLINE std::complex<T> reduce_mul(batch<std::complex<T>, A> const& self, requires_arch<common>) noexcept
+        {
+            // FIXME: could do better (currently a scalar loop over the stored lanes)
+            alignas(A::alignment()) std::complex<T> buffer[batch<std::complex<T>, A>::size];
+            self.store_aligned(buffer);
+            std::complex<T> res = 1; // multiplicative identity: real part 1, imaginary part 0
+            for (auto val : buffer)
+            {
+                res *= val; // complex multiplication, applied in lane order
+            }
+            return res;
+        }
+
+        template <class A, class T, class /*=typename std::enable_if<std::is_scalar<T>::value, void>::type*/>
+        XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<common>) noexcept
+        { // Generic scalar fallback: store the lanes to a local aligned buffer and multiply them one by one.
+            alignas(A::alignment()) T buffer[batch<T, A>::size];
+            self.store_aligned(buffer);
+            T res = 1; // multiplicative identity
+            for (T val : buffer)
+            {
+                res *= val; // accumulates in lane order, buffer[0] first
+            }
+            return res;
+        }
+
         // remainder
         template <class A>
         XSIMD_INLINE batch<float, A> remainder(batch<float, A> const& self, batch<float, A> const& other, requires_arch<common>) noexcept
diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp
@@ -1046,7 +1046,7 @@ namespace xsimd
         }
 
         // reduce_add
-        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value || std::is_same<T, float>::value || std::is_same<T, double>::value, void>::type>
+        template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
         XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<avx>) noexcept
         {
             typename batch<T, sse4_2>::register_type low, high;
@@ -1077,6 +1077,16 @@ namespace xsimd
             return reduce_min(batch<T, sse4_2>(low));
         }
 
+        // reduce_mul: product of all lanes, enabled for any scalar element type
+        template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
+        XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<avx>) noexcept
+        {
+            typename batch<T, sse4_2>::register_type low, high;
+            detail::split_avx(self, low, high); // NOTE(review): presumably yields the lower/upper halves of the register; helper is defined outside this hunk
+            batch<T, sse4_2> blow(low), bhigh(high);
+            return reduce_mul(blow * bhigh); // one lane-wise multiply of the halves, then reduce the half-width sse4_2 batch
+        }
+
         // rsqrt
         template <class A>
         XSIMD_INLINE batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<avx>) noexcept
@@ -1911,4 +1921,4 @@ namespace xsimd
     }
 }
 
-#endif
+#endif
diff --git a/include/xsimd/arch/xsimd_avx512dq.hpp b/include/xsimd/arch/xsimd_avx512dq.hpp
@@ -188,6 +188,16 @@ namespace xsimd
             return reduce_add(batch<float, avx2>(res1), avx2 {});
         }
 
+        // reduce_mul: product of the 16 float lanes, computed by folding the two 256-bit halves
+        template <class A>
+        XSIMD_INLINE float reduce_mul(batch<float, A> const& rhs, requires_arch<avx512dq>) noexcept
+        {
+            __m256 tmp1 = _mm512_extractf32x8_ps(rhs, 1); // upper 8 floats
+            __m256 tmp2 = _mm512_extractf32x8_ps(rhs, 0); // lower 8 floats
+            __m256 res1 = _mm256_mul_ps(tmp1, tmp2); // lane i = rhs[i + 8] * rhs[i]
+            return reduce_mul(batch<float, avx2>(res1), avx2 {}); // finish with the avx2 kernel on the 8 partial products
+        }
+
         // swizzle constant mask
         template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7,
                   uint32_t V8, uint32_t V9, uint32_t V10, uint32_t V11, uint32_t V12, uint32_t V13, uint32_t V14, uint32_t V15>
diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp
@@ -1558,6 +1558,37 @@ namespace xsimd
             return reduce_min(batch<T, avx2>(low));
         }
 
+        // reduce_mul: product of all float lanes, delegated to the compiler-provided reduction intrinsic
+        template <class A>
+        XSIMD_INLINE float reduce_mul(batch<float, A> const& rhs, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_reduce_mul_ps(rhs); // NOTE(review): the intrinsic's internal multiplication order is not visible here
+        }
+        template <class A>
+        XSIMD_INLINE double reduce_mul(batch<double, A> const& rhs, requires_arch<avx512f>) noexcept
+        { // Product of all double lanes, delegated to the compiler-provided reduction intrinsic.
+            return _mm512_reduce_mul_pd(rhs);
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<avx512f>) noexcept
+        { // Integer product of all lanes; the strategy is chosen at compile time from the element width.
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm512_reduce_mul_epi32(self); // 32-bit lanes: reduction intrinsic
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm512_reduce_mul_epi64(self); // 64-bit lanes: reduction intrinsic
+            }
+            else
+            {
+                // Remaining element widths: no reduction intrinsic is used here, so reduce each half with the avx2 kernel.
+                __m256i low, high;
+                detail::split_avx512(self, low, high); // NOTE(review): presumably the lower/upper 256-bit halves; helper is defined outside this hunk
+                batch<T, avx2> blow(low), bhigh(high);
+                return reduce_mul(blow, avx2 {}) * reduce_mul(bhigh, avx2 {}); // scalar multiply of the two partial products; the result is converted back to T on return
+            }
+        }
+
         // rsqrt
         template <class A>
         XSIMD_INLINE batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<avx512f>) noexcept
diff --git a/include/xsimd/arch/xsimd_common_fwd.hpp b/include/xsimd/arch/xsimd_common_fwd.hpp
@@ -36,8 +36,10 @@ namespace xsimd
         XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<common>) noexcept;
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
         XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<common>) noexcept;
-        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
-        XSIMD_INLINE T hadd(batch<T, A> const& self, requires_arch<common>) noexcept;
+        template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
+        XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<common>) noexcept;
+        template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
+        XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<common>) noexcept;
         // Forward declarations for pack-level helpers
         namespace detail
         {
diff --git a/include/xsimd/arch/xsimd_emulated.hpp b/include/xsimd/arch/xsimd_emulated.hpp
@@ -601,6 +601,16 @@ namespace xsimd
                                    { return xsimd::min(x, y); });
         }
 
+        // reduce_mul: product of all lanes for the emulated architecture
+        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+        XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
+        {
+            constexpr size_t size = batch<T, A>::size;
+            std::array<T, size> buffer;
+            self.store_unaligned(buffer.data());
+            return std::accumulate(buffer.begin() + 1, buffer.end(), *buffer.begin(), std::multiplies<T>()); // seeded with lane 0, so no identity constant of type T is needed
+        }
+
         // rsqrt
         template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
         XSIMD_INLINE batch<T, A> rsqrt(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
diff --git a/include/xsimd/arch/xsimd_neon.hpp b/include/xsimd/arch/xsimd_neon.hpp
@@ -1705,14 +1705,21 @@ namespace xsimd
          * reduce_max *
          **************/
 
-        // Using common implementation because ARM doe snot provide intrinsics
+        // Using common implementation because ARM does not provide intrinsics
         // for this operation
 
         /**************
          * reduce_min *
          **************/
 
-        // Using common implementation because ARM doe snot provide intrinsics
+        // Using common implementation because ARM does not provide intrinsics
+        // for this operation
+
+        /**************
+         * reduce_mul *
+         **************/
+
+        // Using common implementation because ARM does not provide intrinsics
         // for this operation
 
         /**********
diff --git a/include/xsimd/arch/xsimd_sse2.hpp b/include/xsimd/arch/xsimd_sse2.hpp
@@ -1290,7 +1290,7 @@ namespace xsimd
             }
             else
             {
-                return hadd(self, common {});
+                return reduce_add(self, common {});
             }
         }
 
@@ -1344,6 +1344,52 @@ namespace xsimd
             return first(acc3, A {});
         }
 
+        // reduce_mul: product of the four float lanes using two multiplies
+        template <class A>
+        XSIMD_INLINE float reduce_mul(batch<float, A> const& self, requires_arch<sse2>) noexcept
+        {
+            __m128 tmp0 = _mm_mul_ps(self, _mm_movehl_ps(self, self)); // lanes 0,1 = v0 * v2, v1 * v3 (upper lanes unused)
+            __m128 tmp1 = _mm_mul_ss(tmp0, _mm_shuffle_ps(tmp0, tmp0, 1)); // shuffle brings lane 1 to lane 0; lane 0 = (v0 * v2) * (v1 * v3)
+            return _mm_cvtss_f32(tmp1); // extract lane 0
+        }
+
+        template <class A>
+        XSIMD_INLINE double reduce_mul(batch<double, A> const& self, requires_arch<sse2>) noexcept
+        { // Product of the two double lanes.
+            return _mm_cvtsd_f64(_mm_mul_sd(self, _mm_unpackhi_pd(self, self))); // unpackhi broadcasts v1; mul_sd gives lane 0 = v0 * v1
+        }
+
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<sse2>) noexcept
+        { // Integer product of all lanes; the strategy is chosen at compile time from the element width.
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                batch<T, A> tmp1 = _mm_shuffle_epi32(self, _MM_SHUFFLE(0, 1, 2, 3)); // reversed lanes: v3, v2, v1, v0
+                tmp1 = tmp1 * self; // v3 * v0, v2 * v1, v1 * v2, v0 * v3
+                batch<T, A> tmp2 = _mm_unpackhi_epi32(tmp1, tmp1); // v1 * v2, v1 * v2, v0 * v3, v0 * v3
+                tmp2 = tmp2 * tmp1; // lane 0 = (v1 * v2) * (v3 * v0)
+                return _mm_cvtsi128_si32(tmp2); // extract the low 32 bits
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                batch<T, A> tmp1 = _mm_unpackhi_epi64(self, self); // v1, v1
+                auto tmp2 = tmp1 * self; // lane 0 = v1 * v0
+#if defined(__x86_64__)
+                return _mm_cvtsi128_si64(tmp2); // extract the low 64 bits directly
+#else
+                __m128i m;
+                _mm_storel_epi64(&m, tmp2); // store the low 64 bits of tmp2 into m
+                int64_t i;
+                std::memcpy(&i, &m, sizeof(i)); // copy those 8 bytes out as an integer
+                return i;
+#endif
+            }
+            else
+            {
+                return reduce_mul(self, common {}); // remaining element widths use the generic buffer-and-loop kernel
+            }
+        }
+
         // rsqrt
         template <class A>
         XSIMD_INLINE batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<sse2>) noexcept
diff --git a/include/xsimd/arch/xsimd_sse3.hpp b/include/xsimd/arch/xsimd_sse3.hpp
@@ -51,6 +51,15 @@ namespace xsimd
             return _mm_cvtss_f32(tmp1);
         }
 
+        // reduce_mul: product of the four float lanes, using the SSE3 odd-lane duplicate for the second step
+        template <class A>
+        XSIMD_INLINE float reduce_mul(batch<float, A> const& self, requires_arch<sse3>) noexcept
+        {
+            __m128 tmp1 = _mm_mul_ps(self, _mm_movehl_ps(self, self)); // lanes 0,1 = v0 * v2, v1 * v3 (upper lanes unused)
+            __m128 tmp2 = _mm_mul_ps(tmp1, _mm_movehdup_ps(tmp1)); // movehdup copies lane 1 into lane 0; lane 0 = (v0 * v2) * (v1 * v3)
+            return _mm_cvtss_f32(tmp2); // extract lane 0
+        }
+
     }
 
 }
diff --git a/include/xsimd/arch/xsimd_vsx.hpp b/include/xsimd/arch/xsimd_vsx.hpp
@@ -559,7 +559,49 @@ namespace xsimd
         template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
         XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<vsx>) noexcept
         {
-            return hadd(self, common {});
+            return reduce_add(self, common {});
+        }
+
+        // reduce_mul: product of the four lanes, computed with two vector multiplies
+        template <class A>
+        XSIMD_INLINE signed reduce_mul(batch<signed, A> const& self, requires_arch<vsx>) noexcept
+        {
+            auto tmp0 = vec_reve(self.data); // v3, v2, v1, v0
+            auto tmp1 = vec_mul(self.data, tmp0); // v0 * v3, v1 * v2, v2 * v1, v3 * v0
+            auto tmp2 = vec_mergel(tmp1, tmp1); // v2 * v1, v2 * v1, v3 * v0, v3 * v0
+            auto tmp3 = vec_mul(tmp1, tmp2); // lane 0 = (v0 * v3) * (v2 * v1)
+            return vec_extract(tmp3, 0);
+        }
+        template <class A>
+        XSIMD_INLINE unsigned reduce_mul(batch<unsigned, A> const& self, requires_arch<vsx>) noexcept
+        { // Product of the four unsigned lanes, computed with two vector multiplies.
+            auto tmp0 = vec_reve(self.data); // v3, v2, v1, v0
+            auto tmp1 = vec_mul(self.data, tmp0); // v0 * v3, v1 * v2, v2 * v1, v3 * v0
+            auto tmp2 = vec_mergel(tmp1, tmp1); // v2 * v1, v2 * v1, v3 * v0, v3 * v0
+            auto tmp3 = vec_mul(tmp1, tmp2); // lane 0 = (v0 * v3) * (v2 * v1)
+            return vec_extract(tmp3, 0);
+        }
+        template <class A>
+        XSIMD_INLINE float reduce_mul(batch<float, A> const& self, requires_arch<vsx>) noexcept
+        { // Product of the four float lanes, computed with two vector multiplies.
+            // FIXME: find an in-order approach (NOTE(review): presumably because lanes are paired as (v0 * v3) * (v2 * v1) instead of multiplied sequentially, which may round differently)
+            auto tmp0 = vec_reve(self.data); // v3, v2, v1, v0
+            auto tmp1 = vec_mul(self.data, tmp0); // v0 * v3, v1 * v2, v2 * v1, v3 * v0
+            auto tmp2 = vec_mergel(tmp1, tmp1); // v2 * v1, v2 * v1, v3 * v0, v3 * v0
+            auto tmp3 = vec_mul(tmp1, tmp2); // lane 0 = (v0 * v3) * (v2 * v1)
+            return vec_extract(tmp3, 0);
+        }
+        template <class A>
+        XSIMD_INLINE double reduce_mul(batch<double, A> const& self, requires_arch<vsx>) noexcept
+        { // Product of the two double lanes, computed with one vector multiply.
+            auto tmp0 = vec_reve(self.data); // v1, v0
+            auto tmp1 = vec_mul(self.data, tmp0); // v0 * v1, v1 * v0
+            return vec_extract(tmp1, 0); // both lanes hold the same product; take lane 0
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
+        XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<vsx>) noexcept
+        { // Catch-all for scalar element types: no VSX-specific sequence, so defer to the generic kernel.
+            return reduce_mul(self, common {}); // explicit `common` tag selects the buffer-and-loop implementation
         }
 
         // round
diff --git a/include/xsimd/arch/xsimd_wasm.hpp b/include/xsimd/arch/xsimd_wasm.hpp
diff --git a/include/xsimd/types/xsimd_api.hpp b/include/xsimd/types/xsimd_api.hpp
diff --git a/test/test_batch.cpp b/test/test_batch.cpp
diff --git a/test/test_batch_complex.cpp b/test/test_batch_complex.cpp

Original file line number	Diff line number	Diff line change
`@@ -1046,7 +1046,7 @@ namespace xsimd`
`1046`	`1046`	`}`
`1047`	`1047`
`1048`	`1048`	`// reduce_add`
`1049`		`- template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value \|\| std::is_same<T, float>::value \|\| std::is_same<T, double>::value, void>::type>`
	`1049`	`+ template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>`
`1050`	`1050`	`XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<avx>) noexcept`
`1051`	`1051`	`{`
`1052`	`1052`	`typename batch<T, sse4_2>::register_type low, high;`
`@@ -1077,6 +1077,16 @@ namespace xsimd`
`1077`	`1077`	`return reduce_min(batch<T, sse4_2>(low));`
`1078`	`1078`	`}`
`1079`	`1079`
	`1080`	`+ // reduce_mul`
	`1081`	`+ template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>`
	`1082`	`+ XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<avx>) noexcept`
	`1083`	`+ {`
	`1084`	`+ typename batch<T, sse4_2>::register_type low, high;`
	`1085`	`+ detail::split_avx(self, low, high);`
	`1086`	`+ batch<T, sse4_2> blow(low), bhigh(high);`
	`1087`	`+ return reduce_mul(blow * bhigh);`
	`1088`	`+ }`
	`1089`	`+`
`1080`	`1090`	`// rsqrt`
`1081`	`1091`	`template <class A>`
`1082`	`1092`	`XSIMD_INLINE batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<avx>) noexcept`
`@@ -1911,4 +1921,4 @@ namespace xsimd`
`1911`	`1921`	`}`
`1912`	`1922`	`}`
`1913`	`1923`
`1914`		`-#endif`
	`1924`	`+#endif`