diff --git a/docs/source/api/reducer_index.rst b/docs/source/api/reducer_index.rst
index 09f769be9..f99c7f1a7 100644
--- a/docs/source/api/reducer_index.rst
+++ b/docs/source/api/reducer_index.rst
@@ -38,6 +38,8 @@ Reduction operators
 +---------------------------------------+----------------------------------------------------+
 | :cpp:func:`reduce_min`                | min of the batch elements                          |
 +---------------------------------------+----------------------------------------------------+
+| :cpp:func:`reduce_mul`                | product of the batch elements                      |
++---------------------------------------+----------------------------------------------------+
 | :cpp:func:`haddp`                     | horizontal sum across batches                      |
 +---------------------------------------+----------------------------------------------------+
diff --git a/include/xsimd/arch/common/xsimd_common_arithmetic.hpp b/include/xsimd/arch/common/xsimd_common_arithmetic.hpp
index 2e83d33de..8a0d2e428 100644
--- a/include/xsimd/arch/common/xsimd_common_arithmetic.hpp
+++ b/include/xsimd/arch/common/xsimd_common_arithmetic.hpp
@@ -139,20 +139,6 @@ namespace xsimd
             return fma(x, y, select(mask, neg(z), z));
         }
 
-        // hadd
-        template <class A, class T, class /*= typename std::enable_if<std::is_scalar<T>::value, void>::type*/>
-        XSIMD_INLINE T hadd(batch<T, A> const& self, requires_arch<common>) noexcept
-        {
-            alignas(A::alignment()) T buffer[batch<T, A>::size];
-            self.store_aligned(buffer);
-            T res = 0;
-            for (T val : buffer)
-            {
-                res += val;
-            }
-            return res;
-        }
-
         // incr
         template <class A, class T>
         XSIMD_INLINE batch<T, A> incr(batch<T, A> const& self, requires_arch<common>) noexcept
diff --git a/include/xsimd/arch/common/xsimd_common_details.hpp b/include/xsimd/arch/common/xsimd_common_details.hpp
index 7103589b7..28590ce99 100644
--- a/include/xsimd/arch/common/xsimd_common_details.hpp
+++ b/include/xsimd/arch/common/xsimd_common_details.hpp
@@ -77,6 +77,8 @@ namespace xsimd
     template <class T, class A>
     XSIMD_INLINE T reduce_add(batch<T, A> const&) noexcept;
     template <class T, class A>
+    XSIMD_INLINE T reduce_mul(batch<T, A> const&) noexcept;
+    template <class T, class A>
     XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const&, batch<T, A> const&, batch<T, A> const&) noexcept;
     template <class T, class A>
     XSIMD_INLINE batch<std::complex<T>, A> select(batch_bool<T, A> const&, batch<std::complex<T>, A> const&, batch<std::complex<T>, A> const&) noexcept;
diff --git a/include/xsimd/arch/common/xsimd_common_math.hpp b/include/xsimd/arch/common/xsimd_common_math.hpp
index 10db980e2..e5bc57db9 100644
--- a/include/xsimd/arch/common/xsimd_common_math.hpp
+++ b/include/xsimd/arch/common/xsimd_common_math.hpp
@@ -2103,6 +2103,19 @@ namespace xsimd
             return { reduce_add(self.real()), reduce_add(self.imag()) };
         }
 
+        template <class A, class T, class /*= typename std::enable_if<std::is_scalar<T>::value, void>::type*/>
+        XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<common>) noexcept
+        {
+            alignas(A::alignment()) T buffer[batch<T, A>::size];
+            self.store_aligned(buffer);
+            T res = 0;
+            for (T val : buffer)
+            {
+                res += val;
+            }
+            return res;
+        }
+
         namespace detail
         {
             template <class T, class A>
@@ -2147,6 +2160,34 @@ namespace xsimd
                 self, std::integral_constant<unsigned, batch<T, A>::size>());
         }
 
+        // reduce_mul
+        template <class A, class T>
+        XSIMD_INLINE std::complex<T> reduce_mul(batch<std::complex<T>, A> const& self, requires_arch<common>) noexcept
+        {
+            // FIXME: could do better
+            alignas(A::alignment()) std::complex<T> buffer[batch<std::complex<T>, A>::size];
+            self.store_aligned(buffer);
+            std::complex<T> res = 1;
+            for (auto val : buffer)
+            {
+                res *= val;
+            }
+            return res;
+        }
+
+        template <class A, class T, class /*= typename std::enable_if<std::is_scalar<T>::value, void>::type*/>
+        XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<common>) noexcept
+        {
+            alignas(A::alignment()) T buffer[batch<T, A>::size];
+            self.store_aligned(buffer);
+            T res = 1;
+            for (T val : buffer)
+            {
+                res *= val;
+            }
+            return res;
+        }
+
         // remainder
         template <class A, class T>
         XSIMD_INLINE batch<T, A> remainder(batch<T, A> const& self, batch<T, A> const& other, requires_arch<common>) noexcept
diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp
index ed3319d4e..20bbf600f 100644
--- a/include/xsimd/arch/xsimd_avx.hpp
+++ b/include/xsimd/arch/xsimd_avx.hpp
@@ -1046,7 +1046,7 @@ namespace xsimd
         }
 
         // reduce_add
-        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value || std::is_same<T, float>::value || std::is_same<T, double>::value, void>::type>
+        template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
         XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<avx>) noexcept
         {
             typename batch<T, sse4_2>::register_type low, high;
@@ -1077,6 +1077,16 @@ namespace xsimd
             return reduce_min(batch<T, sse4_2>(low));
         }
 
+        // reduce_mul
+        template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
+        XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<avx>) noexcept
+        {
+            typename batch<T, sse4_2>::register_type low, high;
+            detail::split_avx(self, low, high);
+            batch<T, sse4_2> blow(low), bhigh(high);
+            return reduce_mul(blow * bhigh);
+        }
+
         // rsqrt
         template <class A>
         XSIMD_INLINE batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<avx>) noexcept
@@ -1911,4 +1921,4 @@ namespace xsimd
     }
 }
 
-#endif
\ No newline at end of file
+#endif
diff --git a/include/xsimd/arch/xsimd_avx512dq.hpp b/include/xsimd/arch/xsimd_avx512dq.hpp
index 9b234587f..063affa4c 100644
--- a/include/xsimd/arch/xsimd_avx512dq.hpp
+++ b/include/xsimd/arch/xsimd_avx512dq.hpp
@@ -188,6 +188,16 @@ namespace xsimd
             return reduce_add(batch<float, avx2>(res1), avx2 {});
         }
 
+        // reduce_mul
+        template <class A>
+        XSIMD_INLINE float reduce_mul(batch<float, A> const& rhs, requires_arch<avx512dq>) noexcept
+        {
+            __m256 tmp1 = _mm512_extractf32x8_ps(rhs, 1);
+            __m256 tmp2 = _mm512_extractf32x8_ps(rhs, 0);
+            __m256 res1 = _mm256_mul_ps(tmp1, tmp2);
+            return reduce_mul(batch<float, avx2>(res1), avx2 {});
+        }
+
         // swizzle constant mask
         template <class A, uint32_t... Vs>
diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp
index d770bde28..7073d9d4f 100644
--- a/include/xsimd/arch/xsimd_avx512f.hpp
+++ b/include/xsimd/arch/xsimd_avx512f.hpp
@@ -1558,6 +1558,37 @@ namespace xsimd
             return reduce_min(batch<T, avx2>(low));
         }
 
+        // reduce_mul
+        template <class A>
+        XSIMD_INLINE float reduce_mul(batch<float, A> const& rhs, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_reduce_mul_ps(rhs);
+        }
+        template <class A>
+        XSIMD_INLINE double reduce_mul(batch<double, A> const& rhs, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_reduce_mul_pd(rhs);
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm512_reduce_mul_epi32(self);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm512_reduce_mul_epi64(self);
+            }
+            else
+            {
+                __m256i low, high;
+                detail::split_avx512(self, low, high);
+                batch<T, avx2> blow(low), bhigh(high);
+                return reduce_mul(blow, avx2 {}) * reduce_mul(bhigh, avx2 {});
+            }
+        }
+
         // rsqrt
         template <class A>
         XSIMD_INLINE batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<avx512f>) noexcept
diff --git a/include/xsimd/arch/xsimd_common_fwd.hpp b/include/xsimd/arch/xsimd_common_fwd.hpp
index aa9a0c163..0c3cb362e 100644
--- a/include/xsimd/arch/xsimd_common_fwd.hpp
+++ b/include/xsimd/arch/xsimd_common_fwd.hpp
@@ -36,8 +36,10 @@ namespace xsimd
         XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<common>) noexcept;
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
         XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<common>) noexcept;
-        template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
-        XSIMD_INLINE T hadd(batch<T, A> const& self, requires_arch<common>) noexcept;
+        template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
+        XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<common>) noexcept;
+        template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
+        XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<common>) noexcept;
 
         // Forward declarations for pack-level helpers
         namespace detail
         {
diff --git a/include/xsimd/arch/xsimd_emulated.hpp b/include/xsimd/arch/xsimd_emulated.hpp
index bf169cd9a..77cf03a8a 100644
--- a/include/xsimd/arch/xsimd_emulated.hpp
+++ b/include/xsimd/arch/xsimd_emulated.hpp
@@ -601,6 +601,16 @@ namespace xsimd
                                      { return xsimd::min(x, y); });
         }
 
+        // reduce_mul
+        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+        XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
+        {
+            constexpr size_t size = batch<T, A>::size;
+            std::array<T, size> buffer;
+            self.store_unaligned(buffer.data());
+            return std::accumulate(buffer.begin() + 1, buffer.end(), *buffer.begin(), std::multiplies<T>());
+        }
+
         // rsqrt
         template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
         XSIMD_INLINE batch<T, A> rsqrt(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
diff --git a/include/xsimd/arch/xsimd_neon.hpp b/include/xsimd/arch/xsimd_neon.hpp
index d6c800de1..5bd7d9206 100644
--- a/include/xsimd/arch/xsimd_neon.hpp
+++ b/include/xsimd/arch/xsimd_neon.hpp
@@ -1705,14 +1705,21 @@ namespace xsimd
          * reduce_max *
          **************/
 
-        // Using common implementation because ARM doe snot provide intrinsics
+        // Using common implementation because ARM does not provide intrinsics
         // for this operation
 
         /**************
          * reduce_min *
          **************/
 
-        // Using common implementation because ARM doe snot provide intrinsics
+        // Using common implementation because ARM does not provide intrinsics
+        // for this operation
+
+        /**************
+         * reduce_mul *
+         **************/
+
+        // Using common implementation because ARM does not provide intrinsics
         // for this operation
 
         /**********
diff --git a/include/xsimd/arch/xsimd_sse2.hpp b/include/xsimd/arch/xsimd_sse2.hpp
index b1e447103..e8ad82875 100644
--- a/include/xsimd/arch/xsimd_sse2.hpp
+++ b/include/xsimd/arch/xsimd_sse2.hpp
@@ -1290,7 +1290,7 @@ namespace xsimd
             }
             else
             {
-                return hadd(self, common {});
+                return reduce_add(self, common {});
             }
         }
 
@@ -1344,6 +1344,52 @@ namespace xsimd
             return first(acc3, A {});
         }
 
+        // reduce_mul
+        template <class A>
+        XSIMD_INLINE float reduce_mul(batch<float, A> const& self, requires_arch<sse2>) noexcept
+        {
+            __m128 tmp0 = _mm_mul_ps(self, _mm_movehl_ps(self, self));
+            __m128 tmp1 = _mm_mul_ss(tmp0, _mm_shuffle_ps(tmp0, tmp0, 1));
+            return _mm_cvtss_f32(tmp1);
+        }
+
+        template <class A>
+        XSIMD_INLINE double reduce_mul(batch<double, A> const& self, requires_arch<sse2>) noexcept
+        {
+            return _mm_cvtsd_f64(_mm_mul_sd(self, _mm_unpackhi_pd(self, self)));
+        }
+
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<sse2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                batch<T, A> tmp1 = _mm_shuffle_epi32(self, _MM_SHUFFLE(0, 1, 2, 3));
+                tmp1 = tmp1 * self;
+                batch<T, A> tmp2 = _mm_unpackhi_epi32(tmp1, tmp1);
+                tmp2 = tmp2 * tmp1;
+                return _mm_cvtsi128_si32(tmp2);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                batch<T, A> tmp1 = _mm_unpackhi_epi64(self, self);
+                auto tmp2 = tmp1 * self;
+#if defined(__x86_64__)
+                return _mm_cvtsi128_si64(tmp2);
+#else
+                __m128i m;
+                _mm_storel_epi64(&m, tmp2);
+                int64_t i;
+                std::memcpy(&i, &m, sizeof(i));
+                return i;
+#endif
+            }
+            else
+            {
+                return reduce_mul(self, common {});
+            }
+        }
+
         // rsqrt
         template <class A>
         XSIMD_INLINE batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<sse2>) noexcept
diff --git a/include/xsimd/arch/xsimd_sse3.hpp b/include/xsimd/arch/xsimd_sse3.hpp
index 9dbc4b343..a88d693cd 100644
--- a/include/xsimd/arch/xsimd_sse3.hpp
+++ b/include/xsimd/arch/xsimd_sse3.hpp
@@ -51,6 +51,15 @@ namespace xsimd
             return _mm_cvtss_f32(tmp1);
         }
 
+        // reduce_mul
+        template <class A>
+        XSIMD_INLINE float reduce_mul(batch<float, A> const& self, requires_arch<sse3>) noexcept
+        {
+            __m128 tmp1 = _mm_mul_ps(self, _mm_movehl_ps(self, self));
+            __m128 tmp2 = _mm_mul_ps(tmp1, _mm_movehdup_ps(tmp1));
+            return _mm_cvtss_f32(tmp2);
+        }
+
     }
 }
 
diff --git a/include/xsimd/arch/xsimd_vsx.hpp b/include/xsimd/arch/xsimd_vsx.hpp
index 953d67faf..0e7d74a3b 100644
--- a/include/xsimd/arch/xsimd_vsx.hpp
+++ b/include/xsimd/arch/xsimd_vsx.hpp
@@ -559,7 +559,49 @@ namespace xsimd
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<vsx>) noexcept
        {
-           return hadd(self, common {});
+           return reduce_add(self, common {});
+       }
+
+       // reduce_mul
+       template <class A>
+       XSIMD_INLINE signed reduce_mul(batch<signed, A> const& self, requires_arch<vsx>) noexcept
+       {
+           auto tmp0 = vec_reve(self.data); // v3, v2, v1, v0
+           auto tmp1 = vec_mul(self.data, tmp0); // v0 * v3, v1 * v2, v2 * v1, v3 * v0
+           auto tmp2 = vec_mergel(tmp1, tmp1); // v2 * v1, v2 * v1, v3 * v0, v3 * v0
+           auto tmp3 = vec_mul(tmp1, tmp2);
+           return vec_extract(tmp3, 0);
+       }
+       template <class A>
+       XSIMD_INLINE unsigned reduce_mul(batch<unsigned, A> const& self, requires_arch<vsx>) noexcept
+       {
+           auto tmp0 = vec_reve(self.data); // v3, v2, v1, v0
+           auto tmp1 = vec_mul(self.data, tmp0); // v0 * v3, v1 * v2, v2 * v1, v3 * v0
+           auto tmp2 = vec_mergel(tmp1, tmp1); // v2 * v1, v2 * v1, v3 * v0, v3 * v0
+           auto tmp3 = vec_mul(tmp1, tmp2);
+           return vec_extract(tmp3, 0);
+       }
+       template <class A>
+       XSIMD_INLINE float reduce_mul(batch<float, A> const& self, requires_arch<vsx>) noexcept
+       {
+           // FIXME: find an in-order approach
+           auto tmp0 = vec_reve(self.data); // v3, v2, v1, v0
+           auto tmp1 = vec_mul(self.data, tmp0); // v0 * v3, v1 * v2, v2 * v1, v3 * v0
+           auto tmp2 = vec_mergel(tmp1, tmp1); // v2 * v1, v2 * v1, v3 * v0, v3 * v0
+           auto tmp3 = vec_mul(tmp1, tmp2);
+           return vec_extract(tmp3, 0);
+       }
+       template <class A>
+       XSIMD_INLINE double reduce_mul(batch<double, A> const& self, requires_arch<vsx>) noexcept
+       {
+           auto tmp0 = vec_reve(self.data); // v1, v0
+           auto tmp1 = vec_mul(self.data, tmp0); // v0 * v1, v1 * v0
+           return vec_extract(tmp1, 0);
+       }
+       template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+       XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<vsx>) noexcept
+       {
+           return reduce_mul(self, common {});
        }
 
        // round
diff --git a/include/xsimd/arch/xsimd_wasm.hpp b/include/xsimd/arch/xsimd_wasm.hpp
index 29d9aed0a..ad0867c5b 100644
--- a/include/xsimd/arch/xsimd_wasm.hpp
+++ b/include/xsimd/arch/xsimd_wasm.hpp
@@ -1206,7 +1206,7 @@ namespace xsimd
             }
             else
             {
-                return hadd(self, common {});
+                return reduce_add(self, common {});
             }
         }
         template <class A>
@@ -1218,6 +1218,47 @@ namespace xsimd
             return wasm_f64x2_extract_lane(tmp2, 0);
         }
 
+        // reduce_mul
+        template <class A>
+        XSIMD_INLINE float reduce_mul(batch<float, A> const& self, requires_arch<wasm>) noexcept
+        {
+            v128_t tmp0 = wasm_f32x4_mul(self, wasm_i32x4_shuffle(self, self, 6, 7, 2, 3));
+            v128_t tmp1 = wasm_i32x4_shuffle(tmp0, tmp0, 1, 0, 4, 4);
+            v128_t tmp2 = wasm_f32x4_mul(tmp0, tmp1);
+            v128_t tmp3 = wasm_i32x4_shuffle(tmp0, tmp2, 4, 1, 2, 3);
+            return wasm_f32x4_extract_lane(tmp3, 0);
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<wasm>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                v128_t tmp0 = wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 2, 3, 0, 0);
+                v128_t tmp1 = wasm_i32x4_mul(self, tmp0);
+                v128_t tmp2 = wasm_i32x4_shuffle(tmp1, wasm_i32x4_splat(0), 1, 0, 0, 0);
+                v128_t tmp3 = wasm_i32x4_mul(tmp1, tmp2);
+                return wasm_i32x4_extract_lane(tmp3, 0);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                v128_t tmp0 = wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 2, 3, 0, 0);
+                v128_t tmp1 = wasm_i64x2_mul(self, tmp0);
+                return wasm_i64x2_extract_lane(tmp1, 0);
+            }
+            else
+            {
+                return reduce_mul(self, common {});
+            }
+        }
+        template <class A>
+        XSIMD_INLINE double reduce_mul(batch<double, A> const& self, requires_arch<wasm>) noexcept
+        {
+            v128_t tmp0 = wasm_i64x2_shuffle(self, self, 1, 3);
+            v128_t tmp1 = wasm_f64x2_mul(self, tmp0);
+            v128_t tmp2 = wasm_i64x2_shuffle(tmp0, tmp1, 2, 1);
+            return wasm_f64x2_extract_lane(tmp2, 0);
+        }
+
         // rsqrt
         template <class A>
         XSIMD_INLINE batch<float, A> rsqrt(batch<float, A> const& self, requires_arch<wasm>) noexcept
diff --git a/include/xsimd/types/xsimd_api.hpp b/include/xsimd/types/xsimd_api.hpp
index eeee5b267..875134766 100644
--- a/include/xsimd/types/xsimd_api.hpp
+++ b/include/xsimd/types/xsimd_api.hpp
@@ -1879,6 +1879,20 @@ namespace xsimd
         return kernel::reduce_min<A>(x, A {});
     }
 
+    /**
+     * @ingroup batch_reducers
+     *
+     * Multiplies all the scalars of the batch \c x.
+     * @param x batch involved in the reduction
+     * @return the result of the reduction.
+     */
+    template <class T, class A>
+    XSIMD_INLINE T reduce_mul(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::reduce_mul<A>(x, A {});
+    }
+
     /**
      * @ingroup batch_math
      *
diff --git a/test/test_batch.cpp b/test/test_batch.cpp
index d23b53729..b301a5bea 100644
--- a/test/test_batch.cpp
+++ b/test/test_batch.cpp
@@ -840,6 +840,13 @@ struct batch_test
             INFO("reduce_min");
             CHECK_SCALAR_EQ(res, expected);
         }
+        // reduce_mul
+        {
+            value_type expected = std::accumulate(lhs.cbegin(), lhs.cend(), value_type(1), std::multiplies<value_type>());
+            value_type res = reduce_mul(batch_lhs());
+            INFO("reduce_mul");
+            CHECK_SCALAR_EQ(res, expected);
+        }
     }
 
     template <class T = value_type>
diff --git a/test/test_batch_complex.cpp b/test/test_batch_complex.cpp
index 47ed9ca5b..e06ad83fb 100644
--- a/test/test_batch_complex.cpp
+++ b/test/test_batch_complex.cpp
@@ -578,6 +578,12 @@ struct batch_complex_test
             value_type res = reduce_add(batch_lhs());
             CHECK_SCALAR_EQ(res, expected);
         }
+        // reduce_mul
+        {
+            value_type expected = std::accumulate(lhs.cbegin(), lhs.cend(), value_type(1), std::multiplies<value_type>());
+            value_type res = reduce_mul(batch_lhs());
+            CHECK_SCALAR_EQ(res, expected);
+        }
     }
 
     void test_fused_operations() const
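A minimal usage sketch of the reducer introduced by this patch, assuming an xsimd-enabled build; the element type, the broadcast value, and the printed result are illustrative only, and the lane count depends on the architecture selected at compile time:

    #include <iostream>
    #include "xsimd/xsimd.hpp"

    int main()
    {
        // Broadcast 2.0f across every lane of the default-architecture batch.
        xsimd::batch<float> b(2.0f);
        // reduce_mul multiplies all lanes together, so this prints 2^batch::size
        // (e.g. 256 for an 8-lane AVX float batch).
        std::cout << xsimd::reduce_mul(b) << '\n';
        return 0;
    }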