Skip to content

Commit eb3bacb

Browse files
authored
Merge pull request #1163 from xtensor-stack/bug/1132
generalization of reduce_mul implementation
2 parents 429da70 + 83727ee commit eb3bacb

17 files changed

+289
-23
lines changed

docs/source/api/reducer_index.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ Reduction operators
3838
+---------------------------------------+----------------------------------------------------+
3939
| :cpp:func:`reduce_min` | min of the batch elements |
4040
+---------------------------------------+----------------------------------------------------+
41+
| :cpp:func:`reduce_mul` | product of the batch elements |
42+
+---------------------------------------+----------------------------------------------------+
4143
| :cpp:func:`haddp` | horizontal sum across batches |
4244
+---------------------------------------+----------------------------------------------------+
4345

include/xsimd/arch/common/xsimd_common_arithmetic.hpp

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -139,20 +139,6 @@ namespace xsimd
139139
return fma(x, y, select(mask, neg(z), z));
140140
}
141141

142-
// hadd
143-
// Common-architecture hadd (these lines are on the removed side of this diff).
// Stores the batch into an aligned stack buffer and returns the sum of all of
// its elements, accumulated in a T starting from 0.
// The third template parameter is unnamed; its default is only shown here as
// a comment (an enable_if on std::is_integral<T>).
template <class A, class T, class /*=typename std::enable_if<std::is_integral<T>::value, void>::type*/>
144-
XSIMD_INLINE T hadd(batch<T, A> const& self, requires_arch<common>) noexcept
145-
{
146-
// Scratch storage with one slot per batch element, aligned as A requires.
alignas(A::alignment()) T buffer[batch<T, A>::size];
147-
self.store_aligned(buffer);
148-
T res = 0;
149-
for (T val : buffer)
150-
{
151-
res += val;
152-
}
153-
return res;
154-
}
155-
156142
// incr
157143
template <class A, class T>
158144
XSIMD_INLINE batch<T, A> incr(batch<T, A> const& self, requires_arch<common>) noexcept

include/xsimd/arch/common/xsimd_common_details.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,8 @@ namespace xsimd
7777
template <class T, class A>
7878
XSIMD_INLINE T reduce_add(batch<T, A> const&) noexcept;
7979
// Forward declaration: reduce_mul takes a batch<T, A> by const reference and
// returns a single T.
template <class T, class A>
80+
XSIMD_INLINE T reduce_mul(batch<T, A> const&) noexcept;
81+
template <class T, class A>
8082
XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const&, batch<T, A> const&, batch<T, A> const&) noexcept;
8183
template <class T, class A>
8284
XSIMD_INLINE batch<std::complex<T>, A> select(batch_bool<T, A> const&, batch<std::complex<T>, A> const&, batch<std::complex<T>, A> const&) noexcept;

include/xsimd/arch/common/xsimd_common_math.hpp

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2103,6 +2103,19 @@ namespace xsimd
21032103
return { reduce_add(self.real()), reduce_add(self.imag()) };
21042104
}
21052105

2106+
template <class A, class T, class /*=typename std::enable_if<std::is_scalar<T>::value, void>::type*/>
2107+
XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<common>) noexcept
2108+
{
2109+
alignas(A::alignment()) T buffer[batch<T, A>::size];
2110+
self.store_aligned(buffer);
2111+
T res = 0;
2112+
for (T val : buffer)
2113+
{
2114+
res += val;
2115+
}
2116+
return res;
2117+
}
2118+
21062119
namespace detail
21072120
{
21082121
template <class T, T N>
@@ -2147,6 +2160,34 @@ namespace xsimd
21472160
self, std::integral_constant<unsigned, batch<T, A>::size>());
21482161
}
21492162

2163+
// reduce_mul
2164+
template <class A, class T>
2165+
XSIMD_INLINE std::complex<T> reduce_mul(batch<std::complex<T>, A> const& self, requires_arch<common>) noexcept
2166+
{
2167+
// FIXME: could do better
2168+
alignas(A::alignment()) std::complex<T> buffer[batch<std::complex<T>, A>::size];
2169+
self.store_aligned(buffer);
2170+
std::complex<T> res = 1;
2171+
for (auto val : buffer)
2172+
{
2173+
res *= val;
2174+
}
2175+
return res;
2176+
}
2177+
2178+
template <class A, class T, class /*=typename std::enable_if<std::is_scalar<T>::value, void>::type*/>
2179+
XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<common>) noexcept
2180+
{
2181+
alignas(A::alignment()) T buffer[batch<T, A>::size];
2182+
self.store_aligned(buffer);
2183+
T res = 1;
2184+
for (T val : buffer)
2185+
{
2186+
res *= val;
2187+
}
2188+
return res;
2189+
}
2190+
21502191
// remainder
21512192
template <class A>
21522193
XSIMD_INLINE batch<float, A> remainder(batch<float, A> const& self, batch<float, A> const& other, requires_arch<common>) noexcept

include/xsimd/arch/xsimd_avx.hpp

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1046,7 +1046,7 @@ namespace xsimd
10461046
}
10471047

10481048
// reduce_add
1049-
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value || std::is_same<T, float>::value || std::is_same<T, double>::value, void>::type>
1049+
template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
10501050
XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<avx>) noexcept
10511051
{
10521052
typename batch<T, sse4_2>::register_type low, high;
@@ -1077,6 +1077,16 @@ namespace xsimd
10771077
return reduce_min(batch<T, sse4_2>(low));
10781078
}
10791079

1080+
// reduce_mul
1081+
template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
1082+
XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<avx>) noexcept
1083+
{
1084+
typename batch<T, sse4_2>::register_type low, high;
1085+
detail::split_avx(self, low, high);
1086+
batch<T, sse4_2> blow(low), bhigh(high);
1087+
return reduce_mul(blow * bhigh);
1088+
}
1089+
10801090
// rsqrt
10811091
template <class A>
10821092
XSIMD_INLINE batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<avx>) noexcept
@@ -1911,4 +1921,4 @@ namespace xsimd
19111921
}
19121922
}
19131923

1914-
#endif
1924+
#endif

include/xsimd/arch/xsimd_avx512dq.hpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,16 @@ namespace xsimd
188188
return reduce_add(batch<float, avx2>(res1), avx2 {});
189189
}
190190

191+
// reduce_mul
192+
template <class A>
193+
XSIMD_INLINE float reduce_mul(batch<float, A> const& rhs, requires_arch<avx512dq>) noexcept
194+
{
195+
__m256 tmp1 = _mm512_extractf32x8_ps(rhs, 1);
196+
__m256 tmp2 = _mm512_extractf32x8_ps(rhs, 0);
197+
__m256 res1 = _mm256_mul_ps(tmp1, tmp2);
198+
return reduce_mul(batch<float, avx2>(res1), avx2 {});
199+
}
200+
191201
// swizzle constant mask
192202
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7,
193203
uint32_t V8, uint32_t V9, uint32_t V10, uint32_t V11, uint32_t V12, uint32_t V13, uint32_t V14, uint32_t V15>

include/xsimd/arch/xsimd_avx512f.hpp

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1558,6 +1558,37 @@ namespace xsimd
15581558
return reduce_min(batch<T, avx2>(low));
15591559
}
15601560

1561+
// reduce_mul
1562+
template <class A>
1563+
XSIMD_INLINE float reduce_mul(batch<float, A> const& rhs, requires_arch<avx512f>) noexcept
1564+
{
1565+
return _mm512_reduce_mul_ps(rhs);
1566+
}
1567+
template <class A>
1568+
XSIMD_INLINE double reduce_mul(batch<double, A> const& rhs, requires_arch<avx512f>) noexcept
1569+
{
1570+
return _mm512_reduce_mul_pd(rhs);
1571+
}
1572+
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1573+
XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<avx512f>) noexcept
1574+
{
1575+
XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1576+
{
1577+
return _mm512_reduce_mul_epi32(self);
1578+
}
1579+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
1580+
{
1581+
return _mm512_reduce_mul_epi64(self);
1582+
}
1583+
else
1584+
{
1585+
__m256i low, high;
1586+
detail::split_avx512(self, low, high);
1587+
batch<T, avx2> blow(low), bhigh(high);
1588+
return reduce_mul(blow, avx2 {}) * reduce_mul(bhigh, avx2 {});
1589+
}
1590+
}
1591+
15611592
// rsqrt
15621593
template <class A>
15631594
XSIMD_INLINE batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<avx512f>) noexcept

include/xsimd/arch/xsimd_common_fwd.hpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,10 @@ namespace xsimd
3636
XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<common>) noexcept;
3737
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
3838
XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<common>) noexcept;
39-
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
40-
XSIMD_INLINE T hadd(batch<T, A> const& self, requires_arch<common>) noexcept;
39+
// Forward declaration of the common-architecture reduce_add overload.
// Only participates in overload resolution when T satisfies std::is_scalar.
template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
40+
XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<common>) noexcept;
41+
// Forward declaration of the common-architecture reduce_mul overload,
// constrained the same way (std::is_scalar<T>).
template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
42+
XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<common>) noexcept;
4143
// Forward declarations for pack-level helpers
4244
namespace detail
4345
{

include/xsimd/arch/xsimd_emulated.hpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -601,6 +601,16 @@ namespace xsimd
601601
{ return xsimd::min(x, y); });
602602
}
603603

604+
// reduce_mul
605+
// Emulated-architecture reduce_mul.
// Copies the batch into a std::array and folds it left to right with
// std::multiplies<T>, seeded with the first lane rather than a literal 1.
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
{
    constexpr size_t lane_count = batch<T, A>::size;

    std::array<T, lane_count> lanes;
    self.store_unaligned(lanes.data());

    T product = lanes[0];
    for (size_t i = 1; i < lane_count; ++i)
    {
        product = std::multiplies<T>()(product, lanes[i]);
    }
    return product;
}
613+
604614
// rsqrt
605615
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
606616
XSIMD_INLINE batch<T, A> rsqrt(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept

include/xsimd/arch/xsimd_neon.hpp

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1705,14 +1705,21 @@ namespace xsimd
17051705
* reduce_max *
17061706
**************/
17071707

1708-
// Using common implementation because ARM doe snot provide intrinsics
1708+
// Using common implementation because ARM does not provide intrinsics
17091709
// for this operation
17101710

17111711
/**************
17121712
* reduce_min *
17131713
**************/
17141714

1715-
// Using common implementation because ARM doe snot provide intrinsics
1715+
// Using common implementation because ARM does not provide intrinsics
1716+
// for this operation
1717+
1718+
/**************
1719+
* reduce_mul *
1720+
**************/
1721+
1722+
// Using common implementation because ARM does not provide intrinsics
17161723
// for this operation
17171724

17181725
/**********

0 commit comments

Comments
 (0)