Test and fill holes in the xsimd scalar API
Notable changes:
- ensure parity between the xsimd::* vector and scalar versions
- fix an argument-order bug in the bitwise_andnot implementation on Intel (see the sketch below)
- fix cast warnings in load_aligned and store_aligned on ARM architectures
- fix ambiguous overloads between some scalar and batch versions
- harmonize the xsimd::sincos API between the scalar and vector versions
- fix an incorrect xsimd::neq overload for complex batches
- remove polynomial evaluation from the public API

Minor changes:
- fix typos, indentation, etc.
- remove the legacy 'fallback_' prefix from test descriptions

Fix #784
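
The bitwise_andnot fix deserves a short illustration. Intel's _mm*_andnot_* intrinsics negate their first operand, computing ~a & b, while — as the corrected calls in the diff below suggest — xsimd::bitwise_andnot(x, y) is meant to compute x & ~y, so the operands must be swapped when forwarding to the intrinsic. A minimal scalar sketch of the two conventions (mm_andnot and xsimd_andnot are stand-in names, not real API):

#include <cassert>
#include <cstdint>

// xsimd convention (inferred from the fixed implementations): x & ~y
uint32_t xsimd_andnot(uint32_t x, uint32_t y) { return x & ~y; }

// Intel intrinsic convention: the FIRST operand is negated, i.e. ~a & b
uint32_t mm_andnot(uint32_t a, uint32_t b) { return ~a & b; }

int main()
{
    uint32_t x = 0xC, y = 0xA;
    assert(mm_andnot(x, y) != xsimd_andnot(x, y)); // forwarding verbatim: wrong result
    assert(mm_andnot(y, x) == xsimd_andnot(x, y)); // swapped, as in this commit: correct
}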
serge-sans-paille committed Jul 13, 2022
1 parent b77539b commit 8a5d2e1
Showing 14 changed files with 1,741 additions and 161 deletions.
100 changes: 58 additions & 42 deletions benchmark/xsimd_benchmark.hpp
@@ -49,8 +49,8 @@ namespace xsimd
for (size_t i = 0; i < size; ++i)
{
op0[i] = T(0.5) + std::sqrt(T(i)) * T(9.) / T(size);
-op1[i] = T(10.2) / T(i + 2) + T(0.25);
-op2[i] = T(20.1) / T(i + 5) + T(0.65);
+op1[i] = T(10.2) / T(i + 3) + T(0.25);
+op2[i] = T(20.1) / T(i + 2) + T(0.65);
}
}

@@ -425,36 +425,48 @@ namespace xsimd
out << "============================" << std::endl;
}

-#define DEFINE_OP_FUNCTOR_2OP(OP, NAME) \
-struct NAME##_fn \
-{ \
-template <class T> \
-inline T operator()(const T& lhs, const T& rhs) const { return lhs OP rhs; } \
-inline std::string name() const { return #NAME; } \
+#define DEFINE_OP_FUNCTOR_2OP(OP, NAME) \
+struct NAME##_fn \
+{ \
+template <class T> \
+inline T operator()(const T& lhs, const T& rhs) const \
+{ \
+return lhs OP rhs; \
+} \
+inline std::string name() const \
+{ \
+return #NAME; \
+} \
}

-#define DEFINE_FUNCTOR_1OP(FN) \
-struct FN##_fn \
-{ \
-template <class T> \
-inline T operator()(const T& x) const \
-{ \
-using xsimd::FN; \
-return FN(x); \
-} \
-inline std::string name() const { return #FN; } \
+#define DEFINE_FUNCTOR_1OP(FN) \
+struct FN##_fn \
+{ \
+template <class T> \
+inline T operator()(const T& x) const \
+{ \
+using xsimd::FN; \
+return FN(x); \
+} \
+inline std::string name() const \
+{ \
+return #FN; \
+} \
}

-#define DEFINE_FUNCTOR_1OP_TEMPLATE(FN, N, ...) \
-struct FN##_##N##_fn \
-{ \
-template <class T> \
-inline T operator()(const T& x) const \
-{ \
-using xsimd::FN; \
-return FN<T, __VA_ARGS__>(x); \
-} \
-inline std::string name() const { return #FN " " #N; } \
+#define DEFINE_FUNCTOR_1OP_TEMPLATE(NAME, FN, N, ...) \
+struct NAME##_##N##_fn \
+{ \
+template <class T> \
+inline T operator()(const T& x) const \
+{ \
+using xsimd::FN; \
+return FN<T, __VA_ARGS__>(x); \
+} \
+inline std::string name() const \
+{ \
+return #FN " " #N; \
+} \
}

#define DEFINE_FUNCTOR_2OP(FN) \
@@ -466,7 +478,10 @@ namespace xsimd
using xsimd::FN; \
return FN(lhs, rhs); \
} \
-inline std::string name() const { return #FN; } \
+inline std::string name() const \
+{ \
+return #FN; \
+} \
}

#define DEFINE_FUNCTOR_3OP(FN) \
@@ -478,7 +493,10 @@ namespace xsimd
using xsimd::FN; \
return FN(op0, op1, op2); \
} \
-inline std::string name() const { return #FN; } \
+inline std::string name() const \
+{ \
+return #FN; \
+} \
}

DEFINE_OP_FUNCTOR_2OP(+, add);
@@ -532,18 +550,16 @@ DEFINE_FUNCTOR_1OP(is_odd);
DEFINE_FUNCTOR_1OP(is_even);
#endif

-#ifdef XSIMD_POLY_BENCHMARKS
-DEFINE_FUNCTOR_1OP_TEMPLATE(horner, 5, 1, 2, 3, 4, 5);
-DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, 5, 1, 2, 3, 4, 5);
-DEFINE_FUNCTOR_1OP_TEMPLATE(horner, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
-DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
-DEFINE_FUNCTOR_1OP_TEMPLATE(horner, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12);
-DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12);
-DEFINE_FUNCTOR_1OP_TEMPLATE(horner, 14, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14);
-DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, 14, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14);
-DEFINE_FUNCTOR_1OP_TEMPLATE(horner, 16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
-DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, 16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
-#endif
+DEFINE_FUNCTOR_1OP_TEMPLATE(horner, kernel::horner, 5, 1, 2, 3, 4, 5);
+DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, kernel::estrin, 5, 1, 2, 3, 4, 5);
+DEFINE_FUNCTOR_1OP_TEMPLATE(horner, kernel::horner, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
+DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, kernel::estrin, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
+DEFINE_FUNCTOR_1OP_TEMPLATE(horner, kernel::horner, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12);
+DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, kernel::estrin, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12);
+DEFINE_FUNCTOR_1OP_TEMPLATE(horner, kernel::horner, 14, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14);
+DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, kernel::estrin, 14, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14);
+DEFINE_FUNCTOR_1OP_TEMPLATE(horner, kernel::horner, 16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, kernel::estrin, 16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);

}
#endif
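
For readers unfamiliar with the functor macros above, here is roughly what one invocation of the new three-argument DEFINE_FUNCTOR_1OP_TEMPLATE expands to. The extra NAME argument exists because FN is now a qualified name such as kernel::horner, which contains "::" and therefore cannot be token-pasted into the struct identifier. The stub signature below is a hypothetical stand-in for the real kernel::horner, and the expansion is a sketch, not a verbatim preprocessor dump:

#include <string>

// Stub standing in for the real xsimd::kernel::horner (assumed signature).
namespace xsimd { namespace kernel {
    template <class T, unsigned long long... Coefs>
    T horner(const T& x) noexcept { return x; } // placeholder body
}}

// Approximate expansion of
// DEFINE_FUNCTOR_1OP_TEMPLATE(horner, kernel::horner, 5, 1, 2, 3, 4, 5):
namespace xsimd
{
    struct horner_5_fn
    {
        template <class T>
        inline T operator()(const T& x) const
        {
            using xsimd::kernel::horner;
            return kernel::horner<T, 1, 2, 3, 4, 5>(x);
        }
        inline std::string name() const
        {
            return "kernel::horner" " " "5"; // what #FN " " #N stringizes to
        }
    };
}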
15 changes: 7 additions & 8 deletions include/xsimd/arch/generic/xsimd_generic_math.hpp
@@ -223,7 +223,7 @@ namespace xsimd
}

// copysign
-template <class A, class T>
+template <class A, class T, class _ = typename std::enable_if<std::is_floating_point<T>::value, void>::type>
inline batch<T, A> copysign(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
{
return abs(self) | bitofsign(other);
@@ -470,17 +470,16 @@ namespace xsimd
batch_type x = abs(self);
auto test0 = self < batch_type(0.);
batch_type r1(0.);
-auto test1 = x < batch_type(2.f / 3.f);
batch_type z = x / (batch_type(1.) + x);
-if (any(test1))
+if (any(3.f * x < 2.f))
{
r1 = detail::erf_kernel<batch_type>::erfc3(z);
-if (all(test1))
-return select(test0, batch_type(2.) - r1, r1);
}
-z -= batch_type(0.4f);
-batch_type r2 = exp(-x * x) * detail::erf_kernel<batch_type>::erfc2(z);
-r1 = select(test1, r1, r2);
+else
+{
+z -= batch_type(0.4f);
+r1 = exp(-x * x) * detail::erf_kernel<batch_type>::erfc2(z);
+}
#ifndef XSIMD_NO_INFINITIES
r1 = select(x == constants::infinity<batch_type>(), batch_type(0.), r1);
#endif
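The new enable_if parameter on the generic copysign above restricts that overload to floating-point element types. This is the standard cure for the "ambiguous overload" class of problem mentioned in the commit message: a SFINAE-constrained template silently drops out of overload resolution for non-matching types instead of tying with a competing candidate. A self-contained sketch of the pattern (pick is a made-up name, not xsimd API):

#include <iostream>
#include <type_traits>

template <class T,
          class _ = typename std::enable_if<std::is_floating_point<T>::value, void>::type>
const char* pick(T) { return "floating-point overload"; }

template <class T,
          class _ = typename std::enable_if<std::is_integral<T>::value, void>::type,
          class = void> // third parameter keeps the two templates distinct
const char* pick(T) { return "integral overload"; }

int main()
{
    std::cout << pick(1.0) << '\n'; // floating-point overload
    std::cout << pick(1) << '\n';   // integral overload
}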
8 changes: 4 additions & 4 deletions include/xsimd/arch/xsimd_avx.hpp
@@ -204,23 +204,23 @@ namespace xsimd
template <class A>
inline batch<float, A> bitwise_andnot(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
{
-return _mm256_andnot_ps(self, other);
+return _mm256_andnot_ps(other, self);
}
template <class A>
inline batch<double, A> bitwise_andnot(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
{
-return _mm256_andnot_pd(self, other);
+return _mm256_andnot_pd(other, self);
}

template <class A>
inline batch_bool<float, A> bitwise_andnot(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
{
-return _mm256_andnot_ps(self, other);
+return _mm256_andnot_ps(other, self);
}
template <class A>
inline batch_bool<double, A> bitwise_andnot(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
{
-return _mm256_andnot_pd(self, other);
+return _mm256_andnot_pd(other, self);
}

template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
4 changes: 2 additions & 2 deletions include/xsimd/arch/xsimd_avx2.hpp
@@ -92,12 +92,12 @@ namespace xsimd
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
{
-return _mm256_andnot_si256(self, other);
+return _mm256_andnot_si256(other, self);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
{
-return _mm256_andnot_si256(self, other);
+return _mm256_andnot_si256(other, self);
}

// bitwise_not
4 changes: 2 additions & 2 deletions include/xsimd/arch/xsimd_avx512dq.hpp
@@ -37,12 +37,12 @@ namespace xsimd
template <class A>
inline batch<float, A> bitwise_andnot(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
{
-return _mm512_andnot_ps(self, other);
+return _mm512_andnot_ps(other, self);
}
template <class A>
inline batch<double, A> bitwise_andnot(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
{
-return _mm512_andnot_pd(self, other);
+return _mm512_andnot_pd(other, self);
}

// bitwise_or
10 changes: 7 additions & 3 deletions include/xsimd/arch/xsimd_avx512f.hpp
@@ -336,7 +336,11 @@ namespace xsimd
template <class A>
inline batch<float, A> bitwise_and(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
{
+#if defined(_MSC_VER)
+return _mm512_and_ps(self, other);
+#else
return _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(self), _mm512_castps_si512(other)));
+#endif
}
template <class A>
inline batch<double, A> bitwise_and(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
@@ -361,18 +365,18 @@ namespace xsimd
template <class A>
inline batch<float, A> bitwise_andnot(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
{
-return _mm512_castsi512_ps(_mm512_andnot_si512(_mm512_castps_si512(self), _mm512_castps_si512(other)));
+return _mm512_castsi512_ps(_mm512_andnot_si512(_mm512_castps_si512(other), _mm512_castps_si512(self)));
}
template <class A>
inline batch<double, A> bitwise_andnot(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
{
-return _mm512_castsi512_pd(_mm512_andnot_si512(_mm512_castpd_si512(self), _mm512_castpd_si512(other)));
+return _mm512_castsi512_pd(_mm512_andnot_si512(_mm512_castpd_si512(other), _mm512_castpd_si512(self)));
}

template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
{
-return _mm512_andnot_si512(self, other);
+return _mm512_andnot_si512(other, self);
}

template <class A, class T>
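A note on the bitwise_and change above: _mm512_and_ps belongs to the AVX512DQ extension rather than baseline AVX512F, so the portable path reinterprets the float vectors as integers, ANDs them with the AVX512F intrinsic _mm512_and_si512, and reinterprets the result back. The casts are free bit-pattern reinterpretations, not value conversions, and the _MSC_VER branch presumably accommodates a difference in MSVC's intrinsic coverage that the commit does not spell out; both branches compute the same bit-level AND. A scalar model of what the cast path computes:

#include <cassert>
#include <cstdint>
#include <cstring>

float bitwise_and_model(float a, float b)
{
    uint32_t ia, ib;
    std::memcpy(&ia, &a, sizeof ia); // _mm512_castps_si512 analogue
    std::memcpy(&ib, &b, sizeof ib);
    uint32_t ir = ia & ib;           // _mm512_and_si512 analogue
    float r;
    std::memcpy(&r, &ir, sizeof r);  // _mm512_castsi512_ps analogue
    return r;
}

int main()
{
    // ANDing with an all-ones mask leaves the value unchanged;
    // ANDing with +0.0f (all-zero bits) yields +0.0f.
    uint32_t m = 0xFFFFFFFFu;
    float ones;
    std::memcpy(&ones, &m, sizeof ones);
    assert(bitwise_and_model(3.5f, ones) == 3.5f);
    assert(bitwise_and_model(3.5f, 0.0f) == 0.0f);
}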
32 changes: 16 additions & 16 deletions include/xsimd/arch/xsimd_neon.hpp
@@ -455,44 +455,44 @@ namespace xsimd
template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
-return vld1q_u8(src);
+return vld1q_u8((uint8_t*)src);
}

template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
-return vld1q_s8(src);
+return vld1q_s8((int8_t*)src);
}

template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
-return vld1q_u16(src);
+return vld1q_u16((uint16_t*)src);
}
template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
-return vld1q_s16(src);
+return vld1q_s16((int16_t*)src);
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
-return vld1q_u32(src);
+return vld1q_u32((uint32_t*)src);
}
template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
-return vld1q_s32(src);
+return vld1q_s32((int32_t*)src);
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
-return vld1q_u64(src);
+return vld1q_u64((uint64_t*)src);
}
template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
-return vld1q_s64(src);
+return vld1q_s64((int64_t*)src);
}

template <class A>
@@ -514,49 +514,49 @@ namespace xsimd
template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
{
-vst1q_u8(dst, src);
+vst1q_u8((uint8_t*)dst, src);
}

template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
{
-vst1q_s8(dst, src);
+vst1q_s8((int8_t*)dst, src);
}

template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
{
-vst1q_u16(dst, src);
+vst1q_u16((uint16_t*)dst, src);
}

template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
{
-vst1q_s16(dst, src);
+vst1q_s16((int16_t*)dst, src);
}

template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
{
-vst1q_u32(dst, src);
+vst1q_u32((uint32_t*)dst, src);
}

template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
{
-vst1q_s32(dst, src);
+vst1q_s32((int32_t*)dst, src);
}

template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
{
-vst1q_u64(dst, src);
+vst1q_u64((uint64_t*)dst, src);
}

template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
{
-vst1q_s64(dst, src);
+vst1q_s64((int64_t*)dst, src);
}

template <class A>
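The NEON load/store casts above address a pointer-type mismatch rather than a value conversion: the intrinsics take pointers to exact fixed-width element types (vld1q_u8 wants uint8_t const*), while the templates accept any T of matching size and signedness. On ARM, plain char is typically unsigned, so T = char can select the unsigned one-byte overload even though char* and uint8_t* (that is, unsigned char*) remain distinct pointer types, which is the likely source of the warning; the stores are analogous. A minimal sketch (load_chars is an illustrative name, not xsimd API):

#include <arm_neon.h>
#include <cstdint>

uint8x16_t load_chars(const char* src)
{
    // return vld1q_u8(src);              // warns: char* is not uint8_t*
    return vld1q_u8((const uint8_t*)src); // explicit cast, as in the commit
}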
