Test and fill holes in the xsimd scalar API
Notable changes:
- ensure parity between the xsimd::* vector and scalar versions
- fix an argument-order bug in the bitwise_andnot implementation on Intel (see the sketch below)
- fix cast warnings in load_aligned and store_aligned on ARM architectures
- fix ambiguous overloads between some scalar and batch versions
- harmonize the xsimd::sincos API between the scalar and vector versions
- fix an incorrect xsimd::neq overload for complex batches
- remove polynomial evaluation from the public API

Minor changes:
- fix typos, indentation, etc.
- remove the legacy 'fallback_' prefix from test descriptions

Fix #784
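
The bitwise_andnot fix deserves a short illustration. Intel's _mm*_andnot_* intrinsics negate their first operand, computing ~a & b, while — as the corrected calls in the diff below suggest — xsimd::bitwise_andnot(x, y) is meant to compute x & ~y, so the operands must be swapped when forwarding to the intrinsic. A minimal scalar sketch of the two conventions (mm_andnot and xsimd_andnot are stand-in names, not real API):

#include <cassert>
#include <cstdint>

// xsimd convention (inferred from the fixed implementations): x & ~y
uint32_t xsimd_andnot(uint32_t x, uint32_t y) { return x & ~y; }

// Intel intrinsic convention: the FIRST operand is negated, i.e. ~a & b
uint32_t mm_andnot(uint32_t a, uint32_t b) { return ~a & b; }

int main()
{
    uint32_t x = 0xC, y = 0xA;
    assert(mm_andnot(x, y) != xsimd_andnot(x, y)); // forwarding verbatim: wrong result
    assert(mm_andnot(y, x) == xsimd_andnot(x, y)); // swapped, as in this commit: correct
}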
serge-sans-paille committed Jul 13, 2022
1 parent b77539b commit 8a5d2e1
Showing 14 changed files with 1,741 additions and 161 deletions.
100 changes: 58 additions & 42 deletions benchmark/xsimd_benchmark.hpp
@@ -49,8 +49,8 @@ namespace xsimd
for (size_t i = 0; i < size; ++i)
{
op0[i] = T(0.5) + std::sqrt(T(i)) * T(9.) / T(size);
-op1[i] = T(10.2) / T(i + 2) + T(0.25);
-op2[i] = T(20.1) / T(i + 5) + T(0.65);
+op1[i] = T(10.2) / T(i + 3) + T(0.25);
+op2[i] = T(20.1) / T(i + 2) + T(0.65);
}
}

@@ -425,36 +425,48 @@ namespace xsimd
out << "============================" << std::endl;
}

-#define DEFINE_OP_FUNCTOR_2OP(OP, NAME) \
-struct NAME##_fn \
-{ \
-template <class T> \
-inline T operator()(const T& lhs, const T& rhs) const { return lhs OP rhs; } \
-inline std::string name() const { return #NAME; } \
+#define DEFINE_OP_FUNCTOR_2OP(OP, NAME) \
+struct NAME##_fn \
+{ \
+template <class T> \
+inline T operator()(const T& lhs, const T& rhs) const \
+{ \
+return lhs OP rhs; \
+} \
+inline std::string name() const \
+{ \
+return #NAME; \
+} \
}

-#define DEFINE_FUNCTOR_1OP(FN) \
-struct FN##_fn \
-{ \
-template <class T> \
-inline T operator()(const T& x) const \
-{ \
-using xsimd::FN; \
-return FN(x); \
-} \
-inline std::string name() const { return #FN; } \
+#define DEFINE_FUNCTOR_1OP(FN) \
+struct FN##_fn \
+{ \
+template <class T> \
+inline T operator()(const T& x) const \
+{ \
+using xsimd::FN; \
+return FN(x); \
+} \
+inline std::string name() const \
+{ \
+return #FN; \
+} \
}

-#define DEFINE_FUNCTOR_1OP_TEMPLATE(FN, N, ...) \
-struct FN##_##N##_fn \
-{ \
-template <class T> \
-inline T operator()(const T& x) const \
-{ \
-using xsimd::FN; \
-return FN<T, __VA_ARGS__>(x); \
-} \
-inline std::string name() const { return #FN " " #N; } \
+#define DEFINE_FUNCTOR_1OP_TEMPLATE(NAME, FN, N, ...) \
+struct NAME##_##N##_fn \
+{ \
+template <class T> \
+inline T operator()(const T& x) const \
+{ \
+using xsimd::FN; \
+return FN<T, __VA_ARGS__>(x); \
+} \
+inline std::string name() const \
+{ \
+return #FN " " #N; \
+} \
}

#define DEFINE_FUNCTOR_2OP(FN) \
@@ -466,7 +478,10 @@ namespace xsimd
using xsimd::FN; \
return FN(lhs, rhs); \
} \
-inline std::string name() const { return #FN; } \
+inline std::string name() const \
+{ \
+return #FN; \
+} \
}

#define DEFINE_FUNCTOR_3OP(FN) \
@@ -478,7 +493,10 @@ namespace xsimd
using xsimd::FN; \
return FN(op0, op1, op2); \
} \
-inline std::string name() const { return #FN; } \
+inline std::string name() const \
+{ \
+return #FN; \
+} \
}

DEFINE_OP_FUNCTOR_2OP(+, add);
@@ -532,18 +550,16 @@ DEFINE_FUNCTOR_1OP(is_odd);
DEFINE_FUNCTOR_1OP(is_even);
#endif

-#ifdef XSIMD_POLY_BENCHMARKS
-DEFINE_FUNCTOR_1OP_TEMPLATE(horner, 5, 1, 2, 3, 4, 5);
-DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, 5, 1, 2, 3, 4, 5);
-DEFINE_FUNCTOR_1OP_TEMPLATE(horner, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
-DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
-DEFINE_FUNCTOR_1OP_TEMPLATE(horner, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12);
-DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12);
-DEFINE_FUNCTOR_1OP_TEMPLATE(horner, 14, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14);
-DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, 14, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14);
-DEFINE_FUNCTOR_1OP_TEMPLATE(horner, 16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
-DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, 16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
-#endif
+DEFINE_FUNCTOR_1OP_TEMPLATE(horner, kernel::horner, 5, 1, 2, 3, 4, 5);
+DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, kernel::estrin, 5, 1, 2, 3, 4, 5);
+DEFINE_FUNCTOR_1OP_TEMPLATE(horner, kernel::horner, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
+DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, kernel::estrin, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
+DEFINE_FUNCTOR_1OP_TEMPLATE(horner, kernel::horner, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12);
+DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, kernel::estrin, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12);
+DEFINE_FUNCTOR_1OP_TEMPLATE(horner, kernel::horner, 14, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14);
+DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, kernel::estrin, 14, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14);
+DEFINE_FUNCTOR_1OP_TEMPLATE(horner, kernel::horner, 16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, kernel::estrin, 16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);

}
#endif
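
For readers unfamiliar with the functor macros above, here is roughly what one invocation of the new three-argument DEFINE_FUNCTOR_1OP_TEMPLATE expands to. The extra NAME argument exists because FN is now a qualified name such as kernel::horner, which contains "::" and therefore cannot be token-pasted into the struct identifier. The stub signature below is a hypothetical stand-in for the real kernel::horner, and the expansion is a sketch, not a verbatim preprocessor dump:

#include <string>

// Stub standing in for the real xsimd::kernel::horner (assumed signature).
namespace xsimd { namespace kernel {
    template <class T, unsigned long long... Coefs>
    T horner(const T& x) noexcept { return x; } // placeholder body
}}

// Approximate expansion of
// DEFINE_FUNCTOR_1OP_TEMPLATE(horner, kernel::horner, 5, 1, 2, 3, 4, 5):
namespace xsimd
{
    struct horner_5_fn
    {
        template <class T>
        inline T operator()(const T& x) const
        {
            using xsimd::kernel::horner;
            return kernel::horner<T, 1, 2, 3, 4, 5>(x);
        }
        inline std::string name() const
        {
            return "kernel::horner" " " "5"; // what #FN " " #N stringizes to
        }
    };
}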
15 changes: 7 additions & 8 deletions include/xsimd/arch/generic/xsimd_generic_math.hpp
@@ -223,7 +223,7 @@ namespace xsimd
}

// copysign
-template <class A, class T>
+template <class A, class T, class _ = typename std::enable_if<std::is_floating_point<T>::value, void>::type>
inline batch<T, A> copysign(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
{
return abs(self) | bitofsign(other);
@@ -470,17 +470,16 @@ namespace xsimd
batch_type x = abs(self);
auto test0 = self < batch_type(0.);
batch_type r1(0.);
-auto test1 = x < batch_type(2.f / 3.f);
batch_type z = x / (batch_type(1.) + x);
-if (any(test1))
+if (any(3.f * x < 2.f))
{
r1 = detail::erf_kernel<batch_type>::erfc3(z);
-if (all(test1))
-return select(test0, batch_type(2.) - r1, r1);
}
-z -= batch_type(0.4f);
-batch_type r2 = exp(-x * x) * detail::erf_kernel<batch_type>::erfc2(z);
-r1 = select(test1, r1, r2);
+else
+{
+z -= batch_type(0.4f);
+r1 = exp(-x * x) * detail::erf_kernel<batch_type>::erfc2(z);
+}
#ifndef XSIMD_NO_INFINITIES
r1 = select(x == constants::infinity<batch_type>(), batch_type(0.), r1);
#endif
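The new enable_if parameter on the generic copysign above restricts that overload to floating-point element types. This is the standard cure for the "ambiguous overload" class of problem mentioned in the commit message: a SFINAE-constrained template silently drops out of overload resolution for non-matching types instead of tying with a competing candidate. A self-contained sketch of the pattern (pick is a made-up name, not xsimd API):

#include <iostream>
#include <type_traits>

template <class T,
          class _ = typename std::enable_if<std::is_floating_point<T>::value, void>::type>
const char* pick(T) { return "floating-point overload"; }

template <class T,
          class _ = typename std::enable_if<std::is_integral<T>::value, void>::type,
          class = void> // third parameter keeps the two templates distinct
const char* pick(T) { return "integral overload"; }

int main()
{
    std::cout << pick(1.0) << '\n'; // floating-point overload
    std::cout << pick(1) << '\n';   // integral overload
}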
8 changes: 4 additions & 4 deletions include/xsimd/arch/xsimd_avx.hpp
@@ -204,23 +204,23 @@ namespace xsimd
template <class A>
inline batch<float, A> bitwise_andnot(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
{
-return _mm256_andnot_ps(self, other);
+return _mm256_andnot_ps(other, self);
}
template <class A>
inline batch<double, A> bitwise_andnot(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
{
-return _mm256_andnot_pd(self, other);
+return _mm256_andnot_pd(other, self);
}

template <class A>
inline batch_bool<float, A> bitwise_andnot(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
{
-return _mm256_andnot_ps(self, other);
+return _mm256_andnot_ps(other, self);
}
template <class A>
inline batch_bool<double, A> bitwise_andnot(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
{
-return _mm256_andnot_pd(self, other);
+return _mm256_andnot_pd(other, self);
}

template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
4 changes: 2 additions & 2 deletions include/xsimd/arch/xsimd_avx2.hpp
@@ -92,12 +92,12 @@ namespace xsimd
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
{
-return _mm256_andnot_si256(self, other);
+return _mm256_andnot_si256(other, self);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
{
-return _mm256_andnot_si256(self, other);
+return _mm256_andnot_si256(other, self);
}

// bitwise_not
4 changes: 2 additions & 2 deletions include/xsimd/arch/xsimd_avx512dq.hpp
@@ -37,12 +37,12 @@ namespace xsimd
template <class A>
inline batch<float, A> bitwise_andnot(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
{
-return _mm512_andnot_ps(self, other);
+return _mm512_andnot_ps(other, self);
}
template <class A>
inline batch<double, A> bitwise_andnot(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
{
-return _mm512_andnot_pd(self, other);
+return _mm512_andnot_pd(other, self);
}

// bitwise_or
10 changes: 7 additions & 3 deletions include/xsimd/arch/xsimd_avx512f.hpp
@@ -336,7 +336,11 @@ namespace xsimd
template <class A>
inline batch<float, A> bitwise_and(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
{
+#if defined(_MSC_VER)
+return _mm512_and_ps(self, other);
+#else
return _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(self), _mm512_castps_si512(other)));
+#endif
}
template <class A>
inline batch<double, A> bitwise_and(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
@@ -361,18 +365,18 @@ namespace xsimd
template <class A>
inline batch<float, A> bitwise_andnot(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
{
-return _mm512_castsi512_ps(_mm512_andnot_si512(_mm512_castps_si512(self), _mm512_castps_si512(other)));
+return _mm512_castsi512_ps(_mm512_andnot_si512(_mm512_castps_si512(other), _mm512_castps_si512(self)));
}
template <class A>
inline batch<double, A> bitwise_andnot(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
{
-return _mm512_castsi512_pd(_mm512_andnot_si512(_mm512_castpd_si512(self), _mm512_castpd_si512(other)));
+return _mm512_castsi512_pd(_mm512_andnot_si512(_mm512_castpd_si512(other), _mm512_castpd_si512(self)));
}

template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
{
-return _mm512_andnot_si512(self, other);
+return _mm512_andnot_si512(other, self);
}

template <class A, class T>
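A note on the bitwise_and change above: _mm512_and_ps belongs to the AVX512DQ extension rather than baseline AVX512F, so the portable path reinterprets the float vectors as integers, ANDs them with the AVX512F intrinsic _mm512_and_si512, and reinterprets the result back. The casts are free bit-pattern reinterpretations, not value conversions, and the _MSC_VER branch presumably accommodates a difference in MSVC's intrinsic coverage that the commit does not spell out; both branches compute the same bit-level AND. A scalar model of what the cast path computes:

#include <cassert>
#include <cstdint>
#include <cstring>

float bitwise_and_model(float a, float b)
{
    uint32_t ia, ib;
    std::memcpy(&ia, &a, sizeof ia); // _mm512_castps_si512 analogue
    std::memcpy(&ib, &b, sizeof ib);
    uint32_t ir = ia & ib;           // _mm512_and_si512 analogue
    float r;
    std::memcpy(&r, &ir, sizeof r);  // _mm512_castsi512_ps analogue
    return r;
}

int main()
{
    // ANDing with an all-ones mask leaves the value unchanged;
    // ANDing with +0.0f (all-zero bits) yields +0.0f.
    uint32_t m = 0xFFFFFFFFu;
    float ones;
    std::memcpy(&ones, &m, sizeof ones);
    assert(bitwise_and_model(3.5f, ones) == 3.5f);
    assert(bitwise_and_model(3.5f, 0.0f) == 0.0f);
}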
32 changes: 16 additions & 16 deletions include/xsimd/arch/xsimd_neon.hpp
@@ -455,44 +455,44 @@ namespace xsimd
template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
-return vld1q_u8(src);
+return vld1q_u8((uint8_t*)src);
}

template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
-return vld1q_s8(src);
+return vld1q_s8((int8_t*)src);
}

template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
-return vld1q_u16(src);
+return vld1q_u16((uint16_t*)src);
}
template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
-return vld1q_s16(src);
+return vld1q_s16((int16_t*)src);
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
-return vld1q_u32(src);
+return vld1q_u32((uint32_t*)src);
}
template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
-return vld1q_s32(src);
+return vld1q_s32((int32_t*)src);
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
-return vld1q_u64(src);
+return vld1q_u64((uint64_t*)src);
}
template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
-return vld1q_s64(src);
+return vld1q_s64((int64_t*)src);
}

template <class A>
@@ -514,49 +514,49 @@ namespace xsimd
template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
{
-vst1q_u8(dst, src);
+vst1q_u8((uint8_t*)dst, src);
}

template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
{
-vst1q_s8(dst, src);
+vst1q_s8((int8_t*)dst, src);
}

template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
{
-vst1q_u16(dst, src);
+vst1q_u16((uint16_t*)dst, src);
}

template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
{
-vst1q_s16(dst, src);
+vst1q_s16((int16_t*)dst, src);
}

template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
{
-vst1q_u32(dst, src);
+vst1q_u32((uint32_t*)dst, src);
}

template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
{
-vst1q_s32(dst, src);
+vst1q_s32((int32_t*)dst, src);
}

template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
{
-vst1q_u64(dst, src);
+vst1q_u64((uint64_t*)dst, src);
}

template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
{
-vst1q_s64(dst, src);
+vst1q_s64((int64_t*)dst, src);
}

template <class A>
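The NEON load/store casts above address a pointer-type mismatch rather than a value conversion: the intrinsics take pointers to exact fixed-width element types (vld1q_u8 wants uint8_t const*), while the templates accept any T of matching size and signedness. On ARM, plain char is typically unsigned, so T = char can select the unsigned one-byte overload even though char* and uint8_t* (that is, unsigned char*) remain distinct pointer types, which is the likely source of the warning; the stores are analogous. A minimal sketch (load_chars is an illustrative name, not xsimd API):

#include <arm_neon.h>
#include <cstdint>

uint8x16_t load_chars(const char* src)
{
    // return vld1q_u8(src);              // warns: char* is not uint8_t*
    return vld1q_u8((const uint8_t*)src); // explicit cast, as in the commit
}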
