Commit c5c2101

Merge pull request #987 from xtensor-stack/feature/fix-version-values

Fix various problems with architecture version handling

JohanMabille committed Dec 7, 2023 (2 parents: 27ec4ff + 7941abf)

Showing 8 changed files with 93 additions and 109 deletions.
6 changes: 2 additions & 4 deletions .github/workflows/emscripten.yml
@@ -6,7 +6,7 @@ concurrency:
jobs:
test:
runs-on: ubuntu-latest

steps:
- name: Checkout
uses: actions/checkout@v3
@@ -19,11 +19,9 @@ jobs:
python
init-shell: bash

-
-
- name: Build script
shell: bash -el {0}
run: |
echo "Build script for wasm"
playwright install
-./test/test_wasm/test_wasm.sh
+./test/test_wasm/test_wasm.sh
129 changes: 64 additions & 65 deletions include/xsimd/arch/xsimd_wasm.hpp
@@ -380,7 +380,7 @@ namespace xsimd
template <class A>
inline batch_bool<float, A> eq(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<wasm>) noexcept
{
-return wasm_f32x4_eq(self, other);
+return wasm_i32x4_eq(self, other);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
@@ -440,7 +440,7 @@ namespace xsimd
template <class A>
inline batch_bool<double, A> eq(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<wasm>) noexcept
{
-return wasm_f64x2_eq(self, other);
+return wasm_i64x2_eq(self, other);
}
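
A note on the two eq fixes above (my reading; the commit message doesn't spell it out): batch_bool lanes are stored as all-ones/all-zeros bit patterns, and the all-ones pattern reinterpreted as a float or double is a NaN. Since NaN compares unequal to itself, wasm_f32x4_eq/wasm_f64x2_eq would report two identical true lanes as different; comparing the raw bits with the integer intrinsics is exact. A standalone scalar illustration of the pitfall:

#include <cassert>
#include <cstdint>
#include <cstring>

int main()
{
    // A batch_bool "true" lane is the all-ones pattern.
    std::uint32_t lane_bits = 0xFFFFFFFFu;
    float as_float;
    std::memcpy(&as_float, &lane_bits, sizeof as_float);

    assert(!(as_float == as_float)); // all-ones is a NaN: NaN != NaN
    assert(lane_bits == lane_bits);  // the raw bit pattern compares equal
}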

// fast_cast
@@ -579,6 +579,30 @@ namespace xsimd
0xFFFFFF00,
0xFFFFFFFF,
};
+alignas(A::alignment()) static const uint32_t lut16[][4] = {
+{ 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
+{ 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 },
+{ 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 },
+{ 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 },
+{ 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000 },
+{ 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 },
+{ 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 },
+{ 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 },
+{ 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF },
+{ 0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF },
+{ 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF },
+{ 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF },
+{ 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF },
+{ 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF },
+{ 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },
+{ 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },
+};
+alignas(A::alignment()) static const uint64_t lut8[][2] = {
+{ 0x0000000000000000ul, 0x0000000000000000ul },
+{ 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul },
+{ 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul },
+{ 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul },
+};
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
assert(!(mask & ~0xFFFF) && "inbound mask");
@@ -587,15 +611,17 @@
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
assert(!(mask & ~0xFF) && "inbound mask");
-return wasm_i64x2_make(lut64[mask >> 4], lut64[mask & 0xF]);
+return wasm_i64x2_make(lut64[mask & 0xF], lut64[mask >> 4]);
}
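
Lane-order note on the lut64 fix above (a hedged reading of the intrinsic): wasm_i64x2_make(a, b) places a in lane 0, so the low nibble of the mask has to fill the low 64-bit lane, i.e. the low four 16-bit lanes:

// mask = 0x0F (lanes 0-3 true, 4-7 false):
//   old: wasm_i64x2_make(lut64[0x0], lut64[0xF])  -> true lanes land high (wrong)
//   new: wasm_i64x2_make(lut64[0xF], lut64[0x0])  -> lanes 0-3 all-ones (right)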
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
-return batch_bool_cast<T>(from_mask(batch_bool<float, A> {}, mask, wasm {}));
+assert(!(mask & ~0xFul) && "inbound mask");
+return wasm_v128_load((const v128_t*)lut16[mask]);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
{
-return batch_bool_cast<T>(from_mask(batch_bool<double, A> {}, mask, wasm {}));
+assert(!(mask & ~0x3ul) && "inbound mask");
+return wasm_v128_load((const v128_t*)lut8[mask]);
}
}
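
The 32- and 64-bit branches previously round-tripped through batch_bool_cast and the float/double overloads; loading straight from a per-mask lookup row is a single aligned v128 load. A usage sketch of the entry point this backend implements (assuming xsimd's public static batch_bool<T, A>::from_mask(uint64_t), with wasm as the active architecture):

#include "xsimd/xsimd.hpp"

// bit i of the mask drives lane i: 0b1010 selects lanes 1 and 3,
// i.e. the lut16[10] row above.
auto mb = xsimd::batch_bool<int32_t, xsimd::wasm>::from_mask(0b1010u);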

@@ -1114,44 +1140,6 @@
return wasm_f64x2_extract_lane(tmp2, 0);
}

-// reduce_max
-template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
-inline T reduce_max(batch<T, A> const& self, requires_arch<wasm>) noexcept
-{
-batch<T, A> step0 = wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 2, 3, 0, 0);
-batch<T, A> acc0 = max(self, step0);
-
-batch<T, A> step1 = wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 1, 0, 0, 0);
-batch<T, A> acc1 = max(acc0, step1);
-
-batch<T, A> step2 = wasm_i16x8_shuffle(acc1, wasm_i16x8_splat(0), 1, 0, 0, 0, 4, 5, 6, 7);
-batch<T, A> acc2 = max(acc1, step2);
-if (sizeof(T) == 2)
-return acc2.get(0);
-batch<T, A> step3 = bitwise_cast<T>(bitwise_cast<uint16_t>(acc2) >> 8);
-batch<T, A> acc3 = max(acc2, step3);
-return acc3.get(0);
-}
-
-// reduce_min
-template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
-inline T reduce_min(batch<T, A> const& self, requires_arch<wasm>) noexcept
-{
-batch<T, A> step0 = wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 2, 3, 0, 0);
-batch<T, A> acc0 = min(self, step0);
-
-batch<T, A> step1 = wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 1, 0, 0, 0);
-batch<T, A> acc1 = min(acc0, step1);
-
-batch<T, A> step2 = wasm_i16x8_shuffle(acc1, wasm_i16x8_splat(0), 1, 0, 0, 0, 4, 5, 6, 7);
-batch<T, A> acc2 = min(acc1, step2);
-if (sizeof(T) == 2)
-return acc2.get(0);
-batch<T, A> step3 = bitwise_cast<T>(bitwise_cast<uint16_t>(acc2) >> 8);
-batch<T, A> acc3 = min(acc2, step3);
-return acc3.get(0);
-}
-
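Context for the block deleted above (a reading of the diff, not stated in the commit): with the wasm-specific overloads gone, 8- and 16-bit reduce_max/reduce_min resolve to xsimd's generic fallback, so call sites are unaffected:

// Public behavior sketch; the entry point is unchanged.
xsimd::batch<int16_t, xsimd::wasm> v(1, 9, 3, 4, 5, 6, 7, 8);
int16_t hi = xsimd::reduce_max(v); // 9, now served by the generic path
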
// rsqrt
template <class A>
inline batch<float, A> rsqrt(batch<float, A> const& self, requires_arch<wasm>) noexcept
@@ -1259,29 +1247,15 @@

// shuffle
template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3>
-inline batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<batch<ITy, A>, I0, I1, I2, I3> mask, requires_arch<wasm>) noexcept
+inline batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<batch<ITy, A>, I0, I1, I2, I3>, requires_arch<wasm>) noexcept
{
-// shuffle within lane
-if (I0 < 4 && I1 < 4 && I2 >= 4 && I3 >= 4)
-return wasm_i32x4_shuffle(x, y, I0, I1, I2, I3);
-
-// shuffle within opposite lane
-if (I0 >= 4 && I1 >= 4 && I2 < 4 && I3 < 4)
-return wasm_i32x4_shuffle(y, x, I0, I1, I2, I3);
-return shuffle(x, y, mask, generic {});
+return wasm_i32x4_shuffle(x, y, I0, I1, I2, I3);
}

template <class A, class ITy, ITy I0, ITy I1>
-inline batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<batch<ITy, A>, I0, I1> mask, requires_arch<wasm>) noexcept
+inline batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<batch<ITy, A>, I0, I1>, requires_arch<wasm>) noexcept
{
-// shuffle within lane
-if (I0 < 2 && I1 >= 2)
-return wasm_i64x2_shuffle(x, y, I0, I1);
-
-// shuffle within opposite lane
-if (I0 >= 2 && I1 < 2)
-return wasm_i64x2_shuffle(y, x, I0, I1);
-return shuffle(x, y, mask, generic {});
+return wasm_i64x2_shuffle(x, y, I0, I1);
}
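
Both shuffle simplifications rest on the same fact: the wasm shuffle intrinsics take compile-time indices spanning both operands (0-3 from x and 4-7 from y for i32x4; 0-1 and 2-3 for i64x2), so the within-lane special cases and the generic fallback collapse into one unconditional instruction. A sketch of a mixed pattern the old dispatch punted to the generic path:

#include <wasm_simd128.h>

v128_t interleave_lo(v128_t x, v128_t y)
{
    // 0 and 1 select from x, 4 and 5 from y; no special-casing needed.
    return wasm_i32x4_shuffle(x, y, 0, 4, 1, 5);
}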

// set
@@ -1500,7 +1474,6 @@
}

// swizzle
-
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3>, requires_arch<wasm>) noexcept
{
@@ -1516,7 +1489,7 @@
template <class A, uint64_t V0, uint64_t V1>
inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1>, requires_arch<wasm>) noexcept
{
-return wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1);
+return wasm_i64x2_shuffle(self, self, V0, V1);
}

template <class A, uint64_t V0, uint64_t V1>
@@ -1528,7 +1501,7 @@
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3>, requires_arch<wasm>) noexcept
{
-return wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), V0, V1, V2, V3);
+return wasm_i32x4_shuffle(self, self, V0, V1, V2, V3);
}
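
The two swizzle fixes above say the same thing at the element's own width: a swizzle is a shuffle of self with itself. The old uint64 version spelled a 64-bit swizzle as pairs of 32-bit indices against a zero splat that was never actually selected; as I read the two intrinsics, the dedicated i64x2 form is equivalent and direct:

// Equivalent lane selections for V0, V1 in {0, 1}:
//   wasm_i32x4_shuffle(v, v, 2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1)
//   wasm_i64x2_shuffle(v, v, V0, V1)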

template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
@@ -1537,6 +1510,32 @@
return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, wasm {}));
}

+template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
+inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<wasm>) noexcept
+{
+return wasm_i16x8_shuffle(self, self, V0, V1, V2, V3, V4, V5, V6, V7);
+}
+
+template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
+inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<wasm>) noexcept
+{
+return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), mask, wasm {}));
+}
+
+template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
+uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
+inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15>, requires_arch<wasm>) noexcept
+{
+return wasm_i8x16_shuffle(self, self, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15);
+}
+
+template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
+uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
+inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<wasm>) noexcept
+{
+return bitwise_cast<int8_t>(swizzle(bitwise_cast<uint8_t>(self), mask, wasm {}));
+}
+
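A usage sketch for the new 16-bit overload added above (hedged: spelled against the batch_constant form this header already uses, with wasm as the active architecture):

#include "xsimd/xsimd.hpp"

using A = xsimd::wasm;
// Reverse the eight 16-bit lanes.
using reverse_t = xsimd::batch_constant<xsimd::batch<uint16_t, A>, 7, 6, 5, 4, 3, 2, 1, 0>;

xsimd::batch<uint16_t, A> reverse(xsimd::batch<uint16_t, A> v)
{
    return xsimd::swizzle(v, reverse_t {});
}
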
// trunc
template <class A>
inline batch<float, A> trunc(batch<float, A> const& self, requires_arch<wasm>) noexcept
@@ -1625,4 +1624,4 @@
}
}

-#endif
+#endif
18 changes: 9 additions & 9 deletions include/xsimd/config/xsimd_arch.hpp
@@ -57,22 +57,22 @@ namespace xsimd
{
};

-template <class... Archs>
+template <unsigned... Vals>
struct is_sorted;

template <>
struct is_sorted<> : std::true_type
{
};

-template <class Arch>
-struct is_sorted<Arch> : std::true_type
+template <unsigned Val>
+struct is_sorted<Val> : std::true_type
{
};

-template <class A0, class A1, class... Archs>
-struct is_sorted<A0, A1, Archs...>
-: std::conditional<(A0::version() >= A1::version()), is_sorted<Archs...>,
+template <unsigned V0, unsigned V1, unsigned... Vals>
+struct is_sorted<V0, V1, Vals...>
+: std::conditional<(V0 >= V1), is_sorted<V1, Vals...>,
std::false_type>::type
{
};
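
Two fixes land in this is_sorted hunk: the predicate now works on unsigned version values rather than arch types (so the static_assert in the next hunk can expand Archs::version()...), and the recursion keeps V1, where the old is_sorted<Archs...> dropped it and never compared A1 against A2. A standalone restatement to show the difference (same templates, outside the library):

#include <type_traits>

template <unsigned... Vals>
struct is_sorted;
template <>
struct is_sorted<> : std::true_type {};
template <unsigned Val>
struct is_sorted<Val> : std::true_type {};
template <unsigned V0, unsigned V1, unsigned... Vals>
struct is_sorted<V0, V1, Vals...>
    : std::conditional<(V0 >= V1), is_sorted<V1, Vals...>, std::false_type>::type {};

// The old recursion checked 5 >= 3, then recursed on <4> alone and accepted
// the list; keeping V1 makes it check 3 >= 4 and reject it.
static_assert(is_sorted<5, 4, 4, 1>::value, "descending with ties is sorted");
static_assert(!is_sorted<5, 3, 4>::value, "3 < 4 breaks the descending order");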
@@ -111,7 +111,7 @@ namespace xsimd
struct arch_list
{
#ifndef NDEBUG
-static_assert(detail::is_sorted<Archs...>::value,
+static_assert(detail::is_sorted<Archs::version()...>::value,
"architecture list must be sorted by version");
#endif

@@ -190,13 +190,13 @@ namespace xsimd
struct unsupported
{
};
-using all_x86_architectures = arch_list<avx512bw, avx512dq, avx512cd, avx512f, fma3<avx2>, avx2, fma3<avx>, avx, fma4, fma3<sse4_2>, sse4_2, sse4_1, /*sse4a,*/ ssse3, sse3, sse2>;
+using all_x86_architectures = arch_list<avx512vnni, avx512vbmi, avx512ifma, avx512pf, avx512bw, avx512er, avx512dq, avx512cd, avx512f, avxvnni, fma3<avx2>, avx2, fma3<avx>, avx, fma4, fma3<sse4_2>, sse4_2, sse4_1, /*sse4a,*/ ssse3, sse3, sse2>;
using all_sve_architectures = arch_list<detail::sve<512>, detail::sve<256>, detail::sve<128>>;
using all_rvv_architectures = arch_list<detail::rvv<512>, detail::rvv<256>, detail::rvv<128>>;
using all_arm_architectures = typename detail::join<all_sve_architectures, arch_list<neon64, neon>>::type;
using all_riscv_architectures = all_rvv_architectures;
using all_wasm_architectures = arch_list<wasm>;
-using all_architectures = typename detail::join<all_arm_architectures, all_x86_architectures, all_riscv_architectures, all_wasm_architectures>::type;
+using all_architectures = typename detail::join<all_riscv_architectures, all_wasm_architectures, all_arm_architectures, all_x86_architectures>::type;

using supported_architectures = typename detail::supported<all_architectures>::type;

8 changes: 4 additions & 4 deletions include/xsimd/types/xsimd_avx512ifma_register.hpp
@@ -12,7 +12,7 @@
#ifndef XSIMD_AVX512IFMA_REGISTER_HPP
#define XSIMD_AVX512IFMA_REGISTER_HPP

#include "./xsimd_avx512dq_register.hpp"
#include "./xsimd_avx512bw_register.hpp"

namespace xsimd
{
@@ -22,11 +22,11 @@ namespace xsimd
*
* AVX512IFMA instructions
*/
-struct avx512ifma : avx512dq
+struct avx512ifma : avx512bw
{
static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512IFMA; }
static constexpr bool available() noexcept { return true; }
-static constexpr unsigned version() noexcept { return generic::version(3, 4, 0); }
+static constexpr unsigned version() noexcept { return generic::version(3, 5, 0); }
static constexpr char const* name() noexcept { return "avx512ifma"; }
};

Expand All @@ -40,7 +40,7 @@ namespace xsimd
using type = simd_avx512_bool_register<T>;
};

-XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512ifma, avx512dq);
+XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512ifma, avx512bw);

}
#endif
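
Why the bumps (a hedged reconstruction): avx512ifma now derives from avx512bw, so its version must exceed bw's (3, 4, 0), and the vbmi and vnni bumps in the next two files keep the chain strictly increasing so the descending arch_list order survives the is_sorted check. A sketch assuming xsimd's usual version encoding (major * 10000 + minor * 100 + patch; treat the exact formula as an assumption):

constexpr unsigned version(unsigned major, unsigned minor, unsigned patch)
{
    return major * 10000u + minor * 100u + patch;
}

static_assert(version(3, 4, 0) < version(3, 5, 0), "avx512bw < avx512ifma");
static_assert(version(3, 5, 0) < version(3, 6, 0), "avx512ifma < avx512vbmi");
static_assert(version(3, 6, 0) < version(3, 7, 0), "avx512vbmi < avx512vnni");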
2 changes: 1 addition & 1 deletion include/xsimd/types/xsimd_avx512vbmi_register.hpp
@@ -26,7 +26,7 @@ namespace xsimd
{
static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512VBMI; }
static constexpr bool available() noexcept { return true; }
-static constexpr unsigned version() noexcept { return generic::version(3, 5, 0); }
+static constexpr unsigned version() noexcept { return generic::version(3, 6, 0); }
static constexpr char const* name() noexcept { return "avx512vbmi"; }
};

2 changes: 1 addition & 1 deletion include/xsimd/types/xsimd_avx512vnni_register.hpp
@@ -26,7 +26,7 @@ namespace xsimd
{
static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512VNNI; }
static constexpr bool available() noexcept { return true; }
-static constexpr unsigned version() noexcept { return generic::version(3, 6, 0); }
+static constexpr unsigned version() noexcept { return generic::version(3, 7, 0); }
static constexpr char const* name() noexcept { return "avx512vnni"; }
};

2 changes: 1 addition & 1 deletion include/xsimd/types/xsimd_sve_register.hpp
@@ -36,7 +36,7 @@ namespace xsimd
static constexpr bool available() noexcept { return true; }
static constexpr bool requires_alignment() noexcept { return true; }
static constexpr std::size_t alignment() noexcept { return 16; }
-static constexpr unsigned version() noexcept { return generic::version(9, 0, 0); }
+static constexpr unsigned version() noexcept { return generic::version(9, Width / 32, 0); }
static constexpr char const* name() noexcept { return "arm64+sve"; }
};
}
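
The SVE change folds the register width into the version so the three entries of all_sve_architectures (sve<512>, sve<256>, sve<128>) order strictly instead of all reporting (9, 0, 0). Under the same assumed encoding as above:

// sve<512> -> version(9, 512 / 32, 0) = version(9, 16, 0) = 91600
// sve<256> -> version(9, 256 / 32, 0) = version(9,  8, 0) = 90800
// sve<128> -> version(9, 128 / 32, 0) = version(9,  4, 0) = 90400
static_assert(512 / 32 == 16 && 256 / 32 == 8 && 128 / 32 == 4, "Width / 32");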