diff --git a/include/xsimd/arch/common/xsimd_common_cast.hpp b/include/xsimd/arch/common/xsimd_common_cast.hpp index 1226c887c..b64613dfb 100644 --- a/include/xsimd/arch/common/xsimd_common_cast.hpp +++ b/include/xsimd/arch/common/xsimd_common_cast.hpp @@ -12,7 +12,10 @@ #ifndef XSIMD_COMMON_CAST_HPP #define XSIMD_COMMON_CAST_HPP -#include "../../types/xsimd_traits.hpp" +#include + +#include "../../config/xsimd_macros.hpp" +#include "../../utils/xsimd_type_traits.hpp" namespace xsimd { diff --git a/include/xsimd/arch/xsimd_neon.hpp b/include/xsimd/arch/xsimd_neon.hpp index 4af19a650..292dab7c3 100644 --- a/include/xsimd/arch/xsimd_neon.hpp +++ b/include/xsimd/arch/xsimd_neon.hpp @@ -21,6 +21,7 @@ #include "../types/xsimd_neon_register.hpp" #include "../types/xsimd_utils.hpp" +#include "../utils/xsimd_type_traits.hpp" #include "./common/xsimd_common_bit.hpp" #include "./common/xsimd_common_cast.hpp" diff --git a/include/xsimd/arch/xsimd_rvv.hpp b/include/xsimd/arch/xsimd_rvv.hpp index 73183a086..6d1eae59d 100644 --- a/include/xsimd/arch/xsimd_rvv.hpp +++ b/include/xsimd/arch/xsimd_rvv.hpp @@ -16,6 +16,7 @@ #include "../config/xsimd_macros.hpp" #include "../types/xsimd_batch_constant.hpp" #include "../types/xsimd_rvv_register.hpp" +#include "../utils/xsimd_type_traits.hpp" #include "./xsimd_constants.hpp" // This set of macros allows the synthesis of identifiers using a template and @@ -86,32 +87,32 @@ // for the function signature argument(s) to XSIMD_RVV_OVERLOAD. That signature can // also reference the template argument T, because it's a text substitution // into the template. -#define XSIMD_RVV_WRAPPER_HEAD(NAME, SIGNATURE, ...) \ - namespace NAME##_cruft \ - { \ - template \ - struct ctx \ - { \ - static constexpr size_t width = XSIMD_RVV_BITS; \ - static constexpr size_t vl = width / (sizeof(T) * 8); \ - using vec = rvv_reg_t; \ - using uvec = rvv_reg_t, width>; \ - using svec = rvv_reg_t, width>; \ - using fvec = rvv_reg_t, width>; \ - using bvec = rvv_bool_t; \ - using scalar_vec = rvv_reg_t; \ - using wide_vec = rvv_reg_t; \ - using narrow_vec = rvv_reg_t; \ - using type = SIGNATURE; \ - }; \ - template \ - using sig_t = typename ctx::type; \ - template \ - struct impl \ - { \ - void operator()() const noexcept {}; \ - }; \ - template \ +#define XSIMD_RVV_WRAPPER_HEAD(NAME, SIGNATURE, ...) \ + namespace NAME##_cruft \ + { \ + template \ + struct ctx \ + { \ + static constexpr size_t width = XSIMD_RVV_BITS; \ + static constexpr size_t vl = width / (sizeof(T) * 8); \ + using vec = rvv_reg_t; \ + using uvec = rvv_reg_t, width>; \ + using svec = rvv_reg_t, width>; \ + using fvec = rvv_reg_t, width>; \ + using bvec = rvv_bool_t; \ + using scalar_vec = rvv_reg_t; \ + using wide_vec = rvv_reg_t; \ + using narrow_vec = rvv_reg_t; \ + using type = SIGNATURE; \ + }; \ + template \ + using sig_t = typename ctx::type; \ + template \ + struct impl \ + { \ + void operator()() const noexcept {}; \ + }; \ + template \ using impl_t = impl>; #define XSIMD_RVV_WRAPPER_HEAD_NOVL(...) 
XSIMD_RVV_WRAPPER_HEAD(__VA_ARGS__) @@ -294,57 +295,12 @@ namespace xsimd template using rvv_bool_t = types::detail::rvv_bool_t; - template - struct as_signed_relaxed; - template <> - struct as_signed_relaxed<1> - { - using type = int8_t; - }; - template <> - struct as_signed_relaxed<2> - { - using type = int16_t; - }; - template <> - struct as_signed_relaxed<4> - { - using type = int32_t; - }; - template <> - struct as_signed_relaxed<8> - { - using type = int64_t; - }; - template - using as_signed_relaxed_t = typename as_signed_relaxed::type; - template - struct as_unsigned_relaxed; - template <> - struct as_unsigned_relaxed<1> + template + struct as_float_relaxed { - using type = uint8_t; + using type = xsimd::sized_fp_t; }; template <> - struct as_unsigned_relaxed<2> - { - using type = uint16_t; - }; - template <> - struct as_unsigned_relaxed<4> - { - using type = uint32_t; - }; - template <> - struct as_unsigned_relaxed<8> - { - using type = uint64_t; - }; - template - using as_unsigned_relaxed_t = typename as_unsigned_relaxed::type; - template - struct as_float_relaxed; - template <> struct as_float_relaxed<1> { using type = int8_t; @@ -354,16 +310,6 @@ namespace xsimd { using type = int16_t; }; - template <> - struct as_float_relaxed<4> - { - using type = float; - }; - template <> - struct as_float_relaxed<8> - { - using type = double; - }; template using as_float_relaxed_t = typename as_float_relaxed::type; diff --git a/include/xsimd/arch/xsimd_sve.hpp b/include/xsimd/arch/xsimd_sve.hpp index 5be471e93..6636f4c37 100644 --- a/include/xsimd/arch/xsimd_sve.hpp +++ b/include/xsimd/arch/xsimd_sve.hpp @@ -16,15 +16,8 @@ #include #include -#include "../config/xsimd_macros.hpp" #include "../types/xsimd_sve_register.hpp" -// Define a inline namespace with the explicit SVE vector size to avoid ODR violation -// When dynamically dispatching between different SVE sizes. -// While most code is safe from ODR violation as the size is already encoded in the -// register (and hence batch) types, utilities can quickly fall prone to this issue. 
-#define XSIMD_SVE_NAMESPACE XSIMD_CONCAT(sve, XSIMD_SVE_BITS) - namespace xsimd { template @@ -32,1243 +25,1170 @@ namespace xsimd namespace kernel { - inline namespace XSIMD_SVE_NAMESPACE + namespace detail { - namespace detail_sve - { - using xsimd::index; - using xsimd::types::detail::sve_vector_type; - - // predicate creation - XSIMD_INLINE svbool_t sve_ptrue_impl(index<1>) noexcept { return svptrue_b8(); } - XSIMD_INLINE svbool_t sve_ptrue_impl(index<2>) noexcept { return svptrue_b16(); } - XSIMD_INLINE svbool_t sve_ptrue_impl(index<4>) noexcept { return svptrue_b32(); } - XSIMD_INLINE svbool_t sve_ptrue_impl(index<8>) noexcept { return svptrue_b64(); } - - template - XSIMD_INLINE svbool_t sve_ptrue() noexcept { return sve_ptrue_impl(index {}); } - - // predicate loading - template - XSIMD_INLINE svbool_t sve_pmask() noexcept { return svdupq_b64(M0, M1); } - template - XSIMD_INLINE svbool_t sve_pmask() noexcept { return svdupq_b32(M0, M1, M2, M3); } - template - XSIMD_INLINE svbool_t sve_pmask() noexcept { return svdupq_b16(M0, M1, M2, M3, M4, M5, M6, M7); } - template - XSIMD_INLINE svbool_t sve_pmask() noexcept { return svdupq_b8(M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, M10, M11, M12, M13, M14, M15); } - - // count active lanes in a predicate - XSIMD_INLINE uint64_t sve_pcount_impl(svbool_t p, index<1>) noexcept { return svcntp_b8(p, p); } - XSIMD_INLINE uint64_t sve_pcount_impl(svbool_t p, index<2>) noexcept { return svcntp_b16(p, p); } - XSIMD_INLINE uint64_t sve_pcount_impl(svbool_t p, index<4>) noexcept { return svcntp_b32(p, p); } - XSIMD_INLINE uint64_t sve_pcount_impl(svbool_t p, index<8>) noexcept { return svcntp_b64(p, p); } - - template - XSIMD_INLINE uint64_t sve_pcount(svbool_t p) noexcept { return sve_pcount_impl(p, index {}); } - - // enable for signed integers - template - using sve_enable_signed_int_t = std::enable_if_t::value && std::is_signed::value, int>; - - // enable for unsigned integers - template - using sve_enable_unsigned_int_t = std::enable_if_t::value && !std::is_signed::value, int>; - - // enable for floating points - template - using sve_enable_floating_point_t = std::enable_if_t::value, int>; - - // enable for signed integers or floating points - template - using sve_enable_signed_int_or_floating_point_t = std::enable_if_t::value, int>; - - // enable for all SVE supported types - template - using sve_enable_all_t = std::enable_if_t::value, int>; - - // Trait describing the SVE types that correspond to a scalar, - // parameterised by (byte size, signedness, floating-point-ness). - // - // `scalar` is the matching fixed-width scalar (int8_t, ..., float, - // double). SVE load/store intrinsics are overloaded on these - // pointer types, so remapping integers through `scalar` avoids - // platform quirks such as darwin arm64's `long` vs `long long` - // distinction and rejects `char` as an element type. - // - // `sizeless` is the matching sizeless SVE type. xsimd stores SVE - // vectors as fixed-size attributed types (arm_sve_vector_bits), - // which clang treats as implicitly convertible to every sizeless - // SVE type — including multi-vector tuples — making the overloaded - // svreinterpret_*/svsel/etc. intrinsics ambiguous. Static-casting - // to `sizeless` first collapses the overload set to the single - // 1-vector candidate. 
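// [Editor's illustrative aside, not part of the patch] A minimal sketch of the
// ambiguity described in the comment above, assuming clang with
// -msve-vector-bits=512; the helper name `bits_of` is hypothetical. The
// fixed-size typedef converts implicitly to svfloat32_t but also to the tuple
// types (svfloat32x2_t, ...), so an overloaded intrinsic sees several viable
// candidates; a static_cast to the single sizeless 1-vector type collapses
// the overload set before the call.
#include <arm_sve.h>

typedef svfloat32_t fixed_f32_t __attribute__((arm_sve_vector_bits(512)));

svuint32_t bits_of(fixed_f32_t v)
{
    // svreinterpret_u32(v) would be ambiguous here: v converts to
    // svfloat32_t, svfloat32x2_t, svfloat32x3_t, svfloat32x4_t, ...
    return svreinterpret_u32(static_cast<svfloat32_t>(v)); // one candidate
}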
- template - struct sve_type; - template <> - struct sve_type<1, true, false> - { - using scalar = int8_t; - using sizeless = svint8_t; - }; - template <> - struct sve_type<1, false, false> - { - using scalar = uint8_t; - using sizeless = svuint8_t; - }; - template <> - struct sve_type<2, true, false> - { - using scalar = int16_t; - using sizeless = svint16_t; - }; - template <> - struct sve_type<2, false, false> - { - using scalar = uint16_t; - using sizeless = svuint16_t; - }; - template <> - struct sve_type<4, true, false> - { - using scalar = int32_t; - using sizeless = svint32_t; - }; - template <> - struct sve_type<4, false, false> - { - using scalar = uint32_t; - using sizeless = svuint32_t; - }; - template <> - struct sve_type<8, true, false> - { - using scalar = int64_t; - using sizeless = svint64_t; - }; - template <> - struct sve_type<8, false, false> - { - using scalar = uint64_t; - using sizeless = svuint64_t; - }; - template <> - struct sve_type<4, true, true> - { - using scalar = float; - using sizeless = svfloat32_t; - }; - template <> - struct sve_type<8, true, true> - { - using scalar = double; - using sizeless = svfloat64_t; - }; + using xsimd::index; + using xsimd::types::detail::sve_vector_type; + + // predicate creation + XSIMD_INLINE svbool_t sve_ptrue_impl(index<1>) noexcept { return svptrue_b8(); } + XSIMD_INLINE svbool_t sve_ptrue_impl(index<2>) noexcept { return svptrue_b16(); } + XSIMD_INLINE svbool_t sve_ptrue_impl(index<4>) noexcept { return svptrue_b32(); } + XSIMD_INLINE svbool_t sve_ptrue_impl(index<8>) noexcept { return svptrue_b64(); } + + template + svbool_t sve_ptrue() noexcept { return sve_ptrue_impl(index {}); } + + // predicate loading + template + svbool_t sve_pmask() noexcept { return svdupq_b64(M0, M1); } + template + svbool_t sve_pmask() noexcept { return svdupq_b32(M0, M1, M2, M3); } + template + svbool_t sve_pmask() noexcept { return svdupq_b16(M0, M1, M2, M3, M4, M5, M6, M7); } + template + svbool_t sve_pmask() noexcept { return svdupq_b8(M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, M10, M11, M12, M13, M14, M15); } + + // count active lanes in a predicate + XSIMD_INLINE uint64_t sve_pcount_impl(svbool_t p, index<1>) noexcept { return svcntp_b8(p, p); } + XSIMD_INLINE uint64_t sve_pcount_impl(svbool_t p, index<2>) noexcept { return svcntp_b16(p, p); } + XSIMD_INLINE uint64_t sve_pcount_impl(svbool_t p, index<4>) noexcept { return svcntp_b32(p, p); } + XSIMD_INLINE uint64_t sve_pcount_impl(svbool_t p, index<8>) noexcept { return svcntp_b64(p, p); } + + template + XSIMD_INLINE uint64_t sve_pcount(svbool_t p) noexcept { return sve_pcount_impl(p, index {}); } + + // enable for signed integers + template + using sve_enable_signed_int_t = std::enable_if_t::value && std::is_signed::value, int>; + + // enable for unsigned integers + template + using sve_enable_unsigned_int_t = std::enable_if_t::value && !std::is_signed::value, int>; + + // enable for floating points + template + using sve_enable_floating_point_t = std::enable_if_t::value, int>; + + // enable for signed integers or floating points + template + using sve_enable_signed_int_or_floating_point_t = std::enable_if_t::value, int>; + + // enable for all SVE supported types + template + using sve_enable_all_t = std::enable_if_t::value, int>; + + // `sizeless` is the matching sizeless SVE type. 
xsimd stores SVE
+            // vectors as fixed-size attributed types (arm_sve_vector_bits),
+            // which clang treats as implicitly convertible to every sizeless
+            // SVE type — including multi-vector tuples — making the overloaded
+            // svreinterpret_*/svsel/etc. intrinsics ambiguous. Static-casting
+            // to `sizeless` first collapses the overload set to the single
+            // 1-vector candidate.
+            template <class T>
+            using sve_sizeless_t = xsimd::types::detail::sizeless_sve_vector_type<T>;
+
+            // Remap integer Ts to their matching fixed-width counterpart
+            // so svld1/svst1 see the pointer type their overload set expects;
+            // pass non-integer Ts through unchanged.
+            template <class T, bool = std::is_integral<std::decay_t<T>>::value>
+            struct sve_fix_integer_impl
+            {
+                using type = T;
+            };
+            template <class T>
+            struct sve_fix_integer_impl<T, true>
+            {
+                using type = std::conditional_t<std::is_signed<T>::value,
+                                                sized_int_t<sizeof(T)>, sized_uint_t<sizeof(T)>>;
+            };
+
+            // SVE load/store intrinsics are overloaded on these pointer types
+            // for integers, but some platforms use distinct underlying types,
+            // e.g. `long` vs `long long` or `char` vs `int8_t`.
+            // We remap the element type to avoid these mismatches.
+            template <class T>
+            using sve_fix_char_t = typename sve_fix_integer_impl<T>::type;
+        } // namespace detail
+
+        /*********
+         * Load *
+         *********/
+
+        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<sve>) noexcept
+        {
+            return svld1(detail::sve_ptrue<T>(), reinterpret_cast<detail::sve_fix_char_t<T> const*>(src));
+        }
-                template <class T>
-                using sve_type_for = sve_type<sizeof(T), std::is_signed<T>::value, std::is_floating_point<T>::value>;
+        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<sve>) noexcept
+        {
+            return load_aligned<A>(src, convert<T>(), sve {});
+        }
-                template <class T>
-                using sve_sizeless_t = typename sve_type_for<T>::sizeless;
+        // load_masked
+        template <class A, class T, bool... b, class Mode, detail::sve_enable_all_t<T> = 0>
+        XSIMD_INLINE batch<T, A> load_masked(T const* mem, batch_bool_constant<T, A, b...>, Mode, requires_arch<sve>) noexcept
+        {
+            return svld1(detail::sve_pmask<T, b...>(), reinterpret_cast<detail::sve_fix_char_t<T> const*>(mem));
+        }
-                // Remap integer Ts to their matching fixed-width counterpart (via
-                // sve_type::scalar) so svld1/svst1 see the pointer type their
-                // overload set expects; pass non-integer Ts through unchanged.
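// [Editor's illustrative aside, not part of the patch] What the remap buys,
// assuming this sits inside xsimd::kernel with <type_traits> available and
// sized_int_t/sized_uint_t coming from the new xsimd_type_traits.hpp header.
// `char` is a distinct type from both signed char and unsigned char, and on
// some platforms int64_t is `long long` while an overload set is declared
// with `long`; routing the element type through sve_fix_char_t lands on a
// fixed-width type the svld1/svst1 overload sets actually contain.
static_assert(std::is_same<detail::sve_fix_char_t<float>, float>::value,
              "non-integer types pass through unchanged");
static_assert(std::is_same<detail::sve_fix_char_t<unsigned long>,
                           sized_uint_t<sizeof(unsigned long)>>::value,
              "integers remap to their fixed-width equivalents");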
- template >::value> - struct sve_fix_integer_impl - { - using type = T; - }; - template - struct sve_fix_integer_impl - { - using type = typename sve_type_for>::scalar; - }; + // load_complex + template = 0> + XSIMD_INLINE batch, A> load_complex_aligned(std::complex const* mem, convert>, requires_arch) noexcept + { + const T* buf = reinterpret_cast(mem); + const auto tmp = svld2(detail::sve_ptrue(), buf); + const auto real = svget2(tmp, 0); + const auto imag = svget2(tmp, 1); + return batch, A> { real, imag }; + } + + template = 0> + XSIMD_INLINE batch, A> load_complex_unaligned(std::complex const* mem, convert>, requires_arch) noexcept + { + return load_complex_aligned(mem, convert> {}, sve {}); + } - template - using sve_fix_char_t = typename sve_fix_integer_impl::type; - } // namespace detail_sve + /********* + * Store * + *********/ - /********* - * Load * - *********/ + template = 0> + XSIMD_INLINE void store_aligned(T* dst, batch const& src, requires_arch) noexcept + { + svst1(detail::sve_ptrue(), reinterpret_cast*>(dst), src); + } - template = 0> - XSIMD_INLINE batch load_aligned(T const* src, convert, requires_arch) noexcept - { - return svld1(detail_sve::sve_ptrue(), reinterpret_cast const*>(src)); - } + template = 0> + XSIMD_INLINE void store_unaligned(T* dst, batch const& src, requires_arch) noexcept + { + store_aligned(dst, src, sve {}); + } - template = 0> - XSIMD_INLINE batch load_unaligned(T const* src, convert, requires_arch) noexcept - { - return load_aligned(src, convert(), sve {}); - } + // store_complex + template = 0> + XSIMD_INLINE void store_complex_aligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept + { + using v2type = std::conditional_t<(sizeof(T) == 4), svfloat32x2_t, svfloat64x2_t>; + v2type tmp {}; + tmp = svset2(tmp, 0, src.real()); + tmp = svset2(tmp, 1, src.imag()); + T* buf = reinterpret_cast(dst); + svst2(detail::sve_ptrue(), buf, tmp); + } + + template = 0> + XSIMD_INLINE void store_complex_unaligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept + { + store_complex_aligned(dst, src, sve {}); + } - // load_masked - template = 0> - XSIMD_INLINE batch load_masked(T const* mem, batch_bool_constant, Mode, requires_arch) noexcept - { - return svld1(detail_sve::sve_pmask(), reinterpret_cast const*>(mem)); - } + /****************** + * scatter/gather * + ******************/ - // load_complex - template = 0> - XSIMD_INLINE batch, A> load_complex_aligned(std::complex const* mem, convert>, requires_arch) noexcept - { - const T* buf = reinterpret_cast(mem); - const auto tmp = svld2(detail_sve::sve_ptrue(), buf); - const auto real = svget2(tmp, 0); - const auto imag = svget2(tmp, 1); - return batch, A> { real, imag }; - } + namespace detail + { + template + using sve_enable_sg_t = std::enable_if_t<(sizeof(T) == sizeof(U) && (sizeof(T) == 4 || sizeof(T) == 8)), int>; + } - template = 0> - XSIMD_INLINE batch, A> load_complex_unaligned(std::complex const* mem, convert>, requires_arch) noexcept - { - return load_complex_aligned(mem, convert> {}, sve {}); - } + // scatter + template = 0> + XSIMD_INLINE void scatter(batch const& src, T* dst, batch const& index, kernel::requires_arch) noexcept + { + svst1_scatter_index(detail::sve_ptrue(), dst, index.data, src.data); + } - /********* - * Store * - *********/ + // gather + template = 0> + XSIMD_INLINE batch gather(batch const&, T const* src, batch const& index, kernel::requires_arch) noexcept + { + return svld1_gather_index(detail::sve_ptrue(), src, index.data); + } - template = 0> - 
XSIMD_INLINE void store_aligned(T* dst, batch const& src, requires_arch) noexcept - { - svst1(detail_sve::sve_ptrue(), reinterpret_cast*>(dst), src); - } + /******************** + * Scalar to vector * + ********************/ - template = 0> - XSIMD_INLINE void store_unaligned(T* dst, batch const& src, requires_arch) noexcept - { - store_aligned(dst, src, sve {}); - } + // broadcast + template = 0> + XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept + { + return svdup_n_u8(uint8_t(arg)); + } - // store_complex - template = 0> - XSIMD_INLINE void store_complex_aligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept - { - using v2type = std::conditional_t<(sizeof(T) == 4), svfloat32x2_t, svfloat64x2_t>; - v2type tmp {}; - tmp = svset2(tmp, 0, src.real()); - tmp = svset2(tmp, 1, src.imag()); - T* buf = reinterpret_cast(dst); - svst2(detail_sve::sve_ptrue(), buf, tmp); - } + template = 0> + XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept + { + return svdup_n_s8(int8_t(arg)); + } - template = 0> - XSIMD_INLINE void store_complex_unaligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept - { - store_complex_aligned(dst, src, sve {}); - } + template = 0> + XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept + { + return svdup_n_u16(uint16_t(arg)); + } - /****************** - * scatter/gather * - ******************/ + template = 0> + XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept + { + return svdup_n_s16(int16_t(arg)); + } - namespace detail_sve - { - template - using sve_enable_sg_t = std::enable_if_t<(sizeof(T) == sizeof(U) && (sizeof(T) == 4 || sizeof(T) == 8)), int>; - } + template = 0> + XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept + { + return svdup_n_u32(uint32_t(arg)); + } - // scatter - template = 0> - XSIMD_INLINE void scatter(batch const& src, T* dst, batch const& index, kernel::requires_arch) noexcept - { - svst1_scatter_index(detail_sve::sve_ptrue(), dst, index.data, src.data); - } + template = 0> + XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept + { + return svdup_n_s32(int32_t(arg)); + } - // gather - template = 0> - XSIMD_INLINE batch gather(batch const&, T const* src, batch const& index, kernel::requires_arch) noexcept - { - return svld1_gather_index(detail_sve::sve_ptrue(), src, index.data); - } + template = 0> + XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept + { + return svdup_n_u64(uint64_t(arg)); + } - /******************** - * Scalar to vector * - ********************/ + template = 0> + XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept + { + return svdup_n_s64(int64_t(arg)); + } - // broadcast - template = 0> - XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept - { - return svdup_n_u8(uint8_t(arg)); - } + template + XSIMD_INLINE batch broadcast(float arg, requires_arch) noexcept + { + return svdup_n_f32(arg); + } - template = 0> - XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept - { - return svdup_n_s8(int8_t(arg)); - } + template + XSIMD_INLINE batch broadcast(double arg, requires_arch) noexcept + { + return svdup_n_f64(arg); + } - template = 0> - XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept - { - return svdup_n_u16(uint16_t(arg)); - } + template = 0> + XSIMD_INLINE batch broadcast(T val, requires_arch) noexcept + { + return broadcast(val, sve {}); + } - template = 0> - XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept - { - return svdup_n_s16(int16_t(arg)); - } + /************** + * Arithmetic * + 
**************/ - template = 0> - XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept - { - return svdup_n_u32(uint32_t(arg)); - } + // add + template = 0> + XSIMD_INLINE batch add(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svadd_x(detail::sve_ptrue(), lhs, rhs); + } - template = 0> - XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept - { - return svdup_n_s32(int32_t(arg)); - } + // sadd + template = 0> + XSIMD_INLINE batch sadd(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svqadd(lhs, rhs); + } - template = 0> - XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept - { - return svdup_n_u64(uint64_t(arg)); - } + // sub + template = 0> + XSIMD_INLINE batch sub(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svsub_x(detail::sve_ptrue(), lhs, rhs); + } - template = 0> - XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept - { - return svdup_n_s64(int64_t(arg)); - } + // ssub + template = 0> + XSIMD_INLINE batch ssub(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svqsub(lhs, rhs); + } - template - XSIMD_INLINE batch broadcast(float arg, requires_arch) noexcept - { - return svdup_n_f32(arg); - } + // mul + template = 0> + XSIMD_INLINE batch mul(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svmul_x(detail::sve_ptrue(), lhs, rhs); + } - template - XSIMD_INLINE batch broadcast(double arg, requires_arch) noexcept - { - return svdup_n_f64(arg); - } + // div + template = 4, int> = 0> + XSIMD_INLINE batch div(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svdiv_x(detail::sve_ptrue(), lhs, rhs); + } - template = 0> - XSIMD_INLINE batch broadcast(T val, requires_arch) noexcept - { - return broadcast(val, sve {}); - } + // max + template = 0> + XSIMD_INLINE batch max(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svmax_x(detail::sve_ptrue(), lhs, rhs); + } - /************** - * Arithmetic * - **************/ + // min + template = 0> + XSIMD_INLINE batch min(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svmin_x(detail::sve_ptrue(), lhs, rhs); + } - // add - template = 0> - XSIMD_INLINE batch add(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svadd_x(detail_sve::sve_ptrue(), lhs, rhs); - } + // neg + template = 0> + XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept + { + return svreinterpret_u8(svneg_x(detail::sve_ptrue(), svreinterpret_s8(static_cast>(arg)))); + } - // sadd - template = 0> - XSIMD_INLINE batch sadd(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svqadd(lhs, rhs); - } + template = 0> + XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept + { + return svreinterpret_u16(svneg_x(detail::sve_ptrue(), svreinterpret_s16(static_cast>(arg)))); + } - // sub - template = 0> - XSIMD_INLINE batch sub(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svsub_x(detail_sve::sve_ptrue(), lhs, rhs); - } + template = 0> + XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept + { + return svreinterpret_u32(svneg_x(detail::sve_ptrue(), svreinterpret_s32(static_cast>(arg)))); + } - // ssub - template = 0> - XSIMD_INLINE batch ssub(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svqsub(lhs, rhs); - } + template = 0> + XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept + { + return 
svreinterpret_u64(svneg_x(detail::sve_ptrue(), svreinterpret_s64(static_cast>(arg)))); + } - // mul - template = 0> - XSIMD_INLINE batch mul(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svmul_x(detail_sve::sve_ptrue(), lhs, rhs); - } + template = 0> + XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept + { + return svneg_x(detail::sve_ptrue(), arg); + } - // div - template = 4, int> = 0> - XSIMD_INLINE batch div(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svdiv_x(detail_sve::sve_ptrue(), lhs, rhs); - } + // abs + template = 0> + XSIMD_INLINE batch abs(batch const& arg, requires_arch) noexcept + { + return arg; + } - // max - template = 0> - XSIMD_INLINE batch max(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svmax_x(detail_sve::sve_ptrue(), lhs, rhs); - } + template = 0> + XSIMD_INLINE batch abs(batch const& arg, requires_arch) noexcept + { + return svabs_x(detail::sve_ptrue(), arg); + } - // min - template = 0> - XSIMD_INLINE batch min(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svmin_x(detail_sve::sve_ptrue(), lhs, rhs); - } + // fma: x * y + z + template = 0> + XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return svmad_x(detail::sve_ptrue(), x, y, z); + } - // neg - template = 0> - XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept - { - return svreinterpret_u8(svneg_x(detail_sve::sve_ptrue(), svreinterpret_s8(static_cast>(arg)))); - } + // fnma: z - x * y + template = 0> + XSIMD_INLINE batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return svmsb_x(detail::sve_ptrue(), x, y, z); + } - template = 0> - XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept - { - return svreinterpret_u16(svneg_x(detail_sve::sve_ptrue(), svreinterpret_s16(static_cast>(arg)))); - } + // fms: x * y - z + template = 0> + XSIMD_INLINE batch fms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return -fnma(x, y, z, sve {}); + } - template = 0> - XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept - { - return svreinterpret_u32(svneg_x(detail_sve::sve_ptrue(), svreinterpret_s32(static_cast>(arg)))); - } + // fnms: - x * y - z + template = 0> + XSIMD_INLINE batch fnms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return -fma(x, y, z, sve {}); + } - template = 0> - XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept - { - return svreinterpret_u64(svneg_x(detail_sve::sve_ptrue(), svreinterpret_s64(static_cast>(arg)))); - } + /********************** + * Logical operations * + **********************/ - template = 0> - XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept - { - return svneg_x(detail_sve::sve_ptrue(), arg); - } + // bitwise_and + template = 0> + XSIMD_INLINE batch bitwise_and(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svand_x(detail::sve_ptrue(), lhs, rhs); + } - // abs - template = 0> - XSIMD_INLINE batch abs(batch const& arg, requires_arch) noexcept - { - return arg; - } + template + XSIMD_INLINE batch bitwise_and(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + const auto lhs_bits = svreinterpret_u32(static_cast>(lhs)); + const auto rhs_bits = svreinterpret_u32(static_cast>(rhs)); + const auto result_bits = svand_x(detail::sve_ptrue(), lhs_bits, rhs_bits); + return svreinterpret_f32(result_bits); + } + + template + 
XSIMD_INLINE batch bitwise_and(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + const auto lhs_bits = svreinterpret_u64(static_cast>(lhs)); + const auto rhs_bits = svreinterpret_u64(static_cast>(rhs)); + const auto result_bits = svand_x(detail::sve_ptrue(), lhs_bits, rhs_bits); + return svreinterpret_f64(result_bits); + } + + template = 0> + XSIMD_INLINE batch_bool bitwise_and(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept + { + return svand_z(detail::sve_ptrue(), lhs, rhs); + } - template = 0> - XSIMD_INLINE batch abs(batch const& arg, requires_arch) noexcept - { - return svabs_x(detail_sve::sve_ptrue(), arg); - } + // bitwise_andnot + template = 0> + XSIMD_INLINE batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svbic_x(detail::sve_ptrue(), lhs, rhs); + } - // fma: x * y + z - template = 0> - XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept - { - return svmad_x(detail_sve::sve_ptrue(), x, y, z); - } + template + XSIMD_INLINE batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + const auto lhs_bits = svreinterpret_u32(static_cast>(lhs)); + const auto rhs_bits = svreinterpret_u32(static_cast>(rhs)); + const auto result_bits = svbic_x(detail::sve_ptrue(), lhs_bits, rhs_bits); + return svreinterpret_f32(result_bits); + } + + template + XSIMD_INLINE batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + const auto lhs_bits = svreinterpret_u64(static_cast>(lhs)); + const auto rhs_bits = svreinterpret_u64(static_cast>(rhs)); + const auto result_bits = svbic_x(detail::sve_ptrue(), lhs_bits, rhs_bits); + return svreinterpret_f64(result_bits); + } + + template = 0> + XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept + { + return svbic_z(detail::sve_ptrue(), lhs, rhs); + } - // fnma: z - x * y - template = 0> - XSIMD_INLINE batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept - { - return svmsb_x(detail_sve::sve_ptrue(), x, y, z); - } + // bitwise_or + template = 0> + XSIMD_INLINE batch bitwise_or(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svorr_x(detail::sve_ptrue(), lhs, rhs); + } - // fms: x * y - z - template = 0> - XSIMD_INLINE batch fms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept - { - return -fnma(x, y, z, sve {}); - } + template + XSIMD_INLINE batch bitwise_or(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + const auto lhs_bits = svreinterpret_u32(static_cast>(lhs)); + const auto rhs_bits = svreinterpret_u32(static_cast>(rhs)); + const auto result_bits = svorr_x(detail::sve_ptrue(), lhs_bits, rhs_bits); + return svreinterpret_f32(result_bits); + } + + template + XSIMD_INLINE batch bitwise_or(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + const auto lhs_bits = svreinterpret_u64(static_cast>(lhs)); + const auto rhs_bits = svreinterpret_u64(static_cast>(rhs)); + const auto result_bits = svorr_x(detail::sve_ptrue(), lhs_bits, rhs_bits); + return svreinterpret_f64(result_bits); + } + + template = 0> + XSIMD_INLINE batch_bool bitwise_or(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept + { + return svorr_z(detail::sve_ptrue(), lhs, rhs); + } - // fnms: - x * y - z - template = 0> - XSIMD_INLINE batch fnms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept - { - return -fma(x, y, z, sve 
{}); - } + // bitwise_xor + template = 0> + XSIMD_INLINE batch bitwise_xor(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return sveor_x(detail::sve_ptrue(), lhs, rhs); + } - /********************** - * Logical operations * - **********************/ + template + XSIMD_INLINE batch bitwise_xor(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + const auto lhs_bits = svreinterpret_u32(static_cast>(lhs)); + const auto rhs_bits = svreinterpret_u32(static_cast>(rhs)); + const auto result_bits = sveor_x(detail::sve_ptrue(), lhs_bits, rhs_bits); + return svreinterpret_f32(result_bits); + } + + template + XSIMD_INLINE batch bitwise_xor(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + const auto lhs_bits = svreinterpret_u64(static_cast>(lhs)); + const auto rhs_bits = svreinterpret_u64(static_cast>(rhs)); + const auto result_bits = sveor_x(detail::sve_ptrue(), lhs_bits, rhs_bits); + return svreinterpret_f64(result_bits); + } + + template = 0> + XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept + { + return sveor_z(detail::sve_ptrue(), lhs, rhs); + } - // bitwise_and - template = 0> - XSIMD_INLINE batch bitwise_and(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svand_x(detail_sve::sve_ptrue(), lhs, rhs); - } + // bitwise_not + template = 0> + XSIMD_INLINE batch bitwise_not(batch const& arg, requires_arch) noexcept + { + return svnot_x(detail::sve_ptrue(), arg); + } - template - XSIMD_INLINE batch bitwise_and(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - const auto lhs_bits = svreinterpret_u32(static_cast>(lhs)); - const auto rhs_bits = svreinterpret_u32(static_cast>(rhs)); - const auto result_bits = svand_x(detail_sve::sve_ptrue(), lhs_bits, rhs_bits); - return svreinterpret_f32(result_bits); - } + template + XSIMD_INLINE batch bitwise_not(batch const& arg, requires_arch) noexcept + { + const auto arg_bits = svreinterpret_u32(static_cast>(arg)); + const auto result_bits = svnot_x(detail::sve_ptrue(), arg_bits); + return svreinterpret_f32(result_bits); + } - template - XSIMD_INLINE batch bitwise_and(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - const auto lhs_bits = svreinterpret_u64(static_cast>(lhs)); - const auto rhs_bits = svreinterpret_u64(static_cast>(rhs)); - const auto result_bits = svand_x(detail_sve::sve_ptrue(), lhs_bits, rhs_bits); - return svreinterpret_f64(result_bits); - } + template + XSIMD_INLINE batch bitwise_not(batch const& arg, requires_arch) noexcept + { + const auto arg_bits = svreinterpret_u64(static_cast>(arg)); + const auto result_bits = svnot_x(detail::sve_ptrue(), arg_bits); + return svreinterpret_f64(result_bits); + } - template = 0> - XSIMD_INLINE batch_bool bitwise_and(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept - { - return svand_z(detail_sve::sve_ptrue(), lhs, rhs); - } + template = 0> + XSIMD_INLINE batch_bool bitwise_not(batch_bool const& arg, requires_arch) noexcept + { + return svnot_z(detail::sve_ptrue(), arg); + } - // bitwise_andnot - template = 0> - XSIMD_INLINE batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svbic_x(detail_sve::sve_ptrue(), lhs, rhs); - } + /********** + * Shifts * + **********/ - template - XSIMD_INLINE batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch) noexcept + namespace detail + { + template + XSIMD_INLINE batch sve_to_unsigned_batch_impl(batch const& arg, index<1>) noexcept { - 
const auto lhs_bits = svreinterpret_u32(static_cast>(lhs)); - const auto rhs_bits = svreinterpret_u32(static_cast>(rhs)); - const auto result_bits = svbic_x(detail_sve::sve_ptrue(), lhs_bits, rhs_bits); - return svreinterpret_f32(result_bits); + return svreinterpret_u8(static_cast>(arg)); } - template - XSIMD_INLINE batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch) noexcept + template + XSIMD_INLINE batch sve_to_unsigned_batch_impl(batch const& arg, index<2>) noexcept { - const auto lhs_bits = svreinterpret_u64(static_cast>(lhs)); - const auto rhs_bits = svreinterpret_u64(static_cast>(rhs)); - const auto result_bits = svbic_x(detail_sve::sve_ptrue(), lhs_bits, rhs_bits); - return svreinterpret_f64(result_bits); + return svreinterpret_u16(static_cast>(arg)); } - template = 0> - XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept + template + XSIMD_INLINE batch sve_to_unsigned_batch_impl(batch const& arg, index<4>) noexcept { - return svbic_z(detail_sve::sve_ptrue(), lhs, rhs); + return svreinterpret_u32(static_cast>(arg)); } - // bitwise_or - template = 0> - XSIMD_INLINE batch bitwise_or(batch const& lhs, batch const& rhs, requires_arch) noexcept + template + XSIMD_INLINE batch sve_to_unsigned_batch_impl(batch const& arg, index<8>) noexcept { - return svorr_x(detail_sve::sve_ptrue(), lhs, rhs); + return svreinterpret_u64(static_cast>(arg)); } - template - XSIMD_INLINE batch bitwise_or(batch const& lhs, batch const& rhs, requires_arch) noexcept + template > + XSIMD_INLINE batch sve_to_unsigned_batch(batch const& arg) noexcept { - const auto lhs_bits = svreinterpret_u32(static_cast>(lhs)); - const auto rhs_bits = svreinterpret_u32(static_cast>(rhs)); - const auto result_bits = svorr_x(detail_sve::sve_ptrue(), lhs_bits, rhs_bits); - return svreinterpret_f32(result_bits); + return sve_to_unsigned_batch_impl(arg, index {}); } + } // namespace detail - template - XSIMD_INLINE batch bitwise_or(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - const auto lhs_bits = svreinterpret_u64(static_cast>(lhs)); - const auto rhs_bits = svreinterpret_u64(static_cast>(rhs)); - const auto result_bits = svorr_x(detail_sve::sve_ptrue(), lhs_bits, rhs_bits); - return svreinterpret_f64(result_bits); - } + // bitwise_lshift + template = 0> + XSIMD_INLINE batch bitwise_lshift(batch const& arg, int n, requires_arch) noexcept + { + constexpr std::size_t size = sizeof(typename batch::value_type) * 8; + assert(0 <= n && static_cast(n) < size && "index in bounds"); + return svlsl_x(detail::sve_ptrue(), arg, n); + } - template = 0> - XSIMD_INLINE batch_bool bitwise_or(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept - { - return svorr_z(detail_sve::sve_ptrue(), lhs, rhs); - } + template = 0> + XSIMD_INLINE batch bitwise_lshift(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svlsl_x(detail::sve_ptrue(), lhs, detail::sve_to_unsigned_batch(rhs)); + } - // bitwise_xor - template = 0> - XSIMD_INLINE batch bitwise_xor(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return sveor_x(detail_sve::sve_ptrue(), lhs, rhs); - } + // bitwise_rshift + template = 0> + XSIMD_INLINE batch bitwise_rshift(batch const& arg, int n, requires_arch) noexcept + { + constexpr std::size_t size = sizeof(typename batch::value_type) * 8; + assert(0 <= n && static_cast(n) < size && "index in bounds"); + return svlsr_x(detail::sve_ptrue(), arg, static_cast(n)); + } - template - XSIMD_INLINE batch 
bitwise_xor(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - const auto lhs_bits = svreinterpret_u32(static_cast>(lhs)); - const auto rhs_bits = svreinterpret_u32(static_cast>(rhs)); - const auto result_bits = sveor_x(detail_sve::sve_ptrue(), lhs_bits, rhs_bits); - return svreinterpret_f32(result_bits); - } + template = 0> + XSIMD_INLINE batch bitwise_rshift(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svlsr_x(detail::sve_ptrue(), lhs, rhs); + } - template - XSIMD_INLINE batch bitwise_xor(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - const auto lhs_bits = svreinterpret_u64(static_cast>(lhs)); - const auto rhs_bits = svreinterpret_u64(static_cast>(rhs)); - const auto result_bits = sveor_x(detail_sve::sve_ptrue(), lhs_bits, rhs_bits); - return svreinterpret_f64(result_bits); - } + template = 0> + XSIMD_INLINE batch bitwise_rshift(batch const& arg, int n, requires_arch) noexcept + { + constexpr std::size_t size = sizeof(typename batch::value_type) * 8; + assert(0 <= n && static_cast(n) < size && "index in bounds"); + return svasr_x(detail::sve_ptrue(), arg, static_cast>(n)); + } - template = 0> - XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept - { - return sveor_z(detail_sve::sve_ptrue(), lhs, rhs); - } + template = 0> + XSIMD_INLINE batch bitwise_rshift(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svasr_x(detail::sve_ptrue(), lhs, detail::sve_to_unsigned_batch(rhs)); + } - // bitwise_not - template = 0> - XSIMD_INLINE batch bitwise_not(batch const& arg, requires_arch) noexcept - { - return svnot_x(detail_sve::sve_ptrue(), arg); - } + /************** + * Reductions * + **************/ - template - XSIMD_INLINE batch bitwise_not(batch const& arg, requires_arch) noexcept - { - const auto arg_bits = svreinterpret_u32(static_cast>(arg)); - const auto result_bits = svnot_x(detail_sve::sve_ptrue(), arg_bits); - return svreinterpret_f32(result_bits); - } + // reduce_add + template ::value_type, detail::sve_enable_all_t = 0> + XSIMD_INLINE V reduce_add(batch const& arg, requires_arch) noexcept + { + // sve integer reduction results are promoted to 64 bits + return static_cast(svaddv(detail::sve_ptrue(), arg)); + } - template - XSIMD_INLINE batch bitwise_not(batch const& arg, requires_arch) noexcept - { - const auto arg_bits = svreinterpret_u64(static_cast>(arg)); - const auto result_bits = svnot_x(detail_sve::sve_ptrue(), arg_bits); - return svreinterpret_f64(result_bits); - } + // reduce_max + template = 0> + XSIMD_INLINE T reduce_max(batch const& arg, requires_arch) noexcept + { + return svmaxv(detail::sve_ptrue(), arg); + } - template = 0> - XSIMD_INLINE batch_bool bitwise_not(batch_bool const& arg, requires_arch) noexcept + // reduce_min + template = 0> + XSIMD_INLINE T reduce_min(batch const& arg, requires_arch) noexcept + { + return svminv(detail::sve_ptrue(), arg); + } + + // haddp + template = 0> + XSIMD_INLINE batch haddp(const batch* row, requires_arch) noexcept + { + constexpr std::size_t size = batch::size; + T sums[size]; + for (std::size_t i = 0; i < size; ++i) { - return svnot_z(detail_sve::sve_ptrue(), arg); + sums[i] = reduce_add(row[i], sve {}); } + return svld1(detail::sve_ptrue(), sums); + } - /********** - * Shifts * - **********/ + /*************** + * Comparisons * + ***************/ - namespace detail_sve - { - template - XSIMD_INLINE batch sve_to_unsigned_batch_impl(batch const& arg, index<1>) noexcept - { - return 
svreinterpret_u8(static_cast>(arg)); - } + // eq + template = 0> + XSIMD_INLINE batch_bool eq(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svcmpeq(detail::sve_ptrue(), lhs, rhs); + } - template - XSIMD_INLINE batch sve_to_unsigned_batch_impl(batch const& arg, index<2>) noexcept - { - return svreinterpret_u16(static_cast>(arg)); - } + template = 0> + XSIMD_INLINE batch_bool eq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept + { + const auto neq_result = sveor_z(detail::sve_ptrue(), lhs, rhs); + return svnot_z(detail::sve_ptrue(), neq_result); + } - template - XSIMD_INLINE batch sve_to_unsigned_batch_impl(batch const& arg, index<4>) noexcept - { - return svreinterpret_u32(static_cast>(arg)); - } + // neq + template = 0> + XSIMD_INLINE batch_bool neq(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svcmpne(detail::sve_ptrue(), lhs, rhs); + } - template - XSIMD_INLINE batch sve_to_unsigned_batch_impl(batch const& arg, index<8>) noexcept - { - return svreinterpret_u64(static_cast>(arg)); - } + template = 0> + XSIMD_INLINE batch_bool neq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept + { + return sveor_z(detail::sve_ptrue(), lhs, rhs); + } - template > - XSIMD_INLINE batch sve_to_unsigned_batch(batch const& arg) noexcept - { - return sve_to_unsigned_batch_impl(arg, index {}); - } - } // namespace detail_sve + // lt + template = 0> + XSIMD_INLINE batch_bool lt(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svcmplt(detail::sve_ptrue(), lhs, rhs); + } - // bitwise_lshift - template = 0> - XSIMD_INLINE batch bitwise_lshift(batch const& arg, int n, requires_arch) noexcept - { - constexpr std::size_t size = sizeof(typename batch::value_type) * 8; - assert(0 <= n && static_cast(n) < size && "index in bounds"); - return svlsl_x(detail_sve::sve_ptrue(), arg, n); - } + // le + template = 0> + XSIMD_INLINE batch_bool le(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svcmple(detail::sve_ptrue(), lhs, rhs); + } - template = 0> - XSIMD_INLINE batch bitwise_lshift(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svlsl_x(detail_sve::sve_ptrue(), lhs, detail_sve::sve_to_unsigned_batch(rhs)); - } + // gt + template = 0> + XSIMD_INLINE batch_bool gt(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svcmpgt(detail::sve_ptrue(), lhs, rhs); + } - // bitwise_rshift - template = 0> - XSIMD_INLINE batch bitwise_rshift(batch const& arg, int n, requires_arch) noexcept - { - constexpr std::size_t size = sizeof(typename batch::value_type) * 8; - assert(0 <= n && static_cast(n) < size && "index in bounds"); - return svlsr_x(detail_sve::sve_ptrue(), arg, static_cast(n)); - } + // ge + template = 0> + XSIMD_INLINE batch_bool ge(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svcmpge(detail::sve_ptrue(), lhs, rhs); + } - template = 0> - XSIMD_INLINE batch bitwise_rshift(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svlsr_x(detail_sve::sve_ptrue(), lhs, rhs); - } + /*************** + * Permutation * + ***************/ - template = 0> - XSIMD_INLINE batch bitwise_rshift(batch const& arg, int n, requires_arch) noexcept - { - constexpr std::size_t size = sizeof(typename batch::value_type) * 8; - assert(0 <= n && static_cast(n) < size && "index in bounds"); - return svasr_x(detail_sve::sve_ptrue(), arg, static_cast>(n)); - } + // rotate_left + template = 0> + XSIMD_INLINE batch 
rotate_left(batch const& a, requires_arch) noexcept + { + return svext(a, a, N); + } - template = 0> - XSIMD_INLINE batch bitwise_rshift(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svasr_x(detail_sve::sve_ptrue(), lhs, detail_sve::sve_to_unsigned_batch(rhs)); - } + // swizzle (dynamic) + template + XSIMD_INLINE batch swizzle(batch const& arg, batch indices, requires_arch) noexcept + { + return svtbl(arg, indices); + } - /************** - * Reductions * - **************/ + template + XSIMD_INLINE batch, A> swizzle(batch, A> const& self, + batch indices, + requires_arch) noexcept + { + const auto real = swizzle(self.real(), indices, sve {}); + const auto imag = swizzle(self.imag(), indices, sve {}); + return batch>(real, imag); + } + + // swizzle (static) + template + XSIMD_INLINE batch swizzle(batch const& arg, batch_constant indices, requires_arch) noexcept + { + static_assert(batch::size == sizeof...(idx), "invalid swizzle indices"); + return swizzle(arg, indices.as_batch(), sve {}); + } + + template + XSIMD_INLINE batch, A> swizzle(batch, A> const& arg, + batch_constant indices, + requires_arch) noexcept + { + static_assert(batch, A>::size == sizeof...(idx), "invalid swizzle indices"); + return swizzle(arg, indices.as_batch(), sve {}); + } - // reduce_add - template ::value_type, detail_sve::sve_enable_all_t = 0> - XSIMD_INLINE V reduce_add(batch const& arg, requires_arch) noexcept - { - // sve integer reduction results are promoted to 64 bits - return static_cast(svaddv(detail_sve::sve_ptrue(), arg)); - } + /************* + * Selection * + *************/ - // reduce_max - template = 0> - XSIMD_INLINE T reduce_max(batch const& arg, requires_arch) noexcept + // extract_pair + namespace detail + { + template + XSIMD_INLINE batch sve_extract_pair(batch const&, batch const& /*rhs*/, std::size_t, std::index_sequence<>) noexcept { - return svmaxv(detail_sve::sve_ptrue(), arg); + assert(false && "extract_pair out of bounds"); + return batch {}; } - // reduce_min - template = 0> - XSIMD_INLINE T reduce_min(batch const& arg, requires_arch) noexcept + template + XSIMD_INLINE batch sve_extract_pair(batch const& lhs, batch const& rhs, std::size_t n, std::index_sequence) noexcept { - return svminv(detail_sve::sve_ptrue(), arg); + if (n == I) + { + return svext(rhs, lhs, I); + } + else + { + return sve_extract_pair(lhs, rhs, n, std::index_sequence()); + } } - // haddp - template = 0> - XSIMD_INLINE batch haddp(const batch* row, requires_arch) noexcept + template + XSIMD_INLINE batch sve_extract_pair_impl(batch const& lhs, batch const& rhs, std::size_t n, std::index_sequence<0, Is...>) noexcept { - constexpr std::size_t size = batch::size; - T sums[size]; - for (std::size_t i = 0; i < size; ++i) + if (n == 0) + { + return rhs; + } + else { - sums[i] = reduce_add(row[i], sve {}); + return sve_extract_pair(lhs, rhs, n, std::index_sequence()); } - return svld1(detail_sve::sve_ptrue(), sums); } + } - /*************** - * Comparisons * - ***************/ + template = 0> + XSIMD_INLINE batch extract_pair(batch const& lhs, batch const& rhs, std::size_t n, requires_arch) noexcept + { + constexpr std::size_t size = batch::size; + assert(n < size && "index in bounds"); + return detail::sve_extract_pair_impl(lhs, rhs, n, std::make_index_sequence()); + } + + // select + template = 0> + XSIMD_INLINE batch select(batch_bool const& cond, batch const& a, batch const& b, requires_arch) noexcept + { + return svsel(cond, static_cast>(a), static_cast>(b)); + } - // eq - template = 0> - 
XSIMD_INLINE batch_bool eq(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svcmpeq(detail_sve::sve_ptrue(), lhs, rhs); - } + template + XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return select(batch_bool { b... }, true_br, false_br, sve {}); + } - template = 0> - XSIMD_INLINE batch_bool eq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept - { - const auto neq_result = sveor_z(detail_sve::sve_ptrue(), lhs, rhs); - return svnot_z(detail_sve::sve_ptrue(), neq_result); - } + // zip_lo + template = 0> + XSIMD_INLINE batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svzip1(lhs, rhs); + } - // neq - template = 0> - XSIMD_INLINE batch_bool neq(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svcmpne(detail_sve::sve_ptrue(), lhs, rhs); - } + // zip_hi + template = 0> + XSIMD_INLINE batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svzip2(lhs, rhs); + } - template = 0> - XSIMD_INLINE batch_bool neq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept - { - return sveor_z(detail_sve::sve_ptrue(), lhs, rhs); - } + /***************************** + * Floating-point arithmetic * + *****************************/ - // lt - template = 0> - XSIMD_INLINE batch_bool lt(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svcmplt(detail_sve::sve_ptrue(), lhs, rhs); - } + // rsqrt + template = 0> + XSIMD_INLINE batch rsqrt(batch const& arg, requires_arch) noexcept + { + return svrsqrte(arg); + } - // le - template = 0> - XSIMD_INLINE batch_bool le(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svcmple(detail_sve::sve_ptrue(), lhs, rhs); - } + // sqrt + template = 0> + XSIMD_INLINE batch sqrt(batch const& arg, requires_arch) noexcept + { + return svsqrt_x(detail::sve_ptrue(), arg); + } - // gt - template = 0> - XSIMD_INLINE batch_bool gt(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svcmpgt(detail_sve::sve_ptrue(), lhs, rhs); - } + // reciprocal + template = 0> + XSIMD_INLINE batch reciprocal(const batch& arg, requires_arch) noexcept + { + return svrecpe(arg); + } + + /****************************** + * Floating-point conversions * + ******************************/ - // ge - template = 0> - XSIMD_INLINE batch_bool ge(batch const& lhs, batch const& rhs, requires_arch) noexcept + // fast_cast + namespace detail + { + template = 0> + XSIMD_INLINE batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept { - return svcmpge(detail_sve::sve_ptrue(), lhs, rhs); + return svcvt_f32_x(detail::sve_ptrue(), arg); } - /*************** - * Permutation * - ***************/ - - // rotate_left - template = 0> - XSIMD_INLINE batch rotate_left(batch const& a, requires_arch) noexcept + template = 0> + XSIMD_INLINE batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept { - return svext(a, a, N); + return svcvt_f64_x(detail::sve_ptrue(), arg); } - // swizzle (dynamic) - template - XSIMD_INLINE batch swizzle(batch const& arg, batch indices, requires_arch) noexcept + template + XSIMD_INLINE batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept { - return svtbl(arg, indices); + return svcvt_s32_x(detail::sve_ptrue(), arg); } - template - XSIMD_INLINE batch, A> swizzle(batch, A> const& self, - batch indices, - requires_arch) noexcept + template + XSIMD_INLINE batch 
fast_cast(batch const& arg, batch const&, requires_arch) noexcept { - const auto real = swizzle(self.real(), indices, sve {}); - const auto imag = swizzle(self.imag(), indices, sve {}); - return batch>(real, imag); + return svcvt_u32_x(detail::sve_ptrue(), arg); } - // swizzle (static) - template - XSIMD_INLINE batch swizzle(batch const& arg, batch_constant indices, requires_arch) noexcept + template + XSIMD_INLINE batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept { - static_assert(batch::size == sizeof...(idx), "invalid swizzle indices"); - return swizzle(arg, indices.as_batch(), sve {}); + return svcvt_s64_x(detail::sve_ptrue(), arg); } - template - XSIMD_INLINE batch, A> swizzle(batch, A> const& arg, - batch_constant indices, - requires_arch) noexcept + template + XSIMD_INLINE batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept { - static_assert(batch, A>::size == sizeof...(idx), "invalid swizzle indices"); - return swizzle(arg, indices.as_batch(), sve {}); + return svcvt_u64_x(detail::sve_ptrue(), arg); } + } - /************* - * Selection * - *************/ + /********* + * Miscs * + *********/ - // extract_pair - namespace detail_sve - { - template - XSIMD_INLINE batch sve_extract_pair(batch const&, batch const& /*rhs*/, std::size_t, std::index_sequence<>) noexcept - { - assert(false && "extract_pair out of bounds"); - return batch {}; - } + // set + template + XSIMD_INLINE batch set(batch const&, requires_arch, Args... args) noexcept + { + return detail::sve_vector_type { args... }; + } - template - XSIMD_INLINE batch sve_extract_pair(batch const& lhs, batch const& rhs, std::size_t n, std::index_sequence) noexcept - { - if (n == I) - { - return svext(rhs, lhs, I); - } - else - { - return sve_extract_pair(lhs, rhs, n, std::index_sequence()); - } - } + template + XSIMD_INLINE batch, A> set(batch, A> const&, requires_arch, + Args... args_complex) noexcept + { + return batch>(detail::sve_vector_type { args_complex.real()... }, + detail::sve_vector_type { args_complex.imag()... }); + } - template - XSIMD_INLINE batch sve_extract_pair_impl(batch const& lhs, batch const& rhs, std::size_t n, std::index_sequence<0, Is...>) noexcept - { - if (n == 0) - { - return rhs; - } - else - { - return sve_extract_pair(lhs, rhs, n, std::index_sequence()); - } - } - } + template + XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Args... args) noexcept + { + using U = as_unsigned_integer_t; + const auto values = detail::sve_vector_type { static_cast(args)... 
}; + const auto zero = broadcast(static_cast(0), sve {}); + return svcmpne(detail::sve_ptrue(), values, zero); + } + + // insert + namespace detail + { + // generate index sequence (iota) + XSIMD_INLINE svuint8_t sve_iota_impl(index<1>) noexcept { return svindex_u8(0, 1); } + XSIMD_INLINE svuint16_t sve_iota_impl(index<2>) noexcept { return svindex_u16(0, 1); } + XSIMD_INLINE svuint32_t sve_iota_impl(index<4>) noexcept { return svindex_u32(0, 1); } + XSIMD_INLINE svuint64_t sve_iota_impl(index<8>) noexcept { return svindex_u64(0, 1); } + + template >> + XSIMD_INLINE V sve_iota() noexcept { return sve_iota_impl(index {}); } + } // namespace detail + + template = 0> + XSIMD_INLINE batch insert(batch const& arg, T val, index, requires_arch) noexcept + { + // create a predicate with only the I-th lane activated + const auto iota = detail::sve_iota(); + const auto index_predicate = svcmpeq(detail::sve_ptrue(), iota, static_cast>(I)); + return svsel(index_predicate, static_cast>(broadcast(val, sve {})), static_cast>(arg)); + } + + // first + template = 0> + XSIMD_INLINE T first(batch const& self, requires_arch) noexcept + { + return self.data[0]; + } - template = 0> - XSIMD_INLINE batch extract_pair(batch const& lhs, batch const& rhs, std::size_t n, requires_arch) noexcept - { - constexpr std::size_t size = batch::size; - assert(n < size && "index in bounds"); - return detail_sve::sve_extract_pair_impl(lhs, rhs, n, std::make_index_sequence()); - } + // all + template = 0> + XSIMD_INLINE bool all(batch_bool const& arg, requires_arch) noexcept + { + return detail::sve_pcount(arg) == batch_bool::size; + } - // select - template = 0> - XSIMD_INLINE batch select(batch_bool const& cond, batch const& a, batch const& b, requires_arch) noexcept - { - return svsel(cond, static_cast>(a), static_cast>(b)); - } + // any + template = 0> + XSIMD_INLINE bool any(batch_bool const& arg, requires_arch) noexcept + { + return svptest_any(arg, arg); + } - template - XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept - { - return select(batch_bool { b... 
+        // bitwise_cast
+        template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_unsigned_t<R, 1> = 0>
+        XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
+        {
+            return svreinterpret_u8(static_cast<detail::sve_vector_type<T>>(arg));
+        }

-            // zip_lo
-            template <class A, class T, detail_sve::sve_enable_all_t<T> = 0>
-            XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
-            {
-                return svzip1(lhs, rhs);
-            }
+        template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_signed_t<R, 1> = 0>
+        XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
+        {
+            return svreinterpret_s8(static_cast<detail::sve_vector_type<T>>(arg));
+        }

-            // zip_hi
-            template <class A, class T, detail_sve::sve_enable_all_t<T> = 0>
-            XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
-            {
-                return svzip2(lhs, rhs);
-            }
+        template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_unsigned_t<R, 2> = 0>
+        XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
+        {
+            return svreinterpret_u16(static_cast<detail::sve_vector_type<T>>(arg));
+        }

-            /*****************************
-             * Floating-point arithmetic *
-             *****************************/
+        template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_signed_t<R, 2> = 0>
+        XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
+        {
+            return svreinterpret_s16(static_cast<detail::sve_vector_type<T>>(arg));
+        }

-            // rsqrt
-            template <class A, class T, detail_sve::sve_enable_floating_point_t<T> = 0>
-            XSIMD_INLINE batch<T, A> rsqrt(batch<T, A> const& arg, requires_arch<sve>) noexcept
-            {
-                return svrsqrte(arg);
-            }
+        template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_unsigned_t<R, 4> = 0>
+        XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
+        {
+            return svreinterpret_u32(static_cast<detail::sve_vector_type<T>>(arg));
+        }

-            // sqrt
-            template <class A, class T, detail_sve::sve_enable_floating_point_t<T> = 0>
-            XSIMD_INLINE batch<T, A> sqrt(batch<T, A> const& arg, requires_arch<sve>) noexcept
-            {
-                return svsqrt_x(detail_sve::sve_ptrue<T>(), arg);
-            }
+        template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_signed_t<R, 4> = 0>
+        XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
+        {
+            return svreinterpret_s32(static_cast<detail::sve_vector_type<T>>(arg));
+        }

-            // reciprocal
-            template <class A, class T, detail_sve::sve_enable_floating_point_t<T> = 0>
-            XSIMD_INLINE batch<T, A> reciprocal(const batch<T, A>& arg, requires_arch<sve>) noexcept
-            {
-                return svrecpe(arg);
-            }
+        template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_unsigned_t<R, 8> = 0>
+        XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
+        {
+            return svreinterpret_u64(static_cast<detail::sve_vector_type<T>>(arg));
+        }

-            /******************************
-             * Floating-point conversions *
-             ******************************/
+        template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_signed_t<R, 8> = 0>
+        XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
+        {
+            return svreinterpret_s64(static_cast<detail::sve_vector_type<T>>(arg));
+        }

-            // fast_cast
-            namespace detail_sve
-            {
-                template <class A, class T, detail::enable_sized_integral_t<T, 4> = 0>
-                XSIMD_INLINE batch<float, A> fast_cast(batch<T, A> const& arg, batch<float, A> const&, requires_arch<sve>) noexcept
-                {
-                    return svcvt_f32_x(detail_sve::sve_ptrue<T>(), arg);
-                }
+        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        XSIMD_INLINE batch<float, A> bitwise_cast(batch<T, A> const& arg, batch<float, A> const&, requires_arch<sve>) noexcept
+        {
+            return svreinterpret_f32(static_cast<detail::sve_vector_type<T>>(arg));
+        }

-                template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
-                XSIMD_INLINE batch<double, A> fast_cast(batch<T, A> const& arg, batch<double, A> const&, requires_arch<sve>) noexcept
-                {
-                    return svcvt_f64_x(detail_sve::sve_ptrue<T>(), arg);
-                }
+        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        XSIMD_INLINE batch<double, A> bitwise_cast(batch<T, A> const& arg, batch<double, A> const&, requires_arch<sve>) noexcept
+        {
+            return svreinterpret_f64(static_cast<detail::sve_vector_type<T>>(arg));
+        }

-                template <class A>
-                XSIMD_INLINE batch<int32_t, A> fast_cast(batch<float, A> const& arg, batch<int32_t, A> const&, requires_arch<sve>) noexcept
-                {
-                    return svcvt_s32_x(detail_sve::sve_ptrue<float>(), arg);
-                }
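Each `bitwise_cast` overload above is a single `svreinterpret_*`, i.e. a pure bit reinterpretation rather than a value conversion. A small portable sketch (values illustrative):

```cpp
#include <cstdint>
#include <cstdio>

#include "xsimd/xsimd.hpp"

int main()
{
    // bitwise_cast keeps the register bits and only changes the lane type;
    // contrast with the value-converting batch_cast/fast_cast path.
    xsimd::batch<float> f(1.0f);
    auto bits = xsimd::bitwise_cast<uint32_t>(f);
    // 0x3f800000 is the IEEE-754 bit pattern of 1.0f
    std::printf("%#010x\n", static_cast<unsigned>(bits.get(0)));
}
```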
+        // batch_bool_cast
+        template <class A, class T_out, class T_in, detail::sve_enable_all_t<T_in> = 0>
+        XSIMD_INLINE batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& arg, batch_bool<T_out, A> const&, requires_arch<sve>) noexcept
+        {
+            return arg.data;
+        }

-                template <class A>
-                XSIMD_INLINE batch<uint32_t, A> fast_cast(batch<float, A> const& arg, batch<uint32_t, A> const&, requires_arch<sve>) noexcept
-                {
-                    return svcvt_u32_x(detail_sve::sve_ptrue<float>(), arg);
-                }
+        // from_bool
+        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<sve>) noexcept
+        {
+            return select(arg, batch<T, A>(1), batch<T, A>(0));
+        }

-                template <class A>
-                XSIMD_INLINE batch<int64_t, A> fast_cast(batch<double, A> const& arg, batch<int64_t, A> const&, requires_arch<sve>) noexcept
-                {
-                    return svcvt_s64_x(detail_sve::sve_ptrue<double>(), arg);
-                }
-
-                template <class A>
-                XSIMD_INLINE batch<uint64_t, A> fast_cast(batch<double, A> const& arg, batch<uint64_t, A> const&, requires_arch<sve>) noexcept
-                {
-                    return svcvt_u64_x(detail_sve::sve_ptrue<double>(), arg);
-                }
-            }
+        // slide_left
+        namespace detail
+        {
+            template <size_t N>
+            struct sve_slider_left
+            {
+                template <class T, class A>
+                XSIMD_INLINE batch<T, A> operator()(batch<T, A> const& arg) noexcept
+                {
+                    using u8_vector = batch<uint8_t, A>;
+                    const auto left = svdup_n_u8(0);
+                    const auto right = bitwise_cast(arg, u8_vector {}, sve {}).data;
+                    const u8_vector result(svext(left, right, u8_vector::size - N));
+                    return bitwise_cast(result, batch<T, A> {}, sve {});
+                }
+            };
+
+            template <>
+            struct sve_slider_left<0>
+            {
+                template <class T, class A>
+                XSIMD_INLINE batch<T, A> operator()(batch<T, A> const& arg) noexcept
+                {
+                    return arg;
+                }
+            };
+        } // namespace detail

-            /*********
-             * Miscs *
-             *********/
-
-            // set
-            template <class A, class T, class... Args>
-            XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<sve>, Args... args) noexcept
-            {
-                return detail_sve::sve_vector_type<T> { args... };
-            }
-
-            template <class A, class T, class... Args>
-            XSIMD_INLINE batch<std::complex<T>, A> set(batch<std::complex<T>, A> const&, requires_arch<sve>,
-                                                       Args... args_complex) noexcept
-            {
-                return batch<std::complex<T>>(detail_sve::sve_vector_type<T> { args_complex.real()... },
-                                              detail_sve::sve_vector_type<T> { args_complex.imag()... });
-            }
-
-            template <class A, class T, class... Args>
-            XSIMD_INLINE batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<sve>, Args... args) noexcept
-            {
-                using U = as_unsigned_integer_t<T>;
-                const auto values = detail_sve::sve_vector_type<U> { static_cast<U>(args)... };
-                const auto zero = broadcast<A, U>(static_cast<U>(0), sve {});
-                return svcmpne(detail_sve::sve_ptrue<T>(), values, zero);
-            }
-
-            // insert
-            namespace detail_sve
-            {
-                // generate index sequence (iota)
-                XSIMD_INLINE svuint8_t sve_iota_impl(index<1>) noexcept { return svindex_u8(0, 1); }
-                XSIMD_INLINE svuint16_t sve_iota_impl(index<2>) noexcept { return svindex_u16(0, 1); }
-                XSIMD_INLINE svuint32_t sve_iota_impl(index<4>) noexcept { return svindex_u32(0, 1); }
-                XSIMD_INLINE svuint64_t sve_iota_impl(index<8>) noexcept { return svindex_u64(0, 1); }
-
-                template <class T, class V = sve_vector_type<as_unsigned_integer_t<T>>>
-                XSIMD_INLINE V sve_iota() noexcept { return sve_iota_impl(index<sizeof(T)> {}); }
-            } // namespace detail_sve
-
-            template <class A, class T, size_t I, detail_sve::sve_enable_all_t<T> = 0>
-            XSIMD_INLINE batch<T, A> insert(batch<T, A> const& arg, T val, index<I>, requires_arch<sve>) noexcept
-            {
-                // create a predicate with only the I-th lane activated
-                const auto iota = detail_sve::sve_iota<T>();
-                const auto index_predicate = svcmpeq(detail_sve::sve_ptrue<T>(), iota, static_cast<as_unsigned_integer_t<T>>(I));
-                return svsel(index_predicate, static_cast<detail_sve::sve_vector_type<T>>(broadcast<A, T>(val, sve {})), static_cast<detail_sve::sve_vector_type<T>>(arg));
-            }
-
-            // first
-            template <class A, class T, detail_sve::sve_enable_all_t<T> = 0>
-            XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<sve>) noexcept
-            {
-                return self.data[0];
-            }
-
-            // all
-            template <class A, class T, detail_sve::sve_enable_all_t<T> = 0>
-            XSIMD_INLINE bool all(batch_bool<T, A> const& arg, requires_arch<sve>) noexcept
-            {
-                return detail_sve::sve_pcount<T>(arg) == batch_bool<T, A>::size;
-            }
-
-            // any
-            template <class A, class T, detail_sve::sve_enable_all_t<T> = 0>
-            XSIMD_INLINE bool any(batch_bool<T, A> const& arg, requires_arch<sve>) noexcept
-            {
-                return svptest_any(arg, arg);
-            }
-
-            // bitwise_cast
-            template <class A, class T, class R, detail_sve::sve_enable_all_t<T> = 0, detail::enable_sized_unsigned_t<R, 1> = 0>
-            XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
-            {
-                return svreinterpret_u8(static_cast<detail_sve::sve_vector_type<T>>(arg));
-            }
-
-            template <class A, class T, class R, detail_sve::sve_enable_all_t<T> = 0, detail::enable_sized_signed_t<R, 1> = 0>
-            XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
-            {
-                return svreinterpret_s8(static_cast<detail_sve::sve_vector_type<T>>(arg));
-            }
-
-            template <class A, class T, class R, detail_sve::sve_enable_all_t<T> = 0, detail::enable_sized_unsigned_t<R, 2> = 0>
-            XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
-            {
-                return svreinterpret_u16(static_cast<detail_sve::sve_vector_type<T>>(arg));
-            }
-
-            template <class A, class T, class R, detail_sve::sve_enable_all_t<T> = 0, detail::enable_sized_signed_t<R, 2> = 0>
-            XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
-            {
-                return svreinterpret_s16(static_cast<detail_sve::sve_vector_type<T>>(arg));
-            }
-
-            template <class A, class T, class R, detail_sve::sve_enable_all_t<T> = 0, detail::enable_sized_unsigned_t<R, 4> = 0>
-            XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
-            {
-                return svreinterpret_u32(static_cast<detail_sve::sve_vector_type<T>>(arg));
-            }
-
-            template <class A, class T, class R, detail_sve::sve_enable_all_t<T> = 0, detail::enable_sized_signed_t<R, 4> = 0>
-            XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
-            {
-                return svreinterpret_s32(static_cast<detail_sve::sve_vector_type<T>>(arg));
-            }
-
-            template <class A, class T, class R, detail_sve::sve_enable_all_t<T> = 0, detail::enable_sized_unsigned_t<R, 8> = 0>
-            XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
-            {
-                return svreinterpret_u64(static_cast<detail_sve::sve_vector_type<T>>(arg));
-            }
-
-            template <class A, class T, class R, detail_sve::sve_enable_all_t<T> = 0, detail::enable_sized_signed_t<R, 8> = 0>
-            XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
-            {
-                return svreinterpret_s64(static_cast<detail_sve::sve_vector_type<T>>(arg));
-            }
-
-            template <class A, class T, detail_sve::sve_enable_all_t<T> = 0>
-            XSIMD_INLINE batch<float, A> bitwise_cast(batch<T, A> const& arg, batch<float, A> const&, requires_arch<sve>) noexcept
-            {
-                return svreinterpret_f32(static_cast<detail_sve::sve_vector_type<T>>(arg));
-            }
-
-            template <class A, class T, detail_sve::sve_enable_all_t<T> = 0>
-            XSIMD_INLINE batch<double, A> bitwise_cast(batch<T, A> const& arg, batch<double, A> const&, requires_arch<sve>) noexcept
-            {
-                return svreinterpret_f64(static_cast<detail_sve::sve_vector_type<T>>(arg));
-            }
-
-            // batch_bool_cast
-            template <class A, class T_out, class T_in, detail_sve::sve_enable_all_t<T_in> = 0>
-            XSIMD_INLINE batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& arg, batch_bool<T_out, A> const&, requires_arch<sve>) noexcept
-            {
-                return arg.data;
-            }
-
-            // from_bool
-            template <class A, class T, detail_sve::sve_enable_all_t<T> = 0>
-            XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<sve>) noexcept
-            {
-                return select(arg, batch<T, A>(1), batch<T, A>(0));
-            }

+        template <size_t N, class A, class T, detail::sve_enable_all_t<T> = 0>
+        XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& arg, requires_arch<sve>) noexcept
+        {
+            return detail::sve_slider_left<N>()(arg);
+        }

-            // slide_left
-            namespace detail_sve
-            {
-                template <size_t N>
-                struct sve_slider_left
-                {
-                    template <class T, class A>
-                    XSIMD_INLINE batch<T, A> operator()(batch<T, A> const& arg) noexcept
-                    {
-                        using u8_vector = batch<uint8_t, A>;
-                        const auto left = svdup_n_u8(0);
-                        const auto right = bitwise_cast(arg, u8_vector {}, sve {}).data;
-                        const u8_vector result(svext(left, right, u8_vector::size - N));
-                        return bitwise_cast(result, batch<T, A> {}, sve {});
-                    }
-                };
-
-                template <>
-                struct sve_slider_left<0>
-                {
-                    template <class T, class A>
-                    XSIMD_INLINE batch<T, A> operator()(batch<T, A> const& arg) noexcept
-                    {
-                        return arg;
-                    }
-                };
-            } // namespace detail_sve
-
-            template <size_t N, class A, class T, detail_sve::sve_enable_all_t<T> = 0>
-            XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& arg, requires_arch<sve>) noexcept
-            {
-                return detail_sve::sve_slider_left<N>()(arg);
-            }

+        // slide_right
+        namespace detail
+        {
+            template <size_t N>
+            struct sve_slider_right
+            {
+                template <class T, class A>
+                XSIMD_INLINE batch<T, A> operator()(batch<T, A> const& arg) noexcept
+                {
+                    using u8_vector = batch<uint8_t, A>;
+                    const auto left = bitwise_cast(arg, u8_vector {}, sve {}).data;
+                    const auto right = svdup_n_u8(0);
+                    const u8_vector result(svext(left, right, N));
+                    return bitwise_cast(result, batch<T, A> {}, sve {});
+                }
+            };
+
+            template <>
+            struct sve_slider_right<batch<uint8_t, sve>::size>
+            {
+                template <class T, class A>
+                XSIMD_INLINE batch<T, A> operator()(batch<T, A> const&) noexcept
+                {
+                    return batch<T, A> {};
+                }
+            };
+        } // namespace detail

-            // slide_right
-            namespace detail_sve
-            {
-                template <size_t N>
-                struct sve_slider_right
-                {
-                    template <class T, class A>
-                    XSIMD_INLINE batch<T, A> operator()(batch<T, A> const& arg) noexcept
-                    {
-                        using u8_vector = batch<uint8_t, A>;
-                        const auto left = bitwise_cast(arg, u8_vector {}, sve {}).data;
-                        const auto right = svdup_n_u8(0);
-                        const u8_vector result(svext(left, right, N));
-                        return bitwise_cast(result, batch<T, A> {}, sve {});
-                    }
-                };
-
-                template <>
-                struct sve_slider_right<batch<uint8_t, sve>::size>
-                {
-                    template <class T, class A>
-                    XSIMD_INLINE batch<T, A> operator()(batch<T, A> const&) noexcept
-                    {
-                        return batch<T, A> {};
-                    }
-                };
-            } // namespace detail_sve
-
-            template <size_t N, class A, class T, detail_sve::sve_enable_all_t<T> = 0>
-            XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& arg, requires_arch<sve>) noexcept
-            {
-                return detail_sve::sve_slider_right<N>()(arg);
-            }

+        template <size_t N, class A, class T, detail::sve_enable_all_t<T> = 0>
+        XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& arg, requires_arch<sve>) noexcept
+        {
+            return detail::sve_slider_right<N>()(arg);
+        }
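Both sliders work on the byte-level view of the register: `bitwise_cast` to `uint8_t`, `svext` to extract a shifted window against a zero vector, and a cast back. Through the portable API the effect is as follows (a sketch; note that `N` counts bytes, not lanes, and my reading of the zero-fill direction is that `slide_left` fills the low lanes):

```cpp
#include <cstdint>
#include <iostream>

#include "xsimd/xsimd.hpp"

int main()
{
    // slide_left<N> shifts the whole register by N *bytes*, zero-filling --
    // exactly the svext window built by sve_slider_left above.
    xsimd::batch<uint32_t> v(7u);     // every lane = 7
    auto s = xsimd::slide_left<4>(v); // one 4-byte lane of zeros enters
    std::cout << s.get(0) << ' ' << s.get(1) << '\n'; // prints "0 7"
}
```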
-            // isnan
-            template <class A, class T, detail_sve::sve_enable_floating_point_t<T> = 0>
-            XSIMD_INLINE batch_bool<T, A> isnan(batch<T, A> const& arg, requires_arch<sve>) noexcept
-            {
-                return !(arg == arg);
-            }

+        // isnan
+        template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
+        XSIMD_INLINE batch_bool<T, A> isnan(batch<T, A> const& arg, requires_arch<sve>) noexcept
+        {
+            return !(arg == arg);
+        }

-            // nearbyint
-            template <class A, class T, detail_sve::sve_enable_floating_point_t<T> = 0>
-            XSIMD_INLINE batch<T, A> nearbyint(batch<T, A> const& arg, requires_arch<sve>) noexcept
-            {
-                return svrintx_x(detail_sve::sve_ptrue<T>(), arg);
-            }

+        // nearbyint
+        template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
+        XSIMD_INLINE batch<T, A> nearbyint(batch<T, A> const& arg, requires_arch<sve>) noexcept
+        {
+            return svrintx_x(detail::sve_ptrue<T>(), arg);
+        }

-            // nearbyint_as_int
-            template <class A>
-            XSIMD_INLINE batch<int32_t, A> nearbyint_as_int(batch<float, A> const& arg, requires_arch<sve>) noexcept
-            {
-                const auto nearest = svrintx_x(detail_sve::sve_ptrue<float>(), arg);
-                return svcvt_s32_x(detail_sve::sve_ptrue<float>(), nearest);
-            }
-
-            template <class A>
-            XSIMD_INLINE batch<int64_t, A> nearbyint_as_int(batch<double, A> const& arg, requires_arch<sve>) noexcept
-            {
-                const auto nearest = svrintx_x(detail_sve::sve_ptrue<double>(), arg);
-                return svcvt_s64_x(detail_sve::sve_ptrue<double>(), nearest);
-            }

+        // nearbyint_as_int
+        template <class A>
+        XSIMD_INLINE batch<int32_t, A> nearbyint_as_int(batch<float, A> const& arg, requires_arch<sve>) noexcept
+        {
+            const auto nearest = svrintx_x(detail::sve_ptrue<float>(), arg);
+            return svcvt_s32_x(detail::sve_ptrue<float>(), nearest);
+        }

-            // ldexp
-            template <class A, class T, detail_sve::sve_enable_floating_point_t<T> = 0>
-            XSIMD_INLINE batch<T, A> ldexp(const batch<T, A>& x, const batch<as_integer_t<T>, A>& exp, requires_arch<sve>) noexcept
-            {
-                return svscale_x(detail_sve::sve_ptrue<T>(), x, exp);
-            }

+        template <class A>
+        XSIMD_INLINE batch<int64_t, A> nearbyint_as_int(batch<double, A> const& arg, requires_arch<sve>) noexcept
+        {
+            const auto nearest = svrintx_x(detail::sve_ptrue<double>(), arg);
+            return svcvt_s64_x(detail::sve_ptrue<double>(), nearest);
+        }
+
+        // ldexp
+        template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
+        XSIMD_INLINE batch<T, A> ldexp(const batch<T, A>& x, const batch<as_integer_t<T>, A>& exp, requires_arch<sve>) noexcept
+        {
+            return svscale_x(detail::sve_ptrue<T>(), x, exp);
+        }

-    } // namespace XSIMD_SVE_NAMESPACE
     } // namespace kernel
 } // namespace xsimd
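The remaining math kernels round-trip through predicated intrinsics (`svrintx`, `svcvt_*`, `svscale`). For instance, `ldexp(x, e)` computes `x * 2^e` lane-wise; a portable sketch:

```cpp
#include <cstdint>
#include <iostream>

#include "xsimd/xsimd.hpp"

int main()
{
    // ldexp scales by a power of two per lane; on SVE this maps onto a
    // single svscale instruction as in the kernel above.
    xsimd::batch<float> x(1.5f);
    xsimd::batch<int32_t> e(3);
    std::cout << xsimd::ldexp(x, e).get(0) << '\n'; // prints 12
}
```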
diff --git a/include/xsimd/types/xsimd_api.hpp b/include/xsimd/types/xsimd_api.hpp
index 5e6b3a209..1be681777 100644
--- a/include/xsimd/types/xsimd_api.hpp
+++ b/include/xsimd/types/xsimd_api.hpp
@@ -20,6 +20,7 @@
 #include "../arch/xsimd_isa.hpp"
 #include "../types/xsimd_batch.hpp"
 #include "../types/xsimd_traits.hpp"
+#include "../utils/xsimd_type_traits.hpp"

 namespace xsimd
 {
diff --git a/include/xsimd/types/xsimd_batch.hpp b/include/xsimd/types/xsimd_batch.hpp
index 1cac2abc2..b584a2d81 100644
--- a/include/xsimd/types/xsimd_batch.hpp
+++ b/include/xsimd/types/xsimd_batch.hpp
@@ -18,15 +18,11 @@
 #include "../config/xsimd_arch.hpp"
 #include "../config/xsimd_macros.hpp"
 #include "../memory/xsimd_alignment.hpp"
+#include "./xsimd_batch_fwd.hpp"
 #include "./xsimd_utils.hpp"

 namespace xsimd
 {
-    template <class T, class A, bool... Values>
-    struct batch_bool_constant;
-    template <class T, class A = default_arch>
-    class batch;
-
     namespace types
     {
         template <class T, class A>
@@ -301,7 +297,7 @@ namespace xsimd
      * @tparam T the type of the predicated values.
      * @tparam A the architecture this batch is tied too.
      **/
-    template <class T, class A = default_arch>
+    template <class T, class A>
     class batch_bool : public types::get_bool_simd_register_t<T, A>
     {
         using base_type = types::get_bool_simd_register_t<T, A>;
diff --git a/include/xsimd/types/xsimd_batch_fwd.hpp b/include/xsimd/types/xsimd_batch_fwd.hpp
new file mode 100644
index 000000000..62e3cbba7
--- /dev/null
+++ b/include/xsimd/types/xsimd_batch_fwd.hpp
@@ -0,0 +1,41 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
+ * Martin Renou                                                            *
+ * Copyright (c) QuantStack                                                *
+ * Copyright (c) Serge Guelton                                             *
+ *                                                                         *
+ * Distributed under the terms of the BSD 3-Clause License.                *
+ *                                                                         *
+ * The full license is in the file LICENSE, distributed with this software.*
+ ****************************************************************************/
+
+#ifndef XSIMD_BATCH_FWD_HPP
+#define XSIMD_BATCH_FWD_HPP
+
+#include "../config/xsimd_config.hpp"
+
+// TODO: this somewhat duplicates XSIMD_DEFAULT_ARCH, which is only available
+// when an architecture is defined.
+#if defined(XSIMD_NO_SUPPORTED_ARCHITECTURE)
+#define XSIMD_BATCH_DEFAULT_ARCH_IMPL void
+#else
+#include "../config/xsimd_arch.hpp"
+#define XSIMD_BATCH_DEFAULT_ARCH_IMPL default_arch
+#endif // XSIMD_NO_SUPPORTED_ARCHITECTURE
+
+namespace xsimd
+{
+    template <class T, class A = XSIMD_BATCH_DEFAULT_ARCH_IMPL>
+    class batch_bool;
+
+    template <class T, class A, bool... Values>
+    struct batch_bool_constant;
+
+    template <class T, class A = XSIMD_BATCH_DEFAULT_ARCH_IMPL>
+    class batch;
+
+    template <class T, class A, T... Values>
+    struct batch_constant;
+}
+
+#endif
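The point of the new forward-declaration header is that downstream interfaces can name `batch`/`batch_bool` (with the correct default architecture argument) without pulling in the full library. A hypothetical downstream header, as a sketch (assuming xsimd is on the include path; `mylib` and `horizontal_sum` are illustrative names, not part of xsimd):

```cpp
// my_kernels.hpp -- declares an interface over xsimd batches without
// including the heavyweight xsimd headers.
#ifndef MY_KERNELS_HPP
#define MY_KERNELS_HPP

#include "xsimd/types/xsimd_batch_fwd.hpp"

namespace mylib
{
    // An incomplete type is fine here: only a reference is involved, and the
    // default architecture argument comes from the forward declaration.
    float horizontal_sum(xsimd::batch<float> const& v);
}

#endif
```

Only the translation unit that defines `horizontal_sum` needs the full `xsimd/xsimd.hpp`.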
diff --git a/include/xsimd/types/xsimd_neon_register.hpp b/include/xsimd/types/xsimd_neon_register.hpp
index ef9973828..ae76e6dc2 100644
--- a/include/xsimd/types/xsimd_neon_register.hpp
+++ b/include/xsimd/types/xsimd_neon_register.hpp
@@ -12,8 +12,9 @@
 #ifndef XSIMD_NEON_REGISTER_HPP
 #define XSIMD_NEON_REGISTER_HPP

-#include "xsimd_common_arch.hpp"
-#include "xsimd_register.hpp"
+#include "../utils/xsimd_type_traits.hpp"
+#include "./xsimd_common_arch.hpp"
+#include "./xsimd_register.hpp"

 #if XSIMD_WITH_NEON
 #include <arm_neon.h>
@@ -103,40 +104,10 @@ namespace xsimd

     namespace detail
     {
-        template <size_t size>
-        struct get_unsigned_type;
-
-        template <>
-        struct get_unsigned_type<1>
-        {
-            using type = uint8_t;
-        };
-
-        template <>
-        struct get_unsigned_type<2>
-        {
-            using type = uint16_t;
-        };
-
-        template <>
-        struct get_unsigned_type<4>
-        {
-            using type = uint32_t;
-        };
-
-        template <>
-        struct get_unsigned_type<8>
-        {
-            using type = uint64_t;
-        };
-
-        template <size_t size>
-        using get_unsigned_type_t = typename get_unsigned_type<size>::type;
-
         template <class T, class A>
         struct neon_bool_simd_register
         {
-            using type = simd_register<get_unsigned_type_t<sizeof(T)>, A>;
+            using type = simd_register<sized_uint_t<sizeof(T)>, A>;
         };
     }
diff --git a/include/xsimd/types/xsimd_sve_register.hpp b/include/xsimd/types/xsimd_sve_register.hpp
index 7ac748f8d..a0d1b5b99 100644
--- a/include/xsimd/types/xsimd_sve_register.hpp
+++ b/include/xsimd/types/xsimd_sve_register.hpp
@@ -67,55 +67,115 @@ namespace xsimd
         struct sve_vector_type_impl;

         template <>
-        struct sve_vector_type_impl<8>
+        struct sve_vector_type_impl<1>
         {
             using signed_type = sve_int8_t;
             using unsigned_type = sve_uint8_t;
             using floating_point_type = void;
+            using sizeless_unsigned_type = svuint8_t;
+            using sizeless_signed_type = svint8_t;
+            using sizeless_floating_point_type = void;
         };

         template <>
-        struct sve_vector_type_impl<16>
+        struct sve_vector_type_impl<2>
         {
             using signed_type = sve_int16_t;
             using unsigned_type = sve_uint16_t;
             using floating_point_type = void;
+            using sizeless_unsigned_type = svuint16_t;
+            using sizeless_signed_type = svint16_t;
+            using sizeless_floating_point_type = void;
         };

         template <>
-        struct sve_vector_type_impl<32>
+        struct sve_vector_type_impl<4>
         {
             using signed_type = sve_int32_t;
             using unsigned_type = sve_uint32_t;
             using floating_point_type = sve_float32_t;
+            using sizeless_unsigned_type = svuint32_t;
+            using sizeless_signed_type = svint32_t;
+            using sizeless_floating_point_type = svfloat32_t;
         };

         template <>
-        struct sve_vector_type_impl<64>
+        struct sve_vector_type_impl<8>
         {
             using signed_type = sve_int64_t;
             using unsigned_type = sve_uint64_t;
             using floating_point_type = sve_float64_t;
+            using sizeless_unsigned_type = svuint64_t;
+            using sizeless_signed_type = svint64_t;
+            using sizeless_floating_point_type = svfloat64_t;
         };

         template <class T>
-        using signed_int_sve_vector_type = typename sve_vector_type_impl<8 * sizeof(T)>::signed_type;
+        using signed_int_sve_vector_type = typename sve_vector_type_impl<sizeof(T)>::signed_type;
+
+        template <class T>
+        using unsigned_int_sve_vector_type = typename sve_vector_type_impl<sizeof(T)>::unsigned_type;

         template <class T>
-        using unsigned_int_sve_vector_type = typename sve_vector_type_impl<8 * sizeof(T)>::unsigned_type;
+        using floating_point_sve_vector_type = typename sve_vector_type_impl<sizeof(T)>::floating_point_type;

         template <class T>
-        using floating_point_sve_vector_type = typename sve_vector_type_impl<8 * sizeof(T)>::floating_point_type;
+        using sizeless_signed_int_sve_vector_type = typename sve_vector_type_impl<sizeof(T)>::sizeless_signed_type;

         template <class T>
-        using signed_int_or_floating_point_sve_vector_type = std::conditional_t<std::is_floating_point<T>::value,
-                                                                                floating_point_sve_vector_type<T>,
-                                                                                signed_int_sve_vector_type<T>>;
+        using sizeless_unsigned_int_sve_vector_type = typename sve_vector_type_impl<sizeof(T)>::sizeless_unsigned_type;

         template <class T>
-        using sve_vector_type = std::conditional_t<std::is_signed<T>::value,
-                                                   signed_int_or_floating_point_sve_vector_type<T>,
-                                                   unsigned_int_sve_vector_type<T>>;
+        using sizeless_floating_point_sve_vector_type = typename sve_vector_type_impl<sizeof(T)>::sizeless_floating_point_type;
+
+        template <class T, class Enable = void>
+        struct sve_vector_impl;
+
+        template <class T>
+        struct sve_vector_impl<T, std::enable_if_t<std::is_floating_point<T>::value>>
+        {
+            using type = floating_point_sve_vector_type<T>;
+        };
+
+        template <class T>
+        struct sve_vector_impl<T, std::enable_if_t<std::is_integral<T>::value && std::is_signed<T>::value>>
+        {
+            using type = signed_int_sve_vector_type<T>;
+        };
+
+        template <class T>
+        struct sve_vector_impl<T, std::enable_if_t<std::is_integral<T>::value && std::is_unsigned<T>::value>>
+        {
+            using type = unsigned_int_sve_vector_type<T>;
+        };
+
+        template <class T, class Enable = void>
+        struct sizeless_sve_vector_impl;
+
+        template <class T>
+        struct sizeless_sve_vector_impl<T, std::enable_if_t<std::is_floating_point<T>::value>>
+        {
+            using type = sizeless_floating_point_sve_vector_type<T>;
+        };
+
+        template <class T>
+        struct sizeless_sve_vector_impl<T, std::enable_if_t<std::is_integral<T>::value && std::is_signed<T>::value>>
+        {
+            using type = sizeless_signed_int_sve_vector_type<T>;
+        };
+
+        template <class T>
+        struct sizeless_sve_vector_impl<T, std::enable_if_t<std::is_integral<T>::value && std::is_unsigned<T>::value>>
+        {
+            using type = sizeless_unsigned_int_sve_vector_type<T>;
+        };
+
+        template <class T>
+        using sve_vector_type = typename detail::sve_vector_impl<T>::type;
+
+        template <class T>
+        using sizeless_sve_vector_type = typename detail::sizeless_sve_vector_impl<T>::type;
+
     } // namespace detail

     XSIMD_DECLARE_SIMD_REGISTER(signed char, sve, detail::sve_vector_type<signed char>);
diff --git a/include/xsimd/types/xsimd_traits.hpp b/include/xsimd/types/xsimd_traits.hpp
index 34b47c7ee..71ead2cc3 100644
--- a/include/xsimd/types/xsimd_traits.hpp
+++ b/include/xsimd/types/xsimd_traits.hpp
@@ -12,10 +12,16 @@
 #ifndef XSIMD_TRAITS_HPP
 #define XSIMD_TRAITS_HPP

+#include <complex>
 #include <cstddef>
 #include <type_traits>

-#include "xsimd_batch.hpp"
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+#include <xtl/xcomplex.hpp>
+#endif
+
+#include "./xsimd_batch_fwd.hpp"
+#include "./xsimd_utils.hpp"

 /**
  * high level type traits
@@ -397,53 +403,6 @@ namespace xsimd

     template <class T>
     using mask_type_t = typename mask_type<T>::type;
-
-    namespace detail
-    {
-        template <class T>
-        struct widen
-        {
-            using type = std::make_signed_t<typename widen<std::make_unsigned_t<T>>::type>;
-        };
-
-        template <>
-        struct widen<uint32_t>
-        {
-            using type = uint64_t;
-        };
-        template <>
-        struct widen<uint16_t>
-        {
-            using type = uint32_t;
-        };
-        template <>
-        struct widen<uint8_t>
-        {
-            using type = uint16_t;
-        };
-        template <>
-        struct widen<int32_t>
-        {
-            using type = int64_t;
-        };
-        template <>
-        struct widen<int16_t>
-        {
-            using type = int32_t;
-        };
-        template <>
-        struct widen<int8_t>
-        {
-            using type = int16_t;
-        };
-        template <>
-        struct widen<float>
-        {
-            using type = double;
-        };
-    }
-
-    template <class T>
-    using widen_t = typename detail::widen<T>::type;
 }

 #endif
diff --git a/include/xsimd/types/xsimd_utils.hpp b/include/xsimd/types/xsimd_utils.hpp
index aa11b90db..3284e97c0 100644
--- a/include/xsimd/types/xsimd_utils.hpp
+++ b/include/xsimd/types/xsimd_utils.hpp
@@ -23,15 +23,10 @@
 #include "xtl/xcomplex.hpp"
 #endif

+#include "./xsimd_batch_fwd.hpp"
+
 namespace xsimd
 {
-
-    template <class T, class A>
-    class batch;
-
-    template <class T, class A>
-    class batch_bool;
-
     /**************
      * index      *
      **************/
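`sve_vector_impl`/`sizeless_sve_vector_impl` replace the old nested `std::conditional_t` chain with three mutually exclusive `enable_if` partial specializations. The same dispatch pattern, reduced to standard C++ so it compiles anywhere (type names here are illustrative, not xsimd's):

```cpp
#include <cstdint>
#include <type_traits>

// Pick a representation type by inspecting T's category, exactly the
// three-way split used by sve_vector_impl.
template <class T, class Enable = void>
struct storage;

template <class T>
struct storage<T, std::enable_if_t<std::is_floating_point<T>::value>>
{
    using type = double;
};

template <class T>
struct storage<T, std::enable_if_t<std::is_integral<T>::value && std::is_signed<T>::value>>
{
    using type = std::int64_t;
};

template <class T>
struct storage<T, std::enable_if_t<std::is_integral<T>::value && std::is_unsigned<T>::value>>
{
    using type = std::uint64_t;
};

static_assert(std::is_same<storage<float>::type, double>::value, "fp branch");
static_assert(std::is_same<storage<int>::type, std::int64_t>::value, "signed branch");
static_assert(std::is_same<storage<unsigned>::type, std::uint64_t>::value, "unsigned branch");

int main() {}
```

Unlike a `conditional_t` chain, each branch instantiates only the alias it selects, which matters here because `floating_point_sve_vector_type<T>` is `void` for 1- and 2-byte types.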
a/include/xsimd/utils/xsimd_type_traits.hpp b/include/xsimd/utils/xsimd_type_traits.hpp
new file mode 100644
index 000000000..a3f6842f5
--- /dev/null
+++ b/include/xsimd/utils/xsimd_type_traits.hpp
@@ -0,0 +1,127 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
+ * Martin Renou                                                            *
+ * Copyright (c) QuantStack                                                *
+ * Copyright (c) Serge Guelton                                             *
+ *                                                                         *
+ * Distributed under the terms of the BSD 3-Clause License.                *
+ *                                                                         *
+ * The full license is in the file LICENSE, distributed with this software.*
+ ****************************************************************************/
+
+#ifndef XSIMD_TYPE_TRAITS_HPP
+#define XSIMD_TYPE_TRAITS_HPP
+
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+namespace xsimd
+{
+    namespace detail
+    {
+        template <size_t S>
+        struct sized_num_types;
+
+        template <>
+        struct sized_num_types<1>
+        {
+            using signed_type = std::int8_t;
+            using unsigned_type = std::uint8_t;
+            using floating_point_type = void;
+        };
+
+        template <>
+        struct sized_num_types<2>
+        {
+            using signed_type = std::int16_t;
+            using unsigned_type = std::uint16_t;
+            using floating_point_type = void;
+        };
+
+        template <>
+        struct sized_num_types<4>
+        {
+            using signed_type = std::int32_t;
+            using unsigned_type = std::uint32_t;
+            using floating_point_type = float;
+        };
+
+        template <>
+        struct sized_num_types<8>
+        {
+            using signed_type = std::int64_t;
+            using unsigned_type = std::uint64_t;
+            using floating_point_type = double;
+        };
+    }
+
+    /**
+     * @ingroup batch_traits
+     *
+     * Signed integer type with exactly @c S bytes (1, 2, 4, or 8).
+     *
+     * @tparam S size in bytes.
+     */
+    template <size_t S>
+    using sized_int_t = typename detail::sized_num_types<S>::signed_type;
+
+    /**
+     * @ingroup batch_traits
+     *
+     * Unsigned integer type with exactly @c S bytes (1, 2, 4, or 8).
+     *
+     * @tparam S size in bytes.
+     */
+    template <size_t S>
+    using sized_uint_t = typename detail::sized_num_types<S>::unsigned_type;
+
+    /**
+     * @ingroup batch_traits
+     *
+     * Floating-point type with exactly @c S bytes (4 for @c float, 8 for @c double).
+     * Yields @c void for sizes without a standard floating-point type (1, 2).
+     *
+     * @tparam S size in bytes.
+     */
+    template <size_t S>
+    using sized_fp_t = typename detail::sized_num_types<S>::floating_point_type;
+
+    namespace detail
+    {
+        template <class T, class Enable = void>
+        struct widen;
+
+        template <class T>
+        struct widen<T, std::enable_if_t<std::is_floating_point<T>::value>>
+        {
+            using type = xsimd::sized_fp_t<sizeof(T) * 2>;
+        };
+
+        template <class T>
+        struct widen<T, std::enable_if_t<std::is_integral<T>::value && std::is_signed<T>::value>>
+        {
+            using type = xsimd::sized_int_t<sizeof(T) * 2>;
+        };
+
+        template <class T>
+        struct widen<T, std::enable_if_t<std::is_integral<T>::value && std::is_unsigned<T>::value>>
+        {
+            using type = xsimd::sized_uint_t<sizeof(T) * 2>;
+        };
+    }
+
+    /**
+     * @ingroup batch_traits
+     *
+     * The next-wider arithmetic type for @c T: doubles the size while preserving
+     * signedness for integers and yielding @c double for @c float.
+     * Supported input types: @c [u]int{8,16,32}_t and @c float.
+     *
+     * @tparam T arithmetic type to widen.
+     */
+    template <class T>
+    using widen_t = typename detail::widen<T>::type;
+}
+
+#endif
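A few spot-checks of the new aliases, usable as a compile-time sanity test; this simply mirrors the behaviour documented in the header above:

```cpp
#include <cstdint>
#include <type_traits>

#include "xsimd/utils/xsimd_type_traits.hpp"

static_assert(std::is_same<xsimd::sized_int_t<4>, std::int32_t>::value, "4-byte signed");
static_assert(std::is_same<xsimd::sized_uint_t<1>, std::uint8_t>::value, "1-byte unsigned");
static_assert(std::is_same<xsimd::sized_fp_t<8>, double>::value, "8-byte float");
static_assert(std::is_same<xsimd::widen_t<std::uint16_t>, std::uint32_t>::value, "widen keeps unsignedness");
static_assert(std::is_same<xsimd::widen_t<float>, double>::value, "widen float -> double");

int main() {}
```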
diff --git a/include/xsimd/xsimd.hpp b/include/xsimd/xsimd.hpp
index df90a1b32..ea9087ef9 100644
--- a/include/xsimd/xsimd.hpp
+++ b/include/xsimd/xsimd.hpp
@@ -17,18 +17,20 @@

 #include "arch/xsimd_scalar.hpp"
 #include "memory/xsimd_aligned_allocator.hpp"
+#include "types/xsimd_batch_fwd.hpp"

 #if defined(XSIMD_NO_SUPPORTED_ARCHITECTURE)
-// no type definition or anything apart from scalar definition and aligned allocator
 namespace xsimd
 {
-    template <class T, class A = void>
+    // no type definition or anything apart from scalar definition and aligned allocator
+    template <class T, class A>
     class batch
     {
         static constexpr bool supported_architecture = sizeof(A*) == 0; // type-dependant but always false
         static_assert(supported_architecture, "No SIMD architecture detected, cannot instantiate a batch");
     };
 }
+
 #else
 #include "types/xsimd_batch.hpp"
 #include "types/xsimd_batch_constant.hpp"
@@ -36,5 +38,6 @@

 // This include must come last
 #include "types/xsimd_api.hpp"

-#endif
+#endif // XSIMD_NO_SUPPORTED_ARCHITECTURE
+
 #endif