diff --git a/include/xsimd/arch/common/xsimd_common_cast.hpp b/include/xsimd/arch/common/xsimd_common_cast.hpp index 1226c887c..b64613dfb 100644 --- a/include/xsimd/arch/common/xsimd_common_cast.hpp +++ b/include/xsimd/arch/common/xsimd_common_cast.hpp @@ -12,7 +12,10 @@ #ifndef XSIMD_COMMON_CAST_HPP #define XSIMD_COMMON_CAST_HPP -#include "../../types/xsimd_traits.hpp" +#include + +#include "../../config/xsimd_macros.hpp" +#include "../../utils/xsimd_type_traits.hpp" namespace xsimd { diff --git a/include/xsimd/arch/xsimd_neon.hpp b/include/xsimd/arch/xsimd_neon.hpp index 4af19a650..292dab7c3 100644 --- a/include/xsimd/arch/xsimd_neon.hpp +++ b/include/xsimd/arch/xsimd_neon.hpp @@ -21,6 +21,7 @@ #include "../types/xsimd_neon_register.hpp" #include "../types/xsimd_utils.hpp" +#include "../utils/xsimd_type_traits.hpp" #include "./common/xsimd_common_bit.hpp" #include "./common/xsimd_common_cast.hpp" diff --git a/include/xsimd/arch/xsimd_rvv.hpp b/include/xsimd/arch/xsimd_rvv.hpp index 73183a086..6d1eae59d 100644 --- a/include/xsimd/arch/xsimd_rvv.hpp +++ b/include/xsimd/arch/xsimd_rvv.hpp @@ -16,6 +16,7 @@ #include "../config/xsimd_macros.hpp" #include "../types/xsimd_batch_constant.hpp" #include "../types/xsimd_rvv_register.hpp" +#include "../utils/xsimd_type_traits.hpp" #include "./xsimd_constants.hpp" // This set of macros allows the synthesis of identifiers using a template and @@ -86,32 +87,32 @@ // for the function signature argument(s) to XSIMD_RVV_OVERLOAD. That signature can // also reference the template argument T, because it's a text substitution // into the template. -#define XSIMD_RVV_WRAPPER_HEAD(NAME, SIGNATURE, ...) \ - namespace NAME##_cruft \ - { \ - template \ - struct ctx \ - { \ - static constexpr size_t width = XSIMD_RVV_BITS; \ - static constexpr size_t vl = width / (sizeof(T) * 8); \ - using vec = rvv_reg_t; \ - using uvec = rvv_reg_t, width>; \ - using svec = rvv_reg_t, width>; \ - using fvec = rvv_reg_t, width>; \ - using bvec = rvv_bool_t; \ - using scalar_vec = rvv_reg_t; \ - using wide_vec = rvv_reg_t; \ - using narrow_vec = rvv_reg_t; \ - using type = SIGNATURE; \ - }; \ - template \ - using sig_t = typename ctx::type; \ - template \ - struct impl \ - { \ - void operator()() const noexcept {}; \ - }; \ - template \ +#define XSIMD_RVV_WRAPPER_HEAD(NAME, SIGNATURE, ...) \ + namespace NAME##_cruft \ + { \ + template \ + struct ctx \ + { \ + static constexpr size_t width = XSIMD_RVV_BITS; \ + static constexpr size_t vl = width / (sizeof(T) * 8); \ + using vec = rvv_reg_t; \ + using uvec = rvv_reg_t, width>; \ + using svec = rvv_reg_t, width>; \ + using fvec = rvv_reg_t, width>; \ + using bvec = rvv_bool_t; \ + using scalar_vec = rvv_reg_t; \ + using wide_vec = rvv_reg_t; \ + using narrow_vec = rvv_reg_t; \ + using type = SIGNATURE; \ + }; \ + template \ + using sig_t = typename ctx::type; \ + template \ + struct impl \ + { \ + void operator()() const noexcept {}; \ + }; \ + template \ using impl_t = impl>; #define XSIMD_RVV_WRAPPER_HEAD_NOVL(...) 
XSIMD_RVV_WRAPPER_HEAD(__VA_ARGS__) @@ -294,57 +295,12 @@ namespace xsimd template using rvv_bool_t = types::detail::rvv_bool_t; - template - struct as_signed_relaxed; - template <> - struct as_signed_relaxed<1> - { - using type = int8_t; - }; - template <> - struct as_signed_relaxed<2> - { - using type = int16_t; - }; - template <> - struct as_signed_relaxed<4> - { - using type = int32_t; - }; - template <> - struct as_signed_relaxed<8> - { - using type = int64_t; - }; - template - using as_signed_relaxed_t = typename as_signed_relaxed::type; - template - struct as_unsigned_relaxed; - template <> - struct as_unsigned_relaxed<1> + template + struct as_float_relaxed { - using type = uint8_t; + using type = xsimd::sized_fp_t; }; template <> - struct as_unsigned_relaxed<2> - { - using type = uint16_t; - }; - template <> - struct as_unsigned_relaxed<4> - { - using type = uint32_t; - }; - template <> - struct as_unsigned_relaxed<8> - { - using type = uint64_t; - }; - template - using as_unsigned_relaxed_t = typename as_unsigned_relaxed::type; - template - struct as_float_relaxed; - template <> struct as_float_relaxed<1> { using type = int8_t; @@ -354,16 +310,6 @@ namespace xsimd { using type = int16_t; }; - template <> - struct as_float_relaxed<4> - { - using type = float; - }; - template <> - struct as_float_relaxed<8> - { - using type = double; - }; template using as_float_relaxed_t = typename as_float_relaxed::type; diff --git a/include/xsimd/arch/xsimd_sve.hpp b/include/xsimd/arch/xsimd_sve.hpp index 5be471e93..6636f4c37 100644 --- a/include/xsimd/arch/xsimd_sve.hpp +++ b/include/xsimd/arch/xsimd_sve.hpp @@ -16,15 +16,8 @@ #include #include -#include "../config/xsimd_macros.hpp" #include "../types/xsimd_sve_register.hpp" -// Define a inline namespace with the explicit SVE vector size to avoid ODR violation -// When dynamically dispatching between different SVE sizes. -// While most code is safe from ODR violation as the size is already encoded in the -// register (and hence batch) types, utilities can quickly fall prone to this issue. 
-#define XSIMD_SVE_NAMESPACE XSIMD_CONCAT(sve, XSIMD_SVE_BITS) - namespace xsimd { template @@ -32,1243 +25,1170 @@ namespace xsimd namespace kernel { - inline namespace XSIMD_SVE_NAMESPACE + namespace detail { - namespace detail_sve - { - using xsimd::index; - using xsimd::types::detail::sve_vector_type; - - // predicate creation - XSIMD_INLINE svbool_t sve_ptrue_impl(index<1>) noexcept { return svptrue_b8(); } - XSIMD_INLINE svbool_t sve_ptrue_impl(index<2>) noexcept { return svptrue_b16(); } - XSIMD_INLINE svbool_t sve_ptrue_impl(index<4>) noexcept { return svptrue_b32(); } - XSIMD_INLINE svbool_t sve_ptrue_impl(index<8>) noexcept { return svptrue_b64(); } - - template - XSIMD_INLINE svbool_t sve_ptrue() noexcept { return sve_ptrue_impl(index {}); } - - // predicate loading - template - XSIMD_INLINE svbool_t sve_pmask() noexcept { return svdupq_b64(M0, M1); } - template - XSIMD_INLINE svbool_t sve_pmask() noexcept { return svdupq_b32(M0, M1, M2, M3); } - template - XSIMD_INLINE svbool_t sve_pmask() noexcept { return svdupq_b16(M0, M1, M2, M3, M4, M5, M6, M7); } - template - XSIMD_INLINE svbool_t sve_pmask() noexcept { return svdupq_b8(M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, M10, M11, M12, M13, M14, M15); } - - // count active lanes in a predicate - XSIMD_INLINE uint64_t sve_pcount_impl(svbool_t p, index<1>) noexcept { return svcntp_b8(p, p); } - XSIMD_INLINE uint64_t sve_pcount_impl(svbool_t p, index<2>) noexcept { return svcntp_b16(p, p); } - XSIMD_INLINE uint64_t sve_pcount_impl(svbool_t p, index<4>) noexcept { return svcntp_b32(p, p); } - XSIMD_INLINE uint64_t sve_pcount_impl(svbool_t p, index<8>) noexcept { return svcntp_b64(p, p); } - - template - XSIMD_INLINE uint64_t sve_pcount(svbool_t p) noexcept { return sve_pcount_impl(p, index {}); } - - // enable for signed integers - template - using sve_enable_signed_int_t = std::enable_if_t::value && std::is_signed::value, int>; - - // enable for unsigned integers - template - using sve_enable_unsigned_int_t = std::enable_if_t::value && !std::is_signed::value, int>; - - // enable for floating points - template - using sve_enable_floating_point_t = std::enable_if_t::value, int>; - - // enable for signed integers or floating points - template - using sve_enable_signed_int_or_floating_point_t = std::enable_if_t::value, int>; - - // enable for all SVE supported types - template - using sve_enable_all_t = std::enable_if_t::value, int>; - - // Trait describing the SVE types that correspond to a scalar, - // parameterised by (byte size, signedness, floating-point-ness). - // - // `scalar` is the matching fixed-width scalar (int8_t, ..., float, - // double). SVE load/store intrinsics are overloaded on these - // pointer types, so remapping integers through `scalar` avoids - // platform quirks such as darwin arm64's `long` vs `long long` - // distinction and rejects `char` as an element type. - // - // `sizeless` is the matching sizeless SVE type. xsimd stores SVE - // vectors as fixed-size attributed types (arm_sve_vector_bits), - // which clang treats as implicitly convertible to every sizeless - // SVE type — including multi-vector tuples — making the overloaded - // svreinterpret_*/svsel/etc. intrinsics ambiguous. Static-casting - // to `sizeless` first collapses the overload set to the single - // 1-vector candidate. 
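// [Editor's illustrative aside, not part of the patch] A minimal sketch of the
// ambiguity described in the comment above, assuming clang with
// -msve-vector-bits=512; the helper name `bits_of` is hypothetical. The
// fixed-size typedef converts implicitly to svfloat32_t but also to the tuple
// types (svfloat32x2_t, ...), so an overloaded intrinsic sees several viable
// candidates; a static_cast to the single sizeless 1-vector type collapses
// the overload set before the call.
#include <arm_sve.h>

typedef svfloat32_t fixed_f32_t __attribute__((arm_sve_vector_bits(512)));

svuint32_t bits_of(fixed_f32_t v)
{
    // svreinterpret_u32(v) would be ambiguous here: v converts to
    // svfloat32_t, svfloat32x2_t, svfloat32x3_t, svfloat32x4_t, ...
    return svreinterpret_u32(static_cast<svfloat32_t>(v)); // one candidate
}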
- template - struct sve_type; - template <> - struct sve_type<1, true, false> - { - using scalar = int8_t; - using sizeless = svint8_t; - }; - template <> - struct sve_type<1, false, false> - { - using scalar = uint8_t; - using sizeless = svuint8_t; - }; - template <> - struct sve_type<2, true, false> - { - using scalar = int16_t; - using sizeless = svint16_t; - }; - template <> - struct sve_type<2, false, false> - { - using scalar = uint16_t; - using sizeless = svuint16_t; - }; - template <> - struct sve_type<4, true, false> - { - using scalar = int32_t; - using sizeless = svint32_t; - }; - template <> - struct sve_type<4, false, false> - { - using scalar = uint32_t; - using sizeless = svuint32_t; - }; - template <> - struct sve_type<8, true, false> - { - using scalar = int64_t; - using sizeless = svint64_t; - }; - template <> - struct sve_type<8, false, false> - { - using scalar = uint64_t; - using sizeless = svuint64_t; - }; - template <> - struct sve_type<4, true, true> - { - using scalar = float; - using sizeless = svfloat32_t; - }; - template <> - struct sve_type<8, true, true> - { - using scalar = double; - using sizeless = svfloat64_t; - }; + using xsimd::index; + using xsimd::types::detail::sve_vector_type; + + // predicate creation + XSIMD_INLINE svbool_t sve_ptrue_impl(index<1>) noexcept { return svptrue_b8(); } + XSIMD_INLINE svbool_t sve_ptrue_impl(index<2>) noexcept { return svptrue_b16(); } + XSIMD_INLINE svbool_t sve_ptrue_impl(index<4>) noexcept { return svptrue_b32(); } + XSIMD_INLINE svbool_t sve_ptrue_impl(index<8>) noexcept { return svptrue_b64(); } + + template + svbool_t sve_ptrue() noexcept { return sve_ptrue_impl(index {}); } + + // predicate loading + template + svbool_t sve_pmask() noexcept { return svdupq_b64(M0, M1); } + template + svbool_t sve_pmask() noexcept { return svdupq_b32(M0, M1, M2, M3); } + template + svbool_t sve_pmask() noexcept { return svdupq_b16(M0, M1, M2, M3, M4, M5, M6, M7); } + template + svbool_t sve_pmask() noexcept { return svdupq_b8(M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, M10, M11, M12, M13, M14, M15); } + + // count active lanes in a predicate + XSIMD_INLINE uint64_t sve_pcount_impl(svbool_t p, index<1>) noexcept { return svcntp_b8(p, p); } + XSIMD_INLINE uint64_t sve_pcount_impl(svbool_t p, index<2>) noexcept { return svcntp_b16(p, p); } + XSIMD_INLINE uint64_t sve_pcount_impl(svbool_t p, index<4>) noexcept { return svcntp_b32(p, p); } + XSIMD_INLINE uint64_t sve_pcount_impl(svbool_t p, index<8>) noexcept { return svcntp_b64(p, p); } + + template + XSIMD_INLINE uint64_t sve_pcount(svbool_t p) noexcept { return sve_pcount_impl(p, index {}); } + + // enable for signed integers + template + using sve_enable_signed_int_t = std::enable_if_t::value && std::is_signed::value, int>; + + // enable for unsigned integers + template + using sve_enable_unsigned_int_t = std::enable_if_t::value && !std::is_signed::value, int>; + + // enable for floating points + template + using sve_enable_floating_point_t = std::enable_if_t::value, int>; + + // enable for signed integers or floating points + template + using sve_enable_signed_int_or_floating_point_t = std::enable_if_t::value, int>; + + // enable for all SVE supported types + template + using sve_enable_all_t = std::enable_if_t::value, int>; + + // `sizeless` is the matching sizeless SVE type. 
xsimd stores SVE
+            // vectors as fixed-size attributed types (arm_sve_vector_bits),
+            // which clang treats as implicitly convertible to every sizeless
+            // SVE type — including multi-vector tuples — making the overloaded
+            // svreinterpret_*/svsel/etc. intrinsics ambiguous. Static-casting
+            // to `sizeless` first collapses the overload set to the single
+            // 1-vector candidate.
+            template <class T>
+            using sve_sizeless_t = xsimd::types::detail::sizeless_sve_vector_type<T>;
+
+            // Remap integer Ts to their matching fixed-width counterpart
+            // so svld1/svst1 see the pointer type their overload set expects;
+            // pass non-integer Ts through unchanged.
+            template <class T, bool = std::is_integral<std::decay_t<T>>::value>
+            struct sve_fix_integer_impl
+            {
+                using type = T;
+            };
+            template <class T>
+            struct sve_fix_integer_impl<T, true>
+            {
+                using type = std::conditional_t<std::is_signed<T>::value,
+                                                sized_int_t<sizeof(T)>, sized_uint_t<sizeof(T)>>;
+            };
+
+            // SVE load/store intrinsics are overloaded on these pointer types
+            // for integers, but some platforms use distinct underlying types,
+            // e.g. `long` vs `long long` or `char` vs `int8_t`.
+            // We remap the element type to avoid these mismatches.
+            template <class T>
+            using sve_fix_char_t = typename sve_fix_integer_impl<T>::type;
+        } // namespace detail
+
+        /*********
+         * Load *
+         *********/
+
+        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<sve>) noexcept
+        {
+            return svld1(detail::sve_ptrue<T>(), reinterpret_cast<detail::sve_fix_char_t<T> const*>(src));
+        }
-                template <class T>
-                using sve_type_for = sve_type<sizeof(T), std::is_signed<T>::value, std::is_floating_point<T>::value>;
+        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<sve>) noexcept
+        {
+            return load_aligned<A>(src, convert<T>(), sve {});
+        }
-                template <class T>
-                using sve_sizeless_t = typename sve_type_for<T>::sizeless;
+        // load_masked
+        template <class A, class T, bool... b, class Mode, detail::sve_enable_all_t<T> = 0>
+        XSIMD_INLINE batch<T, A> load_masked(T const* mem, batch_bool_constant<T, A, b...>, Mode, requires_arch<sve>) noexcept
+        {
+            return svld1(detail::sve_pmask<T, b...>(), reinterpret_cast<detail::sve_fix_char_t<T> const*>(mem));
+        }
-                // Remap integer Ts to their matching fixed-width counterpart (via
-                // sve_type::scalar) so svld1/svst1 see the pointer type their
-                // overload set expects; pass non-integer Ts through unchanged.
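// [Editor's illustrative aside, not part of the patch] What the remap buys,
// assuming this sits inside xsimd::kernel with <type_traits> available and
// sized_int_t/sized_uint_t coming from the new xsimd_type_traits.hpp header.
// `char` is a distinct type from both signed char and unsigned char, and on
// some platforms int64_t is `long long` while an overload set is declared
// with `long`; routing the element type through sve_fix_char_t lands on a
// fixed-width type the svld1/svst1 overload sets actually contain.
static_assert(std::is_same<detail::sve_fix_char_t<float>, float>::value,
              "non-integer types pass through unchanged");
static_assert(std::is_same<detail::sve_fix_char_t<unsigned long>,
                           sized_uint_t<sizeof(unsigned long)>>::value,
              "integers remap to their fixed-width equivalents");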
- template >::value> - struct sve_fix_integer_impl - { - using type = T; - }; - template - struct sve_fix_integer_impl - { - using type = typename sve_type_for>::scalar; - }; + // load_complex + template = 0> + XSIMD_INLINE batch, A> load_complex_aligned(std::complex const* mem, convert>, requires_arch) noexcept + { + const T* buf = reinterpret_cast(mem); + const auto tmp = svld2(detail::sve_ptrue(), buf); + const auto real = svget2(tmp, 0); + const auto imag = svget2(tmp, 1); + return batch, A> { real, imag }; + } + + template = 0> + XSIMD_INLINE batch, A> load_complex_unaligned(std::complex const* mem, convert>, requires_arch) noexcept + { + return load_complex_aligned(mem, convert> {}, sve {}); + } - template - using sve_fix_char_t = typename sve_fix_integer_impl::type; - } // namespace detail_sve + /********* + * Store * + *********/ - /********* - * Load * - *********/ + template = 0> + XSIMD_INLINE void store_aligned(T* dst, batch const& src, requires_arch) noexcept + { + svst1(detail::sve_ptrue(), reinterpret_cast*>(dst), src); + } - template = 0> - XSIMD_INLINE batch load_aligned(T const* src, convert, requires_arch) noexcept - { - return svld1(detail_sve::sve_ptrue(), reinterpret_cast const*>(src)); - } + template = 0> + XSIMD_INLINE void store_unaligned(T* dst, batch const& src, requires_arch) noexcept + { + store_aligned(dst, src, sve {}); + } - template = 0> - XSIMD_INLINE batch load_unaligned(T const* src, convert, requires_arch) noexcept - { - return load_aligned(src, convert(), sve {}); - } + // store_complex + template = 0> + XSIMD_INLINE void store_complex_aligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept + { + using v2type = std::conditional_t<(sizeof(T) == 4), svfloat32x2_t, svfloat64x2_t>; + v2type tmp {}; + tmp = svset2(tmp, 0, src.real()); + tmp = svset2(tmp, 1, src.imag()); + T* buf = reinterpret_cast(dst); + svst2(detail::sve_ptrue(), buf, tmp); + } + + template = 0> + XSIMD_INLINE void store_complex_unaligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept + { + store_complex_aligned(dst, src, sve {}); + } - // load_masked - template = 0> - XSIMD_INLINE batch load_masked(T const* mem, batch_bool_constant, Mode, requires_arch) noexcept - { - return svld1(detail_sve::sve_pmask(), reinterpret_cast const*>(mem)); - } + /****************** + * scatter/gather * + ******************/ - // load_complex - template = 0> - XSIMD_INLINE batch, A> load_complex_aligned(std::complex const* mem, convert>, requires_arch) noexcept - { - const T* buf = reinterpret_cast(mem); - const auto tmp = svld2(detail_sve::sve_ptrue(), buf); - const auto real = svget2(tmp, 0); - const auto imag = svget2(tmp, 1); - return batch, A> { real, imag }; - } + namespace detail + { + template + using sve_enable_sg_t = std::enable_if_t<(sizeof(T) == sizeof(U) && (sizeof(T) == 4 || sizeof(T) == 8)), int>; + } - template = 0> - XSIMD_INLINE batch, A> load_complex_unaligned(std::complex const* mem, convert>, requires_arch) noexcept - { - return load_complex_aligned(mem, convert> {}, sve {}); - } + // scatter + template = 0> + XSIMD_INLINE void scatter(batch const& src, T* dst, batch const& index, kernel::requires_arch) noexcept + { + svst1_scatter_index(detail::sve_ptrue(), dst, index.data, src.data); + } - /********* - * Store * - *********/ + // gather + template = 0> + XSIMD_INLINE batch gather(batch const&, T const* src, batch const& index, kernel::requires_arch) noexcept + { + return svld1_gather_index(detail::sve_ptrue(), src, index.data); + } - template = 0> - 
XSIMD_INLINE void store_aligned(T* dst, batch const& src, requires_arch) noexcept - { - svst1(detail_sve::sve_ptrue(), reinterpret_cast*>(dst), src); - } + /******************** + * Scalar to vector * + ********************/ - template = 0> - XSIMD_INLINE void store_unaligned(T* dst, batch const& src, requires_arch) noexcept - { - store_aligned(dst, src, sve {}); - } + // broadcast + template = 0> + XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept + { + return svdup_n_u8(uint8_t(arg)); + } - // store_complex - template = 0> - XSIMD_INLINE void store_complex_aligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept - { - using v2type = std::conditional_t<(sizeof(T) == 4), svfloat32x2_t, svfloat64x2_t>; - v2type tmp {}; - tmp = svset2(tmp, 0, src.real()); - tmp = svset2(tmp, 1, src.imag()); - T* buf = reinterpret_cast(dst); - svst2(detail_sve::sve_ptrue(), buf, tmp); - } + template = 0> + XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept + { + return svdup_n_s8(int8_t(arg)); + } - template = 0> - XSIMD_INLINE void store_complex_unaligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept - { - store_complex_aligned(dst, src, sve {}); - } + template = 0> + XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept + { + return svdup_n_u16(uint16_t(arg)); + } - /****************** - * scatter/gather * - ******************/ + template = 0> + XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept + { + return svdup_n_s16(int16_t(arg)); + } - namespace detail_sve - { - template - using sve_enable_sg_t = std::enable_if_t<(sizeof(T) == sizeof(U) && (sizeof(T) == 4 || sizeof(T) == 8)), int>; - } + template = 0> + XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept + { + return svdup_n_u32(uint32_t(arg)); + } - // scatter - template = 0> - XSIMD_INLINE void scatter(batch const& src, T* dst, batch const& index, kernel::requires_arch) noexcept - { - svst1_scatter_index(detail_sve::sve_ptrue(), dst, index.data, src.data); - } + template = 0> + XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept + { + return svdup_n_s32(int32_t(arg)); + } - // gather - template = 0> - XSIMD_INLINE batch gather(batch const&, T const* src, batch const& index, kernel::requires_arch) noexcept - { - return svld1_gather_index(detail_sve::sve_ptrue(), src, index.data); - } + template = 0> + XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept + { + return svdup_n_u64(uint64_t(arg)); + } - /******************** - * Scalar to vector * - ********************/ + template = 0> + XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept + { + return svdup_n_s64(int64_t(arg)); + } - // broadcast - template = 0> - XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept - { - return svdup_n_u8(uint8_t(arg)); - } + template + XSIMD_INLINE batch broadcast(float arg, requires_arch) noexcept + { + return svdup_n_f32(arg); + } - template = 0> - XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept - { - return svdup_n_s8(int8_t(arg)); - } + template + XSIMD_INLINE batch broadcast(double arg, requires_arch) noexcept + { + return svdup_n_f64(arg); + } - template = 0> - XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept - { - return svdup_n_u16(uint16_t(arg)); - } + template = 0> + XSIMD_INLINE batch broadcast(T val, requires_arch) noexcept + { + return broadcast(val, sve {}); + } - template = 0> - XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept - { - return svdup_n_s16(int16_t(arg)); - } + /************** + * Arithmetic * + 
**************/ - template = 0> - XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept - { - return svdup_n_u32(uint32_t(arg)); - } + // add + template = 0> + XSIMD_INLINE batch add(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svadd_x(detail::sve_ptrue(), lhs, rhs); + } - template = 0> - XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept - { - return svdup_n_s32(int32_t(arg)); - } + // sadd + template = 0> + XSIMD_INLINE batch sadd(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svqadd(lhs, rhs); + } - template = 0> - XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept - { - return svdup_n_u64(uint64_t(arg)); - } + // sub + template = 0> + XSIMD_INLINE batch sub(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svsub_x(detail::sve_ptrue(), lhs, rhs); + } - template = 0> - XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept - { - return svdup_n_s64(int64_t(arg)); - } + // ssub + template = 0> + XSIMD_INLINE batch ssub(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svqsub(lhs, rhs); + } - template - XSIMD_INLINE batch broadcast(float arg, requires_arch) noexcept - { - return svdup_n_f32(arg); - } + // mul + template = 0> + XSIMD_INLINE batch mul(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svmul_x(detail::sve_ptrue(), lhs, rhs); + } - template - XSIMD_INLINE batch broadcast(double arg, requires_arch) noexcept - { - return svdup_n_f64(arg); - } + // div + template = 4, int> = 0> + XSIMD_INLINE batch div(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svdiv_x(detail::sve_ptrue(), lhs, rhs); + } - template = 0> - XSIMD_INLINE batch broadcast(T val, requires_arch) noexcept - { - return broadcast(val, sve {}); - } + // max + template = 0> + XSIMD_INLINE batch max(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svmax_x(detail::sve_ptrue(), lhs, rhs); + } - /************** - * Arithmetic * - **************/ + // min + template = 0> + XSIMD_INLINE batch min(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svmin_x(detail::sve_ptrue(), lhs, rhs); + } - // add - template = 0> - XSIMD_INLINE batch add(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svadd_x(detail_sve::sve_ptrue(), lhs, rhs); - } + // neg + template = 0> + XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept + { + return svreinterpret_u8(svneg_x(detail::sve_ptrue(), svreinterpret_s8(static_cast>(arg)))); + } - // sadd - template = 0> - XSIMD_INLINE batch sadd(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svqadd(lhs, rhs); - } + template = 0> + XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept + { + return svreinterpret_u16(svneg_x(detail::sve_ptrue(), svreinterpret_s16(static_cast>(arg)))); + } - // sub - template = 0> - XSIMD_INLINE batch sub(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svsub_x(detail_sve::sve_ptrue(), lhs, rhs); - } + template = 0> + XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept + { + return svreinterpret_u32(svneg_x(detail::sve_ptrue(), svreinterpret_s32(static_cast>(arg)))); + } - // ssub - template = 0> - XSIMD_INLINE batch ssub(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svqsub(lhs, rhs); - } + template = 0> + XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept + { + return 
svreinterpret_u64(svneg_x(detail::sve_ptrue(), svreinterpret_s64(static_cast>(arg)))); + } - // mul - template = 0> - XSIMD_INLINE batch mul(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svmul_x(detail_sve::sve_ptrue(), lhs, rhs); - } + template = 0> + XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept + { + return svneg_x(detail::sve_ptrue(), arg); + } - // div - template = 4, int> = 0> - XSIMD_INLINE batch div(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svdiv_x(detail_sve::sve_ptrue(), lhs, rhs); - } + // abs + template = 0> + XSIMD_INLINE batch abs(batch const& arg, requires_arch) noexcept + { + return arg; + } - // max - template = 0> - XSIMD_INLINE batch max(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svmax_x(detail_sve::sve_ptrue(), lhs, rhs); - } + template = 0> + XSIMD_INLINE batch abs(batch const& arg, requires_arch) noexcept + { + return svabs_x(detail::sve_ptrue(), arg); + } - // min - template = 0> - XSIMD_INLINE batch min(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svmin_x(detail_sve::sve_ptrue(), lhs, rhs); - } + // fma: x * y + z + template = 0> + XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return svmad_x(detail::sve_ptrue(), x, y, z); + } - // neg - template = 0> - XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept - { - return svreinterpret_u8(svneg_x(detail_sve::sve_ptrue(), svreinterpret_s8(static_cast>(arg)))); - } + // fnma: z - x * y + template = 0> + XSIMD_INLINE batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return svmsb_x(detail::sve_ptrue(), x, y, z); + } - template = 0> - XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept - { - return svreinterpret_u16(svneg_x(detail_sve::sve_ptrue(), svreinterpret_s16(static_cast>(arg)))); - } + // fms: x * y - z + template = 0> + XSIMD_INLINE batch fms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return -fnma(x, y, z, sve {}); + } - template = 0> - XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept - { - return svreinterpret_u32(svneg_x(detail_sve::sve_ptrue(), svreinterpret_s32(static_cast>(arg)))); - } + // fnms: - x * y - z + template = 0> + XSIMD_INLINE batch fnms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return -fma(x, y, z, sve {}); + } - template = 0> - XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept - { - return svreinterpret_u64(svneg_x(detail_sve::sve_ptrue(), svreinterpret_s64(static_cast>(arg)))); - } + /********************** + * Logical operations * + **********************/ - template = 0> - XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept - { - return svneg_x(detail_sve::sve_ptrue(), arg); - } + // bitwise_and + template = 0> + XSIMD_INLINE batch bitwise_and(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svand_x(detail::sve_ptrue(), lhs, rhs); + } - // abs - template = 0> - XSIMD_INLINE batch abs(batch const& arg, requires_arch) noexcept - { - return arg; - } + template + XSIMD_INLINE batch bitwise_and(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + const auto lhs_bits = svreinterpret_u32(static_cast>(lhs)); + const auto rhs_bits = svreinterpret_u32(static_cast>(rhs)); + const auto result_bits = svand_x(detail::sve_ptrue(), lhs_bits, rhs_bits); + return svreinterpret_f32(result_bits); + } + + template + 
XSIMD_INLINE batch bitwise_and(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + const auto lhs_bits = svreinterpret_u64(static_cast>(lhs)); + const auto rhs_bits = svreinterpret_u64(static_cast>(rhs)); + const auto result_bits = svand_x(detail::sve_ptrue(), lhs_bits, rhs_bits); + return svreinterpret_f64(result_bits); + } + + template = 0> + XSIMD_INLINE batch_bool bitwise_and(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept + { + return svand_z(detail::sve_ptrue(), lhs, rhs); + } - template = 0> - XSIMD_INLINE batch abs(batch const& arg, requires_arch) noexcept - { - return svabs_x(detail_sve::sve_ptrue(), arg); - } + // bitwise_andnot + template = 0> + XSIMD_INLINE batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svbic_x(detail::sve_ptrue(), lhs, rhs); + } - // fma: x * y + z - template = 0> - XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept - { - return svmad_x(detail_sve::sve_ptrue(), x, y, z); - } + template + XSIMD_INLINE batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + const auto lhs_bits = svreinterpret_u32(static_cast>(lhs)); + const auto rhs_bits = svreinterpret_u32(static_cast>(rhs)); + const auto result_bits = svbic_x(detail::sve_ptrue(), lhs_bits, rhs_bits); + return svreinterpret_f32(result_bits); + } + + template + XSIMD_INLINE batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + const auto lhs_bits = svreinterpret_u64(static_cast>(lhs)); + const auto rhs_bits = svreinterpret_u64(static_cast>(rhs)); + const auto result_bits = svbic_x(detail::sve_ptrue(), lhs_bits, rhs_bits); + return svreinterpret_f64(result_bits); + } + + template = 0> + XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept + { + return svbic_z(detail::sve_ptrue(), lhs, rhs); + } - // fnma: z - x * y - template = 0> - XSIMD_INLINE batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept - { - return svmsb_x(detail_sve::sve_ptrue(), x, y, z); - } + // bitwise_or + template = 0> + XSIMD_INLINE batch bitwise_or(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svorr_x(detail::sve_ptrue(), lhs, rhs); + } - // fms: x * y - z - template = 0> - XSIMD_INLINE batch fms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept - { - return -fnma(x, y, z, sve {}); - } + template + XSIMD_INLINE batch bitwise_or(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + const auto lhs_bits = svreinterpret_u32(static_cast>(lhs)); + const auto rhs_bits = svreinterpret_u32(static_cast>(rhs)); + const auto result_bits = svorr_x(detail::sve_ptrue(), lhs_bits, rhs_bits); + return svreinterpret_f32(result_bits); + } + + template + XSIMD_INLINE batch bitwise_or(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + const auto lhs_bits = svreinterpret_u64(static_cast>(lhs)); + const auto rhs_bits = svreinterpret_u64(static_cast>(rhs)); + const auto result_bits = svorr_x(detail::sve_ptrue(), lhs_bits, rhs_bits); + return svreinterpret_f64(result_bits); + } + + template = 0> + XSIMD_INLINE batch_bool bitwise_or(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept + { + return svorr_z(detail::sve_ptrue(), lhs, rhs); + } - // fnms: - x * y - z - template = 0> - XSIMD_INLINE batch fnms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept - { - return -fma(x, y, z, sve 
{}); - } + // bitwise_xor + template = 0> + XSIMD_INLINE batch bitwise_xor(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return sveor_x(detail::sve_ptrue(), lhs, rhs); + } - /********************** - * Logical operations * - **********************/ + template + XSIMD_INLINE batch bitwise_xor(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + const auto lhs_bits = svreinterpret_u32(static_cast>(lhs)); + const auto rhs_bits = svreinterpret_u32(static_cast>(rhs)); + const auto result_bits = sveor_x(detail::sve_ptrue(), lhs_bits, rhs_bits); + return svreinterpret_f32(result_bits); + } + + template + XSIMD_INLINE batch bitwise_xor(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + const auto lhs_bits = svreinterpret_u64(static_cast>(lhs)); + const auto rhs_bits = svreinterpret_u64(static_cast>(rhs)); + const auto result_bits = sveor_x(detail::sve_ptrue(), lhs_bits, rhs_bits); + return svreinterpret_f64(result_bits); + } + + template = 0> + XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept + { + return sveor_z(detail::sve_ptrue(), lhs, rhs); + } - // bitwise_and - template = 0> - XSIMD_INLINE batch bitwise_and(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svand_x(detail_sve::sve_ptrue(), lhs, rhs); - } + // bitwise_not + template = 0> + XSIMD_INLINE batch bitwise_not(batch const& arg, requires_arch) noexcept + { + return svnot_x(detail::sve_ptrue(), arg); + } - template - XSIMD_INLINE batch bitwise_and(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - const auto lhs_bits = svreinterpret_u32(static_cast>(lhs)); - const auto rhs_bits = svreinterpret_u32(static_cast>(rhs)); - const auto result_bits = svand_x(detail_sve::sve_ptrue(), lhs_bits, rhs_bits); - return svreinterpret_f32(result_bits); - } + template + XSIMD_INLINE batch bitwise_not(batch const& arg, requires_arch) noexcept + { + const auto arg_bits = svreinterpret_u32(static_cast>(arg)); + const auto result_bits = svnot_x(detail::sve_ptrue(), arg_bits); + return svreinterpret_f32(result_bits); + } - template - XSIMD_INLINE batch bitwise_and(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - const auto lhs_bits = svreinterpret_u64(static_cast>(lhs)); - const auto rhs_bits = svreinterpret_u64(static_cast>(rhs)); - const auto result_bits = svand_x(detail_sve::sve_ptrue(), lhs_bits, rhs_bits); - return svreinterpret_f64(result_bits); - } + template + XSIMD_INLINE batch bitwise_not(batch const& arg, requires_arch) noexcept + { + const auto arg_bits = svreinterpret_u64(static_cast>(arg)); + const auto result_bits = svnot_x(detail::sve_ptrue(), arg_bits); + return svreinterpret_f64(result_bits); + } - template = 0> - XSIMD_INLINE batch_bool bitwise_and(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept - { - return svand_z(detail_sve::sve_ptrue(), lhs, rhs); - } + template = 0> + XSIMD_INLINE batch_bool bitwise_not(batch_bool const& arg, requires_arch) noexcept + { + return svnot_z(detail::sve_ptrue(), arg); + } - // bitwise_andnot - template = 0> - XSIMD_INLINE batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svbic_x(detail_sve::sve_ptrue(), lhs, rhs); - } + /********** + * Shifts * + **********/ - template - XSIMD_INLINE batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch) noexcept + namespace detail + { + template + XSIMD_INLINE batch sve_to_unsigned_batch_impl(batch const& arg, index<1>) noexcept { - 
const auto lhs_bits = svreinterpret_u32(static_cast>(lhs)); - const auto rhs_bits = svreinterpret_u32(static_cast>(rhs)); - const auto result_bits = svbic_x(detail_sve::sve_ptrue(), lhs_bits, rhs_bits); - return svreinterpret_f32(result_bits); + return svreinterpret_u8(static_cast>(arg)); } - template - XSIMD_INLINE batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch) noexcept + template + XSIMD_INLINE batch sve_to_unsigned_batch_impl(batch const& arg, index<2>) noexcept { - const auto lhs_bits = svreinterpret_u64(static_cast>(lhs)); - const auto rhs_bits = svreinterpret_u64(static_cast>(rhs)); - const auto result_bits = svbic_x(detail_sve::sve_ptrue(), lhs_bits, rhs_bits); - return svreinterpret_f64(result_bits); + return svreinterpret_u16(static_cast>(arg)); } - template = 0> - XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept + template + XSIMD_INLINE batch sve_to_unsigned_batch_impl(batch const& arg, index<4>) noexcept { - return svbic_z(detail_sve::sve_ptrue(), lhs, rhs); + return svreinterpret_u32(static_cast>(arg)); } - // bitwise_or - template = 0> - XSIMD_INLINE batch bitwise_or(batch const& lhs, batch const& rhs, requires_arch) noexcept + template + XSIMD_INLINE batch sve_to_unsigned_batch_impl(batch const& arg, index<8>) noexcept { - return svorr_x(detail_sve::sve_ptrue(), lhs, rhs); + return svreinterpret_u64(static_cast>(arg)); } - template - XSIMD_INLINE batch bitwise_or(batch const& lhs, batch const& rhs, requires_arch) noexcept + template > + XSIMD_INLINE batch sve_to_unsigned_batch(batch const& arg) noexcept { - const auto lhs_bits = svreinterpret_u32(static_cast>(lhs)); - const auto rhs_bits = svreinterpret_u32(static_cast>(rhs)); - const auto result_bits = svorr_x(detail_sve::sve_ptrue(), lhs_bits, rhs_bits); - return svreinterpret_f32(result_bits); + return sve_to_unsigned_batch_impl(arg, index {}); } + } // namespace detail - template - XSIMD_INLINE batch bitwise_or(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - const auto lhs_bits = svreinterpret_u64(static_cast>(lhs)); - const auto rhs_bits = svreinterpret_u64(static_cast>(rhs)); - const auto result_bits = svorr_x(detail_sve::sve_ptrue(), lhs_bits, rhs_bits); - return svreinterpret_f64(result_bits); - } + // bitwise_lshift + template = 0> + XSIMD_INLINE batch bitwise_lshift(batch const& arg, int n, requires_arch) noexcept + { + constexpr std::size_t size = sizeof(typename batch::value_type) * 8; + assert(0 <= n && static_cast(n) < size && "index in bounds"); + return svlsl_x(detail::sve_ptrue(), arg, n); + } - template = 0> - XSIMD_INLINE batch_bool bitwise_or(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept - { - return svorr_z(detail_sve::sve_ptrue(), lhs, rhs); - } + template = 0> + XSIMD_INLINE batch bitwise_lshift(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svlsl_x(detail::sve_ptrue(), lhs, detail::sve_to_unsigned_batch(rhs)); + } - // bitwise_xor - template = 0> - XSIMD_INLINE batch bitwise_xor(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return sveor_x(detail_sve::sve_ptrue(), lhs, rhs); - } + // bitwise_rshift + template = 0> + XSIMD_INLINE batch bitwise_rshift(batch const& arg, int n, requires_arch) noexcept + { + constexpr std::size_t size = sizeof(typename batch::value_type) * 8; + assert(0 <= n && static_cast(n) < size && "index in bounds"); + return svlsr_x(detail::sve_ptrue(), arg, static_cast(n)); + } - template - XSIMD_INLINE batch 
bitwise_xor(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - const auto lhs_bits = svreinterpret_u32(static_cast>(lhs)); - const auto rhs_bits = svreinterpret_u32(static_cast>(rhs)); - const auto result_bits = sveor_x(detail_sve::sve_ptrue(), lhs_bits, rhs_bits); - return svreinterpret_f32(result_bits); - } + template = 0> + XSIMD_INLINE batch bitwise_rshift(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svlsr_x(detail::sve_ptrue(), lhs, rhs); + } - template - XSIMD_INLINE batch bitwise_xor(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - const auto lhs_bits = svreinterpret_u64(static_cast>(lhs)); - const auto rhs_bits = svreinterpret_u64(static_cast>(rhs)); - const auto result_bits = sveor_x(detail_sve::sve_ptrue(), lhs_bits, rhs_bits); - return svreinterpret_f64(result_bits); - } + template = 0> + XSIMD_INLINE batch bitwise_rshift(batch const& arg, int n, requires_arch) noexcept + { + constexpr std::size_t size = sizeof(typename batch::value_type) * 8; + assert(0 <= n && static_cast(n) < size && "index in bounds"); + return svasr_x(detail::sve_ptrue(), arg, static_cast>(n)); + } - template = 0> - XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept - { - return sveor_z(detail_sve::sve_ptrue(), lhs, rhs); - } + template = 0> + XSIMD_INLINE batch bitwise_rshift(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svasr_x(detail::sve_ptrue(), lhs, detail::sve_to_unsigned_batch(rhs)); + } - // bitwise_not - template = 0> - XSIMD_INLINE batch bitwise_not(batch const& arg, requires_arch) noexcept - { - return svnot_x(detail_sve::sve_ptrue(), arg); - } + /************** + * Reductions * + **************/ - template - XSIMD_INLINE batch bitwise_not(batch const& arg, requires_arch) noexcept - { - const auto arg_bits = svreinterpret_u32(static_cast>(arg)); - const auto result_bits = svnot_x(detail_sve::sve_ptrue(), arg_bits); - return svreinterpret_f32(result_bits); - } + // reduce_add + template ::value_type, detail::sve_enable_all_t = 0> + XSIMD_INLINE V reduce_add(batch const& arg, requires_arch) noexcept + { + // sve integer reduction results are promoted to 64 bits + return static_cast(svaddv(detail::sve_ptrue(), arg)); + } - template - XSIMD_INLINE batch bitwise_not(batch const& arg, requires_arch) noexcept - { - const auto arg_bits = svreinterpret_u64(static_cast>(arg)); - const auto result_bits = svnot_x(detail_sve::sve_ptrue(), arg_bits); - return svreinterpret_f64(result_bits); - } + // reduce_max + template = 0> + XSIMD_INLINE T reduce_max(batch const& arg, requires_arch) noexcept + { + return svmaxv(detail::sve_ptrue(), arg); + } - template = 0> - XSIMD_INLINE batch_bool bitwise_not(batch_bool const& arg, requires_arch) noexcept + // reduce_min + template = 0> + XSIMD_INLINE T reduce_min(batch const& arg, requires_arch) noexcept + { + return svminv(detail::sve_ptrue(), arg); + } + + // haddp + template = 0> + XSIMD_INLINE batch haddp(const batch* row, requires_arch) noexcept + { + constexpr std::size_t size = batch::size; + T sums[size]; + for (std::size_t i = 0; i < size; ++i) { - return svnot_z(detail_sve::sve_ptrue(), arg); + sums[i] = reduce_add(row[i], sve {}); } + return svld1(detail::sve_ptrue(), sums); + } - /********** - * Shifts * - **********/ + /*************** + * Comparisons * + ***************/ - namespace detail_sve - { - template - XSIMD_INLINE batch sve_to_unsigned_batch_impl(batch const& arg, index<1>) noexcept - { - return 
svreinterpret_u8(static_cast>(arg)); - } + // eq + template = 0> + XSIMD_INLINE batch_bool eq(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svcmpeq(detail::sve_ptrue(), lhs, rhs); + } - template - XSIMD_INLINE batch sve_to_unsigned_batch_impl(batch const& arg, index<2>) noexcept - { - return svreinterpret_u16(static_cast>(arg)); - } + template = 0> + XSIMD_INLINE batch_bool eq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept + { + const auto neq_result = sveor_z(detail::sve_ptrue(), lhs, rhs); + return svnot_z(detail::sve_ptrue(), neq_result); + } - template - XSIMD_INLINE batch sve_to_unsigned_batch_impl(batch const& arg, index<4>) noexcept - { - return svreinterpret_u32(static_cast>(arg)); - } + // neq + template = 0> + XSIMD_INLINE batch_bool neq(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svcmpne(detail::sve_ptrue(), lhs, rhs); + } - template - XSIMD_INLINE batch sve_to_unsigned_batch_impl(batch const& arg, index<8>) noexcept - { - return svreinterpret_u64(static_cast>(arg)); - } + template = 0> + XSIMD_INLINE batch_bool neq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept + { + return sveor_z(detail::sve_ptrue(), lhs, rhs); + } - template > - XSIMD_INLINE batch sve_to_unsigned_batch(batch const& arg) noexcept - { - return sve_to_unsigned_batch_impl(arg, index {}); - } - } // namespace detail_sve + // lt + template = 0> + XSIMD_INLINE batch_bool lt(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svcmplt(detail::sve_ptrue(), lhs, rhs); + } - // bitwise_lshift - template = 0> - XSIMD_INLINE batch bitwise_lshift(batch const& arg, int n, requires_arch) noexcept - { - constexpr std::size_t size = sizeof(typename batch::value_type) * 8; - assert(0 <= n && static_cast(n) < size && "index in bounds"); - return svlsl_x(detail_sve::sve_ptrue(), arg, n); - } + // le + template = 0> + XSIMD_INLINE batch_bool le(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svcmple(detail::sve_ptrue(), lhs, rhs); + } - template = 0> - XSIMD_INLINE batch bitwise_lshift(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svlsl_x(detail_sve::sve_ptrue(), lhs, detail_sve::sve_to_unsigned_batch(rhs)); - } + // gt + template = 0> + XSIMD_INLINE batch_bool gt(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svcmpgt(detail::sve_ptrue(), lhs, rhs); + } - // bitwise_rshift - template = 0> - XSIMD_INLINE batch bitwise_rshift(batch const& arg, int n, requires_arch) noexcept - { - constexpr std::size_t size = sizeof(typename batch::value_type) * 8; - assert(0 <= n && static_cast(n) < size && "index in bounds"); - return svlsr_x(detail_sve::sve_ptrue(), arg, static_cast(n)); - } + // ge + template = 0> + XSIMD_INLINE batch_bool ge(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svcmpge(detail::sve_ptrue(), lhs, rhs); + } - template = 0> - XSIMD_INLINE batch bitwise_rshift(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svlsr_x(detail_sve::sve_ptrue(), lhs, rhs); - } + /*************** + * Permutation * + ***************/ - template = 0> - XSIMD_INLINE batch bitwise_rshift(batch const& arg, int n, requires_arch) noexcept - { - constexpr std::size_t size = sizeof(typename batch::value_type) * 8; - assert(0 <= n && static_cast(n) < size && "index in bounds"); - return svasr_x(detail_sve::sve_ptrue(), arg, static_cast>(n)); - } + // rotate_left + template = 0> + XSIMD_INLINE batch 
rotate_left(batch const& a, requires_arch) noexcept + { + return svext(a, a, N); + } - template = 0> - XSIMD_INLINE batch bitwise_rshift(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svasr_x(detail_sve::sve_ptrue(), lhs, detail_sve::sve_to_unsigned_batch(rhs)); - } + // swizzle (dynamic) + template + XSIMD_INLINE batch swizzle(batch const& arg, batch indices, requires_arch) noexcept + { + return svtbl(arg, indices); + } - /************** - * Reductions * - **************/ + template + XSIMD_INLINE batch, A> swizzle(batch, A> const& self, + batch indices, + requires_arch) noexcept + { + const auto real = swizzle(self.real(), indices, sve {}); + const auto imag = swizzle(self.imag(), indices, sve {}); + return batch>(real, imag); + } + + // swizzle (static) + template + XSIMD_INLINE batch swizzle(batch const& arg, batch_constant indices, requires_arch) noexcept + { + static_assert(batch::size == sizeof...(idx), "invalid swizzle indices"); + return swizzle(arg, indices.as_batch(), sve {}); + } + + template + XSIMD_INLINE batch, A> swizzle(batch, A> const& arg, + batch_constant indices, + requires_arch) noexcept + { + static_assert(batch, A>::size == sizeof...(idx), "invalid swizzle indices"); + return swizzle(arg, indices.as_batch(), sve {}); + } - // reduce_add - template ::value_type, detail_sve::sve_enable_all_t = 0> - XSIMD_INLINE V reduce_add(batch const& arg, requires_arch) noexcept - { - // sve integer reduction results are promoted to 64 bits - return static_cast(svaddv(detail_sve::sve_ptrue(), arg)); - } + /************* + * Selection * + *************/ - // reduce_max - template = 0> - XSIMD_INLINE T reduce_max(batch const& arg, requires_arch) noexcept + // extract_pair + namespace detail + { + template + XSIMD_INLINE batch sve_extract_pair(batch const&, batch const& /*rhs*/, std::size_t, std::index_sequence<>) noexcept { - return svmaxv(detail_sve::sve_ptrue(), arg); + assert(false && "extract_pair out of bounds"); + return batch {}; } - // reduce_min - template = 0> - XSIMD_INLINE T reduce_min(batch const& arg, requires_arch) noexcept + template + XSIMD_INLINE batch sve_extract_pair(batch const& lhs, batch const& rhs, std::size_t n, std::index_sequence) noexcept { - return svminv(detail_sve::sve_ptrue(), arg); + if (n == I) + { + return svext(rhs, lhs, I); + } + else + { + return sve_extract_pair(lhs, rhs, n, std::index_sequence()); + } } - // haddp - template = 0> - XSIMD_INLINE batch haddp(const batch* row, requires_arch) noexcept + template + XSIMD_INLINE batch sve_extract_pair_impl(batch const& lhs, batch const& rhs, std::size_t n, std::index_sequence<0, Is...>) noexcept { - constexpr std::size_t size = batch::size; - T sums[size]; - for (std::size_t i = 0; i < size; ++i) + if (n == 0) + { + return rhs; + } + else { - sums[i] = reduce_add(row[i], sve {}); + return sve_extract_pair(lhs, rhs, n, std::index_sequence()); } - return svld1(detail_sve::sve_ptrue(), sums); } + } - /*************** - * Comparisons * - ***************/ + template = 0> + XSIMD_INLINE batch extract_pair(batch const& lhs, batch const& rhs, std::size_t n, requires_arch) noexcept + { + constexpr std::size_t size = batch::size; + assert(n < size && "index in bounds"); + return detail::sve_extract_pair_impl(lhs, rhs, n, std::make_index_sequence()); + } + + // select + template = 0> + XSIMD_INLINE batch select(batch_bool const& cond, batch const& a, batch const& b, requires_arch) noexcept + { + return svsel(cond, static_cast>(a), static_cast>(b)); + } - // eq - template = 0> - 
XSIMD_INLINE batch_bool eq(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svcmpeq(detail_sve::sve_ptrue(), lhs, rhs); - } + template + XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return select(batch_bool { b... }, true_br, false_br, sve {}); + } - template = 0> - XSIMD_INLINE batch_bool eq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept - { - const auto neq_result = sveor_z(detail_sve::sve_ptrue(), lhs, rhs); - return svnot_z(detail_sve::sve_ptrue(), neq_result); - } + // zip_lo + template = 0> + XSIMD_INLINE batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svzip1(lhs, rhs); + } - // neq - template = 0> - XSIMD_INLINE batch_bool neq(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svcmpne(detail_sve::sve_ptrue(), lhs, rhs); - } + // zip_hi + template = 0> + XSIMD_INLINE batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svzip2(lhs, rhs); + } - template = 0> - XSIMD_INLINE batch_bool neq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept - { - return sveor_z(detail_sve::sve_ptrue(), lhs, rhs); - } + /***************************** + * Floating-point arithmetic * + *****************************/ - // lt - template = 0> - XSIMD_INLINE batch_bool lt(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svcmplt(detail_sve::sve_ptrue(), lhs, rhs); - } + // rsqrt + template = 0> + XSIMD_INLINE batch rsqrt(batch const& arg, requires_arch) noexcept + { + return svrsqrte(arg); + } - // le - template = 0> - XSIMD_INLINE batch_bool le(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svcmple(detail_sve::sve_ptrue(), lhs, rhs); - } + // sqrt + template = 0> + XSIMD_INLINE batch sqrt(batch const& arg, requires_arch) noexcept + { + return svsqrt_x(detail::sve_ptrue(), arg); + } - // gt - template = 0> - XSIMD_INLINE batch_bool gt(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svcmpgt(detail_sve::sve_ptrue(), lhs, rhs); - } + // reciprocal + template = 0> + XSIMD_INLINE batch reciprocal(const batch& arg, requires_arch) noexcept + { + return svrecpe(arg); + } + + /****************************** + * Floating-point conversions * + ******************************/ - // ge - template = 0> - XSIMD_INLINE batch_bool ge(batch const& lhs, batch const& rhs, requires_arch) noexcept + // fast_cast + namespace detail + { + template = 0> + XSIMD_INLINE batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept { - return svcmpge(detail_sve::sve_ptrue(), lhs, rhs); + return svcvt_f32_x(detail::sve_ptrue(), arg); } - /*************** - * Permutation * - ***************/ - - // rotate_left - template = 0> - XSIMD_INLINE batch rotate_left(batch const& a, requires_arch) noexcept + template = 0> + XSIMD_INLINE batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept { - return svext(a, a, N); + return svcvt_f64_x(detail::sve_ptrue(), arg); } - // swizzle (dynamic) - template - XSIMD_INLINE batch swizzle(batch const& arg, batch indices, requires_arch) noexcept + template + XSIMD_INLINE batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept { - return svtbl(arg, indices); + return svcvt_s32_x(detail::sve_ptrue(), arg); } - template - XSIMD_INLINE batch, A> swizzle(batch, A> const& self, - batch indices, - requires_arch) noexcept + template + XSIMD_INLINE batch 
fast_cast(batch const& arg, batch const&, requires_arch) noexcept { - const auto real = swizzle(self.real(), indices, sve {}); - const auto imag = swizzle(self.imag(), indices, sve {}); - return batch>(real, imag); + return svcvt_u32_x(detail::sve_ptrue(), arg); } - // swizzle (static) - template - XSIMD_INLINE batch swizzle(batch const& arg, batch_constant indices, requires_arch) noexcept + template + XSIMD_INLINE batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept { - static_assert(batch::size == sizeof...(idx), "invalid swizzle indices"); - return swizzle(arg, indices.as_batch(), sve {}); + return svcvt_s64_x(detail::sve_ptrue(), arg); } - template - XSIMD_INLINE batch, A> swizzle(batch, A> const& arg, - batch_constant indices, - requires_arch) noexcept + template + XSIMD_INLINE batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept { - static_assert(batch, A>::size == sizeof...(idx), "invalid swizzle indices"); - return swizzle(arg, indices.as_batch(), sve {}); + return svcvt_u64_x(detail::sve_ptrue(), arg); } + } - /************* - * Selection * - *************/ + /********* + * Miscs * + *********/ - // extract_pair - namespace detail_sve - { - template - XSIMD_INLINE batch sve_extract_pair(batch const&, batch const& /*rhs*/, std::size_t, std::index_sequence<>) noexcept - { - assert(false && "extract_pair out of bounds"); - return batch {}; - } + // set + template + XSIMD_INLINE batch set(batch const&, requires_arch, Args... args) noexcept + { + return detail::sve_vector_type { args... }; + } - template - XSIMD_INLINE batch sve_extract_pair(batch const& lhs, batch const& rhs, std::size_t n, std::index_sequence) noexcept - { - if (n == I) - { - return svext(rhs, lhs, I); - } - else - { - return sve_extract_pair(lhs, rhs, n, std::index_sequence()); - } - } + template + XSIMD_INLINE batch, A> set(batch, A> const&, requires_arch, + Args... args_complex) noexcept + { + return batch>(detail::sve_vector_type { args_complex.real()... }, + detail::sve_vector_type { args_complex.imag()... }); + } - template - XSIMD_INLINE batch sve_extract_pair_impl(batch const& lhs, batch const& rhs, std::size_t n, std::index_sequence<0, Is...>) noexcept - { - if (n == 0) - { - return rhs; - } - else - { - return sve_extract_pair(lhs, rhs, n, std::index_sequence()); - } - } - } + template + XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Args... args) noexcept + { + using U = as_unsigned_integer_t; + const auto values = detail::sve_vector_type { static_cast(args)... 
}; + const auto zero = broadcast(static_cast(0), sve {}); + return svcmpne(detail::sve_ptrue(), values, zero); + } + + // insert + namespace detail + { + // generate index sequence (iota) + XSIMD_INLINE svuint8_t sve_iota_impl(index<1>) noexcept { return svindex_u8(0, 1); } + XSIMD_INLINE svuint16_t sve_iota_impl(index<2>) noexcept { return svindex_u16(0, 1); } + XSIMD_INLINE svuint32_t sve_iota_impl(index<4>) noexcept { return svindex_u32(0, 1); } + XSIMD_INLINE svuint64_t sve_iota_impl(index<8>) noexcept { return svindex_u64(0, 1); } + + template >> + XSIMD_INLINE V sve_iota() noexcept { return sve_iota_impl(index {}); } + } // namespace detail + + template = 0> + XSIMD_INLINE batch insert(batch const& arg, T val, index, requires_arch) noexcept + { + // create a predicate with only the I-th lane activated + const auto iota = detail::sve_iota(); + const auto index_predicate = svcmpeq(detail::sve_ptrue(), iota, static_cast>(I)); + return svsel(index_predicate, static_cast>(broadcast(val, sve {})), static_cast>(arg)); + } + + // first + template = 0> + XSIMD_INLINE T first(batch const& self, requires_arch) noexcept + { + return self.data[0]; + } - template = 0> - XSIMD_INLINE batch extract_pair(batch const& lhs, batch const& rhs, std::size_t n, requires_arch) noexcept - { - constexpr std::size_t size = batch::size; - assert(n < size && "index in bounds"); - return detail_sve::sve_extract_pair_impl(lhs, rhs, n, std::make_index_sequence()); - } + // all + template = 0> + XSIMD_INLINE bool all(batch_bool const& arg, requires_arch) noexcept + { + return detail::sve_pcount(arg) == batch_bool::size; + } - // select - template = 0> - XSIMD_INLINE batch select(batch_bool const& cond, batch const& a, batch const& b, requires_arch) noexcept - { - return svsel(cond, static_cast>(a), static_cast>(b)); - } + // any + template = 0> + XSIMD_INLINE bool any(batch_bool const& arg, requires_arch) noexcept + { + return svptest_any(arg, arg); + } - template - XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept - { - return select(batch_bool { b... 
+        // bitwise_cast
+        template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_unsigned_t<R, 1> = 0>
+        XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
+        {
+            return svreinterpret_u8(static_cast<detail::sve_vector_type<T>>(arg));
+        }

-            // zip_lo
-            template <class A, class T, detail_sve::sve_enable_all_t<T> = 0>
-            XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
-            {
-                return svzip1(lhs, rhs);
-            }
+        template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_signed_t<R, 1> = 0>
+        XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
+        {
+            return svreinterpret_s8(static_cast<detail::sve_vector_type<T>>(arg));
+        }

-            // zip_hi
-            template <class A, class T, detail_sve::sve_enable_all_t<T> = 0>
-            XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
-            {
-                return svzip2(lhs, rhs);
-            }
+        template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_unsigned_t<R, 2> = 0>
+        XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
+        {
+            return svreinterpret_u16(static_cast<detail::sve_vector_type<T>>(arg));
+        }

-            /*****************************
-             * Floating-point arithmetic *
-             *****************************/
+        template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_signed_t<R, 2> = 0>
+        XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
+        {
+            return svreinterpret_s16(static_cast<detail::sve_vector_type<T>>(arg));
+        }

-            // rsqrt
-            template <class A, class T, detail_sve::sve_enable_floating_point_t<T> = 0>
-            XSIMD_INLINE batch<T, A> rsqrt(batch<T, A> const& arg, requires_arch<sve>) noexcept
-            {
-                return svrsqrte(arg);
-            }
+        template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_unsigned_t<R, 4> = 0>
+        XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
+        {
+            return svreinterpret_u32(static_cast<detail::sve_vector_type<T>>(arg));
+        }

-            // sqrt
-            template <class A, class T, detail_sve::sve_enable_floating_point_t<T> = 0>
-            XSIMD_INLINE batch<T, A> sqrt(batch<T, A> const& arg, requires_arch<sve>) noexcept
-            {
-                return svsqrt_x(detail_sve::sve_ptrue<T>(), arg);
-            }
+        template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_signed_t<R, 4> = 0>
+        XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
+        {
+            return svreinterpret_s32(static_cast<detail::sve_vector_type<T>>(arg));
+        }

-            // reciprocal
-            template <class A, class T, detail_sve::sve_enable_floating_point_t<T> = 0>
-            XSIMD_INLINE batch<T, A> reciprocal(const batch<T, A>& arg, requires_arch<sve>) noexcept
-            {
-                return svrecpe(arg);
-            }
+        template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_unsigned_t<R, 8> = 0>
+        XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
+        {
+            return svreinterpret_u64(static_cast<detail::sve_vector_type<T>>(arg));
+        }

-            /******************************
-             * Floating-point conversions *
-             ******************************/
+        template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_signed_t<R, 8> = 0>
+        XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
+        {
+            return svreinterpret_s64(static_cast<detail::sve_vector_type<T>>(arg));
+        }

-            // fast_cast
-            namespace detail_sve
-            {
-                template <class A, class T, detail::enable_sized_integral_t<T, 4> = 0>
-                XSIMD_INLINE batch<float, A> fast_cast(batch<T, A> const& arg, batch<float, A> const&, requires_arch<sve>) noexcept
-                {
-                    return svcvt_f32_x(detail_sve::sve_ptrue<T>(), arg);
-                }
+        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        XSIMD_INLINE batch<float, A> bitwise_cast(batch<T, A> const& arg, batch<float, A> const&, requires_arch<sve>) noexcept
+        {
+            return svreinterpret_f32(static_cast<detail::sve_vector_type<T>>(arg));
+        }

-                template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
-                XSIMD_INLINE batch<double, A> fast_cast(batch<T, A> const& arg, batch<double, A> const&, requires_arch<sve>) noexcept
-                {
-                    return svcvt_f64_x(detail_sve::sve_ptrue<T>(), arg);
-                }
+        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        XSIMD_INLINE batch<double, A> bitwise_cast(batch<T, A> const& arg, batch<double, A> const&, requires_arch<sve>) noexcept
+        {
+            return svreinterpret_f64(static_cast<detail::sve_vector_type<T>>(arg));
+        }

-                template <class A>
-                XSIMD_INLINE batch<int32_t, A> fast_cast(batch<float, A> const& arg, batch<int32_t, A> const&, requires_arch<sve>) noexcept
-                {
-                    return svcvt_s32_x(detail_sve::sve_ptrue<float>(), arg);
-                }
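Each `bitwise_cast` overload above is a single `svreinterpret_*`, i.e. a pure bit reinterpretation rather than a value conversion. A small portable sketch (values illustrative):

```cpp
#include <cstdint>
#include <cstdio>

#include "xsimd/xsimd.hpp"

int main()
{
    // bitwise_cast keeps the register bits and only changes the lane type;
    // contrast with the value-converting batch_cast/fast_cast path.
    xsimd::batch<float> f(1.0f);
    auto bits = xsimd::bitwise_cast<uint32_t>(f);
    // 0x3f800000 is the IEEE-754 bit pattern of 1.0f
    std::printf("%#010x\n", static_cast<unsigned>(bits.get(0)));
}
```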
+        // batch_bool_cast
+        template <class A, class T_out, class T_in, detail::sve_enable_all_t<T_in> = 0>
+        XSIMD_INLINE batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& arg, batch_bool<T_out, A> const&, requires_arch<sve>) noexcept
+        {
+            return arg.data;
+        }

-                template <class A>
-                XSIMD_INLINE batch<uint32_t, A> fast_cast(batch<float, A> const& arg, batch<uint32_t, A> const&, requires_arch<sve>) noexcept
-                {
-                    return svcvt_u32_x(detail_sve::sve_ptrue<float>(), arg);
-                }
+        // from_bool
+        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<sve>) noexcept
+        {
+            return select(arg, batch<T, A>(1), batch<T, A>(0));
+        }

-                template <class A>
-                XSIMD_INLINE batch<int64_t, A> fast_cast(batch<double, A> const& arg, batch<int64_t, A> const&, requires_arch<sve>) noexcept
-                {
-                    return svcvt_s64_x(detail_sve::sve_ptrue<double>(), arg);
-                }
-
-                template <class A>
-                XSIMD_INLINE batch<uint64_t, A> fast_cast(batch<double, A> const& arg, batch<uint64_t, A> const&, requires_arch<sve>) noexcept
-                {
-                    return svcvt_u64_x(detail_sve::sve_ptrue<double>(), arg);
-                }
-            }
+        // slide_left
+        namespace detail
+        {
+            template <size_t N>
+            struct sve_slider_left
+            {
+                template <class T, class A>
+                XSIMD_INLINE batch<T, A> operator()(batch<T, A> const& arg) noexcept
+                {
+                    using u8_vector = batch<uint8_t, A>;
+                    const auto left = svdup_n_u8(0);
+                    const auto right = bitwise_cast(arg, u8_vector {}, sve {}).data;
+                    const u8_vector result(svext(left, right, u8_vector::size - N));
+                    return bitwise_cast(result, batch<T, A> {}, sve {});
+                }
+            };
+
+            template <>
+            struct sve_slider_left<0>
+            {
+                template <class T, class A>
+                XSIMD_INLINE batch<T, A> operator()(batch<T, A> const& arg) noexcept
+                {
+                    return arg;
+                }
+            };
+        } // namespace detail

-            /*********
-             * Miscs *
-             *********/
-
-            // set
-            template <class A, class T, class... Args>
-            XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<sve>, Args... args) noexcept
-            {
-                return detail_sve::sve_vector_type<T> { args... };
-            }
-
-            template <class A, class T, class... Args>
-            XSIMD_INLINE batch<std::complex<T>, A> set(batch<std::complex<T>, A> const&, requires_arch<sve>,
-                                                       Args... args_complex) noexcept
-            {
-                return batch<std::complex<T>>(detail_sve::sve_vector_type<T> { args_complex.real()... },
-                                              detail_sve::sve_vector_type<T> { args_complex.imag()... });
-            }
-
-            template <class A, class T, class... Args>
-            XSIMD_INLINE batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<sve>, Args... args) noexcept
-            {
-                using U = as_unsigned_integer_t<T>;
-                const auto values = detail_sve::sve_vector_type<U> { static_cast<U>(args)... };
-                const auto zero = broadcast<A, U>(static_cast<U>(0), sve {});
-                return svcmpne(detail_sve::sve_ptrue<T>(), values, zero);
-            }
-
-            // insert
-            namespace detail_sve
-            {
-                // generate index sequence (iota)
-                XSIMD_INLINE svuint8_t sve_iota_impl(index<1>) noexcept { return svindex_u8(0, 1); }
-                XSIMD_INLINE svuint16_t sve_iota_impl(index<2>) noexcept { return svindex_u16(0, 1); }
-                XSIMD_INLINE svuint32_t sve_iota_impl(index<4>) noexcept { return svindex_u32(0, 1); }
-                XSIMD_INLINE svuint64_t sve_iota_impl(index<8>) noexcept { return svindex_u64(0, 1); }
-
-                template <class T, class V = sve_vector_type<as_unsigned_integer_t<T>>>
-                XSIMD_INLINE V sve_iota() noexcept { return sve_iota_impl(index<sizeof(T)> {}); }
-            } // namespace detail_sve
-
-            template <class A, class T, size_t I, detail_sve::sve_enable_all_t<T> = 0>
-            XSIMD_INLINE batch<T, A> insert(batch<T, A> const& arg, T val, index<I>, requires_arch<sve>) noexcept
-            {
-                // create a predicate with only the I-th lane activated
-                const auto iota = detail_sve::sve_iota<T>();
-                const auto index_predicate = svcmpeq(detail_sve::sve_ptrue<T>(), iota, static_cast<as_unsigned_integer_t<T>>(I));
-                return svsel(index_predicate, static_cast<detail_sve::sve_vector_type<T>>(broadcast<A, T>(val, sve {})), static_cast<detail_sve::sve_vector_type<T>>(arg));
-            }
-
-            // first
-            template <class A, class T, detail_sve::sve_enable_all_t<T> = 0>
-            XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<sve>) noexcept
-            {
-                return self.data[0];
-            }
-
-            // all
-            template <class A, class T, detail_sve::sve_enable_all_t<T> = 0>
-            XSIMD_INLINE bool all(batch_bool<T, A> const& arg, requires_arch<sve>) noexcept
-            {
-                return detail_sve::sve_pcount<T>(arg) == batch_bool<T, A>::size;
-            }
-
-            // any
-            template <class A, class T, detail_sve::sve_enable_all_t<T> = 0>
-            XSIMD_INLINE bool any(batch_bool<T, A> const& arg, requires_arch<sve>) noexcept
-            {
-                return svptest_any(arg, arg);
-            }
-
-            // bitwise_cast
-            template <class A, class T, class R, detail_sve::sve_enable_all_t<T> = 0, detail::enable_sized_unsigned_t<R, 1> = 0>
-            XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
-            {
-                return svreinterpret_u8(static_cast<detail_sve::sve_vector_type<T>>(arg));
-            }
-
-            template <class A, class T, class R, detail_sve::sve_enable_all_t<T> = 0, detail::enable_sized_signed_t<R, 1> = 0>
-            XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
-            {
-                return svreinterpret_s8(static_cast<detail_sve::sve_vector_type<T>>(arg));
-            }
-
-            template <class A, class T, class R, detail_sve::sve_enable_all_t<T> = 0, detail::enable_sized_unsigned_t<R, 2> = 0>
-            XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
-            {
-                return svreinterpret_u16(static_cast<detail_sve::sve_vector_type<T>>(arg));
-            }
-
-            template <class A, class T, class R, detail_sve::sve_enable_all_t<T> = 0, detail::enable_sized_signed_t<R, 2> = 0>
-            XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
-            {
-                return svreinterpret_s16(static_cast<detail_sve::sve_vector_type<T>>(arg));
-            }
-
-            template <class A, class T, class R, detail_sve::sve_enable_all_t<T> = 0, detail::enable_sized_unsigned_t<R, 4> = 0>
-            XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
-            {
-                return svreinterpret_u32(static_cast<detail_sve::sve_vector_type<T>>(arg));
-            }
-
-            template <class A, class T, class R, detail_sve::sve_enable_all_t<T> = 0, detail::enable_sized_signed_t<R, 4> = 0>
-            XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
-            {
-                return svreinterpret_s32(static_cast<detail_sve::sve_vector_type<T>>(arg));
-            }
-
-            template <class A, class T, class R, detail_sve::sve_enable_all_t<T> = 0, detail::enable_sized_unsigned_t<R, 8> = 0>
-            XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
-            {
-                return svreinterpret_u64(static_cast<detail_sve::sve_vector_type<T>>(arg));
-            }
-
-            template <class A, class T, class R, detail_sve::sve_enable_all_t<T> = 0, detail::enable_sized_signed_t<R, 8> = 0>
-            XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
-            {
-                return svreinterpret_s64(static_cast<detail_sve::sve_vector_type<T>>(arg));
-            }
-
-            template <class A, class T, detail_sve::sve_enable_all_t<T> = 0>
-            XSIMD_INLINE batch<float, A> bitwise_cast(batch<T, A> const& arg, batch<float, A> const&, requires_arch<sve>) noexcept
-            {
-                return svreinterpret_f32(static_cast<detail_sve::sve_vector_type<T>>(arg));
-            }
-
-            template <class A, class T, detail_sve::sve_enable_all_t<T> = 0>
-            XSIMD_INLINE batch<double, A> bitwise_cast(batch<T, A> const& arg, batch<double, A> const&, requires_arch<sve>) noexcept
-            {
-                return svreinterpret_f64(static_cast<detail_sve::sve_vector_type<T>>(arg));
-            }
-
-            // batch_bool_cast
-            template <class A, class T_out, class T_in, detail_sve::sve_enable_all_t<T_in> = 0>
-            XSIMD_INLINE batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& arg, batch_bool<T_out, A> const&, requires_arch<sve>) noexcept
-            {
-                return arg.data;
-            }
-
-            // from_bool
-            template <class A, class T, detail_sve::sve_enable_all_t<T> = 0>
-            XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<sve>) noexcept
-            {
-                return select(arg, batch<T, A>(1), batch<T, A>(0));
-            }

+        template <size_t N, class A, class T, detail::sve_enable_all_t<T> = 0>
+        XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& arg, requires_arch<sve>) noexcept
+        {
+            return detail::sve_slider_left<N>()(arg);
+        }

-            // slide_left
-            namespace detail_sve
-            {
-                template <size_t N>
-                struct sve_slider_left
-                {
-                    template <class T, class A>
-                    XSIMD_INLINE batch<T, A> operator()(batch<T, A> const& arg) noexcept
-                    {
-                        using u8_vector = batch<uint8_t, A>;
-                        const auto left = svdup_n_u8(0);
-                        const auto right = bitwise_cast(arg, u8_vector {}, sve {}).data;
-                        const u8_vector result(svext(left, right, u8_vector::size - N));
-                        return bitwise_cast(result, batch<T, A> {}, sve {});
-                    }
-                };
-
-                template <>
-                struct sve_slider_left<0>
-                {
-                    template <class T, class A>
-                    XSIMD_INLINE batch<T, A> operator()(batch<T, A> const& arg) noexcept
-                    {
-                        return arg;
-                    }
-                };
-            } // namespace detail_sve
-
-            template <size_t N, class A, class T, detail_sve::sve_enable_all_t<T> = 0>
-            XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& arg, requires_arch<sve>) noexcept
-            {
-                return detail_sve::sve_slider_left<N>()(arg);
-            }

+        // slide_right
+        namespace detail
+        {
+            template <size_t N>
+            struct sve_slider_right
+            {
+                template <class T, class A>
+                XSIMD_INLINE batch<T, A> operator()(batch<T, A> const& arg) noexcept
+                {
+                    using u8_vector = batch<uint8_t, A>;
+                    const auto left = bitwise_cast(arg, u8_vector {}, sve {}).data;
+                    const auto right = svdup_n_u8(0);
+                    const u8_vector result(svext(left, right, N));
+                    return bitwise_cast(result, batch<T, A> {}, sve {});
+                }
+            };
+
+            template <>
+            struct sve_slider_right<batch<uint8_t, sve>::size>
+            {
+                template <class T, class A>
+                XSIMD_INLINE batch<T, A> operator()(batch<T, A> const&) noexcept
+                {
+                    return batch<T, A> {};
+                }
+            };
+        } // namespace detail

-            // slide_right
-            namespace detail_sve
-            {
-                template <size_t N>
-                struct sve_slider_right
-                {
-                    template <class T, class A>
-                    XSIMD_INLINE batch<T, A> operator()(batch<T, A> const& arg) noexcept
-                    {
-                        using u8_vector = batch<uint8_t, A>;
-                        const auto left = bitwise_cast(arg, u8_vector {}, sve {}).data;
-                        const auto right = svdup_n_u8(0);
-                        const u8_vector result(svext(left, right, N));
-                        return bitwise_cast(result, batch<T, A> {}, sve {});
-                    }
-                };
-
-                template <>
-                struct sve_slider_right<batch<uint8_t, sve>::size>
-                {
-                    template <class T, class A>
-                    XSIMD_INLINE batch<T, A> operator()(batch<T, A> const&) noexcept
-                    {
-                        return batch<T, A> {};
-                    }
-                };
-            } // namespace detail_sve
-
-            template <size_t N, class A, class T, detail_sve::sve_enable_all_t<T> = 0>
-            XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& arg, requires_arch<sve>) noexcept
-            {
-                return detail_sve::sve_slider_right<N>()(arg);
-            }

+        template <size_t N, class A, class T, detail::sve_enable_all_t<T> = 0>
+        XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& arg, requires_arch<sve>) noexcept
+        {
+            return detail::sve_slider_right<N>()(arg);
+        }
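Both sliders work on the byte-level view of the register: `bitwise_cast` to `uint8_t`, `svext` to extract a shifted window against a zero vector, and a cast back. Through the portable API the effect is as follows (a sketch; note that `N` counts bytes, not lanes, and my reading of the zero-fill direction is that `slide_left` fills the low lanes):

```cpp
#include <cstdint>
#include <iostream>

#include "xsimd/xsimd.hpp"

int main()
{
    // slide_left<N> shifts the whole register by N *bytes*, zero-filling --
    // exactly the svext window built by sve_slider_left above.
    xsimd::batch<uint32_t> v(7u);     // every lane = 7
    auto s = xsimd::slide_left<4>(v); // one 4-byte lane of zeros enters
    std::cout << s.get(0) << ' ' << s.get(1) << '\n'; // prints "0 7"
}
```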
-            // isnan
-            template <class A, class T, detail_sve::sve_enable_floating_point_t<T> = 0>
-            XSIMD_INLINE batch_bool<T, A> isnan(batch<T, A> const& arg, requires_arch<sve>) noexcept
-            {
-                return !(arg == arg);
-            }

+        // isnan
+        template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
+        XSIMD_INLINE batch_bool<T, A> isnan(batch<T, A> const& arg, requires_arch<sve>) noexcept
+        {
+            return !(arg == arg);
+        }

-            // nearbyint
-            template <class A, class T, detail_sve::sve_enable_floating_point_t<T> = 0>
-            XSIMD_INLINE batch<T, A> nearbyint(batch<T, A> const& arg, requires_arch<sve>) noexcept
-            {
-                return svrintx_x(detail_sve::sve_ptrue<T>(), arg);
-            }

+        // nearbyint
+        template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
+        XSIMD_INLINE batch<T, A> nearbyint(batch<T, A> const& arg, requires_arch<sve>) noexcept
+        {
+            return svrintx_x(detail::sve_ptrue<T>(), arg);
+        }

-            // nearbyint_as_int
-            template <class A>
-            XSIMD_INLINE batch<int32_t, A> nearbyint_as_int(batch<float, A> const& arg, requires_arch<sve>) noexcept
-            {
-                const auto nearest = svrintx_x(detail_sve::sve_ptrue<float>(), arg);
-                return svcvt_s32_x(detail_sve::sve_ptrue<float>(), nearest);
-            }
-
-            template <class A>
-            XSIMD_INLINE batch<int64_t, A> nearbyint_as_int(batch<double, A> const& arg, requires_arch<sve>) noexcept
-            {
-                const auto nearest = svrintx_x(detail_sve::sve_ptrue<double>(), arg);
-                return svcvt_s64_x(detail_sve::sve_ptrue<double>(), nearest);
-            }

+        // nearbyint_as_int
+        template <class A>
+        XSIMD_INLINE batch<int32_t, A> nearbyint_as_int(batch<float, A> const& arg, requires_arch<sve>) noexcept
+        {
+            const auto nearest = svrintx_x(detail::sve_ptrue<float>(), arg);
+            return svcvt_s32_x(detail::sve_ptrue<float>(), nearest);
+        }

-            // ldexp
-            template <class A, class T, detail_sve::sve_enable_floating_point_t<T> = 0>
-            XSIMD_INLINE batch<T, A> ldexp(const batch<T, A>& x, const batch<as_integer_t<T>, A>& exp, requires_arch<sve>) noexcept
-            {
-                return svscale_x(detail_sve::sve_ptrue<T>(), x, exp);
-            }

+        template <class A>
+        XSIMD_INLINE batch<int64_t, A> nearbyint_as_int(batch<double, A> const& arg, requires_arch<sve>) noexcept
+        {
+            const auto nearest = svrintx_x(detail::sve_ptrue<double>(), arg);
+            return svcvt_s64_x(detail::sve_ptrue<double>(), nearest);
+        }
+
+        // ldexp
+        template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
+        XSIMD_INLINE batch<T, A> ldexp(const batch<T, A>& x, const batch<as_integer_t<T>, A>& exp, requires_arch<sve>) noexcept
+        {
+            return svscale_x(detail::sve_ptrue<T>(), x, exp);
+        }

-    } // namespace XSIMD_SVE_NAMESPACE
     } // namespace kernel
 } // namespace xsimd
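The remaining math kernels round-trip through predicated intrinsics (`svrintx`, `svcvt_*`, `svscale`). For instance, `ldexp(x, e)` computes `x * 2^e` lane-wise; a portable sketch:

```cpp
#include <cstdint>
#include <iostream>

#include "xsimd/xsimd.hpp"

int main()
{
    // ldexp scales by a power of two per lane; on SVE this maps onto a
    // single svscale instruction as in the kernel above.
    xsimd::batch<float> x(1.5f);
    xsimd::batch<int32_t> e(3);
    std::cout << xsimd::ldexp(x, e).get(0) << '\n'; // prints 12
}
```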
diff --git a/include/xsimd/types/xsimd_api.hpp b/include/xsimd/types/xsimd_api.hpp
index 5e6b3a209..1be681777 100644
--- a/include/xsimd/types/xsimd_api.hpp
+++ b/include/xsimd/types/xsimd_api.hpp
@@ -20,6 +20,7 @@
 #include "../arch/xsimd_isa.hpp"
 #include "../types/xsimd_batch.hpp"
 #include "../types/xsimd_traits.hpp"
+#include "../utils/xsimd_type_traits.hpp"

 namespace xsimd
 {
diff --git a/include/xsimd/types/xsimd_batch.hpp b/include/xsimd/types/xsimd_batch.hpp
index 1cac2abc2..b584a2d81 100644
--- a/include/xsimd/types/xsimd_batch.hpp
+++ b/include/xsimd/types/xsimd_batch.hpp
@@ -18,15 +18,11 @@
 #include "../config/xsimd_arch.hpp"
 #include "../config/xsimd_macros.hpp"
 #include "../memory/xsimd_alignment.hpp"
+#include "./xsimd_batch_fwd.hpp"
 #include "./xsimd_utils.hpp"

 namespace xsimd
 {
-    template <class T, class A, bool... Values>
-    struct batch_bool_constant;
-    template <class T, class A = default_arch>
-    class batch;
-
     namespace types
     {
         template <class T, class A>
@@ -301,7 +297,7 @@ namespace xsimd
      * @tparam T the type of the predicated values.
      * @tparam A the architecture this batch is tied too.
      **/
-    template <class T, class A = default_arch>
+    template <class T, class A>
     class batch_bool : public types::get_bool_simd_register_t<T, A>
     {
         using base_type = types::get_bool_simd_register_t<T, A>;
diff --git a/include/xsimd/types/xsimd_batch_fwd.hpp b/include/xsimd/types/xsimd_batch_fwd.hpp
new file mode 100644
index 000000000..62e3cbba7
--- /dev/null
+++ b/include/xsimd/types/xsimd_batch_fwd.hpp
@@ -0,0 +1,41 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
+ * Martin Renou                                                            *
+ * Copyright (c) QuantStack                                                *
+ * Copyright (c) Serge Guelton                                             *
+ *                                                                         *
+ * Distributed under the terms of the BSD 3-Clause License.                *
+ *                                                                         *
+ * The full license is in the file LICENSE, distributed with this software.*
+ ****************************************************************************/
+
+#ifndef XSIMD_BATCH_FWD_HPP
+#define XSIMD_BATCH_FWD_HPP
+
+#include "../config/xsimd_config.hpp"
+
+// TODO: this somewhat duplicates XSIMD_DEFAULT_ARCH, which is only available
+// when an architecture is defined.
+#if defined(XSIMD_NO_SUPPORTED_ARCHITECTURE)
+#define XSIMD_BATCH_DEFAULT_ARCH_IMPL void
+#else
+#include "../config/xsimd_arch.hpp"
+#define XSIMD_BATCH_DEFAULT_ARCH_IMPL default_arch
+#endif // XSIMD_NO_SUPPORTED_ARCHITECTURE
+
+namespace xsimd
+{
+    template <class T, class A = XSIMD_BATCH_DEFAULT_ARCH_IMPL>
+    class batch_bool;
+
+    template <class T, class A, bool... Values>
+    struct batch_bool_constant;
+
+    template <class T, class A = XSIMD_BATCH_DEFAULT_ARCH_IMPL>
+    class batch;
+
+    template <class T, class A, T... Values>
+    struct batch_constant;
+}
+
+#endif
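The point of the new forward-declaration header is that downstream interfaces can name `batch`/`batch_bool` (with the correct default architecture argument) without pulling in the full library. A hypothetical downstream header, as a sketch (assuming xsimd is on the include path; `mylib` and `horizontal_sum` are illustrative names, not part of xsimd):

```cpp
// my_kernels.hpp -- declares an interface over xsimd batches without
// including the heavyweight xsimd headers.
#ifndef MY_KERNELS_HPP
#define MY_KERNELS_HPP

#include "xsimd/types/xsimd_batch_fwd.hpp"

namespace mylib
{
    // An incomplete type is fine here: only a reference is involved, and the
    // default architecture argument comes from the forward declaration.
    float horizontal_sum(xsimd::batch<float> const& v);
}

#endif
```

Only the translation unit that defines `horizontal_sum` needs the full `xsimd/xsimd.hpp`.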
diff --git a/include/xsimd/types/xsimd_neon_register.hpp b/include/xsimd/types/xsimd_neon_register.hpp
index ef9973828..ae76e6dc2 100644
--- a/include/xsimd/types/xsimd_neon_register.hpp
+++ b/include/xsimd/types/xsimd_neon_register.hpp
@@ -12,8 +12,9 @@
 #ifndef XSIMD_NEON_REGISTER_HPP
 #define XSIMD_NEON_REGISTER_HPP

-#include "xsimd_common_arch.hpp"
-#include "xsimd_register.hpp"
+#include "../utils/xsimd_type_traits.hpp"
+#include "./xsimd_common_arch.hpp"
+#include "./xsimd_register.hpp"

 #if XSIMD_WITH_NEON
 #include <arm_neon.h>
@@ -103,40 +104,10 @@ namespace xsimd

     namespace detail
     {
-        template <size_t size>
-        struct get_unsigned_type;
-
-        template <>
-        struct get_unsigned_type<1>
-        {
-            using type = uint8_t;
-        };
-
-        template <>
-        struct get_unsigned_type<2>
-        {
-            using type = uint16_t;
-        };
-
-        template <>
-        struct get_unsigned_type<4>
-        {
-            using type = uint32_t;
-        };
-
-        template <>
-        struct get_unsigned_type<8>
-        {
-            using type = uint64_t;
-        };
-
-        template <size_t size>
-        using get_unsigned_type_t = typename get_unsigned_type<size>::type;
-
         template <class T, class A>
         struct neon_bool_simd_register
         {
-            using type = simd_register<get_unsigned_type_t<sizeof(T)>, A>;
+            using type = simd_register<sized_uint_t<sizeof(T)>, A>;
         };
     }
diff --git a/include/xsimd/types/xsimd_sve_register.hpp b/include/xsimd/types/xsimd_sve_register.hpp
index 7ac748f8d..a0d1b5b99 100644
--- a/include/xsimd/types/xsimd_sve_register.hpp
+++ b/include/xsimd/types/xsimd_sve_register.hpp
@@ -67,55 +67,115 @@ namespace xsimd
         struct sve_vector_type_impl;

         template <>
-        struct sve_vector_type_impl<8>
+        struct sve_vector_type_impl<1>
         {
             using signed_type = sve_int8_t;
             using unsigned_type = sve_uint8_t;
             using floating_point_type = void;
+            using sizeless_unsigned_type = svuint8_t;
+            using sizeless_signed_type = svint8_t;
+            using sizeless_floating_point_type = void;
         };

         template <>
-        struct sve_vector_type_impl<16>
+        struct sve_vector_type_impl<2>
         {
             using signed_type = sve_int16_t;
             using unsigned_type = sve_uint16_t;
             using floating_point_type = void;
+            using sizeless_unsigned_type = svuint16_t;
+            using sizeless_signed_type = svint16_t;
+            using sizeless_floating_point_type = void;
         };

         template <>
-        struct sve_vector_type_impl<32>
+        struct sve_vector_type_impl<4>
         {
             using signed_type = sve_int32_t;
             using unsigned_type = sve_uint32_t;
             using floating_point_type = sve_float32_t;
+            using sizeless_unsigned_type = svuint32_t;
+            using sizeless_signed_type = svint32_t;
+            using sizeless_floating_point_type = svfloat32_t;
         };

         template <>
-        struct sve_vector_type_impl<64>
+        struct sve_vector_type_impl<8>
         {
             using signed_type = sve_int64_t;
             using unsigned_type = sve_uint64_t;
             using floating_point_type = sve_float64_t;
+            using sizeless_unsigned_type = svuint64_t;
+            using sizeless_signed_type = svint64_t;
+            using sizeless_floating_point_type = svfloat64_t;
         };

         template <class T>
-        using signed_int_sve_vector_type = typename sve_vector_type_impl<8 * sizeof(T)>::signed_type;
+        using signed_int_sve_vector_type = typename sve_vector_type_impl<sizeof(T)>::signed_type;
+
+        template <class T>
+        using unsigned_int_sve_vector_type = typename sve_vector_type_impl<sizeof(T)>::unsigned_type;

         template <class T>
-        using unsigned_int_sve_vector_type = typename sve_vector_type_impl<8 * sizeof(T)>::unsigned_type;
+        using floating_point_sve_vector_type = typename sve_vector_type_impl<sizeof(T)>::floating_point_type;

         template <class T>
-        using floating_point_sve_vector_type = typename sve_vector_type_impl<8 * sizeof(T)>::floating_point_type;
+        using sizeless_signed_int_sve_vector_type = typename sve_vector_type_impl<sizeof(T)>::sizeless_signed_type;

         template <class T>
-        using signed_int_or_floating_point_sve_vector_type = std::conditional_t<std::is_floating_point<T>::value,
-                                                                                floating_point_sve_vector_type<T>,
-                                                                                signed_int_sve_vector_type<T>>;
+        using sizeless_unsigned_int_sve_vector_type = typename sve_vector_type_impl<sizeof(T)>::sizeless_unsigned_type;

         template <class T>
-        using sve_vector_type = std::conditional_t<std::is_signed<T>::value,
-                                                   signed_int_or_floating_point_sve_vector_type<T>,
-                                                   unsigned_int_sve_vector_type<T>>;
+        using sizeless_floating_point_sve_vector_type = typename sve_vector_type_impl<sizeof(T)>::sizeless_floating_point_type;
+
+        template <class T, class Enable = void>
+        struct sve_vector_impl;
+
+        template <class T>
+        struct sve_vector_impl<T, std::enable_if_t<std::is_floating_point<T>::value>>
+        {
+            using type = floating_point_sve_vector_type<T>;
+        };
+
+        template <class T>
+        struct sve_vector_impl<T, std::enable_if_t<std::is_integral<T>::value && std::is_signed<T>::value>>
+        {
+            using type = signed_int_sve_vector_type<T>;
+        };
+
+        template <class T>
+        struct sve_vector_impl<T, std::enable_if_t<std::is_integral<T>::value && std::is_unsigned<T>::value>>
+        {
+            using type = unsigned_int_sve_vector_type<T>;
+        };
+
+        template <class T, class Enable = void>
+        struct sizeless_sve_vector_impl;
+
+        template <class T>
+        struct sizeless_sve_vector_impl<T, std::enable_if_t<std::is_floating_point<T>::value>>
+        {
+            using type = sizeless_floating_point_sve_vector_type<T>;
+        };
+
+        template <class T>
+        struct sizeless_sve_vector_impl<T, std::enable_if_t<std::is_integral<T>::value && std::is_signed<T>::value>>
+        {
+            using type = sizeless_signed_int_sve_vector_type<T>;
+        };
+
+        template <class T>
+        struct sizeless_sve_vector_impl<T, std::enable_if_t<std::is_integral<T>::value && std::is_unsigned<T>::value>>
+        {
+            using type = sizeless_unsigned_int_sve_vector_type<T>;
+        };
+
+        template <class T>
+        using sve_vector_type = typename detail::sve_vector_impl<T>::type;
+
+        template <class T>
+        using sizeless_sve_vector_type = typename detail::sizeless_sve_vector_impl<T>::type;
+
     } // namespace detail

     XSIMD_DECLARE_SIMD_REGISTER(signed char, sve, detail::sve_vector_type<signed char>);
diff --git a/include/xsimd/types/xsimd_traits.hpp b/include/xsimd/types/xsimd_traits.hpp
index 34b47c7ee..71ead2cc3 100644
--- a/include/xsimd/types/xsimd_traits.hpp
+++ b/include/xsimd/types/xsimd_traits.hpp
@@ -12,10 +12,16 @@
 #ifndef XSIMD_TRAITS_HPP
 #define XSIMD_TRAITS_HPP

+#include <complex>
 #include <cstddef>
 #include <type_traits>

-#include "xsimd_batch.hpp"
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+#include <xtl/xcomplex.hpp>
+#endif
+
+#include "./xsimd_batch_fwd.hpp"
+#include "./xsimd_utils.hpp"

 /**
  * high level type traits
@@ -397,53 +403,6 @@ namespace xsimd

     template <class T>
     using mask_type_t = typename mask_type<T>::type;
-
-    namespace detail
-    {
-        template <class T>
-        struct widen
-        {
-            using type = std::make_signed_t<typename widen<std::make_unsigned_t<T>>::type>;
-        };
-
-        template <>
-        struct widen<uint32_t>
-        {
-            using type = uint64_t;
-        };
-        template <>
-        struct widen<uint16_t>
-        {
-            using type = uint32_t;
-        };
-        template <>
-        struct widen<uint8_t>
-        {
-            using type = uint16_t;
-        };
-        template <>
-        struct widen<int32_t>
-        {
-            using type = int64_t;
-        };
-        template <>
-        struct widen<int16_t>
-        {
-            using type = int32_t;
-        };
-        template <>
-        struct widen<int8_t>
-        {
-            using type = int16_t;
-        };
-        template <>
-        struct widen<float>
-        {
-            using type = double;
-        };
-    }
-
-    template <class T>
-    using widen_t = typename detail::widen<T>::type;
 }

 #endif
diff --git a/include/xsimd/types/xsimd_utils.hpp b/include/xsimd/types/xsimd_utils.hpp
index aa11b90db..3284e97c0 100644
--- a/include/xsimd/types/xsimd_utils.hpp
+++ b/include/xsimd/types/xsimd_utils.hpp
@@ -23,15 +23,10 @@
 #include "xtl/xcomplex.hpp"
 #endif

+#include "./xsimd_batch_fwd.hpp"
+
 namespace xsimd
 {
-
-    template <class T, class A>
-    class batch;
-
-    template <class T, class A>
-    class batch_bool;
-
     /**************
      * index      *
      **************/
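`sve_vector_impl`/`sizeless_sve_vector_impl` replace the old nested `std::conditional_t` chain with three mutually exclusive `enable_if` partial specializations. The same dispatch pattern, reduced to standard C++ so it compiles anywhere (type names here are illustrative, not xsimd's):

```cpp
#include <cstdint>
#include <type_traits>

// Pick a representation type by inspecting T's category, exactly the
// three-way split used by sve_vector_impl.
template <class T, class Enable = void>
struct storage;

template <class T>
struct storage<T, std::enable_if_t<std::is_floating_point<T>::value>>
{
    using type = double;
};

template <class T>
struct storage<T, std::enable_if_t<std::is_integral<T>::value && std::is_signed<T>::value>>
{
    using type = std::int64_t;
};

template <class T>
struct storage<T, std::enable_if_t<std::is_integral<T>::value && std::is_unsigned<T>::value>>
{
    using type = std::uint64_t;
};

static_assert(std::is_same<storage<float>::type, double>::value, "fp branch");
static_assert(std::is_same<storage<int>::type, std::int64_t>::value, "signed branch");
static_assert(std::is_same<storage<unsigned>::type, std::uint64_t>::value, "unsigned branch");

int main() {}
```

Unlike a `conditional_t` chain, each branch instantiates only the alias it selects, which matters here because `floating_point_sve_vector_type<T>` is `void` for 1- and 2-byte types.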
a/include/xsimd/utils/xsimd_type_traits.hpp b/include/xsimd/utils/xsimd_type_traits.hpp
new file mode 100644
index 000000000..a3f6842f5
--- /dev/null
+++ b/include/xsimd/utils/xsimd_type_traits.hpp
@@ -0,0 +1,127 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
+ * Martin Renou                                                            *
+ * Copyright (c) QuantStack                                                *
+ * Copyright (c) Serge Guelton                                             *
+ *                                                                         *
+ * Distributed under the terms of the BSD 3-Clause License.                *
+ *                                                                         *
+ * The full license is in the file LICENSE, distributed with this software.*
+ ****************************************************************************/
+
+#ifndef XSIMD_TYPE_TRAITS_HPP
+#define XSIMD_TYPE_TRAITS_HPP
+
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+namespace xsimd
+{
+    namespace detail
+    {
+        template <size_t S>
+        struct sized_num_types;
+
+        template <>
+        struct sized_num_types<1>
+        {
+            using signed_type = std::int8_t;
+            using unsigned_type = std::uint8_t;
+            using floating_point_type = void;
+        };
+
+        template <>
+        struct sized_num_types<2>
+        {
+            using signed_type = std::int16_t;
+            using unsigned_type = std::uint16_t;
+            using floating_point_type = void;
+        };
+
+        template <>
+        struct sized_num_types<4>
+        {
+            using signed_type = std::int32_t;
+            using unsigned_type = std::uint32_t;
+            using floating_point_type = float;
+        };
+
+        template <>
+        struct sized_num_types<8>
+        {
+            using signed_type = std::int64_t;
+            using unsigned_type = std::uint64_t;
+            using floating_point_type = double;
+        };
+    }
+
+    /**
+     * @ingroup batch_traits
+     *
+     * Signed integer type with exactly @c S bytes (1, 2, 4, or 8).
+     *
+     * @tparam S size in bytes.
+     */
+    template <size_t S>
+    using sized_int_t = typename detail::sized_num_types<S>::signed_type;
+
+    /**
+     * @ingroup batch_traits
+     *
+     * Unsigned integer type with exactly @c S bytes (1, 2, 4, or 8).
+     *
+     * @tparam S size in bytes.
+     */
+    template <size_t S>
+    using sized_uint_t = typename detail::sized_num_types<S>::unsigned_type;
+
+    /**
+     * @ingroup batch_traits
+     *
+     * Floating-point type with exactly @c S bytes (4 for @c float, 8 for @c double).
+     * Yields @c void for sizes without a standard floating-point type (1, 2).
+     *
+     * @tparam S size in bytes.
+     */
+    template <size_t S>
+    using sized_fp_t = typename detail::sized_num_types<S>::floating_point_type;
+
+    namespace detail
+    {
+        template <class T, class Enable = void>
+        struct widen;
+
+        template <class T>
+        struct widen<T, std::enable_if_t<std::is_floating_point<T>::value>>
+        {
+            using type = xsimd::sized_fp_t<sizeof(T) * 2>;
+        };
+
+        template <class T>
+        struct widen<T, std::enable_if_t<std::is_integral<T>::value && std::is_signed<T>::value>>
+        {
+            using type = xsimd::sized_int_t<sizeof(T) * 2>;
+        };
+
+        template <class T>
+        struct widen<T, std::enable_if_t<std::is_integral<T>::value && std::is_unsigned<T>::value>>
+        {
+            using type = xsimd::sized_uint_t<sizeof(T) * 2>;
+        };
+    }
+
+    /**
+     * @ingroup batch_traits
+     *
+     * The next-wider arithmetic type for @c T: doubles the size while preserving
+     * signedness for integers and yielding @c double for @c float.
+     * Supported input types: @c [u]int{8,16,32}_t and @c float.
+     *
+     * @tparam T arithmetic type to widen.
+     */
+    template <class T>
+    using widen_t = typename detail::widen<T>::type;
+}
+
+#endif
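A few spot-checks of the new aliases, usable as a compile-time sanity test; this simply mirrors the behaviour documented in the header above:

```cpp
#include <cstdint>
#include <type_traits>

#include "xsimd/utils/xsimd_type_traits.hpp"

static_assert(std::is_same<xsimd::sized_int_t<4>, std::int32_t>::value, "4-byte signed");
static_assert(std::is_same<xsimd::sized_uint_t<1>, std::uint8_t>::value, "1-byte unsigned");
static_assert(std::is_same<xsimd::sized_fp_t<8>, double>::value, "8-byte float");
static_assert(std::is_same<xsimd::widen_t<std::uint16_t>, std::uint32_t>::value, "widen keeps unsignedness");
static_assert(std::is_same<xsimd::widen_t<float>, double>::value, "widen float -> double");

int main() {}
```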
diff --git a/include/xsimd/xsimd.hpp b/include/xsimd/xsimd.hpp
index df90a1b32..ea9087ef9 100644
--- a/include/xsimd/xsimd.hpp
+++ b/include/xsimd/xsimd.hpp
@@ -17,18 +17,20 @@

 #include "arch/xsimd_scalar.hpp"
 #include "memory/xsimd_aligned_allocator.hpp"
+#include "types/xsimd_batch_fwd.hpp"

 #if defined(XSIMD_NO_SUPPORTED_ARCHITECTURE)
-// no type definition or anything apart from scalar definition and aligned allocator
 namespace xsimd
 {
-    template <class T, class A = void>
+    // no type definition or anything apart from scalar definition and aligned allocator
+    template <class T, class A>
     class batch
     {
         static constexpr bool supported_architecture = sizeof(A*) == 0; // type-dependant but always false
         static_assert(supported_architecture, "No SIMD architecture detected, cannot instantiate a batch");
     };
 }
+
 #else
 #include "types/xsimd_batch.hpp"
 #include "types/xsimd_batch_constant.hpp"
@@ -36,5 +38,6 @@

 // This include must come last
 #include "types/xsimd_api.hpp"

-#endif
+#endif // XSIMD_NO_SUPPORTED_ARCHITECTURE
+
 #endif