diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp
index 2acc650b7..6db583ff8 100644
--- a/include/xsimd/arch/xsimd_avx2.hpp
+++ b/include/xsimd/arch/xsimd_avx2.hpp
@@ -570,6 +570,68 @@ namespace xsimd
             batch_type imag = _mm256_permute4x64_pd(_mm256_unpackhi_pd(hi, lo), _MM_SHUFFLE(3, 1, 2, 0));
             return { real, imag };
         }
+
+        // load_unaligned
+        namespace detail
+        {
+            template <class T>
+            XSIMD_INLINE __m256i load_bool_avx2(bool const* mem) noexcept
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    return _mm256_sub_epi8(_mm256_set1_epi8(0), _mm256_loadu_si256((__m256i const*)mem));
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    auto bpack = _mm_loadu_si128((__m128i const*)mem);
+                    return _mm256_sub_epi16(_mm256_set1_epi8(0), _mm256_cvtepu8_epi16(bpack));
+                }
+                // GCC <12 has missing or buggy unaligned load intrinsics; use memcpy to work around this.
+                // GCC/Clang/MSVC will turn it into the correct load.
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+#if defined(__x86_64__)
+                    uint64_t tmp;
+                    memcpy(&tmp, mem, sizeof(tmp));
+                    auto val = _mm_cvtsi64_si128(tmp);
+#else
+                    __m128i val;
+                    memcpy(&val, mem, sizeof(uint64_t));
+#endif
+                    return _mm256_sub_epi32(_mm256_set1_epi8(0), _mm256_cvtepu8_epi32(val));
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                {
+                    uint32_t tmp;
+                    memcpy(&tmp, mem, sizeof(tmp));
+                    return _mm256_sub_epi64(_mm256_set1_epi8(0), _mm256_cvtepu8_epi64(_mm_cvtsi32_si128(tmp)));
+                }
+                else
+                {
+                    assert(false && "unsupported arch/op combination");
+                    return __m256i {};
+                }
+            }
+        }
+
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE batch_bool<T, A> load_unaligned(bool const* mem, batch_bool<T, A>, requires_arch<avx2>) noexcept
+        {
+            return batch_bool<T, A>(detail::load_bool_avx2<T>(mem));
+        }
+
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> load_unaligned(bool const* mem, batch_bool<float, A>, requires_arch<avx2>) noexcept
+        {
+            return batch_bool<float, A>(_mm256_castsi256_ps(detail::load_bool_avx2<float>(mem)));
+        }
+
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> load_unaligned(bool const* mem, batch_bool<double, A>, requires_arch<avx2>) noexcept
+        {
+            return batch_bool<double, A>(_mm256_castsi256_pd(detail::load_bool_avx2<double>(mem)));
+        }
+
         // mask
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
         XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<avx2>) noexcept
@@ -923,6 +985,66 @@ namespace xsimd
             return _mm256_or_si256(y, w);
         }
 
+        // store
+        namespace detail
+        {
+            template <class T>
+            XSIMD_INLINE void store_bool_avx2(__m256i b, bool* mem, T) noexcept
+            {
+                // GCC <12 has missing or buggy unaligned store intrinsics; use memcpy to work around this.
+                // GCC/Clang/MSVC will turn it into the correct store.
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    // negate mask to convert to 0 or 1
+                    auto val = _mm256_sub_epi8(_mm256_set1_epi8(0), b);
+                    memcpy(mem, &val, sizeof(val));
+                    return;
+                }
+
+                auto b_hi = _mm256_extractf128_si256(b, 1);
+                auto b_lo = _mm256_castsi256_si128(b);
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    auto val = _mm_sub_epi8(_mm_set1_epi8(0), _mm_packs_epi16(b_lo, b_hi));
+                    memcpy(mem, &val, sizeof(val));
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    auto pack_16 = _mm_packs_epi32(b_lo, b_hi);
+                    auto val = _mm_sub_epi8(_mm_set1_epi8(0), _mm_packs_epi16(pack_16, pack_16));
+#if defined(__x86_64__)
+                    auto val_lo = _mm_cvtsi128_si64(val);
+                    memcpy(mem, &val_lo, sizeof(val_lo));
+#else
+                    memcpy(mem, &val, sizeof(uint64_t));
+#endif
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                {
+                    const auto bmask = _mm_set_epi8(
+                        -1, -1, -1, -1, -1, -1, -1, -1,
+                        -1, -1, -1, -1, -1, -1, 8, 0);
+                    auto pack = _mm_unpacklo_epi16(_mm_shuffle_epi8(b_lo, bmask), _mm_shuffle_epi8(b_hi, bmask));
+                    uint32_t val = _mm_cvtsi128_si32(_mm_sub_epi8(_mm_set1_epi8(0), pack));
+                    memcpy(mem, &val, sizeof(val));
+                }
+                else
+                {
+                    assert(false && "unsupported arch/op combination");
+                }
+            }
+
+            XSIMD_INLINE __m256i avx_to_i(__m256 x) { return _mm256_castps_si256(x); }
+            XSIMD_INLINE __m256i avx_to_i(__m256d x) { return _mm256_castpd_si256(x); }
+            XSIMD_INLINE __m256i avx_to_i(__m256i x) { return x; }
+        }
+
+        template <class A, class T>
+        XSIMD_INLINE void store(batch_bool<T, A> b, bool* mem, requires_arch<avx2>) noexcept
+        {
+            detail::store_bool_avx2(detail::avx_to_i(b), mem, T {});
+        }
+
         // ssub
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
         XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
diff --git a/include/xsimd/arch/xsimd_sse2.hpp b/include/xsimd/arch/xsimd_sse2.hpp
index ac98e522a..01787de44 100644
--- a/include/xsimd/arch/xsimd_sse2.hpp
+++ b/include/xsimd/arch/xsimd_sse2.hpp
@@ -1697,6 +1697,60 @@ namespace xsimd
             }
         }
 
+        // store
+        namespace detail
+        {
+            template <class T>
+            XSIMD_INLINE void store_bool_sse2(__m128i b, bool* mem, T) noexcept
+            {
+                // GCC <12 has missing or buggy unaligned store intrinsics; use memcpy to work around this.
+                // GCC/Clang/MSVC will turn it into the correct store.
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    // negate mask to convert to 0 or 1
+                    auto val = _mm_sub_epi8(_mm_set1_epi8(0), b);
+                    memcpy(mem, &val, sizeof(val));
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    auto val = _mm_sub_epi8(_mm_set1_epi8(0), _mm_packs_epi16(b, b));
+#if defined(__x86_64__)
+                    auto val_lo = _mm_cvtsi128_si64(val);
+                    memcpy(mem, &val_lo, sizeof(val_lo));
+#else
+                    memcpy(mem, &val, sizeof(uint64_t));
+#endif
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    auto pack_16 = _mm_packs_epi32(b, b);
+                    uint32_t val = _mm_cvtsi128_si32(_mm_sub_epi8(_mm_set1_epi8(0), _mm_packs_epi16(pack_16, pack_16)));
+                    memcpy(mem, &val, sizeof(val));
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                {
+                    auto pack_32 = _mm_packs_epi32(b, b);
+                    auto pack_16 = _mm_packs_epi32(pack_32, pack_32);
+                    uint16_t val = _mm_cvtsi128_si32(_mm_sub_epi8(_mm_set1_epi8(0), _mm_packs_epi16(pack_16, pack_16)));
+                    memcpy(mem, &val, sizeof(val));
+                }
+                else
+                {
+                    assert(false && "unsupported arch/op combination");
+                }
+            }
+
+            XSIMD_INLINE __m128i sse_to_i(__m128 x) { return _mm_castps_si128(x); }
+            XSIMD_INLINE __m128i sse_to_i(__m128d x) { return _mm_castpd_si128(x); }
+            XSIMD_INLINE __m128i sse_to_i(__m128i x) { return x; }
+        }
+
+        template <class A, class T>
+        XSIMD_INLINE void store(batch_bool<T, A> b, bool* mem, requires_arch<sse2>) noexcept
+        {
+            detail::store_bool_sse2(detail::sse_to_i(b), mem, T {});
+        }
+
         // store_aligned
         template <class A>
         XSIMD_INLINE void store_aligned(float* mem, batch<float, A> const& self, requires_arch<sse2>) noexcept
diff --git a/include/xsimd/arch/xsimd_sse3.hpp b/include/xsimd/arch/xsimd_sse3.hpp
index a88d693cd..5ad7575b3 100644
--- a/include/xsimd/arch/xsimd_sse3.hpp
+++ b/include/xsimd/arch/xsimd_sse3.hpp
@@ -59,7 +59,6 @@ namespace xsimd
             __m128 tmp2 = _mm_mul_ps(tmp1, _mm_movehdup_ps(tmp1));
             return _mm_cvtss_f32(tmp2);
         }
-
 
     }
 }
diff --git a/include/xsimd/arch/xsimd_sse4_1.hpp b/include/xsimd/arch/xsimd_sse4_1.hpp
index 7fce2c314..ae01c8d02 100644
--- a/include/xsimd/arch/xsimd_sse4_1.hpp
+++ b/include/xsimd/arch/xsimd_sse4_1.hpp
@@ -122,6 +122,68 @@ namespace xsimd
             }
         }
 
+        // load_unaligned
+        namespace detail
+        {
+            template <class T>
+            XSIMD_INLINE __m128i load_bool_sse4_1(bool const* mem) noexcept
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    return _mm_sub_epi8(_mm_set1_epi8(0), _mm_loadu_si128((__m128i const*)mem));
+                }
+                // GCC <12 has missing or buggy unaligned load intrinsics; use memcpy to work around this.
+                // GCC/Clang/MSVC will turn it into the correct load.
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+#if defined(__x86_64__)
+                    uint64_t tmp;
+                    memcpy(&tmp, mem, sizeof(tmp));
+                    auto val = _mm_cvtsi64_si128(tmp);
+#else
+                    __m128i val;
+                    memcpy(&val, mem, sizeof(uint64_t));
+#endif
+                    return _mm_sub_epi16(_mm_set1_epi8(0), _mm_cvtepu8_epi16(val));
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    uint32_t tmp;
+                    memcpy(&tmp, mem, sizeof(tmp));
+                    return _mm_sub_epi32(_mm_set1_epi8(0), _mm_cvtepu8_epi32(_mm_cvtsi32_si128(tmp)));
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                {
+                    uint16_t tmp;
+                    memcpy(&tmp, mem, sizeof(tmp));
+                    return _mm_sub_epi64(_mm_set1_epi8(0), _mm_cvtepu8_epi64(_mm_cvtsi32_si128((uint32_t)tmp)));
+                }
+                else
+                {
+                    assert(false && "unsupported arch/op combination");
+                    return __m128i {};
+                }
+            }
+        }
+
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE batch_bool<T, A> load_unaligned(bool const* mem, batch_bool<T, A>, requires_arch<sse4_1>) noexcept
+        {
+            return batch_bool<T, A>(detail::load_bool_sse4_1<T>(mem));
+        }
+
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> load_unaligned(bool const* mem, batch_bool<float, A>, requires_arch<sse4_1>) noexcept
+        {
+            return batch_bool<float, A>(_mm_castsi128_ps(detail::load_bool_sse4_1<float>(mem)));
+        }
+
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> load_unaligned(bool const* mem, batch_bool<double, A>, requires_arch<sse4_1>) noexcept
+        {
+            return batch_bool<double, A>(_mm_castsi128_pd(detail::load_bool_sse4_1<double>(mem)));
+        }
+
         // max
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
         XSIMD_INLINE batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
diff --git a/test/test_batch_bool.cpp b/test/test_batch_bool.cpp
index b94540761..b262e65a6 100644
--- a/test/test_batch_bool.cpp
+++ b/test/test_batch_bool.cpp
@@ -185,26 +185,22 @@ struct batch_bool_test
     {
     };
 
-    template <size_t... Values>
-    void check_constructor_from_sequence(std::integral_constant<size_t, 0>, pack<Values...>) const
+    template <class F, size_t... Values>
+    static batch_bool_type make_batch_impl(F&& f, std::integral_constant<size_t, 0>, pack<Values...>)
     {
-        bool_array_type res = { bool(Values % 3)... };
-        bool_array_type tmp;
-        batch_bool_type b0(bool(Values % 3)...);
-        b0.store_unaligned(tmp.data());
-        INFO("batch_bool(values...)");
-        CHECK_EQ(tmp, res);
-
-        batch_bool_type b1 { bool(Values % 3)... };
-        b1.store_unaligned(tmp.data());
-        INFO("batch_bool{values...}");
-        CHECK_EQ(tmp, res);
+        return batch_bool_type(bool(f(Values))...);
     }
 
-    template <size_t N, size_t... Values>
-    void check_constructor_from_sequence(std::integral_constant<size_t, N>, pack<Values...>) const
+    template <class F, size_t N, size_t... Values>
+    static batch_bool_type make_batch_impl(F&& f, std::integral_constant<size_t, N>, pack<Values...>)
     {
-        return check_constructor_from_sequence(std::integral_constant<size_t, N - 1>(), pack<N - 1, Values...>());
+        return make_batch_impl(std::forward<F>(f), std::integral_constant<size_t, N - 1>(), pack<N - 1, Values...>());
+    }
+
+    template <class F>
+    static batch_bool_type make_batch(F&& f)
+    {
+        return make_batch_impl(std::forward<F>(f), std::integral_constant<size_t, size>(), pack<> {});
     }
 
     void test_constructors() const
@@ -213,18 +209,40 @@ struct batch_bool_test
         // value uninitialized, cannot test it.
         (void)a;
 
-        bool_array_type res;
-        batch_bool_type b(true);
-        b.store_unaligned(res.data());
-        INFO("batch_bool{value}");
-        CHECK_EQ(res, all_true);
+        {
+            bool_array_type res;
+            batch_bool_type b(true);
+            b.store_unaligned(res.data());
+            INFO("batch_bool{value}");
+            CHECK_EQ(res, all_true);
+
+            batch_bool_type c { true };
+            c.store_unaligned(res.data());
+            INFO("batch_bool{value}");
+            CHECK_EQ(res, all_true);
+        }
 
-        batch_bool_type c { true };
-        c.store_unaligned(res.data());
-        INFO("batch_bool{value}");
-        CHECK_EQ(res, all_true);
+        {
+            auto f_bool = [](size_t i)
+            { return bool(i % 3); };
+
+            bool_array_type res;
+            for (size_t i = 0; i < res.size(); i++)
+            {
+                res[i] = f_bool(i);
+            }
 
-        check_constructor_from_sequence(std::integral_constant<size_t, size>(), pack<>());
+            bool_array_type tmp;
+            batch_bool_type b0 = make_batch(f_bool);
+            b0.store_unaligned(tmp.data());
+            INFO("batch_bool(values...)");
+            CHECK_EQ(tmp, res);
+
+            batch_bool_type b1 = make_batch(f_bool);
+            b1.store_unaligned(tmp.data());
+            INFO("batch_bool{values...}");
+            CHECK_EQ(tmp, res);
+        }
     }
 
     void test_load_store() const
@@ -239,6 +257,38 @@ struct batch_bool_test
         b = batch_bool_type::load_aligned(arhs.data());
         b.store_aligned(ares.data());
         CHECK_EQ(ares, arhs);
+
+        auto bool_g = xsimd::get_bool<batch_bool_type> {};
+        // load/store, almost all false
+        {
+            size_t i = 0;
+            for (const auto& vec : bool_g.almost_all_false())
+            {
+                batch_bool_type b = batch_bool_type::load_unaligned(vec.data());
+                batch_bool_type expected = make_batch([i](size_t x)
+                                                      { return x == i; });
+                i++;
+                CHECK_UNARY(xsimd::all(b == expected));
+                b.store_unaligned(res.data());
+                // Check that the representation is bitwise exact.
+                CHECK_UNARY(memcmp(res.data(), vec.data(), sizeof(res)) == 0);
+            }
+        }
+
+        // load/store, almost all true
+        {
+            size_t i = 0;
+            for (const auto& vec : bool_g.almost_all_true())
+            {
+                batch_bool_type b = batch_bool_type::load_unaligned(vec.data());
+                batch_bool_type expected = make_batch([i](size_t x)
+                                                      { return x != i; });
+                i++;
+                CHECK_UNARY(xsimd::all(b == expected));
+                b.store_unaligned(res.data());
+                CHECK_UNARY(memcmp(res.data(), vec.data(), sizeof(res)) == 0);
+            }
+        }
     }

     void test_any_all() const
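Not part of the diff: a minimal usage sketch of the bool-array round trip that the new `load_unaligned(bool const*)` / `store(bool*)` kernels and tests above exercise. It assumes a default-arch xsimd build in which one of the SSE4.1/AVX2 overloads added here is selected; the `flags`/`out` names are illustrative only.

```cpp
// Minimal sketch, assuming xsimd is on the include path and the default
// architecture provides the bool-array load/store kernels added in this diff.
#include <xsimd/xsimd.hpp>

#include <array>
#include <cstdio>

int main()
{
    using batch_bool = xsimd::batch_bool<float>; // default architecture

    // "Almost all false" pattern, as in the new tests: only lane 0 is set.
    std::array<bool, batch_bool::size> flags {};
    flags[0] = true;

    // Expand the bool array into a full per-lane mask.
    batch_bool b = batch_bool::load_unaligned(flags.data());

    // And back: each lane is written out as exactly 0 or 1.
    std::array<bool, batch_bool::size> out {};
    b.store_unaligned(out.data());

    std::printf("lane0=%d lane1=%d\n", int(out[0]), int(out[1])); // lane0=1 lane1=0
    return 0;
}
```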