include/xsimd/arch/xsimd_avx.hpp (20 changes: 2 additions & 18 deletions)
@@ -920,16 +920,8 @@ namespace xsimd
             using int_t = as_integer_t<T>;
             constexpr size_t half_size = batch<T, A>::size / 2;

-            XSIMD_IF_CONSTEXPR(mask.none())
-            {
-                return batch<T, A>(T { 0 });
-            }
-            else XSIMD_IF_CONSTEXPR(mask.all())
-            {
-                return load<A>(mem, Mode {});
-            }
             // confined to lower 128-bit half → forward to SSE2
-            else XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size)
+            XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size)
             {
                 constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(batch_bool_constant<int_t, A, Values...> {});
                 const auto lo = load_masked(reinterpret_cast<int_t const*>(mem), mlo, convert<int_t> {}, Mode {}, sse4_2 {});
@@ -970,16 +962,8 @@ namespace xsimd
         {
             constexpr size_t half_size = batch<T, A>::size / 2;

-            XSIMD_IF_CONSTEXPR(mask.none())
-            {
-                return;
-            }
-            else XSIMD_IF_CONSTEXPR(mask.all())
-            {
-                src.store(mem, Mode {});
-            }
             // confined to lower 128-bit half → forward to SSE2
-            else XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size)
+            XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size)
             {
                 constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(mask);
                 const auto lo = detail::lower_half(src);
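A note on the countl_zero test in these hunks (my gloss, not text from the patch): countl_zero() counts contiguous false lanes starting from the highest lane, so countl_zero() >= half_size means every active lane lives in the low 128 bits and the whole operation can be forwarded to one SSE-width call. A minimal stand-alone model of that predicate, with countl_zero reimplemented purely for illustration:

```cpp
#include <cstddef>

// Illustrative reimplementation of batch_bool_constant::countl_zero():
// count leading (highest-lane-first) false bits of a compile-time mask.
template <bool... Values>
constexpr std::size_t countl_zero() noexcept
{
    constexpr bool bits[] = { Values... };
    std::size_t count = 0;
    for (std::size_t i = sizeof...(Values); i-- > 0 && !bits[i];)
        ++count;
    return count;
}

// 8-lane AVX mask, half_size == 4: only the low half is active, so the
// condition countl_zero() >= half_size holds and SSE2 forwarding applies.
static_assert(countl_zero<true, true, true, false, false, false, false, false>() == 5, "forwarded");
static_assert(countl_zero<true, true, true, true, true, false, false, false>() == 3, "not forwarded");
```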
include/xsimd/arch/xsimd_avx2.hpp (29 changes: 5 additions & 24 deletions)
@@ -142,21 +142,10 @@ namespace xsimd
         XSIMD_INLINE typename std::enable_if<std::is_integral<T>::value && (sizeof(T) >= 4), batch<T, A>>::type
         load_masked(T const* mem, batch_bool_constant<T, A, Values...> mask, convert<T>, Mode, requires_arch<avx2>) noexcept
         {
-            XSIMD_IF_CONSTEXPR(mask.none())
-            {
-                return _mm256_setzero_si256();
-            }
-            else XSIMD_IF_CONSTEXPR(mask.all())
-            {
-                return load<A>(mem, Mode {});
-            }
-            else
-            {
-                static_assert(sizeof(T) == 4 || sizeof(T) == 8, "load_masked supports only 32/64-bit integers on AVX2");
-                using int_t = typename std::conditional<sizeof(T) == 4, int32_t, long long>::type;
-                // Use the raw register-level maskload helpers for the remaining cases.
-                return detail::maskload(reinterpret_cast<const int_t*>(mem), mask.as_batch());
-            }
+            static_assert(sizeof(T) == 4 || sizeof(T) == 8, "load_masked supports only 32/64-bit integers on AVX2");
+            using int_t = typename std::conditional<sizeof(T) == 4, int32_t, long long>::type;
+            // Use the raw register-level maskload helpers for the remaining cases.
+            return detail::maskload(reinterpret_cast<const int_t*>(mem), mask.as_batch());
         }

         template <class A, bool... Values, class Mode>
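For the masks that reach detail::maskload, AVX2's maskload instruction reads only the selected lanes and zeroes the others, so masked-off addresses are never dereferenced. A stand-alone sketch of that behaviour for 32-bit lanes, assuming detail::maskload wraps _mm256_maskload_epi32 (which the 32/64-bit static_assert suggests; the helper itself is not shown in this diff):

```cpp
#include <immintrin.h>
#include <cstdio>

int main()
{
    alignas(32) int data[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
    // Lanes 0, 1 and 5 active: maskload keys off the top bit of each element.
    __m256i mask = _mm256_setr_epi32(-1, -1, 0, 0, 0, -1, 0, 0);
    __m256i v = _mm256_maskload_epi32(data, mask);

    alignas(32) int out[8];
    _mm256_store_si256(reinterpret_cast<__m256i*>(out), v);
    for (int x : out)
        std::printf("%d ", x); // prints: 1 2 0 0 0 6 0 0
    std::printf("\n");
}
```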
@@ -206,16 +195,8 @@ namespace xsimd
         {
             constexpr size_t lanes_per_half = sizeof(__m128i) / sizeof(T);
Review comment from a Contributor on the line above: not sure why I did this here instead of using batch::size.
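A reply-style check of that remark (stand-alone sketch; avx2_lanes and lanes_per_half below are stand-ins, not xsimd names): for a 256-bit AVX2 register the two spellings agree, so batch<T, A>::size / 2 would indeed be the more self-documenting choice.

```cpp
#include <cstddef>
#include <cstdint>

template <class T>
constexpr std::size_t avx2_lanes = 32 / sizeof(T); // stand-in for batch<T, avx2>::size

template <class T>
constexpr std::size_t lanes_per_half = 16 / sizeof(T); // sizeof(__m128i) == 16

static_assert(avx2_lanes<std::int32_t> / 2 == lanes_per_half<std::int32_t>, "8 lanes, 4 per half");
static_assert(avx2_lanes<std::int64_t> / 2 == lanes_per_half<std::int64_t>, "4 lanes, 2 per half");
```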
-            XSIMD_IF_CONSTEXPR(mask.none())
-            {
-                return;
-            }
-            else XSIMD_IF_CONSTEXPR(mask.all())
-            {
-                src.store(mem, Mode {});
-            }
             // confined to lower 128-bit half → forward to SSE
-            else XSIMD_IF_CONSTEXPR(mask.countl_zero() >= lanes_per_half)
+            XSIMD_IF_CONSTEXPR(mask.countl_zero() >= lanes_per_half)
             {
                 constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(mask);
                 const auto lo = detail::lower_half(src);
include/xsimd/arch/xsimd_avx512f.hpp (66 changes: 22 additions & 44 deletions)
@@ -304,34 +304,23 @@ namespace xsimd
                                batch_bool_constant<T, A, Values...> mask,
                                convert<T>, Mode, requires_arch<avx512f>) noexcept
         {
-            XSIMD_IF_CONSTEXPR(mask.none())
+            constexpr auto half = batch<T, A>::size / 2;
+            XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half) // lower-half AVX2 forwarding
             {
-                return batch<T, A>(T { 0 });
+                constexpr auto mlo = ::xsimd::detail::lower_half<avx2>(mask);
+                const auto lo = load_masked<avx2>(mem, mlo, convert<T> {}, Mode {}, avx2 {});
+                return detail::load_masked(lo); // zero-extend low half
             }
-            else XSIMD_IF_CONSTEXPR(mask.all())
+            else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half) // upper-half AVX2 forwarding
             {
-                return load<A>(mem, Mode {});
+                constexpr auto mhi = ::xsimd::detail::upper_half<avx2>(mask);
+                const auto hi = load_masked<avx2>(mem + half, mhi, convert<T> {}, Mode {}, avx2 {});
+                return detail::load_masked(hi, detail::high_tag {});
             }
             else
             {
-                constexpr auto half = batch<T, A>::size / 2;
-                XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half) // lower-half AVX2 forwarding
-                {
-                    constexpr auto mlo = ::xsimd::detail::lower_half<avx2>(mask);
-                    const auto lo = load_masked<avx2>(mem, mlo, convert<T> {}, Mode {}, avx2 {});
-                    return detail::load_masked(lo); // zero-extend low half
-                }
-                else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half) // upper-half AVX2 forwarding
-                {
-                    constexpr auto mhi = ::xsimd::detail::upper_half<avx2>(mask);
-                    const auto hi = load_masked<avx2>(mem + half, mhi, convert<T> {}, Mode {}, avx2 {});
-                    return detail::load_masked(hi, detail::high_tag {});
-                }
-                else
-                {
-                    // fallback to centralized pointer-level helper
-                    return detail::load_masked(mem, mask.mask(), Mode {});
-                }
+                // fallback to centralized pointer-level helper
+                return detail::load_masked(mem, mask.mask(), Mode {});
             }
         }
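Restating the new three-way dispatch as a stand-alone decision model (my illustration; the enum and the bit counting below are invented for it): a mask confined to the low half forwards to AVX2 at mem, one confined to the high half forwards to AVX2 at mem + half, and anything straddling both halves falls back to the pointer-level helper driven by mask.mask().

```cpp
#include <cstdint>

enum class path { lower_half_avx2, upper_half_avx2, full_avx512 };

// 16-lane AVX512 mask modeled as a uint16_t, bit i == lane i active; half == 8.
constexpr path dispatch(std::uint16_t mask) noexcept
{
    constexpr int half = 8;
    int lead = 0, trail = 0;
    for (int i = 15; i >= 0 && !((mask >> i) & 1); --i)
        ++lead; // models countl_zero()
    for (int i = 0; i < 16 && !((mask >> i) & 1); ++i)
        ++trail; // models countr_zero()
    if (lead >= half)
        return path::lower_half_avx2;
    if (trail >= half)
        return path::upper_half_avx2;
    return path::full_avx512;
}

static_assert(dispatch(0x000F) == path::lower_half_avx2, "low 4 lanes: AVX2 on mem");
static_assert(dispatch(0x0F00) == path::upper_half_avx2, "lanes 8-11: AVX2 on mem + half");
static_assert(dispatch(0x0101) == path::full_avx512, "straddles halves: masked AVX512");
```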

@@ -342,34 +331,23 @@ namespace xsimd
                                 batch_bool_constant<T, A, Values...> mask,
                                 Mode, requires_arch<avx512f>) noexcept
         {
-            XSIMD_IF_CONSTEXPR(mask.none())
+            constexpr auto half = batch<T, A>::size / 2;
+            XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half) // lower-half AVX2 forwarding
             {
-                return;
+                constexpr auto mlo = ::xsimd::detail::lower_half<avx2>(mask);
+                const auto lo = detail::lower_half(src);
+                store_masked<avx2>(mem, lo, mlo, Mode {}, avx2 {});
             }
-            else XSIMD_IF_CONSTEXPR(mask.all())
+            else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half) // upper-half AVX2 forwarding
             {
-                src.store(mem, Mode {});
+                constexpr auto mhi = ::xsimd::detail::upper_half<avx2>(mask);
+                const auto hi = detail::upper_half(src);
+                store_masked<avx2>(mem + half, hi, mhi, Mode {}, avx2 {});
             }
             else
             {
-                constexpr auto half = batch<T, A>::size / 2;
-                XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half) // lower-half AVX2 forwarding
-                {
-                    constexpr auto mlo = ::xsimd::detail::lower_half<avx2>(mask);
-                    const auto lo = detail::lower_half(src);
-                    store_masked<avx2>(mem, lo, mlo, Mode {}, avx2 {});
-                }
-                else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half) // upper-half AVX2 forwarding
-                {
-                    constexpr auto mhi = ::xsimd::detail::upper_half<avx2>(mask);
-                    const auto hi = detail::upper_half(src);
-                    store_masked<avx2>(mem + half, hi, mhi, Mode {}, avx2 {});
-                }
-                else
-                {
-                    // fallback to centralized pointer-level helper
-                    detail::store_masked(mem, src, mask.mask(), Mode {});
-                }
+                // fallback to centralized pointer-level helper
+                detail::store_masked(mem, src, mask.mask(), Mode {});
             }
         }

include/xsimd/arch/xsimd_neon.hpp (38 changes: 38 additions & 0 deletions)
@@ -613,6 +613,44 @@ namespace xsimd
             return load_unaligned(mem, t, r);
         }

+        /* masked version */
+        namespace detail
+        {
+            template <bool... Values>
+            struct load_masked;
+
+            template <>
+            struct load_masked<>
+            {
+                template <size_t I, class A, class T, bool Use>
+                static XSIMD_INLINE batch<T, A> apply(T const* mem, batch<T, A> acc, std::integral_constant<bool, Use>) noexcept
+                {
+                    return acc;
+                }
+            };
+            template <bool Value, bool... Values>
+            struct load_masked<Value, Values...>
+            {
+                template <size_t I, class A, class T>
+                static XSIMD_INLINE batch<T, A> apply(T const* mem, batch<T, A> acc, std::true_type) noexcept
+                {
+                    return load_masked<Values...>::template apply<I + 1>(mem, insert(acc, mem[I], index<I> {}), std::integral_constant<bool, Value> {});
+                }
+                template <size_t I, class A, class T>
+                static XSIMD_INLINE batch<T, A> apply(T const* mem, batch<T, A> acc, std::false_type) noexcept
+                {
+                    return load_masked<Values...>::template apply<I + 1>(mem, acc, std::integral_constant<bool, Value> {});
+                }
+            };
+        }
+
+        template <class A, class T, bool Value, bool... Values, class Mode>
+        XSIMD_INLINE batch<T, A> load_masked(T const* mem, batch_bool_constant<T, A, Value, Values...> mask, Mode, requires_arch<neon>) noexcept
+        {
+            // Call insert whenever Values... are true
+            return detail::load_masked<Values...>::template apply<0>(mem, broadcast(T(0), A {}), std::integral_constant<bool, Value> {});
+        }
+
         /*********
          * store *
          *********/
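A scalar model of the masked-load semantics the NEON overload above targets (illustrative only; load_masked_model is not an xsimd function): lane i is read from memory exactly when mask bit i is set, and everything else stays at the broadcast zero.

```cpp
#include <array>
#include <cstddef>

template <std::size_t N>
constexpr std::array<int, N> load_masked_model(const int* mem, const std::array<bool, N>& mask)
{
    std::array<int, N> acc {}; // starts from broadcast(T(0), A {})
    for (std::size_t i = 0; i < N; ++i)
        if (mask[i])
            acc[i] = mem[i]; // plays the role of insert(acc, mem[I], index<I> {})
    return acc;
}

constexpr int data[4] = { 10, 20, 30, 40 };
constexpr auto r = load_masked_model<4>(data, { true, false, true, false });
static_assert(r[0] == 10 && r[1] == 0 && r[2] == 30 && r[3] == 0, "lanes 0 and 2 loaded, rest zero");
```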
include/xsimd/arch/xsimd_sse2.hpp (52 changes: 6 additions & 46 deletions)
@@ -1071,15 +1071,7 @@ namespace xsimd
         template <class A, bool... Values, class Mode>
         XSIMD_INLINE batch<float, A> load_masked(float const* mem, batch_bool_constant<float, A, Values...> mask, Mode, requires_arch<sse2>) noexcept
         {
-            XSIMD_IF_CONSTEXPR(mask.none())
-            {
-                return _mm_setzero_ps();
-            }
-            else XSIMD_IF_CONSTEXPR(mask.all())
-            {
-                return load<A>(mem, Mode {});
-            }
-            else XSIMD_IF_CONSTEXPR(mask.countr_one() == 2)
+            XSIMD_IF_CONSTEXPR(mask.countr_one() == 2)
             {
                 return _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const*>(mem));
             }
@@ -1095,15 +1087,7 @@ namespace xsimd
         template <class A, bool... Values, class Mode>
         XSIMD_INLINE batch<double, A> load_masked(double const* mem, batch_bool_constant<double, A, Values...> mask, Mode, requires_arch<sse2>) noexcept
         {
-            XSIMD_IF_CONSTEXPR(mask.none())
-            {
-                return _mm_setzero_pd();
-            }
-            else XSIMD_IF_CONSTEXPR(mask.all())
-            {
-                return load<A>(mem, Mode {});
-            }
-            else XSIMD_IF_CONSTEXPR(mask.countr_one() == 1)
+            XSIMD_IF_CONSTEXPR(mask.countr_one() == 1)
             {
                 return _mm_move_sd(_mm_setzero_pd(), _mm_load_sd(mem));
             }
@@ -1121,15 +1105,7 @@ namespace xsimd
         template <class A, bool... Values, class Mode>
         XSIMD_INLINE void store_masked(float* mem, batch<float, A> const& src, batch_bool_constant<float, A, Values...> mask, Mode, requires_arch<sse2>) noexcept
         {
-            XSIMD_IF_CONSTEXPR(mask.none())
-            {
-                return;
-            }
-            else XSIMD_IF_CONSTEXPR(mask.all())
-            {
-                src.store(mem, Mode {});
-            }
-            else XSIMD_IF_CONSTEXPR(mask.countr_one() == 2)
+            XSIMD_IF_CONSTEXPR(mask.countr_one() == 2)
             {
                 _mm_storel_pi(reinterpret_cast<__m64*>(mem), src);
             }
@@ -1144,17 +1120,9 @@ namespace xsimd
         }

         template <class A, bool... Values, class Mode>
-        XSIMD_INLINE void store_masked(double* mem, batch<double, A> const& src, batch_bool_constant<double, A, Values...> mask, Mode mode, requires_arch<sse2>) noexcept
+        XSIMD_INLINE void store_masked(double* mem, batch<double, A> const& src, batch_bool_constant<double, A, Values...> mask, Mode, requires_arch<sse2>) noexcept
         {
-            XSIMD_IF_CONSTEXPR(mask.none())
-            {
-                return;
-            }
-            else XSIMD_IF_CONSTEXPR(mask.all())
-            {
-                src.store(mem, mode);
-            }
-            else XSIMD_IF_CONSTEXPR(mask.countr_one() == 1)
+            XSIMD_IF_CONSTEXPR(mask.countr_one() == 1)
             {
                 _mm_store_sd(mem, src);
             }
@@ -2205,15 +2173,7 @@ namespace xsimd
                                         aligned_mode,
                                         requires_arch<sse2>) noexcept
         {
-            XSIMD_IF_CONSTEXPR(mask.none())
-            {
-                return;
-            }
-            else XSIMD_IF_CONSTEXPR(mask.all())
-            {
-                _mm_store_ps(mem, src);
-            }
-            else XSIMD_IF_CONSTEXPR(mask.countr_one() == 2)
+            XSIMD_IF_CONSTEXPR(mask.countr_one() == 2)
             {
                 _mm_storel_pi(reinterpret_cast<__m64*>(mem), src);
             }
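The SSE2 special cases in this file lean on the 64-bit half-register moves: _mm_loadl_pi fills the two low float lanes and keeps the upper half of its first argument (here zero), while _mm_storel_pi writes only the two low lanes back. A small stand-alone demo of the (1,1,0,0) mask case (not xsimd code):

```cpp
#include <xmmintrin.h>
#include <cstdio>

int main()
{
    alignas(16) float mem[4] = { 1.0f, 2.0f, 3.0f, 4.0f };

    // load_masked with mask (1,1,0,0): low two lanes from memory, upper two zeroed.
    __m128 v = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const*>(mem));

    alignas(16) float out[4] = { -1.0f, -1.0f, -1.0f, -1.0f };
    // store_masked with mask (1,1,0,0): only the low two lanes are written.
    _mm_storel_pi(reinterpret_cast<__m64*>(out), v);

    std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); // 1 2 -1 -1
}
```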