From 42b96a1ddcd2d8c2f0aa2a0bab82a9b83bd6e5db Mon Sep 17 00:00:00 2001
From: serge-sans-paille
Date: Tue, 11 Nov 2025 22:03:15 +0100
Subject: [PATCH] Improve xsimd::expand common implementation

The previous implementation assumed that xsimd::insert is cheap, and it
always generated batch::size inserts.

This implementation can take advantage of a smaller popcount on the bitmask.

Note to self: it would be great to have a good implementation for a constant
mask.
---
 .../xsimd/arch/common/xsimd_common_memory.hpp | 27 +++++++------------
 1 file changed, 10 insertions(+), 17 deletions(-)

diff --git a/include/xsimd/arch/common/xsimd_common_memory.hpp b/include/xsimd/arch/common/xsimd_common_memory.hpp
index 54bf87eb6..f5744a110 100644
--- a/include/xsimd/arch/common/xsimd_common_memory.hpp
+++ b/include/xsimd/arch/common/xsimd_common_memory.hpp
@@ -88,28 +88,21 @@ namespace xsimd
         }
 
         // expand
-        namespace detail
-        {
-            template <class IT, class A, class I, size_t... Is>
-            XSIMD_INLINE batch<IT, A> create_expand_swizzle_mask(I bitmask, ::xsimd::detail::index_sequence<Is...>)
-            {
-                batch<IT, A> swizzle_mask(IT(0));
-                IT j = 0;
-                (void)std::initializer_list<bool> { ((swizzle_mask = insert(swizzle_mask, j, index<Is>())), (j += ((bitmask >> Is) & 1u)), true)... };
-                return swizzle_mask;
-            }
-        }
-
         template <typename A, typename T>
         XSIMD_INLINE batch<T, A>
         expand(batch<T, A> const& x, batch_bool<T, A> const& mask,
                kernel::requires_arch<common>) noexcept
         {
-            constexpr std::size_t size = batch_bool<T, A>::size;
-            auto bitmask = mask.mask();
-            auto swizzle_mask = detail::create_expand_swizzle_mask<as_unsigned_integer_t<T>, A>(bitmask, ::xsimd::detail::make_index_sequence<size>());
-            auto z = swizzle(x, swizzle_mask);
-            return select(mask, z, batch<T, A>(T(0)));
+            constexpr auto size = batch<T, A>::size;
+            alignas(A::alignment()) T x_in[size], x_out[size] = { T() };
+            x.store_aligned(x_in);
+            int i = 0, j = 0;
+            for (auto bitmask = mask.mask(); bitmask; bitmask >>= 1, ++i)
+            {
+                if (bitmask & 1)
+                    x_out[i] = x_in[j++];
+            }
+            return xsimd::batch<T, A>::load_aligned(x_out);
         }
 
         // extract_pair