diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp
index a9e30b4ab..6e5e4342e 100644
--- a/include/xsimd/arch/xsimd_avx2.hpp
+++ b/include/xsimd/arch/xsimd_avx2.hpp
@@ -1095,42 +1095,67 @@ namespace xsimd
             }
         }
 
-        // swizzle (dynamic mask)
+        // swizzle (dynamic mask) on 8 and 16 bits; see avx for 32 and 64 bits versions
         template <class A>
-        XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch<uint32_t, A> mask, requires_arch<avx2>) noexcept
+        XSIMD_INLINE batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch<uint8_t, A> mask, requires_arch<avx2>) noexcept
         {
-            return swizzle(self, mask, avx {});
+            // swap lanes
+            __m256i swapped = _mm256_permute2x128_si256(self, self, 0x01); // [high | low]
+
+            // normalize mask taking modulo 16
+            batch<uint8_t, A> half_mask = mask & 0b1111u;
+
+            // permute bytes within each lane (AVX2 only)
+            __m256i r0 = _mm256_shuffle_epi8(self, half_mask);
+            __m256i r1 = _mm256_shuffle_epi8(swapped, half_mask);
+
+            // select lane by the mask index divided by 16
+            constexpr auto lane = batch_constant<
+                uint8_t, A,
+                00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00,
+                16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16> {};
+            batch_bool<uint8_t, A> blend_mask = (mask & 0b10000u) != lane;
+            return _mm256_blendv_epi8(r0, r1, blend_mask);
         }
-        template <class A>
-        XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch<uint64_t, A> mask, requires_arch<avx2>) noexcept
+
+        template <class A, class T, detail::enable_sized_integral_t<T, 1> = 0>
+        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch<uint8_t, A> const& mask, requires_arch<avx2> req) noexcept
         {
-            batch<uint32_t, A> broadcaster = { 0, 1, 0, 1, 0, 1, 0, 1 };
-            constexpr uint64_t comb = 0x0000000100000001ul * 2;
-            return bitwise_cast<double>(swizzle(bitwise_cast<float>(self), bitwise_cast<uint32_t>(mask * comb) + broadcaster, avx2 {}));
+            return bitwise_cast<T>(swizzle(bitwise_cast<uint8_t>(self), mask, req));
         }
 
         template <class A>
-        XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch<uint64_t, A> mask, requires_arch<avx2>) noexcept
+        XSIMD_INLINE batch<uint16_t, A> swizzle(
+            batch<uint16_t, A> const& self, batch<uint16_t, A> mask, requires_arch<avx2> req) noexcept
         {
-            return bitwise_cast<uint64_t>(swizzle(bitwise_cast<double>(self), mask, avx2 {}));
+            // No blend/shuffle for 16 bits, we need to use the 8 bits version
+            const auto self_bytes = bitwise_cast<uint8_t>(self);
+            // If a mask entry is k, we want 2k in low byte and 2k+1 in high byte
+            const auto mask_2k_2kp1 = bitwise_cast<uint8_t>((mask << 1) | (mask << 9) | 0x100);
+            return bitwise_cast<uint16_t>(swizzle(self_bytes, mask_2k_2kp1, req));
        }
-        template <class A>
-        XSIMD_INLINE batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch<uint64_t, A> mask, requires_arch<avx2>) noexcept
+
+        template <class A, class T, detail::enable_sized_integral_t<T, 2> = 0>
+        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch<uint16_t, A> const& mask, requires_arch<avx2> req) noexcept
         {
-            return bitwise_cast<int64_t>(swizzle(bitwise_cast<double>(self), mask, avx2 {}));
+            return bitwise_cast<T>(swizzle(bitwise_cast<uint16_t>(self), mask, req));
         }
-        template <class A>
-        XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch<uint32_t, A> mask, requires_arch<avx2>) noexcept
+
+        // swizzle (constant mask)
+        template <class A, class T, uint8_t... Vals, detail::enable_sized_integral_t<T, 1> = 0>
+        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch_constant<uint8_t, A, Vals...> mask, requires_arch<avx2> req) noexcept
        {
-            return swizzle(self, mask, avx {});
+            static_assert(sizeof...(Vals) == 32, "Must contain as many uint8_t as can fit in avx register");
+            return swizzle(self, mask.as_batch(), req);
         }
-        template <class A>
-        XSIMD_INLINE batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch<uint32_t, A> mask, requires_arch<avx2>) noexcept
+
+        template <class A, class T, uint16_t... Vals, detail::enable_sized_integral_t<T, 2> = 0>
+        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch_constant<uint16_t, A, Vals...> mask, requires_arch<avx2> req) noexcept
        {
-            return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, avx2 {}));
+            static_assert(sizeof...(Vals) == 16, "Must contain as many uint16_t as can fit in avx register");
+            return swizzle(self, mask.as_batch(), req);
         }
 
-        // swizzle (constant mask)
         template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
         XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
         {
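
For reference, a minimal usage sketch of the dynamic 8-bit swizzle added above (not part of the patch; it assumes an AVX2 build and only the public xsimd batch/swizzle API, and the helper name reverse_bytes is made up for illustration):

    // Illustrative only: reverse the 32 bytes of an AVX2 register with a runtime mask.
    // Dynamic swizzle semantics: result[i] == input[mask[i]], so mask[i] = 31 - i reverses the register.
    #include <xsimd/xsimd.hpp>
    #include <cstdint>

    xsimd::batch<uint8_t, xsimd::avx2> reverse_bytes(xsimd::batch<uint8_t, xsimd::avx2> input)
    {
        alignas(32) uint8_t idx[32];
        for (int i = 0; i < 32; ++i)
            idx[i] = static_cast<uint8_t>(31 - i); // runtime index, not a compile-time batch_constant
        auto mask = xsimd::batch<uint8_t, xsimd::avx2>::load_aligned(idx);
        return xsimd::swizzle(input, mask);
    }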