diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp
index b57d5c187..9eb3e7b9a 100644
--- a/include/xsimd/arch/xsimd_avx.hpp
+++ b/include/xsimd/arch/xsimd_avx.hpp
@@ -1427,41 +1427,39 @@ namespace xsimd
         template <class A>
         XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch<uint32_t, A> mask, requires_arch<avx>) noexcept
         {
-            // duplicate low and high part of input
-            // Duplicate lanes separately
-            // 1) duplicate low and high lanes
-            __m256 lo = _mm256_permute2f128_ps(self, self, 0x00); // [low | low]
-            __m256 hi = _mm256_permute2f128_ps(self, self, 0x11); // [high| high]
+            // swap lanes
+            __m256 swapped = _mm256_permute2f128_ps(self, self, 0x01); // [high | low]
 
-            // normalize mask
-            batch<uint32_t, A> half_mask = mask % 4;
+            // normalize the mask by taking it modulo 4
+            batch<uint32_t, A> half_mask = mask & 0b11u;
 
             // permute within each lane
-            __m256 r0 = _mm256_permutevar_ps(lo, half_mask);
-            __m256 r1 = _mm256_permutevar_ps(hi, half_mask);
+            __m256 r0 = _mm256_permutevar_ps(self, half_mask);
+            __m256 r1 = _mm256_permutevar_ps(swapped, half_mask);
 
-            batch_bool<uint32_t, A> blend_mask = mask >= 4;
+            // select the source lane from the mask index divided by 4
+            constexpr auto lane = batch_constant<uint32_t, A, 0, 0, 0, 0, 0b100, 0b100, 0b100, 0b100> {};
+            batch_bool<uint32_t, A> blend_mask = (mask & 0b100u) != lane;
             return _mm256_blendv_ps(r0, r1, batch_bool_cast<float>(blend_mask));
         }
 
         template <class A>
         XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch<uint64_t, A> mask, requires_arch<avx>) noexcept
         {
-            // duplicate low and high part of input
-            __m256d lo = _mm256_permute2f128_pd(self, self, 0x00);
-            __m256d hi = _mm256_permute2f128_pd(self, self, 0x11);
+            // swap lanes
+            __m256d swapped = _mm256_permute2f128_pd(self, self, 0x01); // [high | low]
 
-            // normalize mask
-            batch<uint64_t, A> half_mask = -(mask & 1);
+            // The half mask value is found in mask modulo 2, but the intrinsic expects it in
+            // the second least significant bit. We use negation as a cheap alternative to a
+            // left shift.
+            batch<uint64_t, A> half_mask = -(mask & 0b1u);
 
             // permute within each lane
-            __m256d r0 = _mm256_permutevar_pd(lo, half_mask);
-            __m256d r1 = _mm256_permutevar_pd(hi, half_mask);
-
-            // mask to choose the right lane
-            batch_bool<uint64_t, A> blend_mask = mask >= 2;
+            __m256d r0 = _mm256_permutevar_pd(self, half_mask);
+            __m256d r1 = _mm256_permutevar_pd(swapped, half_mask);
 
-            // blend the two permutes
+            // select the source lane from the mask index divided by 2
+            constexpr auto lane = batch_constant<uint64_t, A, 0, 0, 0b10, 0b10> {};
+            batch_bool<uint64_t, A> blend_mask = (mask & 0b10u) != lane;
             return _mm256_blendv_pd(r0, r1, batch_bool_cast<double>(blend_mask));
         }
diff --git a/test/test_batch_manip.cpp b/test/test_batch_manip.cpp
index 4f29632e0..dcc88c6cb 100644
--- a/test/test_batch_manip.cpp
+++ b/test/test_batch_manip.cpp
@@ -247,6 +247,7 @@ struct swizzle_test
         using idx_t = typename xsimd::as_index<value_type>::type;
         auto idx_batch = xsimd::make_batch_constant<idx_t, Pattern<idx_t>, arch_type>();
 
+        CAPTURE(idx_batch.as_batch());
         CHECK_BATCH_EQ(xsimd::swizzle(b_lhs, idx_batch), b_expect);
         CHECK_BATCH_EQ(xsimd::swizzle(b_lhs, static_cast<xsimd::batch<idx_t, arch_type>>(idx_batch)),
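Not part of the patch: the rewrite replaces two lane duplications with one lane swap, so each output slot ends up with an in-lane candidate (r0) and a cross-lane candidate (r1), and the blend picks r1 exactly when the requested element lives in the other lane. A minimal scalar model of the float path may make that blend condition easier to audit; swizzle_model is a hypothetical name, and self[i ^ 4] stands in for _mm256_permute2f128_ps(self, self, 0x01).

    #include <array>
    #include <cstdint>

    std::array<float, 8> swizzle_model(std::array<float, 8> const& self,
                                       std::array<uint32_t, 8> const& mask)
    {
        std::array<float, 8> swapped, out;
        for (int i = 0; i < 8; ++i)
            swapped[i] = self[i ^ 4]; // lane swap: [high | low]
        for (int i = 0; i < 8; ++i)
        {
            uint32_t idx = mask[i] & 0b11u;  // index within a lane (mask % 4)
            uint32_t slot_lane = i & 0b100u; // lane that output slot i lives in
            float r0 = self[slot_lane | idx];    // _mm256_permutevar_ps(self, half_mask)
            float r1 = swapped[slot_lane | idx]; // _mm256_permutevar_ps(swapped, half_mask)
            // take the swapped source exactly when the requested lane bit
            // (mask & 0b100) differs from the slot's own lane constant
            out[i] = ((mask[i] & 0b100u) != slot_lane) ? r1 : r0;
        }
        return out;
    }

In both branches out[i] equals self[(mask[i] & 0b100u) | idx], i.e. self[mask[i] % 8], which is the swizzle contract; the per-slot lane constant is what the batch_constant lane encodes in the vectorized version.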
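For context, a minimal usage sketch of the dynamic-mask overload this kernel backs, assuming an AVX-enabled build (e.g. -mavx); reverse_runtime is a hypothetical helper, not code from the repository.

    #include <cstdint>
    #include <xsimd/xsimd.hpp>

    // Reverse an AVX float batch with a runtime index vector; a runtime
    // batch mask takes the dynamic-swizzle path rewritten above.
    xsimd::batch<float, xsimd::avx> reverse_runtime(xsimd::batch<float, xsimd::avx> v)
    {
        xsimd::batch<uint32_t, xsimd::avx> idx(7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u);
        return xsimd::swizzle(v, idx);
    }

This is also why the test checks swizzle twice: once with the batch_constant index (compile-time mask) and once with the same indices cast to a plain batch, which is the runtime path exercised here.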