40 changes: 19 additions & 21 deletions include/xsimd/arch/xsimd_avx.hpp
@@ -1427,41 +1427,39 @@ namespace xsimd
        template <class A>
        XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch<uint32_t, A> mask, requires_arch<avx>) noexcept
        {
-           // duplicate low and high part of input
-           // Duplicate lanes separately
-           // 1) duplicate low and high lanes
-           __m256 lo = _mm256_permute2f128_ps(self, self, 0x00); // [low | low]
-           __m256 hi = _mm256_permute2f128_ps(self, self, 0x11); // [high| high]
+           // swap lanes
+           __m256 swapped = _mm256_permute2f128_ps(self, self, 0x01); // [high | low]

-           // normalize mask
-           batch<uint32_t, A> half_mask = mask % 4;
@DiamonDinoia (Contributor) commented on Nov 1, 2025:

> Does this actually generate different asm?
>
> PS: I am fine either way. It might be worth having a normalize<value> API that we can use everywhere to convert a value into a mask when it is a power of two.

The PR author replied:

> I'm unsure. For a regular integer operation I would say so, but with intrinsics I had a doubt.
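A rough sketch of the normalize<value> idea floated above, restricted to power-of-two moduli; the helper name, signature, and template parameters are hypothetical and not part of this PR:

// Hypothetical helper, illustrating the review suggestion; not part of this PR.
// For a power-of-two modulus N, x % N is equivalent to x & (N - 1).
#include <cstdint>

#include <xsimd/xsimd.hpp>

template <uint64_t N, class T, class A>
XSIMD_INLINE xsimd::batch<T, A> normalize(xsimd::batch<T, A> const& x) noexcept
{
    static_assert(N != 0 && (N & (N - 1)) == 0, "N must be a power of two");
    return x & xsimd::batch<T, A>(static_cast<T>(N - 1));
}

// The float swizzle below could then read: auto half_mask = normalize<4>(mask);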

+           // normalize mask taking modulo 4
+           batch<uint32_t, A> half_mask = mask & 0b11u;

            // permute within each lane
-           __m256 r0 = _mm256_permutevar_ps(lo, half_mask);
-           __m256 r1 = _mm256_permutevar_ps(hi, half_mask);
+           __m256 r0 = _mm256_permutevar_ps(self, half_mask);
+           __m256 r1 = _mm256_permutevar_ps(swapped, half_mask);

-           batch_bool<uint32_t, A> blend_mask = mask >= 4;
+           // select lane by the mask index divided by 4
+           constexpr auto lane = batch_constant<uint32_t, A, 0, 0, 0, 0, 4, 4, 4, 4> {};
+           batch_bool<uint32_t, A> blend_mask = (mask & 0b100u) != lane;
            return _mm256_blendv_ps(r0, r1, batch_bool_cast<float>(blend_mask));
        }
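For reference, here is a scalar model of what the new float path computes; it is an illustration written for this review, not code from the PR. It mirrors the bit logic of the intrinsics: in-lane index = mask & 0b11, source lane = mask & 0b100, blended against the destination lane.

// Scalar model of the AVX float swizzle above; illustration only, not PR code.
#include <array>
#include <cstddef>
#include <cstdint>

std::array<float, 8> swizzle_ref(std::array<float, 8> const& self,
                                 std::array<uint32_t, 8> const& mask)
{
    // swapped = [high lane | low lane], like _mm256_permute2f128_ps(self, self, 0x01)
    std::array<float, 8> swapped;
    for (std::size_t i = 0; i < 8; ++i)
        swapped[i] = self[(i + 4) % 8];

    std::array<float, 8> out {};
    for (std::size_t i = 0; i < 8; ++i)
    {
        const std::size_t lane_base = (i / 4) * 4;  // first element of the destination lane
        const uint32_t half = mask[i] & 0b11u;      // in-lane position, i.e. mask % 4
        const float r0 = self[lane_base + half];    // _mm256_permutevar_ps(self, half_mask)
        const float r1 = swapped[lane_base + half]; // _mm256_permutevar_ps(swapped, half_mask)
        // blend_mask: take the swapped copy when the source lane (mask & 0b100)
        // differs from the destination lane (0 for elements 0-3, 4 for elements 4-7)
        const bool take_swapped = (mask[i] & 0b100u) != lane_base;
        out[i] = take_swapped ? r1 : r0;
    }
    return out;
}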

        template <class A>
        XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch<uint64_t, A> mask, requires_arch<avx>) noexcept
        {
-           // duplicate low and high part of input
-           __m256d lo = _mm256_permute2f128_pd(self, self, 0x00);
-           __m256d hi = _mm256_permute2f128_pd(self, self, 0x11);
+           // swap lanes
+           __m256d swapped = _mm256_permute2f128_pd(self, self, 0x01); // [high | low]

-           // normalize mask
-           batch<uint64_t, A> half_mask = -(mask & 1);
+           // The half-mask value is mask modulo 2, but the intrinsic expects it in the
+           // second least significant bit. We use negation as a cheap alternative to a left shift.
+           batch<uint64_t, A> half_mask = -(mask & 0b1u);

            // permute within each lane
-           __m256d r0 = _mm256_permutevar_pd(lo, half_mask);
-           __m256d r1 = _mm256_permutevar_pd(hi, half_mask);
-           // mask to choose the right lane
-           batch_bool<uint64_t, A> blend_mask = mask >= 2;
-           // blend the two permutes
+           __m256d r0 = _mm256_permutevar_pd(self, half_mask);
+           __m256d r1 = _mm256_permutevar_pd(swapped, half_mask);
+           // select lane by the mask index divided by 2
+           constexpr auto lane = batch_constant<uint64_t, A, 0, 0, 2, 2> {};
+           batch_bool<uint64_t, A> blend_mask = (mask & 0b10u) != lane;
            return _mm256_blendv_pd(r0, r1, batch_bool_cast<double>(blend_mask));
        }
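A note on the -(mask & 0b1u) trick kept by the new double path: _mm256_permutevar_pd reads its per-element selector from bit 1 of each 64-bit element, so mask % 2 has to land in bit 1. Negating an unsigned 0 or 1 yields 0 or all-ones, whose bit 1 equals the original bit 0. A minimal scalar check of that bit identity, written for this review rather than taken from the PR:

// Scalar check of the negation trick used above; illustration only, not PR code.
#include <cassert>
#include <cstdint>

int main()
{
    for (uint64_t m = 0; m < 4; ++m)
    {
        const uint64_t half_mask = 0 - (m & 0b1u); // 0 or 0xFFFF...FF (unsigned wrap-around)
        // bit 1 of half_mask carries mask % 2, which is where _mm256_permutevar_pd looks
        assert(((half_mask >> 1) & 1u) == (m & 0b1u));
    }
    return 0;
}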

1 change: 1 addition & 0 deletions test/test_batch_manip.cpp
@@ -247,6 +247,7 @@ struct swizzle_test
        using idx_t = typename xsimd::as_index<value_type>::type;
        auto idx_batch = xsimd::make_batch_constant<idx_t, Pattern<idx_t>, arch_type>();

+       CAPTURE(idx_batch.as_batch());
        CHECK_BATCH_EQ(xsimd::swizzle(b_lhs, idx_batch), b_expect);
        CHECK_BATCH_EQ(xsimd::swizzle(b_lhs,
                                      static_cast<xsimd::batch<idx_t, arch_type>>(idx_batch)),