From d889cdb3f6677c9d2dde7aa4301e0226f3befa6d Mon Sep 17 00:00:00 2001
From: AntoinePrv
Date: Mon, 17 Nov 2025 12:26:39 +0100
Subject: [PATCH 1/5] Swap instead of duplicate

---
 include/xsimd/arch/xsimd_avx.hpp | 100 +++++++++++++-------------------
 1 file changed, 43 insertions(+), 57 deletions(-)

diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp
index df6bc86f4..21d7d9957 100644
--- a/include/xsimd/arch/xsimd_avx.hpp
+++ b/include/xsimd/arch/xsimd_avx.hpp
@@ -1629,88 +1629,74 @@ namespace xsimd
                 }
                 return split;
             }
-            // Duplicate lanes separately
-            // 1) duplicate low and high lanes
-            __m256 low_dup = _mm256_permute2f128_ps(self, self, 0x00); // [low | low]
-            __m256 hi_dup = _mm256_permute2f128_ps(self, self, 0x11); // [high| high]
-            // 2) build lane-local index vector (each element = source_index & 3)
-            constexpr batch_constant<uint32_t, A, (V0 & 3), (V1 & 3), (V2 & 3), (V3 & 3), (V4 & 3), (V5 & 3), (V6 & 3), (V7 & 3)> half_mask;
+            // Fallback to general algorithm. This is the same as the dynamic version with the exception
+            // that possible operations are done at compile time.
 
-            __m256 r0 = _mm256_permutevar_ps(low_dup, half_mask.as_batch()); // pick from low lane
-            __m256 r1 = _mm256_permutevar_ps(hi_dup, half_mask.as_batch()); // pick from high lane
+            // swap lanes
+            __m256 swapped = _mm256_permute2f128_ps(self, self, 0x01); // [high | low]
+
+            // normalize mask taking modulo 4
+            constexpr auto half_mask = mask % make_batch_constant<uint32_t, A, 4>();
+
+            // permute within each lane
+            __m256 r0 = _mm256_permutevar_ps(self, half_mask.as_batch());
+            __m256 r1 = _mm256_permutevar_ps(swapped, half_mask.as_batch());
 
-            constexpr batch_bool_constant<float, A, (V0 >= 4), (V1 >= 4), (V2 >= 4), (V3 >= 4), (V4 >= 4), (V5 >= 4), (V6 >= 4), (V7 >= 4)> lane_mask {};
+            // select lane by the mask index divided by 4
+            constexpr auto lane = batch_constant<uint32_t, A, 0, 0, 0, 0, 1, 1, 1, 1> {};
+            constexpr int lane_mask = ((mask / make_batch_constant<uint32_t, A, 4>()) != lane).mask();
 
-            return _mm256_blend_ps(r0, r1, lane_mask.mask());
+            return _mm256_blend_ps(r0, r1, lane_mask);
         }
 
         template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
         XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3> mask, requires_arch<avx>) noexcept
         {
             // cannot use detail::mod_shuffle as the mod and shift are different in this case
-            constexpr auto imm = ((V0 & 1) << 0) | ((V1 & 1) << 1) | ((V2 & 1) << 2) | ((V3 & 1) << 3);
+            constexpr auto imm = ((V0 % 2) << 0) | ((V1 % 2) << 1) | ((V2 % 2) << 2) | ((V3 % 2) << 3);
             XSIMD_IF_CONSTEXPR(detail::is_identity(mask)) { return self; }
             XSIMD_IF_CONSTEXPR(!detail::is_cross_lane(mask))
             {
                 return _mm256_permute_pd(self, imm);
             }
-            // duplicate low and high part of input
-            __m256d lo = _mm256_permute2f128_pd(self, self, 0x00);
-            __m256d hi = _mm256_permute2f128_pd(self, self, 0x11);
+
+            // Fallback to general algorithm. This is the same as the dynamic version with the exception
+            // that possible operations are done at compile time.
+
+            // swap lanes
+            __m256d swapped = _mm256_permute2f128_pd(self, self, 0x01); // [high | low]
 
             // permute within each lane
-            __m256d r0 = _mm256_permute_pd(lo, imm);
-            __m256d r1 = _mm256_permute_pd(hi, imm);
+            __m256d r0 = _mm256_permute_pd(self, imm);
+            __m256d r1 = _mm256_permute_pd(swapped, imm);
 
-            // mask to choose the right lane
-            constexpr batch_bool_constant<double, A, (V0 >= 2), (V1 >= 2), (V2 >= 2), (V3 >= 2)> blend_mask;
+            // select lane by the mask index divided by 2
+            constexpr auto lane = batch_constant<uint64_t, A, 0, 0, 1, 1> {};
+            constexpr int lane_mask = ((mask / make_batch_constant<uint64_t, A, 2>()) != lane).mask();
 
             // blend the two permutes
-            return _mm256_blend_pd(r0, r1, blend_mask.mask());
-        }
-        template <class A, typename T, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7, detail::enable_sized_integral_t<T, 4> = 0>
-        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self,
-                                         batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> const& mask,
-                                         requires_arch<avx>) noexcept
+            return _mm256_blend_pd(r0, r1, lane_mask);
+        }
+
+        template <
+            class A, typename T,
+            uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7,
+            detail::enable_sized_integral_t<T, 4> = 0>
+        XSIMD_INLINE batch<T, A> swizzle(
+            batch<T, A> const& self,
+            batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> const& mask,
+            requires_arch<avx>) noexcept
         {
-            return bitwise_cast<T>(
-                swizzle(bitwise_cast<float>(self), mask));
+            return bitwise_cast<T>(swizzle(bitwise_cast<float>(self), mask));
         }
-        template <class A, typename T, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3, detail::enable_sized_integral_t<T, 8> = 0>
-        XSIMD_INLINE batch<T, A>
-        swizzle(batch<T, A> const& self,
-                batch_constant<uint64_t, A, V0, V1, V2, V3> const& mask,
-                requires_arch<avx>) noexcept
+        template <class A, typename T, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3, detail::enable_sized_integral_t<T, 8> = 0>
+        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3> const& mask, requires_arch<avx>) noexcept
         {
-            return bitwise_cast<T>(
-                swizzle(bitwise_cast<double>(self), mask));
+            return bitwise_cast<T>(swizzle(bitwise_cast<double>(self), mask));
         }
 
+        // transpose
         template <class A>
         XSIMD_INLINE void transpose(batch<float, A>* matrix_begin, batch<float, A>* matrix_end, requires_arch<avx>) noexcept
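The swizzle rewritten in this patch computes out[i] = self[mask[i]] across two 128-bit lanes from one lane swap (_mm256_permute2f128 with control 0x01), two in-lane permutes, and a compile-time blend, instead of materialising both lane duplicates. A scalar model of that strategy, as a minimal sketch (swizzle_model and its loops are illustrative only, not xsimd code):

    #include <array>
    #include <cstdint>

    // Scalar model of the swap-and-blend swizzle: out[i] = in[mask[i]] for an
    // 8-float AVX register made of two 4-element lanes.
    std::array<float, 8> swizzle_model(std::array<float, 8> const& in,
                                       std::array<uint32_t, 8> const& mask)
    {
        std::array<float, 8> swapped; // _mm256_permute2f128_ps(self, self, 0x01): [high | low]
        for (int i = 0; i < 8; ++i)
            swapped[i] = in[(i + 4) % 8];

        std::array<float, 8> out;
        for (int i = 0; i < 8; ++i)
        {
            uint32_t half = mask[i] % 4;     // in-lane index (the half_mask constant)
            uint32_t src_lane = mask[i] / 4; // lane holding the source element
            uint32_t cur_lane = i / 4;       // lane of this output slot
            // r0 permutes `in`, r1 permutes `swapped`; the blend keeps r1
            // exactly where the source lane differs from the current lane.
            std::array<float, 8> const& r = (src_lane == cur_lane) ? in : swapped;
            out[i] = r[cur_lane * 4 + half];
        }
        return out;
    }

Comparing this model against _mm256_permutevar_ps/_mm256_blend_ps output for a few masks is a quick way to validate the lane arithmetic.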
From be3e8bdda44cdb5252cd38c95375cfae9bc45b03 Mon Sep 17 00:00:00 2001
From: AntoinePrv
Date: Tue, 18 Nov 2025 10:24:57 +0100
Subject: [PATCH 2/5] Add broadcast optimization

---
 include/xsimd/arch/xsimd_avx.hpp | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp
index 21d7d9957..192a8d78f 100644
--- a/include/xsimd/arch/xsimd_avx.hpp
+++ b/include/xsimd/arch/xsimd_avx.hpp
@@ -1629,6 +1629,17 @@ namespace xsimd
                 }
                 return split;
             }
+            constexpr auto lane_mask = mask % make_batch_constant<uint32_t, A, 4>();
+            XSIMD_IF_CONSTEXPR(detail::is_only_from_lo(mask))
+            {
+                __m256i broadcast = _mm256_permute2f128_pd(self, self, 0x00); // [low | low]
+                return _mm256_permutevar_ps(broadcast, lane_mask.as_batch());
+            }
+            XSIMD_IF_CONSTEXPR(detail::is_only_from_hi(mask))
+            {
+                __m256i broadcast = _mm256_permute2f128_pd(self, self, 0x11); // [high | high]
+                return _mm256_permutevar_ps(broadcast, lane_mask.as_batch());
+            }
 
             // Fallback to general algorithm. This is the same as the dynamic version with the exception
             // that possible operations are done at compile time.
@@ -1655,11 +1666,25 @@ namespace xsimd
         {
             // cannot use detail::mod_shuffle as the mod and shift are different in this case
             constexpr auto imm = ((V0 % 2) << 0) | ((V1 % 2) << 1) | ((V2 % 2) << 2) | ((V3 % 2) << 3);
-            XSIMD_IF_CONSTEXPR(detail::is_identity(mask)) { return self; }
+            XSIMD_IF_CONSTEXPR(detail::is_identity(mask))
+            {
+                return self;
+            }
             XSIMD_IF_CONSTEXPR(!detail::is_cross_lane(mask))
             {
                 return _mm256_permute_pd(self, imm);
             }
+            constexpr auto lane_mask = mask % make_batch_constant<uint64_t, A, 2>();
+            XSIMD_IF_CONSTEXPR(detail::is_only_from_lo(mask))
+            {
+                __m256i broadcast = _mm256_permute2f128_pd(self, self, 0x00); // [low | low]
+                return _mm256_permute_pd(broadcast, lane_mask.as_batch());
+            }
+            XSIMD_IF_CONSTEXPR(detail::is_only_from_hi(mask))
+            {
+                __m256i broadcast = _mm256_permute2f128_pd(self, self, 0x11); // [high | high]
+                return _mm256_permute_pd(broadcast, lane_mask.as_batch());
+            }
 
             // Fallback to general algorithm. This is the same as the dynamic version with the exception
             // that possible operations are done at compile time.
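The fast path added here relies on one observation: when every index in the mask selects from the same 128-bit lane, broadcasting that lane once (_mm256_permute2f128 with 0x00 or 0x11) and permuting within lanes by mask % 4 already produces the result, so the swap/permute/blend sequence can be skipped entirely. A scalar sketch of the predicate this assumes, with only_from_lo as a hypothetical stand-in for the patch's detail::is_only_from_lo, which is not shown in the diff:

    #include <array>
    #include <cstdint>

    // Stand-in for detail::is_only_from_lo: true when every index selects from
    // the low 128-bit lane (elements 0..3), so a [low | low] broadcast suffices.
    bool only_from_lo(std::array<uint32_t, 8> const& mask)
    {
        for (uint32_t v : mask)
            if (v / 4 != 0)
                return false; // some index reaches into the high lane
        return true;
    }
    // When it holds: out[i] = low_lane[mask[i] % 4], with no cross-lane blend.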
From 00ccfd3237224c7c64915ef913652dcf2781aeab Mon Sep 17 00:00:00 2001
From: AntoinePrv
Date: Tue, 18 Nov 2025 15:20:14 +0100
Subject: [PATCH 3/5] Fix duplicate var

---
 include/xsimd/arch/xsimd_avx.hpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp
index 192a8d78f..7762f61a2 100644
--- a/include/xsimd/arch/xsimd_avx.hpp
+++ b/include/xsimd/arch/xsimd_avx.hpp
@@ -1656,9 +1656,9 @@ namespace xsimd
 
             // select lane by the mask index divided by 4
             constexpr auto lane = batch_constant<uint32_t, A, 0, 0, 0, 0, 1, 1, 1, 1> {};
-            constexpr int lane_mask = ((mask / make_batch_constant<uint32_t, A, 4>()) != lane).mask();
+            constexpr int lane_idx = ((mask / make_batch_constant<uint32_t, A, 4>()) != lane).mask();
 
-            return _mm256_blend_ps(r0, r1, lane_mask);
+            return _mm256_blend_ps(r0, r1, lane_idx);
         }
 
         template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
@@ -1698,10 +1698,10 @@ namespace xsimd
 
             // select lane by the mask index divided by 2
             constexpr auto lane = batch_constant<uint64_t, A, 0, 0, 1, 1> {};
-            constexpr int lane_mask = ((mask / make_batch_constant<uint64_t, A, 2>()) != lane).mask();
+            constexpr int lane_idx = ((mask / make_batch_constant<uint64_t, A, 2>()) != lane).mask();
 
             // blend the two permutes
-            return _mm256_blend_pd(r0, r1, lane_mask);
+            return _mm256_blend_pd(r0, r1, lane_idx);
         }
 
         template <
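This rename is a compile fix rather than a behaviour change: the previous patch introduced a function-scope constexpr named lane_mask for the broadcast fast path, and the fallback further down already declared another lane_mask in the same block scope. The early returns inside XSIMD_IF_CONSTEXPR do not remove the later declaration from that scope, so the second constant needs a new name, here lane_idx. A minimal reproduction of the clash (a hypothetical function, not from the patch):

    void swizzle_like()
    {
        constexpr int lane_mask = 0x0F; // fast-path constant (as in PATCH 2/5)
        // constexpr int lane_mask = 0x33; // error: redefinition of 'lane_mask'
        constexpr int lane_idx = 0x33; // PATCH 3/5: rename the fallback constant
        static_assert(lane_mask + lane_idx > 0, "both remain visible in this scope");
    }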
From 35e135e9672da0f842dd456d806af418b083aaf7 Mon Sep 17 00:00:00 2001
From: AntoinePrv
Date: Tue, 18 Nov 2025 16:04:12 +0100
Subject: [PATCH 4/5] Fix types

---
 include/xsimd/arch/xsimd_avx.hpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp
index 7762f61a2..9c0e246b9 100644
--- a/include/xsimd/arch/xsimd_avx.hpp
+++ b/include/xsimd/arch/xsimd_avx.hpp
@@ -1632,12 +1632,12 @@ namespace xsimd
             constexpr auto lane_mask = mask % make_batch_constant<uint32_t, A, 4>();
             XSIMD_IF_CONSTEXPR(detail::is_only_from_lo(mask))
             {
-                __m256i broadcast = _mm256_permute2f128_pd(self, self, 0x00); // [low | low]
+                __m256 broadcast = _mm256_permute2f128_ps(self, self, 0x00); // [low | low]
                 return _mm256_permutevar_ps(broadcast, lane_mask.as_batch());
             }
             XSIMD_IF_CONSTEXPR(detail::is_only_from_hi(mask))
             {
-                __m256i broadcast = _mm256_permute2f128_pd(self, self, 0x11); // [high | high]
+                __m256 broadcast = _mm256_permute2f128_ps(self, self, 0x11); // [high | high]
                 return _mm256_permutevar_ps(broadcast, lane_mask.as_batch());
             }
 
@@ -1677,12 +1677,12 @@ namespace xsimd
             constexpr auto lane_mask = mask % make_batch_constant<uint64_t, A, 2>();
             XSIMD_IF_CONSTEXPR(detail::is_only_from_lo(mask))
             {
-                __m256i broadcast = _mm256_permute2f128_pd(self, self, 0x00); // [low | low]
+                __m256d broadcast = _mm256_permute2f128_pd(self, self, 0x00); // [low | low]
                 return _mm256_permute_pd(broadcast, lane_mask.as_batch());
             }
             XSIMD_IF_CONSTEXPR(detail::is_only_from_hi(mask))
             {
-                __m256i broadcast = _mm256_permute2f128_pd(self, self, 0x11); // [high | high]
+                __m256d broadcast = _mm256_permute2f128_pd(self, self, 0x11); // [high | high]
                 return _mm256_permute_pd(broadcast, lane_mask.as_batch());
             }

From c99e4ba79b753c077cc13a560d6d7b0b1c0b7073 Mon Sep 17 00:00:00 2001
From: AntoinePrv
Date: Tue, 18 Nov 2025 17:00:37 +0100
Subject: [PATCH 5/5] Fix imm constant

---
 include/xsimd/arch/xsimd_avx.hpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp
index 9c0e246b9..ddcc9e4bf 100644
--- a/include/xsimd/arch/xsimd_avx.hpp
+++ b/include/xsimd/arch/xsimd_avx.hpp
@@ -1674,16 +1674,15 @@ namespace xsimd
             {
                 return _mm256_permute_pd(self, imm);
             }
-            constexpr auto lane_mask = mask % make_batch_constant<uint64_t, A, 2>();
             XSIMD_IF_CONSTEXPR(detail::is_only_from_lo(mask))
             {
                 __m256d broadcast = _mm256_permute2f128_pd(self, self, 0x00); // [low | low]
-                return _mm256_permute_pd(broadcast, lane_mask.as_batch());
+                return _mm256_permute_pd(broadcast, imm);
             }
             XSIMD_IF_CONSTEXPR(detail::is_only_from_hi(mask))
             {
                 __m256d broadcast = _mm256_permute2f128_pd(self, self, 0x11); // [high | high]
-                return _mm256_permute_pd(broadcast, lane_mask.as_batch());
+                return _mm256_permute_pd(broadcast, imm);
             }
 
             // Fallback to general algorithm. This is the same as the dynamic version with the exception
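These last two patches align the fast path with the actual intrinsic signatures: _mm256_permute2f128_ps and _mm256_permute2f128_pd return __m256 and __m256d respectively (not __m256i), _mm256_permutevar_ps takes a per-element __m256i control vector, and _mm256_permute_pd accepts only an 8-bit immediate, so the double path has to reuse the imm constant instead of a batch of indices. A standalone contrast of the two control styles, assuming an AVX-capable compiler (the function names are illustrative):

    #include <immintrin.h>

    // Vector-controlled in-lane permute: each 32-bit index selects within its lane.
    __m256 reverse_in_lanes_ps(__m256 v)
    {
        __m256 lo = _mm256_permute2f128_ps(v, v, 0x00);          // [low | low]
        __m256i idx = _mm256_setr_epi32(3, 2, 1, 0, 3, 2, 1, 0); // runtime control vector
        return _mm256_permutevar_ps(lo, idx);
    }

    // Immediate-controlled in-lane permute: one selector bit per double, baked
    // into the instruction encoding -- a vector of indices cannot be passed here.
    __m256d swap_pairs_pd(__m256d v)
    {
        __m256d lo = _mm256_permute2f128_pd(v, v, 0x00); // [low | low]
        return _mm256_permute_pd(lo, 0b0101);            // dst = [a1, a0, a3, a2] of lo
    }

This is why the float path keeps passing lane_mask.as_batch() while the double path reverts to the imm immediate computed at the top of the function.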