From 7ede28f3c3ac020c8acdc669d05b948f5b9f0236 Mon Sep 17 00:00:00 2001
From: AntoinePrv
Date: Fri, 31 Oct 2025 15:58:35 +0100
Subject: [PATCH 1/6] Add dynamic swizzle for uint8_t on avx2

---
 include/xsimd/arch/xsimd_avx2.hpp | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp
index a9e30b4ab..bcbfa9e2a 100644
--- a/include/xsimd/arch/xsimd_avx2.hpp
+++ b/include/xsimd/arch/xsimd_avx2.hpp
@@ -1096,6 +1096,34 @@ namespace xsimd
         }
 
         // swizzle (dynamic mask)
+        template <class A>
+        XSIMD_INLINE batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch<uint8_t, A> mask, requires_arch<avx2>) noexcept
+        {
+            // swap lanes
+            __m256i swapped = _mm256_permute2x128_si256(self, self, 0x01); // [high | low]
+
+            // normalize mask taking modulo 16
+            batch<uint8_t, A> half_mask = mask & 0b1111u;
+
+            // permute bytes within each lane (AVX2 only)
+            __m256i r0 = _mm256_shuffle_epi8(self, half_mask);
+            __m256i r1 = _mm256_shuffle_epi8(swapped, half_mask);
+
+            // select lane by the mask index divided by 16
+            constexpr auto lane = batch_constant<
+                uint8_t, A,
+                00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00,
+                16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16> {};
+            batch_bool<uint8_t, A> blend_mask = (mask & 0b10000u) != lane;
+            return _mm256_blendv_epi8(r0, r1, blend_mask);
+        }
+
+        template <class A, typename T, detail::enable_sized_t<T, 1> = 0>
+        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch<T, A> const& mask, requires_arch<avx2>) noexcept
+        {
+            return bitwise_cast<T>(swizzle(bitwise_cast<uint8_t>(self), mask));
+        }
+
         template <class A>
         XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch<uint32_t, A> mask, requires_arch<avx2>) noexcept
         {

From ca01f17d0d312a7f3c9996c6f04e79857e0b0724 Mon Sep 17 00:00:00 2001
From: AntoinePrv
Date: Fri, 31 Oct 2025 16:14:25 +0100
Subject: [PATCH 2/6] Add dynamic swizzle for uint16_t on avx2

---
 include/xsimd/arch/xsimd_avx2.hpp | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp
index bcbfa9e2a..fd182806b 100644
--- a/include/xsimd/arch/xsimd_avx2.hpp
+++ b/include/xsimd/arch/xsimd_avx2.hpp
@@ -1119,11 +1119,28 @@ namespace xsimd
         }
 
         template <class A, typename T, detail::enable_sized_t<T, 1> = 0>
-        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch<T, A> const& mask, requires_arch<avx2>) noexcept
+        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch<uint8_t, A> const& mask, requires_arch<avx2>) noexcept
         {
             return bitwise_cast<T>(swizzle(bitwise_cast<uint8_t>(self), mask));
         }
 
+        template <class A>
+        XSIMD_INLINE batch<uint16_t, A> swizzle(
+            batch<uint16_t, A> const& self, batch<uint16_t, A> mask, requires_arch<avx2>) noexcept
+        {
+            // No blend/shuffle for 16 bits, we need to use the 8 bits version
+            const auto self_bytes = bitwise_cast<uint8_t>(self);
+            // If a mask entry is k, we want 2k in low byte and 2k+1 in high byte
+            const auto mask_2k_2kp1 = bitwise_cast<uint8_t>((mask << 1) | (mask << 9) | 0x100);
+            return bitwise_cast<uint16_t>(swizzle(self_bytes, mask_2k_2kp1, requires_arch<avx2>));
+        }
+
+        template <class A, typename T, detail::enable_sized_t<T, 2> = 0>
+        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch<uint16_t, A> const& mask, requires_arch<avx2>) noexcept
+        {
+            return bitwise_cast<T>(swizzle(bitwise_cast<uint16_t>(self), mask));
+        }
+
         template <class A>
         XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch<uint32_t, A> mask, requires_arch<avx2>) noexcept
         {
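
Aside: a scalar model may help when reviewing patches 1 and 2. _mm256_shuffle_epi8 only permutes bytes within each 128-bit lane, so the kernel computes two candidates, one shuffled from the original register and one from the lane-swapped copy, then blends on bit 4 of each index, which tells whether the index points into the other 16-byte half. The following is a minimal sketch of that logic, not part of the patch; swizzle_bytes_model is a hypothetical name:

#include <array>
#include <cassert>
#include <cstdint>

// Scalar model of the AVX2 uint8_t kernel: out[i] = self[mask[i] % 32].
std::array<uint8_t, 32> swizzle_bytes_model(std::array<uint8_t, 32> const& self,
                                            std::array<uint8_t, 32> const& mask)
{
    std::array<uint8_t, 32> out {};
    for (std::size_t i = 0; i < 32; ++i)
    {
        std::size_t lane = i / 16;                   // 128-bit lane of output byte i
        std::size_t within = mask[i] & 0b1111u;      // half_mask: index modulo 16
        uint8_t r0 = self[lane * 16 + within];       // _mm256_shuffle_epi8 on self
        uint8_t r1 = self[(1 - lane) * 16 + within]; // _mm256_shuffle_epi8 on swapped
        // blend_mask: take r1 exactly when the half selected by bit 4 of the
        // index differs from the lane being written
        bool take_r1 = ((mask[i] & 0b10000u) != 0) != (lane == 1);
        out[i] = take_r1 ? r1 : r0;
        assert(out[i] == self[mask[i] % 32]); // net effect of the three instructions
    }
    return out;
}
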
From a5350d7abf208c584679befb2106034f57223793 Mon Sep 17 00:00:00 2001
From: AntoinePrv
Date: Fri, 31 Oct 2025 16:19:56 +0100
Subject: [PATCH 3/6] Fix required arch

---
 include/xsimd/arch/xsimd_avx2.hpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp
index fd182806b..4b4769fd0 100644
--- a/include/xsimd/arch/xsimd_avx2.hpp
+++ b/include/xsimd/arch/xsimd_avx2.hpp
@@ -1119,26 +1119,26 @@ namespace xsimd
         }
 
         template <class A, typename T, detail::enable_sized_t<T, 1> = 0>
-        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch<uint8_t, A> const& mask, requires_arch<avx2>) noexcept
+        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch<uint8_t, A> const& mask, requires_arch<avx2> req) noexcept
         {
-            return bitwise_cast<T>(swizzle(bitwise_cast<uint8_t>(self), mask));
+            return bitwise_cast<T>(swizzle(bitwise_cast<uint8_t>(self), mask, req));
         }
 
         template <class A>
         XSIMD_INLINE batch<uint16_t, A> swizzle(
-            batch<uint16_t, A> const& self, batch<uint16_t, A> mask, requires_arch<avx2>) noexcept
+            batch<uint16_t, A> const& self, batch<uint16_t, A> mask, requires_arch<avx2> req) noexcept
         {
             // No blend/shuffle for 16 bits, we need to use the 8 bits version
             const auto self_bytes = bitwise_cast<uint8_t>(self);
             // If a mask entry is k, we want 2k in low byte and 2k+1 in high byte
             const auto mask_2k_2kp1 = bitwise_cast<uint8_t>((mask << 1) | (mask << 9) | 0x100);
-            return bitwise_cast<uint16_t>(swizzle(self_bytes, mask_2k_2kp1, requires_arch<avx2>));
+            return bitwise_cast<uint16_t>(swizzle(self_bytes, mask_2k_2kp1, req));
         }
 
         template <class A, typename T, detail::enable_sized_t<T, 2> = 0>
-        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch<uint16_t, A> const& mask, requires_arch<avx2>) noexcept
+        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch<uint16_t, A> const& mask, requires_arch<avx2> req) noexcept
         {
-            return bitwise_cast<T>(swizzle(bitwise_cast<uint16_t>(self), mask));
+            return bitwise_cast<T>(swizzle(bitwise_cast<uint16_t>(self), mask, req));
         }
 
         template <class A>

From a4993fb7b90a5f05e8033a4c827c5e964908d7a9 Mon Sep 17 00:00:00 2001
From: AntoinePrv
Date: Fri, 31 Oct 2025 16:26:40 +0100
Subject: [PATCH 4/6] Remove avx2 -> avx forward already implicit

---
 include/xsimd/arch/xsimd_avx2.hpp | 36 +-------------------------------
 1 file changed, 1 insertion(+), 35 deletions(-)

diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp
index 4b4769fd0..353e6b8ce 100644
--- a/include/xsimd/arch/xsimd_avx2.hpp
+++ b/include/xsimd/arch/xsimd_avx2.hpp
@@ -1095,7 +1095,7 @@ namespace xsimd
             }
         }
 
-        // swizzle (dynamic mask)
+        // swizzle (dynamic mask) on 8 and 16 bits; see avx for 32 and 64 bits versions
         template <class A>
         XSIMD_INLINE batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch<uint8_t, A> mask, requires_arch<avx2>) noexcept
         {
@@ -1141,40 +1141,6 @@ namespace xsimd
             return bitwise_cast<T>(swizzle(bitwise_cast<uint16_t>(self), mask, req));
         }
 
-        template <class A>
-        XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch<uint32_t, A> mask, requires_arch<avx2>) noexcept
-        {
-            return swizzle(self, mask, avx {});
-        }
-        template <class A>
-        XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch<uint64_t, A> mask, requires_arch<avx2>) noexcept
-        {
-            batch<uint32_t, A> broadcaster = { 0, 1, 0, 1, 0, 1, 0, 1 };
-            constexpr uint64_t comb = 0x0000000100000001ul * 2;
-            return bitwise_cast<double>(swizzle(bitwise_cast<float>(self), bitwise_cast<uint32_t>(mask * comb) + broadcaster, avx2 {}));
-        }
-
-        template <class A>
-        XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch<uint64_t, A> mask, requires_arch<avx2>) noexcept
-        {
-            return bitwise_cast<uint64_t>(swizzle(bitwise_cast<double>(self), mask, avx2 {}));
-        }
-        template <class A>
-        XSIMD_INLINE batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch<uint64_t, A> mask, requires_arch<avx2>) noexcept
-        {
-            return bitwise_cast<int64_t>(swizzle(bitwise_cast<double>(self), mask, avx2 {}));
-        }
-        template <class A>
-        XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch<uint32_t, A> mask, requires_arch<avx2>) noexcept
-        {
-            return swizzle(self, mask, avx {});
-        }
-        template <class A>
-        XSIMD_INLINE batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch<uint32_t, A> mask, requires_arch<avx2>) noexcept
-        {
-            return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, avx2 {}));
-        }
-
         // swizzle (constant mask)
         template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
         XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
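
Aside: the mask arithmetic in the uint16_t kernel can be checked in isolation. A 16-bit element k occupies bytes 2k and 2k + 1 in little-endian order, so the byte-level mask needs exactly that pair. The sketch below is illustrative only (byte_pair is a hypothetical name), assuming indices k < 16:

#include <cassert>
#include <cstdint>

// Expand one uint16_t swizzle index into the byte-index pair fed to the
// uint8_t kernel: low byte 2k, high byte 2k + 1.
uint16_t byte_pair(uint16_t k)
{
    // k << 1 puts 2k in the low byte; k << 9 puts 2k in the high byte;
    // 0x100 sets the low bit of the high byte, turning its 2k into 2k + 1.
    return static_cast<uint16_t>((k << 1) | (k << 9) | 0x100);
}

int main()
{
    for (uint16_t k = 0; k < 16; ++k)
    {
        uint16_t m = byte_pair(k);
        assert((m & 0xFF) == 2 * k);   // low byte selects byte 2k
        assert((m >> 8) == 2 * k + 1); // high byte selects byte 2k + 1
    }
}
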
From 27e181320ca9a56c4ecad114e0e29ea21fdd0988 Mon Sep 17 00:00:00 2001
From: AntoinePrv
Date: Mon, 10 Nov 2025 16:46:34 -0800
Subject: [PATCH 5/6] Add fallback AVX2 swizzle for constant mask

---
 include/xsimd/arch/xsimd_avx2.hpp | 33 +++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp
index 353e6b8ce..f21766759 100644
--- a/include/xsimd/arch/xsimd_avx2.hpp
+++ b/include/xsimd/arch/xsimd_avx2.hpp
@@ -1142,6 +1142,39 @@ namespace xsimd
         }
 
         // swizzle (constant mask)
+        template <
+            class A, typename T,
+            uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
+            uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15,
+            uint8_t V16, uint8_t V17, uint8_t V18, uint8_t V19, uint8_t V20, uint8_t V21, uint8_t V22, uint8_t V23,
+            uint8_t V24, uint8_t V25, uint8_t V26, uint8_t V27, uint8_t V28, uint8_t V29, uint8_t V30, uint8_t V31,
+            detail::enable_sized_t<T, 1> = 0>
+        XSIMD_INLINE batch<T, A> swizzle(
+            batch<T, A> const& self,
+            batch_constant<
+                uint8_t, A,
+                V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15,
+                V16, V17, V18, V19, V20, V21, V22, V23, V24, V25, V26, V27, V28, V29, V30, V31>
+                mask,
+            requires_arch<avx2> req) noexcept
+        {
+            return swizzle(self, mask.as_batch(), req);
+        }
+
+        template <
+            class A, typename T,
+            uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7,
+            uint16_t V8, uint16_t V9, uint16_t V10, uint16_t V11, uint16_t V12, uint16_t V13, uint16_t V14, uint16_t V15,
+            detail::enable_sized_t<T, 2> = 0>
+        XSIMD_INLINE batch<T, A> swizzle(
+            batch<T, A> const& self,
+            batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15>
+                mask,
+            requires_arch<avx2> req) noexcept
+        {
+            return swizzle(self, mask.as_batch(), req);
+        }
+
         template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
         XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
         {
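
Aside: the fallback above deliberately materializes the compile-time indices with as_batch() and reuses the dynamic kernel rather than a dedicated constant shuffle. A hedged usage sketch, assuming the public xsimd::swizzle and batch_constant API on an AVX2 target (reverse_bytes is a hypothetical helper, not from the patch):

#include <cstdint>
#include "xsimd/xsimd.hpp"

using A = xsimd::avx2;

// Reverse the 32 bytes of an AVX2 register; with the patch applied, the
// constant-mask overload routes the indices to the dynamic byte swizzle.
xsimd::batch<uint8_t, A> reverse_bytes(xsimd::batch<uint8_t, A> const& v)
{
    constexpr auto rev = xsimd::batch_constant<uint8_t, A,
        31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
        15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0> {};
    return xsimd::swizzle(v, rev);
}
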
From 3eecf19d9eaf05a349782dc7ba5abf719bd452bd Mon Sep 17 00:00:00 2001
From: AntoinePrv
Date: Tue, 11 Nov 2025 08:06:55 -0800
Subject: [PATCH 6/6] Use variadic template arguments

---
 include/xsimd/arch/xsimd_avx2.hpp | 31 ++++++-------------------------
 1 file changed, 6 insertions(+), 25 deletions(-)

diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp
index f21766759..6e5e4342e 100644
--- a/include/xsimd/arch/xsimd_avx2.hpp
+++ b/include/xsimd/arch/xsimd_avx2.hpp
@@ -1142,36 +1142,17 @@ namespace xsimd
         }
 
         // swizzle (constant mask)
-        template <
-            class A, typename T,
-            uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
-            uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15,
-            uint8_t V16, uint8_t V17, uint8_t V18, uint8_t V19, uint8_t V20, uint8_t V21, uint8_t V22, uint8_t V23,
-            uint8_t V24, uint8_t V25, uint8_t V26, uint8_t V27, uint8_t V28, uint8_t V29, uint8_t V30, uint8_t V31,
-            detail::enable_sized_t<T, 1> = 0>
-        XSIMD_INLINE batch<T, A> swizzle(
-            batch<T, A> const& self,
-            batch_constant<
-                uint8_t, A,
-                V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15,
-                V16, V17, V18, V19, V20, V21, V22, V23, V24, V25, V26, V27, V28, V29, V30, V31>
-                mask,
-            requires_arch<avx2> req) noexcept
+        template <class A, typename T, uint8_t... Vals, detail::enable_sized_t<T, 1> = 0>
+        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch_constant<uint8_t, A, Vals...> mask, requires_arch<avx2> req) noexcept
         {
+            static_assert(sizeof...(Vals) == 32, "Must contain as many uint8_t as can fit in avx register");
             return swizzle(self, mask.as_batch(), req);
         }
 
-        template <
-            class A, typename T,
-            uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7,
-            uint16_t V8, uint16_t V9, uint16_t V10, uint16_t V11, uint16_t V12, uint16_t V13, uint16_t V14, uint16_t V15,
-            detail::enable_sized_t<T, 2> = 0>
-        XSIMD_INLINE batch<T, A> swizzle(
-            batch<T, A> const& self,
-            batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15>
-                mask,
-            requires_arch<avx2> req) noexcept
+        template <class A, typename T, uint16_t... Vals, detail::enable_sized_t<T, 2> = 0>
+        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch_constant<uint16_t, A, Vals...> mask, requires_arch<avx2> req) noexcept
         {
+            static_assert(sizeof...(Vals) == 16, "Must contain as many uint16_t as can fit in avx register");
             return swizzle(self, mask.as_batch(), req);
         }
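
Aside: once the series is applied, a run-time byte permutation on AVX2 goes through the new dynamic kernel. A minimal usage sketch, with gather_bytes as a hypothetical helper and indices assumed to lie in [0, 32):

#include <cstdint>
#include "xsimd/xsimd.hpp"

using A = xsimd::avx2;

// Permute the 32 bytes of v by run-time indices: out[i] = v[idx[i]].
xsimd::batch<uint8_t, A> gather_bytes(xsimd::batch<uint8_t, A> const& v,
                                      uint8_t const* idx /* 32 entries */)
{
    auto mask = xsimd::batch<uint8_t, A>::load_unaligned(idx);
    return xsimd::swizzle(v, mask); // dynamic-mask overload, AVX2 kernel above
}
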