diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp
index df6bc86f4..ddcc9e4bf 100644
--- a/include/xsimd/arch/xsimd_avx.hpp
+++ b/include/xsimd/arch/xsimd_avx.hpp
@@ -1629,88 +1629,98 @@ namespace xsimd
                 }
                 return split;
             }
-            // Duplicate lanes separately
-            // 1) duplicate low and high lanes
-            __m256 low_dup = _mm256_permute2f128_ps(self, self, 0x00); // [low | low]
-            __m256 hi_dup = _mm256_permute2f128_ps(self, self, 0x11); // [high| high]
+            constexpr auto lane_mask = mask % make_batch_constant<uint32_t, A, 4, 4, 4, 4, 4, 4, 4, 4>();
+            XSIMD_IF_CONSTEXPR(detail::is_only_from_lo(mask))
+            {
+                __m256 broadcast = _mm256_permute2f128_ps(self, self, 0x00); // [low | low]
+                return _mm256_permutevar_ps(broadcast, lane_mask.as_batch());
+            }
+            XSIMD_IF_CONSTEXPR(detail::is_only_from_hi(mask))
+            {
+                __m256 broadcast = _mm256_permute2f128_ps(self, self, 0x11); // [high | high]
+                return _mm256_permutevar_ps(broadcast, lane_mask.as_batch());
+            }
+
+            // Fallback to general algorithm. This is the same as the dynamic version with the exception
+            // that possible operations are done at compile time.
+
+            // swap lanes
+            __m256 swapped = _mm256_permute2f128_ps(self, self, 0x01); // [high | low]
 
-            // 2) build lane-local index vector (each element = source_index & 3)
-            constexpr batch_constant<uint32_t, A, (V0 & 3), (V1 & 3), (V2 & 3), (V3 & 3), (V4 & 3), (V5 & 3), (V6 & 3), (V7 & 3)> half_mask;
+            // normalize mask taking modulo 4
+            constexpr auto half_mask = mask % make_batch_constant<uint32_t, A, 4, 4, 4, 4, 4, 4, 4, 4>();
 
-            __m256 r0 = _mm256_permutevar_ps(low_dup, half_mask.as_batch()); // pick from low lane
-            __m256 r1 = _mm256_permutevar_ps(hi_dup, half_mask.as_batch()); // pick from high lane
+            // permute within each lane
+            __m256 r0 = _mm256_permutevar_ps(self, half_mask.as_batch());
+            __m256 r1 = _mm256_permutevar_ps(swapped, half_mask.as_batch());
 
-            constexpr batch_bool_constant<uint32_t, A, (V0 >= 4), (V1 >= 4), (V2 >= 4), (V3 >= 4), (V4 >= 4), (V5 >= 4), (V6 >= 4), (V7 >= 4)> lane_mask {};
+            // select lane by the mask index divided by 4
+            constexpr auto lane = batch_constant<uint32_t, A, 0, 0, 0, 0, 1, 1, 1, 1> {};
+            constexpr int lane_idx = ((mask / make_batch_constant<uint32_t, A, 4, 4, 4, 4, 4, 4, 4, 4>()) != lane).mask();
 
-            return _mm256_blend_ps(r0, r1, lane_mask.mask());
+            return _mm256_blend_ps(r0, r1, lane_idx);
         }
         template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
         XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3> mask, requires_arch<avx>) noexcept
         {
             // cannot use detail::mod_shuffle as the mod and shift are different in this case
-            constexpr auto imm = ((V0 & 1) << 0) | ((V1 & 1) << 1) | ((V2 & 1) << 2) | ((V3 & 1) << 3);
-            XSIMD_IF_CONSTEXPR(detail::is_identity(mask)) { return self; }
+            constexpr auto imm = ((V0 % 2) << 0) | ((V1 % 2) << 1) | ((V2 % 2) << 2) | ((V3 % 2) << 3);
+            XSIMD_IF_CONSTEXPR(detail::is_identity(mask))
+            {
+                return self;
+            }
             XSIMD_IF_CONSTEXPR(!detail::is_cross_lane(mask))
             {
                 return _mm256_permute_pd(self, imm);
             }
-            // duplicate low and high part of input
-            __m256d lo = _mm256_permute2f128_pd(self, self, 0x00);
-            __m256d hi = _mm256_permute2f128_pd(self, self, 0x11);
+            XSIMD_IF_CONSTEXPR(detail::is_only_from_lo(mask))
+            {
+                __m256d broadcast = _mm256_permute2f128_pd(self, self, 0x00); // [low | low]
+                return _mm256_permute_pd(broadcast, imm);
+            }
+            XSIMD_IF_CONSTEXPR(detail::is_only_from_hi(mask))
+            {
+                __m256d broadcast = _mm256_permute2f128_pd(self, self, 0x11); // [high | high]
+                return _mm256_permute_pd(broadcast, imm);
+            }
+
+            // Fallback to general algorithm. This is the same as the dynamic version with the exception
+            // that possible operations are done at compile time.
+
+            // swap lanes
+            __m256d swapped = _mm256_permute2f128_pd(self, self, 0x01); // [high | low]
 
             // permute within each lane
-            __m256d r0 = _mm256_permute_pd(lo, imm);
-            __m256d r1 = _mm256_permute_pd(hi, imm);
+            __m256d r0 = _mm256_permute_pd(self, imm);
+            __m256d r1 = _mm256_permute_pd(swapped, imm);
 
-            // mask to choose the right lane
-            constexpr batch_bool_constant<uint64_t, A, (V0 >= 2), (V1 >= 2), (V2 >= 2), (V3 >= 2)> blend_mask;
+            // select lane by the mask index divided by 2
+            constexpr auto lane = batch_constant<uint64_t, A, 0, 0, 1, 1> {};
+            constexpr int lane_idx = ((mask / make_batch_constant<uint64_t, A, 2, 2, 2, 2>()) != lane).mask();
 
             // blend the two permutes
-            return _mm256_blend_pd(r0, r1, blend_mask.mask());
-        }
-        template <class A, typename T, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7, detail::enable_sized_integral_t<T, 4> = 0>
-        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self,
-                                         batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> const& mask,
-                                         requires_arch<avx>) noexcept
+            return _mm256_blend_pd(r0, r1, lane_idx);
+        }
+
+        template <
+            class A, typename T,
+            uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7,
+            detail::enable_sized_integral_t<T, 4> = 0>
+        XSIMD_INLINE batch<T, A> swizzle(
+            batch<T, A> const& self,
+            batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> const& mask,
+            requires_arch<avx>) noexcept
         {
-            return bitwise_cast<T>(
-                swizzle(bitwise_cast<float>(self), mask));
+            return bitwise_cast<T>(swizzle(bitwise_cast<float>(self), mask));
         }
-        template <class A, typename T, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3, detail::enable_sized_integral_t<T, 8> = 0>
-        XSIMD_INLINE batch<T, A>
-        swizzle(batch<T, A> const& self,
-                batch_constant<uint64_t, A, V0, V1, V2, V3> const& mask,
-                requires_arch<avx>) noexcept
+        template <class A, typename T, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3, detail::enable_sized_integral_t<T, 8> = 0>
+        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3> const& mask, requires_arch<avx>) noexcept
        {
-            return bitwise_cast<T>(
-                swizzle(bitwise_cast<double>(self), mask));
+            return bitwise_cast<T>(swizzle(bitwise_cast<double>(self), mask));
         }
+        // transpose
         template <class A>
         XSIMD_INLINE void transpose(batch<float, A>* matrix_begin, batch<float, A>* matrix_end, requires_arch<avx>) noexcept
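
The new is_only_from_lo / is_only_from_hi fast paths reduce the cross-lane case to two instructions whenever every source index falls inside the same 128-bit lane: one _mm256_permute2f128_ps/_pd broadcasts that lane into both halves, then a single in-lane variable permute picks the elements. Below is a minimal stand-alone sketch of the same idea using plain AVX intrinsics with a runtime index vector instead of a batch_constant; it is an illustration only, not part of the patch, and the helper names swizzle_from_low_lane / swizzle_from_high_lane are ours, not xsimd API.

#include <immintrin.h>

// Gather all eight outputs from the LOW 128-bit lane of v.
// idx holds one source index in [0, 3] per 32-bit element; permutevar only reads its low 2 bits.
inline __m256 swizzle_from_low_lane(__m256 v, __m256i idx)
{
    __m256 low_both = _mm256_permute2f128_ps(v, v, 0x00); // [low | low]
    return _mm256_permutevar_ps(low_both, idx);           // pick within each 128-bit half
}

// Same trick for the HIGH lane, matching the 0x11 broadcast in the patch.
inline __m256 swizzle_from_high_lane(__m256 v, __m256i idx)
{
    __m256 high_both = _mm256_permute2f128_ps(v, v, 0x11); // [high | high]
    return _mm256_permutevar_ps(high_both, idx);
}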
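In the fallback path used by both the float and double overloads, each output element is taken either from the in-lane permute of self or from the in-lane permute of the lane-swapped register, and that choice is folded into a compile-time blend immediate via ((mask / lane_size) != lane).mask(). The sketch below mirrors that computation outside xsimd as a C++14 constexpr helper; it is an illustration only, not part of the patch, and lane_blend_immediate is a hypothetical name. Element i must be fetched from the swapped register exactly when its source lane (Vi / lane_size) differs from the lane it lives in (i / lane_size).

#include <cstddef>
#include <cstdint>

// Illustration only: how the fallback derives the _mm256_blend_ps / _mm256_blend_pd
// immediate at compile time. LaneSize is 4 for __m256 (float) and 2 for __m256d (double).
template <std::size_t LaneSize, uint32_t... V>
constexpr int lane_blend_immediate()
{
    constexpr uint32_t idx[] = { V... };
    int imm = 0;
    for (std::size_t i = 0; i < sizeof...(V); ++i)
    {
        if (idx[i] / LaneSize != i / LaneSize) // source lane differs from destination lane
            imm |= 1 << i;
    }
    return imm;
}

// A full 8-element reversal crosses lanes for every element, so every blend bit is set...
static_assert(lane_blend_immediate<4, 7, 6, 5, 4, 3, 2, 1, 0>() == 0xFF, "reversal crosses lanes everywhere");
// ...while the identity mask never leaves its lane and blends nothing from the swapped register.
static_assert(lane_blend_immediate<4, 0, 1, 2, 3, 4, 5, 6, 7>() == 0x00, "identity stays in lane");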