40 changes: 19 additions & 21 deletions include/xsimd/arch/xsimd_avx.hpp
@@ -1427,41 +1427,39 @@ namespace xsimd
        template <class A>
        XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch<uint32_t, A> mask, requires_arch<avx>) noexcept
        {
-           // duplicate low and high part of input
-           // Duplicate lanes separately
-           // 1) duplicate low and high lanes
-           __m256 lo = _mm256_permute2f128_ps(self, self, 0x00); // [low | low]
-           __m256 hi = _mm256_permute2f128_ps(self, self, 0x11); // [high| high]
+           // swap lanes
+           __m256 swapped = _mm256_permute2f128_ps(self, self, 0x01); // [high | low]

-           // normalize mask
-           batch<uint32_t, A> half_mask = mask % 4;
@DiamonDinoia (Contributor) commented on Nov 1, 2025:

> Does this actually generate different asm?
>
> PS: I am fine either way. It might be worth having a normalize<value> API that we can use everywhere to convert a value into a mask when it is a power of two.

The PR author replied:

> I'm unsure. For a regular integer operation I would say so, but with intrinsics I had a doubt.
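A rough sketch of the normalize<value> idea floated above, restricted to power-of-two moduli; the helper name, signature, and template parameters are hypothetical and not part of this PR:

// Hypothetical helper, illustrating the review suggestion; not part of this PR.
// For a power-of-two modulus N, x % N is equivalent to x & (N - 1).
#include <cstdint>

#include <xsimd/xsimd.hpp>

template <uint64_t N, class T, class A>
XSIMD_INLINE xsimd::batch<T, A> normalize(xsimd::batch<T, A> const& x) noexcept
{
    static_assert(N != 0 && (N & (N - 1)) == 0, "N must be a power of two");
    return x & xsimd::batch<T, A>(static_cast<T>(N - 1));
}

// The float swizzle below could then read: auto half_mask = normalize<4>(mask);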

+           // normalize mask taking modulo 4
+           batch<uint32_t, A> half_mask = mask & 0b11u;

            // permute within each lane
-           __m256 r0 = _mm256_permutevar_ps(lo, half_mask);
-           __m256 r1 = _mm256_permutevar_ps(hi, half_mask);
+           __m256 r0 = _mm256_permutevar_ps(self, half_mask);
+           __m256 r1 = _mm256_permutevar_ps(swapped, half_mask);

-           batch_bool<uint32_t, A> blend_mask = mask >= 4;
+           // select lane by the mask index divided by 4
+           constexpr auto lane = batch_constant<uint32_t, A, 0, 0, 0, 0, 4, 4, 4, 4> {};
+           batch_bool<uint32_t, A> blend_mask = (mask & 0b100u) != lane;
            return _mm256_blendv_ps(r0, r1, batch_bool_cast<float>(blend_mask));
        }
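For reference, here is a scalar model of what the new float path computes; it is an illustration written for this review, not code from the PR. It mirrors the bit logic of the intrinsics: in-lane index = mask & 0b11, source lane = mask & 0b100, blended against the destination lane.

// Scalar model of the AVX float swizzle above; illustration only, not PR code.
#include <array>
#include <cstddef>
#include <cstdint>

std::array<float, 8> swizzle_ref(std::array<float, 8> const& self,
                                 std::array<uint32_t, 8> const& mask)
{
    // swapped = [high lane | low lane], like _mm256_permute2f128_ps(self, self, 0x01)
    std::array<float, 8> swapped;
    for (std::size_t i = 0; i < 8; ++i)
        swapped[i] = self[(i + 4) % 8];

    std::array<float, 8> out {};
    for (std::size_t i = 0; i < 8; ++i)
    {
        const std::size_t lane_base = (i / 4) * 4;  // first element of the destination lane
        const uint32_t half = mask[i] & 0b11u;      // in-lane position, i.e. mask % 4
        const float r0 = self[lane_base + half];    // _mm256_permutevar_ps(self, half_mask)
        const float r1 = swapped[lane_base + half]; // _mm256_permutevar_ps(swapped, half_mask)
        // blend_mask: take the swapped copy when the source lane (mask & 0b100)
        // differs from the destination lane (0 for elements 0-3, 4 for elements 4-7)
        const bool take_swapped = (mask[i] & 0b100u) != lane_base;
        out[i] = take_swapped ? r1 : r0;
    }
    return out;
}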

        template <class A>
        XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch<uint64_t, A> mask, requires_arch<avx>) noexcept
        {
-           // duplicate low and high part of input
-           __m256d lo = _mm256_permute2f128_pd(self, self, 0x00);
-           __m256d hi = _mm256_permute2f128_pd(self, self, 0x11);
+           // swap lanes
+           __m256d swapped = _mm256_permute2f128_pd(self, self, 0x01); // [high | low]

-           // normalize mask
-           batch<uint64_t, A> half_mask = -(mask & 1);
+           // The half-mask value is mask modulo 2, but the intrinsic expects it in the
+           // second least significant bit. We use negation as a cheap alternative to a left shift.
+           batch<uint64_t, A> half_mask = -(mask & 0b1u);

            // permute within each lane
-           __m256d r0 = _mm256_permutevar_pd(lo, half_mask);
-           __m256d r1 = _mm256_permutevar_pd(hi, half_mask);
-           // mask to choose the right lane
-           batch_bool<uint64_t, A> blend_mask = mask >= 2;
-           // blend the two permutes
+           __m256d r0 = _mm256_permutevar_pd(self, half_mask);
+           __m256d r1 = _mm256_permutevar_pd(swapped, half_mask);
+           // select lane by the mask index divided by 2
+           constexpr auto lane = batch_constant<uint64_t, A, 0, 0, 2, 2> {};
+           batch_bool<uint64_t, A> blend_mask = (mask & 0b10u) != lane;
            return _mm256_blendv_pd(r0, r1, batch_bool_cast<double>(blend_mask));
        }
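A note on the -(mask & 0b1u) trick kept by the new double path: _mm256_permutevar_pd reads its per-element selector from bit 1 of each 64-bit element, so mask % 2 has to land in bit 1. Negating an unsigned 0 or 1 yields 0 or all-ones, whose bit 1 equals the original bit 0. A minimal scalar check of that bit identity, written for this review rather than taken from the PR:

// Scalar check of the negation trick used above; illustration only, not PR code.
#include <cassert>
#include <cstdint>

int main()
{
    for (uint64_t m = 0; m < 4; ++m)
    {
        const uint64_t half_mask = 0 - (m & 0b1u); // 0 or 0xFFFF...FF (unsigned wrap-around)
        // bit 1 of half_mask carries mask % 2, which is where _mm256_permutevar_pd looks
        assert(((half_mask >> 1) & 1u) == (m & 0b1u));
    }
    return 0;
}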

1 change: 1 addition & 0 deletions test/test_batch_manip.cpp
@@ -247,6 +247,7 @@ struct swizzle_test
        using idx_t = typename xsimd::as_index<value_type>::type;
        auto idx_batch = xsimd::make_batch_constant<idx_t, Pattern<idx_t>, arch_type>();

+       CAPTURE(idx_batch.as_batch());
        CHECK_BATCH_EQ(xsimd::swizzle(b_lhs, idx_batch), b_expect);
        CHECK_BATCH_EQ(xsimd::swizzle(b_lhs,
                                      static_cast<xsimd::batch<idx_t, arch_type>>(idx_batch)),