Skip to content

Commit

Permalink
Merge pull request #963 from xtensor-stack/feature/syndicate-fast-cas…
Browse files Browse the repository at this point in the history
…t-code

Provide a generic version for uint32_t to float conversion
  • Loading branch information
JohanMabille committed Oct 31, 2023
2 parents 029aa9b + f9dcafb commit 011d355
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 46 deletions.
17 changes: 17 additions & 0 deletions include/xsimd/arch/generic/xsimd_generic_details.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,23 @@ namespace xsimd
{
return bitwise_cast<int64_t>(self);
}

// Generic uint32_t -> float conversion, expressed through the signed
// int32_t -> float conversion. The defaulted template parameter is the
// decltype of an int32_t -> float fast_cast call for architecture A, so this
// overload is only viable when that call is well-formed (i.e. when the
// architecture provides its own int32_t -> float fast_cast).
template <class A, class _ = decltype(fast_cast(std::declval<batch<int32_t, A> const&>(), std::declval<batch<float, A> const&>(), A {}))>
inline batch<float, A> fast_cast(batch<uint32_t, A> const& v, batch<float, A> const&, requires_arch<generic>) noexcept
{
    // see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse
    //
    // Each lane of v is split into two 16-bit halves. Both halves are small
    // enough to be converted through the signed int32_t path, after which the
    // upper half is scaled back by 2^16 and the two parts are summed.
    const batch<float, A> high_scale(65536.0f);
    const batch<uint32_t, A> low_mask(0xFFFF);

    const auto high_half = batch_cast<int32_t>(v >> 16); /* 16 most significant bits of v */
    const auto low_half = batch_cast<int32_t>(v & low_mask); /* 16 least significant bits of v */

    const auto high_as_float = batch_cast<float>(high_half); /* No rounding */
    const auto low_as_float = batch_cast<float>(low_half); /* No rounding */

    const auto scaled_high = high_scale * high_as_float; /* No rounding */

    /* Rounding may occur here, mul and add may fuse to fma for haswell and newer */
    return scaled_high + low_as_float;
}
}

namespace detail
Expand Down
16 changes: 0 additions & 16 deletions include/xsimd/arch/xsimd_avx.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -515,22 +515,6 @@ namespace xsimd
return _mm256_cvtepi32_ps(self);
}

template <class A>
inline batch<float, A> fast_cast(batch<uint32_t, A> const& v, batch<float, A> const&, requires_arch<avx>) noexcept
{
    // see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse
    // adapted to avx
    //
    // uint32_t -> float: convert the upper and lower 16-bit halves of each
    // lane separately through the signed conversion, then recombine them.
    // The integer mask and shift go through the batch-level bitwise_and /
    // bitwise_rshift helpers; the float steps use AVX intrinsics directly.
    const __m256 high_scale = _mm256_set1_ps(65536.0f);
    const __m256i low_mask = _mm256_set1_epi32(0xFFFF);

    const __m256i high_half = bitwise_rshift(batch<uint32_t, A>(v), 16, avx {}); /* 16 most significant bits of v */
    const __m256i low_half = bitwise_and(batch<uint32_t, A>(v), batch<uint32_t, A>(low_mask)); /* 16 least significant bits of v */

    const __m256 high_as_float = _mm256_cvtepi32_ps(high_half); /* No rounding */
    const __m256 low_as_float = _mm256_cvtepi32_ps(low_half); /* No rounding */

    /* The multiplication does not round; rounding may occur in the addition. mul and add may fuse to fma for haswell and newer */
    return _mm256_add_ps(_mm256_mul_ps(high_scale, high_as_float), low_as_float);
}

template <class A>
inline batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<avx>) noexcept
{
Expand Down
15 changes: 0 additions & 15 deletions include/xsimd/arch/xsimd_avx2.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -279,21 +279,6 @@ namespace xsimd
namespace detail
{

template <class A>
inline batch<float, A> fast_cast(batch<uint32_t, A> const& v, batch<float, A> const&, requires_arch<avx2>) noexcept
{
    // see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse
    //
    // uint32_t -> float: convert the upper and lower 16-bit halves of each
    // lane separately through the signed conversion, then recombine them.
    const __m256 high_scale = _mm256_set1_ps(65536.0f);
    const __m256i low_mask = _mm256_set1_epi32(0xFFFF);

    const __m256i high_half = _mm256_srli_epi32(v, 16); /* 16 most significant bits of v */
    const __m256i low_half = _mm256_and_si256(v, low_mask); /* 16 least significant bits of v */

    const __m256 high_as_float = _mm256_cvtepi32_ps(high_half); /* No rounding */
    const __m256 low_as_float = _mm256_cvtepi32_ps(low_half); /* No rounding */

    /* The multiplication does not round; rounding may occur in the addition. mul and add may fuse to fma for haswell and newer */
    return _mm256_add_ps(_mm256_mul_ps(high_scale, high_as_float), low_as_float);
}

template <class A>
inline batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<avx2>) noexcept
{
Expand Down
15 changes: 0 additions & 15 deletions include/xsimd/arch/xsimd_sse2.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -541,21 +541,6 @@ namespace xsimd
return _mm_cvtepi32_ps(self);
}

template <class A>
inline batch<float, A> fast_cast(batch<uint32_t, A> const& v, batch<float, A> const&, requires_arch<sse2>) noexcept
{
    // see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse
    //
    // uint32_t -> float: convert the upper and lower 16-bit halves of each
    // lane separately through the signed conversion, then recombine them.
    const __m128 high_scale = _mm_set1_ps(65536.0f);
    const __m128i low_mask = _mm_set1_epi32(0xFFFF);

    const __m128i high_half = _mm_srli_epi32(v, 16); /* 16 most significant bits of v */
    const __m128i low_half = _mm_and_si128(v, low_mask); /* 16 least significant bits of v */

    const __m128 high_as_float = _mm_cvtepi32_ps(high_half); /* No rounding */
    const __m128 low_as_float = _mm_cvtepi32_ps(low_half); /* No rounding */

    /* The multiplication does not round; rounding may occur in the addition. mul and add may fuse to fma for haswell and newer */
    return _mm_add_ps(_mm_mul_ps(high_scale, high_as_float), low_as_float);
}

template <class A>
inline batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<sse2>) noexcept
{
Expand Down

0 comments on commit 011d355

Please sign in to comment.