
Commit

Merge pull request #964 from xtensor-stack/feature/syndicate-fast-cast-code

Provide a generic version for uint32_t to float conversion, only if t…
JohanMabille committed Nov 1, 2023
2 parents 011d355 + 0ba53ef commit eefd19c
Showing 4 changed files with 13 additions and 34 deletions.
13 changes: 13 additions & 0 deletions include/xsimd/arch/generic/xsimd_generic_details.hpp
@@ -197,6 +197,19 @@ namespace xsimd
v_hi_flt = cnst65536f * v_hi_flt; /* No rounding */
return v_hi_flt + v_lo_flt; /* Rounding may occur here, mul and add may fuse to fma for haswell and newer */
}
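Taken one lane at a time, the conversion ending above splits the unsigned value into two 16-bit halves, each of which converts to float exactly, and defers all rounding to the final add. A minimal scalar sketch of the same idea (the helper name is ours, not part of the diff):

    #include <cstdint>

    inline float u32_to_float_scalar(uint32_t u)
    {
        float hi = static_cast<float>(u >> 16);     // exact: value fits in 16 bits
        float lo = static_cast<float>(u & 0xFFFFu); // exact: value fits in 16 bits
        return hi * 65536.0f + lo;                  // scaling by 2^16 is exact; the add rounds once
    }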

// Provide a generic float -> uint32_t cast only if we have a
// non-generic float -> int32_t fast_cast
template <class A, class _ = decltype(fast_cast(std::declval<batch<float, A> const&>(), std::declval<batch<int32_t, A> const&>(), A {}))>
inline batch<uint32_t, A> fast_cast(batch<float, A> const& v, batch<uint32_t, A> const&, requires_arch<generic>) noexcept
{
auto is_large = v >= batch<float, A>(1u << 31);
auto small = bitwise_cast<float>(batch_cast<int32_t>(v));
auto large = bitwise_cast<float>(
batch_cast<int32_t>(v - batch<float, A>(1u << 31))
^ batch<int32_t, A>(1u << 31));
return bitwise_cast<uint32_t>(select(is_large, large, small));
}
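The selection logic is easier to read per lane: values below 2^31 go through the signed truncating convert directly, while larger values are shifted down by 2^31 first and have the high bit restored afterwards. A minimal scalar sketch under that reading (helper name is ours):

    #include <cstdint>

    inline uint32_t float_to_u32_scalar(float v)
    {
        if (v >= 2147483648.0f) // 2^31 is out of range for a signed truncating convert
        {
            // v - 2^31 is exact for in-range v, and the converted result has the
            // high bit clear, so XOR with 0x80000000 adds 2^31 back in the unsigned domain.
            return static_cast<uint32_t>(static_cast<int32_t>(v - 2147483648.0f)) ^ 0x80000000u;
        }
        return static_cast<uint32_t>(static_cast<int32_t>(v));
    }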
}

namespace detail
11 changes: 0 additions & 11 deletions include/xsimd/arch/xsimd_avx.hpp
@@ -520,17 +520,6 @@ namespace xsimd
{
return _mm256_cvttps_epi32(self);
}

template <class A>
inline batch<uint32_t, A> fast_cast(batch<float, A> const& self, batch<uint32_t, A> const&, requires_arch<avx>) noexcept
{
return _mm256_castps_si256(
_mm256_blendv_ps(_mm256_castsi256_ps(_mm256_cvttps_epi32(self)),
_mm256_xor_ps(
_mm256_castsi256_ps(_mm256_cvttps_epi32(_mm256_sub_ps(self, _mm256_set1_ps(1u << 31)))),
_mm256_castsi256_ps(_mm256_set1_epi32(1u << 31))),
_mm256_cmp_ps(self, _mm256_set1_ps(1u << 31), _CMP_GE_OQ)));
}
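With this overload removed, AVX gets the same blend from the generic version above, built on the float -> int32_t fast_cast that remains. The public entry point is unchanged; a hedged usage sketch (the function name is ours):

    #include "xsimd/xsimd.hpp"

    inline xsimd::batch<uint32_t> to_u32(xsimd::batch<float> const& v)
    {
        return xsimd::batch_cast<uint32_t>(v); // dispatches to fast_cast internally
    }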
}

// decr_if
12 changes: 0 additions & 12 deletions include/xsimd/arch/xsimd_sse2.hpp
@@ -573,18 +573,6 @@ namespace xsimd
{
return _mm_cvttps_epi32(self);
}

template <class A>
inline batch<uint32_t, A> fast_cast(batch<float, A> const& self, batch<uint32_t, A> const&, requires_arch<sse2>) noexcept
{
__m128 mask = _mm_cmpge_ps(self, _mm_set1_ps(1u << 31));
__m128 lhs = _mm_castsi128_ps(_mm_cvttps_epi32(self));
__m128 rhs = _mm_castsi128_ps(_mm_xor_si128(
_mm_cvttps_epi32(_mm_sub_ps(self, _mm_set1_ps(1u << 31))),
_mm_set1_epi32(1u << 31)));
return _mm_castps_si128(_mm_or_ps(_mm_and_ps(mask, rhs), _mm_andnot_ps(mask, lhs)));
}
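SSE2 has no blend instruction, so this overload spelled the selection out as and/andnot/or; the generic select lowers to essentially the same idiom on SSE2. A minimal sketch of that idiom (helper name is ours):

    #include <emmintrin.h>

    inline __m128 select_sse2(__m128 mask, __m128 if_set, __m128 if_clear)
    {
        // Lanes of if_set where the full-width mask is all ones, if_clear elsewhere.
        return _mm_or_ps(_mm_and_ps(mask, if_set), _mm_andnot_ps(mask, if_clear));
    }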

}

// eq
11 changes: 0 additions & 11 deletions include/xsimd/arch/xsimd_sse4_1.hpp
@@ -65,17 +65,6 @@ namespace xsimd
__m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
return _mm_add_pd(f, _mm_castsi128_pd(xL));
}
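For reference, the magic-number conversion ending above plants the two 32-bit halves in the mantissas of 2^52 and 2^84, then lets a single subtraction cancel both planted offsets. A scalar sketch of the same trick (our illustration; assumes IEEE-754 doubles):

    #include <cstdint>
    #include <cstring>

    inline double u64_to_double_scalar(uint64_t v)
    {
        uint64_t lo_bits = (v & 0xFFFFFFFFu) | 0x4330000000000000u; // 2^52 + low half
        uint64_t hi_bits = (v >> 32) | 0x4530000000000000u;         // 2^84 + high half * 2^32
        double lo, hi;
        std::memcpy(&lo, &lo_bits, sizeof lo);
        std::memcpy(&hi, &hi_bits, sizeof hi);
        // Subtracting 2^84 + 2^52 cancels both planted offsets exactly;
        // the final add performs the single rounding.
        return (hi - 19342813118337666422669312.) + lo;
    }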

template <class A>
inline batch<uint32_t, A> fast_cast(batch<float, A> const& self, batch<uint32_t, A> const&, requires_arch<sse4_1>) noexcept
{
return _mm_castps_si128(
_mm_blendv_ps(_mm_castsi128_ps(_mm_cvttps_epi32(self)),
_mm_castsi128_ps(_mm_xor_si128(
_mm_cvttps_epi32(_mm_sub_ps(self, _mm_set1_ps(1u << 31))),
_mm_set1_epi32(1u << 31))),
_mm_cmpge_ps(self, _mm_set1_ps(1u << 31))));
}
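This overload existed only to use blendv, which selects on the sign bit of each mask lane, so a full-width compare mask works directly. A minimal sketch of that semantic (helper name is ours):

    #include <smmintrin.h>

    inline __m128 select_sse4_1(__m128 mask, __m128 if_set, __m128 if_clear)
    {
        // Lanes of if_set where the mask lane's sign bit is set, if_clear elsewhere.
        return _mm_blendv_ps(if_clear, if_set, mask);
    }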
}

// eq
