
Commit

Merge pull request #964 from xtensor-stack/feature/syndicate-fast-cast-code

Provide a generic version for uint32_t to float conversion, only if t…
JohanMabille committed Nov 1, 2023
2 parents 011d355 + 0ba53ef commit eefd19c
Showing 4 changed files with 13 additions and 34 deletions.
13 changes: 13 additions & 0 deletions include/xsimd/arch/generic/xsimd_generic_details.hpp
@@ -197,6 +197,19 @@ namespace xsimd
v_hi_flt = cnst65536f * v_hi_flt; /* No rounding */
return v_hi_flt + v_lo_flt; /* Rounding may occur here, mul and add may fuse to fma for haswell and newer */
}
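Taken one lane at a time, the conversion ending above splits the unsigned value into two 16-bit halves, each of which converts to float exactly, and defers all rounding to the final add. A minimal scalar sketch of the same idea (the helper name is ours, not part of the diff):

    #include <cstdint>

    inline float u32_to_float_scalar(uint32_t u)
    {
        float hi = static_cast<float>(u >> 16);     // exact: value fits in 16 bits
        float lo = static_cast<float>(u & 0xFFFFu); // exact: value fits in 16 bits
        return hi * 65536.0f + lo;                  // scaling by 2^16 is exact; the add rounds once
    }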

// Provide a generic float -> uint32_t cast only if we have a
// non-generic float -> int32_t fast_cast
template <class A, class _ = decltype(fast_cast(std::declval<batch<float, A> const&>(), std::declval<batch<int32_t, A> const&>(), A {}))>
inline batch<uint32_t, A> fast_cast(batch<float, A> const& v, batch<uint32_t, A> const&, requires_arch<generic>) noexcept
{
auto is_large = v >= batch<float, A>(1u << 31);
auto small = bitwise_cast<float>(batch_cast<int32_t>(v));
auto large = bitwise_cast<float>(
batch_cast<int32_t>(v - batch<float, A>(1u << 31))
^ batch<int32_t, A>(1u << 31));
return bitwise_cast<uint32_t>(select(is_large, large, small));
}
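The selection logic is easier to read per lane: values below 2^31 go through the signed truncating convert directly, while larger values are shifted down by 2^31 first and have the high bit restored afterwards. A minimal scalar sketch under that reading (helper name is ours):

    #include <cstdint>

    inline uint32_t float_to_u32_scalar(float v)
    {
        if (v >= 2147483648.0f) // 2^31 is out of range for a signed truncating convert
        {
            // v - 2^31 is exact for in-range v, and the converted result has the
            // high bit clear, so XOR with 0x80000000 adds 2^31 back in the unsigned domain.
            return static_cast<uint32_t>(static_cast<int32_t>(v - 2147483648.0f)) ^ 0x80000000u;
        }
        return static_cast<uint32_t>(static_cast<int32_t>(v));
    }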
}

namespace detail
11 changes: 0 additions & 11 deletions include/xsimd/arch/xsimd_avx.hpp
@@ -520,17 +520,6 @@ namespace xsimd
{
return _mm256_cvttps_epi32(self);
}

template <class A>
inline batch<uint32_t, A> fast_cast(batch<float, A> const& self, batch<uint32_t, A> const&, requires_arch<avx>) noexcept
{
return _mm256_castps_si256(
_mm256_blendv_ps(_mm256_castsi256_ps(_mm256_cvttps_epi32(self)),
_mm256_xor_ps(
_mm256_castsi256_ps(_mm256_cvttps_epi32(_mm256_sub_ps(self, _mm256_set1_ps(1u << 31)))),
_mm256_castsi256_ps(_mm256_set1_epi32(1u << 31))),
_mm256_cmp_ps(self, _mm256_set1_ps(1u << 31), _CMP_GE_OQ)));
}
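With this overload removed, AVX gets the same blend from the generic version above, built on the float -> int32_t fast_cast that remains. The public entry point is unchanged; a hedged usage sketch (the function name is ours):

    #include "xsimd/xsimd.hpp"

    inline xsimd::batch<uint32_t> to_u32(xsimd::batch<float> const& v)
    {
        return xsimd::batch_cast<uint32_t>(v); // dispatches to fast_cast internally
    }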
}

// decr_if
12 changes: 0 additions & 12 deletions include/xsimd/arch/xsimd_sse2.hpp
@@ -573,18 +573,6 @@ namespace xsimd
{
return _mm_cvttps_epi32(self);
}

template <class A>
inline batch<uint32_t, A> fast_cast(batch<float, A> const& self, batch<uint32_t, A> const&, requires_arch<sse2>) noexcept
{
__m128 mask = _mm_cmpge_ps(self, _mm_set1_ps(1u << 31));
__m128 lhs = _mm_castsi128_ps(_mm_cvttps_epi32(self));
__m128 rhs = _mm_castsi128_ps(_mm_xor_si128(
_mm_cvttps_epi32(_mm_sub_ps(self, _mm_set1_ps(1u << 31))),
_mm_set1_epi32(1u << 31)));
return _mm_castps_si128(_mm_or_ps(_mm_and_ps(mask, rhs), _mm_andnot_ps(mask, lhs)));
}
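SSE2 has no blend instruction, so this overload spelled the selection out as and/andnot/or; the generic select lowers to essentially the same idiom on SSE2. A minimal sketch of that idiom (helper name is ours):

    #include <emmintrin.h>

    inline __m128 select_sse2(__m128 mask, __m128 if_set, __m128 if_clear)
    {
        // Lanes of if_set where the full-width mask is all ones, if_clear elsewhere.
        return _mm_or_ps(_mm_and_ps(mask, if_set), _mm_andnot_ps(mask, if_clear));
    }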

}

// eq
11 changes: 0 additions & 11 deletions include/xsimd/arch/xsimd_sse4_1.hpp
@@ -65,17 +65,6 @@ namespace xsimd
__m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
return _mm_add_pd(f, _mm_castsi128_pd(xL));
}
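For reference, the magic-number conversion ending above plants the two 32-bit halves in the mantissas of 2^52 and 2^84, then lets a single subtraction cancel both planted offsets. A scalar sketch of the same trick (our illustration; assumes IEEE-754 doubles):

    #include <cstdint>
    #include <cstring>

    inline double u64_to_double_scalar(uint64_t v)
    {
        uint64_t lo_bits = (v & 0xFFFFFFFFu) | 0x4330000000000000u; // 2^52 + low half
        uint64_t hi_bits = (v >> 32) | 0x4530000000000000u;         // 2^84 + high half * 2^32
        double lo, hi;
        std::memcpy(&lo, &lo_bits, sizeof lo);
        std::memcpy(&hi, &hi_bits, sizeof hi);
        // Subtracting 2^84 + 2^52 cancels both planted offsets exactly;
        // the final add performs the single rounding.
        return (hi - 19342813118337666422669312.) + lo;
    }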

template <class A>
inline batch<uint32_t, A> fast_cast(batch<float, A> const& self, batch<uint32_t, A> const&, requires_arch<sse4_1>) noexcept
{
return _mm_castps_si128(
_mm_blendv_ps(_mm_castsi128_ps(_mm_cvttps_epi32(self)),
_mm_castsi128_ps(_mm_xor_si128(
_mm_cvttps_epi32(_mm_sub_ps(self, _mm_set1_ps(1u << 31))),
_mm_set1_epi32(1u << 31))),
_mm_cmpge_ps(self, _mm_set1_ps(1u << 31))));
}
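This overload existed only to use blendv, which selects on the sign bit of each mask lane, so a full-width compare mask works directly. A minimal sketch of that semantic (helper name is ours):

    #include <smmintrin.h>

    inline __m128 select_sse4_1(__m128 mask, __m128 if_set, __m128 if_clear)
    {
        // Lanes of if_set where the mask lane's sign bit is set, if_clear elsewhere.
        return _mm_blendv_ps(if_clear, if_set, mask);
    }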
}

// eq
