diff --git a/include/xsimd/math/xsimd_fp_manipulation.hpp b/include/xsimd/math/xsimd_fp_manipulation.hpp index 8219f9fc4..96fb4f223 100644 --- a/include/xsimd/math/xsimd_fp_manipulation.hpp +++ b/include/xsimd/math/xsimd_fp_manipulation.hpp @@ -22,6 +22,9 @@ namespace xsimd template batch frexp(const batch& arg, batch, N>& exp); + template + as_float_t fexp_to_float(const T& x); + /******************************************************** * Floating point manipulation functions implementation * ********************************************************/ @@ -66,6 +69,27 @@ namespace xsimd exp = select(bool_cast(arg != b_type(0.)), exp, zero()); return select((arg != b_type(0.)), x | bitwise_cast(mask2frexp()), b_type(0.)); } + + template + inline as_float_t fexp_to_float(const T& x) + { + return to_float(x); + } + + // fexps are 8/11-bit numbers. so we can down-cast to int16_t/int32_t then do the job +#if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION && XSIMD_X86_INSTR_SET < XSIMD_X86_AVX512_VERSION + inline batch fexp_to_float(const batch& x) + { + return _mm256_cvtepi32_pd(detail::xsimd_cvtepi64_epi32(x)); + } +#endif + +#if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION && XSIMD_X86_INSTR_SET < XSIMD_X86_AVX512_VERSION + inline batch fexp_to_float(const batch& x) + { + return _mm_cvtepi32_pd(_mm_shuffle_epi32(x, _MM_SHUFFLE(0, 0, 2, 0))); + } +#endif } #endif diff --git a/include/xsimd/math/xsimd_logarithm.hpp b/include/xsimd/math/xsimd_logarithm.hpp index d04e7b77d..dec3750a9 100644 --- a/include/xsimd/math/xsimd_logarithm.hpp +++ b/include/xsimd/math/xsimd_logarithm.hpp @@ -95,7 +95,7 @@ namespace xsimd B t2 = z * horner(w); B R = t2 + t1; B hfsq = B(0.5) * f * f; - B dk = to_float(k); + B dk = fexp_to_float(k); B r = fma(dk, log_2hi(), fma(s, (hfsq + R), dk * log_2lo()) - hfsq + f); #ifndef XSIMD_NO_INFINITIES B zz = select(isnez, select(a == infinity(), infinity(), r), minusinfinity()); @@ -136,7 +136,7 @@ namespace xsimd #endif hx += 0x3ff00000 - 0x3fe6a09e; k += (hx >> 20) - 0x3ff; - B dk = to_float(k); + B dk = fexp_to_float(k); hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e; x = bitwise_cast(hx << 32 | (i_type(0xffffffff) & bitwise_cast(x))); @@ -223,7 +223,7 @@ namespace xsimd B t2 = z * horner(w); B R = t1 + t2; B hfsq = B(0.5) * f * f; - B dk = to_float(k); + B dk = fexp_to_float(k); B r = fma(fms(s, hfsq + R, hfsq) + f, invlog_2(), dk); #ifndef XSIMD_NO_INFINITIES B zz = select(isnez, select(a == infinity(), infinity(), r), minusinfinity()); @@ -287,7 +287,7 @@ namespace xsimd B lo = fma(s, hfsq + R, f - hi - hfsq); B val_hi = hi * invlog_2hi(); B val_lo = fma(lo + hi, invlog_2lo(), lo * invlog_2hi()); - B dk = to_float(k); + B dk = fexp_to_float(k); B w1 = dk + val_hi; val_lo += (dk - w1) + val_hi; val_hi = w1; @@ -362,7 +362,7 @@ namespace xsimd B t1 = w * horner(w); B t2 = z * horner(w); B R = t2 + t1; - B dk = to_float(k); + B dk = fexp_to_float(k); B hfsq = B(0.5) * f * f; B hibits = f - hfsq; hibits &= bitwise_cast(i_type(0xfffff000)); @@ -419,7 +419,7 @@ namespace xsimd hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e; x = bitwise_cast(hx << 32 | (i_type(0xffffffff) & bitwise_cast(x))); B f = --x; - B dk = to_float(k); + B dk = fexp_to_float(k); B s = f / (B(2.) + f); B z = s * s; B w = z * z; @@ -498,7 +498,7 @@ namespace xsimd B t2 = z * horner(w); B R = t2 + t1; B hfsq = B(0.5) * f * f; - B dk = to_float(k); + B dk = fexp_to_float(k); /* correction term ~ log(1+x)-log(u), avoid underflow in c/u */ B c = select(bool_cast(k >= i_type(2)), B(1.) - (uf - a), a - (uf - B(1.))) / uf; B r = fma(dk, log_2hi(), fma(s, (hfsq + R), dk * log_2lo() + c) - hfsq + f); @@ -550,7 +550,7 @@ namespace xsimd 0x3fc7466496cb03dell, 0x3fc2f112df3e5244ll>(w); B R = t2 + t1; - B dk = to_float(k); + B dk = fexp_to_float(k); B r = fma(dk, log_2hi(), fma(s, hfsq + R, dk * log_2lo() + c) - hfsq + f); #ifndef XSIMD_NO_INFINITIES B zz = select(isnez, select(a == infinity(), infinity(), r), minusinfinity()); diff --git a/include/xsimd/types/xsimd_int_conversion.hpp b/include/xsimd/types/xsimd_int_conversion.hpp index 9d5062251..62533dc98 100644 --- a/include/xsimd/types/xsimd_int_conversion.hpp +++ b/include/xsimd/types/xsimd_int_conversion.hpp @@ -37,6 +37,8 @@ namespace xsimd __m128i xsimd_cvtepi32_epi16(__m256i a); __m128i xsimd_cvtepi32_epu16(__m256i a); + __m128i xsimd_cvtepi64_epi32(__m256i a); + /****************** * Implementation * ******************/ @@ -165,6 +167,13 @@ namespace xsimd #endif return res; } + + inline __m128i xsimd_cvtepi64_epi32(__m256i a) + { + __m128 hi = _mm_castsi128_ps(_mm256_extractf128_si256(a, 1)); + __m128 lo = _mm_castsi128_ps(_mm256_castsi256_si128(a)); + return _mm_castps_si128(_mm_shuffle_ps(lo, hi, _MM_SHUFFLE(2, 0, 2, 0))); + } } }