diff --git a/.appveyor.yml b/.appveyor.yml
index 4df9fbde5..9bfdd1d1f 100644
--- a/.appveyor.yml
+++ b/.appveyor.yml
@@ -6,12 +6,22 @@ platform:
   - x64
 
 environment:
+  global:
+    MINICONDA: C:\xsimd-conda
   matrix:
-    - MINICONDA: C:\xsimd-conda
+    - JOB: "AVX2"
+      CXXFLAGS: "/arch:AVX2"
+      VCVARSALL: "C:\\Program Files (x86)\\Microsoft Visual Studio 14.0\\VC\\vcvarsall.bat"
+      RUNTEST: ".\\test_xsimd"
+    - JOB: "AVX512"
+      CXXFLAGS: "/arch:AVX512"
+      APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017
+      VCVARSALL: "C:\\Program Files (x86)\\Microsoft Visual Studio\\2017\\Community\\VC\\Auxiliary\\Build\\vcvarsall.bat"
+      RUNTEST: "ECHO"
 
 init:
   - "ECHO %MINICONDA%"
-  - C:\"Program Files (x86)"\"Microsoft Visual Studio 14.0"\VC\vcvarsall.bat %PLATFORM%
+  - call "%VCVARSALL%" %PLATFORM%
   - ps: if($env:Platform -eq "x64"){Start-FileDownload 'http://repo.continuum.io/miniconda/Miniconda3-latest-Windows-x86_64.exe' C:\Miniconda.exe; echo "Done"}
   - ps: if($env:Platform -eq "x86"){Start-FileDownload 'http://repo.continuum.io/miniconda/Miniconda3-latest-Windows-x86.exe' C:\Miniconda.exe; echo "Done"}
   - cmd: C:\Miniconda.exe /S /D=C:\xsimd-conda
@@ -27,4 +37,4 @@ install:
   - cd test
 
 build_script:
-  - .\test_xsimd
+  - "%RUNTEST%"
diff --git a/include/xsimd/math/xsimd_rounding.hpp b/include/xsimd/math/xsimd_rounding.hpp
index 37382dcce..f9ccfc01e 100644
--- a/include/xsimd/math/xsimd_rounding.hpp
+++ b/include/xsimd/math/xsimd_rounding.hpp
@@ -329,13 +329,13 @@ namespace xsimd
 
             static inline batch_type ceil(const batch_type& x)
             {
-                auto res = _mm512_ceil_ps(x);
+                auto res = _mm512_roundscale_ps(x, _MM_FROUND_TO_POS_INF);
                 return res;
             }
 
             static inline batch_type floor(const batch_type& x)
             {
-                auto res = _mm512_floor_ps(x);
+                auto res = _mm512_roundscale_ps(x, _MM_FROUND_TO_NEG_INF);
                 return res;
             }
 
@@ -359,13 +359,13 @@ namespace xsimd
 
             static inline batch_type ceil(const batch_type& x)
             {
-                auto res = _mm512_ceil_pd(x);
+                auto res = _mm512_roundscale_pd(x, _MM_FROUND_TO_POS_INF);
                 return res;
             }
 
             static inline batch_type floor(const batch_type& x)
             {
-                auto res = _mm512_floor_pd(x);
+                auto res = _mm512_roundscale_pd(x, _MM_FROUND_TO_NEG_INF);
                 return res;
             }
 
diff --git a/include/xsimd/types/xsimd_avx512_double.hpp b/include/xsimd/types/xsimd_avx512_double.hpp
index 5f3477a23..1f8fe6310 100644
--- a/include/xsimd/types/xsimd_avx512_double.hpp
+++ b/include/xsimd/types/xsimd_avx512_double.hpp
@@ -449,8 +449,11 @@ namespace xsimd
 
             static batch_type abs(const batch_type& rhs)
             {
-                return (__m512d)(_mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),
-                                                  (__m512i)((__m512d)(rhs))));
+                __m512d rhs_asd = (__m512d)rhs;
+                __m512i rhs_asi = *reinterpret_cast<__m512i*>(&rhs_asd);
+                __m512i res_asi = _mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),
+                                                   rhs_asi);
+                return *reinterpret_cast<__m512d*>(&res_asi);
             }
 
             static batch_type fabs(const batch_type& rhs)
@@ -487,7 +490,7 @@ namespace xsimd
             {
                 __m256d tmp1 = _mm512_extractf64x4_pd(rhs, 1);
                 __m256d tmp2 = _mm512_extractf64x4_pd(rhs, 0);
-                __m256d res1 = tmp1 + tmp2;
+                __m256d res1 = _mm256_add_pd(tmp1, tmp2);
                 return xsimd::hadd(batch<double, 4>(res1));
             }
 
@@ -498,7 +501,7 @@ namespace xsimd
        {                                                                    \
            auto tmp1 = _mm512_shuffle_f64x2(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \
            auto tmp2 = _mm512_shuffle_f64x2(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \
-           res ## I = (tmp1 + tmp2);                                        \
+           res ## I = _mm512_add_pd(tmp1, tmp2);                            \
        }                                                                    \
 
                 step1(1, row[0], row[2]);
@@ -511,12 +514,12 @@ namespace xsimd
                 batch<double, 8> tmp5 = _mm512_shuffle_f64x2(res1, res2, _MM_SHUFFLE(2, 0, 2, 0));
                 batch<double, 8> tmp6 = _mm512_shuffle_f64x2(res1, res2, _MM_SHUFFLE(3, 1, 3, 1));
 
-                batch<double, 8> resx1 = (tmp5 + tmp6);
+                batch<double, 8> resx1 = _mm512_add_pd(tmp5, tmp6);
 
                 batch<double, 8> tmp7 = _mm512_shuffle_f64x2(res3, res4, _MM_SHUFFLE(2, 0, 2, 0));
                 batch<double, 8> tmp8 = _mm512_shuffle_f64x2(res3, res4, _MM_SHUFFLE(3, 1, 3, 1));
 
-                batch<double, 8> resx2 = (tmp7 + tmp8);
+                batch<double, 8> resx2 = _mm512_add_pd(tmp7, tmp8);
 
                 batch<double, 8> tmpx = _mm512_shuffle_pd(resx1, resx2, 0b00000000);
                 batch<double, 8> tmpy = _mm512_shuffle_pd(resx1, resx2, 0b11111111);
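
The changes in the three files above share one root cause. GCC and Clang accept infix arithmetic on SIMD register types (`tmp1 + tmp2`) as a vector extension and ship convenience wrappers such as `_mm512_ceil_ps`; MSVC supports neither, so the code is rewritten in terms of intrinsics that exist on all three compilers. `_mm512_roundscale_ps`/`_mm512_roundscale_pd` map directly onto the AVX512F `vrndscale` instructions, with the rounding direction chosen by the immediate. A minimal sketch of the portable spellings (the helper names here are illustrative, not part of xsimd):

    #include <immintrin.h>

    // Compiles on GCC, Clang and MSVC alike: explicit add intrinsic.
    static inline __m256d sum_portable(__m256d a, __m256d b)
    {
        return _mm256_add_pd(a, b);
    }
    // GCC/Clang only (GNU vector extension), rejected by MSVC:
    // static inline __m256d sum_gnu(__m256d a, __m256d b) { return a + b; }

    // vrndscaleps with "round toward +inf" and a zero scale field
    // behaves like std::ceil on every lane.
    static inline __m512 ceil_portable(__m512 x)
    {
        return _mm512_roundscale_ps(x, _MM_FROUND_TO_POS_INF);
    }
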
diff --git a/include/xsimd/types/xsimd_avx512_float.hpp b/include/xsimd/types/xsimd_avx512_float.hpp
index 4699c8ac3..a8ce9460e 100644
--- a/include/xsimd/types/xsimd_avx512_float.hpp
+++ b/include/xsimd/types/xsimd_avx512_float.hpp
@@ -472,8 +472,11 @@ namespace xsimd
 
             static batch_type abs(const batch_type& rhs)
             {
-                return (__m512)(_mm512_and_epi32((__m512i)((__m512)(rhs)),
-                                                 _mm512_set1_epi32(0x7fffffff)));
+                __m512 rhs_asf = (__m512)rhs;
+                __m512i rhs_asi = *reinterpret_cast<__m512i*>(&rhs_asf);
+                __m512i res_asi = _mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF),
+                                                   rhs_asi);
+                return *reinterpret_cast<__m512*>(&res_asi);
             }
 
             static batch_type fabs(const batch_type& rhs)
@@ -510,7 +513,7 @@ namespace xsimd
             {
                 __m256 tmp1 = _mm512_extractf32x8_ps(rhs, 1);
                 __m256 tmp2 = _mm512_extractf32x8_ps(rhs, 0);
-                __m256 res1 = tmp1 + tmp2;
+                __m256 res1 = _mm256_add_ps(tmp1, tmp2);
                 return xsimd::hadd(batch<float, 8>(res1));
             }
 
@@ -524,7 +527,7 @@ namespace xsimd
        {                                                                    \
            auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \
            auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \
-           res ## I = tmp1 + tmp2;                                          \
+           res ## I = _mm512_add_ps(tmp1, tmp2);                            \
        }                                                                    \
 
                 XSIMD_AVX512_HADDP_STEP1(0, row[0], row[2]);
@@ -548,17 +551,17 @@ namespace xsimd
            batch<float, 16> tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(2, 0, 2, 0));      \
            batch<float, 16> tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 1, 3, 1));      \
                                                                                              \
-           batch<float, 16> resx1 = tmp1 + tmp2;                                             \
+           batch<float, 16> resx1 = _mm512_add_ps(tmp1, tmp2);                               \
                                                                                              \
            batch<float, 16> tmp3 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(2, 0, 2, 0));      \
            batch<float, 16> tmp4 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(3, 1, 3, 1));      \
                                                                                              \
-           batch<float, 16> resx2 = tmp3 + tmp4;                                             \
+           batch<float, 16> resx2 = _mm512_add_ps(tmp3, tmp4);                               \
                                                                                              \
            batch<float, 16> tmp5 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(2, 0, 2, 0)); \
            batch<float, 16> tmp6 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(3, 1, 3, 1)); \
                                                                                              \
-           batch<float, 16> resx3 = tmp5 + tmp6;                                             \
+           batch<float, 16> resx3 = _mm512_add_ps(tmp5, tmp6);                               \
                                                                                              \
            halfx ## I = _mm256_hadd_ps(_mm512_extractf32x8_ps(resx3, 0),                     \
                                        _mm512_extractf32x8_ps(resx3, 1));                    \
@@ -576,7 +579,20 @@ namespace xsimd
 
             static batch_type select(const batch_bool_type& cond, const batch_type& a, const batch_type& b)
             {
+                #if !defined(_MSC_VER)
                 return _mm512_mask_blend_ps(cond, b, a);
+                #else
+                __m512i mcondi = _mm512_maskz_broadcastd_epi32((__mmask16)cond, _mm_set1_epi32(~0));
+                __m512 mcond = *reinterpret_cast<__m512*>(&mcondi);
+                XSIMD_SPLITPS_AVX512(mcond);
+                XSIMD_SPLITPS_AVX512(a);
+                XSIMD_SPLITPS_AVX512(b);
+
+                auto res_lo = _mm256_blendv_ps(b_low, a_low, mcond_low);
+                auto res_hi = _mm256_blendv_ps(b_high, a_high, mcond_high);
+
+                XSIMD_RETURN_MERGEDPS_AVX(res_lo, res_hi);
+                #endif
             }
 
             static batch_bool_type isnan(const batch_type& x)
diff --git a/include/xsimd/types/xsimd_avx512_int16.hpp b/include/xsimd/types/xsimd_avx512_int16.hpp
index d3cb40849..656a87629 100644
--- a/include/xsimd/types/xsimd_avx512_int16.hpp
+++ b/include/xsimd/types/xsimd_avx512_int16.hpp
@@ -316,19 +316,17 @@ namespace xsimd
 
             static batch_type select(const batch_bool_type& cond, const batch_type& a, const batch_type& b)
             {
-                #if defined(XSIMD_AVX512BW_AVAILABLE)
-                // Some compilers are not happy with passing directly a and b to the intrinsics
-                // See https://github.com/xtensor-stack/xsimd/issues/315
-                __m512i ma = a;
-                __m512i mb = b;
-                return _mm512_mask_blend_epi16(cond, mb, ma);
+                #if defined(XSIMD_AVX512BW_AVAILABLE) && !defined(_MSC_VER)
+                auto res = _mm512_mask_blend_epi16((__mmask32)cond, (__m512i)b, (__m512i)a);
+                return batch_type(res);
                 #else
-                XSIMD_SPLIT_AVX512(cond);
+                __m512i mcond = _mm512_maskz_broadcastw_epi16((__mmask32)cond, _mm_set1_epi32(~0));
+                XSIMD_SPLIT_AVX512(mcond);
                 XSIMD_SPLIT_AVX512(a);
                 XSIMD_SPLIT_AVX512(b);
 
-                auto res_lo = _mm256_blendv_epi8(b_low, a_low, cond_low);
-                auto res_hi = _mm256_blendv_epi8(b_high, a_high, cond_high);
+                auto res_lo = _mm256_blendv_epi8(b_low, a_low, mcond_low);
+                auto res_hi = _mm256_blendv_epi8(b_high, a_high, mcond_high);
 
                 XSIMD_RETURN_MERGED_AVX(res_lo, res_hi);
                 #endif
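
The MSVC fallbacks for `select` above hinge on one idiom: an AVX512 mask (`__mmask16`, `__mmask32`, ...) stores one bit per lane, while the AVX2 `_mm256_blendv_*` intrinsics used on the split halves expect a whole vector whose per-lane high bits drive the blend. The maskz-broadcast intrinsics expand each mask bit into an all-ones lane in a single instruction. A self-contained sketch of the idiom for the float case (helper name illustrative; assumes AVX512F):

    #include <immintrin.h>

    // Expand a 16-bit lane mask into a 512-bit vector mask: lanes whose
    // mask bit is set become 0xFFFFFFFF, all others become 0 (maskz
    // semantics zero the unselected lanes).
    static inline __m512 mask_to_vector_ps(__mmask16 m)
    {
        __m512i bits = _mm512_maskz_broadcastd_epi32(m, _mm_set1_epi32(~0));
        return _mm512_castsi512_ps(bits);  // pure bit-cast, no instruction
    }

The resulting vector mask can then be split into two 256-bit halves and handed to `_mm256_blendv_ps`/`_mm256_blendv_epi8`, which is exactly what the `XSIMD_SPLIT*`/`XSIMD_RETURN_MERGED*` macros in xsimd_avx512_int_base.hpp (further down) package up.
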
diff --git a/include/xsimd/types/xsimd_avx512_int32.hpp b/include/xsimd/types/xsimd_avx512_int32.hpp
index 143b70a68..a533aaa75 100644
--- a/include/xsimd/types/xsimd_avx512_int32.hpp
+++ b/include/xsimd/types/xsimd_avx512_int32.hpp
@@ -236,7 +236,7 @@ namespace xsimd
                 // TODO Why not _mm512_reduce_add_...?
                 __m256i tmp1 = _mm512_extracti32x8_epi32(rhs, 0);
                 __m256i tmp2 = _mm512_extracti32x8_epi32(rhs, 1);
-                __m256i res1 = tmp1 + tmp2;
+                __m256i res1 = _mm256_add_epi32(tmp1, tmp2);
                 return xsimd::hadd(batch<int32_t, 8>(res1));
             }
 
diff --git a/include/xsimd/types/xsimd_avx512_int64.hpp b/include/xsimd/types/xsimd_avx512_int64.hpp
index 657212259..3d6f524de 100644
--- a/include/xsimd/types/xsimd_avx512_int64.hpp
+++ b/include/xsimd/types/xsimd_avx512_int64.hpp
@@ -293,13 +293,25 @@ namespace xsimd
             {
                 __m256i tmp1 = _mm512_extracti32x8_epi32(rhs, 0);
                 __m256i tmp2 = _mm512_extracti32x8_epi32(rhs, 1);
-                __m256i res1 = tmp1 + tmp2;
+                __m256i res1 = _mm256_add_epi64(tmp1, tmp2);
                 return xsimd::hadd(batch<int64_t, 4>(res1));
             }
 
             static batch_type select(const batch_bool_type& cond, const batch_type& a, const batch_type& b)
             {
+                #if !defined(_MSC_VER)
                 return _mm512_mask_blend_epi64(cond, b, a);
+                #else
+                __m512i mcond = _mm512_maskz_broadcastq_epi64((__mmask8)cond, _mm_set1_epi32(~0));
+                XSIMD_SPLIT_AVX512(mcond);
+                XSIMD_SPLIT_AVX512(a);
+                XSIMD_SPLIT_AVX512(b);
+
+                auto res_lo = _mm256_blendv_epi8(b_low, a_low, mcond_low);
+                auto res_hi = _mm256_blendv_epi8(b_high, a_high, mcond_high);
+
+                XSIMD_RETURN_MERGED_AVX(res_lo, res_hi);
+                #endif
             }
         };
 
diff --git a/include/xsimd/types/xsimd_avx512_int_base.hpp b/include/xsimd/types/xsimd_avx512_int_base.hpp
index cd80f8b6d..45eefaf96 100644
--- a/include/xsimd/types/xsimd_avx512_int_base.hpp
+++ b/include/xsimd/types/xsimd_avx512_int_base.hpp
@@ -18,13 +18,29 @@ namespace xsimd
 {
 
 #define XSIMD_SPLIT_AVX512(avx_name)                                              \
-    __m256i avx_name##_low = _mm512_castsi512_si256(avx_name);                    \
-    __m256i avx_name##_high = _mm512_extracti64x4_epi64(avx_name, 1)              \
+    __m256i avx_name##_low = _mm512_castsi512_si256((__m512i)avx_name);           \
+    __m256i avx_name##_high = _mm512_extracti64x4_epi64((__m512i)avx_name, 1)     \
+
+#define XSIMD_SPLITPS_AVX512(avx_name)                                            \
+    __m256 avx_name##_low = _mm512_castps512_ps256((__m512)avx_name);             \
+    __m256 avx_name##_high = _mm512_extractf32x8_ps((__m512)avx_name, 1)          \
+
+#define XSIMD_SPLITPD_AVX512(avx_name)                                            \
+    __m256d avx_name##_low = _mm512_castpd512_pd256((__m512d)avx_name);           \
+    __m256d avx_name##_high = _mm512_extractf64x4_pd((__m512d)avx_name, 1)        \
 
 #define XSIMD_RETURN_MERGED_AVX(res_low, res_high)                                \
     __m512i result = _mm512_castsi256_si512(res_low);                             \
     return _mm512_inserti64x4(result, res_high, 1)                                \
 
+#define XSIMD_RETURN_MERGEDPS_AVX(res_low, res_high)                              \
+    __m512 result = _mm512_castps256_ps512(res_low);                              \
+    return _mm512_insertf32x8(result, res_high, 1)                                \
+
+#define XSIMD_RETURN_MERGEDPD_AVX(res_low, res_high)                              \
+    __m512d result = _mm512_castpd256_pd512(res_low);                             \
+    return _mm512_insertf64x4(result, res_high, 1)                                \
+
 #define XSIMD_APPLY_AVX2_FUNCTION(N, func, avx_lhs, avx_rhs)                      \
     XSIMD_SPLIT_AVX512(avx_lhs);                                                  \
     XSIMD_SPLIT_AVX512(avx_rhs);                                                  \
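
The new `PS`/`PD` macro variants generalize the existing integer split/merge helpers: perform a 512-bit operation as two 256-bit AVX operations and stitch the halves back together. The added `(__m512i)`/`(__m512)`/`(__m512d)` casts force the conversion from xsimd's `batch` wrapper to the raw register type before the intrinsic is called, apparently sidestepping the same overload trouble referenced in xtensor-stack/xsimd#315. A sketch of what an expansion looks like for a hypothetical float negation (illustrative only; `_mm512_extractf32x8_ps` and `_mm512_insertf32x8` require AVX512DQ):

    #include <immintrin.h>

    static inline __m512 negate_via_avx(__m512 x)
    {
        // XSIMD_SPLITPS_AVX512(x) expands to these two declarations:
        __m256 x_low  = _mm512_castps512_ps256(x);
        __m256 x_high = _mm512_extractf32x8_ps(x, 1);

        // Per-half work done with plain AVX intrinsics (flip sign bits).
        __m256 res_low  = _mm256_xor_ps(x_low,  _mm256_set1_ps(-0.0f));
        __m256 res_high = _mm256_xor_ps(x_high, _mm256_set1_ps(-0.0f));

        // XSIMD_RETURN_MERGEDPS_AVX(res_low, res_high) expands to:
        __m512 result = _mm512_castps256_ps512(res_low);
        return _mm512_insertf32x8(result, res_high, 1);
    }
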
diff --git a/include/xsimd/types/xsimd_avx_conversion.hpp b/include/xsimd/types/xsimd_avx_conversion.hpp
index 0f1a5617b..166f01e4a 100644
--- a/include/xsimd/types/xsimd_avx_conversion.hpp
+++ b/include/xsimd/types/xsimd_avx_conversion.hpp
@@ -123,7 +123,17 @@ namespace xsimd
     XSIMD_BATCH_CAST_INTRINSIC(int32_t, uint16_t, 8, _mm256_cvtepi32_epi16)
     XSIMD_BATCH_CAST_INTRINSIC(uint32_t, int16_t, 8, _mm256_cvtepi32_epi16)
     XSIMD_BATCH_CAST_INTRINSIC(uint32_t, uint16_t, 8, _mm256_cvtepi32_epi16)
+#if defined(_MSC_VER)
+    namespace detail {
+        static inline __m256 xsimd_mm256_cvtepu32_ps(__m256i a)
+        {
+            return _mm512_castps512_ps256(_mm512_cvtepu32_ps(_mm512_castsi256_si512(a)));
+        }
+    }
+    XSIMD_BATCH_CAST_INTRINSIC(uint32_t, float, 8, detail::xsimd_mm256_cvtepu32_ps)
+#else
     XSIMD_BATCH_CAST_INTRINSIC(uint32_t, float, 8, _mm256_cvtepu32_ps)
+#endif
     XSIMD_BATCH_CAST_INTRINSIC(uint32_t, double, 4, _mm256_cvtepu32_pd)
     XSIMD_BATCH_CAST_INTRINSIC(int64_t, int32_t, 4, _mm256_cvtepi64_epi32)
     XSIMD_BATCH_CAST_INTRINSIC(int64_t, uint32_t, 4, _mm256_cvtepi64_epi32)
diff --git a/include/xsimd/types/xsimd_sse_conversion.hpp b/include/xsimd/types/xsimd_sse_conversion.hpp
index aa54193d8..151fbc87c 100644
--- a/include/xsimd/types/xsimd_sse_conversion.hpp
+++ b/include/xsimd/types/xsimd_sse_conversion.hpp
@@ -87,7 +87,19 @@ namespace xsimd
     XSIMD_BATCH_CAST_IMPLICIT(uint64_t, int64_t, 2)
     XSIMD_BATCH_CAST_INTRINSIC(float, int32_t, 4, _mm_cvttps_epi32)
 #if defined(XSIMD_AVX512VL_AVAILABLE)
+
+#if defined(_MSC_VER)
+    namespace detail {
+        static inline __m128 xsimd_mm_cvtepu32_ps(__m128i a)
+        {
+            return _mm512_castps512_ps128(_mm512_cvtepu32_ps(_mm512_castsi128_si512(a)));
+        }
+    }
+    XSIMD_BATCH_CAST_INTRINSIC(uint32_t, float, 4, detail::xsimd_mm_cvtepu32_ps)
+#else
     XSIMD_BATCH_CAST_INTRINSIC(uint32_t, float, 4, _mm_cvtepu32_ps)
+#endif
+
     XSIMD_BATCH_CAST_INTRINSIC(float, uint32_t, 4, _mm_cvttps_epu32)
 #if defined(XSIMD_AVX512DQ_AVAILABLE)
     XSIMD_BATCH_CAST_INTRINSIC(int64_t, double, 2, _mm_cvtepi64_pd)
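
Both conversion workarounds follow the same pattern: `_mm256_cvtepu32_ps` and `_mm_cvtepu32_ps` are AVX512VL intrinsics that MSVC's headers (at least at the time of this patch) do not expose, so the shims widen the input to a 512-bit register, convert with the plain AVX512F `_mm512_cvtepu32_ps`, and truncate back. A usage sketch equivalent to the 256-bit helper above (names illustrative):

    #include <immintrin.h>

    // Convert eight unsigned 32-bit lanes to float without AVX512VL.
    static inline __m256 cvtepu32_ps_fallback(__m256i u)
    {
        __m512i wide = _mm512_castsi256_si512(u);  // upper 256 bits undefined
        __m512  conv = _mm512_cvtepu32_ps(wide);   // AVX512F conversion
        return _mm512_castps512_ps256(conv);       // keep the low 8 lanes
    }

The undefined upper half is harmless: whatever it converts to is discarded by the final cast.
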