xtensor-stack · JohanMabille · Apr 13, 2021 · Feb 24, 2021 · Apr 7, 2021 · Apr 7, 2021
diff --git a/.appveyor.yml b/.appveyor.yml
@@ -6,12 +6,22 @@ platform:
   - x64
 
 environment:
+  global:  # variables shared by every job of the matrix
+    MINICONDA: C:\xsimd-conda
   matrix:
-    - MINICONDA: C:\xsimd-conda
+    - JOB: "AVX2"
+      CXXFLAGS: "/arch:AVX2"
+      VCVARSALL: "C:\\Program Files (x86)\\Microsoft Visual Studio 14.0\\VC\\vcvarsall.bat"  # expanded by the init step
+      RUNTEST: ".\\test_xsimd"  # expanded by build_script
+    - JOB: "AVX512"
+      CXXFLAGS: "/arch:AVX512"
+      APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017
+      VCVARSALL: "C:\\Program Files (x86)\\Microsoft Visual Studio\\2017\\Community\\VC\\Auxiliary\\Build\\vcvarsall.bat"
+      RUNTEST: "ECHO"  # test binary is not executed for this job (NOTE(review): presumably the CI workers lack AVX-512 - confirm)
 
 init:
   - "ECHO %MINICONDA%"
-  - C:\"Program Files (x86)"\"Microsoft Visual Studio 14.0"\VC\vcvarsall.bat %PLATFORM%
+  - call "%VCVARSALL%" %PLATFORM%
   - ps: if($env:Platform -eq "x64"){Start-FileDownload 'http://repo.continuum.io/miniconda/Miniconda3-latest-Windows-x86_64.exe' C:\Miniconda.exe; echo "Done"}
   - ps: if($env:Platform -eq "x86"){Start-FileDownload 'http://repo.continuum.io/miniconda/Miniconda3-latest-Windows-x86.exe' C:\Miniconda.exe; echo "Done"}
   - cmd: C:\Miniconda.exe /S /D=C:\xsimd-conda
@@ -27,4 +37,4 @@ install:
   - cd test
 
 build_script:
-  - .\test_xsimd
+  - "%RUNTEST%"  # per-job command taken from the environment matrix
diff --git a/include/xsimd/math/xsimd_rounding.hpp b/include/xsimd/math/xsimd_rounding.hpp
@@ -329,13 +329,13 @@ namespace xsimd
 
             static inline batch_type ceil(const batch_type& x)
             {
-                auto res = _mm512_ceil_ps(x);
+                auto res = _mm512_roundscale_ps(x, _MM_FROUND_TO_POS_INF); // round each float lane toward +infinity
                 return res;
             }
 
             static inline batch_type floor(const batch_type& x)
             {
-                auto res = _mm512_floor_ps(x);
+                auto res = _mm512_roundscale_ps(x, _MM_FROUND_TO_NEG_INF); // round each float lane toward -infinity
                 return res;
             }
 
@@ -359,13 +359,13 @@ namespace xsimd
 
             static inline batch_type ceil(const batch_type& x)
             {
-                auto res = _mm512_ceil_pd(x);
+                auto res = _mm512_roundscale_pd(x, _MM_FROUND_TO_POS_INF); // round each double lane toward +infinity
                 return res;
             }
 
             static inline batch_type floor(const batch_type& x)
             {
-                auto res = _mm512_floor_pd(x);
+                auto res = _mm512_roundscale_pd(x, _MM_FROUND_TO_NEG_INF); // round each double lane toward -infinity
                 return res;
             }
 

diff --git a/include/xsimd/types/xsimd_avx512_double.hpp b/include/xsimd/types/xsimd_avx512_double.hpp
@@ -449,8 +449,11 @@ namespace xsimd
 
             static batch_type abs(const batch_type& rhs)
             {
-                return (__m512d)(_mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),
-                                                  (__m512i)((__m512d)(rhs))));
+                // Clear the sign bit in the integer domain. The cast intrinsics only
+                // reinterpret the register, so no pointer-based type punning is needed.
+                __m512i rhs_asi = _mm512_castpd_si512((__m512d)rhs);
+                __m512i res_asi = _mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF), rhs_asi);
+                return _mm512_castsi512_pd(res_asi);
             }
 
             static batch_type fabs(const batch_type& rhs)
@@ -487,7 +490,7 @@ namespace xsimd
             {
                 __m256d tmp1 = _mm512_extractf64x4_pd(rhs, 1);
                 __m256d tmp2 = _mm512_extractf64x4_pd(rhs, 0);
-                __m256d res1 = tmp1 + tmp2;
+                __m256d res1 = _mm256_add_pd(tmp1, tmp2);
                 return xsimd::hadd(batch<double, 4>(res1));
             }
 
@@ -498,7 +501,7 @@ namespace xsimd
         {                                                                    \
             auto tmp1 = _mm512_shuffle_f64x2(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \
             auto tmp2 = _mm512_shuffle_f64x2(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \
-            res ## I = (tmp1 + tmp2);                                        \
+            res ## I = _mm512_add_pd(tmp1, tmp2);                                        \
         }                                                                    \
 
                 step1(1, row[0], row[2]);
@@ -511,12 +514,12 @@ namespace xsimd
                 batch<double, 8> tmp5 = _mm512_shuffle_f64x2(res1, res2, _MM_SHUFFLE(2, 0, 2, 0));
                 batch<double, 8> tmp6 = _mm512_shuffle_f64x2(res1, res2, _MM_SHUFFLE(3, 1, 3, 1));
 
-                batch<double, 8> resx1 = (tmp5 + tmp6);
+                batch<double, 8> resx1 = _mm512_add_pd(tmp5, tmp6);
 
                 batch<double, 8> tmp7 = _mm512_shuffle_f64x2(res3, res4, _MM_SHUFFLE(2, 0, 2, 0));
                 batch<double, 8> tmp8 = _mm512_shuffle_f64x2(res3, res4, _MM_SHUFFLE(3, 1, 3, 1));
 
-                batch<double, 8> resx2 = (tmp7 + tmp8);
+                batch<double, 8> resx2 = _mm512_add_pd(tmp7, tmp8);
 
                 batch<double, 8> tmpx = _mm512_shuffle_pd(resx1, resx2, 0b00000000);
                 batch<double, 8> tmpy = _mm512_shuffle_pd(resx1, resx2, 0b11111111);

diff --git a/include/xsimd/types/xsimd_avx512_float.hpp b/include/xsimd/types/xsimd_avx512_float.hpp
@@ -472,8 +472,11 @@ namespace xsimd
 
             static batch_type abs(const batch_type& rhs)
             {
-                return (__m512)(_mm512_and_epi32((__m512i)((__m512)(rhs)),
-                                                 _mm512_set1_epi32(0x7fffffff)));
+                // Clear the sign bit in the integer domain. The cast intrinsics only
+                // reinterpret the register, so no pointer-based type punning is needed.
+                __m512i rhs_asi = _mm512_castps_si512((__m512)rhs);
+                __m512i res_asi = _mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF), rhs_asi);
+                return _mm512_castsi512_ps(res_asi);
             }
 
             static batch_type fabs(const batch_type& rhs)
@@ -510,7 +513,7 @@ namespace xsimd
             {
                 __m256 tmp1 = _mm512_extractf32x8_ps(rhs, 1);
                 __m256 tmp2 = _mm512_extractf32x8_ps(rhs, 0);
-                __m256 res1 = tmp1 + tmp2;
+                __m256 res1 = _mm256_add_ps(tmp1, tmp2);
                 return xsimd::hadd(batch<float, 8>(res1));
             }
 
@@ -524,7 +527,7 @@ namespace xsimd
         {                                                                                      \
             auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(1, 0, 1, 0));                   \
             auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 2, 3, 2));                   \
-            res ## I = tmp1 + tmp2;                                                            \
+            res ## I = _mm512_add_ps(tmp1, tmp2);                                              \
         }                                                                                      \
 
                 XSIMD_AVX512_HADDP_STEP1(0, row[0], row[2]);
@@ -548,17 +551,17 @@ namespace xsimd
             batch<float, 16> tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(2, 0, 2, 0));        \
             batch<float, 16> tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 1, 3, 1));        \
                                                                                                 \
-            batch<float, 16> resx1 = tmp1 + tmp2;                                               \
+            batch<float, 16> resx1 = _mm512_add_ps(tmp1, tmp2);                                               \
                                                                                                 \
             batch<float, 16> tmp3 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(2, 0, 2, 0));        \
             batch<float, 16> tmp4 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(3, 1, 3, 1));        \
                                                                                                 \
-            batch<float, 16> resx2 = tmp3 + tmp4;                                               \
+            batch<float, 16> resx2 = _mm512_add_ps(tmp3, tmp4);                                               \
                                                                                                 \
             batch<float, 16> tmp5 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(2, 0, 2, 0));   \
             batch<float, 16> tmp6 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(3, 1, 3, 1));   \
                                                                                                 \
-            batch<float, 16> resx3 = tmp5 + tmp6;                                               \
+            batch<float, 16> resx3 = _mm512_add_ps(tmp5, tmp6);                                               \
                                                                                                 \
             halfx ## I  = _mm256_hadd_ps(_mm512_extractf32x8_ps(resx3, 0),                      \
                                          _mm512_extractf32x8_ps(resx3, 1));                     \
@@ -576,7 +579,20 @@ namespace xsimd
 
             static batch_type select(const batch_bool_type& cond, const batch_type& a, const batch_type& b)
             {
+            #if !defined(_MSC_VER)
                 return _mm512_mask_blend_ps(cond, b, a);
+            #else
+                // MSVC path: expand the 16-bit mask into all-ones/all-zeros 32-bit lanes, then blend each 256-bit half.
+                __m512 mcond = _mm512_castsi512_ps(_mm512_maskz_broadcastd_epi32((__mmask16)cond, _mm_set1_epi32(~0)));
+                XSIMD_SPLITPS_AVX512(mcond);
+                XSIMD_SPLITPS_AVX512(a);
+                XSIMD_SPLITPS_AVX512(b);
+
+                auto res_lo = _mm256_blendv_ps(b_low, a_low, mcond_low);
+                auto res_hi = _mm256_blendv_ps(b_high, a_high, mcond_high);
+
+                XSIMD_RETURN_MERGEDPS_AVX(res_lo, res_hi);
+            #endif
             }
 
             static batch_bool_type isnan(const batch_type& x)

diff --git a/include/xsimd/types/xsimd_avx512_int16.hpp b/include/xsimd/types/xsimd_avx512_int16.hpp
@@ -316,19 +316,23 @@ namespace xsimd
 
             static batch_type select(const batch_bool_type& cond, const batch_type& a, const batch_type& b)
             {
-            #if defined(XSIMD_AVX512BW_AVAILABLE)
-                // Some compilers are not happy with passing directly a and b to the intrinsics
-                // See https://github.com/xtensor-stack/xsimd/issues/315
-                __m512i ma = a;
-                __m512i mb = b;
-                return _mm512_mask_blend_epi16(cond, mb, ma);
+            #if defined(XSIMD_AVX512BW_AVAILABLE) && !defined(_MSC_VER)
+                auto res = _mm512_mask_blend_epi16((__mmask32)cond, (__m512i)b, (__m512i)a);
+                return batch_type(res);
             #else
-                XSIMD_SPLIT_AVX512(cond);
+            #if defined(XSIMD_AVX512BW_AVAILABLE)
+                // AVX512BW available (MSVC): expand the bit mask into all-ones/all-zeros 16-bit lanes.
+                __m512i mcond = _mm512_maskz_broadcastw_epi16((__mmask32)cond, _mm_set1_epi32(~0));
+            #else
+                // No AVX512BW: broadcastw is unavailable; cond is expected to already be an __m512i lane mask (NOTE(review): confirm against batch_bool<int16_t, 32>).
+                __m512i mcond = (__m512i)cond;
+            #endif
+                XSIMD_SPLIT_AVX512(mcond);
                 XSIMD_SPLIT_AVX512(a);
                 XSIMD_SPLIT_AVX512(b);
 
-                auto res_lo = _mm256_blendv_epi8(b_low, a_low, cond_low);
-                auto res_hi = _mm256_blendv_epi8(b_high, a_high, cond_high);
+                auto res_lo = _mm256_blendv_epi8(b_low, a_low, mcond_low);
+                auto res_hi = _mm256_blendv_epi8(b_high, a_high, mcond_high);
 
                 XSIMD_RETURN_MERGED_AVX(res_lo, res_hi);
             #endif

diff --git a/include/xsimd/types/xsimd_avx512_int32.hpp b/include/xsimd/types/xsimd_avx512_int32.hpp
@@ -236,7 +236,7 @@ namespace xsimd
                 // TODO Why not _mm512_reduce_add_...?
                 __m256i tmp1 = _mm512_extracti32x8_epi32(rhs, 0);
                 __m256i tmp2 = _mm512_extracti32x8_epi32(rhs, 1);
-                __m256i res1 = tmp1 + tmp2;
+                __m256i res1 = _mm256_add_epi32(tmp1, tmp2);
                 return xsimd::hadd(batch<int32_t, 8>(res1));
             }
 

diff --git a/include/xsimd/types/xsimd_avx512_int64.hpp b/include/xsimd/types/xsimd_avx512_int64.hpp
@@ -293,13 +293,25 @@ namespace xsimd
             {
                 __m256i tmp1 = _mm512_extracti32x8_epi32(rhs, 0);
                 __m256i tmp2 = _mm512_extracti32x8_epi32(rhs, 1);
-                __m256i res1 = tmp1 + tmp2;
+                __m256i res1 = _mm256_add_epi64(tmp1, tmp2);
                 return xsimd::hadd(batch<int64_t, 4>(res1));
             }
 
             static batch_type select(const batch_bool_type& cond, const batch_type& a, const batch_type& b)
             {
+            #if !defined(_MSC_VER)
                 return _mm512_mask_blend_epi64(cond, b, a);
+            #else
+                __m512i mcond = _mm512_maskz_broadcastq_epi64((__mmask8)cond, _mm_set1_epi32(~0)); // all-ones 64-bit lane where the mask bit is set, zero elsewhere
+                XSIMD_SPLIT_AVX512(mcond);
+                XSIMD_SPLIT_AVX512(a);
+                XSIMD_SPLIT_AVX512(b);
+
+                auto res_lo = _mm256_blendv_epi8(b_low, a_low, mcond_low); // takes a where the lane mask is set, b otherwise
+                auto res_hi = _mm256_blendv_epi8(b_high, a_high, mcond_high);
+
+                XSIMD_RETURN_MERGED_AVX(res_lo, res_hi); // macro expands to the return statement
+            #endif
             }
         };
 

diff --git a/include/xsimd/types/xsimd_avx512_int_base.hpp b/include/xsimd/types/xsimd_avx512_int_base.hpp
@@ -18,13 +18,35 @@ namespace xsimd
 {
 
+// Declares avx_name##_low / avx_name##_high (__m256i halves of avx_name); avx_name must be an identifier, no trailing semicolon.
 #define XSIMD_SPLIT_AVX512(avx_name)                                                                  \
-    __m256i avx_name##_low = _mm512_castsi512_si256(avx_name);                                        \
-    __m256i avx_name##_high = _mm512_extracti64x4_epi64(avx_name, 1)                                  \
+    __m256i avx_name##_low = _mm512_castsi512_si256((__m512i)avx_name);                                        \
+    __m256i avx_name##_high = _mm512_extracti64x4_epi64((__m512i)avx_name, 1)                                  \
+
+// Same as XSIMD_SPLIT_AVX512 for packed floats: declares __m256 avx_name##_low / avx_name##_high.
+#define XSIMD_SPLITPS_AVX512(avx_name)                                                                  \
+    __m256 avx_name##_low = _mm512_castps512_ps256((__m512)avx_name);                                        \
+    __m256 avx_name##_high = _mm512_extractf32x8_ps((__m512)avx_name, 1)                                  \
+
+// Same as XSIMD_SPLIT_AVX512 for packed doubles: declares __m256d avx_name##_low / avx_name##_high.
+#define XSIMD_SPLITPD_AVX512(avx_name)                                                                  \
+    __m256d avx_name##_low = _mm512_castpd512_pd256((__m512d)avx_name);                                        \
+    __m256d avx_name##_high = _mm512_extractf64x4_pd((__m512d)avx_name, 1)                                  \
 
+// Joins two __m256i halves and RETURNS the __m512i from the enclosing function; declares a local named "result".
 #define XSIMD_RETURN_MERGED_AVX(res_low, res_high)                                                    \
     __m512i result = _mm512_castsi256_si512(res_low);                                                 \
     return _mm512_inserti64x4(result, res_high, 1)                                                    \
 
+// Joins two __m256 halves and RETURNS the __m512 from the enclosing function; declares a local named "result".
+#define XSIMD_RETURN_MERGEDPS_AVX(res_low, res_high)                                                    \
+    __m512 result = _mm512_castps256_ps512(res_low);                                                 \
+    return _mm512_insertf32x8(result, res_high, 1)                                                    \
+
+// Joins two __m256d halves and RETURNS the __m512d from the enclosing function; declares a local named "result".
+#define XSIMD_RETURN_MERGEDPD_AVX(res_low, res_high)                                                    \
+    __m512d result = _mm512_castpd256_pd512(res_low);                                                 \
+    return _mm512_insertf64x4(result, res_high, 1)                                                    \
+
 #define XSIMD_APPLY_AVX2_FUNCTION(N, func, avx_lhs, avx_rhs)                                          \
     XSIMD_SPLIT_AVX512(avx_lhs);                                                                      \
     XSIMD_SPLIT_AVX512(avx_rhs);                                                                      \

diff --git a/include/xsimd/types/xsimd_avx_conversion.hpp b/include/xsimd/types/xsimd_avx_conversion.hpp
@@ -123,7 +123,17 @@ namespace xsimd
     XSIMD_BATCH_CAST_INTRINSIC(int32_t, uint16_t, 8, _mm256_cvtepi32_epi16)
     XSIMD_BATCH_CAST_INTRINSIC(uint32_t, int16_t, 8, _mm256_cvtepi32_epi16)
     XSIMD_BATCH_CAST_INTRINSIC(uint32_t, uint16_t, 8, _mm256_cvtepi32_epi16)
+#if defined(_MSC_VER) // NOTE(review): presumably MSVC lacks _mm256_cvtepu32_ps - confirm
+    namespace detail {
+        static inline __m256 xsimd_mm256_cvtepu32_ps(__m256i a)
+        {
+          return _mm512_castps512_ps256(_mm512_cvtepu32_ps(_mm512_castsi256_si512(a))); // widen to 512 bits, convert, keep the low 256 bits
+        }
+    }
+    XSIMD_BATCH_CAST_INTRINSIC(uint32_t, float, 8, detail::xsimd_mm256_cvtepu32_ps)
+#else
     XSIMD_BATCH_CAST_INTRINSIC(uint32_t, float, 8, _mm256_cvtepu32_ps)
+#endif
     XSIMD_BATCH_CAST_INTRINSIC(uint32_t, double, 4, _mm256_cvtepu32_pd)
     XSIMD_BATCH_CAST_INTRINSIC(int64_t, int32_t, 4, _mm256_cvtepi64_epi32)
     XSIMD_BATCH_CAST_INTRINSIC(int64_t, uint32_t, 4, _mm256_cvtepi64_epi32)

diff --git a/include/xsimd/types/xsimd_sse_conversion.hpp b/include/xsimd/types/xsimd_sse_conversion.hpp
@@ -87,7 +87,19 @@ namespace xsimd
     XSIMD_BATCH_CAST_IMPLICIT(uint64_t, int64_t, 2)
     XSIMD_BATCH_CAST_INTRINSIC(float, int32_t, 4, _mm_cvttps_epi32)
 #if defined(XSIMD_AVX512VL_AVAILABLE)
+
+#if defined(_MSC_VER) // NOTE(review): presumably MSVC lacks _mm_cvtepu32_ps - confirm
+    namespace detail {
+        static inline __m128 xsimd_mm_cvtepu32_ps(__m128i a)
+        {
+          return _mm512_castps512_ps128(_mm512_cvtepu32_ps(_mm512_castsi128_si512(a))); // widen to 512 bits, convert, keep the low 128 bits
+        }
+    }
+    XSIMD_BATCH_CAST_INTRINSIC(uint32_t, float, 4, detail::xsimd_mm_cvtepu32_ps)
+#else
     XSIMD_BATCH_CAST_INTRINSIC(uint32_t, float, 4, _mm_cvtepu32_ps)
+#endif
+
     XSIMD_BATCH_CAST_INTRINSIC(float, uint32_t, 4, _mm_cvttps_epu32)
 #if defined(XSIMD_AVX512DQ_AVAILABLE)
     XSIMD_BATCH_CAST_INTRINSIC(int64_t, double, 2, _mm_cvtepi64_pd)