Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 13 additions & 3 deletions .appveyor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,22 @@ platform:
- x64

environment:
global:
MINICONDA: C:\xsimd-conda
matrix:
- MINICONDA: C:\xsimd-conda
- JOB: "AVX2"
CXXFLAGS: "/arch:AVX2"
VCVARSALL: "C:\\Program Files (x86)\\Microsoft Visual Studio 14.0\\VC\\vcvarsall.bat"
RUNTEST: ".\\test_xsimd"
- JOB: "AVX512"
CXXFLAGS: "/arch:AVX512"
APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017
VCVARSALL: "C:\\Program Files (x86)\\Microsoft Visual Studio\\2017\\Community\\VC\\Auxiliary\\Build\\vcvarsall.bat"
RUNTEST: "ECHO"

init:
- "ECHO %MINICONDA%"
- C:\"Program Files (x86)"\"Microsoft Visual Studio 14.0"\VC\vcvarsall.bat %PLATFORM%
- call "%VCVARSALL%" %PLATFORM%
- ps: if($env:Platform -eq "x64"){Start-FileDownload 'http://repo.continuum.io/miniconda/Miniconda3-latest-Windows-x86_64.exe' C:\Miniconda.exe; echo "Done"}
- ps: if($env:Platform -eq "x86"){Start-FileDownload 'http://repo.continuum.io/miniconda/Miniconda3-latest-Windows-x86.exe' C:\Miniconda.exe; echo "Done"}
- cmd: C:\Miniconda.exe /S /D=C:\xsimd-conda
Expand All @@ -27,4 +37,4 @@ install:
- cd test

build_script:
- .\test_xsimd
- "%RUNTEST%"
8 changes: 4 additions & 4 deletions include/xsimd/math/xsimd_rounding.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -329,13 +329,13 @@ namespace xsimd

// Rounds every single-precision lane of x up, toward positive infinity.
// _mm512_roundscale_ps is used with the _MM_FROUND_TO_POS_INF rounding mode;
// the scale field of the immediate is left at zero, so lanes are rounded to
// integral values.
static inline batch_type ceil(const batch_type& x)
{
    return _mm512_roundscale_ps(x, _MM_FROUND_TO_POS_INF);
}

// Rounds every single-precision lane of x down, toward negative infinity.
// _mm512_roundscale_ps is used with the _MM_FROUND_TO_NEG_INF rounding mode;
// the scale field of the immediate is left at zero, so lanes are rounded to
// integral values.
static inline batch_type floor(const batch_type& x)
{
    return _mm512_roundscale_ps(x, _MM_FROUND_TO_NEG_INF);
}

Expand All @@ -359,13 +359,13 @@ namespace xsimd

// Rounds every double-precision lane of x up, toward positive infinity.
// _mm512_roundscale_pd is used with the _MM_FROUND_TO_POS_INF rounding mode;
// the scale field of the immediate is left at zero, so lanes are rounded to
// integral values.
static inline batch_type ceil(const batch_type& x)
{
    return _mm512_roundscale_pd(x, _MM_FROUND_TO_POS_INF);
}

// Rounds every double-precision lane of x down, toward negative infinity.
// _mm512_roundscale_pd is used with the _MM_FROUND_TO_NEG_INF rounding mode;
// the scale field of the immediate is left at zero, so lanes are rounded to
// integral values.
static inline batch_type floor(const batch_type& x)
{
    return _mm512_roundscale_pd(x, _MM_FROUND_TO_NEG_INF);
}

Expand Down
15 changes: 9 additions & 6 deletions include/xsimd/types/xsimd_avx512_double.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -449,8 +449,11 @@ namespace xsimd

// Absolute value of every double-precision lane, computed by clearing the
// sign bit (the top bit of each 64-bit element).
static batch_type abs(const batch_type& rhs)
{
    // Materialise the raw register first so the intrinsics below receive a
    // plain __m512d rather than the batch wrapper.
    const __m512d value = static_cast<__m512d>(rhs);

    // The cast intrinsics are pure bit reinterpretations; they replace both
    // the C-style vector casts and the pointer-based type punning.
    const __m512i bits = _mm512_castpd_si512(value);
    const __m512i sign_cleared = _mm512_and_epi64(bits, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF));
    return _mm512_castsi512_pd(sign_cleared);
}

static batch_type fabs(const batch_type& rhs)
Expand Down Expand Up @@ -487,7 +490,7 @@ namespace xsimd
{
__m256d tmp1 = _mm512_extractf64x4_pd(rhs, 1);
__m256d tmp2 = _mm512_extractf64x4_pd(rhs, 0);
__m256d res1 = tmp1 + tmp2;
__m256d res1 = _mm256_add_pd(tmp1, tmp2);
return xsimd::hadd(batch<double, 4>(res1));
}

Expand All @@ -498,7 +501,7 @@ namespace xsimd
{ \
auto tmp1 = _mm512_shuffle_f64x2(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \
auto tmp2 = _mm512_shuffle_f64x2(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \
res ## I = (tmp1 + tmp2); \
res ## I = _mm512_add_pd(tmp1, tmp2); \
} \

step1(1, row[0], row[2]);
Expand All @@ -511,12 +514,12 @@ namespace xsimd
batch<double, 8> tmp5 = _mm512_shuffle_f64x2(res1, res2, _MM_SHUFFLE(2, 0, 2, 0));
batch<double, 8> tmp6 = _mm512_shuffle_f64x2(res1, res2, _MM_SHUFFLE(3, 1, 3, 1));

batch<double, 8> resx1 = (tmp5 + tmp6);
batch<double, 8> resx1 = _mm512_add_pd(tmp5, tmp6);

batch<double, 8> tmp7 = _mm512_shuffle_f64x2(res3, res4, _MM_SHUFFLE(2, 0, 2, 0));
batch<double, 8> tmp8 = _mm512_shuffle_f64x2(res3, res4, _MM_SHUFFLE(3, 1, 3, 1));

batch<double, 8> resx2 = (tmp7 + tmp8);
batch<double, 8> resx2 = _mm512_add_pd(tmp7, tmp8);

batch<double, 8> tmpx = _mm512_shuffle_pd(resx1, resx2, 0b00000000);
batch<double, 8> tmpy = _mm512_shuffle_pd(resx1, resx2, 0b11111111);
Expand Down
30 changes: 23 additions & 7 deletions include/xsimd/types/xsimd_avx512_float.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -472,8 +472,11 @@ namespace xsimd

// Absolute value of every single-precision lane, computed by clearing the
// sign bit (the top bit of each 32-bit element).
static batch_type abs(const batch_type& rhs)
{
    // Materialise the raw register first so the intrinsics below receive a
    // plain __m512 rather than the batch wrapper.
    const __m512 value = static_cast<__m512>(rhs);

    // The cast intrinsics are pure bit reinterpretations; they replace both
    // the C-style vector casts and the pointer-based type punning.
    const __m512i bits = _mm512_castps_si512(value);
    const __m512i sign_cleared = _mm512_and_epi32(bits, _mm512_set1_epi32(0x7FFFFFFF));
    return _mm512_castsi512_ps(sign_cleared);
}

static batch_type fabs(const batch_type& rhs)
Expand Down Expand Up @@ -510,7 +513,7 @@ namespace xsimd
{
__m256 tmp1 = _mm512_extractf32x8_ps(rhs, 1);
__m256 tmp2 = _mm512_extractf32x8_ps(rhs, 0);
__m256 res1 = tmp1 + tmp2;
__m256 res1 = _mm256_add_ps(tmp1, tmp2);
return xsimd::hadd(batch<float, 8>(res1));
}

Expand All @@ -524,7 +527,7 @@ namespace xsimd
{ \
auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \
auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \
res ## I = tmp1 + tmp2; \
res ## I = _mm512_add_ps(tmp1, tmp2); \
} \

XSIMD_AVX512_HADDP_STEP1(0, row[0], row[2]);
Expand All @@ -548,17 +551,17 @@ namespace xsimd
batch<float, 16> tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(2, 0, 2, 0)); \
batch<float, 16> tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 1, 3, 1)); \
\
batch<float, 16> resx1 = tmp1 + tmp2; \
batch<float, 16> resx1 = _mm512_add_ps(tmp1, tmp2); \
\
batch<float, 16> tmp3 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(2, 0, 2, 0)); \
batch<float, 16> tmp4 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(3, 1, 3, 1)); \
\
batch<float, 16> resx2 = tmp3 + tmp4; \
batch<float, 16> resx2 = _mm512_add_ps(tmp3, tmp4); \
\
batch<float, 16> tmp5 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(2, 0, 2, 0)); \
batch<float, 16> tmp6 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(3, 1, 3, 1)); \
\
batch<float, 16> resx3 = tmp5 + tmp6; \
batch<float, 16> resx3 = _mm512_add_ps(tmp5, tmp6); \
\
halfx ## I = _mm256_hadd_ps(_mm512_extractf32x8_ps(resx3, 0), \
_mm512_extractf32x8_ps(resx3, 1)); \
Expand All @@ -576,7 +579,20 @@ namespace xsimd

// Lane-wise selection: the result takes the lane of a where the matching bit
// of cond is set and the lane of b otherwise.
static batch_type select(const batch_bool_type& cond, const batch_type& a, const batch_type& b)
{
#if !defined(_MSC_VER)
    return _mm512_mask_blend_ps(cond, b, a);
#else
    // NOTE(review): presumably the direct mask-blend call does not build with
    // MSVC, hence this 256-bit fallback - confirm against the MSVC build.
    //
    // Expand the 16-bit mask into a vector mask: lanes whose bit is set become
    // all ones, the others zero. The integer-to-float step is a pure bit cast.
    __m512i mcondi = _mm512_maskz_broadcastd_epi32((__mmask16)cond, _mm_set1_epi32(~0));
    __m512 mcond = _mm512_castsi512_ps(mcondi);

    // Each macro declares <name>_low / <name>_high holding the 256-bit halves.
    XSIMD_SPLITPS_AVX512(mcond);
    XSIMD_SPLITPS_AVX512(a);
    XSIMD_SPLITPS_AVX512(b);

    // _mm256_blendv_ps picks its second operand where the mask sign bit is set.
    auto res_lo = _mm256_blendv_ps(b_low, a_low, mcond_low);
    auto res_hi = _mm256_blendv_ps(b_high, a_high, mcond_high);

    XSIMD_RETURN_MERGEDPS_AVX(res_lo, res_hi);
#endif
}

static batch_bool_type isnan(const batch_type& x)
Expand Down
16 changes: 7 additions & 9 deletions include/xsimd/types/xsimd_avx512_int16.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -316,19 +316,17 @@ namespace xsimd

static batch_type select(const batch_bool_type& cond, const batch_type& a, const batch_type& b)
{
#if defined(XSIMD_AVX512BW_AVAILABLE)
// Some compilers are not happy with passing directly a and b to the intrinsics
// See https://github.com/xtensor-stack/xsimd/issues/315
__m512i ma = a;
__m512i mb = b;
return _mm512_mask_blend_epi16(cond, mb, ma);
#if defined(XSIMD_AVX512BW_AVAILABLE) && !defined(_MSC_VER)
auto res = _mm512_mask_blend_epi16((__mmask32)cond, (__m512i)b, (__m512i)a);
return batch_type(res);
#else
XSIMD_SPLIT_AVX512(cond);
__m512i mcond = _mm512_maskz_broadcastw_epi16((__mmask32)cond, _mm_set1_epi32(~0));
XSIMD_SPLIT_AVX512(mcond);
XSIMD_SPLIT_AVX512(a);
XSIMD_SPLIT_AVX512(b);

auto res_lo = _mm256_blendv_epi8(b_low, a_low, cond_low);
auto res_hi = _mm256_blendv_epi8(b_high, a_high, cond_high);
auto res_lo = _mm256_blendv_epi8(b_low, a_low, mcond_low);
auto res_hi = _mm256_blendv_epi8(b_high, a_high, mcond_high);

XSIMD_RETURN_MERGED_AVX(res_lo, res_hi);
#endif
Expand Down
2 changes: 1 addition & 1 deletion include/xsimd/types/xsimd_avx512_int32.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ namespace xsimd
// TODO Why not _mm512_reduce_add_...?
__m256i tmp1 = _mm512_extracti32x8_epi32(rhs, 0);
__m256i tmp2 = _mm512_extracti32x8_epi32(rhs, 1);
__m256i res1 = tmp1 + tmp2;
__m256i res1 = _mm256_add_epi32(tmp1, tmp2);
return xsimd::hadd(batch<int32_t, 8>(res1));
}

Expand Down
14 changes: 13 additions & 1 deletion include/xsimd/types/xsimd_avx512_int64.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -293,13 +293,25 @@ namespace xsimd
{
__m256i tmp1 = _mm512_extracti32x8_epi32(rhs, 0);
__m256i tmp2 = _mm512_extracti32x8_epi32(rhs, 1);
__m256i res1 = tmp1 + tmp2;
__m256i res1 = _mm256_add_epi64(tmp1, tmp2);
return xsimd::hadd(batch<int64_t, 4>(res1));
}

// Lane-wise selection on 64-bit integer lanes: yields the lane of a where the
// matching bit of cond is set, and the lane of b otherwise.
static batch_type select(const batch_bool_type& cond, const batch_type& a, const batch_type& b)
{
#if defined(_MSC_VER)
    // Turn the 8-bit mask into a vector mask (all ones in selected lanes),
    // then blend the two 256-bit halves separately and glue them back.
    __m512i lane_mask = _mm512_maskz_broadcastq_epi64((__mmask8)cond, _mm_set1_epi32(~0));
    XSIMD_SPLIT_AVX512(lane_mask);
    XSIMD_SPLIT_AVX512(a);
    XSIMD_SPLIT_AVX512(b);

    __m256i blended_low = _mm256_blendv_epi8(b_low, a_low, lane_mask_low);
    __m256i blended_high = _mm256_blendv_epi8(b_high, a_high, lane_mask_high);

    XSIMD_RETURN_MERGED_AVX(blended_low, blended_high);
#else
    return _mm512_mask_blend_epi64(cond, b, a);
#endif
}
};

Expand Down
20 changes: 18 additions & 2 deletions include/xsimd/types/xsimd_avx512_int_base.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,29 @@ namespace xsimd
{

// Splits a 512-bit integer vector into its two 256-bit halves.
// Declares two __m256i locals in the calling scope, <avx_name>_low (lower
// half) and <avx_name>_high (upper half). The argument is cast to __m512i
// first, so a wrapper type convertible to __m512i can be passed as well.
// The expansion has no trailing semicolon; the caller supplies it.
#define XSIMD_SPLIT_AVX512(avx_name)                                          \
    __m256i avx_name##_low = _mm512_castsi512_si256((__m512i)avx_name);       \
    __m256i avx_name##_high = _mm512_extracti64x4_epi64((__m512i)avx_name, 1)

// Single-precision counterpart of the integer split macro: declares two
// __m256 locals, <avx_name>_low and <avx_name>_high, holding the lower and
// upper 256-bit halves of avx_name (cast to __m512 first).
// The expansion has no trailing semicolon; the caller supplies it.
#define XSIMD_SPLITPS_AVX512(avx_name) \
__m256 avx_name##_low = _mm512_castps512_ps256((__m512)avx_name); \
__m256 avx_name##_high = _mm512_extractf32x8_ps((__m512)avx_name, 1) \

// Double-precision split: declares two __m256d locals, <avx_name>_low and
// <avx_name>_high, holding the lower and upper 256-bit halves of avx_name
// (cast to __m512d first).
// The expansion has no trailing semicolon; the caller supplies it.
#define XSIMD_SPLITPD_AVX512(avx_name) \
__m256d avx_name##_low = _mm512_castpd512_pd256((__m512d)avx_name); \
__m256d avx_name##_high = _mm512_extractf64x4_pd((__m512d)avx_name, 1) \

// Reassembles two 256-bit integer halves into one 512-bit vector and returns
// it from the enclosing function: res_low becomes the lower half, res_high is
// inserted as the upper half. Declares a local named `result` in the calling
// scope. The expansion has no trailing semicolon; the caller supplies it.
#define XSIMD_RETURN_MERGED_AVX(res_low, res_high) \
__m512i result = _mm512_castsi256_si512(res_low); \
return _mm512_inserti64x4(result, res_high, 1) \

// Single-precision merge: builds a __m512 whose lower half is res_low and
// whose upper half is res_high, and returns it from the enclosing function.
// Declares a local named `result` in the calling scope. The expansion has no
// trailing semicolon; the caller supplies it.
#define XSIMD_RETURN_MERGEDPS_AVX(res_low, res_high) \
__m512 result = _mm512_castps256_ps512(res_low); \
return _mm512_insertf32x8(result, res_high, 1) \

// Double-precision merge: builds a __m512d whose lower half is res_low and
// whose upper half is res_high, and returns it from the enclosing function.
// Declares a local named `result` in the calling scope. The expansion has no
// trailing semicolon; the caller supplies it.
#define XSIMD_RETURN_MERGEDPD_AVX(res_low, res_high) \
__m512d result = _mm512_castpd256_pd512(res_low); \
return _mm512_insertf64x4(result, res_high, 1) \

#define XSIMD_APPLY_AVX2_FUNCTION(N, func, avx_lhs, avx_rhs) \
XSIMD_SPLIT_AVX512(avx_lhs); \
XSIMD_SPLIT_AVX512(avx_rhs); \
Expand Down
10 changes: 10 additions & 0 deletions include/xsimd/types/xsimd_avx_conversion.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,17 @@ namespace xsimd
XSIMD_BATCH_CAST_INTRINSIC(int32_t, uint16_t, 8, _mm256_cvtepi32_epi16)
XSIMD_BATCH_CAST_INTRINSIC(uint32_t, int16_t, 8, _mm256_cvtepi32_epi16)
XSIMD_BATCH_CAST_INTRINSIC(uint32_t, uint16_t, 8, _mm256_cvtepi32_epi16)
#if defined(_MSC_VER)
namespace detail
{
    // Converts the eight unsigned 32-bit lanes of a to single precision.
    // The input is placed in the low half of a 512-bit register, converted
    // with the 512-bit instruction, and only the low 256 bits of the result
    // are kept.
    static inline __m256 xsimd_mm256_cvtepu32_ps(__m256i a)
    {
        const __m512i widened = _mm512_castsi256_si512(a);
        const __m512 converted = _mm512_cvtepu32_ps(widened);
        return _mm512_castps512_ps256(converted);
    }
}
XSIMD_BATCH_CAST_INTRINSIC(uint32_t, float, 8, detail::xsimd_mm256_cvtepu32_ps)
#else
XSIMD_BATCH_CAST_INTRINSIC(uint32_t, float, 8, _mm256_cvtepu32_ps)
#endif
XSIMD_BATCH_CAST_INTRINSIC(uint32_t, double, 4, _mm256_cvtepu32_pd)
XSIMD_BATCH_CAST_INTRINSIC(int64_t, int32_t, 4, _mm256_cvtepi64_epi32)
XSIMD_BATCH_CAST_INTRINSIC(int64_t, uint32_t, 4, _mm256_cvtepi64_epi32)
Expand Down
12 changes: 12 additions & 0 deletions include/xsimd/types/xsimd_sse_conversion.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,19 @@ namespace xsimd
XSIMD_BATCH_CAST_IMPLICIT(uint64_t, int64_t, 2)
XSIMD_BATCH_CAST_INTRINSIC(float, int32_t, 4, _mm_cvttps_epi32)
#if defined(XSIMD_AVX512VL_AVAILABLE)

#if defined(_MSC_VER)
namespace detail
{
    // Converts the four unsigned 32-bit lanes of a to single precision.
    // The input is placed in the low 128 bits of a 512-bit register,
    // converted with the 512-bit instruction, and only the low 128 bits of
    // the result are kept.
    static inline __m128 xsimd_mm_cvtepu32_ps(__m128i a)
    {
        const __m512i widened = _mm512_castsi128_si512(a);
        const __m512 converted = _mm512_cvtepu32_ps(widened);
        return _mm512_castps512_ps128(converted);
    }
}
XSIMD_BATCH_CAST_INTRINSIC(uint32_t, float, 4, detail::xsimd_mm_cvtepu32_ps)
#else
XSIMD_BATCH_CAST_INTRINSIC(uint32_t, float, 4, _mm_cvtepu32_ps)
#endif

XSIMD_BATCH_CAST_INTRINSIC(float, uint32_t, 4, _mm_cvttps_epu32)
#if defined(XSIMD_AVX512DQ_AVAILABLE)
XSIMD_BATCH_CAST_INTRINSIC(int64_t, double, 2, _mm_cvtepi64_pd)
Expand Down