Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/workflows/emulated.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ jobs:
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_CXX_COMPILER=${{ matrix.sys.compiler }} \
-DXSIMD_ENABLE_WERROR=ON \
-DCMAKE_CXX_FLAGS="-DXSIMD_DEFAULT_ARCH=emulated\<${{ matrix.sys.size }}\> -DXSIMD_WITH_EMULATED=1 $CXXFLAGS" \
-DTARGET_ARCH="emulated<${{ matrix.sys.size }}>" \
-DCMAKE_CXX_FLAGS="${CXXFLAGS}" \
-GNinja
- name: Build
run: ninja -C _build
Expand Down
60 changes: 40 additions & 20 deletions include/xsimd/arch/xsimd_avx512f.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2588,30 +2588,50 @@ namespace xsimd
I16 / 2, I18 / 2, I20 / 2, I22 / 2, I24 / 2, I26 / 2, I28 / 2, I30 / 2>;
};

template <class A, uint16_t... Is>
constexpr bool is_reduce_pattern()
{
    // Accept exactly one index sequence: {1, 1, 0, 1, 0, 1, ..., 0, 1},
    // i.e. a leading 1 followed by the parity of each position, with as
    // many entries as batch<uint16_t, A> has elements.
    constexpr size_t count = sizeof...(Is);
    if (count != batch<uint16_t, A>::size)
    {
        return false;
    }

    const uint16_t indices[] = { Is... };
    size_t pos = 0;
    while (pos < count)
    {
        // Slot 0 is the sole exception to the parity rule: it must hold 1.
        const size_t expected = (pos == 0) ? 1 : (pos & 1);
        if (indices[pos] != expected)
        {
            return false;
        }
        ++pos;
    }
    return true;
}
}

template <class A, uint16_t... Idx, class = std::enable_if_t<detail::is_pair_of_contiguous_indices<uint16_t, A, Idx...>::value>>
XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, Idx...>, requires_arch<avx512f>) noexcept
{
constexpr typename detail::fold_batch_constant<A, Idx...>::type mask32;
return _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self);
}

template <class A>
XSIMD_INLINE batch<uint16_t, A>
swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, (uint16_t)1, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1>, requires_arch<avx512f>) noexcept
template <class A, uint16_t... Idx>
XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, Idx...> mask, requires_arch<avx512f>) noexcept
{
// FIXME: this sequence is very inefficient, but it's here to catch
// a pattern generated by detail::reduce from xsimd_common_math.hpp.
// The whole pattern is actually decently folded by GCC and Clang,
// so bear with it.
constexpr batch_constant<uint32_t, A, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0> mask32;
auto tmp = _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self);
XSIMD_IF_CONSTEXPR(detail::is_pair_of_contiguous_indices<uint16_t, A, Idx...>::value)
{
constexpr typename detail::fold_batch_constant<A, Idx...>::type mask32;
return _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self);
}
else XSIMD_IF_CONSTEXPR(detail::is_reduce_pattern<A, Idx...>())
{
// FIXME: this sequence is very inefficient, but it's here to catch
// a pattern generated by detail::reduce from xsimd_common_math.hpp.
// The whole pattern is actually decently folded by GCC and Clang,
// so bear with it.
constexpr batch_constant<uint32_t, A, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0> mask32;
auto tmp = _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self);

alignas(A::alignment()) uint16_t buffer[32];
_mm512_store_si512((__m512i*)&buffer[0], tmp);
buffer[0] = buffer[1];
return _mm512_load_si512(&buffer[0]);
alignas(A::alignment()) uint16_t buffer[32];
_mm512_store_si512((__m512i*)&buffer[0], tmp);
buffer[0] = buffer[1];
return _mm512_load_si512(&buffer[0]);
}
else
{
return swizzle(self, mask, common {});
}
}

template <class A, uint16_t... Vs>
Expand Down
14 changes: 13 additions & 1 deletion test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,13 @@ OPTION(CROSS_COMPILE_ARM "cross compile for ARM targets" OFF)
# Note: to compile on ARM (or cross compile), you may need to add the following:
# -DTARGET_ARCH="armv8-a -mfpu=neon -mfloat-abi=softfp -target arm-linux-gnueabi"
set(TARGET_ARCH "native" CACHE STRING "Target architecture arguments")
string(REGEX MATCH "emulated\\<[0-9]+\\>" TARGET_EMULATED ${TARGET_ARCH})

if (TARGET_EMULATED)
message(STATUS "Using emulated target: ${TARGET_EMULATED}")
set(EMULATED_COMPILE_FLAGS -DXSIMD_DEFAULT_ARCH=${TARGET_ARCH};-DXSIMD_WITH_EMULATED=1)
unset(TARGET_ARCH CACHE)
endif()

if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Intel")
if (NOT WIN32 AND NOT ANDROID)
Expand Down Expand Up @@ -111,7 +118,7 @@ if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU"
elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z14 -mzvector")
elseif(NOT WIN32 AND NOT EMSCRIPTEN)
if(NOT CMAKE_CXX_FLAGS MATCHES "-march" AND NOT CMAKE_CXX_FLAGS MATCHES "-arch" AND NOT CMAKE_OSX_ARCHITECTURES)
if(TARGET_ARCH AND NOT CMAKE_CXX_FLAGS MATCHES "-march" AND NOT CMAKE_CXX_FLAGS MATCHES "-arch" AND NOT CMAKE_OSX_ARCHITECTURES)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=${TARGET_ARCH}")
endif()
endif()
Expand Down Expand Up @@ -227,6 +234,11 @@ endif()
add_subdirectory(doc)
add_subdirectory(architectures)

if(EMULATED_COMPILE_FLAGS)
message(STATUS ${EMULATED_COMPILE_FLAGS})
target_compile_options(test_xsimd PRIVATE ${EMULATED_COMPILE_FLAGS})
endif()

if(EMSCRIPTEN)
set_target_properties(test_xsimd PROPERTIES LINK_FLAGS "-s MODULARIZE=1 -s EXPORT_NAME=test_xsimd_wasm -s WASM=1 -s ALLOW_MEMORY_GROWTH=1 -lembind")
target_compile_options(test_xsimd
Expand Down
Loading