diff --git a/.github/workflows/emulated.yml b/.github/workflows/emulated.yml index 272ec2611..bc4781cbb 100644 --- a/.github/workflows/emulated.yml +++ b/.github/workflows/emulated.yml @@ -35,7 +35,8 @@ jobs: -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=${{ matrix.sys.compiler }} \ -DXSIMD_ENABLE_WERROR=ON \ - -DCMAKE_CXX_FLAGS="-DXSIMD_DEFAULT_ARCH=emulated\<${{ matrix.sys.size }}\> -DXSIMD_WITH_EMULATED=1 $CXXFLAGS" \ + -DTARGET_ARCH="emulated<${{ matrix.sys.size }}>" \ + -DCMAKE_CXX_FLAGS="${CXXFLAGS}" \ -GNinja - name: Build run: ninja -C _build diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp index 97427f194..79b1e0398 100644 --- a/include/xsimd/arch/xsimd_avx512f.hpp +++ b/include/xsimd/arch/xsimd_avx512f.hpp @@ -2588,30 +2588,50 @@ namespace xsimd I16 / 2, I18 / 2, I20 / 2, I22 / 2, I24 / 2, I26 / 2, I28 / 2, I30 / 2>; }; + template + constexpr bool is_reduce_pattern() + { + // The actual pattern is {1, 1, 0, 1, 0, 1, ..., 0, 1} + if (sizeof...(Is) != batch::size) + return false; + uint16_t pattern[] = { Is... }; + if (pattern[0] != 1) + return false; + for (size_t i = 1; i < sizeof...(Is); i += 1) + { + if (pattern[i] != (i & 1)) + return false; + } + return true; + } } - template ::value>> - XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept - { - constexpr typename detail::fold_batch_constant::type mask32; - return _mm512_permutexvar_epi32(static_cast>(mask32), self); - } - - template - XSIMD_INLINE batch - swizzle(batch const& self, batch_constant, requires_arch) noexcept + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { - // FIXME: this sequence is very inefficient, but it's here to catch - // a pattern generated by detail::reduce from xsimd_common_math.hpp. - // The whole pattern is actually decently folded by GCC and Clang, - // so bare with it. - constexpr batch_constant mask32; - auto tmp = _mm512_permutexvar_epi32(static_cast>(mask32), self); + XSIMD_IF_CONSTEXPR(detail::is_pair_of_contiguous_indices::value) + { + constexpr typename detail::fold_batch_constant::type mask32; + return _mm512_permutexvar_epi32(static_cast>(mask32), self); + } + else XSIMD_IF_CONSTEXPR(detail::is_reduce_pattern()) + { + // FIXME: this sequence is very inefficient, but it's here to catch + // a pattern generated by detail::reduce from xsimd_common_math.hpp. + // The whole pattern is actually decently folded by GCC and Clang, + // so bare with it. + constexpr batch_constant mask32; + auto tmp = _mm512_permutexvar_epi32(static_cast>(mask32), self); - alignas(A::alignment()) uint16_t buffer[32]; - _mm512_store_si512((__m512i*)&buffer[0], tmp); - buffer[0] = buffer[1]; - return _mm512_load_si512(&buffer[0]); + alignas(A::alignment()) uint16_t buffer[32]; + _mm512_store_si512((__m512i*)&buffer[0], tmp); + buffer[0] = buffer[1]; + return _mm512_load_si512(&buffer[0]); + } + else + { + return swizzle(self, mask, common {}); + } } template diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index feb0e6edb..662dcdc3f 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -43,6 +43,13 @@ OPTION(CROSS_COMPILE_ARM "cross compile for ARM targets" OFF) # Note: to compile on ARM (or cross compile), you may need to add the following: # -DTARGET_ARCH="armv8-a -mfpu=neon -mfloat-abi=softfp -target arm-linux-gnueabi" set(TARGET_ARCH "native" CACHE STRING "Target architecture arguments") +string(REGEX MATCH "emulated\\<[0-9]+\\>" TARGET_EMULATED ${TARGET_ARCH}) + +if (TARGET_EMULATED) + message(STATUS "Using emulated target: ${TARGET_EMULATED}") + set(EMULATED_COMPILE_FLAGS -DXSIMD_DEFAULT_ARCH=${TARGET_ARCH};-DXSIMD_WITH_EMULATED=1) + unset(TARGET_ARCH CACHE) +endif() if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Intel") if (NOT WIN32 AND NOT ANDROID) @@ -111,7 +118,7 @@ if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU" elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z14 -mzvector") elseif(NOT WIN32 AND NOT EMSCRIPTEN) - if(NOT CMAKE_CXX_FLAGS MATCHES "-march" AND NOT CMAKE_CXX_FLAGS MATCHES "-arch" AND NOT CMAKE_OSX_ARCHITECTURES) + if(TARGET_ARCH AND NOT CMAKE_CXX_FLAGS MATCHES "-march" AND NOT CMAKE_CXX_FLAGS MATCHES "-arch" AND NOT CMAKE_OSX_ARCHITECTURES) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=${TARGET_ARCH}") endif() endif() @@ -227,6 +234,11 @@ endif() add_subdirectory(doc) add_subdirectory(architectures) +if(EMULATED_COMPILE_FLAGS) + message(STATUS ${EMULATED_COMPILE_FLAGS}) + target_compile_options(test_xsimd PRIVATE ${EMULATED_COMPILE_FLAGS}) +endif() + if(EMSCRIPTEN) set_target_properties(test_xsimd PROPERTIES LINK_FLAGS "-s MODULARIZE=1 -s EXPORT_NAME=test_xsimd_wasm -s WASM=1 -s ALLOW_MEMORY_GROWTH=1 -lembind") target_compile_options(test_xsimd