xtensor-stack · serge-sans-paille · Apr 27, 2026 · Apr 26, 2026 · Apr 26, 2026
diff --git a/.github/workflows/emulated.yml b/.github/workflows/emulated.yml
@@ -35,7 +35,8 @@ jobs:
           -DCMAKE_BUILD_TYPE=Release \
           -DCMAKE_CXX_COMPILER=${{ matrix.sys.compiler }} \
           -DXSIMD_ENABLE_WERROR=ON \
-          -DCMAKE_CXX_FLAGS="-DXSIMD_DEFAULT_ARCH=emulated\<${{ matrix.sys.size }}\> -DXSIMD_WITH_EMULATED=1 $CXXFLAGS" \
+          -DTARGET_ARCH="emulated<${{ matrix.sys.size }}>" \
+          -DCMAKE_CXX_FLAGS="${CXXFLAGS}" \
           -GNinja
     - name: Build
       run: ninja -C _build

diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp
@@ -2588,30 +2588,50 @@ namespace xsimd
                                             I16 / 2, I18 / 2, I20 / 2, I22 / 2, I24 / 2, I26 / 2, I28 / 2, I30 / 2>;
             };
 
+            // Compile-time check that Is... is the swizzle mask {1, 1, 0, 1, 0, 1, ..., 0, 1}
+            // holding exactly one index per lane of batch<uint16_t, A>.
+            template <class A, uint16_t... Is>
+            constexpr bool is_reduce_pattern()
+            {
+                if (sizeof...(Is) != batch<uint16_t, A>::size)
+                    return false;
+                const uint16_t indices[] = { Is... };
+                // Lane 0 must hold 1; every later lane k must hold the parity of k.
+                bool matches = indices[0] == 1;
+                for (size_t k = 1; matches && k < sizeof...(Is); ++k)
+                {
+                    matches = indices[k] == (k & 1);
+                }
+                return matches;
+            }
         }
 
-        template <class A, uint16_t... Idx, class = std::enable_if_t<detail::is_pair_of_contiguous_indices<uint16_t, A, Idx...>::value>>
-        XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, Idx...>, requires_arch<avx512f>) noexcept
-        {
-            constexpr typename detail::fold_batch_constant<A, Idx...>::type mask32;
-            return _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self);
-        }
-
-        template <class A>
-        XSIMD_INLINE batch<uint16_t, A>
-        swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, (uint16_t)1, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1>, requires_arch<avx512f>) noexcept
+        template <class A, uint16_t... Idx>
+        XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, Idx...> mask, requires_arch<avx512f>) noexcept
         {
-            // FIXME: this sequence is very inefficient, but it's here to catch
-            // a pattern generated by detail::reduce from xsimd_common_math.hpp.
-            // The whole pattern is actually decently folded by GCC and Clang,
-            // so bare with it.
-            constexpr batch_constant<uint32_t, A, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0> mask32;
-            auto tmp = _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self);
+            XSIMD_IF_CONSTEXPR(detail::is_pair_of_contiguous_indices<uint16_t, A, Idx...>::value)
+            {
+                constexpr typename detail::fold_batch_constant<A, Idx...>::type mask32;
+                return _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self);
+            }
+            else XSIMD_IF_CONSTEXPR(detail::is_reduce_pattern<A, Idx...>())
+            {
+                // FIXME: this sequence is very inefficient, but it's here to catch
+                // a pattern generated by detail::reduce from xsimd_common_math.hpp.
+                // The whole pattern is actually decently folded by GCC and Clang,
+                // so bear with it.
+                constexpr batch_constant<uint32_t, A, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0> mask32;
+                auto tmp = _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self); // all-zero index: every 32-bit lane <- lane 0, i.e. uint16 lanes {0, 1, 0, 1, ...}
 
-            alignas(A::alignment()) uint16_t buffer[32];
-            _mm512_store_si512((__m512i*)&buffer[0], tmp);
-            buffer[0] = buffer[1];
-            return _mm512_load_si512(&buffer[0]);
+                alignas(A::alignment()) uint16_t buffer[32];
+                _mm512_store_si512((__m512i*)&buffer[0], tmp);
+                buffer[0] = buffer[1]; // patch lane 0 so the result is {1, 1, 0, 1, ...}
+                return _mm512_load_si512(&buffer[0]);
+            }
+            else
+            {
+                return swizzle(self, mask, common {}); // neither special case applies: defer to the overload tagged `common`
+            }
         }
 
         template <class A, uint16_t... Vs>

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
@@ -43,6 +43,13 @@ OPTION(CROSS_COMPILE_ARM "cross compile for ARM targets" OFF)
 # Note: to compile on ARM (or cross compile), you may need to add the following:
 # -DTARGET_ARCH="armv8-a -mfpu=neon -mfloat-abi=softfp -target arm-linux-gnueabi"
 set(TARGET_ARCH "native" CACHE STRING "Target architecture arguments")
+# A TARGET_ARCH of the form emulated<N> is forwarded as XSIMD_DEFAULT_ARCH rather than as -march.
+string(REGEX MATCH "emulated\\<[0-9]+\\>" TARGET_EMULATED "${TARGET_ARCH}")
+if (TARGET_EMULATED)
+    message(STATUS "Using emulated target: ${TARGET_EMULATED}")
+    set(EMULATED_COMPILE_FLAGS -DXSIMD_DEFAULT_ARCH=${TARGET_ARCH};-DXSIMD_WITH_EMULATED=1)
+    set(TARGET_ARCH "") # shadow the cache entry instead of unsetting it, so a re-configure still sees emulated<N>
+endif()
 
 if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Intel")
     if (NOT WIN32 AND NOT ANDROID)
@@ -111,7 +118,7 @@ if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU"
     elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z14 -mzvector")
     elseif(NOT WIN32 AND NOT EMSCRIPTEN)
-        if(NOT CMAKE_CXX_FLAGS MATCHES "-march" AND NOT CMAKE_CXX_FLAGS MATCHES "-arch" AND NOT CMAKE_OSX_ARCHITECTURES)
+        if(TARGET_ARCH AND NOT CMAKE_CXX_FLAGS MATCHES "-march" AND NOT CMAKE_CXX_FLAGS MATCHES "-arch" AND NOT CMAKE_OSX_ARCHITECTURES)
             set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=${TARGET_ARCH}")
         endif()
     endif()
@@ -227,6 +234,11 @@ endif()
 add_subdirectory(doc)
 add_subdirectory(architectures)
 
+if(EMULATED_COMPILE_FLAGS)
+    message(STATUS "Emulated compile flags: ${EMULATED_COMPILE_FLAGS}") # quoted: a bare list is concatenated without separators
+    target_compile_options(test_xsimd PRIVATE ${EMULATED_COMPILE_FLAGS})
+endif()
+
 if(EMSCRIPTEN)
     set_target_properties(test_xsimd PROPERTIES LINK_FLAGS "-s MODULARIZE=1 -s EXPORT_NAME=test_xsimd_wasm -s WASM=1 -s ALLOW_MEMORY_GROWTH=1 -lembind")
     target_compile_options(test_xsimd