Skip to content

Commit

Permalink
Fix dispatching mechanism
Browse files Browse the repository at this point in the history
Traverse required arch in the order provided by the user instead of
trying to guess the best one.

It is actually impossible to define the notion of a best architectures
as Intel instruction sets have a tree structure and not a linear
structure : there are multiple leaves and none of them can be considered
the "best".
  • Loading branch information
serge-sans-paille committed Dec 22, 2023
1 parent a48ab43 commit bc29d96
Show file tree
Hide file tree
Showing 6 changed files with 59 additions and 106 deletions.
9 changes: 3 additions & 6 deletions include/xsimd/config/xsimd_arch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -187,9 +187,6 @@ namespace xsimd
};
} // namespace detail

struct unsupported
{
};
using all_x86_architectures = arch_list<
avx512vnni<avx512vbmi>, avx512vbmi, avx512ifma, avx512pf, avx512vnni<avx512bw>, avx512bw, avx512er, avx512dq, avx512cd, avx512f,
avxvnni, fma3<avx2>, avx2, fma3<avx>, avx, fma4, fma3<sse4_2>,
Expand Down Expand Up @@ -221,7 +218,7 @@ namespace xsimd
class dispatcher
{

const unsigned best_arch_found;
const decltype(available_architectures()) availables_archs;
F functor;

template <class Arch, class... Tys>
Expand All @@ -234,15 +231,15 @@ namespace xsimd
template <class Arch, class ArchNext, class... Archs, class... Tys>
inline auto walk_archs(arch_list<Arch, ArchNext, Archs...>, Tys&&... args) noexcept -> decltype(functor(Arch {}, std::forward<Tys>(args)...))
{
if (Arch::version() <= best_arch_found)
if (availables_archs.has(Arch {}))
return functor(Arch {}, std::forward<Tys>(args)...);
else
return walk_archs(arch_list<ArchNext, Archs...> {}, std::forward<Tys>(args)...);
}

public:
inline dispatcher(F f) noexcept
: best_arch_found(available_architectures().best)
: availables_archs(available_architectures())
, functor(f)
{
}
Expand Down
139 changes: 48 additions & 91 deletions include/xsimd/config/xsimd_cpuid.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,65 +33,67 @@ namespace xsimd
{
struct supported_arch
{
unsigned sse2 : 1;
unsigned sse3 : 1;
unsigned ssse3 : 1;
unsigned sse4_1 : 1;
unsigned sse4_2 : 1;
unsigned sse4a : 1;
unsigned fma3_sse : 1;
unsigned fma4 : 1;
unsigned xop : 1;
unsigned avx : 1;
unsigned fma3_avx : 1;
unsigned avx2 : 1;
unsigned avxvnni : 1;
unsigned fma3_avx2 : 1;
unsigned avx512f : 1;
unsigned avx512cd : 1;
unsigned avx512dq : 1;
unsigned avx512bw : 1;
unsigned avx512er : 1;
unsigned avx512pf : 1;
unsigned avx512ifma : 1;
unsigned avx512vbmi : 1;
unsigned avx512vnni_bw : 1;
unsigned avx512vnni_vbmi : 1;
unsigned neon : 1;
unsigned neon64 : 1;
unsigned sve : 1;
unsigned rvv : 1;

// version number of the best arch available
unsigned best;

#define ARCH_FIELD_EX(arch, field_name) \
unsigned field_name; \
inline bool has(::xsimd::arch) const { return this->field_name; }
#define ARCH_FIELD(name) ARCH_FIELD_EX(name, name)

ARCH_FIELD(sse2)
ARCH_FIELD(sse3)

ARCH_FIELD(ssse3)
ARCH_FIELD(sse4_1)
ARCH_FIELD(sse4_2)
// ARCH_FIELD(sse4a)
ARCH_FIELD_EX(fma3<::xsimd::sse4_2>, fma3_sse42)
ARCH_FIELD(fma4)
// ARCH_FIELD(xop)
ARCH_FIELD(avx)
ARCH_FIELD_EX(fma3<::xsimd::avx>, fma3_avx)
ARCH_FIELD(avx2)
ARCH_FIELD(avxvnni)
ARCH_FIELD_EX(fma3<::xsimd::avx2>, fma3_avx2)
ARCH_FIELD(avx512f)
ARCH_FIELD(avx512cd)
ARCH_FIELD(avx512dq)
ARCH_FIELD(avx512bw)
ARCH_FIELD(avx512er)
ARCH_FIELD(avx512pf)
ARCH_FIELD(avx512ifma)
ARCH_FIELD(avx512vbmi)
ARCH_FIELD_EX(avx512vnni<::xsimd::avx512bw>, avx512vnni_bw)
ARCH_FIELD_EX(avx512vnni<::xsimd::avx512vbmi>, avx512vnni_vbmi)
ARCH_FIELD(neon)
ARCH_FIELD(neon64)
ARCH_FIELD(sve)
ARCH_FIELD(rvv)
ARCH_FIELD(wasm)

#undef ARCH_FIELD

inline supported_arch() noexcept
{
memset(this, 0, sizeof(supported_arch));

#if XSIMD_WITH_WASM
wasm = 1;
#endif

#if defined(__aarch64__) || defined(_M_ARM64)
neon = 1;
neon64 = 1;
best = neon64::version();
#elif defined(__ARM_NEON) || defined(_M_ARM)

#if defined(__linux__) && (!defined(__ANDROID_API__) || __ANDROID_API__ >= 18)
neon = bool(getauxval(AT_HWCAP) & HWCAP_NEON);
#else
// that's very conservative :-/
neon = 0;
#endif
neon64 = 0;
best = neon::version() * neon;

#elif defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS > 0

#if defined(__linux__) && (!defined(__ANDROID_API__) || __ANDROID_API__ >= 18)
sve = bool(getauxval(AT_HWCAP) & HWCAP_SVE);
#else
sve = 0;
#endif
best = sve::version() * sve;

#elif defined(__riscv_vector) && defined(__riscv_v_fixed_vlen) && __riscv_v_fixed_vlen > 0

Expand All @@ -100,11 +102,8 @@ namespace xsimd
#define HWCAP_V (1 << ('V' - 'A'))
#endif
rvv = bool(getauxval(AT_HWCAP) & HWCAP_V);
#else
rvv = 0;
#endif

best = ::xsimd::rvv::version() * rvv;
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_AMD64) || defined(_M_IX86)
auto get_cpuid = [](int reg[4], int level, int count = 0) noexcept
{
Expand All @@ -122,14 +121,12 @@ namespace xsimd
__asm__("xchg{l}\t{%%}ebx, %1\n\t"
"cpuid\n\t"
"xchg{l}\t{%%}ebx, %1\n\t"
: "=a"(reg[0]), "=r"(reg[1]), "=c"(reg[2]),
"=d"(reg[3])
: "=a"(reg[0]), "=r"(reg[1]), "=c"(reg[2]), "=d"(reg[3])
: "0"(level), "2"(count));

#else
__asm__("cpuid\n\t"
: "=a"(reg[0]), "=b"(reg[1]), "=c"(reg[2]),
"=d"(reg[3])
: "=a"(reg[0]), "=b"(reg[1]), "=c"(reg[2]), "=d"(reg[3])
: "0"(level), "2"(count));
#endif

Expand All @@ -143,87 +140,47 @@ namespace xsimd
get_cpuid(regs1, 0x1);

sse2 = regs1[3] >> 26 & 1;
best = std::max(best, sse2::version() * sse2);

sse3 = regs1[2] >> 0 & 1;
best = std::max(best, sse3::version() * sse3);

ssse3 = regs1[2] >> 9 & 1;
best = std::max(best, ssse3::version() * ssse3);

sse4_1 = regs1[2] >> 19 & 1;
best = std::max(best, sse4_1::version() * sse4_1);

sse4_2 = regs1[2] >> 20 & 1;
best = std::max(best, sse4_2::version() * sse4_2);

fma3_sse = regs1[2] >> 12 & 1;
if (sse4_2)
best = std::max(best, fma3<xsimd::sse4_2>::version() * fma3_sse);
fma3_sse42 = regs1[2] >> 12 & 1;

avx = regs1[2] >> 28 & 1;
best = std::max(best, avx::version() * avx);

fma3_avx = avx && fma3_sse;
best = std::max(best, fma3<xsimd::avx>::version() * fma3_avx);
fma3_avx = avx && fma3_sse42;

int regs8[4];
get_cpuid(regs8, 0x80000001);
fma4 = regs8[2] >> 16 & 1;
best = std::max(best, fma4::version() * fma4);

// sse4a = regs[2] >> 6 & 1;
// best = std::max(best, XSIMD_X86_AMD_SSE4A_VERSION * sse4a);

// xop = regs[2] >> 11 & 1;
// best = std::max(best, XSIMD_X86_AMD_XOP_VERSION * xop);

int regs7[4];
get_cpuid(regs7, 0x7);
avx2 = regs7[1] >> 5 & 1;
best = std::max(best, avx2::version() * avx2);

int regs7a[4];
get_cpuid(regs7a, 0x7, 0x1);
avxvnni = regs7a[0] >> 4 & 1;
best = std::max(best, avxvnni::version() * avxvnni * avx2);

fma3_avx2 = avx2 && fma3_sse;
best = std::max(best, fma3<xsimd::avx2>::version() * fma3_avx2);
fma3_avx2 = avx2 && fma3_sse42;

avx512f = regs7[1] >> 16 & 1;
best = std::max(best, avx512f::version() * avx512f);

avx512cd = regs7[1] >> 28 & 1;
best = std::max(best, avx512cd::version() * avx512cd * avx512f);

avx512dq = regs7[1] >> 17 & 1;
best = std::max(best, avx512dq::version() * avx512dq * avx512cd * avx512f);

avx512bw = regs7[1] >> 30 & 1;
best = std::max(best, avx512bw::version() * avx512bw * avx512dq * avx512cd * avx512f);

avx512er = regs7[1] >> 27 & 1;
best = std::max(best, avx512er::version() * avx512er * avx512cd * avx512f);

avx512pf = regs7[1] >> 26 & 1;
best = std::max(best, avx512pf::version() * avx512pf * avx512er * avx512cd * avx512f);

avx512ifma = regs7[1] >> 21 & 1;
best = std::max(best, avx512ifma::version() * avx512ifma * avx512bw * avx512dq * avx512cd * avx512f);

avx512vbmi = regs7[2] >> 1 & 1;
best = std::max(best, avx512vbmi::version() * avx512vbmi * avx512ifma * avx512bw * avx512dq * avx512cd * avx512f);

avx512vnni_bw = regs7[2] >> 11 & 1;
best = std::max(best, avx512vnni<xsimd::avx512bw>::version() * avx512vnni_bw * avx512bw * avx512dq * avx512cd * avx512f);

avx512vnni_vbmi = avx512vbmi && avx512vnni_bw;
best = std::max(best, avx512vnni<xsimd::avx512vbmi>::version() * avx512vnni_vbmi);
#endif
}
};
}
} // namespace detail

inline detail::supported_arch available_architectures() noexcept
{
Expand Down
4 changes: 4 additions & 0 deletions include/xsimd/types/xsimd_generic_arch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@ namespace xsimd
protected:
static constexpr unsigned version(unsigned major, unsigned minor, unsigned patch, unsigned multiplier = 100u) noexcept { return major * multiplier * multiplier + minor * multiplier + patch; }
};

struct unsupported
{
};
}

#endif
2 changes: 2 additions & 0 deletions include/xsimd/types/xsimd_rvv_register.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,8 @@ namespace xsimd
using type = detail::rvv_bool_simd_register<T>;
};
} // namespace types
#else
using rvv = detail::rvv<0xFFFFFFFF>;
#endif
} // namespace xsimd

Expand Down
2 changes: 2 additions & 0 deletions include/xsimd/types/xsimd_sve_register.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,8 @@ namespace xsimd
using type = detail::sve_bool_simd_register;
};
} // namespace types
#else
using sve = detail::sve<0xFFFFFFFF>;
#endif
} // namespace xsimd

Expand Down
9 changes: 0 additions & 9 deletions test/test_arch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -124,15 +124,6 @@ TEST_CASE("[multi arch support]")
float res = dispatched(data, 17);
CHECK_EQ(ref, res);
}

// check that we pick the most appropriate version
{
auto dispatched = xsimd::dispatch<xsimd::arch_list<xsimd::sse3, xsimd::sse2, xsimd::generic>>(get_arch_version {});
unsigned expected = xsimd::available_architectures().best >= xsimd::sse3::version()
? xsimd::sse3::version()
: xsimd::sse2::version();
CHECK_EQ(expected, dispatched());
}
#endif
}

Expand Down

0 comments on commit bc29d96

Please sign in to comment.