/* setcmploop-x86_64-ifunc.c */
#include "rpmsetcmp-common.h"
#pragma GCC visibility push(hidden)
int setcmploop_sse2(const uint32_t *Pv, size_t Pn, const uint32_t *Rv, size_t Rn);
int setcmploop_avx2(const uint32_t *Pv, size_t Pn, const uint32_t *Rv, size_t Rn);
int setcmploop_cmov(const uint32_t *Pv, size_t Pn, const uint32_t *Rv, size_t Rn);
#pragma GCC visibility pop
typedef int (*FuncPtr)(const uint32_t *Pv, size_t Pn, const uint32_t *Rv, size_t Rn);
/*
 * Resolve the best setcmploop implementation for the host CPU.
 * Runs once at load time via the GNU ifunc mechanism; the returned
 * pointer is bound into the GOT so later calls dispatch directly.
 *
 * NOTE(review): an ifunc resolver may run before ordinary relocations
 * are processed, which is why it relies only on compiler builtins and
 * the hidden-visibility implementations declared above.
 */
static FuncPtr setcmploop_ifunc(void)
{
	/* Resolvers can run before IFUNC/IRELATIVE ctors; initialize the
	 * CPU feature detection state explicitly. */
	__builtin_cpu_init();
	if (__builtin_cpu_supports("avx2")) {
		// Slow AVX loads on Excavator.
		if (__builtin_cpu_is("bdver4"))
			return setcmploop_sse2;
		// Slow vzeroupper on KNL.
		if (__builtin_cpu_supports("avx512er"))
			return setcmploop_cmov; // slow tzcnt/bsf
		return setcmploop_avx2;
	}
	if (__builtin_cpu_is("intel")) {
		// We use "corei7" as a synonym for "modern big core".
		if (__builtin_cpu_is("corei7"))
			return setcmploop_sse2;
		// On small cores such as Goldmont, tzcnt/bsf is slow.
		return setcmploop_cmov;
	}
	// On older AMD cores, the sse2 version is slower.
	if (__builtin_cpu_supports("sse4.1"))
		return setcmploop_sse2;
	return setcmploop_cmov;
}
/* Public entry point. The GNU ifunc attribute makes the dynamic linker
 * call setcmploop_ifunc() once at load time and bind this symbol to
 * whichever implementation it returns, so there is no per-call dispatch
 * overhead. */
int setcmploop(const uint32_t *Pv, size_t Pn, const uint32_t *Rv, size_t Rn)
__attribute__((ifunc("setcmploop_ifunc")));