Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

resample: port resample_neon.h to aarch64 #8

Closed
wants to merge 1 commit into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
169 changes: 154 additions & 15 deletions libspeexdsp/resample_neon.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,14 +36,24 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include <arm_neon.h>

#ifdef FIXED_POINT
#ifdef __thumb2__
#if defined(__aarch64__)
static inline int32_t saturate_32bit_to_16bit(int32_t a) {
int32_t ret;
asm ("fmov s0, %w[a]\n"
"sqxtn h0, s0\n"
"sxtl v0.4s, v0.4h\n"
"fmov %w[ret], s0\n"
: [ret] "=r" (ret)
: [a] "r" (a)
: "v0" );
return ret;
}
#elif defined(__thumb2__)
static inline int32_t saturate_32bit_to_16bit(int32_t a) {
int32_t ret;
asm ("ssat %[ret], #16, %[a]"
: [ret] "=&r" (ret)
: [ret] "=r" (ret)
: [a] "r" (a)
: );
return ret;
Expand All @@ -54,7 +64,7 @@ static inline int32_t saturate_32bit_to_16bit(int32_t a) {
asm ("vmov.s32 d0[0], %[a]\n"
"vqmovn.s32 d0, q0\n"
"vmov.s16 %[ret], d0[0]\n"
: [ret] "=&r" (ret)
: [ret] "=r" (ret)
: [a] "r" (a)
: "q0");
return ret;
Expand All @@ -64,7 +74,63 @@ static inline int32_t saturate_32bit_to_16bit(int32_t a) {
#define WORD2INT(x) (saturate_32bit_to_16bit(x))

#define OVERRIDE_INNER_PRODUCT_SINGLE
/* Only works when len % 4 == 0 */
/* Only works when len % 4 == 0 and len >= 4 */
#if defined(__aarch64__)
static inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
{
int32_t ret;
uint32_t remainder = len % 16;
len = len - remainder;

asm volatile (" cmp %w[len], #0\n"
" b.ne 1f\n"
" ld1 {v16.4h}, [%[b]], #8\n"
" ld1 {v20.4h}, [%[a]], #8\n"
" subs %w[remainder], %w[remainder], #4\n"
" smull v0.4s, v16.4h, v20.4h\n"
" b.ne 4f\n"
" b 5f\n"
"1:"
" ld1 {v16.4h, v17.4h, v18.4h, v19.4h}, [%[b]], #32\n"
" ld1 {v20.4h, v21.4h, v22.4h, v23.4h}, [%[a]], #32\n"
" subs %w[len], %w[len], #16\n"
" smull v0.4s, v16.4h, v20.4h\n"
" smlal v0.4s, v17.4h, v21.4h\n"
" smlal v0.4s, v18.4h, v22.4h\n"
" smlal v0.4s, v19.4h, v23.4h\n"
" b.eq 3f\n"
"2:"
" ld1 {v16.4h, v17.4h, v18.4h, v19.4h}, [%[b]], #32\n"
" ld1 {v20.4h, v21.4h, v22.4h, v23.4h}, [%[a]], #32\n"
" subs %w[len], %w[len], #16\n"
" smlal v0.4s, v16.4h, v20.4h\n"
" smlal v0.4s, v17.4h, v21.4h\n"
" smlal v0.4s, v18.4h, v22.4h\n"
" smlal v0.4s, v19.4h, v23.4h\n"
" b.ne 2b\n"
"3:"
" cmp %w[remainder], #0\n"
" b.eq 5f\n"
"4:"
" ld1 {v18.4h}, [%[b]], #8\n"
" ld1 {v22.4h}, [%[a]], #8\n"
" subs %w[remainder], %w[remainder], #4\n"
" smlal v0.4s, v18.4h, v22.4h\n"
" b.ne 4b\n"
"5:"
" saddlv d0, v0.4s\n"
" sqxtn s0, d0\n"
" sqrshrn h0, s0, #15\n"
" sxtl v0.4s, v0.4h\n"
" fmov %w[ret], s0\n"
: [ret] "=r" (ret), [a] "+r" (a), [b] "+r" (b),
[len] "+r" (len), [remainder] "+r" (remainder)
:
: "cc", "v0",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
return ret;
}
#else
static inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
{
int32_t ret;
Expand Down Expand Up @@ -112,33 +178,105 @@ static inline int32_t inner_product_single(const int16_t *a, const int16_t *b, u
" vqmovn.s64 d0, q0\n"
" vqrshrn.s32 d0, q0, #15\n"
" vmov.s16 %[ret], d0[0]\n"
: [ret] "=&r" (ret), [a] "+r" (a), [b] "+r" (b),
: [ret] "=r" (ret), [a] "+r" (a), [b] "+r" (b),
[len] "+r" (len), [remainder] "+r" (remainder)
:
: "cc", "q0",
"d16", "d17", "d18", "d19",
"d20", "d21", "d22", "d23");
"d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23");

return ret;
}
#elif defined(FLOATING_POINT)
#endif // !defined(__aarch64__)

#elif defined(FLOATING_POINT)
#if defined(__aarch64__)
static inline int32_t saturate_float_to_16bit(float a) {
Copy link
Member

@tmatth tmatth Aug 9, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's also a mismatch on negative ties (e.g. -0.5, -1.5, etc.)...the existing ARM and x86 code rounds toward 0 (0.5 -> 0, -0.5 -> 0, -1.5 -> -1) whereas this version rounds toward negative infinity (0.5 -> 0, -0.5 -> -1, -1.5 -> -2).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The ARM 32 bit code does ties 'away' from zero, mimicking this code:

int16_t float_to_short(float a) {
a += (a >= 0) ? 0.5f : -0.5f;
return (int16_t)(max(-32768, min(32767, a)));
}
so 0.5 rounds away to 1.0 and -0.5 rounds away from 0 to -1.0

The aarch64 fcvtas instruction implements the same rounding as the ARM version.

The x86 instruction cvtss2si implements rounding ties to 'even', which is toward 0 for even integers and away from 0 for odd integers. e.g. -0.5 rounds to 0. -1.5 rounds to -2.0.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, I will try and do some more testing for this over the weekend, thanks for the info.

int32_t ret;
asm ("fmov s0, %w[a]\n"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm getting an assembler error with gcc 4.9 here:
Error: operand 2 should be an integer register -- fmov s0,v1
Why not:

asm ("fcvtas s0, %s[a]\n"
     "sqxtn h0, s0\n"
     "sxtl v0.4s, v0.4h\n"
     "fmov %w[ret], s0\n"
     : [ret] "=r" (ret)
     : [a] "w" (a)
     : "v0");

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On Mon, Aug 8, 2016 at 8:26 PM, Tristan Matthews notifications@github.com
wrote:

In libspeexdsp/resample_neon.h
#8 (comment):

+#elif defined(FLOATING_POINT)
+#if defined(aarch64)
+static inline int32_t saturate_float_to_16bit(float a) {

  • int32_t ret;
  • asm ("fmov s0, %w[a]\n"

I'm getting an assembler error with gcc 4.9 here:
Error: operand 2 should be an integer register -- fmov s0,v1

doh! My bad. I've reproduced the error with gcc 4.9. It was supposed to
be:

asm ("fmov s0, *%s[a]\*n"

..

Why not:

asm ("fcvtas s0, %s[a]\n" "sqxtn h0, s0\n" "sxtl v0.4s, v0.4h\n" "fmov %w[ret], s0\n" : [ret] "=r" (ret) : [a] "w" (a) : "v0");

at the time I had trouble with clang... it said it couldnt allocate a
register. perhaps because the float is in s0, and I used s0. Reguardless,
I'm able to build your code with clang 3.8 so should be fine.
Changing it to use v1 saves a mov

     asm ("fcvtas s1, %s[a]\n"
     "sqxtn h1, s1\n"
     "sxtl v1.4s, v1.4h\n"
     "fmov %w[ret], s1\n"
     : [ret] "=r" (ret)
     : [a] "w" (a)
     : "v1");

0: 5e21c801 fcvtas s1, s0
4: 5e614821 sqxtn h1, s1
8: 0f10a421 sxtl v1.4s, v1.4h
c: 1e260020 fmov w0, s1
10: d65f03c0 ret

You are receiving this because you authored the thread.
Reply to this email directly, view it on GitHub
https://github.com/xiph/speexdsp/pull/8/files/7c20961d4688ca1a486557d88fefcfe4f6c11412#r73993202,
or mute the thread
https://github.com/notifications/unsubscribe-auth/AOOYDR-ch7wMFbOcSTADN7F1NWm2nvKeks5qd_ODgaJpZM4JSTPL
.

"fcvtas s0, s0\n"
"sqxtn h0, s0\n"
"sxtl v0.4s, v0.4h\n"
"fmov %w[ret], s0\n"
: [ret] "=r" (ret)
: [a] "w" (a)
: "v0");
return ret;
}
#else
static inline int32_t saturate_float_to_16bit(float a) {
int32_t ret;
asm ("vmov.f32 d0[0], %[a]\n"
"vcvt.s32.f32 d0, d0, #15\n"
"vqrshrn.s32 d0, q0, #15\n"
"vmov.s16 %[ret], d0[0]\n"
: [ret] "=&r" (ret)
: [ret] "=r" (ret)
: [a] "r" (a)
: "q0");
return ret;
}
#endif

#undef WORD2INT
#define WORD2INT(x) (saturate_float_to_16bit(x))

#define OVERRIDE_INNER_PRODUCT_SINGLE
/* Only works when len % 4 == 0 */
/* Only works when len % 4 == 0 and len >= 4 */
#if defined(__aarch64__)
static inline float inner_product_single(const float *a, const float *b, unsigned int len)
{
float ret;
uint32_t remainder = len % 16;
len = len - remainder;

asm volatile (" cmp %w[len], #0\n"
" b.ne 1f\n"
" ld1 {v16.4s}, [%[b]], #16\n"
" ld1 {v20.4s}, [%[a]], #16\n"
" subs %w[remainder], %w[remainder], #4\n"
" fmul v1.4s, v16.4s, v20.4s\n"
" b.ne 4f\n"
" b 5f\n"
"1:"
" ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%[b]], #64\n"
" ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [%[a]], #64\n"
" subs %w[len], %w[len], #16\n"
" fmul v1.4s, v16.4s, v20.4s\n"
" fmul v2.4s, v17.4s, v21.4s\n"
" fmul v3.4s, v18.4s, v22.4s\n"
" fmul v4.4s, v19.4s, v23.4s\n"
" b.eq 3f\n"
"2:"
" ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%[b]], #64\n"
" ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [%[a]], #64\n"
" subs %w[len], %w[len], #16\n"
" fmla v1.4s, v16.4s, v20.4s\n"
" fmla v2.4s, v17.4s, v21.4s\n"
" fmla v3.4s, v18.4s, v22.4s\n"
" fmla v4.4s, v19.4s, v23.4s\n"
" b.ne 2b\n"
"3:"
" fadd v16.4s, v1.4s, v2.4s\n"
" fadd v17.4s, v3.4s, v4.4s\n"
" cmp %w[remainder], #0\n"
" fadd v1.4s, v16.4s, v17.4s\n"
" b.eq 5f\n"
"4:"
" ld1 {v18.4s}, [%[b]], #16\n"
" ld1 {v22.4s}, [%[a]], #16\n"
" subs %w[remainder], %w[remainder], #4\n"
" fmla v1.4s, v18.4s, v22.4s\n"
" b.ne 4b\n"
"5:"
" faddp v1.4s, v1.4s, v1.4s\n"
" faddp %[ret].4s, v1.4s, v1.4s\n"
: [ret] "=w" (ret), [a] "+r" (a), [b] "+r" (b),
[len] "+r" (len), [remainder] "+r" (remainder)
:
: "cc", "v1", "v2", "v3", "v4",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
return ret;
}
#else
static inline float inner_product_single(const float *a, const float *b, unsigned int len)
{
float ret;
Expand Down Expand Up @@ -191,11 +329,12 @@ static inline float inner_product_single(const float *a, const float *b, unsigne
" vadd.f32 d0, d0, d1\n"
" vpadd.f32 d0, d0, d0\n"
" vmov.f32 %[ret], d0[0]\n"
: [ret] "=&r" (ret), [a] "+r" (a), [b] "+r" (b),
: [ret] "=r" (ret), [a] "+r" (a), [b] "+r" (b),
[len] "+l" (len), [remainder] "+l" (remainder)
:
: "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
"q9", "q10", "q11");
: "cc", "q0", "q1", "q2", "q3",
"q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11");
return ret;
}
#endif // defined(__aarch64__)
#endif