Closed
Description
/*
 * Reproducer for the address-generation comparison: for every i in [0, N),
 * accumulate four element-wise products plus p[i] into a[i].
 * N and the arrays a, b, c, d, e, f, g, h, j, p are declared outside this
 * snippet, so their element type and length are not visible here.
 * NOTE(review): foo is declared to return float but contains no return
 * statement -- presumably irrelevant to the loop codegen being compared;
 * confirm before reusing this reproducer.
 */
float foo (void) {
for (int i = 0; i < N; i++)
{
/* a[i] = a[i] + b*c + d*e + f*g + h*j + p, every operand read at index i */
a[i] += b[i]* c[i] + d[i] * e[i] + f[i] * g[i] +
h[i] * j[i] + p[i];
}
}
- Clang: the base-address registers `x0` and `x1` are reused for every access, so each load needs its own `add` to recompute the address
// Clang inner loop (AArch64). Each iteration processes one 16-byte vector
// (.2d = two double-precision lanes).
//   x8        shared byte index, advanced by 16 per iteration
//   x9..x18   per-array base registers (set up before this loop, not shown)
//   x0, x1    scratch address registers, rewritten before every load
// Every memory access has the form [x0|x1, #8192], so each of the ten loads
// is preceded by its own `add` computing base + x8: ten address adds plus
// the index update per iteration.
// NOTE(review): the #8192 displacement together with the b.ne exit suggests
// x8 starts at -8192 and counts up to zero -- initialisation is outside
// this listing; confirm.
.LBB0_1: // =>This Inner Loop Header: Depth=1
add x0, x11, x8                 // address for 1st operand
add x1, x12, x8                 // address for 2nd operand
ldr q0, [x0, #8192]
add x0, x9, x8                  // x0 recycled: address for 3rd operand
ldr q1, [x1, #8192]
add x1, x10, x8                 // x1 recycled: address for 4th operand
ldr q2, [x0, #8192]
add x0, x13, x8
fmul v0.2d, v0.2d, v1.2d        // acc = [x11] * [x12]
ldr q1, [x1, #8192]
add x1, x14, x8
fmla v0.2d, v1.2d, v2.2d        // acc += [x10] * [x9]
ldr q1, [x0, #8192]
ldr q2, [x1, #8192]
add x0, x15, x8
add x1, x16, x8
fmla v0.2d, v2.2d, v1.2d        // acc += [x14] * [x13]
ldr q3, [x0, #8192]
ldr q1, [x1, #8192]
add x0, x17, x8
fmla v0.2d, v1.2d, v3.2d        // acc += [x16] * [x15]
ldr q1, [x0, #8192]
add x0, x18, x8                 // address of the read-modify-write element
adds x8, x8, #16                // advance index and set flags for b.ne; x0 keeps the old index
fadd v0.2d, v0.2d, v1.2d        // acc += [x17]
ldr q1, [x0, #8192]             // current value of the destination element
fadd v0.2d, v1.2d, v0.2d        // destination value + acc
str q0, [x0, #8192]             // store back to the address just loaded from
b.ne .LBB0_1                    // loop until the adds result is zero
- GCC: only a single `add x0, x0, 16` is needed to update all the memory addresses
// GCC inner loop (AArch64). Each iteration processes one 16-byte vector
// (.2d = two double-precision lanes).
//   x0        shared byte index, advanced by 16 per iteration
//   x1..x10   per-array base registers (set up before this loop, not shown)
// Every access uses register-offset addressing [base, x0], so no per-load
// address arithmetic is needed: the single `add x0, x0, 16` moves all ten
// streams at once.
// NOTE(review): the exit test compares x0 with 8192, so x0 presumably starts
// at 0 -- initialisation is outside this listing; confirm.
.L2:
ldr q30, [x7, x0]
ldr q31, [x8, x0]
ldr q27, [x10, x0]
ldr q28, [x9, x0]
fmul v31.2d, v31.2d, v30.2d     // acc = [x8] * [x7]
ldr q29, [x6, x0]
ldr q30, [x5, x0]
fmla v31.2d, v27.2d, v28.2d     // acc += [x10] * [x9]
ldr q27, [x4, x0]
ldr q28, [x3, x0]
fmla v31.2d, v29.2d, v30.2d     // acc += [x6] * [x5]
ldr q29, [x2, x0]
ldr q30, [x1, x0]               // current value of the destination element
fmla v31.2d, v27.2d, v28.2d     // acc += [x4] * [x3]
fadd v31.2d, v31.2d, v29.2d     // acc += [x2]
fadd v31.2d, v31.2d, v30.2d     // acc += destination value
str q31, [x1, x0]               // store back to the address loaded via x1
add x0, x0, 16                  // the only address update in the loop
cmp x0, 8192                    // byte offset at which the loop stops
bne .L2