Skip to content

Commit

Permalink
Merge pull request #839 from Atcold/fix_ASIMD
Browse files Browse the repository at this point in the history
Fix compilation for ASIMD, fix #766
  • Loading branch information
soumith authored Nov 15, 2016
2 parents a7d9af3 + 01b0f5a commit 97341e2
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 237 deletions.
7 changes: 5 additions & 2 deletions lib/TH/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,13 @@ ENDIF (WITH_OPENMP)

# ARM specific flags
# FindARM publishes ASIMD_FOUND and NEON_FOUND; exactly one branch below is
# taken so the NEON define is only added once.
FIND_PACKAGE(ARM)
# ASIMD is tested first and on its own: it must not be nested under
# NEON_FOUND, otherwise hosts that only report "asimd" never get __NEON__.
# The ASIMD branch deliberately adds only the define and no -mfpu=neon flag.
IF (ASIMD_FOUND)
  MESSAGE(STATUS "asimd/Neon found with compiler flag : -D__NEON__")
  SET(CMAKE_C_FLAGS "-D__NEON__ ${CMAKE_C_FLAGS}")
ELSEIF (NEON_FOUND)
  MESSAGE(STATUS "Neon found with compiler flag : -mfpu=neon -D__NEON__")
  SET(CMAKE_C_FLAGS "-mfpu=neon -D__NEON__ ${CMAKE_C_FLAGS}")
ENDIF (ASIMD_FOUND)
IF (CORTEXA8_FOUND)
MESSAGE(STATUS "Cortex-A8 Found with compiler flag : -mcpu=cortex-a8")
SET(CMAKE_C_FLAGS "-mcpu=cortex-a8 -fprefetch-loop-arrays ${CMAKE_C_FLAGS}")
Expand Down
9 changes: 9 additions & 0 deletions lib/TH/cmake/FindARM.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,15 @@ IF(CMAKE_SYSTEM_NAME MATCHES "Linux")
set(NEON_FOUND false CACHE BOOL "NEON available on host")
ENDIF (NEON_TRUE)

# On ARMv8, NEON is inherent to the architecture and is listed as 'asimd'
# in /proc/cpuinfo instead of 'neon'.
# CPUINFO is quoted so that an empty value still yields a well-formed
# STRING() call (the result is then simply "not found").
STRING(REGEX REPLACE "^.*(asimd).*$" "\\1" ASIMD_THERE "${CPUINFO}")
# When the regex does not match, ASIMD_THERE is the unchanged input, so the
# comparison below is only true if "asimd" was actually present.
STRING(COMPARE EQUAL "asimd" "${ASIMD_THERE}" ASIMD_TRUE)
IF (ASIMD_TRUE)
  set(ASIMD_FOUND true CACHE BOOL "ASIMD/NEON available on host")
ELSE (ASIMD_TRUE)
  set(ASIMD_FOUND false CACHE BOOL "ASIMD/NEON available on host")
ENDIF (ASIMD_TRUE)

#Find the processor type (for now OMAP3 or OMAP4)
STRING(REGEX REPLACE "^.*(OMAP3).*$" "\\1" OMAP3_THERE ${CPUINFO})
STRING(COMPARE EQUAL "OMAP3" "${OMAP3_THERE}" OMAP3_TRUE)
Expand Down
296 changes: 61 additions & 235 deletions lib/TH/vector/NEON.c
Original file line number Diff line number Diff line change
@@ -1,252 +1,78 @@
/*
 * Set the first n elements of x to the constant c.
 *
 * x: destination buffer holding at least n floats.
 * c: value written to every element.
 * n: number of elements; nothing is written when n <= 0.
 *
 * Written in plain C (no inline assembly) so the same source builds on both
 * 32-bit ARM and ARMv8; vectorisation is left to the compiler.
 */
static void THFloatVector_fill_NEON(float *x, const float c, const ptrdiff_t n) {
  /* Same type as n so the index cannot overflow before reaching n. */
  ptrdiff_t i = 0;

  /* Four stores per iteration while more than four elements remain. */
  for (; i < n - 4; i += 4) {
    x[i] = c;
    x[i + 1] = c;
    x[i + 2] = c;
    x[i + 3] = c;
  }

  /* Remaining one to four elements. */
  for (; i < n; i++)
    x[i] = c;
}


/*
 * Element-wise difference: z[i] = x[i] - y[i] for i in [0, n).
 *
 * z: destination buffer holding at least n floats.
 * x: minuend values (n floats).
 * y: subtrahend values (n floats).
 * n: number of elements; nothing is written when n <= 0.
 *
 * Written in plain C (no inline assembly) so the same source builds on both
 * 32-bit ARM and ARMv8; vectorisation is left to the compiler.
 */
static void THFloatVector_diff_NEON(float *z, const float *x, const float *y, const ptrdiff_t n) {
  /* Same type as n so the index cannot overflow before reaching n. */
  ptrdiff_t i = 0;

  /* Four subtractions per iteration while more than four elements remain. */
  for (; i < n - 4; i += 4) {
    z[i] = x[i] - y[i];
    z[i + 1] = x[i + 1] - y[i + 1];
    z[i + 2] = x[i + 2] - y[i + 2];
    z[i + 3] = x[i + 3] - y[i + 3];
  }

  /* Remaining one to four elements. */
  for (; i < n; i++)
    z[i] = x[i] - y[i];
}


/*
 * In-place scaling: y[i] *= c for i in [0, n).
 *
 * y: buffer of at least n floats, modified in place.
 * c: scale factor.
 * n: number of elements; nothing is modified when n <= 0.
 *
 * Each element is scaled exactly once. Written in plain C (no inline
 * assembly) so the same source builds on both 32-bit ARM and ARMv8.
 */
static void THFloatVector_scale_NEON(float *y, const float c, const ptrdiff_t n) {
  /* Same type as n so the index cannot overflow before reaching n. */
  ptrdiff_t i = 0;

  /* Four multiplications per iteration while more than four remain. */
  for (; i < n - 4; i += 4) {
    y[i] *= c;
    y[i + 1] *= c;
    y[i + 2] *= c;
    y[i + 3] *= c;
  }

  /* Remaining one to four elements. */
  for (; i < n; i++)
    y[i] *= c;
}

/*
 * In-place element-wise product: y[i] *= x[i] for i in [0, n).
 *
 * y: buffer of at least n floats, modified in place.
 * x: multiplier values (n floats).
 * n: number of elements; nothing is modified when n <= 0.
 *
 * Each element is multiplied exactly once. Written in plain C (no inline
 * assembly) so the same source builds on both 32-bit ARM and ARMv8.
 */
static void THFloatVector_mul_NEON(float *y, const float *x, const ptrdiff_t n) {
  /* Same type as n so the index cannot overflow before reaching n. */
  ptrdiff_t i = 0;

  /* Four multiplications per iteration while more than four remain. */
  for (; i < n - 4; i += 4) {
    y[i] *= x[i];
    y[i + 1] *= x[i + 1];
    y[i + 2] *= x[i + 2];
    y[i + 3] *= x[i + 3];
  }

  /* Remaining one to four elements. */
  for (; i < n; i++)
    y[i] *= x[i];
}

/*
 * In-place scaled accumulation: y[i] += c * x[i] for i in [0, n).
 *
 * y: accumulator buffer of at least n floats, modified in place.
 * x: values to be scaled and added (n floats).
 * c: scale factor applied to every x[i].
 * n: number of elements; nothing is modified when n <= 0.
 *
 * Each element is accumulated exactly once. Written in plain C (no inline
 * assembly) so the same source builds on both 32-bit ARM and ARMv8.
 */
static void THFloatVector_add_NEON(float *y, const float *x, const float c, const ptrdiff_t n) {
  /* Same type as n so the index cannot overflow before reaching n. */
  ptrdiff_t i = 0;

  /* Four multiply-adds per iteration while more than four remain. */
  for (; i < n - 4; i += 4) {
    y[i] += c * x[i];
    y[i + 1] += c * x[i + 1];
    y[i + 2] += c * x[i + 2];
    y[i + 3] += c * x[i + 3];
  }

  /* Remaining one to four elements. */
  for (; i < n; i++)
    y[i] += c * x[i];
}

0 comments on commit 97341e2

Please sign in to comment.