Skip to content
Permalink
Browse files

new sgemm 8x16

  • Loading branch information...
quickwritereader committed Jun 17, 2019
1 parent 148c4cc commit cdbfb891da2a8de14aa1d9bd7a57265284f7432c
Showing with 285 additions and 248 deletions.
  1. +106 −87 kernel/power/sgemm_logic_power9.S
  2. +178 −160 kernel/power/sgemm_macros_power9.S
  3. +1 −1 param.h
@@ -3,89 +3,89 @@ b L8

/*
 * LSGEMM_L8x16_LMAIN_SUB -- counted main loop built from the 8x16 kernel
 * macros.  It is entered with `bl` and returns with `blr`.
 *
 * The loop is driven by the count register: each `bdnz` decrements CTR and
 * branches back to LSGEMM_L8x16_LOOP while CTR is non-zero.
 *
 * LSGEMM_L8x16_K128 is a second entry point inside the loop body.  Callers
 * that `bl` to it load CTR with 1 first, so the `bdnz` falls through after a
 * single pass and control reaches LSGEMM_L8x16_LOOP_END.
 *
 * The LOAD8x16_*, KERNEL8x16_* and END8x16* macro bodies are not visible
 * here; their argument meanings are not documented in this block.
 *
 * NOTE(review): both a "64,32" and a "128,64" run of kernel invocations, and
 * both LOAD8x16_0 and LOAD8x16_2, appear in this listing.  This looks like
 * the previous and the new variant of the routine printed together (a diff
 * with its +/- markers removed) -- confirm which lines are live.
 */
MY_ALIGN
LSGEMM_L8x16_LMAIN_SUB:
LOAD8x16_0
/* CTR = L: trip count consumed by the bdnz at the bottom of the loop. */
mtctr L
LOAD8x16_2
MY_ALIGN

LSGEMM_L8x16_LOOP:

/* 32 invocations, third argument 0..31; only the last one passes 1 as the */
/* fourth argument (presumably "final step" -- TODO confirm in the macro). */
KERNEL8x16_I1_L4_2 64,32, 0,0
KERNEL8x16_I1_L4_2 64,32, 1,0
KERNEL8x16_I1_L4_2 64,32, 2,0
KERNEL8x16_I1_L4_2 64,32, 3,0
KERNEL8x16_I1_L4_2 64,32, 4,0
KERNEL8x16_I1_L4_2 64,32, 5,0
KERNEL8x16_I1_L4_2 64,32, 6,0
KERNEL8x16_I1_L4_2 64,32, 7,0
KERNEL8x16_I1_L4_2 64,32, 8,0
KERNEL8x16_I1_L4_2 64,32, 9,0
KERNEL8x16_I1_L4_2 64,32, 10,0
KERNEL8x16_I1_L4_2 64,32, 11,0
KERNEL8x16_I1_L4_2 64,32, 12,0
KERNEL8x16_I1_L4_2 64,32, 13,0
KERNEL8x16_I1_L4_2 64,32, 14,0
KERNEL8x16_I1_L4_2 64,32, 15,0
KERNEL8x16_I1_L4_2 64,32, 16,0
KERNEL8x16_I1_L4_2 64,32, 17,0
KERNEL8x16_I1_L4_2 64,32, 18,0
KERNEL8x16_I1_L4_2 64,32, 19,0
KERNEL8x16_I1_L4_2 64,32, 20,0
KERNEL8x16_I1_L4_2 64,32, 21,0
KERNEL8x16_I1_L4_2 64,32, 22,0
KERNEL8x16_I1_L4_2 64,32, 23,0
KERNEL8x16_I1_L4_2 64,32, 24,0
KERNEL8x16_I1_L4_2 64,32, 25,0
KERNEL8x16_I1_L4_2 64,32, 26,0
KERNEL8x16_I1_L4_2 64,32, 27,0
KERNEL8x16_I1_L4_2 64,32, 28,0
KERNEL8x16_I1_L4_2 64,32, 29,0
KERNEL8x16_I1_L4_2 64,32, 30,0
KERNEL8x16_I1_L4_2 64,32, 31,1
/* "128,64" sequence: two KERNEL8x16_L2 steps (index 0 and 1) followed by */
/* KERNEL8x16_I1_L4_2 for indices 1..31.                                   */
KERNEL8x16_L2 128,64,0,0
/* Secondary entry point: skips the index-0 KERNEL8x16_L2 step above. */
LSGEMM_L8x16_K128:
KERNEL8x16_L2 128,64,1,0
KERNEL8x16_I1_L4_2 128,64, 1,0
KERNEL8x16_I1_L4_2 128,64, 2,0
KERNEL8x16_I1_L4_2 128,64, 3,0
KERNEL8x16_I1_L4_2 128,64, 4,0
KERNEL8x16_I1_L4_2 128,64, 5,0
KERNEL8x16_I1_L4_2 128,64, 6,0
KERNEL8x16_I1_L4_2 128,64, 7,0
KERNEL8x16_I1_L4_2 128,64, 8,0
KERNEL8x16_I1_L4_2 128,64, 9,0
KERNEL8x16_I1_L4_2 128,64, 10,0
KERNEL8x16_I1_L4_2 128,64, 11,0
KERNEL8x16_I1_L4_2 128,64, 12,0
KERNEL8x16_I1_L4_2 128,64, 13,0
KERNEL8x16_I1_L4_2 128,64, 14,0
KERNEL8x16_I1_L4_2 128,64, 15,0
KERNEL8x16_I1_L4_2 128,64, 16,0
KERNEL8x16_I1_L4_2 128,64, 17,0
KERNEL8x16_I1_L4_2 128,64, 18,0
KERNEL8x16_I1_L4_2 128,64, 19,0
KERNEL8x16_I1_L4_2 128,64, 20,0
KERNEL8x16_I1_L4_2 128,64, 21,0
KERNEL8x16_I1_L4_2 128,64, 22,0
KERNEL8x16_I1_L4_2 128,64, 23,0
KERNEL8x16_I1_L4_2 128,64, 24,0
KERNEL8x16_I1_L4_2 128,64, 25,0
KERNEL8x16_I1_L4_2 128,64, 26,0
KERNEL8x16_I1_L4_2 128,64, 27,0
KERNEL8x16_I1_L4_2 128,64, 28,0
KERNEL8x16_I1_L4_2 128,64, 29,0
KERNEL8x16_I1_L4_2 128,64, 30,0
KERNEL8x16_I1_L4_2 128,64, 31,1
/* Decrement CTR; loop again while it is non-zero. */
bdnz LSGEMM_L8x16_LOOP

MY_ALIGN
LSGEMM_L8x16_LOOP_END:
/* NOTE(review): two different END macros in a row -- presumably only one */
/* of them belongs to the current version; confirm.                       */
END8x16 0, AO, BO, 64, 32
END8x16_2
/* Return to the `bl` call site. */
blr

/*
 * LSGEMM_L8x16_L64_SUB -- straight-line (no loop) run of 8x16 kernel macros,
 * entered with `bl` and left with `blr`.
 *
 * Each sequence below is 15 KERNEL8x16_I1_L4_2 invocations (index 0..14,
 * last argument 0) closed by one KERNEL8x16_I1_L4_3 (index 15, last
 * argument 1).  The name suggests 64 K-iterations per call -- TODO confirm
 * against the macro definitions, which are not visible here.
 *
 * NOTE(review): a "64,32" sequence after LOAD8x16_0 and a "128,64" sequence
 * after LOAD8x16_2 both appear.  This looks like the old and the new body
 * listed together (diff markers stripped) -- confirm which one is live.
 */
MY_ALIGN
LSGEMM_L8x16_L64_SUB:
LOAD8x16_0
KERNEL8x16_I1_L4_2 64,32, 0,0
KERNEL8x16_I1_L4_2 64,32, 1,0
KERNEL8x16_I1_L4_2 64,32, 2,0
KERNEL8x16_I1_L4_2 64,32, 3,0
KERNEL8x16_I1_L4_2 64,32, 4,0
KERNEL8x16_I1_L4_2 64,32, 5,0
KERNEL8x16_I1_L4_2 64,32, 6,0
KERNEL8x16_I1_L4_2 64,32, 7,0
KERNEL8x16_I1_L4_2 64,32, 8,0
KERNEL8x16_I1_L4_2 64,32, 9,0
KERNEL8x16_I1_L4_2 64,32, 10,0
KERNEL8x16_I1_L4_2 64,32, 11,0
KERNEL8x16_I1_L4_2 64,32, 12,0
KERNEL8x16_I1_L4_2 64,32, 13,0
KERNEL8x16_I1_L4_2 64,32, 14,0
/* Closing step uses the _3 variant with the last argument set to 1. */
KERNEL8x16_I1_L4_3 64,32, 15,1
LOAD8x16_2
KERNEL8x16_I1_L4_2 128,64, 0,0
KERNEL8x16_I1_L4_2 128,64, 1,0
KERNEL8x16_I1_L4_2 128,64, 2,0
KERNEL8x16_I1_L4_2 128,64,3,0
KERNEL8x16_I1_L4_2 128,64,4,0
KERNEL8x16_I1_L4_2 128,64,5,0
KERNEL8x16_I1_L4_2 128,64,6,0
KERNEL8x16_I1_L4_2 128,64,7,0
KERNEL8x16_I1_L4_2 128,64,8,0
KERNEL8x16_I1_L4_2 128,64,9,0
KERNEL8x16_I1_L4_2 128,64,10,0
KERNEL8x16_I1_L4_2 128,64,11,0
KERNEL8x16_I1_L4_2 128,64,12,0
KERNEL8x16_I1_L4_2 128,64,13,0
KERNEL8x16_I1_L4_2 128,64,14,0
/* Closing step uses the _3 variant with the last argument set to 1. */
KERNEL8x16_I1_L4_3 128,64,15,1
/* Return to the `bl` call site. */
blr
/*
 * LSGEMM_L8x16_L32_SUB -- straight-line run of 8x16 kernel macros, half the
 * length of LSGEMM_L8x16_L64_SUB: 7 KERNEL8x16_I1_L4_2 invocations
 * (index 0..6) closed by one KERNEL8x16_I1_L4_3 (index 7, last argument 1).
 * Ends with `blr`, so it is meant to be reached with `bl`.
 *
 * NOTE(review): both a "64,32" and a "128,64" sequence appear; presumably
 * only one is the current body (diff markers stripped) -- confirm.
 */
LSGEMM_L8x16_L32_SUB:
LOAD8x16_0
KERNEL8x16_I1_L4_2 64,32, 0,0
KERNEL8x16_I1_L4_2 64,32, 1,0
KERNEL8x16_I1_L4_2 64,32, 2,0
KERNEL8x16_I1_L4_2 64,32, 3,0
KERNEL8x16_I1_L4_2 64,32, 4,0
KERNEL8x16_I1_L4_2 64,32, 5,0
KERNEL8x16_I1_L4_2 64,32, 6,0
KERNEL8x16_I1_L4_3 64,32, 7,1
LOAD8x16_2
KERNEL8x16_I1_L4_2 128,64,0,0
KERNEL8x16_I1_L4_2 128,64,1,0
KERNEL8x16_I1_L4_2 128,64,2,0
KERNEL8x16_I1_L4_2 128,64,3,0
KERNEL8x16_I1_L4_2 128,64,4,0
KERNEL8x16_I1_L4_2 128,64,5,0
KERNEL8x16_I1_L4_2 128,64,6,0
KERNEL8x16_I1_L4_3 128,64,7,1
/* Return to the `bl` call site. */
blr

/*
 * LSGEMM_L8x16_L16_SUB -- shortest of the straight-line helpers:
 * 3 KERNEL8x16_I1_L4_2 invocations (index 0..2) closed by one
 * KERNEL8x16_I1_L4_3 (index 3, last argument 1), then `blr`.
 *
 * NOTE(review): both a "64,32" and a "128,64" sequence appear; presumably
 * only one is the current body (diff markers stripped) -- confirm.
 */
LSGEMM_L8x16_L16_SUB:
LOAD8x16_0
KERNEL8x16_I1_L4_2 64,32, 0,0
KERNEL8x16_I1_L4_2 64,32, 1,0
KERNEL8x16_I1_L4_2 64,32, 2,0
KERNEL8x16_I1_L4_3 64,32, 3,1
LOAD8x16_2
KERNEL8x16_I1_L4_2 128,64,0,0
KERNEL8x16_I1_L4_2 128,64,1,0
KERNEL8x16_I1_L4_2 128,64,2,0
KERNEL8x16_I1_L4_3 128,64,3,1
/* Return to the `bl` call site. */
blr

L8:
@@ -127,15 +127,16 @@ LSGEMM_L8x16_BEGIN:
/*
 * Trip-count setup for the 8x16 main loop.
 *   T12 = K-count minus a small bias (T11 for TRMM builds, K otherwise)
 *   L   = T12 >> 7, the number of 128-step passes of the main loop
 * After the main loop returns, L is reloaded with T12 & 127, the leftover
 * count.
 *
 * NOTE(review): each branch contains two addi/srawi. pairs (-1 and -2), and
 * ZERO8x16 appears twice.  This looks like the old and new lines of a diff
 * printed together -- as written the biases would accumulate, so confirm
 * which pair is live.
 */
#if defined(TRMMKERNEL)
REFRESH_TEMP_BK T11,K,TEMP_REG,16,8
mr T12, T11
addi T12,T12, -1
srawi. L, T12, 7 /* L = T12 >> 7 = (T11-1) / 128 -- a quotient, not a modulo; sets CR0 */
addi T12,T12, -2
srawi. L, T12, 7 /* L = T12 >> 7 (T12 / 128); intended operand per the original note: T11-2 */
#else
mr T12, K
addi T12,T12, -1
srawi. L, T12, 7 /* L = T12 >> 7 = (K-1) / 128 -- a quotient, not a modulo; sets CR0 */
addi T12,T12, -2
srawi. L, T12, 7 /* L = T12 >> 7 (T12 / 128); intended operand per the original note: K-2 */
#endif

ZERO8x16
ZERO8x16
/* CTR = number of full 128-step passes. */
mtctr L
/* Skip the main loop when the srawi. result was <= 0.  This relies on CR0 */
/* surviving ZERO8x16 and mtctr -- presumably true; verify in the macro.   */
ble LSGEMM_L8x16_SUB0
bl LSGEMM_L8x16_LMAIN_SUB
/* L = T12 & 127: iterations left over after the 128-step passes. */
andi. L, T12, 127
@@ -148,15 +149,33 @@ LSGEMM_L8x16_SUB0:
/*
 * Short-K dispatch (part of LSGEMM_L8x16_SUB0; the opening #if of the first
 * conditional is above this view).  The visible logic is:
 *   - compare the K-count with 129; on a match run one pass through the
 *     LSGEMM_L8x16_K128 entry point with CTR = 1, then go to SAVE
 *   - at CMP8x16_128K compare with 128; on a mismatch go to
 *     LSGEMM_L8x16_SUB2 for the bit-by-bit tail
 *
 * NOTE(review): the preprocessor nesting is unbalanced in this listing (the
 * #endif after `b LSGEMM_L8x16_SAVE` has no visible matching #if) and the
 * code after that branch has no label.  Old and new diff lines appear to be
 * interleaved here -- do not assemble as-is; confirm against the real file.
 */
cmpwi T11,128
#else
/* L = K & 255; the cmpwi that follows overwrites CR0, so the bne below */
/* tests K against 129, not the andi. result.                           */
andi. L, K, 255
cmpwi K,129
#endif
/* T10 = 1, used below as a one-pass loop count (li leaves CR0 intact). */
li T10,1
bne CMP8x16_128K
/* Rewind BO by 32 and AO by 64 before the reload macros. */
addi BO,BO,-32
addi AO,AO,-64
LOAD8x16 64,32
END8x16_WITHOUT_ADD
LOAD8x16_2O AO,BO, 128, 64
/* CTR = 1 so the bdnz inside the K128 body falls through after one pass. */
mtctr T10
bl LSGEMM_L8x16_K128
b LSGEMM_L8x16_SAVE
CMP8x16_128K:
/*----------------------------------------*/
#if defined(TRMMKERNEL)
cmpwi T11,128
#else
cmpwi K,128
#endif

/* Anything other than exactly 128 goes to the tail handling. */
bne LSGEMM_L8x16_SUB2
MY_ALIGN
LSGEMM_L8x16_SUB2_128:
/* Two calls to the L64 helper, then save. */
bl LSGEMM_L8x16_L64_SUB
bl LSGEMM_L8x16_L64_SUB
b LSGEMM_L8x16_SAVE
#endif
bne LSGEMM_L8x16_SUB2
MY_ALIGN
/* Alternative handling: one pass through the K128 entry point with CTR = 1, */
/* after rewinding BO by 64 and AO by 128.                                    */
mtctr T10
addi BO,BO,-64
addi AO,AO,-128
LOAD8x16_2O AO,BO, 128,64
bl LSGEMM_L8x16_K128
b LSGEMM_L8x16_SAVE
MY_ALIGN
LSGEMM_L8x16_SUB2:
/* Test bit 6 (value 64) of the leftover count L. */
andi. T10,L,64
@@ -176,21 +195,21 @@ LSGEMM_L8x16_SUB2_16:
/*
 * Tail handling by binary decomposition of the leftover count L: each stage
 * tests one bit of L (8, 4, 2, 1) with `andi.` and skips to the next stage
 * when that bit is clear.  The masked result is never negative, so `ble` is
 * taken exactly when it is zero.
 *
 * NOTE(review): every stage lists a LOAD8x16_0 + "64,32" sequence followed
 * by a LOAD8x16_2 + "128,64" sequence.  Presumably these are the old and new
 * bodies of a diff printed together -- confirm which one is live.
 */
LSGEMM_L8x16_SUB2_8:
andi. T10,L, 8
ble LSGEMM_L8x16_SUB2_4
LOAD8x16_0
KERNEL8x16_I1_L4_2 64,32, 0,0
KERNEL8x16_I1_L4_3 64,32, 1,1
LOAD8x16_2
KERNEL8x16_I1_L4_2 128,64, 0,0
KERNEL8x16_I1_L4_3 128,64, 1,1
MY_ALIGN
LSGEMM_L8x16_SUB2_4:
andi. T10,L, 4
ble LSGEMM_L8x16_SUB2_2
LOAD8x16_0
KERNEL8x16_I1_L4_3 64,32, 0,1
LOAD8x16_2
KERNEL8x16_I1_L4_3 128,64, 0,1
MY_ALIGN
LSGEMM_L8x16_SUB2_2:
andi. T10,L, 2
ble LSGEMM_L8x16_SUB2_1
LOAD8x16_0
KERNEL8x16_I1_L2_3 64,32, 0,1
LOAD8x16_2
KERNEL8x16_E2 128,64, 0,1
MY_ALIGN
LSGEMM_L8x16_SUB2_1:
/* Test the lowest bit of L; the branch that consumes it is below this view. */
andi. T10,L, 1

0 comments on commit cdbfb89

Please sign in to comment.
You can’t perform that action at this time.