Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Added optimized version of Dswap

  • Loading branch information...
commit f5adad3c5444cec848f80569b7c42ffe61af82a4 1 parent 70ca838
@ziutek authored
View
2  Makefile
@@ -1,5 +1,6 @@
include $(GOROOT)/src/Make.inc
+GC = $Og -N
TARG=blas
OFILES_amd64=\
@@ -8,6 +9,7 @@ OFILES_amd64=\
ddot_amd64.$O\
dnrm2_amd64.$O\
dasum_amd64.$O\
+ dswap_amd64.$O\
OFILES=\
$(OFILES_$(GOARCH))
View
2  ddot_amd64.s
@@ -108,7 +108,7 @@ rest:
// Undo last SUBQ
ADDQ $4, BP
- // Check that are there any pair to process
+ // Check whether there are any values left to process
JE end
loop:
View
128 dswap_amd64.s
@@ -0,0 +1,128 @@
+// func Dswap(N int, X []float64, incX int, Y []float64, incY int)
+//
+// Dswap exchanges N elements of X, taken every incX-th slot, with N
+// elements of Y, taken every incY-th slot. It returns nothing.
+// N <= 0 is a no-op. runtime·panicindex is called when the last index
+// touched, inc*(N-1), is negative or not below the slice length.
+//
+// Registers: BP = elements left, SI/DI = cursors into X/Y,
+// AX/BX = incX/incY (scaled to bytes later), CX/DX = scratch.
+TEXT ·Dswap(SB), 7, $0
+ MOVL N+0(FP), BP
+ MOVQ X_data+8(FP), SI
+ MOVL incX+24(FP), AX
+ MOVQ Y_data+32(FP), DI
+ MOVL incY+48(FP), BX
+
+ // Nothing to swap for N <= 0. MOVL zero-extends BP, so without this
+ // guard a negative N becomes a huge 64-bit count in the loops below.
+ CMPL BP, $0
+ JLE end
+
+ // Check data boundaries. The compare is unsigned so that a negative
+ // last index (negative increment) panics instead of passing.
+ MOVL BP, CX
+ DECL CX
+ MOVL CX, DX
+ IMULL AX, CX // CX = incX * (N - 1)
+ IMULL BX, DX // DX = incY * (N - 1)
+ CMPL CX, X_len+16(FP)
+ JAE panic
+ CMPL DX, Y_len+40(FP)
+ JAE panic
+
+ // Setup strides
+ SALQ $3, AX // AX = sizeof(float64) * incX
+ SALQ $3, BX // BX = sizeof(float64) * incY
+
+ // Check that there are 4 or more values for the unrolled loops
+ SUBQ $4, BP
+ JL rest // There are fewer than 4 values to process
+
+ // Check if incX != 1 or incY != 1
+ CMPQ AX, $8
+ JNE with_stride
+ CMPQ BX, $8
+ JNE with_stride
+
+ // Fully optimized loop (for incX == incY == 1)
+ full_simd_loop:
+ // Load four values from X
+ MOVUPD (SI), X0
+ MOVUPD 16(SI), X1
+ // Load four values from Y
+ MOVUPD (DI), X2
+ MOVUPD 16(DI), X3
+ // Store them crosswise
+ MOVUPD X0, (DI)
+ MOVUPD X1, 16(DI)
+ MOVUPD X2, (SI)
+ MOVUPD X3, 16(SI)
+ // Update data pointers
+ ADDQ $32, SI
+ ADDQ $32, DI
+ SUBQ $4, BP
+ JGE full_simd_loop // There are 4 or more values to process
+ JMP rest
+
+with_stride:
+ // Setup long strides
+ MOVQ AX, CX
+ MOVQ BX, DX
+ SALQ $1, CX // CX = 16 * incX
+ SALQ $1, DX // DX = 16 * incY
+
+ // Partially optimized loop: four scalar swaps per iteration
+ half_simd_loop:
+ // Swap elements 0 and 1 of this group
+ MOVSD (SI), X0
+ MOVSD (SI)(AX*1), X1
+ MOVSD (DI), X2
+ MOVSD (DI)(BX*1), X3
+ MOVSD X0, (DI)
+ MOVSD X1, (DI)(BX*1)
+ MOVSD X2, (SI)
+ MOVSD X3, (SI)(AX*1)
+ // Update data pointers using long strides
+ ADDQ CX, SI
+ ADDQ DX, DI
+
+ // Swap elements 2 and 3 of this group
+ MOVSD (SI), X0
+ MOVSD (SI)(AX*1), X1
+ MOVSD (DI), X2
+ MOVSD (DI)(BX*1), X3
+ MOVSD X0, (DI)
+ MOVSD X1, (DI)(BX*1)
+ MOVSD X2, (SI)
+ MOVSD X3, (SI)(AX*1)
+ // Update data pointers using long strides
+ ADDQ CX, SI
+ ADDQ DX, DI
+ SUBQ $4, BP
+ JGE half_simd_loop // There are 4 or more values to process
+
+rest:
+ // Undo last SUBQ
+ ADDQ $4, BP
+ // Check whether there are any values left to process
+ JE end
+
+ loop:
+ // Swap one value of X with one value of Y
+ MOVSD (SI), X0
+ MOVSD (DI), X1
+ MOVSD X0, (DI)
+ MOVSD X1, (SI)
+ // Update data pointers
+ ADDQ AX, SI
+ ADDQ BX, DI
+ DECQ BP
+ JNE loop
+
+end:
+ RET
+
+panic:
+ CALL runtime·panicindex(SB)
+ RET
View
3  dswap_decl.go
@@ -0,0 +1,3 @@
+package blas
+
+func Dswap(N int, X []float64, incX int, Y []float64, incY int) // no Go body: implemented in assembly (TEXT ·Dswap in dswap_amd64.s)
View
2  sdot_amd64.s
@@ -115,7 +115,7 @@ rest:
// Undo last SUBQ
ADDQ $4, BP
- // Check that are there any pair to process
+ // Check whether there are any values left to process
JE end
loop:
View
2  sdsdot_amd64.s
@@ -129,7 +129,7 @@ rest:
// Undo last SUBQ
ADDQ $4, BP
- // Check that are there any pair to process
+ // Check whether there are any values left to process
JE end
loop:
Please sign in to comment.
Something went wrong with that request. Please try again.