Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Sscal and Dscal added

  • Loading branch information...
commit 013697f334e622af805db92576e780ed034565f1 1 parent e89ec03
Michał Derkacz authored
4 Makefile
View
@@ -20,6 +20,8 @@ OFILES_amd64=\
dcopy_amd64.$O\
saxpy_amd64.$O\
daxpy_amd64.$O\
+ sscal_amd64.$O\
+ dscal_amd64.$O\
OFILES=\
$(OFILES_$(GOARCH))
@@ -41,6 +43,8 @@ ALLGOFILES=\
dcopy.go\
saxpy.go\
daxpy.go\
+ sscal.go\
+ dscal.go\
NOGOFILES=\
$(subst _$(GOARCH).$O,.go,$(OFILES_$(GOARCH)))
2  README.md
View
@@ -10,7 +10,7 @@ Any implemented function has its own unity test and benchmark.
*Level 1*
Sdsdot, Sdot, Ddot, Snrm2, Dnrm2, Sasum, Dasum, Isamax, Idamax, Sswap, Dswap,
-Scopy, Dcopy, Saxpy, Daxpy
+Scopy, Dcopy, Saxpy, Daxpy, Sscal, Dscal
*Level 2*
31 d_test.go
View
@@ -153,6 +153,28 @@ func TestDaxpy(t *testing.T) {
}
}
+func TestDscal(t *testing.T) {
+ alpha := 3.0
+ for inc := 1; inc < 9; inc++ {
+ for N := 0; N <= len(xd)/inc; N++ {
+ r := make([]float64, len(xd))
+ e := make([]float64, len(xd))
+ copy(r, xd)
+ copy(e, xd)
+ Dscal(N, alpha, r, inc)
+ for i := 0; i < N; i++ {
+ e[i*inc] = alpha * xd[i*inc]
+ }
+ for i := 0; i < len(xd); i++ {
+ if r[i] != e[i] {
+ t.Fatalf("inc=%d N=%d i=%d r=%f e=%f", inc, N, i, r[i],
+ e[i])
+ }
+ }
+ }
+ }
+}
+
var vd, wd []float64
func init() {
@@ -215,3 +237,12 @@ func BenchmarkDaxpy(b *testing.B) {
Daxpy(len(vd), -1.0, vd, 1, y, 1)
}
}
+func BenchmarkDscal(b *testing.B) {
+ b.StopTimer()
+ y := make([]float64, len(vd))
+ copy(y, vd)
+ b.StartTimer()
+ for i := 0; i < b.N; i++ {
+ Dscal(len(y), -1.0, y, 1)
+ }
+}
15 dscal.go
View
@@ -0,0 +1,15 @@
+package blas
+
+// Rescale the vector X by the multiplicative factor alpha
+func Dscal(N int, alpha float64, X []float64, incX int) {
+ var xi int
+ for ; N >= 2; N -= 2 {
+ X[xi] = alpha * X[xi]
+ xi += incX
+ X[xi] = alpha * X[xi]
+ xi += incX
+ }
+ if N != 0 {
+ X[xi] = alpha * X[xi]
+ }
+}
106 dscal_amd64.s
View
@@ -0,0 +1,106 @@
+// func Dscal(N int, alpha float64, X []float64, incX int)
+TEXT ·Dscal(SB), 7, $0
+ MOVL N+0(FP), BP
+ MOVSD alpha+8(FP), X0
+ MOVQ X_data+16(FP), SI
+ MOVL incX+32(FP), AX
+
+ // Check data bounaries
+ MOVL BP, CX
+ DECL CX
+ IMULL AX, CX // CX = incX * (N - 1)
+ CMPL CX, X_len+16(FP)
+ JGE panic
+
+ // Setup strides
+ SALQ $3, AX // AX = sizeof(float64) * incX
+
+ // Check that there are 4 or more values for SIMD calculations
+ SUBQ $4, BP
+ JL rest // There are less than 4 values to process
+
+ // Setup two alphas in X0
+ MOVLHPS X0, X0
+
+ // Check if incX != 1
+ CMPQ AX, $8
+ JNE with_stride
+
+ // Fully optimized loop (for incX == 1)
+ full_simd_loop:
+ // Load first two values and scale
+ MOVUPD (SI), X2
+ MULPD X0, X2
+ // Load second two values and scale
+ MOVUPD 16(SI), X4
+ MULPD X0, X4
+ // Save scaled values
+ MOVUPD X2, (SI)
+ MOVUPD X4, 16(SI)
+
+ // Update data pointers
+ ADDQ $32, SI
+
+ SUBQ $4, BP
+ JGE full_simd_loop // There are 4 or more pairs to process
+
+ JMP rest
+
+with_stride:
+ // Setup long stride
+ MOVQ AX, CX
+ SALQ $1, CX // CX = 16 * incX
+
+ // Partially optimized loop
+ half_simd_loop:
+ // Load first two values and scale
+ MOVLPD (SI), X2
+ MOVHPD (SI)(AX*1), X2
+ MULPD X0, X2
+ // Save scaled values
+ MOVLPD X2, (SI)
+ MOVHPD X2, (SI)(AX*1)
+
+ // Update data pointer using long stride
+ ADDQ CX, SI
+
+ // Load second two values and scale
+ MOVLPD (SI), X4
+ MOVHPD (SI)(AX*1), X4
+ MULPD X0, X4
+ // Save scaled values
+ MOVLPD X4, (SI)
+ MOVHPD X4, (SI)(AX*1)
+
+ // Update data pointer using long stride
+ ADDQ CX, SI
+
+ SUBQ $4, BP
+ JGE half_simd_loop // There are 4 or more pairs to process
+
+rest:
+ // Undo last SUBQ
+ ADDQ $4, BP
+
+ // Check that are there any value to process
+ JE end
+
+ loop:
+ // Load from X and scale
+ MOVSD (SI), X2
+ MULSD X0, X2
+ // Save scaled value
+ MOVSD X2, (SI)
+
+ // Update data pointers
+ ADDQ AX, SI
+
+ DECQ BP
+ JNE loop
+
+end:
+ RET
+
+panic:
+ CALL runtime·panicindex(SB)
+ RET
3  dscal_decl.go
View
@@ -0,0 +1,3 @@
+package blas
+
+func Dscal(N int, alpha float64, X []float64, incX int)
32 s_test.go
View
@@ -161,6 +161,28 @@ func TestSaxpy(t *testing.T) {
}
}
+func TestSscal(t *testing.T) {
+ alpha := float32(3.0)
+ for inc := 1; inc < 9; inc++ {
+ for N := 0; N <= len(xf)/inc; N++ {
+ r := make([]float32, len(xf))
+ e := make([]float32, len(xf))
+ copy(r, xf)
+ copy(e, xf)
+ Sscal(N, alpha, r, inc)
+ for i := 0; i < N; i++ {
+ e[i*inc] = alpha * xf[i*inc]
+ }
+ for i := 0; i < len(xf); i++ {
+ if r[i] != e[i] {
+ t.Fatalf("inc=%d N=%d i=%d r=%f e=%f", inc, N, i, r[i],
+ e[i])
+ }
+ }
+ }
+ }
+}
+
var vf, wf []float32
func init() {
@@ -229,3 +251,13 @@ func BenchmarkSaxpy(b *testing.B) {
Saxpy(len(vf), -1.0, vf, 1, y, 1)
}
}
+
+func BenchmarkSscal(b *testing.B) {
+ b.StopTimer()
+ y := make([]float32, len(vf))
+ copy(y, vf)
+ b.StartTimer()
+ for i := 0; i < b.N; i++ {
+ Sscal(len(y), -1.0, y, 1)
+ }
+}
15 sscal.go
View
@@ -0,0 +1,15 @@
+package blas
+
+// Rescale the vector X by the multiplicative factor alpha
+func Sscal(N int, alpha float32, X []float32, incX int) {
+ var xi int
+ for ; N >= 2; N -= 2 {
+ X[xi] = alpha * X[xi]
+ xi += incX
+ X[xi] = alpha * X[xi]
+ xi += incX
+ }
+ if N != 0 {
+ X[xi] = alpha * X[xi]
+ }
+}
118 sscal_amd64.s
View
@@ -0,0 +1,118 @@
+//func Sscal(N int, alpha float32, X []float32, incX int)
+TEXT ·Sscal(SB), 7, $0
+ MOVL N+0(FP), BP
+ MOVSS alpha+4(FP), X0
+ MOVQ X_data+8(FP), SI
+ MOVL incX+24(FP), AX
+
+ // Check data bounaries
+ MOVL BP, CX
+ DECL CX
+ IMULL AX, CX // CX = incX * (N - 1)
+ CMPL CX, X_len+16(FP)
+ JGE panic
+
+ // Setup stride
+ SALQ $2, AX // AX = sizeof(float32) * incX
+
+ // Check that there are 4 or more pairs for SIMD calculations
+ SUBQ $4, BP
+ JL rest // There are less than 4 values to process
+
+ // Setup four alphas in X0
+ SHUFPS $0, X0, X0
+
+ // Check if incX != 1
+ CMPQ AX, $4
+ JNE with_stride
+
+ // Fully optimized loop (for incX == 1)
+ full_simd_loop:
+ // Load four values and scale
+ MOVUPS (SI), X2
+ MULPS X0, X2
+ // Save scaled values
+ MOVUPS X2, (SI)
+
+ // Update data pointers
+ ADDQ $16, SI
+
+ SUBQ $4, BP
+ JGE full_simd_loop // There are 4 or more pairs to process
+
+ JMP rest
+
+with_stride:
+ // Setup long stride
+ MOVQ AX, CX
+ SALQ $1, CX // CX = 8 * incX
+
+ // Partially optimized loop
+ half_simd_loop:
+ // Load first two values
+ MOVSS (SI), X2
+ MOVSS (SI)(AX*1), X4
+
+ // Create a half-vector
+ UNPCKLPS X4, X2
+
+ // Save data pointer
+ MOVQ SI, DI
+ // Update data pointer using long stride
+ ADDQ CX, SI
+
+ // Load second two values
+ MOVSS (SI), X4
+ MOVSS (SI)(AX*1), X6
+
+ // Create a half-vector
+ UNPCKLPS X6, X4
+
+ // Create a full-vector
+ MOVLHPS X4, X2
+
+ // Scale the full-vector
+ MULPS X0, X2
+
+ // Unvectorize and save the result
+ MOVHLPS X2, X3
+ MOVSS X2, X4
+ MOVSS X3, X5
+ SHUFPS $0xe1, X2, X2
+ SHUFPS $0xe1, X3, X3
+ MOVSS X4, (DI)
+ MOVSS X2, (DI)(AX*1)
+ MOVSS X5, (SI)
+ MOVSS X3, (SI)(AX*1)
+
+ // Update data pointers using long strides
+ ADDQ CX, SI
+
+ SUBQ $4, BP
+ JGE half_simd_loop // There are 4 or more pairs to process
+
+rest:
+ // Undo last SUBQ
+ ADDQ $4, BP
+
+ // Check that are there any value to process
+ JE end
+
+ loop:
+ // Load from X and save scaled
+ MOVSS (SI), X2
+ MULSS X0, X2
+ MOVSS X2, (SI)
+
+ // Update data pointers
+ ADDQ AX, SI
+
+ DECQ BP
+ JNE loop
+
+end:
+ RET
+
+panic:
+ CALL runtime·panicindex(SB)
+ RET
3  sscal_decl.go
View
@@ -0,0 +1,3 @@
+package blas
+
+func Sscal(N int, alpha float32, X []float32, incX int)
Please sign in to comment.
Something went wrong with that request. Please try again.