From 21a09e3bb9a1131b8eb5d86092177bb1487a0907 Mon Sep 17 00:00:00 2001 From: Sigilante Date: Sat, 30 May 2026 12:21:37 -0500 Subject: [PATCH] Finish rot/rotg/rotm to the BLAS reference (C8, part 2) The rotation routines existed but did not compile or build: srot used a struct-vs-int comparison and a uint32_t signature; srotg was WIP with the wrong sign factor (|roe| instead of sign(roe)) and assigned to local pointer copies; srotm had a parenless `if f32_eq(...)`. All were absent from the Makefile. Rewrite srot/srotg/srotm faithfully to the Netlib SROT/SROTG/SROTM reference using SoftFloat ops (loading operands into locals first, since the abs/compare macros don't parenthesize their argument), with NaN canonicalization on the outputs. Generate the float64 and float16 variants and add all nine to the Makefile. test_rot.c covers srot (c=0,s=1 -> (y,-x)), srotg (a=1,b=0; and 3,4 -> r=5), and srotm (flag=0 modified Givens). 155/155 tests pass. (rotmg and sdsdot/hsdot remain; rotmg's rescaling constant gamsq=2^24 overflows float16 and needs handling.) Co-Authored-By: Claude Opus 4.8 --- Makefile | 9 +++ src/blas/level1/drot.c | 33 ++++------- src/blas/level1/drotg.c | 53 +++++++++-------- src/blas/level1/drotm.c | 110 ++++++++-------------------------- src/blas/level1/hrot.c | 31 ++++------ src/blas/level1/hrotg.c | 53 +++++++++-------- src/blas/level1/hrotm.c | 112 +++++++++-------------------------- src/blas/level1/srot.c | 29 +++------ src/blas/level1/srotg.c | 68 +++++++++------------ src/blas/level1/srotm.c | 112 +++++++++-------------------------- tests/blas/include/test.h | 5 ++ tests/blas/level1/test_rot.c | 43 ++++++++++++++ tests/test_all.c | 5 ++ 13 files changed, 261 insertions(+), 402 deletions(-) create mode 100644 tests/blas/level1/test_rot.c diff --git a/Makefile b/Makefile index cfc5208..3eb514d 100644 --- a/Makefile +++ b/Makefile @@ -48,6 +48,15 @@ BLAS_SRCS = \ $(BLAS_SRC_DIR_L1)/cdotc.c \ $(BLAS_SRC_DIR_L1)/scnrm2.c \ $(BLAS_SRC_DIR_L1)/csrot.c \ + $(BLAS_SRC_DIR_L1)/srot.c \ + $(BLAS_SRC_DIR_L1)/drot.c \ + $(BLAS_SRC_DIR_L1)/hrot.c \ + $(BLAS_SRC_DIR_L1)/srotg.c \ + $(BLAS_SRC_DIR_L1)/drotg.c \ + $(BLAS_SRC_DIR_L1)/hrotg.c \ + $(BLAS_SRC_DIR_L1)/srotm.c \ + $(BLAS_SRC_DIR_L1)/drotm.c \ + $(BLAS_SRC_DIR_L1)/hrotm.c \ $(BLAS_SRC_DIR_L2)/sgemv.c \ $(BLAS_SRC_DIR_L2)/dgemv.c \ $(BLAS_SRC_DIR_L2)/hgemv.c \ diff --git a/src/blas/level1/drot.c b/src/blas/level1/drot.c index a12dc3b..82cb686 100644 --- a/src/blas/level1/drot.c +++ b/src/blas/level1/drot.c @@ -1,23 +1,14 @@ -void drot(const uint64_t N, float64_t *X, const uint64_t incX, float64_t *Y, const uint64_t incY, const float64_t c, const float64_t s, const uint_fast8_t rndMode) { - _set_rounding(rndMode); - float64_t tmp; +#include "softblas.h" - if (c != SB_REAL64_ONE || s != SB_REAL64_ZERO) { - if (incX == 1 && incY == 1) { - for (uint64_t i=0; i != N; i++) { - tmp = f64_add(f64_mul(c, X[i]), f64_mul(s, Y[i])); - Y[i] = f64_sub(f64_mul(c, Y[i]), f64_mul(s, X[i])); - X[i] = tmp; - } - } - else - { - for (uint64_t i=N; i; i--, Y += incY, X += incX) - { - tmp = f64_add(f64_mul(c, *X), f64_mul(s, *Y)); - *Y = f64_sub(f64_mul(c, *Y), f64_mul(s, *X)); - *X = tmp; - } - } +// Apply a plane rotation: [x_i; y_i] <- [c s; -s c] [x_i; y_i]. +void drot(const uint64_t N, float64_t *X, const uint64_t incX, float64_t *Y, const uint64_t incY, const float64_t c, const float64_t s, const uint_fast8_t rndMode) { + _set_rounding(rndMode); + uint64_t iX = 0, iY = 0; + for (uint64_t i = 0; i < N; i++) { + float64_t tmp = f64_add(f64_mul(c, X[iX]), f64_mul(s, Y[iY])); + Y[iY] = nan_unify_d(f64_sub(f64_mul(c, Y[iY]), f64_mul(s, X[iX]))); + X[iX] = nan_unify_d(tmp); + iX += incX; + iY += incY; } -} \ No newline at end of file +} diff --git a/src/blas/level1/drotg.c b/src/blas/level1/drotg.c index 11d7dc7..23f944a 100644 --- a/src/blas/level1/drotg.c +++ b/src/blas/level1/drotg.c @@ -1,30 +1,35 @@ #include "softblas.h" +// Construct a Givens rotation that zeroes b: returns c, s, and overwrites +// a with r = +/-sqrt(a^2+b^2) and b with the reconstruction scalar z. +// (The SoftFloat comparison/abs macros do not parenthesize their argument, +// so every operand here is first loaded into a plain local.) void drotg(float64_t *a, float64_t *b, float64_t *c, float64_t *s, const uint_fast8_t rndMode) { _set_rounding(rndMode); - float64_t roe, scal, r, z, aa, ab, t0, t1; + const float64_t ZERO = { SB_REAL64_ZERO }; + const float64_t ONE = { SB_REAL64_ONE }; + const float64_t NEG = { SB_REAL64_NEGONE }; - aa = f64_abs(*a); - ab = f64_abs(*b); - if f64_gt(aa, ab) roe = *a; - else roe = *b; - scal = f64_add(aa, ab); - if f64_ne(scal, SB_REAL64_ZERO) - { - t0 = f64_div(aa, scal); t1 = f64_div(ab, scal); - r = f64_mul(scal, f64_sqrt(f64_add(f64_mul(t0, t0), f64_mul(t1, t1)))); - if f64_le(roe, SB_REAL64_ZERO) r = f64_neg(r); - *c = f64_div(*a, r); - *s = f64_div(*b, r); - if f64_gt(aa, ab) z = *s; - else if f64_ne(*c, SB_REAL64_ZERO) z = f64_div(SB_REAL64_ONE, *c); - else z = SB_REAL64_ONE; - *a = r; - *b = z; - } - else - { - *c = SB_REAL64_ONE; - *s = *a = *b = SB_REAL64_ZERO; - } + float64_t A = *a, B = *b; + float64_t absA = f64_abs(A), absB = f64_abs(B); + float64_t roe = f64_gt(absA, absB) ? A : B; + float64_t scale = f64_add(absA, absB); + float64_t r, z; + + if (f64_eq(scale, ZERO)) { + *c = ONE; *s = ZERO; r = ZERO; z = ZERO; + } else { + float64_t as = f64_div(A, scale); + float64_t bs = f64_div(B, scale); + r = f64_mul(scale, f64_sqrt(f64_add(f64_mul(as, as), f64_mul(bs, bs)))); + r = f64_mul(f64_lt(roe, ZERO) ? NEG : ONE, r); // sign(roe) * |r| + *c = f64_div(A, r); + *s = f64_div(B, r); + float64_t C = *c, S = *s; + z = ONE; + if (f64_gt(absA, absB)) z = S; + if (f64_ge(absB, absA) && f64_ne(C, ZERO)) z = f64_div(ONE, C); + } + *a = r; + *b = z; } diff --git a/src/blas/level1/drotm.c b/src/blas/level1/drotm.c index 22a5bc8..500945d 100644 --- a/src/blas/level1/drotm.c +++ b/src/blas/level1/drotm.c @@ -1,90 +1,32 @@ #include "softblas.h" +// Apply a modified Givens rotation H (selected by the flag P[0]) to the +// pair of vectors X, Y. H is built from P[1..4] as in the reference SROTM. void drotm(const uint64_t N, float64_t *X, const uint64_t incX, float64_t *Y, const uint64_t incY, const float64_t *P, const uint_fast8_t rndMode) { _set_rounding(rndMode); - uint64_t i; - const float64_t flag = *P; - float64_t h11, h21, h12, h22, w, z; + const float64_t flag = P[0]; + const float64_t NEGTWO = { SB_REAL64_NEGTWO }; + const float64_t NEGONE = { SB_REAL64_NEGONE }; + const float64_t ZERO = { SB_REAL64_ZERO }; + const float64_t ONE = { SB_REAL64_ONE }; - if f64_eq(flag, SB_REAL64_NEGTWO) return; - if (f64_eq(flag, SB_REAL64_NEGONE)) - { - h11 = P[1]; h21 = P[2]; h12 = P[3]; h22 = P[4]; - if (incX == 1 && incY == 1) - { - for (i = N; i; i--) - { - w = *X; - z = *Y; - *X++ = f64_add(f64_mul(w, h11), f64_mul(z, h12)); - *Y++ = f64_add(f64_mul(w, h21), f64_mul(z, h22)); - } - } - else - { - for (i = N; i; i--) - { - w = *X; - z = *Y; - *X = f64_add(f64_mul(w, h11), f64_mul(z, h12)); - *Y = f64_add(f64_mul(w, h21), f64_mul(z, h22)); - X += incX; - Y += incY; - } - } - } - else if (f64_eq(flag, SB_REAL64_ZERO)) - { - h21 = P[2]; - h12 = P[3]; - if (incX == 1 && incY == 1) - { - for (i = N; i; i--) - { - w = *X; - z = *Y; - *X++ = f64_add(w, f64_mul(z, h12)); - *Y++ = f64_add(f64_mul(w, h21), z); - } - } - else - { - for (i = N; i; i--) - { - w = *X; - z = *Y; - *X = f64_add(w, f64_mul(z, h12)); - *Y = f64_add(f64_mul(w, h21), z); - X += incX; - Y += incY; - } - } - } - else if (f64_eq(flag, SB_REAL64_ONE)) - { - h11 = P[1]; - h22 = P[4]; - if (incX == 1 && incY == 1) - { - for (i = N; i; i--) - { - w = *X; - z = *Y; - *X++ = f64_add(f64_mul(w, h11), z); - *Y++ = f64_sub(f64_mul(z, h22), w); - } - } - else - { - for (i = N; i; i--) - { - w = *X; - z = *Y; - *X = f64_add(f64_mul(w, h11), z); - *Y = f64_sub(f64_mul(z, h22), w); - X += incX; - Y += incY; - } - } - } + if (f64_eq(flag, NEGTWO)) return; + + float64_t h11, h21, h12, h22; + if (f64_eq(flag, NEGONE)) { + h11 = P[1]; h21 = P[2]; h12 = P[3]; h22 = P[4]; + } else if (f64_eq(flag, ZERO)) { + h11 = ONE; h21 = P[2]; h12 = P[3]; h22 = ONE; + } else { /* flag == 1 */ + h11 = P[1]; h21 = NEGONE; h12 = ONE; h22 = P[4]; + } + + uint64_t iX = 0, iY = 0; + for (uint64_t i = 0; i < N; i++) { + float64_t w = X[iX], z = Y[iY]; + X[iX] = nan_unify_d(f64_add(f64_mul(w, h11), f64_mul(z, h12))); + Y[iY] = nan_unify_d(f64_add(f64_mul(w, h21), f64_mul(z, h22))); + iX += incX; + iY += incY; + } } diff --git a/src/blas/level1/hrot.c b/src/blas/level1/hrot.c index 19456c5..d0bf9f2 100644 --- a/src/blas/level1/hrot.c +++ b/src/blas/level1/hrot.c @@ -1,23 +1,14 @@ -void hrot(const uint16_t N, float16_t *X, const uint16_t incX, float16_t *Y, const uint16_t incY, const float16_t c, const float16_t s) { - _set_rounding(rndMode); - float16_t tmp; +#include "softblas.h" - if (c != SB_REAL32_ONE || s != SB_REAL32_ZERO) { - if (incX == 1 && incY == 1) { - for (uint64_t i=0; i != N; i++) { - tmp = f16_add(f16_mul(c, X[i]), f16_mul(s, Y[i])); - Y[i] = f16_sub(f16_mul(c, Y[i]), f16_mul(s, X[i])); - X[i] = tmp; - } - } - else - { - for (uint64_t i=N; i; i--, Y += incY, X += incX) - { - tmp = f16_add(f16_mul(c, *X), f16_mul(s, *Y)); - *Y = f16_sub(f16_mul(c, *Y), f16_mul(s, *X)); - *X = tmp; - } - } +// Apply a plane rotation: [x_i; y_i] <- [c s; -s c] [x_i; y_i]. +void hrot(const uint64_t N, float16_t *X, const uint64_t incX, float16_t *Y, const uint64_t incY, const float16_t c, const float16_t s, const uint_fast8_t rndMode) { + _set_rounding(rndMode); + uint64_t iX = 0, iY = 0; + for (uint64_t i = 0; i < N; i++) { + float16_t tmp = f16_add(f16_mul(c, X[iX]), f16_mul(s, Y[iY])); + Y[iY] = nan_unify_h(f16_sub(f16_mul(c, Y[iY]), f16_mul(s, X[iX]))); + X[iX] = nan_unify_h(tmp); + iX += incX; + iY += incY; } } diff --git a/src/blas/level1/hrotg.c b/src/blas/level1/hrotg.c index ffe1146..be711c5 100644 --- a/src/blas/level1/hrotg.c +++ b/src/blas/level1/hrotg.c @@ -1,30 +1,35 @@ #include "softblas.h" +// Construct a Givens rotation that zeroes b: returns c, s, and overwrites +// a with r = +/-sqrt(a^2+b^2) and b with the reconstruction scalar z. +// (The SoftFloat comparison/abs macros do not parenthesize their argument, +// so every operand here is first loaded into a plain local.) void hrotg(float16_t *a, float16_t *b, float16_t *c, float16_t *s, const uint_fast8_t rndMode) { _set_rounding(rndMode); - float16_t roe, scal, r, z, aa, ab, t0, t1; + const float16_t ZERO = { SB_REAL16_ZERO }; + const float16_t ONE = { SB_REAL16_ONE }; + const float16_t NEG = { SB_REAL16_NEGONE }; - aa = f16_abs(*a); - ab = f16_abs(*b); - if f16_gt(aa, ab) roe = *a; - else roe = *b; - scal = f16_add(aa, ab); - if f16_ne(scal, SB_REAL16_ZERO) - { - t0 = f16_div(aa, scal); t1 = f16_div(ab, scal); - r = f16_mul(scal, f16_sqrt(f16_add(f16_mul(t0, t0), f16_mul(t1, t1)))); - if f16_le(roe, SB_REAL16_ZERO) r = f16_neg(r); - *c = f16_div(*a, r); - *s = f16_div(*b, r); - if f16_gt(aa, ab) z = *s; - else if f16_ne(*c, SB_REAL16_ZERO) z = f16_div(SB_REAL16_ONE, *c); - else z = SB_REAL16_ONE; - *a = r; - *b = z; - } - else - { - *c = SB_REAL16_ONE; - *s = *a = *b = SB_REAL16_ZERO; - } + float16_t A = *a, B = *b; + float16_t absA = f16_abs(A), absB = f16_abs(B); + float16_t roe = f16_gt(absA, absB) ? A : B; + float16_t scale = f16_add(absA, absB); + float16_t r, z; + + if (f16_eq(scale, ZERO)) { + *c = ONE; *s = ZERO; r = ZERO; z = ZERO; + } else { + float16_t as = f16_div(A, scale); + float16_t bs = f16_div(B, scale); + r = f16_mul(scale, f16_sqrt(f16_add(f16_mul(as, as), f16_mul(bs, bs)))); + r = f16_mul(f16_lt(roe, ZERO) ? NEG : ONE, r); // sign(roe) * |r| + *c = f16_div(A, r); + *s = f16_div(B, r); + float16_t C = *c, S = *s; + z = ONE; + if (f16_gt(absA, absB)) z = S; + if (f16_ge(absB, absA) && f16_ne(C, ZERO)) z = f16_div(ONE, C); + } + *a = r; + *b = z; } diff --git a/src/blas/level1/hrotm.c b/src/blas/level1/hrotm.c index e223f5a..ebfdba9 100644 --- a/src/blas/level1/hrotm.c +++ b/src/blas/level1/hrotm.c @@ -1,90 +1,32 @@ #include "softblas.h" -void hrotm(const uint16_t N, float16_t *X, const uint16_t incX, float16_t *Y, const uint16_t incY, const float16_t *P, const uint_fast8_t rndMode) { +// Apply a modified Givens rotation H (selected by the flag P[0]) to the +// pair of vectors X, Y. H is built from P[1..4] as in the reference SROTM. +void hrotm(const uint64_t N, float16_t *X, const uint64_t incX, float16_t *Y, const uint64_t incY, const float16_t *P, const uint_fast8_t rndMode) { _set_rounding(rndMode); - uint16_t i; - const float16_t flag = *P; - float16_t h11, h21, h12, h22, w, z; + const float16_t flag = P[0]; + const float16_t NEGTWO = { SB_REAL16_NEGTWO }; + const float16_t NEGONE = { SB_REAL16_NEGONE }; + const float16_t ZERO = { SB_REAL16_ZERO }; + const float16_t ONE = { SB_REAL16_ONE }; - if (f16_eq(flag, SB_REAL16_NEGTWO)) return; - if (f16_eq(flag, SB_REAL16_NEGONE)) - { - h11 = P[1]; h21 = P[2]; h12 = P[3]; h22 = P[4]; - if (incX == 1 && incY == 1) - { - for (i = N; i; i--) - { - w = *X; - z = *Y; - *X++ = f16_add(f16_mul(w, h11), f16_mul(z, h12)); - *Y++ = f16_add(f16_mul(w, h21), f16_mul(z, h22)); - } - } - else - { - for (i = N; i; i--) - { - w = *X; - z = *Y; - *X = f16_add(f16_mul(w, h11), f16_mul(z, h12)); - *Y = f16_add(f16_mul(w, h21), f16_mul(z, h22)); - X += incX; - Y += incY; - } - } - } - else if (f16_eq(flag, SB_REAL16_ZERO)) - { - h21 = P[2]; - h12 = P[3]; - if (incX == 1 && incY == 1) - { - for (i = N; i; i--) - { - w = *X; - z = *Y; - *X++ = f16_add(w, f16_mul(z, h12)); - *Y++ = f16_add(f16_mul(w, h21), z); - } - } - else - { - for (i = N; i; i--) - { - w = *X; - z = *Y; - *X = f16_add(w, f16_mul(z, h12)); - *Y = f16_add(f16_mul(w, h21), z); - X += incX; - Y += incY; - } - } - } - else if (f16_eq(flag, SB_REAL16_ONE)) - { - h11 = P[1]; - h22 = P[4]; - if (incX == 1 && incY == 1) - { - for (i = N; i; i--) - { - w = *X; - z = *Y; - *X++ = f16_add(f16_mul(w, h11), z); - *Y++ = f16_sub(f16_mul(z, h22), w); - } - } - else - { - for (i = N; i; i--) - { - w = *X; - z = *Y; - *X = f16_add(f16_mul(w, h11), z); - *Y = f16_sub(f16_mul(z, h22), w); - X += incX; - Y += incY; - } - } - } + if (f16_eq(flag, NEGTWO)) return; + + float16_t h11, h21, h12, h22; + if (f16_eq(flag, NEGONE)) { + h11 = P[1]; h21 = P[2]; h12 = P[3]; h22 = P[4]; + } else if (f16_eq(flag, ZERO)) { + h11 = ONE; h21 = P[2]; h12 = P[3]; h22 = ONE; + } else { /* flag == 1 */ + h11 = P[1]; h21 = NEGONE; h12 = ONE; h22 = P[4]; + } + + uint64_t iX = 0, iY = 0; + for (uint64_t i = 0; i < N; i++) { + float16_t w = X[iX], z = Y[iY]; + X[iX] = nan_unify_h(f16_add(f16_mul(w, h11), f16_mul(z, h12))); + Y[iY] = nan_unify_h(f16_add(f16_mul(w, h21), f16_mul(z, h22))); + iX += incX; + iY += incY; + } } diff --git a/src/blas/level1/srot.c b/src/blas/level1/srot.c index 0c78710..bfdf489 100644 --- a/src/blas/level1/srot.c +++ b/src/blas/level1/srot.c @@ -1,25 +1,14 @@ #include "softblas.h" -void srot(const uint32_t N, float32_t *X, const uint32_t incX, float32_t *Y, const uint32_t incY, const float32_t c, const float32_t s, const uint_fast8_t rndMode) { +// Apply a plane rotation: [x_i; y_i] <- [c s; -s c] [x_i; y_i]. +void srot(const uint64_t N, float32_t *X, const uint64_t incX, float32_t *Y, const uint64_t incY, const float32_t c, const float32_t s, const uint_fast8_t rndMode) { _set_rounding(rndMode); - float32_t tmp; - - if (c != SB_REAL32_ONE || s != SB_REAL32_ZERO) { - if (incX == 1 && incY == 1) { - for (uint64_t i=0; i != N; i++) { - tmp = f32_add(f32_mul(c, X[i]), f32_mul(s, Y[i])); - Y[i] = f32_sub(f32_mul(c, Y[i]), f32_mul(s, X[i])); - X[i] = tmp; - } - } - else - { - for (uint64_t i=N; i; i--, Y += incY, X += incX) - { - tmp = f32_add(f32_mul(c, *X), f32_mul(s, *Y)); - *Y = f32_sub(f32_mul(c, *Y), f32_mul(s, *X)); - *X = tmp; - } - } + uint64_t iX = 0, iY = 0; + for (uint64_t i = 0; i < N; i++) { + float32_t tmp = f32_add(f32_mul(c, X[iX]), f32_mul(s, Y[iY])); + Y[iY] = nan_unify_s(f32_sub(f32_mul(c, Y[iY]), f32_mul(s, X[iX]))); + X[iX] = nan_unify_s(tmp); + iX += incX; + iY += incY; } } diff --git a/src/blas/level1/srotg.c b/src/blas/level1/srotg.c index fa47cad..664963e 100644 --- a/src/blas/level1/srotg.c +++ b/src/blas/level1/srotg.c @@ -1,45 +1,35 @@ #include "softblas.h" -// *** XXX WIP XXX *** - -// Given a vertical matrix containing a and b, computes the values of cos θ and -// sin θ that zero the lower value (b). Returns the value of sin θ in s, the -// value of cos θ in c, and the upper value (r) in a. -void srotg(float32_t *A, float32_t *B, float32_t *C, float32_t *S, const uint_fast8_t rndMode) { +// Construct a Givens rotation that zeroes b: returns c, s, and overwrites +// a with r = +/-sqrt(a^2+b^2) and b with the reconstruction scalar z. +// (The SoftFloat comparison/abs macros do not parenthesize their argument, +// so every operand here is first loaded into a plain local.) +void srotg(float32_t *a, float32_t *b, float32_t *c, float32_t *s, const uint_fast8_t rndMode) { _set_rounding(rndMode); - const float32_t SZERO = { SB_REAL32_ZERO }; - const float32_t SONE = { SB_REAL32_ONE }; - float32_t roe, scale, R, Z; - - if (f32_gt(f32M_abs(A), f32M_abs(B))) { - roe = *A; - } else { - roe = *B; - } - scale = f32_add(f32M_abs(A), f32M_abs(B)); - - if (f32_eq(scale, SZERO)) { - C = &((float32_t){ SB_REAL32_ONE }); - S = &((float32_t){ SB_REAL32_ZERO }); - R = (float32_t){ SB_REAL32_ZERO }; - Z = (float32_t){ SB_REAL32_ZERO }; - A = &R; - B = &Z; - } else { - R = f32_mul(scale, f32_sqrt(f32_add(f32_mul(f32_div(*A, scale), f32_div(*A, scale)), f32_mul(f32_div(*B, scale), f32_div(*B, scale))))); - R = f32_mul(f32_abs(roe), R); + const float32_t ZERO = { SB_REAL32_ZERO }; + const float32_t ONE = { SB_REAL32_ONE }; + const float32_t NEG = { SB_REAL32_NEGONE }; - *C = f32_div(*A, R); - *S = f32_div(*B, R); - Z = SONE; - if (f32_gt(f32M_abs(*A), f32M_abs(*B))) { - Z = *S; - } - if (f32_ge(f32M_abs(*B), f32M_abs(*A)) && f32_ne(*C, SZERO)) { - Z = f32_div(SONE, *C); - } - } + float32_t A = *a, B = *b; + float32_t absA = f32_abs(A), absB = f32_abs(B); + float32_t roe = f32_gt(absA, absB) ? A : B; + float32_t scale = f32_add(absA, absB); + float32_t r, z; - *A = R; - *B = Z; + if (f32_eq(scale, ZERO)) { + *c = ONE; *s = ZERO; r = ZERO; z = ZERO; + } else { + float32_t as = f32_div(A, scale); + float32_t bs = f32_div(B, scale); + r = f32_mul(scale, f32_sqrt(f32_add(f32_mul(as, as), f32_mul(bs, bs)))); + r = f32_mul(f32_lt(roe, ZERO) ? NEG : ONE, r); // sign(roe) * |r| + *c = f32_div(A, r); + *s = f32_div(B, r); + float32_t C = *c, S = *s; + z = ONE; + if (f32_gt(absA, absB)) z = S; + if (f32_ge(absB, absA) && f32_ne(C, ZERO)) z = f32_div(ONE, C); + } + *a = r; + *b = z; } diff --git a/src/blas/level1/srotm.c b/src/blas/level1/srotm.c index 90f88b1..c5484bd 100644 --- a/src/blas/level1/srotm.c +++ b/src/blas/level1/srotm.c @@ -1,90 +1,32 @@ #include "softblas.h" -void srotm(const uint32_t N, float32_t *X, const uint32_t incX, float32_t *Y, const uint32_t incY, const float32_t *P, const uint_fast8_t rndMode) { +// Apply a modified Givens rotation H (selected by the flag P[0]) to the +// pair of vectors X, Y. H is built from P[1..4] as in the reference SROTM. +void srotm(const uint64_t N, float32_t *X, const uint64_t incX, float32_t *Y, const uint64_t incY, const float32_t *P, const uint_fast8_t rndMode) { _set_rounding(rndMode); - uint64_t i; - const float32_t flag = *P; - float32_t h11, h21, h12, h22, w, z; + const float32_t flag = P[0]; + const float32_t NEGTWO = { SB_REAL32_NEGTWO }; + const float32_t NEGONE = { SB_REAL32_NEGONE }; + const float32_t ZERO = { SB_REAL32_ZERO }; + const float32_t ONE = { SB_REAL32_ONE }; - if f32_eq(flag, SB_REAL32_NEGTWO) return; - if (f32_eq(flag, SB_REAL32_NEGONE)) - { - h11 = P[1]; h21 = P[2]; h12 = P[3]; h22 = P[4]; - if (incX == 1 && incY == 1) - { - for (i = N; i; i--) - { - w = *X; - z = *Y; - *X++ = f32_add(f32_mul(w, h11), f32_mul(z, h12)); - *Y++ = f32_add(f32_mul(w, h21), f32_mul(z, h22)); - } - } - else - { - for (i = N; i; i--) - { - w = *X; - z = *Y; - *X = f32_add(f32_mul(w, h11), f32_mul(z, h12)); - *Y = f32_add(f32_mul(w, h21), f32_mul(z, h22)); - X += incX; - Y += incY; - } - } - } - else if (f32_eq(flag, SB_REAL32_ZERO)) - { - h21 = P[2]; - h12 = P[3]; - if (incX == 1 && incY == 1) - { - for (i = N; i; i--) - { - w = *X; - z = *Y; - *X++ = f32_add(w, f32_mul(z, h12)); - *Y++ = f32_add(f32_mul(w, h21), z); - } - } - else - { - for (i = N; i; i--) - { - w = *X; - z = *Y; - *X = f32_add(w, f32_mul(z, h12)); - *Y = f32_add(f32_mul(w, h21), z); - X += incX; - Y += incY; - } - } - } - else if (f32_eq(flag, SB_REAL32_ONE)) - { - h11 = P[1]; - h22 = P[4]; - if (incX == 1 && incY == 1) - { - for (i = N; i; i--) - { - w = *X; - z = *Y; - *X++ = f32_add(f32_mul(w, h11), z); - *Y++ = f32_sub(f32_mul(z, h22), w); - } - } - else - { - for (i = N; i; i--) - { - w = *X; - z = *Y; - *X = f32_add(f32_mul(w, h11), z); - *Y = f32_sub(f32_mul(z, h22), w); - X += incX; - Y += incY; - } - } - } + if (f32_eq(flag, NEGTWO)) return; + + float32_t h11, h21, h12, h22; + if (f32_eq(flag, NEGONE)) { + h11 = P[1]; h21 = P[2]; h12 = P[3]; h22 = P[4]; + } else if (f32_eq(flag, ZERO)) { + h11 = ONE; h21 = P[2]; h12 = P[3]; h22 = ONE; + } else { /* flag == 1 */ + h11 = P[1]; h21 = NEGONE; h12 = ONE; h22 = P[4]; + } + + uint64_t iX = 0, iY = 0; + for (uint64_t i = 0; i < N; i++) { + float32_t w = X[iX], z = Y[iY]; + X[iX] = nan_unify_s(f32_add(f32_mul(w, h11), f32_mul(z, h12))); + Y[iY] = nan_unify_s(f32_add(f32_mul(w, h21), f32_mul(z, h22))); + iX += incX; + iY += incY; + } } diff --git a/tests/blas/include/test.h b/tests/blas/include/test.h index 989be8e..06beba0 100644 --- a/tests/blas/include/test.h +++ b/tests/blas/include/test.h @@ -460,5 +460,10 @@ MunitResult test_cdotc_conj(const MunitParameter params[], void* u); MunitResult test_scasum_basic(const MunitParameter params[], void* u); MunitResult test_scnrm2_basic(const MunitParameter params[], void* u); MunitResult test_csrot_basic(const MunitParameter params[], void* u); +// Rotation (test_rot.c) +MunitResult test_srot_basic(const MunitParameter params[], void* u); +MunitResult test_srotg_basic(const MunitParameter params[], void* u); +MunitResult test_srotg_345(const MunitParameter params[], void* u); +MunitResult test_srotm_basic(const MunitParameter params[], void* u); #endif // TEST_H diff --git a/tests/blas/level1/test_rot.c b/tests/blas/level1/test_rot.c new file mode 100644 index 0000000..f052900 --- /dev/null +++ b/tests/blas/level1/test_rot.c @@ -0,0 +1,43 @@ +#include "test.h" + +// srot: plane rotation with c=0, s=1 maps (x,y) -> (y, -x). +MunitResult test_srot_basic(const MunitParameter params[], void* u) { + const float32_t c = { 0x00000000 }, s = { 0x3f800000 }; // 0, 1 + float32_t* X = svec((float[]){1.0f, 3.0f}, 2); + float32_t* Y = svec((float[]){2.0f, 4.0f}, 2); + srot(2, X, 1, Y, 1, c, s, 'n'); + assert_ulong(X[0].v, ==, 0x40000000u); // 2 + assert_ulong(X[1].v, ==, 0x40800000u); // 4 + assert_ulong(Y[0].v, ==, 0xbf800000u); // -1 + assert_ulong(Y[1].v, ==, 0xc0400000u); // -3 + free(X); free(Y); + return MUNIT_OK; +} +// srotg(a=1, b=0): r=1, z=0, c=1, s=0. +MunitResult test_srotg_basic(const MunitParameter params[], void* u) { + float32_t a = { 0x3f800000 }, b = { 0x00000000 }, c, s; + srotg(&a, &b, &c, &s, 'n'); + assert_ulong(a.v, ==, 0x3f800000u); // r = 1 + assert_ulong(b.v, ==, 0x00000000u); // z = 0 + assert_ulong(c.v, ==, 0x3f800000u); // c = 1 + assert_ulong(s.v, ==, 0x00000000u); // s = 0 + return MUNIT_OK; +} +// srotg(a=3, b=4): r = 5 (|a|<|b| so roe=b>0 -> positive r). +MunitResult test_srotg_345(const MunitParameter params[], void* u) { + float32_t a = { 0x40400000 }, b = { 0x40800000 }, c, s; // 3, 4 + srotg(&a, &b, &c, &s, 'n'); + assert_ulong(a.v, ==, 0x40a00000u); // r = 5 + return MUNIT_OK; +} +// srotm flag=0: H = [[1, h12],[h21, 1]] with h21=h12=1. (2,3) -> (5,5). +MunitResult test_srotm_basic(const MunitParameter params[], void* u) { + float32_t* X = svec((float[]){2.0f}, 1); + float32_t* Y = svec((float[]){3.0f}, 1); + float32_t P[5] = {{0x00000000},{0},{0x3f800000},{0x3f800000},{0}}; // flag=0,h21=1,h12=1 + srotm(1, X, 1, Y, 1, P, 'n'); + assert_ulong(X[0].v, ==, 0x40a00000u); // 5 + assert_ulong(Y[0].v, ==, 0x40a00000u); // 5 + free(X); free(Y); + return MUNIT_OK; +} diff --git a/tests/test_all.c b/tests/test_all.c index 7751049..93ed521 100644 --- a/tests/test_all.c +++ b/tests/test_all.c @@ -51,6 +51,11 @@ int main(int argc, char* argv[MUNIT_ARRAY_PARAM(argc + 1)]) { {"/test_isamax_stride", test_isamax_stride, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}, {"/test_isamax_13542", test_isamax_13542, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}, {"/test_isamax_one", test_isamax_one, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}, + + {"/test_srot_basic", test_srot_basic, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}, + {"/test_srotg_basic", test_srotg_basic, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}, + {"/test_srotg_345", test_srotg_345, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}, + {"/test_srotm_basic", test_srotm_basic, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}, {"/test_dasum_0", test_dasum_0, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}, {"/test_dasum_12345", test_dasum_12345, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL},