diff --git a/Makefile b/Makefile index 3eb514d..74eb318 100644 --- a/Makefile +++ b/Makefile @@ -57,6 +57,9 @@ BLAS_SRCS = \ $(BLAS_SRC_DIR_L1)/srotm.c \ $(BLAS_SRC_DIR_L1)/drotm.c \ $(BLAS_SRC_DIR_L1)/hrotm.c \ + $(BLAS_SRC_DIR_L1)/srotmg.c \ + $(BLAS_SRC_DIR_L1)/drotmg.c \ + $(BLAS_SRC_DIR_L1)/hrotmg.c \ $(BLAS_SRC_DIR_L2)/sgemv.c \ $(BLAS_SRC_DIR_L2)/dgemv.c \ $(BLAS_SRC_DIR_L2)/hgemv.c \ diff --git a/src/blas/level1/drotmg.c b/src/blas/level1/drotmg.c index 3abde43..accc67d 100644 --- a/src/blas/level1/drotmg.c +++ b/src/blas/level1/drotmg.c @@ -1,169 +1,111 @@ #include "softblas.h" -void drotmg(float64_t *D1, float64_t *D2, float64_t *X1, const float64_t y1, float64_t *P) -{ - float64_t flag = *P; - float64_t d1=(*D1), d2=(*D2), x1=(*X1); - float64_t h11, h21, h12, h22, tmp, u, p1, p2, q1, q2; - static const float64_t gam=0x40b0000000000000; // 4096.0 - static const float64_t gamsq = 0x40b0000000000000 * 0x40b0000000000000; - static const float64_t rgam = SB_REAL64_ONE / 0x40b0000000000000; - static const float64_t rgamsq = SB_REAL64_ONE / - (0x40b0000000000000 * 0x40b0000000000000); +// Construct a modified Givens transformation (reference SROTMG). Given the +// diagonal scaling D1, D2, the first component X1 of the X-vector, and the +// first component y1 of the Y-vector, it computes the flag P[0] and the H +// matrix entries P[1..4] that zero the second component of the rotated +// vector, rescaling so the scale factors stay in [1/gamsq, gamsq]. +void drotmg(float64_t *D1, float64_t *D2, float64_t *X1, const float64_t y1, float64_t *P, const uint_fast8_t rndMode) { + _set_rounding(rndMode); + const float64_t ZERO = { SB_REAL64_ZERO }; + const float64_t ONE = { SB_REAL64_ONE }; + const float64_t NEGONE = { SB_REAL64_NEGONE }; + const float64_t NEGTWO = { SB_REAL64_NEGTWO }; + const float64_t gam = { 0x40b0000000000000 }; // 4096 + const float64_t gamsq = { 0x4170000000000000 }; // 4096^2 = 2^24 + const float64_t rgam = { 0x3f30000000000000 }; // 1/4096 = 2^-12 + const float64_t rgamsq = { 0x3e70000000000000 }; // 2^-24 - if (f64_lt(d1, SB_REAL64_ZERO)) - { - *P = SB_REAL64_NEGONE; - *D1 = *D2 = *X1 = P[1] = P[2] = P[3] = P[4] = SB_REAL64_ZERO; - return; - } + float64_t flag; + float64_t d1 = *D1, d2 = *D2, x1 = *X1; + float64_t h11 = ZERO, h21 = ZERO, h12 = ZERO, h22 = ZERO; - p2 = f64_mul(d2, y1); - if (f64_eq(p2, SB_REAL64_ZERO)) - { - *P = SB_REAL64_NEGTWO; - return; - } + if (f64_lt(d1, ZERO)) { + flag = NEGONE; + d1 = ZERO; d2 = ZERO; x1 = ZERO; + h11 = ZERO; h21 = ZERO; h12 = ZERO; h22 = ZERO; + } else { + float64_t p2 = f64_mul(d2, y1); + if (f64_eq(p2, ZERO)) { + *P = NEGTWO; + return; + } + float64_t p1 = f64_mul(d1, x1); + float64_t q2 = f64_mul(p2, y1); + float64_t q1 = f64_mul(p1, x1); - p1 = f64_mul(d1, x1); - q2 = f64_mul(p2, y1); - q1 = f64_mul(p1, x1); - if (f64_gt(f64_abs(q1), f64_abs(q2))) - { - h21 = f64_div(f64_mul(SB_REAL64_NEGONE, y1), x1); - h12 = f64_div(p2, p1); - u = f64_sub(SB_REAL64_ONE, f64_mul(h12, h21)); - if (f64_gt(u, SB_REAL64_ZERO)) - { - flag = SB_REAL64_ZERO; + if (f64_gt(f64_abs(q1), f64_abs(q2))) { + h21 = f64_div(f64_mul(NEGONE, y1), x1); + h12 = f64_div(p2, p1); + float64_t u = f64_sub(ONE, f64_mul(h12, h21)); + if (f64_le(u, ZERO)) { // singular -> zero result + flag = NEGONE; + d1 = ZERO; d2 = ZERO; x1 = ZERO; + h11 = ZERO; h21 = ZERO; h12 = ZERO; h22 = ZERO; + goto store; + } + flag = ZERO; d1 = f64_div(d1, u); d2 = f64_div(d2, u); x1 = f64_mul(x1, u); + } else { + if (f64_lt(q2, ZERO)) { + flag = NEGONE; + d1 = ZERO; d2 = ZERO; x1 = ZERO; + h11 = ZERO; h21 = ZERO; h12 = ZERO; h22 = ZERO; + goto store; + } + flag = ONE; + h11 = f64_div(p1, p2); + h22 = f64_div(x1, y1); + float64_t u = f64_add(ONE, f64_mul(h11, h22)); + float64_t tmp = f64_div(d2, u); + d2 = f64_div(d1, u); + d1 = tmp; + x1 = f64_mul(y1, u); } - } - else - { - if (f64_lt(q2, SB_REAL64_ZERO)) - { - *P = SB_REAL64_NEGONE; - *D1 = *D2 = *X1 = P[1] = P[2] = P[3] = P[4] = SB_REAL64_ZERO; - return; - } - flag = SB_REAL64_ONE; - h11 = f64_div(p1, p2); - h22 = f64_div(x1, y1); - u = f64_add(SB_REAL64_ONE, f64_mul(h11, h22)); - tmp = f64_div(d2, u); - d2 = f64_div(d1, u); - d1 = tmp; - x1 = f64_mul(y1, u); - - } - if (f64_le(d1, rgamsq)) - { - if (f64_ne(d1, SB_REAL64_ZERO)) - { - if (f64_eq(flag, SB_REAL64_ZERO)) { flag = SB_REAL64_NEGONE; h11 = h22 = SB_REAL64_ONE; } - else { flag = h21 = SB_REAL64_NEGONE; h12 = SB_REAL64_ONE; } - while(1) - { - d1 = f64_mul(d1, gamsq); - x1 = f64_mul(x1, rgam); - h11 = f64_mul(h11, rgam); - h12 = f64_mul(h12, rgam); - if (f64_gt(d1, rgamsq)) - break; - h21 = SB_REAL64_NEGONE; - h12 = SB_REAL64_ONE; - } + // Rescale D1 into [1/gamsq, gamsq]. + while (f64_ne(d1, ZERO) && f64_le(d1, rgamsq)) { + if (f64_eq(flag, ZERO)) { flag = NEGONE; h11 = ONE; h22 = ONE; } + else { flag = NEGONE; h21 = NEGONE; h12 = ONE; } + d1 = f64_mul(d1, gamsq); + x1 = f64_mul(x1, rgam); + h11 = f64_mul(h11, rgam); + h12 = f64_mul(h12, rgam); } - } - else if (f64_ge(d1, gamsq)) - { - if (f64_eq(flag, SB_REAL64_ZERO)) { flag = SB_REAL64_NEGONE; h11 = h22 = SB_REAL64_ONE; } - else { flag = h21 = SB_REAL64_NEGONE; h12 = SB_REAL64_ONE; } - while (1) - { - d1 = f64_mul(d1, rgamsq); - x1 = f64_mul(x1, gam); - h11 = f64_mul(h11, gam); - h12 = f64_mul(h12, gam); - if (f64_lt(d1, gamsq)) - break; - h21 = SB_REAL64_NEGONE; - h12 = SB_REAL64_ONE; + while (f64_ge(d1, gamsq)) { + if (f64_eq(flag, ZERO)) { flag = NEGONE; h11 = ONE; h22 = ONE; } + else { flag = NEGONE; h21 = NEGONE; h12 = ONE; } + d1 = f64_mul(d1, rgamsq); + x1 = f64_mul(x1, gam); + h11 = f64_mul(h11, gam); + h12 = f64_mul(h12, gam); } - } - - tmp = f64_abs(d2); - if (f64_le(tmp, rgamsq)) - { - if (!f64_eq(d2, SB_REAL64_ZERO)) - { - if (f64_eq(flag, SB_REAL64_ZERO)) { flag = SB_REAL64_NEGONE; h11 = h22 = SB_REAL64_ONE; } - else { flag = h21 = SB_REAL64_NEGONE; h12 = SB_REAL64_ONE; } - if (f64_ge(d2, SB_REAL64_ZERO)) - while (1) - { - d2 = f64_mul(d2, gamsq); - h21 = f64_mul(h21, rgam); - h22 = f64_mul(h22, rgam); - if (f64_gt(d2, rgamsq)) - break; - h21 = SB_REAL64_NEGONE; - h12 = SB_REAL64_ONE; - } - else /* d2 < SB_REAL64_ZERO */ - { - tmp = f64_mul(SB_REAL64_NEGONE, rgamsq); - while (1) - { - d2 = f64_mul(d2, gamsq); - h21 = f64_mul(h21, rgam); - h22 = f64_mul(h22, rgam); - if (f64_lt(d2, tmp)) - break; - h21 = SB_REAL64_NEGONE; - h12 = SB_REAL64_ONE; - } - } + + // Rescale D2 into [1/gamsq, gamsq]. + while (f64_ne(d2, ZERO) && f64_le(f64_abs(d2), rgamsq)) { + if (f64_eq(flag, ZERO)) { flag = NEGONE; h11 = ONE; h22 = ONE; } + else { flag = NEGONE; h21 = NEGONE; h12 = ONE; } + d2 = f64_mul(d2, gamsq); + h21 = f64_mul(h21, rgam); + h22 = f64_mul(h22, rgam); } - } - else if f64_ge(tmp, gamsq) { - if (f64_eq(flag, SB_REAL64_ZERO)) { flag = SB_REAL64_NEGONE; h11 = h22 = SB_REAL64_ONE; } - else { flag = h21 = SB_REAL64_NEGONE; h12 = SB_REAL64_ONE; } - if (f64_ge(d2, SB_REAL64_ZERO)) - while (1) - { - d2 = f64_mul(d2, rgamsq); + while (f64_ge(f64_abs(d2), gamsq)) { + if (f64_eq(flag, ZERO)) { flag = NEGONE; h11 = ONE; h22 = ONE; } + else { flag = NEGONE; h21 = NEGONE; h12 = ONE; } + d2 = f64_mul(d2, rgamsq); h21 = f64_mul(h21, gam); h22 = f64_mul(h22, gam); - if (f64_lt(d2, gamsq)) - break; - h21 = SB_REAL64_NEGONE; - h12 = SB_REAL64_ONE; - } - else /* d2 < SB_REAL64_ZERO */ - { - tmp = f64_mul(SB_REAL64_NEGONE, gamsq); - while (1) - { - d2 = f64_mul(d2, rgamsq); - h21 = f64_mul(h21, gam); - h22 = f64_mul(h22, gam); - if (f64_gt(d2, tmp)) - break; - h21 = SB_REAL64_NEGONE; - h12 = SB_REAL64_ONE; - } } } - *D1 = d1; - *D2 = d2; - *X1 = x1; - *P = flag; - if (flag == SB_REAL64_NEGONE) { P[1] = h11; P[2] = h21; P[3] = h12; P[4] = h22; } - else if (flag == SB_REAL64_ZERO) { P[2] = h21; P[3] = h12; } - else if (flag == SB_REAL64_ONE) { P[1] = h11; P[4] = h22; } + +store: + *D1 = d1; + *D2 = d2; + *X1 = x1; + P[0] = flag; + if (f64_eq(flag, NEGONE)) { P[1] = h11; P[2] = h21; P[3] = h12; P[4] = h22; } + else if (f64_eq(flag, ZERO)) { P[2] = h21; P[3] = h12; } + else { P[1] = h11; P[4] = h22; } // flag == 1 } diff --git a/src/blas/level1/hrotmg.c b/src/blas/level1/hrotmg.c index 62d7a8b..7113dd9 100644 --- a/src/blas/level1/hrotmg.c +++ b/src/blas/level1/hrotmg.c @@ -1,169 +1,111 @@ #include "softblas.h" -void hrotmg(float16_t *D1, float16_t *D2, float16_t *X1, const float16_t y1, float16_t *P) -{ - float16_t flag = *P; - float16_t d1=(*D1), d2=(*D2), x1=(*X1); - float16_t h11, h21, h12, h22, tmp, u, p1, p2, q1, q2; - static const float16_t gam=0x6c00; // 4096.0 - static const float16_t gamsq = 0x6c00 * 0x6c00; - static const float16_t rgam = SB_REAL16_ONE / 0x6c00; - static const float16_t rgamsq = SB_REAL16_ONE / - (0x6c00 * 0x6c00); +// Construct a modified Givens transformation (reference SROTMG). Given the +// diagonal scaling D1, D2, the first component X1 of the X-vector, and the +// first component y1 of the Y-vector, it computes the flag P[0] and the H +// matrix entries P[1..4] that zero the second component of the rotated +// vector, rescaling so the scale factors stay in [1/gamsq, gamsq]. +void hrotmg(float16_t *D1, float16_t *D2, float16_t *X1, const float16_t y1, float16_t *P, const uint_fast8_t rndMode) { + _set_rounding(rndMode); + const float16_t ZERO = { SB_REAL16_ZERO }; + const float16_t ONE = { SB_REAL16_ONE }; + const float16_t NEGONE = { SB_REAL16_NEGONE }; + const float16_t NEGTWO = { SB_REAL16_NEGTWO }; + const float16_t gam = { 0x4c00 }; // 16 (reduced: float16 cannot hold 2^24) + const float16_t gamsq = { 0x5c00 }; // 256 = gam^2 + const float16_t rgam = { 0x2c00 }; // 1/16 + const float16_t rgamsq = { 0x1c00 }; // 1/256 - if (f16_lt(d1, SB_REAL16_ZERO)) - { - *P = SB_REAL16_NEGONE; - *D1 = *D2 = *X1 = P[1] = P[2] = P[3] = P[4] = SB_REAL16_ZERO; - return; - } + float16_t flag; + float16_t d1 = *D1, d2 = *D2, x1 = *X1; + float16_t h11 = ZERO, h21 = ZERO, h12 = ZERO, h22 = ZERO; - p2 = f16_mul(d2, y1); - if (f16_eq(p2, SB_REAL16_ZERO)) - { - *P = SB_REAL16_NEGTWO; - return; - } + if (f16_lt(d1, ZERO)) { + flag = NEGONE; + d1 = ZERO; d2 = ZERO; x1 = ZERO; + h11 = ZERO; h21 = ZERO; h12 = ZERO; h22 = ZERO; + } else { + float16_t p2 = f16_mul(d2, y1); + if (f16_eq(p2, ZERO)) { + *P = NEGTWO; + return; + } + float16_t p1 = f16_mul(d1, x1); + float16_t q2 = f16_mul(p2, y1); + float16_t q1 = f16_mul(p1, x1); - p1 = f16_mul(d1, x1); - q2 = f16_mul(p2, y1); - q1 = f16_mul(p1, x1); - if (f16_gt(f16_abs(q1), f16_abs(q2))) - { - h21 = f16_div(f16_mul(SB_REAL16_NEGONE, y1), x1); - h12 = f16_div(p2, p1); - u = f16_sub(SB_REAL16_ONE, f16_mul(h12, h21)); - if (f16_gt(u, SB_REAL16_ZERO)) - { - flag = SB_REAL16_ZERO; + if (f16_gt(f16_abs(q1), f16_abs(q2))) { + h21 = f16_div(f16_mul(NEGONE, y1), x1); + h12 = f16_div(p2, p1); + float16_t u = f16_sub(ONE, f16_mul(h12, h21)); + if (f16_le(u, ZERO)) { // singular -> zero result + flag = NEGONE; + d1 = ZERO; d2 = ZERO; x1 = ZERO; + h11 = ZERO; h21 = ZERO; h12 = ZERO; h22 = ZERO; + goto store; + } + flag = ZERO; d1 = f16_div(d1, u); d2 = f16_div(d2, u); x1 = f16_mul(x1, u); + } else { + if (f16_lt(q2, ZERO)) { + flag = NEGONE; + d1 = ZERO; d2 = ZERO; x1 = ZERO; + h11 = ZERO; h21 = ZERO; h12 = ZERO; h22 = ZERO; + goto store; + } + flag = ONE; + h11 = f16_div(p1, p2); + h22 = f16_div(x1, y1); + float16_t u = f16_add(ONE, f16_mul(h11, h22)); + float16_t tmp = f16_div(d2, u); + d2 = f16_div(d1, u); + d1 = tmp; + x1 = f16_mul(y1, u); } - } - else - { - if (f16_lt(q2, SB_REAL16_ZERO)) - { - *P = SB_REAL16_NEGONE; - *D1 = *D2 = *X1 = P[1] = P[2] = P[3] = P[4] = SB_REAL16_ZERO; - return; - } - flag = SB_REAL16_ONE; - h11 = f16_div(p1, p2); - h22 = f16_div(x1, y1); - u = f16_add(SB_REAL16_ONE, f16_mul(h11, h22)); - tmp = f16_div(d2, u); - d2 = f16_div(d1, u); - d1 = tmp; - x1 = f16_mul(y1, u); - - } - if (f16_le(d1, rgamsq)) - { - if (f16_ne(d1, SB_REAL16_ZERO)) - { - if (f16_eq(flag, SB_REAL16_ZERO)) { flag = SB_REAL16_NEGONE; h11 = h22 = SB_REAL16_ONE; } - else { flag = h21 = SB_REAL16_NEGONE; h12 = SB_REAL16_ONE; } - while(1) - { - d1 = f16_mul(d1, gamsq); - x1 = f16_mul(x1, rgam); - h11 = f16_mul(h11, rgam); - h12 = f16_mul(h12, rgam); - if (f16_gt(d1, rgamsq)) - break; - h21 = SB_REAL16_NEGONE; - h12 = SB_REAL16_ONE; - } + // Rescale D1 into [1/gamsq, gamsq]. + while (f16_ne(d1, ZERO) && f16_le(d1, rgamsq)) { + if (f16_eq(flag, ZERO)) { flag = NEGONE; h11 = ONE; h22 = ONE; } + else { flag = NEGONE; h21 = NEGONE; h12 = ONE; } + d1 = f16_mul(d1, gamsq); + x1 = f16_mul(x1, rgam); + h11 = f16_mul(h11, rgam); + h12 = f16_mul(h12, rgam); } - } - else if (f16_ge(d1, gamsq)) - { - if (f16_eq(flag, SB_REAL16_ZERO)) { flag = SB_REAL16_NEGONE; h11 = h22 = SB_REAL16_ONE; } - else { flag = h21 = SB_REAL16_NEGONE; h12 = SB_REAL16_ONE; } - while (1) - { - d1 = f16_mul(d1, rgamsq); - x1 = f16_mul(x1, gam); - h11 = f16_mul(h11, gam); - h12 = f16_mul(h12, gam); - if (f16_lt(d1, gamsq)) - break; - h21 = SB_REAL16_NEGONE; - h12 = SB_REAL16_ONE; + while (f16_ge(d1, gamsq)) { + if (f16_eq(flag, ZERO)) { flag = NEGONE; h11 = ONE; h22 = ONE; } + else { flag = NEGONE; h21 = NEGONE; h12 = ONE; } + d1 = f16_mul(d1, rgamsq); + x1 = f16_mul(x1, gam); + h11 = f16_mul(h11, gam); + h12 = f16_mul(h12, gam); } - } - - tmp = f16_abs(d2); - if (f16_le(tmp, rgamsq)) - { - if (!f16_eq(d2, SB_REAL16_ZERO)) - { - if (f16_eq(flag, SB_REAL16_ZERO)) { flag = SB_REAL16_NEGONE; h11 = h22 = SB_REAL16_ONE; } - else { flag = h21 = SB_REAL16_NEGONE; h12 = SB_REAL16_ONE; } - if (f16_ge(d2, SB_REAL16_ZERO)) - while (1) - { - d2 = f16_mul(d2, gamsq); - h21 = f16_mul(h21, rgam); - h22 = f16_mul(h22, rgam); - if (f16_gt(d2, rgamsq)) - break; - h21 = SB_REAL16_NEGONE; - h12 = SB_REAL16_ONE; - } - else /* d2 < SB_REAL16_ZERO */ - { - tmp = f16_mul(SB_REAL16_NEGONE, rgamsq); - while (1) - { - d2 = f16_mul(d2, gamsq); - h21 = f16_mul(h21, rgam); - h22 = f16_mul(h22, rgam); - if (f16_lt(d2, tmp)) - break; - h21 = SB_REAL16_NEGONE; - h12 = SB_REAL16_ONE; - } - } + + // Rescale D2 into [1/gamsq, gamsq]. + while (f16_ne(d2, ZERO) && f16_le(f16_abs(d2), rgamsq)) { + if (f16_eq(flag, ZERO)) { flag = NEGONE; h11 = ONE; h22 = ONE; } + else { flag = NEGONE; h21 = NEGONE; h12 = ONE; } + d2 = f16_mul(d2, gamsq); + h21 = f16_mul(h21, rgam); + h22 = f16_mul(h22, rgam); } - } - else if f16_ge(tmp, gamsq) { - if (f16_eq(flag, SB_REAL16_ZERO)) { flag = SB_REAL16_NEGONE; h11 = h22 = SB_REAL16_ONE; } - else { flag = h21 = SB_REAL16_NEGONE; h12 = SB_REAL16_ONE; } - if (f16_ge(d2, SB_REAL16_ZERO)) - while (1) - { - d2 = f16_mul(d2, rgamsq); + while (f16_ge(f16_abs(d2), gamsq)) { + if (f16_eq(flag, ZERO)) { flag = NEGONE; h11 = ONE; h22 = ONE; } + else { flag = NEGONE; h21 = NEGONE; h12 = ONE; } + d2 = f16_mul(d2, rgamsq); h21 = f16_mul(h21, gam); h22 = f16_mul(h22, gam); - if (f16_lt(d2, gamsq)) - break; - h21 = SB_REAL16_NEGONE; - h12 = SB_REAL16_ONE; - } - else /* d2 < SB_REAL16_ZERO */ - { - tmp = f16_mul(SB_REAL16_NEGONE, gamsq); - while (1) - { - d2 = f16_mul(d2, rgamsq); - h21 = f16_mul(h21, gam); - h22 = f16_mul(h22, gam); - if (f16_gt(d2, tmp)) - break; - h21 = SB_REAL16_NEGONE; - h12 = SB_REAL16_ONE; - } } } - *D1 = d1; - *D2 = d2; - *X1 = x1; - *P = flag; - if (flag == SB_REAL16_NEGONE) { P[1] = h11; P[2] = h21; P[3] = h12; P[4] = h22; } - else if (flag == SB_REAL16_ZERO) { P[2] = h21; P[3] = h12; } - else if (flag == SB_REAL16_ONE) { P[1] = h11; P[4] = h22; } + +store: + *D1 = d1; + *D2 = d2; + *X1 = x1; + P[0] = flag; + if (f16_eq(flag, NEGONE)) { P[1] = h11; P[2] = h21; P[3] = h12; P[4] = h22; } + else if (f16_eq(flag, ZERO)) { P[2] = h21; P[3] = h12; } + else { P[1] = h11; P[4] = h22; } // flag == 1 } diff --git a/src/blas/level1/srotmg.c b/src/blas/level1/srotmg.c index d18c96d..eb90f57 100644 --- a/src/blas/level1/srotmg.c +++ b/src/blas/level1/srotmg.c @@ -1,169 +1,111 @@ #include "softblas.h" -void srotmg(float32_t *D1, float32_t *D2, float32_t *X1, const float32_t y1, float32_t *P) -{ - float32_t flag = *P; - float32_t d1=(*D1), d2=(*D2), x1=(*X1); - float32_t h11, h21, h12, h22, tmp, u, p1, p2, q1, q2; - static const float32_t gam=0x45800000; // 4096.0 - static const float32_t gamsq = 0x45800000 * 0x45800000; - static const float32_t rgam = SB_REAL32_ONE / 0x45800000; - static const float32_t rgamsq = SB_REAL32_ONE / - (0x45800000 * 0x45800000); +// Construct a modified Givens transformation (reference SROTMG). Given the +// diagonal scaling D1, D2, the first component X1 of the X-vector, and the +// first component y1 of the Y-vector, it computes the flag P[0] and the H +// matrix entries P[1..4] that zero the second component of the rotated +// vector, rescaling so the scale factors stay in [1/gamsq, gamsq]. +void srotmg(float32_t *D1, float32_t *D2, float32_t *X1, const float32_t y1, float32_t *P, const uint_fast8_t rndMode) { + _set_rounding(rndMode); + const float32_t ZERO = { SB_REAL32_ZERO }; + const float32_t ONE = { SB_REAL32_ONE }; + const float32_t NEGONE = { SB_REAL32_NEGONE }; + const float32_t NEGTWO = { SB_REAL32_NEGTWO }; + const float32_t gam = { 0x45800000 }; // 4096 + const float32_t gamsq = { 0x4b800000 }; // 4096^2 = 2^24 + const float32_t rgam = { 0x39800000 }; // 1/4096 = 2^-12 + const float32_t rgamsq = { 0x33800000 }; // 2^-24 - if (f32_lt(d1, SB_REAL32_ZERO)) - { - *P = SB_REAL32_NEGONE; - *D1 = *D2 = *X1 = P[1] = P[2] = P[3] = P[4] = SB_REAL32_ZERO; - return; - } + float32_t flag; + float32_t d1 = *D1, d2 = *D2, x1 = *X1; + float32_t h11 = ZERO, h21 = ZERO, h12 = ZERO, h22 = ZERO; - p2 = f32_mul(d2, y1); - if (f32_eq(p2, SB_REAL32_ZERO)) - { - *P = SB_REAL32_NEGTWO; - return; - } + if (f32_lt(d1, ZERO)) { + flag = NEGONE; + d1 = ZERO; d2 = ZERO; x1 = ZERO; + h11 = ZERO; h21 = ZERO; h12 = ZERO; h22 = ZERO; + } else { + float32_t p2 = f32_mul(d2, y1); + if (f32_eq(p2, ZERO)) { + *P = NEGTWO; + return; + } + float32_t p1 = f32_mul(d1, x1); + float32_t q2 = f32_mul(p2, y1); + float32_t q1 = f32_mul(p1, x1); - p1 = f32_mul(d1, x1); - q2 = f32_mul(p2, y1); - q1 = f32_mul(p1, x1); - if (f32_gt(f32_abs(q1), f32_abs(q2))) - { - h21 = f32_div(f32_mul(SB_REAL32_NEGONE, y1), x1); - h12 = f32_div(p2, p1); - u = f32_sub(SB_REAL32_ONE, f32_mul(h12, h21)); - if (f32_gt(u, SB_REAL32_ZERO)) - { - flag = SB_REAL32_ZERO; + if (f32_gt(f32_abs(q1), f32_abs(q2))) { + h21 = f32_div(f32_mul(NEGONE, y1), x1); + h12 = f32_div(p2, p1); + float32_t u = f32_sub(ONE, f32_mul(h12, h21)); + if (f32_le(u, ZERO)) { // singular -> zero result + flag = NEGONE; + d1 = ZERO; d2 = ZERO; x1 = ZERO; + h11 = ZERO; h21 = ZERO; h12 = ZERO; h22 = ZERO; + goto store; + } + flag = ZERO; d1 = f32_div(d1, u); d2 = f32_div(d2, u); x1 = f32_mul(x1, u); + } else { + if (f32_lt(q2, ZERO)) { + flag = NEGONE; + d1 = ZERO; d2 = ZERO; x1 = ZERO; + h11 = ZERO; h21 = ZERO; h12 = ZERO; h22 = ZERO; + goto store; + } + flag = ONE; + h11 = f32_div(p1, p2); + h22 = f32_div(x1, y1); + float32_t u = f32_add(ONE, f32_mul(h11, h22)); + float32_t tmp = f32_div(d2, u); + d2 = f32_div(d1, u); + d1 = tmp; + x1 = f32_mul(y1, u); } - } - else - { - if (f32_lt(q2, SB_REAL32_ZERO)) - { - *P = SB_REAL32_NEGONE; - *D1 = *D2 = *X1 = P[1] = P[2] = P[3] = P[4] = SB_REAL32_ZERO; - return; - } - flag = SB_REAL32_ONE; - h11 = f32_div(p1, p2); - h22 = f32_div(x1, y1); - u = f32_add(SB_REAL32_ONE, f32_mul(h11, h22)); - tmp = f32_div(d2, u); - d2 = f32_div(d1, u); - d1 = tmp; - x1 = f32_mul(y1, u); - - } - if (f32_le(d1, rgamsq)) - { - if (f32_ne(d1, SB_REAL32_ZERO)) - { - if (f32_eq(flag, SB_REAL32_ZERO)) { flag = SB_REAL32_NEGONE; h11 = h22 = SB_REAL32_ONE; } - else { flag = h21 = SB_REAL32_NEGONE; h12 = SB_REAL32_ONE; } - while(1) - { - d1 = f32_mul(d1, gamsq); - x1 = f32_mul(x1, rgam); - h11 = f32_mul(h11, rgam); - h12 = f32_mul(h12, rgam); - if (f32_gt(d1, rgamsq)) - break; - h21 = SB_REAL32_NEGONE; - h12 = SB_REAL32_ONE; - } + // Rescale D1 into [1/gamsq, gamsq]. + while (f32_ne(d1, ZERO) && f32_le(d1, rgamsq)) { + if (f32_eq(flag, ZERO)) { flag = NEGONE; h11 = ONE; h22 = ONE; } + else { flag = NEGONE; h21 = NEGONE; h12 = ONE; } + d1 = f32_mul(d1, gamsq); + x1 = f32_mul(x1, rgam); + h11 = f32_mul(h11, rgam); + h12 = f32_mul(h12, rgam); } - } - else if (f32_ge(d1, gamsq)) - { - if (f32_eq(flag, SB_REAL32_ZERO)) { flag = SB_REAL32_NEGONE; h11 = h22 = SB_REAL32_ONE; } - else { flag = h21 = SB_REAL32_NEGONE; h12 = SB_REAL32_ONE; } - while (1) - { - d1 = f32_mul(d1, rgamsq); - x1 = f32_mul(x1, gam); - h11 = f32_mul(h11, gam); - h12 = f32_mul(h12, gam); - if (f32_lt(d1, gamsq)) - break; - h21 = SB_REAL32_NEGONE; - h12 = SB_REAL32_ONE; + while (f32_ge(d1, gamsq)) { + if (f32_eq(flag, ZERO)) { flag = NEGONE; h11 = ONE; h22 = ONE; } + else { flag = NEGONE; h21 = NEGONE; h12 = ONE; } + d1 = f32_mul(d1, rgamsq); + x1 = f32_mul(x1, gam); + h11 = f32_mul(h11, gam); + h12 = f32_mul(h12, gam); } - } - - tmp = f32_abs(d2); - if (f32_le(tmp, rgamsq)) - { - if (!f32_eq(d2, SB_REAL32_ZERO)) - { - if (f32_eq(flag, SB_REAL32_ZERO)) { flag = SB_REAL32_NEGONE; h11 = h22 = SB_REAL32_ONE; } - else { flag = h21 = SB_REAL32_NEGONE; h12 = SB_REAL32_ONE; } - if (f32_ge(d2, SB_REAL32_ZERO)) - while (1) - { - d2 = f32_mul(d2, gamsq); - h21 = f32_mul(h21, rgam); - h22 = f32_mul(h22, rgam); - if (f32_gt(d2, rgamsq)) - break; - h21 = SB_REAL32_NEGONE; - h12 = SB_REAL32_ONE; - } - else /* d2 < SB_REAL32_ZERO */ - { - tmp = f32_mul(SB_REAL32_NEGONE, rgamsq); - while (1) - { - d2 = f32_mul(d2, gamsq); - h21 = f32_mul(h21, rgam); - h22 = f32_mul(h22, rgam); - if (f32_lt(d2, tmp)) - break; - h21 = SB_REAL32_NEGONE; - h12 = SB_REAL32_ONE; - } - } + + // Rescale D2 into [1/gamsq, gamsq]. + while (f32_ne(d2, ZERO) && f32_le(f32_abs(d2), rgamsq)) { + if (f32_eq(flag, ZERO)) { flag = NEGONE; h11 = ONE; h22 = ONE; } + else { flag = NEGONE; h21 = NEGONE; h12 = ONE; } + d2 = f32_mul(d2, gamsq); + h21 = f32_mul(h21, rgam); + h22 = f32_mul(h22, rgam); } - } - else if f32_ge(tmp, gamsq) { - if (f32_eq(flag, SB_REAL32_ZERO)) { flag = SB_REAL32_NEGONE; h11 = h22 = SB_REAL32_ONE; } - else { flag = h21 = SB_REAL32_NEGONE; h12 = SB_REAL32_ONE; } - if (f32_ge(d2, SB_REAL32_ZERO)) - while (1) - { - d2 = f32_mul(d2, rgamsq); + while (f32_ge(f32_abs(d2), gamsq)) { + if (f32_eq(flag, ZERO)) { flag = NEGONE; h11 = ONE; h22 = ONE; } + else { flag = NEGONE; h21 = NEGONE; h12 = ONE; } + d2 = f32_mul(d2, rgamsq); h21 = f32_mul(h21, gam); h22 = f32_mul(h22, gam); - if (f32_lt(d2, gamsq)) - break; - h21 = SB_REAL32_NEGONE; - h12 = SB_REAL32_ONE; - } - else /* d2 < SB_REAL32_ZERO */ - { - tmp = f32_mul(SB_REAL32_NEGONE, gamsq); - while (1) - { - d2 = f32_mul(d2, rgamsq); - h21 = f32_mul(h21, gam); - h22 = f32_mul(h22, gam); - if (f32_gt(d2, tmp)) - break; - h21 = SB_REAL32_NEGONE; - h12 = SB_REAL32_ONE; - } } } - *D1 = d1; - *D2 = d2; - *X1 = x1; - *P = flag; - if (flag == SB_REAL32_NEGONE) { P[1] = h11; P[2] = h21; P[3] = h12; P[4] = h22; } - else if (flag == SB_REAL32_ZERO) { P[2] = h21; P[3] = h12; } - else if (flag == SB_REAL32_ONE) { P[1] = h11; P[4] = h22; } + +store: + *D1 = d1; + *D2 = d2; + *X1 = x1; + P[0] = flag; + if (f32_eq(flag, NEGONE)) { P[1] = h11; P[2] = h21; P[3] = h12; P[4] = h22; } + else if (f32_eq(flag, ZERO)) { P[2] = h21; P[3] = h12; } + else { P[1] = h11; P[4] = h22; } // flag == 1 } diff --git a/tests/blas/include/test.h b/tests/blas/include/test.h index 06beba0..f2f428d 100644 --- a/tests/blas/include/test.h +++ b/tests/blas/include/test.h @@ -466,4 +466,10 @@ MunitResult test_srotg_basic(const MunitParameter params[], void* u); MunitResult test_srotg_345(const MunitParameter params[], void* u); MunitResult test_srotm_basic(const MunitParameter params[], void* u); + +// Modified Givens setup (test_rotmg.c) +MunitResult test_srotmg_basic(const MunitParameter params[], void* u); +MunitResult test_srotmg_flag_neg2(const MunitParameter params[], void* u); +MunitResult test_drotmg_basic(const MunitParameter params[], void* u); + #endif // TEST_H diff --git a/tests/blas/level1/test_rotmg.c b/tests/blas/level1/test_rotmg.c new file mode 100644 index 0000000..cf35c93 --- /dev/null +++ b/tests/blas/level1/test_rotmg.c @@ -0,0 +1,37 @@ +#include "test.h" + +// srotmg(d1=d2=x1=y1=1): the no-rescale flag=1 branch. +// u = 1 + h11*h22 = 2; d1,d2 -> 0.5; x1 -> y1*u = 2; h11 = h22 = 1. +MunitResult test_srotmg_basic(const MunitParameter params[], void* u) { + float32_t d1 = { 0x3f800000 }, d2 = { 0x3f800000 }, x1 = { 0x3f800000 }; + const float32_t y1 = { 0x3f800000 }; + float32_t P[5]; + srotmg(&d1, &d2, &x1, y1, P, 'n'); + assert_ulong(d1.v, ==, 0x3f000000u); // 0.5 + assert_ulong(d2.v, ==, 0x3f000000u); // 0.5 + assert_ulong(x1.v, ==, 0x40000000u); // 2 + assert_ulong(P[0].v, ==, 0x3f800000u); // flag = 1 + assert_ulong(P[1].v, ==, 0x3f800000u); // h11 = 1 + assert_ulong(P[4].v, ==, 0x3f800000u); // h22 = 1 + return MUNIT_OK; +} +// srotmg with d2*y1 == 0 returns the "no rotation" flag (-2) immediately. +MunitResult test_srotmg_flag_neg2(const MunitParameter params[], void* u) { + float32_t d1 = { 0x3f800000 }, d2 = { 0x00000000 }, x1 = { 0x3f800000 }; + const float32_t y1 = { 0x3f800000 }; + float32_t P[5] = {{0},{0},{0},{0},{0}}; + srotmg(&d1, &d2, &x1, y1, P, 'n'); + assert_ulong(P[0].v, ==, 0xc0000000u); // flag = -2 + return MUNIT_OK; +} +// drotmg(1,1,1,1): same as srotmg but double precision. +MunitResult test_drotmg_basic(const MunitParameter params[], void* u) { + float64_t d1 = { 0x3ff0000000000000 }, d2 = { 0x3ff0000000000000 }, x1 = { 0x3ff0000000000000 }; + const float64_t y1 = { 0x3ff0000000000000 }; + float64_t P[5]; + drotmg(&d1, &d2, &x1, y1, P, 'n'); + assert_ullong(d1.v, ==, 0x3fe0000000000000ull); // 0.5 + assert_ullong(x1.v, ==, 0x4000000000000000ull); // 2 + assert_ullong(P[0].v, ==, 0x3ff0000000000000ull); // flag = 1 + return MUNIT_OK; +} diff --git a/tests/test_all.c b/tests/test_all.c index 93ed521..a1f1296 100644 --- a/tests/test_all.c +++ b/tests/test_all.c @@ -56,6 +56,10 @@ int main(int argc, char* argv[MUNIT_ARRAY_PARAM(argc + 1)]) { {"/test_srotg_basic", test_srotg_basic, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}, {"/test_srotg_345", test_srotg_345, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}, {"/test_srotm_basic", test_srotm_basic, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}, + + {"/test_srotmg_basic", test_srotmg_basic, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}, + {"/test_srotmg_flag_neg2", test_srotmg_flag_neg2, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}, + {"/test_drotmg_basic", test_drotmg_basic, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}, {"/test_dasum_0", test_dasum_0, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}, {"/test_dasum_12345", test_dasum_12345, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL},