Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,9 @@ BLAS_SRCS = \
$(BLAS_SRC_DIR_L1)/srotm.c \
$(BLAS_SRC_DIR_L1)/drotm.c \
$(BLAS_SRC_DIR_L1)/hrotm.c \
$(BLAS_SRC_DIR_L1)/srotmg.c \
$(BLAS_SRC_DIR_L1)/drotmg.c \
$(BLAS_SRC_DIR_L1)/hrotmg.c \
$(BLAS_SRC_DIR_L2)/sgemv.c \
$(BLAS_SRC_DIR_L2)/dgemv.c \
$(BLAS_SRC_DIR_L2)/hgemv.c \
Expand Down
244 changes: 93 additions & 151 deletions src/blas/level1/drotmg.c
Original file line number Diff line number Diff line change
@@ -1,169 +1,111 @@
#include "softblas.h"

void drotmg(float64_t *D1, float64_t *D2, float64_t *X1, const float64_t y1, float64_t *P)
{
float64_t flag = *P;
float64_t d1=(*D1), d2=(*D2), x1=(*X1);
float64_t h11, h21, h12, h22, tmp, u, p1, p2, q1, q2;
static const float64_t gam=0x40b0000000000000; // 4096.0
static const float64_t gamsq = 0x40b0000000000000 * 0x40b0000000000000;
static const float64_t rgam = SB_REAL64_ONE / 0x40b0000000000000;
static const float64_t rgamsq = SB_REAL64_ONE /
(0x40b0000000000000 * 0x40b0000000000000);
// Construct a modified Givens transformation (reference SROTMG). Given the
// diagonal scaling D1, D2, the first component X1 of the X-vector, and the
// first component y1 of the Y-vector, it computes the flag P[0] and the H
// matrix entries P[1..4] that zero the second component of the rotated
// vector, rescaling so the scale factors stay in [1/gamsq, gamsq].
void drotmg(float64_t *D1, float64_t *D2, float64_t *X1, const float64_t y1, float64_t *P, const uint_fast8_t rndMode) {
_set_rounding(rndMode);
const float64_t ZERO = { SB_REAL64_ZERO };
const float64_t ONE = { SB_REAL64_ONE };
const float64_t NEGONE = { SB_REAL64_NEGONE };
const float64_t NEGTWO = { SB_REAL64_NEGTWO };
const float64_t gam = { 0x40b0000000000000 }; // 4096
const float64_t gamsq = { 0x4170000000000000 }; // 4096^2 = 2^24
const float64_t rgam = { 0x3f30000000000000 }; // 1/4096 = 2^-12
const float64_t rgamsq = { 0x3e70000000000000 }; // 2^-24

if (f64_lt(d1, SB_REAL64_ZERO))
{
*P = SB_REAL64_NEGONE;
*D1 = *D2 = *X1 = P[1] = P[2] = P[3] = P[4] = SB_REAL64_ZERO;
return;
}
float64_t flag;
float64_t d1 = *D1, d2 = *D2, x1 = *X1;
float64_t h11 = ZERO, h21 = ZERO, h12 = ZERO, h22 = ZERO;

p2 = f64_mul(d2, y1);
if (f64_eq(p2, SB_REAL64_ZERO))
{
*P = SB_REAL64_NEGTWO;
return;
}
if (f64_lt(d1, ZERO)) {
flag = NEGONE;
d1 = ZERO; d2 = ZERO; x1 = ZERO;
h11 = ZERO; h21 = ZERO; h12 = ZERO; h22 = ZERO;
} else {
float64_t p2 = f64_mul(d2, y1);
if (f64_eq(p2, ZERO)) {
*P = NEGTWO;
return;
}
float64_t p1 = f64_mul(d1, x1);
float64_t q2 = f64_mul(p2, y1);
float64_t q1 = f64_mul(p1, x1);

p1 = f64_mul(d1, x1);
q2 = f64_mul(p2, y1);
q1 = f64_mul(p1, x1);
if (f64_gt(f64_abs(q1), f64_abs(q2)))
{
h21 = f64_div(f64_mul(SB_REAL64_NEGONE, y1), x1);
h12 = f64_div(p2, p1);
u = f64_sub(SB_REAL64_ONE, f64_mul(h12, h21));
if (f64_gt(u, SB_REAL64_ZERO))
{
flag = SB_REAL64_ZERO;
if (f64_gt(f64_abs(q1), f64_abs(q2))) {
h21 = f64_div(f64_mul(NEGONE, y1), x1);
h12 = f64_div(p2, p1);
float64_t u = f64_sub(ONE, f64_mul(h12, h21));
if (f64_le(u, ZERO)) { // singular -> zero result
flag = NEGONE;
d1 = ZERO; d2 = ZERO; x1 = ZERO;
h11 = ZERO; h21 = ZERO; h12 = ZERO; h22 = ZERO;
goto store;
}
flag = ZERO;
d1 = f64_div(d1, u);
d2 = f64_div(d2, u);
x1 = f64_mul(x1, u);
} else {
if (f64_lt(q2, ZERO)) {
flag = NEGONE;
d1 = ZERO; d2 = ZERO; x1 = ZERO;
h11 = ZERO; h21 = ZERO; h12 = ZERO; h22 = ZERO;
goto store;
}
flag = ONE;
h11 = f64_div(p1, p2);
h22 = f64_div(x1, y1);
float64_t u = f64_add(ONE, f64_mul(h11, h22));
float64_t tmp = f64_div(d2, u);
d2 = f64_div(d1, u);
d1 = tmp;
x1 = f64_mul(y1, u);
}
}
else
{
if (f64_lt(q2, SB_REAL64_ZERO))
{
*P = SB_REAL64_NEGONE;
*D1 = *D2 = *X1 = P[1] = P[2] = P[3] = P[4] = SB_REAL64_ZERO;
return;
}
flag = SB_REAL64_ONE;
h11 = f64_div(p1, p2);
h22 = f64_div(x1, y1);
u = f64_add(SB_REAL64_ONE, f64_mul(h11, h22));
tmp = f64_div(d2, u);
d2 = f64_div(d1, u);
d1 = tmp;
x1 = f64_mul(y1, u);

}

if (f64_le(d1, rgamsq))
{
if (f64_ne(d1, SB_REAL64_ZERO))
{
if (f64_eq(flag, SB_REAL64_ZERO)) { flag = SB_REAL64_NEGONE; h11 = h22 = SB_REAL64_ONE; }
else { flag = h21 = SB_REAL64_NEGONE; h12 = SB_REAL64_ONE; }
while(1)
{
d1 = f64_mul(d1, gamsq);
x1 = f64_mul(x1, rgam);
h11 = f64_mul(h11, rgam);
h12 = f64_mul(h12, rgam);
if (f64_gt(d1, rgamsq))
break;
h21 = SB_REAL64_NEGONE;
h12 = SB_REAL64_ONE;
}
// Rescale D1 into [1/gamsq, gamsq].
while (f64_ne(d1, ZERO) && f64_le(d1, rgamsq)) {
if (f64_eq(flag, ZERO)) { flag = NEGONE; h11 = ONE; h22 = ONE; }
else { flag = NEGONE; h21 = NEGONE; h12 = ONE; }
d1 = f64_mul(d1, gamsq);
x1 = f64_mul(x1, rgam);
h11 = f64_mul(h11, rgam);
h12 = f64_mul(h12, rgam);
}
}
else if (f64_ge(d1, gamsq))
{
if (f64_eq(flag, SB_REAL64_ZERO)) { flag = SB_REAL64_NEGONE; h11 = h22 = SB_REAL64_ONE; }
else { flag = h21 = SB_REAL64_NEGONE; h12 = SB_REAL64_ONE; }
while (1)
{
d1 = f64_mul(d1, rgamsq);
x1 = f64_mul(x1, gam);
h11 = f64_mul(h11, gam);
h12 = f64_mul(h12, gam);
if (f64_lt(d1, gamsq))
break;
h21 = SB_REAL64_NEGONE;
h12 = SB_REAL64_ONE;
while (f64_ge(d1, gamsq)) {
if (f64_eq(flag, ZERO)) { flag = NEGONE; h11 = ONE; h22 = ONE; }
else { flag = NEGONE; h21 = NEGONE; h12 = ONE; }
d1 = f64_mul(d1, rgamsq);
x1 = f64_mul(x1, gam);
h11 = f64_mul(h11, gam);
h12 = f64_mul(h12, gam);
}
}

tmp = f64_abs(d2);
if (f64_le(tmp, rgamsq))
{
if (!f64_eq(d2, SB_REAL64_ZERO))
{
if (f64_eq(flag, SB_REAL64_ZERO)) { flag = SB_REAL64_NEGONE; h11 = h22 = SB_REAL64_ONE; }
else { flag = h21 = SB_REAL64_NEGONE; h12 = SB_REAL64_ONE; }
if (f64_ge(d2, SB_REAL64_ZERO))
while (1)
{
d2 = f64_mul(d2, gamsq);
h21 = f64_mul(h21, rgam);
h22 = f64_mul(h22, rgam);
if (f64_gt(d2, rgamsq))
break;
h21 = SB_REAL64_NEGONE;
h12 = SB_REAL64_ONE;
}
else /* d2 < SB_REAL64_ZERO */
{
tmp = f64_mul(SB_REAL64_NEGONE, rgamsq);
while (1)
{
d2 = f64_mul(d2, gamsq);
h21 = f64_mul(h21, rgam);
h22 = f64_mul(h22, rgam);
if (f64_lt(d2, tmp))
break;
h21 = SB_REAL64_NEGONE;
h12 = SB_REAL64_ONE;
}
}

// Rescale D2 into [1/gamsq, gamsq].
while (f64_ne(d2, ZERO) && f64_le(f64_abs(d2), rgamsq)) {
if (f64_eq(flag, ZERO)) { flag = NEGONE; h11 = ONE; h22 = ONE; }
else { flag = NEGONE; h21 = NEGONE; h12 = ONE; }
d2 = f64_mul(d2, gamsq);
h21 = f64_mul(h21, rgam);
h22 = f64_mul(h22, rgam);
}
}
else if f64_ge(tmp, gamsq) {
if (f64_eq(flag, SB_REAL64_ZERO)) { flag = SB_REAL64_NEGONE; h11 = h22 = SB_REAL64_ONE; }
else { flag = h21 = SB_REAL64_NEGONE; h12 = SB_REAL64_ONE; }
if (f64_ge(d2, SB_REAL64_ZERO))
while (1)
{
d2 = f64_mul(d2, rgamsq);
while (f64_ge(f64_abs(d2), gamsq)) {
if (f64_eq(flag, ZERO)) { flag = NEGONE; h11 = ONE; h22 = ONE; }
else { flag = NEGONE; h21 = NEGONE; h12 = ONE; }
d2 = f64_mul(d2, rgamsq);
h21 = f64_mul(h21, gam);
h22 = f64_mul(h22, gam);
if (f64_lt(d2, gamsq))
break;
h21 = SB_REAL64_NEGONE;
h12 = SB_REAL64_ONE;
}
else /* d2 < SB_REAL64_ZERO */
{
tmp = f64_mul(SB_REAL64_NEGONE, gamsq);
while (1)
{
d2 = f64_mul(d2, rgamsq);
h21 = f64_mul(h21, gam);
h22 = f64_mul(h22, gam);
if (f64_gt(d2, tmp))
break;
h21 = SB_REAL64_NEGONE;
h12 = SB_REAL64_ONE;
}
}
}
*D1 = d1;
*D2 = d2;
*X1 = x1;
*P = flag;
if (flag == SB_REAL64_NEGONE) { P[1] = h11; P[2] = h21; P[3] = h12; P[4] = h22; }
else if (flag == SB_REAL64_ZERO) { P[2] = h21; P[3] = h12; }
else if (flag == SB_REAL64_ONE) { P[1] = h11; P[4] = h22; }

store:
*D1 = d1;
*D2 = d2;
*X1 = x1;
P[0] = flag;
if (f64_eq(flag, NEGONE)) { P[1] = h11; P[2] = h21; P[3] = h12; P[4] = h22; }
else if (f64_eq(flag, ZERO)) { P[2] = h21; P[3] = h12; }
else { P[1] = h11; P[4] = h22; } // flag == 1
}
Loading
Loading