Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,15 @@ BLAS_SRCS = \
$(BLAS_SRC_DIR_L1)/cdotc.c \
$(BLAS_SRC_DIR_L1)/scnrm2.c \
$(BLAS_SRC_DIR_L1)/csrot.c \
$(BLAS_SRC_DIR_L1)/srot.c \
$(BLAS_SRC_DIR_L1)/drot.c \
$(BLAS_SRC_DIR_L1)/hrot.c \
$(BLAS_SRC_DIR_L1)/srotg.c \
$(BLAS_SRC_DIR_L1)/drotg.c \
$(BLAS_SRC_DIR_L1)/hrotg.c \
$(BLAS_SRC_DIR_L1)/srotm.c \
$(BLAS_SRC_DIR_L1)/drotm.c \
$(BLAS_SRC_DIR_L1)/hrotm.c \
$(BLAS_SRC_DIR_L2)/sgemv.c \
$(BLAS_SRC_DIR_L2)/dgemv.c \
$(BLAS_SRC_DIR_L2)/hgemv.c \
Expand Down
33 changes: 12 additions & 21 deletions src/blas/level1/drot.c
Original file line number Diff line number Diff line change
@@ -1,23 +1,14 @@
void drot(const uint64_t N, float64_t *X, const uint64_t incX, float64_t *Y, const uint64_t incY, const float64_t c, const float64_t s, const uint_fast8_t rndMode) {
_set_rounding(rndMode);
float64_t tmp;
#include "softblas.h"

if (c != SB_REAL64_ONE || s != SB_REAL64_ZERO) {
if (incX == 1 && incY == 1) {
for (uint64_t i=0; i != N; i++) {
tmp = f64_add(f64_mul(c, X[i]), f64_mul(s, Y[i]));
Y[i] = f64_sub(f64_mul(c, Y[i]), f64_mul(s, X[i]));
X[i] = tmp;
}
}
else
{
for (uint64_t i=N; i; i--, Y += incY, X += incX)
{
tmp = f64_add(f64_mul(c, *X), f64_mul(s, *Y));
*Y = f64_sub(f64_mul(c, *Y), f64_mul(s, *X));
*X = tmp;
}
}
// Apply a plane rotation: [x_i; y_i] <- [c s; -s c] [x_i; y_i].
void drot(const uint64_t N, float64_t *X, const uint64_t incX, float64_t *Y, const uint64_t incY, const float64_t c, const float64_t s, const uint_fast8_t rndMode) {
_set_rounding(rndMode);
uint64_t iX = 0, iY = 0;
for (uint64_t i = 0; i < N; i++) {
float64_t tmp = f64_add(f64_mul(c, X[iX]), f64_mul(s, Y[iY]));
Y[iY] = nan_unify_d(f64_sub(f64_mul(c, Y[iY]), f64_mul(s, X[iX])));
X[iX] = nan_unify_d(tmp);
iX += incX;
iY += incY;
}
}
}
53 changes: 29 additions & 24 deletions src/blas/level1/drotg.c
Original file line number Diff line number Diff line change
@@ -1,30 +1,35 @@
#include "softblas.h"

// Construct a Givens rotation that zeroes b: returns c, s, and overwrites
// a with r = +/-sqrt(a^2+b^2) and b with the reconstruction scalar z.
// (The SoftFloat comparison/abs macros do not parenthesize their argument,
// so every operand here is first loaded into a plain local.)
void drotg(float64_t *a, float64_t *b, float64_t *c, float64_t *s, const uint_fast8_t rndMode) {
_set_rounding(rndMode);
float64_t roe, scal, r, z, aa, ab, t0, t1;
const float64_t ZERO = { SB_REAL64_ZERO };
const float64_t ONE = { SB_REAL64_ONE };
const float64_t NEG = { SB_REAL64_NEGONE };

aa = f64_abs(*a);
ab = f64_abs(*b);
if f64_gt(aa, ab) roe = *a;
else roe = *b;
scal = f64_add(aa, ab);
if f64_ne(scal, SB_REAL64_ZERO)
{
t0 = f64_div(aa, scal); t1 = f64_div(ab, scal);
r = f64_mul(scal, f64_sqrt(f64_add(f64_mul(t0, t0), f64_mul(t1, t1))));
if f64_le(roe, SB_REAL64_ZERO) r = f64_neg(r);
*c = f64_div(*a, r);
*s = f64_div(*b, r);
if f64_gt(aa, ab) z = *s;
else if f64_ne(*c, SB_REAL64_ZERO) z = f64_div(SB_REAL64_ONE, *c);
else z = SB_REAL64_ONE;
*a = r;
*b = z;
}
else
{
*c = SB_REAL64_ONE;
*s = *a = *b = SB_REAL64_ZERO;
}
float64_t A = *a, B = *b;
float64_t absA = f64_abs(A), absB = f64_abs(B);
float64_t roe = f64_gt(absA, absB) ? A : B;
float64_t scale = f64_add(absA, absB);
float64_t r, z;

if (f64_eq(scale, ZERO)) {
*c = ONE; *s = ZERO; r = ZERO; z = ZERO;
} else {
float64_t as = f64_div(A, scale);
float64_t bs = f64_div(B, scale);
r = f64_mul(scale, f64_sqrt(f64_add(f64_mul(as, as), f64_mul(bs, bs))));
r = f64_mul(f64_lt(roe, ZERO) ? NEG : ONE, r); // sign(roe) * |r|
*c = f64_div(A, r);
*s = f64_div(B, r);
float64_t C = *c, S = *s;
z = ONE;
if (f64_gt(absA, absB)) z = S;
if (f64_ge(absB, absA) && f64_ne(C, ZERO)) z = f64_div(ONE, C);
}
*a = r;
*b = z;
}
110 changes: 26 additions & 84 deletions src/blas/level1/drotm.c
Original file line number Diff line number Diff line change
@@ -1,90 +1,32 @@
#include "softblas.h"

// Apply a modified Givens rotation H (selected by the flag P[0]) to the
// pair of vectors X, Y. H is built from P[1..4] as in the reference SROTM.
void drotm(const uint64_t N, float64_t *X, const uint64_t incX, float64_t *Y, const uint64_t incY, const float64_t *P, const uint_fast8_t rndMode) {
_set_rounding(rndMode);
uint64_t i;
const float64_t flag = *P;
float64_t h11, h21, h12, h22, w, z;
const float64_t flag = P[0];
const float64_t NEGTWO = { SB_REAL64_NEGTWO };
const float64_t NEGONE = { SB_REAL64_NEGONE };
const float64_t ZERO = { SB_REAL64_ZERO };
const float64_t ONE = { SB_REAL64_ONE };

if f64_eq(flag, SB_REAL64_NEGTWO) return;
if (f64_eq(flag, SB_REAL64_NEGONE))
{
h11 = P[1]; h21 = P[2]; h12 = P[3]; h22 = P[4];
if (incX == 1 && incY == 1)
{
for (i = N; i; i--)
{
w = *X;
z = *Y;
*X++ = f64_add(f64_mul(w, h11), f64_mul(z, h12));
*Y++ = f64_add(f64_mul(w, h21), f64_mul(z, h22));
}
}
else
{
for (i = N; i; i--)
{
w = *X;
z = *Y;
*X = f64_add(f64_mul(w, h11), f64_mul(z, h12));
*Y = f64_add(f64_mul(w, h21), f64_mul(z, h22));
X += incX;
Y += incY;
}
}
}
else if (f64_eq(flag, SB_REAL64_ZERO))
{
h21 = P[2];
h12 = P[3];
if (incX == 1 && incY == 1)
{
for (i = N; i; i--)
{
w = *X;
z = *Y;
*X++ = f64_add(w, f64_mul(z, h12));
*Y++ = f64_add(f64_mul(w, h21), z);
}
}
else
{
for (i = N; i; i--)
{
w = *X;
z = *Y;
*X = f64_add(w, f64_mul(z, h12));
*Y = f64_add(f64_mul(w, h21), z);
X += incX;
Y += incY;
}
}
}
else if (f64_eq(flag, SB_REAL64_ONE))
{
h11 = P[1];
h22 = P[4];
if (incX == 1 && incY == 1)
{
for (i = N; i; i--)
{
w = *X;
z = *Y;
*X++ = f64_add(f64_mul(w, h11), z);
*Y++ = f64_sub(f64_mul(z, h22), w);
}
}
else
{
for (i = N; i; i--)
{
w = *X;
z = *Y;
*X = f64_add(f64_mul(w, h11), z);
*Y = f64_sub(f64_mul(z, h22), w);
X += incX;
Y += incY;
}
}
}
if (f64_eq(flag, NEGTWO)) return;

float64_t h11, h21, h12, h22;
if (f64_eq(flag, NEGONE)) {
h11 = P[1]; h21 = P[2]; h12 = P[3]; h22 = P[4];
} else if (f64_eq(flag, ZERO)) {
h11 = ONE; h21 = P[2]; h12 = P[3]; h22 = ONE;
} else { /* flag == 1 */
h11 = P[1]; h21 = NEGONE; h12 = ONE; h22 = P[4];
}

uint64_t iX = 0, iY = 0;
for (uint64_t i = 0; i < N; i++) {
float64_t w = X[iX], z = Y[iY];
X[iX] = nan_unify_d(f64_add(f64_mul(w, h11), f64_mul(z, h12)));
Y[iY] = nan_unify_d(f64_add(f64_mul(w, h21), f64_mul(z, h22)));
iX += incX;
iY += incY;
}
}
31 changes: 11 additions & 20 deletions src/blas/level1/hrot.c
Original file line number Diff line number Diff line change
@@ -1,23 +1,14 @@
void hrot(const uint16_t N, float16_t *X, const uint16_t incX, float16_t *Y, const uint16_t incY, const float16_t c, const float16_t s) {
_set_rounding(rndMode);
float16_t tmp;
#include "softblas.h"

if (c != SB_REAL32_ONE || s != SB_REAL32_ZERO) {
if (incX == 1 && incY == 1) {
for (uint64_t i=0; i != N; i++) {
tmp = f16_add(f16_mul(c, X[i]), f16_mul(s, Y[i]));
Y[i] = f16_sub(f16_mul(c, Y[i]), f16_mul(s, X[i]));
X[i] = tmp;
}
}
else
{
for (uint64_t i=N; i; i--, Y += incY, X += incX)
{
tmp = f16_add(f16_mul(c, *X), f16_mul(s, *Y));
*Y = f16_sub(f16_mul(c, *Y), f16_mul(s, *X));
*X = tmp;
}
}
// Apply a plane rotation: [x_i; y_i] <- [c s; -s c] [x_i; y_i].
void hrot(const uint64_t N, float16_t *X, const uint64_t incX, float16_t *Y, const uint64_t incY, const float16_t c, const float16_t s, const uint_fast8_t rndMode) {
_set_rounding(rndMode);
uint64_t iX = 0, iY = 0;
for (uint64_t i = 0; i < N; i++) {
float16_t tmp = f16_add(f16_mul(c, X[iX]), f16_mul(s, Y[iY]));
Y[iY] = nan_unify_h(f16_sub(f16_mul(c, Y[iY]), f16_mul(s, X[iX])));
X[iX] = nan_unify_h(tmp);
iX += incX;
iY += incY;
}
}
53 changes: 29 additions & 24 deletions src/blas/level1/hrotg.c
Original file line number Diff line number Diff line change
@@ -1,30 +1,35 @@
#include "softblas.h"

// Construct a Givens rotation that zeroes b: returns c, s, and overwrites
// a with r = +/-sqrt(a^2+b^2) and b with the reconstruction scalar z.
// (The SoftFloat comparison/abs macros do not parenthesize their argument,
// so every operand here is first loaded into a plain local.)
void hrotg(float16_t *a, float16_t *b, float16_t *c, float16_t *s, const uint_fast8_t rndMode) {
_set_rounding(rndMode);
float16_t roe, scal, r, z, aa, ab, t0, t1;
const float16_t ZERO = { SB_REAL16_ZERO };
const float16_t ONE = { SB_REAL16_ONE };
const float16_t NEG = { SB_REAL16_NEGONE };

aa = f16_abs(*a);
ab = f16_abs(*b);
if f16_gt(aa, ab) roe = *a;
else roe = *b;
scal = f16_add(aa, ab);
if f16_ne(scal, SB_REAL16_ZERO)
{
t0 = f16_div(aa, scal); t1 = f16_div(ab, scal);
r = f16_mul(scal, f16_sqrt(f16_add(f16_mul(t0, t0), f16_mul(t1, t1))));
if f16_le(roe, SB_REAL16_ZERO) r = f16_neg(r);
*c = f16_div(*a, r);
*s = f16_div(*b, r);
if f16_gt(aa, ab) z = *s;
else if f16_ne(*c, SB_REAL16_ZERO) z = f16_div(SB_REAL16_ONE, *c);
else z = SB_REAL16_ONE;
*a = r;
*b = z;
}
else
{
*c = SB_REAL16_ONE;
*s = *a = *b = SB_REAL16_ZERO;
}
float16_t A = *a, B = *b;
float16_t absA = f16_abs(A), absB = f16_abs(B);
float16_t roe = f16_gt(absA, absB) ? A : B;
float16_t scale = f16_add(absA, absB);
float16_t r, z;

if (f16_eq(scale, ZERO)) {
*c = ONE; *s = ZERO; r = ZERO; z = ZERO;
} else {
float16_t as = f16_div(A, scale);
float16_t bs = f16_div(B, scale);
r = f16_mul(scale, f16_sqrt(f16_add(f16_mul(as, as), f16_mul(bs, bs))));
r = f16_mul(f16_lt(roe, ZERO) ? NEG : ONE, r); // sign(roe) * |r|
*c = f16_div(A, r);
*s = f16_div(B, r);
float16_t C = *c, S = *s;
z = ONE;
if (f16_gt(absA, absB)) z = S;
if (f16_ge(absB, absA) && f16_ne(C, ZERO)) z = f16_div(ONE, C);
}
*a = r;
*b = z;
}
Loading
Loading