From 21a09e3bb9a1131b8eb5d86092177bb1487a0907 Mon Sep 17 00:00:00 2001
From: Sigilante <neal.davis@protonmail.com>
Date: Sat, 30 May 2026 12:21:37 -0500
Subject: [PATCH] Finish rot/rotg/rotm to the BLAS reference (C8, part 2)

The rotation routines existed but did not compile or build: srot used a
struct-vs-int comparison and a uint32_t signature; srotg was WIP with the
wrong sign factor (|roe| instead of sign(roe)) and assigned to local
pointer copies; srotm had a parenless `if f32_eq(...)`. All were absent
from the Makefile.

Rewrite srot/srotg/srotm faithfully to the Netlib SROT/SROTG/SROTM
reference using SoftFloat ops (loading operands into locals first, since
the abs/compare macros don't parenthesize their argument), with NaN
canonicalization on the outputs. Generate the float64 and float16
variants and add all nine to the Makefile.

test_rot.c covers srot (c=0,s=1 -> (y,-x)), srotg (a=1,b=0; and 3,4 -> r=5),
and srotm (flag=0 modified Givens). 155/155 tests pass.

(rotmg and sdsdot/hsdot remain; rotmg's rescaling constant gamsq=2^24
overflows float16 and needs handling.)

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 Makefile                     |   9 +++
 src/blas/level1/drot.c       |  33 ++++-------
 src/blas/level1/drotg.c      |  53 +++++++++--------
 src/blas/level1/drotm.c      | 110 ++++++++--------------------------
 src/blas/level1/hrot.c       |  31 ++++------
 src/blas/level1/hrotg.c      |  53 +++++++++--------
 src/blas/level1/hrotm.c      | 112 +++++++++--------------------------
 src/blas/level1/srot.c       |  29 +++------
 src/blas/level1/srotg.c      |  68 +++++++++------------
 src/blas/level1/srotm.c      | 112 +++++++++--------------------------
 tests/blas/include/test.h    |   5 ++
 tests/blas/level1/test_rot.c |  43 ++++++++++++++
 tests/test_all.c             |   5 ++
 13 files changed, 261 insertions(+), 402 deletions(-)
 create mode 100644 tests/blas/level1/test_rot.c

diff --git a/Makefile b/Makefile
index cfc5208..3eb514d 100644
--- a/Makefile
+++ b/Makefile
@@ -48,6 +48,15 @@ BLAS_SRCS = \
   $(BLAS_SRC_DIR_L1)/cdotc.c \
   $(BLAS_SRC_DIR_L1)/scnrm2.c \
   $(BLAS_SRC_DIR_L1)/csrot.c \
+  $(BLAS_SRC_DIR_L1)/srot.c \
+  $(BLAS_SRC_DIR_L1)/drot.c \
+  $(BLAS_SRC_DIR_L1)/hrot.c \
+  $(BLAS_SRC_DIR_L1)/srotg.c \
+  $(BLAS_SRC_DIR_L1)/drotg.c \
+  $(BLAS_SRC_DIR_L1)/hrotg.c \
+  $(BLAS_SRC_DIR_L1)/srotm.c \
+  $(BLAS_SRC_DIR_L1)/drotm.c \
+  $(BLAS_SRC_DIR_L1)/hrotm.c \
   $(BLAS_SRC_DIR_L2)/sgemv.c \
   $(BLAS_SRC_DIR_L2)/dgemv.c \
   $(BLAS_SRC_DIR_L2)/hgemv.c \
diff --git a/src/blas/level1/drot.c b/src/blas/level1/drot.c
index a12dc3b..82cb686 100644
--- a/src/blas/level1/drot.c
+++ b/src/blas/level1/drot.c
@@ -1,23 +1,14 @@
-void drot(const uint64_t N, float64_t *X, const uint64_t  incX, float64_t *Y, const uint64_t incY, const float64_t c, const float64_t s, const uint_fast8_t rndMode) {
-    _set_rounding(rndMode);
-    float64_t tmp;
+#include "softblas.h"
 
-    if (c != SB_REAL64_ONE || s != SB_REAL64_ZERO) {
-        if (incX == 1 && incY == 1) {
-            for (uint64_t i=0; i != N; i++) {
-                tmp = f64_add(f64_mul(c, X[i]), f64_mul(s, Y[i]));
-                Y[i] = f64_sub(f64_mul(c, Y[i]), f64_mul(s, X[i]));
-                X[i] = tmp;
-            }
-        }
-        else
-        {
-            for (uint64_t i=N; i; i--, Y += incY, X += incX)
-            {
-                tmp = f64_add(f64_mul(c, *X), f64_mul(s, *Y));
-                *Y = f64_sub(f64_mul(c, *Y), f64_mul(s, *X));
-                *X = tmp;
-            }
-        }
+//  Apply a plane rotation: [x_i; y_i] <- [c s; -s c] [x_i; y_i].
+void drot(const uint64_t N, float64_t *X, const uint64_t incX, float64_t *Y, const uint64_t incY, const float64_t c, const float64_t s, const uint_fast8_t rndMode) {
+    _set_rounding(rndMode);
+    uint64_t iX = 0, iY = 0;
+    for (uint64_t i = 0; i < N; i++) {
+        float64_t tmp  = f64_add(f64_mul(c, X[iX]), f64_mul(s, Y[iY]));
+        Y[iY] = nan_unify_d(f64_sub(f64_mul(c, Y[iY]), f64_mul(s, X[iX])));
+        X[iX] = nan_unify_d(tmp);
+        iX += incX;
+        iY += incY;
     }
-}
\ No newline at end of file
+}
diff --git a/src/blas/level1/drotg.c b/src/blas/level1/drotg.c
index 11d7dc7..23f944a 100644
--- a/src/blas/level1/drotg.c
+++ b/src/blas/level1/drotg.c
@@ -1,30 +1,35 @@
 #include "softblas.h"
 
+//  Construct a Givens rotation that zeroes b: returns c, s, and overwrites
+//  a with r = +/-sqrt(a^2+b^2) and b with the reconstruction scalar z.
+//  (The SoftFloat comparison/abs macros do not parenthesize their argument,
+//  so every operand here is first loaded into a plain local.)
 void drotg(float64_t *a, float64_t *b, float64_t *c, float64_t *s, const uint_fast8_t rndMode) {
     _set_rounding(rndMode);
-   float64_t roe, scal, r, z, aa, ab, t0, t1;
+    const float64_t ZERO = { SB_REAL64_ZERO };
+    const float64_t ONE  = { SB_REAL64_ONE };
+    const float64_t NEG  = { SB_REAL64_NEGONE };
 
-   aa = f64_abs(*a);
-   ab = f64_abs(*b);
-   if f64_gt(aa, ab) roe = *a;
-   else roe = *b;
-   scal = f64_add(aa, ab);
-   if f64_ne(scal, SB_REAL64_ZERO)
-   {
-      t0 = f64_div(aa, scal); t1 = f64_div(ab, scal);
-      r = f64_mul(scal, f64_sqrt(f64_add(f64_mul(t0, t0), f64_mul(t1, t1))));
-      if f64_le(roe, SB_REAL64_ZERO) r = f64_neg(r);
-      *c = f64_div(*a, r);
-      *s = f64_div(*b, r);
-      if f64_gt(aa, ab) z = *s;
-      else if f64_ne(*c, SB_REAL64_ZERO) z = f64_div(SB_REAL64_ONE, *c);
-      else z = SB_REAL64_ONE;
-      *a = r;
-      *b = z;
-   }
-   else
-   {
-      *c = SB_REAL64_ONE;
-      *s = *a = *b = SB_REAL64_ZERO;
-   }
+    float64_t A = *a, B = *b;
+    float64_t absA = f64_abs(A), absB = f64_abs(B);
+    float64_t roe   = f64_gt(absA, absB) ? A : B;
+    float64_t scale = f64_add(absA, absB);
+    float64_t r, z;
+
+    if (f64_eq(scale, ZERO)) {
+        *c = ONE; *s = ZERO; r = ZERO; z = ZERO;
+    } else {
+        float64_t as = f64_div(A, scale);
+        float64_t bs = f64_div(B, scale);
+        r = f64_mul(scale, f64_sqrt(f64_add(f64_mul(as, as), f64_mul(bs, bs))));
+        r = f64_mul(f64_lt(roe, ZERO) ? NEG : ONE, r);   // sign(roe) * |r|
+        *c = f64_div(A, r);
+        *s = f64_div(B, r);
+        float64_t C = *c, S = *s;
+        z = ONE;
+        if (f64_gt(absA, absB)) z = S;
+        if (f64_ge(absB, absA) && f64_ne(C, ZERO)) z = f64_div(ONE, C);
+    }
+    *a = r;
+    *b = z;
 }
diff --git a/src/blas/level1/drotm.c b/src/blas/level1/drotm.c
index 22a5bc8..500945d 100644
--- a/src/blas/level1/drotm.c
+++ b/src/blas/level1/drotm.c
@@ -1,90 +1,32 @@
 #include "softblas.h"
 
+//  Apply a modified Givens rotation H (selected by the flag P[0]) to the
+//  pair of vectors X, Y.  H is built from P[1..4] as in the reference SROTM.
 void drotm(const uint64_t N, float64_t *X, const uint64_t incX, float64_t *Y, const uint64_t incY, const float64_t *P, const uint_fast8_t rndMode) {
     _set_rounding(rndMode);
-   uint64_t i;
-   const float64_t flag = *P;
-   float64_t h11, h21, h12, h22, w, z;
+    const float64_t flag  = P[0];
+    const float64_t NEGTWO = { SB_REAL64_NEGTWO };
+    const float64_t NEGONE = { SB_REAL64_NEGONE };
+    const float64_t ZERO   = { SB_REAL64_ZERO };
+    const float64_t ONE    = { SB_REAL64_ONE };
 
-   if f64_eq(flag, SB_REAL64_NEGTWO) return;
-   if (f64_eq(flag, SB_REAL64_NEGONE))
-   {
-      h11 = P[1]; h21 = P[2]; h12 = P[3]; h22 = P[4];
-      if (incX == 1 && incY == 1)
-      {
-         for (i = N; i; i--)
-         {
-            w = *X;
-            z = *Y;
-            *X++ = f64_add(f64_mul(w, h11), f64_mul(z, h12));
-            *Y++ = f64_add(f64_mul(w, h21), f64_mul(z, h22));
-         }
-      }
-      else
-      {
-         for (i = N; i; i--)
-         {
-            w = *X;
-            z = *Y;
-            *X = f64_add(f64_mul(w, h11), f64_mul(z, h12));
-            *Y = f64_add(f64_mul(w, h21), f64_mul(z, h22));
-            X += incX;
-            Y += incY;
-         }
-      }
-   }
-   else if (f64_eq(flag, SB_REAL64_ZERO))
-   {
-      h21 = P[2];
-      h12 = P[3];
-      if (incX == 1 && incY == 1)
-      {
-         for (i = N; i; i--)
-         {
-            w = *X;
-            z = *Y;
-            *X++ = f64_add(w, f64_mul(z, h12));
-            *Y++ = f64_add(f64_mul(w, h21), z);
-         }
-      }
-      else
-      {
-         for (i = N; i; i--)
-         {
-            w = *X;
-            z = *Y;
-            *X = f64_add(w, f64_mul(z, h12));
-            *Y = f64_add(f64_mul(w, h21), z);
-            X += incX;
-            Y += incY;
-         }
-      }
-   }
-   else if (f64_eq(flag, SB_REAL64_ONE))
-   {
-      h11 = P[1];
-      h22 = P[4];
-      if (incX == 1 && incY == 1)
-      {
-         for (i = N; i; i--)
-         {
-            w = *X;
-            z = *Y;
-            *X++ = f64_add(f64_mul(w, h11), z);
-            *Y++ = f64_sub(f64_mul(z, h22), w);
-         }
-      }
-      else
-      {
-         for (i = N; i; i--)
-         {
-            w = *X;
-            z = *Y;
-            *X = f64_add(f64_mul(w, h11), z);
-            *Y = f64_sub(f64_mul(z, h22), w);
-            X += incX;
-            Y += incY;
-         }
-      }
-   }
+    if (f64_eq(flag, NEGTWO)) return;
+
+    float64_t h11, h21, h12, h22;
+    if (f64_eq(flag, NEGONE)) {
+        h11 = P[1]; h21 = P[2]; h12 = P[3]; h22 = P[4];
+    } else if (f64_eq(flag, ZERO)) {
+        h11 = ONE;  h21 = P[2]; h12 = P[3]; h22 = ONE;
+    } else { /* flag == 1 */
+        h11 = P[1]; h21 = NEGONE; h12 = ONE; h22 = P[4];
+    }
+
+    uint64_t iX = 0, iY = 0;
+    for (uint64_t i = 0; i < N; i++) {
+        float64_t w = X[iX], z = Y[iY];
+        X[iX] = nan_unify_d(f64_add(f64_mul(w, h11), f64_mul(z, h12)));
+        Y[iY] = nan_unify_d(f64_add(f64_mul(w, h21), f64_mul(z, h22)));
+        iX += incX;
+        iY += incY;
+    }
 }
diff --git a/src/blas/level1/hrot.c b/src/blas/level1/hrot.c
index 19456c5..d0bf9f2 100644
--- a/src/blas/level1/hrot.c
+++ b/src/blas/level1/hrot.c
@@ -1,23 +1,14 @@
-void hrot(const uint16_t N, float16_t *X, const uint16_t  incX, float16_t *Y, const uint16_t incY, const float16_t c, const float16_t s) {
-    _set_rounding(rndMode);
-    float16_t tmp;
+#include "softblas.h"
 
-    if (c != SB_REAL32_ONE || s != SB_REAL32_ZERO) {
-        if (incX == 1 && incY == 1) {
-            for (uint64_t i=0; i != N; i++) {
-                tmp = f16_add(f16_mul(c, X[i]), f16_mul(s, Y[i]));
-                Y[i] = f16_sub(f16_mul(c, Y[i]), f16_mul(s, X[i]));
-                X[i] = tmp;
-            }
-        }
-        else
-        {
-            for (uint64_t i=N; i; i--, Y += incY, X += incX)
-            {
-                tmp = f16_add(f16_mul(c, *X), f16_mul(s, *Y));
-                *Y = f16_sub(f16_mul(c, *Y), f16_mul(s, *X));
-                *X = tmp;
-            }
-        }
+//  Apply a plane rotation: [x_i; y_i] <- [c s; -s c] [x_i; y_i].
+void hrot(const uint64_t N, float16_t *X, const uint64_t incX, float16_t *Y, const uint64_t incY, const float16_t c, const float16_t s, const uint_fast8_t rndMode) {
+    _set_rounding(rndMode);
+    uint64_t iX = 0, iY = 0;
+    for (uint64_t i = 0; i < N; i++) {
+        float16_t tmp  = f16_add(f16_mul(c, X[iX]), f16_mul(s, Y[iY]));
+        Y[iY] = nan_unify_h(f16_sub(f16_mul(c, Y[iY]), f16_mul(s, X[iX])));
+        X[iX] = nan_unify_h(tmp);
+        iX += incX;
+        iY += incY;
     }
 }
diff --git a/src/blas/level1/hrotg.c b/src/blas/level1/hrotg.c
index ffe1146..be711c5 100644
--- a/src/blas/level1/hrotg.c
+++ b/src/blas/level1/hrotg.c
@@ -1,30 +1,35 @@
 #include "softblas.h"
 
+//  Construct a Givens rotation that zeroes b: returns c, s, and overwrites
+//  a with r = +/-sqrt(a^2+b^2) and b with the reconstruction scalar z.
+//  (The SoftFloat comparison/abs macros do not parenthesize their argument,
+//  so every operand here is first loaded into a plain local.)
 void hrotg(float16_t *a, float16_t *b, float16_t *c, float16_t *s, const uint_fast8_t rndMode) {
     _set_rounding(rndMode);
-   float16_t roe, scal, r, z, aa, ab, t0, t1;
+    const float16_t ZERO = { SB_REAL16_ZERO };
+    const float16_t ONE  = { SB_REAL16_ONE };
+    const float16_t NEG  = { SB_REAL16_NEGONE };
 
-   aa = f16_abs(*a);
-   ab = f16_abs(*b);
-   if f16_gt(aa, ab) roe = *a;
-   else roe = *b;
-   scal = f16_add(aa, ab);
-   if f16_ne(scal, SB_REAL16_ZERO)
-   {
-      t0 = f16_div(aa, scal); t1 = f16_div(ab, scal);
-      r = f16_mul(scal, f16_sqrt(f16_add(f16_mul(t0, t0), f16_mul(t1, t1))));
-      if f16_le(roe, SB_REAL16_ZERO) r = f16_neg(r);
-      *c = f16_div(*a, r);
-      *s = f16_div(*b, r);
-      if f16_gt(aa, ab) z = *s;
-      else if f16_ne(*c, SB_REAL16_ZERO) z = f16_div(SB_REAL16_ONE, *c);
-      else z = SB_REAL16_ONE;
-      *a = r;
-      *b = z;
-   }
-   else
-   {
-      *c = SB_REAL16_ONE;
-      *s = *a = *b = SB_REAL16_ZERO;
-   }
+    float16_t A = *a, B = *b;
+    float16_t absA = f16_abs(A), absB = f16_abs(B);
+    float16_t roe   = f16_gt(absA, absB) ? A : B;
+    float16_t scale = f16_add(absA, absB);
+    float16_t r, z;
+
+    if (f16_eq(scale, ZERO)) {
+        *c = ONE; *s = ZERO; r = ZERO; z = ZERO;
+    } else {
+        float16_t as = f16_div(A, scale);
+        float16_t bs = f16_div(B, scale);
+        r = f16_mul(scale, f16_sqrt(f16_add(f16_mul(as, as), f16_mul(bs, bs))));
+        r = f16_mul(f16_lt(roe, ZERO) ? NEG : ONE, r);   // sign(roe) * |r|
+        *c = f16_div(A, r);
+        *s = f16_div(B, r);
+        float16_t C = *c, S = *s;
+        z = ONE;
+        if (f16_gt(absA, absB)) z = S;
+        if (f16_ge(absB, absA) && f16_ne(C, ZERO)) z = f16_div(ONE, C);
+    }
+    *a = r;
+    *b = z;
 }
diff --git a/src/blas/level1/hrotm.c b/src/blas/level1/hrotm.c
index e223f5a..ebfdba9 100644
--- a/src/blas/level1/hrotm.c
+++ b/src/blas/level1/hrotm.c
@@ -1,90 +1,32 @@
 #include "softblas.h"
 
-void hrotm(const uint16_t N, float16_t *X, const uint16_t incX, float16_t *Y, const uint16_t incY, const float16_t *P, const uint_fast8_t rndMode) {
+//  Apply a modified Givens rotation H (selected by the flag P[0]) to the
+//  pair of vectors X, Y.  H is built from P[1..4] as in the reference SROTM.
+void hrotm(const uint64_t N, float16_t *X, const uint64_t incX, float16_t *Y, const uint64_t incY, const float16_t *P, const uint_fast8_t rndMode) {
     _set_rounding(rndMode);
-   uint16_t i;
-   const float16_t flag = *P;
-   float16_t h11, h21, h12, h22, w, z;
+    const float16_t flag  = P[0];
+    const float16_t NEGTWO = { SB_REAL16_NEGTWO };
+    const float16_t NEGONE = { SB_REAL16_NEGONE };
+    const float16_t ZERO   = { SB_REAL16_ZERO };
+    const float16_t ONE    = { SB_REAL16_ONE };
 
-   if (f16_eq(flag, SB_REAL16_NEGTWO)) return;
-   if (f16_eq(flag, SB_REAL16_NEGONE))
-   {
-      h11 = P[1]; h21 = P[2]; h12 = P[3]; h22 = P[4];
-      if (incX == 1 && incY == 1)
-      {
-         for (i = N; i; i--)
-         {
-            w = *X;
-            z = *Y;
-            *X++ = f16_add(f16_mul(w, h11), f16_mul(z, h12));
-            *Y++ = f16_add(f16_mul(w, h21), f16_mul(z, h22));
-         }
-      }
-      else
-      {
-         for (i = N; i; i--)
-         {
-            w = *X;
-            z = *Y;
-            *X = f16_add(f16_mul(w, h11), f16_mul(z, h12));
-            *Y = f16_add(f16_mul(w, h21), f16_mul(z, h22));
-            X += incX;
-            Y += incY;
-         }
-      }
-   }
-   else if (f16_eq(flag, SB_REAL16_ZERO))
-   {
-      h21 = P[2];
-      h12 = P[3];
-      if (incX == 1 && incY == 1)
-      {
-         for (i = N; i; i--)
-         {
-            w = *X;
-            z = *Y;
-            *X++ = f16_add(w, f16_mul(z, h12));
-            *Y++ = f16_add(f16_mul(w, h21), z);
-         }
-      }
-      else
-      {
-         for (i = N; i; i--)
-         {
-            w = *X;
-            z = *Y;
-            *X = f16_add(w, f16_mul(z, h12));
-            *Y = f16_add(f16_mul(w, h21), z);
-            X += incX;
-            Y += incY;
-         }
-      }
-   }
-   else if (f16_eq(flag, SB_REAL16_ONE))
-   {
-      h11 = P[1];
-      h22 = P[4];
-      if (incX == 1 && incY == 1)
-      {
-         for (i = N; i; i--)
-         {
-            w = *X;
-            z = *Y;
-            *X++ = f16_add(f16_mul(w, h11), z);
-            *Y++ = f16_sub(f16_mul(z, h22), w);
-         }
-      }
-      else
-      {
-         for (i = N; i; i--)
-         {
-            w = *X;
-            z = *Y;
-            *X = f16_add(f16_mul(w, h11), z);
-            *Y = f16_sub(f16_mul(z, h22), w);
-            X += incX;
-            Y += incY;
-         }
-      }
-   }
+    if (f16_eq(flag, NEGTWO)) return;
+
+    float16_t h11, h21, h12, h22;
+    if (f16_eq(flag, NEGONE)) {
+        h11 = P[1]; h21 = P[2]; h12 = P[3]; h22 = P[4];
+    } else if (f16_eq(flag, ZERO)) {
+        h11 = ONE;  h21 = P[2]; h12 = P[3]; h22 = ONE;
+    } else { /* flag == 1 */
+        h11 = P[1]; h21 = NEGONE; h12 = ONE; h22 = P[4];
+    }
+
+    uint64_t iX = 0, iY = 0;
+    for (uint64_t i = 0; i < N; i++) {
+        float16_t w = X[iX], z = Y[iY];
+        X[iX] = nan_unify_h(f16_add(f16_mul(w, h11), f16_mul(z, h12)));
+        Y[iY] = nan_unify_h(f16_add(f16_mul(w, h21), f16_mul(z, h22)));
+        iX += incX;
+        iY += incY;
+    }
 }
diff --git a/src/blas/level1/srot.c b/src/blas/level1/srot.c
index 0c78710..bfdf489 100644
--- a/src/blas/level1/srot.c
+++ b/src/blas/level1/srot.c
@@ -1,25 +1,14 @@
 #include "softblas.h"
 
-void srot(const uint32_t N, float32_t *X, const uint32_t  incX, float32_t *Y, const uint32_t incY, const float32_t c, const float32_t s, const uint_fast8_t rndMode) {
+//  Apply a plane rotation: [x_i; y_i] <- [c s; -s c] [x_i; y_i].
+void srot(const uint64_t N, float32_t *X, const uint64_t incX, float32_t *Y, const uint64_t incY, const float32_t c, const float32_t s, const uint_fast8_t rndMode) {
     _set_rounding(rndMode);
-    float32_t tmp;
-
-    if (c != SB_REAL32_ONE || s != SB_REAL32_ZERO) {
-        if (incX == 1 && incY == 1) {
-            for (uint64_t i=0; i != N; i++) {
-                tmp = f32_add(f32_mul(c, X[i]), f32_mul(s, Y[i]));
-                Y[i] = f32_sub(f32_mul(c, Y[i]), f32_mul(s, X[i]));
-                X[i] = tmp;
-            }
-        }
-        else
-        {
-            for (uint64_t i=N; i; i--, Y += incY, X += incX)
-            {
-                tmp = f32_add(f32_mul(c, *X), f32_mul(s, *Y));
-                *Y = f32_sub(f32_mul(c, *Y), f32_mul(s, *X));
-                *X = tmp;
-            }
-        }
+    uint64_t iX = 0, iY = 0;
+    for (uint64_t i = 0; i < N; i++) {
+        float32_t tmp  = f32_add(f32_mul(c, X[iX]), f32_mul(s, Y[iY]));
+        Y[iY] = nan_unify_s(f32_sub(f32_mul(c, Y[iY]), f32_mul(s, X[iX])));
+        X[iX] = nan_unify_s(tmp);
+        iX += incX;
+        iY += incY;
     }
 }
diff --git a/src/blas/level1/srotg.c b/src/blas/level1/srotg.c
index fa47cad..664963e 100644
--- a/src/blas/level1/srotg.c
+++ b/src/blas/level1/srotg.c
@@ -1,45 +1,35 @@
 #include "softblas.h"
 
-// *** XXX WIP XXX ***
-
-//  Given a vertical matrix containing a and b, computes the values of cos θ and
-//  sin θ that zero the lower value (b).  Returns the value of sin θ in s, the
-//  value of cos θ in c, and the upper value (r) in a.
-void srotg(float32_t *A, float32_t *B, float32_t *C, float32_t *S, const uint_fast8_t rndMode) {
+//  Construct a Givens rotation that zeroes b: returns c, s, and overwrites
+//  a with r = +/-sqrt(a^2+b^2) and b with the reconstruction scalar z.
+//  (The SoftFloat comparison/abs macros do not parenthesize their argument,
+//  so every operand here is first loaded into a plain local.)
+void srotg(float32_t *a, float32_t *b, float32_t *c, float32_t *s, const uint_fast8_t rndMode) {
     _set_rounding(rndMode);
-   const float32_t SZERO = { SB_REAL32_ZERO };
-   const float32_t SONE = { SB_REAL32_ONE };
-   float32_t roe, scale, R, Z;
-
-   if (f32_gt(f32M_abs(A), f32M_abs(B))) {
-      roe = *A;
-   } else {
-      roe = *B;
-   }
-   scale = f32_add(f32M_abs(A), f32M_abs(B));
-
-   if (f32_eq(scale, SZERO)) {
-      C = &((float32_t){ SB_REAL32_ONE });
-      S = &((float32_t){ SB_REAL32_ZERO });
-      R = (float32_t){ SB_REAL32_ZERO };
-      Z = (float32_t){ SB_REAL32_ZERO };
-      A = &R;
-      B = &Z;
-   } else {
-      R = f32_mul(scale, f32_sqrt(f32_add(f32_mul(f32_div(*A, scale), f32_div(*A, scale)), f32_mul(f32_div(*B, scale), f32_div(*B, scale)))));
-      R = f32_mul(f32_abs(roe), R);
+    const float32_t ZERO = { SB_REAL32_ZERO };
+    const float32_t ONE  = { SB_REAL32_ONE };
+    const float32_t NEG  = { SB_REAL32_NEGONE };
 
-      *C = f32_div(*A, R);
-      *S = f32_div(*B, R);
-      Z = SONE;
-      if (f32_gt(f32M_abs(*A), f32M_abs(*B))) {
-         Z = *S;
-      }
-      if (f32_ge(f32M_abs(*B), f32M_abs(*A)) && f32_ne(*C, SZERO)) {
-         Z = f32_div(SONE, *C);
-      }
-   }
+    float32_t A = *a, B = *b;
+    float32_t absA = f32_abs(A), absB = f32_abs(B);
+    float32_t roe   = f32_gt(absA, absB) ? A : B;
+    float32_t scale = f32_add(absA, absB);
+    float32_t r, z;
 
-   *A = R;
-   *B = Z;
+    if (f32_eq(scale, ZERO)) {
+        *c = ONE; *s = ZERO; r = ZERO; z = ZERO;
+    } else {
+        float32_t as = f32_div(A, scale);
+        float32_t bs = f32_div(B, scale);
+        r = f32_mul(scale, f32_sqrt(f32_add(f32_mul(as, as), f32_mul(bs, bs))));
+        r = f32_mul(f32_lt(roe, ZERO) ? NEG : ONE, r);   // sign(roe) * |r|
+        *c = f32_div(A, r);
+        *s = f32_div(B, r);
+        float32_t C = *c, S = *s;
+        z = ONE;
+        if (f32_gt(absA, absB)) z = S;
+        if (f32_ge(absB, absA) && f32_ne(C, ZERO)) z = f32_div(ONE, C);
+    }
+    *a = r;
+    *b = z;
 }
diff --git a/src/blas/level1/srotm.c b/src/blas/level1/srotm.c
index 90f88b1..c5484bd 100644
--- a/src/blas/level1/srotm.c
+++ b/src/blas/level1/srotm.c
@@ -1,90 +1,32 @@
 #include "softblas.h"
 
-void srotm(const uint32_t N, float32_t *X, const uint32_t incX, float32_t *Y, const uint32_t incY, const float32_t *P, const uint_fast8_t rndMode) {
+//  Apply a modified Givens rotation H (selected by the flag P[0]) to the
+//  pair of vectors X, Y.  H is built from P[1..4] as in the reference SROTM.
+void srotm(const uint64_t N, float32_t *X, const uint64_t incX, float32_t *Y, const uint64_t incY, const float32_t *P, const uint_fast8_t rndMode) {
     _set_rounding(rndMode);
-   uint64_t i;
-   const float32_t flag = *P;
-   float32_t h11, h21, h12, h22, w, z;
+    const float32_t flag  = P[0];
+    const float32_t NEGTWO = { SB_REAL32_NEGTWO };
+    const float32_t NEGONE = { SB_REAL32_NEGONE };
+    const float32_t ZERO   = { SB_REAL32_ZERO };
+    const float32_t ONE    = { SB_REAL32_ONE };
 
-   if f32_eq(flag, SB_REAL32_NEGTWO) return;
-   if (f32_eq(flag, SB_REAL32_NEGONE))
-   {
-      h11 = P[1]; h21 = P[2]; h12 = P[3]; h22 = P[4];
-      if (incX == 1 && incY == 1)
-      {
-         for (i = N; i; i--)
-         {
-            w = *X;
-            z = *Y;
-            *X++ = f32_add(f32_mul(w, h11), f32_mul(z, h12));
-            *Y++ = f32_add(f32_mul(w, h21), f32_mul(z, h22));
-         }
-      }
-      else
-      {
-         for (i = N; i; i--)
-         {
-            w = *X;
-            z = *Y;
-            *X = f32_add(f32_mul(w, h11), f32_mul(z, h12));
-            *Y = f32_add(f32_mul(w, h21), f32_mul(z, h22));
-            X += incX;
-            Y += incY;
-         }
-      }
-   }
-   else if (f32_eq(flag, SB_REAL32_ZERO))
-   {
-      h21 = P[2];
-      h12 = P[3];
-      if (incX == 1 && incY == 1)
-      {
-         for (i = N; i; i--)
-         {
-            w = *X;
-            z = *Y;
-            *X++ = f32_add(w, f32_mul(z, h12));
-            *Y++ = f32_add(f32_mul(w, h21), z);
-         }
-      }
-      else
-      {
-         for (i = N; i; i--)
-         {
-            w = *X;
-            z = *Y;
-            *X = f32_add(w, f32_mul(z, h12));
-            *Y = f32_add(f32_mul(w, h21), z);
-            X += incX;
-            Y += incY;
-         }
-      }
-   }
-   else if (f32_eq(flag, SB_REAL32_ONE))
-   {
-      h11 = P[1];
-      h22 = P[4];
-      if (incX == 1 && incY == 1)
-      {
-         for (i = N; i; i--)
-         {
-            w = *X;
-            z = *Y;
-            *X++ = f32_add(f32_mul(w, h11), z);
-            *Y++ = f32_sub(f32_mul(z, h22), w);
-         }
-      }
-      else
-      {
-         for (i = N; i; i--)
-         {
-            w = *X;
-            z = *Y;
-            *X = f32_add(f32_mul(w, h11), z);
-            *Y = f32_sub(f32_mul(z, h22), w);
-            X += incX;
-            Y += incY;
-         }
-      }
-   }
+    if (f32_eq(flag, NEGTWO)) return;
+
+    float32_t h11, h21, h12, h22;
+    if (f32_eq(flag, NEGONE)) {
+        h11 = P[1]; h21 = P[2]; h12 = P[3]; h22 = P[4];
+    } else if (f32_eq(flag, ZERO)) {
+        h11 = ONE;  h21 = P[2]; h12 = P[3]; h22 = ONE;
+    } else { /* flag == 1 */
+        h11 = P[1]; h21 = NEGONE; h12 = ONE; h22 = P[4];
+    }
+
+    uint64_t iX = 0, iY = 0;
+    for (uint64_t i = 0; i < N; i++) {
+        float32_t w = X[iX], z = Y[iY];
+        X[iX] = nan_unify_s(f32_add(f32_mul(w, h11), f32_mul(z, h12)));
+        Y[iY] = nan_unify_s(f32_add(f32_mul(w, h21), f32_mul(z, h22)));
+        iX += incX;
+        iY += incY;
+    }
 }
diff --git a/tests/blas/include/test.h b/tests/blas/include/test.h
index 989be8e..06beba0 100644
--- a/tests/blas/include/test.h
+++ b/tests/blas/include/test.h
@@ -460,5 +460,10 @@ MunitResult test_cdotc_conj(const MunitParameter params[], void* u);
 MunitResult test_scasum_basic(const MunitParameter params[], void* u);
 MunitResult test_scnrm2_basic(const MunitParameter params[], void* u);
 MunitResult test_csrot_basic(const MunitParameter params[], void* u);
+//  Rotation (test_rot.c)
+MunitResult test_srot_basic(const MunitParameter params[], void* u);
+MunitResult test_srotg_basic(const MunitParameter params[], void* u);
+MunitResult test_srotg_345(const MunitParameter params[], void* u);
+MunitResult test_srotm_basic(const MunitParameter params[], void* u);
 
 #endif // TEST_H
diff --git a/tests/blas/level1/test_rot.c b/tests/blas/level1/test_rot.c
new file mode 100644
index 0000000..f052900
--- /dev/null
+++ b/tests/blas/level1/test_rot.c
@@ -0,0 +1,43 @@
+#include "test.h"
+
+//  srot: plane rotation with c=0, s=1 maps (x,y) -> (y, -x).
+MunitResult test_srot_basic(const MunitParameter params[], void* u) {
+    const float32_t c = { 0x00000000 }, s = { 0x3f800000 };  // 0, 1
+    float32_t* X = svec((float[]){1.0f, 3.0f}, 2);
+    float32_t* Y = svec((float[]){2.0f, 4.0f}, 2);
+    srot(2, X, 1, Y, 1, c, s, 'n');
+    assert_ulong(X[0].v, ==, 0x40000000u);  // 2
+    assert_ulong(X[1].v, ==, 0x40800000u);  // 4
+    assert_ulong(Y[0].v, ==, 0xbf800000u);  // -1
+    assert_ulong(Y[1].v, ==, 0xc0400000u);  // -3
+    free(X); free(Y);
+    return MUNIT_OK;
+}
+//  srotg(a=1, b=0): r=1, z=0, c=1, s=0.
+MunitResult test_srotg_basic(const MunitParameter params[], void* u) {
+    float32_t a = { 0x3f800000 }, b = { 0x00000000 }, c, s;
+    srotg(&a, &b, &c, &s, 'n');
+    assert_ulong(a.v, ==, 0x3f800000u);  // r = 1
+    assert_ulong(b.v, ==, 0x00000000u);  // z = 0
+    assert_ulong(c.v, ==, 0x3f800000u);  // c = 1
+    assert_ulong(s.v, ==, 0x00000000u);  // s = 0
+    return MUNIT_OK;
+}
+//  srotg(a=3, b=4): r = 5 (|a|<|b| so roe=b>0 -> positive r).
+MunitResult test_srotg_345(const MunitParameter params[], void* u) {
+    float32_t a = { 0x40400000 }, b = { 0x40800000 }, c, s;  // 3, 4
+    srotg(&a, &b, &c, &s, 'n');
+    assert_ulong(a.v, ==, 0x40a00000u);  // r = 5
+    return MUNIT_OK;
+}
+//  srotm flag=0: H = [[1, h12],[h21, 1]] with h21=h12=1. (2,3) -> (5,5).
+MunitResult test_srotm_basic(const MunitParameter params[], void* u) {
+    float32_t* X = svec((float[]){2.0f}, 1);
+    float32_t* Y = svec((float[]){3.0f}, 1);
+    float32_t P[5] = {{0x00000000},{0},{0x3f800000},{0x3f800000},{0}};  // flag=0,h21=1,h12=1
+    srotm(1, X, 1, Y, 1, P, 'n');
+    assert_ulong(X[0].v, ==, 0x40a00000u);  // 5
+    assert_ulong(Y[0].v, ==, 0x40a00000u);  // 5
+    free(X); free(Y);
+    return MUNIT_OK;
+}
diff --git a/tests/test_all.c b/tests/test_all.c
index 7751049..93ed521 100644
--- a/tests/test_all.c
+++ b/tests/test_all.c
@@ -51,6 +51,11 @@ int main(int argc, char* argv[MUNIT_ARRAY_PARAM(argc + 1)]) {
             {"/test_isamax_stride", test_isamax_stride, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL},
             {"/test_isamax_13542", test_isamax_13542, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL},
             {"/test_isamax_one", test_isamax_one, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL},
+
+            {"/test_srot_basic", test_srot_basic, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL},
+            {"/test_srotg_basic", test_srotg_basic, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL},
+            {"/test_srotg_345", test_srotg_345, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL},
+            {"/test_srotm_basic", test_srotm_basic, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL},
             
             {"/test_dasum_0", test_dasum_0, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL},
             {"/test_dasum_12345", test_dasum_12345, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL},