From c1e372932607b7c6d2f33e6256738a5af7c910f7 Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Mon, 2 Feb 2026 19:13:49 +0800
Subject: [PATCH 01/21] rne/mono: scaffold schoolbook multiplication

---
 skyscraper/bn254-multiplier/benches/bench.rs | 10 +++++++++-
 skyscraper/bn254-multiplier/src/rne/mod.rs   |  1 +
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/skyscraper/bn254-multiplier/benches/bench.rs b/skyscraper/bn254-multiplier/benches/bench.rs
index 7d27d2563..72f252e39 100644
--- a/skyscraper/bn254-multiplier/benches/bench.rs
+++ b/skyscraper/bn254-multiplier/benches/bench.rs
@@ -7,7 +7,7 @@ use {
 
 // #[divan::bench_group]
 mod mul {
-    use super::*;
+    use {super::*, bn254_multiplier::rne};
 
     #[divan::bench]
     fn scalar_mul(bencher: Bencher) {
@@ -31,6 +31,14 @@ mod mul {
             .bench_local_values(|(a, b)| a * b);
     }
 
+    #[divan::bench]
+    fn single_b51(bencher: Bencher) {
+        bencher
+            //.counter(ItemsCount::new(1usize))
+            .with_inputs(|| rng().random())
+            .bench_local_values(|(a, b)| rne::single::simd_mul(a, b));
+    }
+
     #[divan::bench]
     fn simd_mul_51b(bencher: Bencher) {
         bencher
diff --git a/skyscraper/bn254-multiplier/src/rne/mod.rs b/skyscraper/bn254-multiplier/src/rne/mod.rs
index 415090bd7..cee3ce350 100644
--- a/skyscraper/bn254-multiplier/src/rne/mod.rs
+++ b/skyscraper/bn254-multiplier/src/rne/mod.rs
@@ -25,5 +25,6 @@
 pub mod constants;
 pub mod portable_simd;
 pub mod simd_utils;
+pub mod single;
 
 pub use {constants::*, portable_simd::*, simd_utils::*};

From ed4d4d89c8a567d4a1b02a0913a65b1a8007b219 Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Mon, 2 Feb 2026 19:13:57 +0800
Subject: [PATCH 02/21] rne/mono: implement single-input mul function

---
 skyscraper/bn254-multiplier/src/rne/single.rs | 335 ++++++++++++++++++
 1 file changed, 335 insertions(+)
 create mode 100644 skyscraper/bn254-multiplier/src/rne/single.rs

diff --git a/skyscraper/bn254-multiplier/src/rne/single.rs b/skyscraper/bn254-multiplier/src/rne/single.rs
new file mode 100644
index 000000000..1fe990e1d
--- /dev/null
+++ b/skyscraper/bn254-multiplier/src/rne/single.rs
@@ -0,0 +1,335 @@
+//! Portable SIMD Montgomery multiplication and squaring.
+//!
+//! Processes two independent field multiplications in parallel using 2-lane
+//! SIMD.
+
+use {
+    crate::rne::{
+        constants::*,
+        simd_utils::{
+            addv_simd, fma, i2f, make_initial, reduce_ct_simd, smult_noinit_simd,
+            transpose_simd_to_u256, transpose_u256_to_simd, u255_to_u256_shr_1_simd,
+            u256_to_u255_simd,
+        },
+    },
+    core::{
+        ops::BitAnd,
+        simd::{num::SimdFloat, Simd},
+    },
+    seq_macro::seq,
+    std::simd::{
+        num::{SimdInt, SimdUint},
+        simd_swizzle,
+    },
+};
+
+/// Two parallel Montgomery squarings: `(v0², v1²)`.
+/// input must fit in 2^255-1; no runtime checking
+#[inline]
+pub fn simd_sqr(v0_a: [u64; 4], v1_a: [u64; 4]) -> ([u64; 4], [u64; 4]) {
+    let v0_a = u256_to_u255_simd(transpose_u256_to_simd([v0_a, v1_a]));
+
+    let mut t: [Simd<i64, 2>; 10] = [Simd::splat(0); 10];
+
+    for i in 0..5 {
+        let avi: Simd<f64, 2> = i2f(v0_a[i]);
+        for j in (i + 1)..5 {
+            let bvj: Simd<f64, 2> = i2f(v0_a[j]);
+            let p_hi = fma(avi, bvj, Simd::splat(C1));
+            let p_lo = fma(avi, bvj, Simd::splat(C2) - p_hi);
+            t[i + j + 1] += p_hi.to_bits().cast();
+            t[i + j] += p_lo.to_bits().cast();
+        }
+    }
+
+    // Most shifting operations are more expensive addition thus for multiplying by
+    // 2 we use addition.
+    for i in 1..=8 {
+        t[i] += t[i];
+    }
+
+    for i in 0..5 {
+        let avi: Simd<f64, 2> = i2f(v0_a[i]);
+        let p_hi = fma(avi, avi, Simd::splat(C1));
+        let p_lo = fma(avi, avi, Simd::splat(C2) - p_hi);
+        t[i + i + 1] += p_hi.to_bits().cast();
+        t[i + i] += p_lo.to_bits().cast();
+    }
+
+    t[0] += Simd::splat(make_initial(1, 0));
+    t[9] += Simd::splat(make_initial(0, 6));
+    t[1] += Simd::splat(make_initial(2, 1));
+    t[8] += Simd::splat(make_initial(6, 7));
+    t[2] += Simd::splat(make_initial(3, 2));
+    t[7] += Simd::splat(make_initial(7, 8));
+    t[3] += Simd::splat(make_initial(4, 3));
+    t[6] += Simd::splat(make_initial(8, 9));
+    t[4] += Simd::splat(make_initial(10, 4));
+    t[5] += Simd::splat(make_initial(9, 10));
+
+    t[1] += t[0] >> 51;
+    t[2] += t[1] >> 51;
+    t[3] += t[2] >> 51;
+    t[4] += t[3] >> 51;
+
+    let r0 = smult_noinit_simd(t[0].cast().bitand(Simd::splat(MASK51)), RHO_4);
+    let r1 = smult_noinit_simd(t[1].cast().bitand(Simd::splat(MASK51)), RHO_3);
+    let r2 = smult_noinit_simd(t[2].cast().bitand(Simd::splat(MASK51)), RHO_2);
+    let r3 = smult_noinit_simd(t[3].cast().bitand(Simd::splat(MASK51)), RHO_1);
+
+    let s = [
+        r0[0] + r1[0] + r2[0] + r3[0] + t[4],
+        r0[1] + r1[1] + r2[1] + r3[1] + t[5],
+        r0[2] + r1[2] + r2[2] + r3[2] + t[6],
+        r0[3] + r1[3] + r2[3] + r3[3] + t[7],
+        r0[4] + r1[4] + r2[4] + r3[4] + t[8],
+        r0[5] + r1[5] + r2[5] + r3[5] + t[9],
+    ];
+
+    // The upper bits of s will not affect the lower 51 bits of the product and
+    // therefore we only have to bitmask once.
+    let m = (s[0].cast() * Simd::splat(U51_NP0)).bitand(Simd::splat(MASK51));
+    let mp = smult_noinit_simd(m, U51_P);
+
+    let mut addi = addv_simd(s, mp);
+    // Apply carries before dropping the last limb
+    addi[1] += addi[0] >> 51;
+    let addi = [addi[1], addi[2], addi[3], addi[4], addi[5]];
+
+    // 1 bit reduction to go from R^-255 to R^-256. reduce_ct does the preparation
+    // and the final shift is done as part of the conversion back to u256
+    let reduced = reduce_ct_simd(addi);
+    let reduced = redundant_carry(reduced);
+    let u256_result = u255_to_u256_shr_1_simd(reduced);
+    let v = transpose_simd_to_u256(u256_result);
+    (v[0], v[1])
+}
+
+/// Move redundant carries from lower limbs to the higher limbs such that all
+/// limbs except the last one is 51 bits. The most significant limb can be
+/// larger than 51 bits as the input can be bigger 2^255-1.
+#[inline(always)]
+fn redundant_carry<const N: usize, const L: usize>(t: [Simd<i64, L>; N]) -> [Simd<u64, L>; N]
+where
+    std::simd::LaneCount<L>: std::simd::SupportedLaneCount,
+{
+    let mut borrow = Simd::splat(0);
+    let mut res = [Simd::splat(0); N];
+    for i in 0..t.len() - 1 {
+        let tmp = t[i] + borrow;
+        res[i] = (tmp.cast()).bitand(Simd::splat(MASK51));
+        borrow = tmp >> 51;
+    }
+
+    res[N - 1] = (t[N - 1] + borrow).cast();
+    res
+}
+
+/// Convert 4×64-bit to 5×51-bit limb representation.
+/// Input must fit in 255 bits; no runtime checking.
+#[inline(always)]
+pub fn u256_to_u255(limbs: [u64; 4]) -> [u64; 5] {
+    let [l0, l1, l2, l3] = limbs;
+    [
+        (l0) & MASK51,
+        ((l0 >> 51) | (l1 << 13)) & MASK51,
+        ((l1 >> 38) | (l2 << 26)) & MASK51,
+        ((l2 >> 25) | (l3 << 39)) & MASK51,
+        l3 >> 12 & MASK51,
+    ]
+}
+
+pub fn i2f_scalar(a: u64) -> f64 {
+    // This function has no target gating as we want to verify this function with
+    // kani and proptest on a different platform than wasm
+
+    // By adding 2^52 represented as float (0x1p52) -> 0x433 << 52, we align the
+    // 52bit number fully in the mantissa. This can be done with a simple or. Then
+    // to convert a to it's floating point number we subtract this again. This way
+    // we only pay for the conversion of the lower bits and not the full 64 bits.
+    let exponent = 0x433 << 52;
+    let a: f64 = f64::from_bits(a | exponent);
+    let b: f64 = f64::from_bits(exponent);
+    a - b
+}
+
+#[inline(always)]
+pub fn smult_noinit(s: u64, v: [u64; 5]) -> [Simd<i64, 2>; 6] {
+    let mut t = [Simd::splat(0); 6];
+    let s: Simd<f64, 2> = Simd::splat(i2f_scalar(s));
+
+    let v01 = Simd::from_array([v[0] as f64, v[1] as f64]);
+    let p_hi = fma(s, v01, Simd::splat(C1));
+    let p_lo = fma(s, v01, Simd::splat(C2) - p_hi);
+    t[1] += p_hi.to_bits().cast();
+    t[0] += p_lo.to_bits().cast();
+
+    let v23 = Simd::from_array([v[2] as f64, v[3] as f64]);
+    let p_hi = fma(s, v23, Simd::splat(C1));
+    let p_lo_0 = fma(s, v23, Simd::splat(C2) - p_hi);
+    t[2] += p_hi.to_bits().cast();
+    t[3] += p_lo_0.to_bits().cast();
+
+    let p_hi_2 = fma(s, Simd::splat(v[2] as f64), Simd::splat(C1));
+    let p_lo_2 = fma(s, Simd::splat(v[2] as f64), Simd::splat(C2) - p_hi_2);
+    t[3] += p_hi_2.to_bits().cast();
+    t[2] += p_lo_2.to_bits().cast();
+
+    let p_hi_3 = fma(s, Simd::splat(v[3] as f64), Simd::splat(C1));
+    let p_lo_3 = fma(s, Simd::splat(v[3] as f64), Simd::splat(C2) - p_hi_3);
+    t[4] += p_hi_3.to_bits().cast();
+    t[3] += p_lo_3.to_bits().cast();
+
+    let p_hi_4 = fma(s, Simd::splat(v[4] as f64), Simd::splat(C1));
+    let p_lo_4 = fma(s, Simd::splat(v[4] as f64), Simd::splat(C2) - p_hi_4);
+    t[5] += p_hi_4.to_bits().cast();
+    t[4] += p_lo_4.to_bits().cast();
+
+    t
+}
+
+/// Two parallel Montgomery multiplications: `(v0_a*v0_b, v1_a*v1_b)`.
+/// input must fit in 2^255-1; no runtime checking
+#[inline(always)]
+pub fn simd_mul(v0_a: [u64; 4], v0_b: [u64; 4]) -> ([u64; 4], [u64; 4]) {
+    let v0_a = u256_to_u255(v0_a);
+    let v0_b = u256_to_u255(v0_b);
+
+    let mut t: [i64; 10] = [0; 10];
+    t[0] = make_initial(1, 0);
+    t[9] = make_initial(0, 6);
+    t[1] = make_initial(2, 1);
+    t[8] = make_initial(6, 7);
+    t[2] = make_initial(3, 2);
+    t[7] = make_initial(7, 8);
+    t[3] = make_initial(4, 3);
+    t[6] = make_initial(8, 9);
+    t[4] = make_initial(10, 4);
+    t[5] = make_initial(9, 10);
+
+    let mut ts = [Simd::splat(0); 9];
+
+    // Offset multiplication to have less intermediate data
+    seq!(i in 0..5{
+        let ai: Simd<f64, 2> = i2f(Simd::splat(v0_a[i]));
+        let b01: Simd<f64, 2> = i2f(Simd::from_array([v0_b[0], v0_b[1]]));
+        let p_hi = fma(ai, b01, Simd::splat(C1));
+        let p_lo = fma(ai, b01, Simd::splat(C2) - p_hi);
+        ts[i+1] += p_hi.to_bits().cast();
+        ts[i+0] += p_lo.to_bits().cast();
+
+        let b23: Simd<f64, 2> = i2f(Simd::from_array([v0_b[2], v0_b[3]]));
+        let p_hi = fma(ai, b23, Simd::splat(C1));
+        let p_lo = fma(ai, b23, Simd::splat(C2) - p_hi);
+        ts[i+3] += p_hi.to_bits().cast();
+        ts[i+2] += p_lo.to_bits().cast();
+
+        let b4 = Simd::splat(i2f_scalar(v0_b[4]));
+        let p_hi = fma(ai, b4, Simd::splat(C1));
+        let p_lo = fma(ai, b4, Simd::splat(C2) - p_hi);
+        t[i + 5] = t[i + 5].wrapping_add(p_hi[0].to_bits() as i64);
+        t[i + 4] = t[i + 4].wrapping_add(p_lo[0].to_bits() as i64);
+
+    });
+
+    for s in (1..9).step_by(2) {
+        ts[s - 1] += simd_swizzle!(Simd::splat(0), ts[s], [0, 2]);
+        ts[s + 1] += simd_swizzle!(Simd::splat(0), ts[s], [3, 0]);
+    }
+
+    for s in (0..10).step_by(2) {
+        t[s] = t[s].wrapping_add(ts[s][0]);
+        t[s + 1] = t[s + 1].wrapping_add(ts[s][1]);
+    }
+
+    // sign extend redundant carries
+    t[1] += t[0] >> 51;
+    t[2] += t[1] >> 51;
+    t[3] += t[2] >> 51;
+    t[4] += t[3] >> 51;
+
+    let r0 = smult_noinit_simd(Simd::splat(t[0] as u64 & MASK51), RHO_4);
+    let r1 = smult_noinit_simd(Simd::splat(t[1] as u64 & MASK51), RHO_3);
+    let r2 = smult_noinit_simd(Simd::splat(t[2] as u64 & MASK51), RHO_2);
+    let r3 = smult_noinit_simd(Simd::splat(t[3] as u64 & MASK51), RHO_1);
+
+    let s = [
+        r0[0] + r1[0] + r2[0] + r3[0] + Simd::splat(t[4]),
+        r0[1] + r1[1] + r2[1] + r3[1] + Simd::splat(t[5]),
+        r0[2] + r1[2] + r2[2] + r3[2] + Simd::splat(t[6]),
+        r0[3] + r1[3] + r2[3] + r3[3] + Simd::splat(t[7]),
+        r0[4] + r1[4] + r2[4] + r3[4] + Simd::splat(t[8]),
+        r0[5] + r1[5] + r2[5] + r3[5] + Simd::splat(t[9]),
+    ];
+
+    let m = (s[0].cast() * Simd::splat(U51_NP0)).bitand(Simd::splat(MASK51));
+    let mp = smult_noinit_simd(m, U51_P);
+
+    let mut addi = addv_simd(s, mp);
+    addi[1] += addi[0] >> 51;
+    let addi = [addi[1], addi[2], addi[3], addi[4], addi[5]];
+
+    // 1 bit reduction to go from R^-255 to R^-256. reduce_ct does the preparation
+    // and the final shift is done as part of the conversion back to u256
+    let reduced = reduce_ct_simd(addi);
+    let reduced = redundant_carry(reduced);
+    let u256_result = u255_to_u256_shr_1_simd(reduced);
+    let v = transpose_simd_to_u256(u256_result);
+    (v[0], v[1])
+}
+
+#[cfg(not(target_arch = "wasm32"))]
+#[cfg(test)]
+mod tests {
+    use {
+        super::*,
+        crate::{rne::simd_utils::u255_to_u256_simd, test_utils::ark_ff_reference},
+        ark_bn254::Fr,
+        ark_ff::{BigInt, PrimeField},
+        proptest::{
+            prelude::{prop, Strategy},
+            prop_assert_eq, proptest,
+        },
+    };
+
+    #[test]
+    fn test_simd_mul() {
+        proptest!(|(
+                a in limbs5_51(),
+                b in limbs5_51(),
+            )| {
+                let a: [Simd<u64,1>;_] = a.map(Simd::splat);
+                let b: [Simd<u64,1>;_] = b.map(Simd::splat);
+                let a = u255_to_u256_simd(a).map(|x|x[0]);
+                let b = u255_to_u256_simd(b).map(|x|x[0]);
+                let (ab, _bc) = simd_mul(a, b);
+                let ab_ref = ark_ff_reference(a, b);
+                let ab = Fr::new(BigInt(ab));
+                prop_assert_eq!(ab_ref, ab, "mismatch: l = {:X}, b = {:X}", ab_ref.into_bigint(), ab.into_bigint());
+        })
+    }
+
+    // #[test]
+    // fn test_simd_sqr() {
+    //     proptest!(|(
+    //             a in limbs5_51(),
+    //             b in limbs5_51(),
+    //         )| {
+    //             let a: [Simd<u64,1>;_] = a.map(Simd::splat);
+    //             let b: [Simd<u64,1>;_] = b.map(Simd::splat);
+    //             let a = u255_to_u256_simd(a).map(|x|x[0]);
+    //             let b = u255_to_u256_simd(b).map(|x|x[0]);
+    //             let (a2, _b2) = simd_mul(a, b, b);
+    //             let (a2s, _b2s) = simd_sqr(a, b);
+    //             prop_assert_eq!(a2, a2s);
+    //     })
+    // }
+
+    fn limb51() -> impl Strategy<Value = u64> {
+        0u64..(1u64 << 51)
+    }
+
+    fn limbs5_51() -> impl Strategy<Value = [u64; 5]> {
+        prop::array::uniform5(limb51())
+    }
+}

From e565c21b285ba58f1e4873ed4fa6c46622e39be2 Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Mon, 2 Feb 2026 19:14:00 +0800
Subject: [PATCH 03/21] =?UTF-8?q?rne/mono:=205=C3=975=20multiplication=20u?=
 =?UTF-8?q?sing=206-wide=20SIMD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 skyscraper/bn254-multiplier/src/rne/single.rs | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/skyscraper/bn254-multiplier/src/rne/single.rs b/skyscraper/bn254-multiplier/src/rne/single.rs
index 1fe990e1d..5a691ea7d 100644
--- a/skyscraper/bn254-multiplier/src/rne/single.rs
+++ b/skyscraper/bn254-multiplier/src/rne/single.rs
@@ -197,15 +197,15 @@ pub fn simd_mul(v0_a: [u64; 4], v0_b: [u64; 4]) -> ([u64; 4], [u64; 4]) {
 
     let mut t: [i64; 10] = [0; 10];
     t[0] = make_initial(1, 0);
-    t[9] = make_initial(0, 6);
+    t[9] = make_initial(1, 6);
     t[1] = make_initial(2, 1);
-    t[8] = make_initial(6, 7);
+    t[8] = make_initial(7, 7);
     t[2] = make_initial(3, 2);
-    t[7] = make_initial(7, 8);
+    t[7] = make_initial(8, 8);
     t[3] = make_initial(4, 3);
-    t[6] = make_initial(8, 9);
+    t[6] = make_initial(9, 9);
     t[4] = make_initial(10, 4);
-    t[5] = make_initial(9, 10);
+    t[5] = make_initial(10, 10);
 
     let mut ts = [Simd::splat(0); 9];
 
@@ -224,11 +224,11 @@ pub fn simd_mul(v0_a: [u64; 4], v0_b: [u64; 4]) -> ([u64; 4], [u64; 4]) {
         ts[i+3] += p_hi.to_bits().cast();
         ts[i+2] += p_lo.to_bits().cast();
 
-        let b4 = Simd::splat(i2f_scalar(v0_b[4]));
+        let b4 = Simd::from_array([i2f_scalar(v0_b[4]),0.]);
         let p_hi = fma(ai, b4, Simd::splat(C1));
         let p_lo = fma(ai, b4, Simd::splat(C2) - p_hi);
         t[i + 5] = t[i + 5].wrapping_add(p_hi[0].to_bits() as i64);
-        t[i + 4] = t[i + 4].wrapping_add(p_lo[0].to_bits() as i64);
+        ts[i + 4] += p_lo.to_bits().cast();
 
     });
 

From d5f84ecbc0a0c68a75ab770cc7c857e9e1635aa9 Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Mon, 2 Feb 2026 19:14:03 +0800
Subject: [PATCH 04/21] rne/mono: complete schoolbook phase

---
 skyscraper/bn254-multiplier/src/rne/single.rs | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/skyscraper/bn254-multiplier/src/rne/single.rs b/skyscraper/bn254-multiplier/src/rne/single.rs
index 5a691ea7d..d2f394eaa 100644
--- a/skyscraper/bn254-multiplier/src/rne/single.rs
+++ b/skyscraper/bn254-multiplier/src/rne/single.rs
@@ -197,17 +197,17 @@ pub fn simd_mul(v0_a: [u64; 4], v0_b: [u64; 4]) -> ([u64; 4], [u64; 4]) {
 
     let mut t: [i64; 10] = [0; 10];
     t[0] = make_initial(1, 0);
-    t[9] = make_initial(1, 6);
+    t[9] = make_initial(1, 7);
     t[1] = make_initial(2, 1);
-    t[8] = make_initial(7, 7);
+    t[8] = make_initial(7, 8);
     t[2] = make_initial(3, 2);
-    t[7] = make_initial(8, 8);
+    t[7] = make_initial(8, 9);
     t[3] = make_initial(4, 3);
-    t[6] = make_initial(9, 9);
+    t[6] = make_initial(9, 10);
     t[4] = make_initial(10, 4);
     t[5] = make_initial(10, 10);
 
-    let mut ts = [Simd::splat(0); 9];
+    let mut ts = [Simd::splat(0); 10];
 
     // Offset multiplication to have less intermediate data
     seq!(i in 0..5{
@@ -227,7 +227,7 @@ pub fn simd_mul(v0_a: [u64; 4], v0_b: [u64; 4]) -> ([u64; 4], [u64; 4]) {
         let b4 = Simd::from_array([i2f_scalar(v0_b[4]),0.]);
         let p_hi = fma(ai, b4, Simd::splat(C1));
         let p_lo = fma(ai, b4, Simd::splat(C2) - p_hi);
-        t[i + 5] = t[i + 5].wrapping_add(p_hi[0].to_bits() as i64);
+        ts[i + 5] += p_hi.to_bits().cast();
         ts[i + 4] += p_lo.to_bits().cast();
 
     });
@@ -236,6 +236,7 @@ pub fn simd_mul(v0_a: [u64; 4], v0_b: [u64; 4]) -> ([u64; 4], [u64; 4]) {
         ts[s - 1] += simd_swizzle!(Simd::splat(0), ts[s], [0, 2]);
         ts[s + 1] += simd_swizzle!(Simd::splat(0), ts[s], [3, 0]);
     }
+    ts[9 - 1] += simd_swizzle!(Simd::splat(0), ts[9], [0, 2]);
 
     for s in (0..10).step_by(2) {
         t[s] = t[s].wrapping_add(ts[s][0]);

From 8493700ca57ca012b2bdb980ecd0cc152b196331 Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Mon, 2 Feb 2026 19:14:06 +0800
Subject: [PATCH 05/21] rne/mono: add SIMD initialization and RHO swizzle

---
 skyscraper/bn254-multiplier/src/rne/single.rs | 53 +++++++++++--------
 1 file changed, 31 insertions(+), 22 deletions(-)

diff --git a/skyscraper/bn254-multiplier/src/rne/single.rs b/skyscraper/bn254-multiplier/src/rne/single.rs
index d2f394eaa..57912c503 100644
--- a/skyscraper/bn254-multiplier/src/rne/single.rs
+++ b/skyscraper/bn254-multiplier/src/rne/single.rs
@@ -195,19 +195,12 @@ pub fn simd_mul(v0_a: [u64; 4], v0_b: [u64; 4]) -> ([u64; 4], [u64; 4]) {
     let v0_a = u256_to_u255(v0_a);
     let v0_b = u256_to_u255(v0_b);
 
-    let mut t: [i64; 10] = [0; 10];
-    t[0] = make_initial(1, 0);
-    t[9] = make_initial(1, 7);
-    t[1] = make_initial(2, 1);
-    t[8] = make_initial(7, 8);
-    t[2] = make_initial(3, 2);
-    t[7] = make_initial(8, 9);
-    t[3] = make_initial(4, 3);
-    t[6] = make_initial(9, 10);
-    t[4] = make_initial(10, 4);
-    t[5] = make_initial(10, 10);
-
     let mut ts = [Simd::splat(0); 10];
+    ts[0] = Simd::from_array([make_initial(1, 0), make_initial(2, 1)]);
+    ts[2] = Simd::from_array([make_initial(3, 2), make_initial(4, 3)]);
+    ts[4] = Simd::from_array([make_initial(10, 4), make_initial(10, 10)]);
+    ts[6] = Simd::from_array([make_initial(9, 10), make_initial(8, 9)]);
+    ts[8] = Simd::from_array([make_initial(7, 8), make_initial(1, 7)]);
 
     // Offset multiplication to have less intermediate data
     seq!(i in 0..5{
@@ -232,28 +225,44 @@ pub fn simd_mul(v0_a: [u64; 4], v0_b: [u64; 4]) -> ([u64; 4], [u64; 4]) {
 
     });
 
-    for s in (1..9).step_by(2) {
-        ts[s - 1] += simd_swizzle!(Simd::splat(0), ts[s], [0, 2]);
-        ts[s + 1] += simd_swizzle!(Simd::splat(0), ts[s], [3, 0]);
-    }
-    ts[9 - 1] += simd_swizzle!(Simd::splat(0), ts[9], [0, 2]);
+    let mut t: [i64; 10] = [0; 10];
 
-    for s in (0..10).step_by(2) {
-        t[s] = t[s].wrapping_add(ts[s][0]);
-        t[s + 1] = t[s + 1].wrapping_add(ts[s][1]);
-    }
+    seq!( i in 0..2 {
+        let s = i * 2;
+        ts[s] += simd_swizzle!(Simd::splat(0), ts[s + 1], [0, 2]);
+        ts[s + 2] += simd_swizzle!(Simd::splat(0), ts[s + 1], [3, 0]);
+        t[s] = ts[s][0];
+        t[s + 1] = ts[s][1];
+    });
 
     // sign extend redundant carries
     t[1] += t[0] >> 51;
     t[2] += t[1] >> 51;
     t[3] += t[2] >> 51;
-    t[4] += t[3] >> 51;
+
+    // Lift carry into SIMD to prevent extraction
+    ts[4] += Simd::from_array([t[3] >> 51, 0]);
 
     let r0 = smult_noinit_simd(Simd::splat(t[0] as u64 & MASK51), RHO_4);
     let r1 = smult_noinit_simd(Simd::splat(t[1] as u64 & MASK51), RHO_3);
     let r2 = smult_noinit_simd(Simd::splat(t[2] as u64 & MASK51), RHO_2);
     let r3 = smult_noinit_simd(Simd::splat(t[3] as u64 & MASK51), RHO_1);
 
+    seq!( i in 2..4 {
+        let s = i * 2;
+        ts[s] += simd_swizzle!(Simd::splat(0), ts[s + 1], [0, 2]);
+        ts[s + 2] += simd_swizzle!(Simd::splat(0), ts[s + 1], [3, 0]);
+    });
+    ts[9 - 1] += simd_swizzle!(Simd::splat(0), ts[9], [0, 2]);
+
+    // After this point only the even ts matter
+
+    seq!(i in 2..5 {
+        let s = i * 2;
+        t[s] = ts[s][0];
+        t[s + 1] = ts[s][1];
+    });
+
     let s = [
         r0[0] + r1[0] + r2[0] + r3[0] + Simd::splat(t[4]),
         r0[1] + r1[1] + r2[1] + r3[1] + Simd::splat(t[5]),

From df6219e3b258645e4e85e656d4f6151c22e3f62c Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Mon, 2 Feb 2026 19:14:08 +0800
Subject: [PATCH 06/21] rne/mono: swizzle after Montgomery reduction

---
 skyscraper/bn254-multiplier/src/rne/single.rs | 69 +++++++++----------
 1 file changed, 33 insertions(+), 36 deletions(-)

diff --git a/skyscraper/bn254-multiplier/src/rne/single.rs b/skyscraper/bn254-multiplier/src/rne/single.rs
index 57912c503..e0f92f1f8 100644
--- a/skyscraper/bn254-multiplier/src/rne/single.rs
+++ b/skyscraper/bn254-multiplier/src/rne/single.rs
@@ -166,24 +166,15 @@ pub fn smult_noinit(s: u64, v: [u64; 5]) -> [Simd<i64, 2>; 6] {
 
     let v23 = Simd::from_array([v[2] as f64, v[3] as f64]);
     let p_hi = fma(s, v23, Simd::splat(C1));
-    let p_lo_0 = fma(s, v23, Simd::splat(C2) - p_hi);
-    t[2] += p_hi.to_bits().cast();
-    t[3] += p_lo_0.to_bits().cast();
+    let p_lo = fma(s, v23, Simd::splat(C2) - p_hi);
+    t[3] += p_hi.to_bits().cast();
+    t[2] += p_lo.to_bits().cast();
 
-    let p_hi_2 = fma(s, Simd::splat(v[2] as f64), Simd::splat(C1));
-    let p_lo_2 = fma(s, Simd::splat(v[2] as f64), Simd::splat(C2) - p_hi_2);
-    t[3] += p_hi_2.to_bits().cast();
-    t[2] += p_lo_2.to_bits().cast();
-
-    let p_hi_3 = fma(s, Simd::splat(v[3] as f64), Simd::splat(C1));
-    let p_lo_3 = fma(s, Simd::splat(v[3] as f64), Simd::splat(C2) - p_hi_3);
-    t[4] += p_hi_3.to_bits().cast();
-    t[3] += p_lo_3.to_bits().cast();
-
-    let p_hi_4 = fma(s, Simd::splat(v[4] as f64), Simd::splat(C1));
-    let p_lo_4 = fma(s, Simd::splat(v[4] as f64), Simd::splat(C2) - p_hi_4);
-    t[5] += p_hi_4.to_bits().cast();
-    t[4] += p_lo_4.to_bits().cast();
+    let v45 = Simd::from_array([v[4] as f64, 0.]);
+    let p_hi = fma(s, v45, Simd::splat(C1));
+    let p_lo = fma(s, v45, Simd::splat(C2) - p_hi);
+    t[5] += Simd::from_array([p_hi[0].to_bits() as i64, 0]);
+    t[4] += Simd::from_array([p_lo[0].to_bits() as i64, 0]);
 
     t
 }
@@ -225,7 +216,7 @@ pub fn simd_mul(v0_a: [u64; 4], v0_b: [u64; 4]) -> ([u64; 4], [u64; 4]) {
 
     });
 
-    let mut t: [i64; 10] = [0; 10];
+    let mut t: [i64; 4] = [0; 4];
 
     seq!( i in 0..2 {
         let s = i * 2;
@@ -243,33 +234,39 @@ pub fn simd_mul(v0_a: [u64; 4], v0_b: [u64; 4]) -> ([u64; 4], [u64; 4]) {
     // Lift carry into SIMD to prevent extraction
     ts[4] += Simd::from_array([t[3] >> 51, 0]);
 
-    let r0 = smult_noinit_simd(Simd::splat(t[0] as u64 & MASK51), RHO_4);
-    let r1 = smult_noinit_simd(Simd::splat(t[1] as u64 & MASK51), RHO_3);
-    let r2 = smult_noinit_simd(Simd::splat(t[2] as u64 & MASK51), RHO_2);
-    let r3 = smult_noinit_simd(Simd::splat(t[3] as u64 & MASK51), RHO_1);
+    let r0 = smult_noinit(t[0] as u64 & MASK51, RHO_4);
+    let r1 = smult_noinit(t[1] as u64 & MASK51, RHO_3);
+    let r2 = smult_noinit(t[2] as u64 & MASK51, RHO_2);
+    let r3 = smult_noinit(t[3] as u64 & MASK51, RHO_1);
 
-    seq!( i in 2..4 {
-        let s = i * 2;
-        ts[s] += simd_swizzle!(Simd::splat(0), ts[s + 1], [0, 2]);
-        ts[s + 2] += simd_swizzle!(Simd::splat(0), ts[s + 1], [3, 0]);
+    let mut ss = [ts[4], ts[5], ts[6], ts[7], ts[8], ts[9]];
+
+    seq!( i in 0..6 {
+        ss[i] += r0[i] + r1[i] + r2[i] + r3[i];
     });
-    ts[9 - 1] += simd_swizzle!(Simd::splat(0), ts[9], [0, 2]);
 
+    seq!( i in 0..2 {
+        let s = i * 2;
+        ss[s] += simd_swizzle!(Simd::splat(0), ss[s + 1], [0, 2]);
+        ss[s + 2] += simd_swizzle!(Simd::splat(0), ss[s + 1], [3, 0]);
+    });
+    ss[5 - 1] += simd_swizzle!(Simd::splat(0), ss[5], [0, 2]);
     // After this point only the even ts matter
 
-    seq!(i in 2..5 {
+    let mut t = [0; 6];
+    seq!(i in 0..3 {
         let s = i * 2;
-        t[s] = ts[s][0];
-        t[s + 1] = ts[s][1];
+        t[s] = ss[s][0];
+        t[s + 1] = ss[s][1];
     });
 
     let s = [
-        r0[0] + r1[0] + r2[0] + r3[0] + Simd::splat(t[4]),
-        r0[1] + r1[1] + r2[1] + r3[1] + Simd::splat(t[5]),
-        r0[2] + r1[2] + r2[2] + r3[2] + Simd::splat(t[6]),
-        r0[3] + r1[3] + r2[3] + r3[3] + Simd::splat(t[7]),
-        r0[4] + r1[4] + r2[4] + r3[4] + Simd::splat(t[8]),
-        r0[5] + r1[5] + r2[5] + r3[5] + Simd::splat(t[9]),
+        Simd::splat(t[0]),
+        Simd::splat(t[1]),
+        Simd::splat(t[2]),
+        Simd::splat(t[3]),
+        Simd::splat(t[4]),
+        Simd::splat(t[5]),
     ];
 
     let m = (s[0].cast() * Simd::splat(U51_NP0)).bitand(Simd::splat(MASK51));

From d7319144c5711ebd5ceab8b9aeb352f277a67925 Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Mon, 2 Feb 2026 19:14:11 +0800
Subject: [PATCH 07/21] rne/mono: swizzle after m*P Montgomery step

---
 skyscraper/bn254-multiplier/src/rne/single.rs | 36 +++++++++----------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/skyscraper/bn254-multiplier/src/rne/single.rs b/skyscraper/bn254-multiplier/src/rne/single.rs
index e0f92f1f8..d571da019 100644
--- a/skyscraper/bn254-multiplier/src/rne/single.rs
+++ b/skyscraper/bn254-multiplier/src/rne/single.rs
@@ -245,6 +245,13 @@ pub fn simd_mul(v0_a: [u64; 4], v0_b: [u64; 4]) -> ([u64; 4], [u64; 4]) {
         ss[i] += r0[i] + r1[i] + r2[i] + r3[i];
     });
 
+    let m = (ss[0][0] as u64).wrapping_mul(U51_NP0) & MASK51;
+    let mp = smult_noinit(m, U51_P);
+
+    seq!( i in 0..6 {
+        ss[i] += mp[i];
+    });
+
     seq!( i in 0..2 {
         let s = i * 2;
         ss[s] += simd_swizzle!(Simd::splat(0), ss[s + 1], [0, 2]);
@@ -253,32 +260,25 @@ pub fn simd_mul(v0_a: [u64; 4], v0_b: [u64; 4]) -> ([u64; 4], [u64; 4]) {
     ss[5 - 1] += simd_swizzle!(Simd::splat(0), ss[5], [0, 2]);
     // After this point only the even ts matter
 
-    let mut t = [0; 6];
+    let mut s: [i64; 6] = [0; 6];
     seq!(i in 0..3 {
-        let s = i * 2;
-        t[s] = ss[s][0];
-        t[s + 1] = ss[s][1];
+        let k = i * 2;
+        s[k] = ss[k][0];
+        s[k + 1] = ss[k][1];
     });
 
+    s[1] += s[0] >> 51;
     let s = [
-        Simd::splat(t[0]),
-        Simd::splat(t[1]),
-        Simd::splat(t[2]),
-        Simd::splat(t[3]),
-        Simd::splat(t[4]),
-        Simd::splat(t[5]),
+        Simd::splat(s[1]),
+        Simd::splat(s[2]),
+        Simd::splat(s[3]),
+        Simd::splat(s[4]),
+        Simd::splat(s[5]),
     ];
 
-    let m = (s[0].cast() * Simd::splat(U51_NP0)).bitand(Simd::splat(MASK51));
-    let mp = smult_noinit_simd(m, U51_P);
-
-    let mut addi = addv_simd(s, mp);
-    addi[1] += addi[0] >> 51;
-    let addi = [addi[1], addi[2], addi[3], addi[4], addi[5]];
-
     // 1 bit reduction to go from R^-255 to R^-256. reduce_ct does the preparation
     // and the final shift is done as part of the conversion back to u256
-    let reduced = reduce_ct_simd(addi);
+    let reduced = reduce_ct_simd(s);
     let reduced = redundant_carry(reduced);
     let u256_result = u255_to_u256_shr_1_simd(reduced);
     let v = transpose_simd_to_u256(u256_result);

From 7c35215fa0b89f137400b57a1dde01e347ea1b4b Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Mon, 2 Feb 2026 19:14:14 +0800
Subject: [PATCH 08/21] rne/mono: 51-bit limb SIMD implementation

---
 .../bn254-multiplier/src/rne/simd_utils.rs    | 12 ++--
 skyscraper/bn254-multiplier/src/rne/single.rs | 68 ++++++++++++++-----
 2 files changed, 59 insertions(+), 21 deletions(-)

diff --git a/skyscraper/bn254-multiplier/src/rne/simd_utils.rs b/skyscraper/bn254-multiplier/src/rne/simd_utils.rs
index 93acd71b0..17157f105 100644
--- a/skyscraper/bn254-multiplier/src/rne/simd_utils.rs
+++ b/skyscraper/bn254-multiplier/src/rne/simd_utils.rs
@@ -1,17 +1,21 @@
 //! SIMD utilities for RNE Montgomery multiplication.
 
 use {
-    crate::rne::constants::{C1, C2, C3, MASK51, U51_P},
+    crate::rne::{
+        constants::{C1, C2, C3, MASK51},
+        U51_P,
+    },
     core::{
-        array,
         ops::BitAnd,
         simd::{
-            cmp::SimdPartialEq,
             num::{SimdFloat, SimdInt, SimdUint},
             Simd,
         },
     },
-    std::simd::{LaneCount, SupportedLaneCount},
+    std::{
+        array,
+        simd::{cmp::SimdPartialEq, LaneCount, SupportedLaneCount},
+    },
 };
 #[inline(always)]
 /// On WASM there is no single specialised instruction to cast an integer to a
diff --git a/skyscraper/bn254-multiplier/src/rne/single.rs b/skyscraper/bn254-multiplier/src/rne/single.rs
index d571da019..06621fba4 100644
--- a/skyscraper/bn254-multiplier/src/rne/single.rs
+++ b/skyscraper/bn254-multiplier/src/rne/single.rs
@@ -105,9 +105,6 @@ pub fn simd_sqr(v0_a: [u64; 4], v1_a: [u64; 4]) -> ([u64; 4], [u64; 4]) {
     (v[0], v[1])
 }
 
-/// Move redundant carries from lower limbs to the higher limbs such that all
-/// limbs except the last one is 51 bits. The most significant limb can be
-/// larger than 51 bits as the input can be bigger 2^255-1.
 #[inline(always)]
 fn redundant_carry<const N: usize, const L: usize>(t: [Simd<i64, L>; N]) -> [Simd<u64, L>; N]
 where
@@ -125,6 +122,23 @@ where
     res
 }
 
+/// Move redundant carries from lower limbs to the higher limbs such that all
+/// limbs except the last one is 51 bits. The most significant limb can be
+/// larger than 51 bits as the input can be bigger 2^255-1.
+#[inline(always)]
+fn redundant_carry_scalar<const N: usize>(t: [i64; N]) -> [u64; N] {
+    let mut borrow = 0;
+    let mut res = [0; N];
+    for i in 0..t.len() - 1 {
+        let tmp = t[i] + borrow;
+        res[i] = (tmp as u64) & MASK51;
+        borrow = tmp >> 51;
+    }
+
+    res[N - 1] = (t[N - 1] + borrow) as u64;
+    res
+}
+
 /// Convert 4×64-bit to 5×51-bit limb representation.
 /// Input must fit in 255 bits; no runtime checking.
 #[inline(always)]
@@ -179,10 +193,25 @@ pub fn smult_noinit(s: u64, v: [u64; 5]) -> [Simd<i64, 2>; 6] {
     t
 }
 
+#[inline(always)]
+pub fn reduce_ct(mut a: [i64; 5]) -> [i64; 5] {
+    // To reduce Check whether the least significant bit is set
+    let mask = -(a[0] & 1);
+
+    seq!( i in 0..5 {
+        a[i] += U51_P[i] as i64 & mask;
+    });
+
+    // Check that final result is even
+    debug_assert!(a[0] & 1 == 0);
+
+    a
+}
+
 /// Two parallel Montgomery multiplications: `(v0_a*v0_b, v1_a*v1_b)`.
 /// input must fit in 2^255-1; no runtime checking
 #[inline(always)]
-pub fn simd_mul(v0_a: [u64; 4], v0_b: [u64; 4]) -> ([u64; 4], [u64; 4]) {
+pub fn simd_mul(v0_a: [u64; 4], v0_b: [u64; 4]) -> [u64; 4] {
     let v0_a = u256_to_u255(v0_a);
     let v0_b = u256_to_u255(v0_b);
 
@@ -268,21 +297,26 @@ pub fn simd_mul(v0_a: [u64; 4], v0_b: [u64; 4]) -> ([u64; 4], [u64; 4]) {
     });
 
     s[1] += s[0] >> 51;
-    let s = [
-        Simd::splat(s[1]),
-        Simd::splat(s[2]),
-        Simd::splat(s[3]),
-        Simd::splat(s[4]),
-        Simd::splat(s[5]),
-    ];
+    let s = [s[1], s[2], s[3], s[4], s[5]];
 
     // 1 bit reduction to go from R^-255 to R^-256. reduce_ct does the preparation
     // and the final shift is done as part of the conversion back to u256
-    let reduced = reduce_ct_simd(s);
-    let reduced = redundant_carry(reduced);
-    let u256_result = u255_to_u256_shr_1_simd(reduced);
-    let v = transpose_simd_to_u256(u256_result);
-    (v[0], v[1])
+    let reduced = reduce_ct(s);
+    let reduced = redundant_carry_scalar(reduced);
+    let u256_result = u255_to_u256_shr_1(reduced);
+    u256_result
+}
+
+/// Convert 5×51-bit to 4×64-bit with simultaneous division by 2.
+#[inline(always)]
+pub fn u255_to_u256_shr_1(limbs: [u64; 5]) -> [u64; 4] {
+    let [l0, l1, l2, l3, l4] = limbs;
+    [
+        (l0 >> 1) | (l1 << 50),
+        (l1 >> 14) | (l2 << 37),
+        (l2 >> 27) | (l3 << 24),
+        (l3 >> 40) | (l4 << 11),
+    ]
 }
 
 #[cfg(not(target_arch = "wasm32"))]
@@ -309,7 +343,7 @@ mod tests {
                 let b: [Simd<u64,1>;_] = b.map(Simd::splat);
                 let a = u255_to_u256_simd(a).map(|x|x[0]);
                 let b = u255_to_u256_simd(b).map(|x|x[0]);
-                let (ab, _bc) = simd_mul(a, b);
+                let ab = simd_mul(a, b);
                 let ab_ref = ark_ff_reference(a, b);
                 let ab = Fr::new(BigInt(ab));
                 prop_assert_eq!(ab_ref, ab, "mismatch: l = {:X}, b = {:X}", ab_ref.into_bigint(), ab.into_bigint());

From 9696d04ecde00ff8db2f2c4df4a16727af665150 Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Mon, 2 Feb 2026 19:14:16 +0800
Subject: [PATCH 09/21] rne/mono: extraction-free reduction phase

---
 skyscraper/bn254-multiplier/src/rne/single.rs | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/skyscraper/bn254-multiplier/src/rne/single.rs b/skyscraper/bn254-multiplier/src/rne/single.rs
index 06621fba4..7e4de29b1 100644
--- a/skyscraper/bn254-multiplier/src/rne/single.rs
+++ b/skyscraper/bn254-multiplier/src/rne/single.rs
@@ -187,8 +187,8 @@ pub fn smult_noinit(s: u64, v: [u64; 5]) -> [Simd<i64, 2>; 6] {
     let v45 = Simd::from_array([v[4] as f64, 0.]);
     let p_hi = fma(s, v45, Simd::splat(C1));
     let p_lo = fma(s, v45, Simd::splat(C2) - p_hi);
-    t[5] += Simd::from_array([p_hi[0].to_bits() as i64, 0]);
-    t[4] += Simd::from_array([p_lo[0].to_bits() as i64, 0]);
+    t[5] += p_hi.to_bits().cast();
+    t[4] += p_lo.to_bits().cast();
 
     t
 }
@@ -220,7 +220,7 @@ pub fn simd_mul(v0_a: [u64; 4], v0_b: [u64; 4]) -> [u64; 4] {
     ts[2] = Simd::from_array([make_initial(3, 2), make_initial(4, 3)]);
     ts[4] = Simd::from_array([make_initial(10, 4), make_initial(10, 10)]);
     ts[6] = Simd::from_array([make_initial(9, 10), make_initial(8, 9)]);
-    ts[8] = Simd::from_array([make_initial(7, 8), make_initial(1, 7)]);
+    ts[8] = Simd::from_array([make_initial(7, 8), make_initial(6, 7)]);
 
     // Offset multiplication to have less intermediate data
     seq!(i in 0..5{
@@ -281,14 +281,15 @@ pub fn simd_mul(v0_a: [u64; 4], v0_b: [u64; 4]) -> [u64; 4] {
         ss[i] += mp[i];
     });
 
+    // Get rid of redundant SIMD form
     seq!( i in 0..2 {
         let s = i * 2;
         ss[s] += simd_swizzle!(Simd::splat(0), ss[s + 1], [0, 2]);
         ss[s + 2] += simd_swizzle!(Simd::splat(0), ss[s + 1], [3, 0]);
     });
     ss[5 - 1] += simd_swizzle!(Simd::splat(0), ss[5], [0, 2]);
-    // After this point only the even ts matter
 
+    // Move to redundant scalar form
     let mut s: [i64; 6] = [0; 6];
     seq!(i in 0..3 {
         let k = i * 2;

From 70deb8c1703d04f06aeb5442862871d47597364d Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Mon, 2 Feb 2026 19:14:19 +0800
Subject: [PATCH 10/21] rne/mono: WIP checkpoint

---
 skyscraper/bn254-multiplier/benches/bench.rs  |   8 +
 skyscraper/bn254-multiplier/src/rne/single.rs | 245 +++++++++---------
 2 files changed, 125 insertions(+), 128 deletions(-)

diff --git a/skyscraper/bn254-multiplier/benches/bench.rs b/skyscraper/bn254-multiplier/benches/bench.rs
index 72f252e39..75219c12e 100644
--- a/skyscraper/bn254-multiplier/benches/bench.rs
+++ b/skyscraper/bn254-multiplier/benches/bench.rs
@@ -149,6 +149,14 @@ mod sqr {
             .bench_local_values(|(a, b)| rne::simd_sqr(a, b));
     }
 
+    #[divan::bench]
+    fn single_sqr_b51(bencher: Bencher) {
+        bencher
+            //.counter(ItemsCount::new(1usize))
+            .with_inputs(|| rng().random())
+            .bench_local_values(|a| rne::single::simd_sqr(a));
+    }
+
     #[divan::bench]
     fn ark_ff(bencher: Bencher) {
         use {ark_bn254::Fr, ark_ff::BigInt};
diff --git a/skyscraper/bn254-multiplier/src/rne/single.rs b/skyscraper/bn254-multiplier/src/rne/single.rs
index 7e4de29b1..0e77b6e4a 100644
--- a/skyscraper/bn254-multiplier/src/rne/single.rs
+++ b/skyscraper/bn254-multiplier/src/rne/single.rs
@@ -6,122 +6,13 @@
 use {
     crate::rne::{
         constants::*,
-        simd_utils::{
-            addv_simd, fma, i2f, make_initial, reduce_ct_simd, smult_noinit_simd,
-            transpose_simd_to_u256, transpose_u256_to_simd, u255_to_u256_shr_1_simd,
-            u256_to_u255_simd,
-        },
-    },
-    core::{
-        ops::BitAnd,
-        simd::{num::SimdFloat, Simd},
+        simd_utils::{fma, i2f, make_initial},
     },
+    core::simd::{num::SimdFloat, Simd},
     seq_macro::seq,
-    std::simd::{
-        num::{SimdInt, SimdUint},
-        simd_swizzle,
-    },
+    std::simd::{num::SimdUint, simd_swizzle},
 };
 
-/// Two parallel Montgomery squarings: `(v0², v1²)`.
-/// input must fit in 2^255-1; no runtime checking
-#[inline]
-pub fn simd_sqr(v0_a: [u64; 4], v1_a: [u64; 4]) -> ([u64; 4], [u64; 4]) {
-    let v0_a = u256_to_u255_simd(transpose_u256_to_simd([v0_a, v1_a]));
-
-    let mut t: [Simd<i64, 2>; 10] = [Simd::splat(0); 10];
-
-    for i in 0..5 {
-        let avi: Simd<f64, 2> = i2f(v0_a[i]);
-        for j in (i + 1)..5 {
-            let bvj: Simd<f64, 2> = i2f(v0_a[j]);
-            let p_hi = fma(avi, bvj, Simd::splat(C1));
-            let p_lo = fma(avi, bvj, Simd::splat(C2) - p_hi);
-            t[i + j + 1] += p_hi.to_bits().cast();
-            t[i + j] += p_lo.to_bits().cast();
-        }
-    }
-
-    // Most shifting operations are more expensive addition thus for multiplying by
-    // 2 we use addition.
-    for i in 1..=8 {
-        t[i] += t[i];
-    }
-
-    for i in 0..5 {
-        let avi: Simd<f64, 2> = i2f(v0_a[i]);
-        let p_hi = fma(avi, avi, Simd::splat(C1));
-        let p_lo = fma(avi, avi, Simd::splat(C2) - p_hi);
-        t[i + i + 1] += p_hi.to_bits().cast();
-        t[i + i] += p_lo.to_bits().cast();
-    }
-
-    t[0] += Simd::splat(make_initial(1, 0));
-    t[9] += Simd::splat(make_initial(0, 6));
-    t[1] += Simd::splat(make_initial(2, 1));
-    t[8] += Simd::splat(make_initial(6, 7));
-    t[2] += Simd::splat(make_initial(3, 2));
-    t[7] += Simd::splat(make_initial(7, 8));
-    t[3] += Simd::splat(make_initial(4, 3));
-    t[6] += Simd::splat(make_initial(8, 9));
-    t[4] += Simd::splat(make_initial(10, 4));
-    t[5] += Simd::splat(make_initial(9, 10));
-
-    t[1] += t[0] >> 51;
-    t[2] += t[1] >> 51;
-    t[3] += t[2] >> 51;
-    t[4] += t[3] >> 51;
-
-    let r0 = smult_noinit_simd(t[0].cast().bitand(Simd::splat(MASK51)), RHO_4);
-    let r1 = smult_noinit_simd(t[1].cast().bitand(Simd::splat(MASK51)), RHO_3);
-    let r2 = smult_noinit_simd(t[2].cast().bitand(Simd::splat(MASK51)), RHO_2);
-    let r3 = smult_noinit_simd(t[3].cast().bitand(Simd::splat(MASK51)), RHO_1);
-
-    let s = [
-        r0[0] + r1[0] + r2[0] + r3[0] + t[4],
-        r0[1] + r1[1] + r2[1] + r3[1] + t[5],
-        r0[2] + r1[2] + r2[2] + r3[2] + t[6],
-        r0[3] + r1[3] + r2[3] + r3[3] + t[7],
-        r0[4] + r1[4] + r2[4] + r3[4] + t[8],
-        r0[5] + r1[5] + r2[5] + r3[5] + t[9],
-    ];
-
-    // The upper bits of s will not affect the lower 51 bits of the product and
-    // therefore we only have to bitmask once.
-    let m = (s[0].cast() * Simd::splat(U51_NP0)).bitand(Simd::splat(MASK51));
-    let mp = smult_noinit_simd(m, U51_P);
-
-    let mut addi = addv_simd(s, mp);
-    // Apply carries before dropping the last limb
-    addi[1] += addi[0] >> 51;
-    let addi = [addi[1], addi[2], addi[3], addi[4], addi[5]];
-
-    // 1 bit reduction to go from R^-255 to R^-256. reduce_ct does the preparation
-    // and the final shift is done as part of the conversion back to u256
-    let reduced = reduce_ct_simd(addi);
-    let reduced = redundant_carry(reduced);
-    let u256_result = u255_to_u256_shr_1_simd(reduced);
-    let v = transpose_simd_to_u256(u256_result);
-    (v[0], v[1])
-}
-
-#[inline(always)]
-fn redundant_carry<const N: usize, const L: usize>(t: [Simd<i64, L>; N]) -> [Simd<u64, L>; N]
-where
-    std::simd::LaneCount<L>: std::simd::SupportedLaneCount,
-{
-    let mut borrow = Simd::splat(0);
-    let mut res = [Simd::splat(0); N];
-    for i in 0..t.len() - 1 {
-        let tmp = t[i] + borrow;
-        res[i] = (tmp.cast()).bitand(Simd::splat(MASK51));
-        borrow = tmp >> 51;
-    }
-
-    res[N - 1] = (t[N - 1] + borrow).cast();
-    res
-}
-
 /// Move redundant carries from lower limbs to the higher limbs such that all
 /// limbs except the last one is 51 bits. The most significant limb can be
 /// larger than 51 bits as the input can be bigger 2^255-1.
@@ -153,7 +44,7 @@ pub fn u256_to_u255(limbs: [u64; 4]) -> [u64; 5] {
     ]
 }
 
-pub fn i2f_scalar(a: u64) -> f64 {
+pub const fn i2f_scalar(a: u64) -> f64 {
     // This function has no target gating as we want to verify this function with
     // kani and proptest on a different platform than wasm
 
@@ -320,6 +211,107 @@ pub fn u255_to_u256_shr_1(limbs: [u64; 5]) -> [u64; 4] {
     ]
 }
 
+/// Two parallel Montgomery multiplications: `(v0_a*v0_b, v1_a*v1_b)`.
+/// input must fit in 2^255-1; no runtime checking
+#[inline(always)]
+pub fn simd_sqr(v0_a: [u64; 4]) -> [u64; 4] {
+    let v0_a = u256_to_u255(v0_a);
+    let v0_b = v0_a;
+
+    let mut ts = [Simd::splat(0); 10];
+
+    // Offset multiplication to have less intermediate data
+    seq!(i in 0..5{
+        let ai: Simd<f64, 2> = i2f(Simd::splat(v0_a[i]));
+        let b01: Simd<f64, 2> = i2f(Simd::from_array([v0_b[0], v0_b[1]]));
+        let p_hi = fma(ai, b01, Simd::splat(C1));
+        let p_lo = fma(ai, b01, Simd::splat(C2) - p_hi);
+        ts[i+1] += p_hi.to_bits().cast();
+        ts[i+0] += p_lo.to_bits().cast();
+
+        let b23: Simd<f64, 2> = i2f(Simd::from_array([v0_b[2], v0_b[3]]));
+        let p_hi = fma(ai, b23, Simd::splat(C1));
+        let p_lo = fma(ai, b23, Simd::splat(C2) - p_hi);
+        ts[i+3] += p_hi.to_bits().cast();
+        ts[i+2] += p_lo.to_bits().cast();
+
+        let b4 = Simd::from_array([i2f_scalar(v0_b[4]),0.]);
+        let p_hi = fma(ai, b4, Simd::splat(C1));
+        let p_lo = fma(ai, b4, Simd::splat(C2) - p_hi);
+        ts[i + 5] += p_hi.to_bits().cast();
+        ts[i + 4] += p_lo.to_bits().cast();
+
+    });
+
+    ts[0] += Simd::from_array([make_initial(1, 0), make_initial(2, 1)]);
+    ts[2] += Simd::from_array([make_initial(3, 2), make_initial(4, 3)]);
+    ts[4] += Simd::from_array([make_initial(10, 4), make_initial(10, 10)]);
+    ts[6] += Simd::from_array([make_initial(9, 10), make_initial(8, 9)]);
+    ts[8] += Simd::from_array([make_initial(7, 8), make_initial(6, 7)]);
+
+    let mut t: [i64; 4] = [0; 4];
+
+    seq!( i in 0..2 {
+        let s = i * 2;
+        ts[s] += simd_swizzle!(Simd::splat(0), ts[s + 1], [0, 2]);
+        ts[s + 2] += simd_swizzle!(Simd::splat(0), ts[s + 1], [3, 0]);
+        t[s] = ts[s][0];
+        t[s + 1] = ts[s][1];
+    });
+
+    // sign extend redundant carries
+    t[1] += t[0] >> 51;
+    t[2] += t[1] >> 51;
+    t[3] += t[2] >> 51;
+
+    // Lift carry into SIMD to prevent extraction
+    ts[4] += Simd::from_array([t[3] >> 51, 0]);
+
+    let r0 = smult_noinit(t[0] as u64 & MASK51, RHO_4);
+    let r1 = smult_noinit(t[1] as u64 & MASK51, RHO_3);
+    let r2 = smult_noinit(t[2] as u64 & MASK51, RHO_2);
+    let r3 = smult_noinit(t[3] as u64 & MASK51, RHO_1);
+
+    let mut ss = [ts[4], ts[5], ts[6], ts[7], ts[8], ts[9]];
+
+    seq!( i in 0..6 {
+        ss[i] += r0[i] + r1[i] + r2[i] + r3[i];
+    });
+
+    let m = (ss[0][0] as u64).wrapping_mul(U51_NP0) & MASK51;
+    let mp = smult_noinit(m, U51_P);
+
+    seq!( i in 0..6 {
+        ss[i] += mp[i];
+    });
+
+    // Get rid of redundant SIMD form
+    seq!( i in 0..2 {
+        let s = i * 2;
+        ss[s] += simd_swizzle!(Simd::splat(0), ss[s + 1], [0, 2]);
+        ss[s + 2] += simd_swizzle!(Simd::splat(0), ss[s + 1], [3, 0]);
+    });
+    ss[5 - 1] += simd_swizzle!(Simd::splat(0), ss[5], [0, 2]);
+
+    // Move to redundant scalar form
+    let mut s: [i64; 6] = [0; 6];
+    seq!(i in 0..3 {
+        let k = i * 2;
+        s[k] = ss[k][0];
+        s[k + 1] = ss[k][1];
+    });
+
+    s[1] += s[0] >> 51;
+    let s = [s[1], s[2], s[3], s[4], s[5]];
+
+    // 1 bit reduction to go from R^-255 to R^-256. reduce_ct does the preparation
+    // and the final shift is done as part of the conversion back to u256
+    let reduced = reduce_ct(s);
+    let reduced = redundant_carry_scalar(reduced);
+    let u256_result = u255_to_u256_shr_1(reduced);
+    u256_result
+}
+
 #[cfg(not(target_arch = "wasm32"))]
 #[cfg(test)]
 mod tests {
@@ -351,21 +343,18 @@ mod tests {
         })
     }
 
-    // #[test]
-    // fn test_simd_sqr() {
-    //     proptest!(|(
-    //             a in limbs5_51(),
-    //             b in limbs5_51(),
-    //         )| {
-    //             let a: [Simd<u64,1>;_] = a.map(Simd::splat);
-    //             let b: [Simd<u64,1>;_] = b.map(Simd::splat);
-    //             let a = u255_to_u256_simd(a).map(|x|x[0]);
-    //             let b = u255_to_u256_simd(b).map(|x|x[0]);
-    //             let (a2, _b2) = simd_mul(a, b, b);
-    //             let (a2s, _b2s) = simd_sqr(a, b);
-    //             prop_assert_eq!(a2, a2s);
-    //     })
-    // }
+    #[test]
+    fn test_simd_sqr() {
+        proptest!(|(
+                a in limbs5_51(),
+            )| {
+                let a: [Simd<u64,1>;_] = a.map(Simd::splat);
+                let a = u255_to_u256_simd(a).map(|x|x[0]);
+                let a2 = simd_mul(a, a);
+                let b2 = simd_sqr(a);
+                prop_assert_eq!(a2, b2);
+        })
+    }
 
     fn limb51() -> impl Strategy<Value = u64> {
         0u64..(1u64 << 51)

From 15c69f70868d3afe39631c25f33d0f0864016d7e Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Mon, 2 Feb 2026 19:14:22 +0800
Subject: [PATCH 11/21] rne/mono: sqr diagonal and off-diagonal terms

---
 skyscraper/bn254-multiplier/src/rne/single.rs | 106 +++++++++++-------
 1 file changed, 64 insertions(+), 42 deletions(-)

diff --git a/skyscraper/bn254-multiplier/src/rne/single.rs b/skyscraper/bn254-multiplier/src/rne/single.rs
index 0e77b6e4a..446f312eb 100644
--- a/skyscraper/bn254-multiplier/src/rne/single.rs
+++ b/skyscraper/bn254-multiplier/src/rne/single.rs
@@ -10,7 +10,10 @@ use {
     },
     core::simd::{num::SimdFloat, Simd},
     seq_macro::seq,
-    std::simd::{num::SimdUint, simd_swizzle},
+    std::{
+        ops::BitAnd,
+        simd::{num::SimdUint, simd_swizzle},
+    },
 };
 
 /// Move redundant carries from lower limbs to the higher limbs such that all
@@ -220,34 +223,44 @@ pub fn simd_sqr(v0_a: [u64; 4]) -> [u64; 4] {
 
     let mut ts = [Simd::splat(0); 10];
 
-    // Offset multiplication to have less intermediate data
-    seq!(i in 0..5{
+    for i in 0..5 {
         let ai: Simd<f64, 2> = i2f(Simd::splat(v0_a[i]));
-        let b01: Simd<f64, 2> = i2f(Simd::from_array([v0_b[0], v0_b[1]]));
-        let p_hi = fma(ai, b01, Simd::splat(C1));
-        let p_lo = fma(ai, b01, Simd::splat(C2) - p_hi);
-        ts[i+1] += p_hi.to_bits().cast();
-        ts[i+0] += p_lo.to_bits().cast();
-
-        let b23: Simd<f64, 2> = i2f(Simd::from_array([v0_b[2], v0_b[3]]));
-        let p_hi = fma(ai, b23, Simd::splat(C1));
-        let p_lo = fma(ai, b23, Simd::splat(C2) - p_hi);
-        ts[i+3] += p_hi.to_bits().cast();
-        ts[i+2] += p_lo.to_bits().cast();
+        for j in ((i + 1)..5).step_by(2) {
+            let l = v0_b[j];
+            let r = if j + 1 < 5 { v0_b[j + 1] } else { 0 };
+            let b01: Simd<f64, 2> = i2f(Simd::from_array([l, r]));
+            let p_hi = fma(ai, b01, Simd::splat(C1));
+            let p_lo = fma(ai, b01, Simd::splat(C2) - p_hi);
+            ts[i + j + 1] += p_hi.to_bits().cast();
+            ts[i + j] += p_lo.to_bits().cast();
+        }
+    }
 
-        let b4 = Simd::from_array([i2f_scalar(v0_b[4]),0.]);
-        let p_hi = fma(ai, b4, Simd::splat(C1));
-        let p_lo = fma(ai, b4, Simd::splat(C2) - p_hi);
-        ts[i + 5] += p_hi.to_bits().cast();
-        ts[i + 4] += p_lo.to_bits().cast();
+    for i in (0..4).step_by(2) {
+        let ai: Simd<f64, 2> = i2f(Simd::from_array([v0_a[i], v0_a[i + 1]]));
+        let p_hi = fma(ai, ai, Simd::splat(C1));
+        let p_lo = fma(ai, ai, Simd::splat(C2) - p_hi);
+        let l_mask = Simd::from_array([0xffffffffffffffff_u64, 0]);
+        let r_mask = Simd::from_array([0, 0xffffffffffffffff_u64]);
+        ts[2 * (i + 1)] += p_hi.to_bits().bitand(r_mask).cast();
+        ts[2 * (i + 1) - 1] += p_lo.to_bits().bitand(r_mask).cast();
+
+        ts[2 * i + 1] += p_hi.to_bits().bitand(l_mask).cast();
+        ts[2 * i] += p_lo.to_bits().bitand(l_mask).cast();
+    }
 
-    });
+    let ai: Simd<f64, 2> = i2f(Simd::from_array([v0_a[4], 0]));
+    let p_hi = fma(ai, ai, Simd::splat(C1));
+    let p_lo = fma(ai, ai, Simd::splat(C2) - p_hi);
+    let l_mask = Simd::from_array([0xffffffffffffffff_u64, 0]);
+    ts[2 * 4 + 1] += p_hi.to_bits().bitand(l_mask).cast();
+    ts[2 * 4] += p_lo.to_bits().bitand(l_mask).cast();
 
-    ts[0] += Simd::from_array([make_initial(1, 0), make_initial(2, 1)]);
-    ts[2] += Simd::from_array([make_initial(3, 2), make_initial(4, 3)]);
-    ts[4] += Simd::from_array([make_initial(10, 4), make_initial(10, 10)]);
-    ts[6] += Simd::from_array([make_initial(9, 10), make_initial(8, 9)]);
-    ts[8] += Simd::from_array([make_initial(7, 8), make_initial(6, 7)]);
+    ts[0] += Simd::from_array([make_initial(1, 0), make_initial(1, 1)]);
+    ts[2] += Simd::from_array([make_initial(2, 1), make_initial(2, 2)]);
+    ts[4] += Simd::from_array([make_initial(3, 2), make_initial(2, 3)]);
+    ts[6] += Simd::from_array([make_initial(3, 2), make_initial(1, 3)]);
+    ts[8] += Simd::from_array([make_initial(2, 1), make_initial(0, 2)]);
 
     let mut t: [i64; 4] = [0; 4];
 
@@ -259,6 +272,10 @@ pub fn simd_sqr(v0_a: [u64; 4]) -> [u64; 4] {
         t[s + 1] = ts[s][1];
     });
 
+    for (i, k) in t.iter().enumerate() {
+        println!("t[{i}]: {k:x}")
+    }
+
     // sign extend redundant carries
     t[1] += t[0] >> 51;
     t[2] += t[1] >> 51;
@@ -274,35 +291,40 @@ pub fn simd_sqr(v0_a: [u64; 4]) -> [u64; 4] {
 
     let mut ss = [ts[4], ts[5], ts[6], ts[7], ts[8], ts[9]];
 
-    seq!( i in 0..6 {
-        ss[i] += r0[i] + r1[i] + r2[i] + r3[i];
-    });
+    // seq!( i in 0..6 {
+    //     ss[i] += r0[i] + r1[i] + r2[i] + r3[i];
+    // });
+
+    println!("ss[0][0]: {:x}", ss[0][0]);
 
     let m = (ss[0][0] as u64).wrapping_mul(U51_NP0) & MASK51;
     let mp = smult_noinit(m, U51_P);
 
-    seq!( i in 0..6 {
-        ss[i] += mp[i];
-    });
+    // seq!( i in 0..6 {
+    //     ss[i] += mp[i];
+    // });
 
     // Get rid of redundant SIMD form
-    seq!( i in 0..2 {
-        let s = i * 2;
-        ss[s] += simd_swizzle!(Simd::splat(0), ss[s + 1], [0, 2]);
-        ss[s + 2] += simd_swizzle!(Simd::splat(0), ss[s + 1], [3, 0]);
-    });
-    ss[5 - 1] += simd_swizzle!(Simd::splat(0), ss[5], [0, 2]);
+    ss[0] += simd_swizzle!(Simd::splat(0), ss[1], [0, 2]);
+    ss[2] += simd_swizzle!(Simd::splat(0), ss[1], [3, 0]);
+    ss[2] += simd_swizzle!(Simd::splat(0), ss[3], [0, 2]);
+    ss[4] += simd_swizzle!(Simd::splat(0), ss[3], [3, 0]);
+    ss[4] += simd_swizzle!(Simd::splat(0), ss[5], [0, 2]);
 
     // Move to redundant scalar form
     let mut s: [i64; 6] = [0; 6];
-    seq!(i in 0..3 {
-        let k = i * 2;
-        s[k] = ss[k][0];
-        s[k + 1] = ss[k][1];
-    });
+    s[0] = ss[0][0];
+    s[0 + 1] = ss[0][1];
+    s[2] = ss[2][0];
+    s[2 + 1] = ss[2][1];
+    s[4] = ss[4][0];
+    s[4 + 1] = ss[4][1];
 
     s[1] += s[0] >> 51;
     let s = [s[1], s[2], s[3], s[4], s[5]];
+    for (i, k) in s.iter().enumerate() {
+        println!("s[{i}]: {k:x}")
+    }
 
     // 1 bit reduction to go from R^-255 to R^-256. reduce_ct does the preparation
     // and the final shift is done as part of the conversion back to u256

From b6b32b20cf4c1b259129572e793c2afa7bbb5599 Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Mon, 2 Feb 2026 19:14:25 +0800
Subject: [PATCH 12/21] examples: add count.rs for debugging

---
 skyscraper/bn254-multiplier/examples/count.rs | 102 ++++++++++++++++++
 1 file changed, 102 insertions(+)
 create mode 100644 skyscraper/bn254-multiplier/examples/count.rs

diff --git a/skyscraper/bn254-multiplier/examples/count.rs b/skyscraper/bn254-multiplier/examples/count.rs
new file mode 100644
index 000000000..b204c8ce5
--- /dev/null
+++ b/skyscraper/bn254-multiplier/examples/count.rs
@@ -0,0 +1,102 @@
+use bn254_multiplier::rne::{
+    make_initial,
+    single::{simd_mul, simd_sqr},
+};
+
+const LO: usize = 1;
+const HI: usize = 1 << 16;
+
+fn main() {
+    let t = diagonal();
+    for (i, x) in t.iter().enumerate() {
+        println!("t[{i}]: lo: {}\t hi: {}", x & (HI - 1), x >> 16);
+    }
+
+    let a = [0, 0, 0, 0];
+    let res = simd_sqr(a);
+    for (i, k) in res.iter().enumerate() {
+        println!("res[{i}]: {:x}", k)
+    }
+
+    println!("\nFULL");
+
+    let t = full();
+    for (i, x) in t.iter().enumerate() {
+        let lo = x & (HI - 1);
+        let hi = x >> 16;
+        println!(
+            "t[{i}]: lo: {}\t hi: {}\t init: {:x}",
+            lo,
+            hi,
+            make_initial(lo as u64, hi as u64)
+        );
+    }
+
+    let a = [0, 0, 0, 0];
+    let res = simd_mul(a, a);
+    for (i, k) in res.iter().enumerate() {
+        println!("res[{i}]: {:x}", k)
+    }
+}
+
+fn diagonal() -> [usize; 12] {
+    let mut t = [0; 12];
+    for i in 0..5 {
+        for j in ((i + 1)..5).step_by(2) {
+            println!("i: {i} j: {} {}", j, j + 1);
+            t[i + j + 2] += HI;
+            t[i + j + 1] += LO + HI;
+            t[i + j] += LO;
+        }
+        println!();
+    }
+
+    // scalar doubling
+    // needs to chop off the 1 in 01 and the 8 in 89 and feed it back into 12 and 78
+    // respectively.
+    // for i in 1..=8 {
+    //     t[i] += t[i];
+    // }
+
+    for i in (0..4).step_by(2) {
+        t[2 * (i + 1) + 1] += HI;
+        t[2 * (i + 1)] += LO;
+        t[2 * i + 1] += HI;
+        t[2 * i] += LO;
+    }
+    t[2 * 4 + 1] += HI;
+    t[2 * 4] += LO;
+
+    // let i = 4;
+    // for _k in 0..5 {
+    //     for j in 0..3 {
+    //         t[i + 2 * j + 2] += HI;
+    //         t[i + 2 * j + 1] += LO + HI;
+    //         t[i + 2 * j] += LO;
+    //     }
+    // }
+
+    t
+}
+
+fn full() -> [usize; 11] {
+    let mut t = [0; 11];
+    for i in 0..5 {
+        for j in 0..3 {
+            t[i + 2 * j + 2] += HI;
+            t[i + 2 * j + 1] += LO + HI;
+            t[i + 2 * j] += LO;
+        }
+    }
+
+    let i = 4;
+    for _k in 0..5 {
+        for j in 0..3 {
+            t[i + 2 * j + 2] += HI;
+            t[i + 2 * j + 1] += LO + HI;
+            t[i + 2 * j] += LO;
+        }
+    }
+
+    t
+}

From a5820434557a4e2f98ef0d5e5ced915c9aba78d0 Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Mon, 2 Feb 2026 19:14:28 +0800
Subject: [PATCH 13/21] rne/mono: add sqr reduction

---
 skyscraper/bn254-multiplier/examples/count.rs | 16 ++++++++--------
 skyscraper/bn254-multiplier/src/rne/single.rs | 18 +++++++++---------
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/skyscraper/bn254-multiplier/examples/count.rs b/skyscraper/bn254-multiplier/examples/count.rs
index b204c8ce5..5eea1af65 100644
--- a/skyscraper/bn254-multiplier/examples/count.rs
+++ b/skyscraper/bn254-multiplier/examples/count.rs
@@ -67,14 +67,14 @@ fn diagonal() -> [usize; 12] {
     t[2 * 4 + 1] += HI;
     t[2 * 4] += LO;
 
-    // let i = 4;
-    // for _k in 0..5 {
-    //     for j in 0..3 {
-    //         t[i + 2 * j + 2] += HI;
-    //         t[i + 2 * j + 1] += LO + HI;
-    //         t[i + 2 * j] += LO;
-    //     }
-    // }
+    let i = 4;
+    for _k in 0..5 {
+        for j in 0..3 {
+            t[i + 2 * j + 2] += HI;
+            t[i + 2 * j + 1] += LO + HI;
+            t[i + 2 * j] += LO;
+        }
+    }
 
     t
 }
diff --git a/skyscraper/bn254-multiplier/src/rne/single.rs b/skyscraper/bn254-multiplier/src/rne/single.rs
index 446f312eb..c945f5fb8 100644
--- a/skyscraper/bn254-multiplier/src/rne/single.rs
+++ b/skyscraper/bn254-multiplier/src/rne/single.rs
@@ -258,9 +258,9 @@ pub fn simd_sqr(v0_a: [u64; 4]) -> [u64; 4] {
 
     ts[0] += Simd::from_array([make_initial(1, 0), make_initial(1, 1)]);
     ts[2] += Simd::from_array([make_initial(2, 1), make_initial(2, 2)]);
-    ts[4] += Simd::from_array([make_initial(3, 2), make_initial(2, 3)]);
-    ts[6] += Simd::from_array([make_initial(3, 2), make_initial(1, 3)]);
-    ts[8] += Simd::from_array([make_initial(2, 1), make_initial(0, 2)]);
+    ts[4] += Simd::from_array([make_initial(8, 2), make_initial(7, 8)]);
+    ts[6] += Simd::from_array([make_initial(8, 7), make_initial(6, 8)]);
+    ts[8] += Simd::from_array([make_initial(7, 6), make_initial(5, 7)]);
 
     let mut t: [i64; 4] = [0; 4];
 
@@ -291,18 +291,18 @@ pub fn simd_sqr(v0_a: [u64; 4]) -> [u64; 4] {
 
     let mut ss = [ts[4], ts[5], ts[6], ts[7], ts[8], ts[9]];
 
-    // seq!( i in 0..6 {
-    //     ss[i] += r0[i] + r1[i] + r2[i] + r3[i];
-    // });
+    seq!( i in 0..6 {
+        ss[i] += r0[i] + r1[i] + r2[i] + r3[i];
+    });
 
     println!("ss[0][0]: {:x}", ss[0][0]);
 
     let m = (ss[0][0] as u64).wrapping_mul(U51_NP0) & MASK51;
     let mp = smult_noinit(m, U51_P);
 
-    // seq!( i in 0..6 {
-    //     ss[i] += mp[i];
-    // });
+    seq!( i in 0..6 {
+        ss[i] += mp[i];
+    });
 
     // Get rid of redundant SIMD form
     ss[0] += simd_swizzle!(Simd::splat(0), ss[1], [0, 2]);

From 74bc28a71b95248c0f64b849454cf3a80d788bc9 Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Mon, 2 Feb 2026 19:14:31 +0800
Subject: [PATCH 14/21] rne/mono: working sqr implementation

---
 skyscraper/bn254-multiplier/examples/count.rs |  6 +++---
 skyscraper/bn254-multiplier/src/rne/single.rs | 21 ++++++++++++++-----
 2 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/skyscraper/bn254-multiplier/examples/count.rs b/skyscraper/bn254-multiplier/examples/count.rs
index 5eea1af65..5d908117b 100644
--- a/skyscraper/bn254-multiplier/examples/count.rs
+++ b/skyscraper/bn254-multiplier/examples/count.rs
@@ -54,9 +54,9 @@ fn diagonal() -> [usize; 12] {
     // scalar doubling
     // needs to chop off the 1 in 01 and the 8 in 89 and feed it back into 12 and 78
     // respectively.
-    // for i in 1..=8 {
-    //     t[i] += t[i];
-    // }
+    for i in 1..=8 {
+        t[i] += t[i];
+    }
 
     for i in (0..4).step_by(2) {
         t[2 * (i + 1) + 1] += HI;
diff --git a/skyscraper/bn254-multiplier/src/rne/single.rs b/skyscraper/bn254-multiplier/src/rne/single.rs
index c945f5fb8..f027f6b68 100644
--- a/skyscraper/bn254-multiplier/src/rne/single.rs
+++ b/skyscraper/bn254-multiplier/src/rne/single.rs
@@ -236,6 +236,17 @@ pub fn simd_sqr(v0_a: [u64; 4]) -> [u64; 4] {
         }
     }
 
+    ts[1] += simd_swizzle!(Simd::splat(0), ts[0], [3, 0]);
+    ts[0] = ts[0].bitand(Simd::from_array([-1, 0]));
+
+    ts[9] += simd_swizzle!(Simd::splat(0), ts[8], [3, 0]);
+    ts[8] = ts[8].bitand(Simd::from_array([-1, 0]));
+
+    for i in 1..=8 {
+        ts[i] += ts[i]
+    }
+
+    // Diagonal
     for i in (0..4).step_by(2) {
         let ai: Simd<f64, 2> = i2f(Simd::from_array([v0_a[i], v0_a[i + 1]]));
         let p_hi = fma(ai, ai, Simd::splat(C1));
@@ -256,11 +267,11 @@ pub fn simd_sqr(v0_a: [u64; 4]) -> [u64; 4] {
     ts[2 * 4 + 1] += p_hi.to_bits().bitand(l_mask).cast();
     ts[2 * 4] += p_lo.to_bits().bitand(l_mask).cast();
 
-    ts[0] += Simd::from_array([make_initial(1, 0), make_initial(1, 1)]);
-    ts[2] += Simd::from_array([make_initial(2, 1), make_initial(2, 2)]);
-    ts[4] += Simd::from_array([make_initial(8, 2), make_initial(7, 8)]);
-    ts[6] += Simd::from_array([make_initial(8, 7), make_initial(6, 8)]);
-    ts[8] += Simd::from_array([make_initial(7, 6), make_initial(5, 7)]);
+    ts[0] += Simd::from_array([make_initial(1, 0), make_initial(2, 1)]);
+    ts[2] += Simd::from_array([make_initial(3, 2), make_initial(4, 3)]);
+    ts[4] += Simd::from_array([make_initial(10, 4), make_initial(9, 10)]);
+    ts[6] += Simd::from_array([make_initial(10, 9), make_initial(7, 10)]);
+    ts[8] += Simd::from_array([make_initial(8, 7), make_initial(5, 7)]);
 
     let mut t: [i64; 4] = [0; 4];
 

From 3c08555e19101bf7bc8cbfe516ed7905cde0fbb9 Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Mon, 2 Feb 2026 19:14:34 +0800
Subject: [PATCH 15/21] rne/mono: use seq_macro for loop unrolling

---
 skyscraper/bn254-multiplier/src/rne/single.rs | 34 +++++++------------
 1 file changed, 13 insertions(+), 21 deletions(-)

diff --git a/skyscraper/bn254-multiplier/src/rne/single.rs b/skyscraper/bn254-multiplier/src/rne/single.rs
index f027f6b68..6b8b6071e 100644
--- a/skyscraper/bn254-multiplier/src/rne/single.rs
+++ b/skyscraper/bn254-multiplier/src/rne/single.rs
@@ -223,7 +223,7 @@ pub fn simd_sqr(v0_a: [u64; 4]) -> [u64; 4] {
 
     let mut ts = [Simd::splat(0); 10];
 
-    for i in 0..5 {
+    seq!( i in 0..5 {
         let ai: Simd<f64, 2> = i2f(Simd::splat(v0_a[i]));
         for j in ((i + 1)..5).step_by(2) {
             let l = v0_b[j];
@@ -234,7 +234,7 @@ pub fn simd_sqr(v0_a: [u64; 4]) -> [u64; 4] {
             ts[i + j + 1] += p_hi.to_bits().cast();
             ts[i + j] += p_lo.to_bits().cast();
         }
-    }
+    });
 
     ts[1] += simd_swizzle!(Simd::splat(0), ts[0], [3, 0]);
     ts[0] = ts[0].bitand(Simd::from_array([-1, 0]));
@@ -242,23 +242,24 @@ pub fn simd_sqr(v0_a: [u64; 4]) -> [u64; 4] {
     ts[9] += simd_swizzle!(Simd::splat(0), ts[8], [3, 0]);
     ts[8] = ts[8].bitand(Simd::from_array([-1, 0]));
 
-    for i in 1..=8 {
-        ts[i] += ts[i]
-    }
+    seq!( i in 1..=8 {
+        ts[i] += ts[i];
+    });
 
     // Diagonal
-    for i in (0..4).step_by(2) {
-        let ai: Simd<f64, 2> = i2f(Simd::from_array([v0_a[i], v0_a[i + 1]]));
+    seq!( i in 0..2 {
+        let k = i * 2;
+        let ai: Simd<f64, 2> = i2f(Simd::from_array([v0_a[k], v0_a[k + 1]]));
         let p_hi = fma(ai, ai, Simd::splat(C1));
         let p_lo = fma(ai, ai, Simd::splat(C2) - p_hi);
         let l_mask = Simd::from_array([0xffffffffffffffff_u64, 0]);
         let r_mask = Simd::from_array([0, 0xffffffffffffffff_u64]);
-        ts[2 * (i + 1)] += p_hi.to_bits().bitand(r_mask).cast();
-        ts[2 * (i + 1) - 1] += p_lo.to_bits().bitand(r_mask).cast();
+        ts[2 * (k + 1)] += p_hi.to_bits().bitand(r_mask).cast();
+        ts[2 * (k + 1) - 1] += p_lo.to_bits().bitand(r_mask).cast();
 
-        ts[2 * i + 1] += p_hi.to_bits().bitand(l_mask).cast();
-        ts[2 * i] += p_lo.to_bits().bitand(l_mask).cast();
-    }
+        ts[2 * k + 1] += p_hi.to_bits().bitand(l_mask).cast();
+        ts[2 * k] += p_lo.to_bits().bitand(l_mask).cast();
+    });
 
     let ai: Simd<f64, 2> = i2f(Simd::from_array([v0_a[4], 0]));
     let p_hi = fma(ai, ai, Simd::splat(C1));
@@ -283,10 +284,6 @@ pub fn simd_sqr(v0_a: [u64; 4]) -> [u64; 4] {
         t[s + 1] = ts[s][1];
     });
 
-    for (i, k) in t.iter().enumerate() {
-        println!("t[{i}]: {k:x}")
-    }
-
     // sign extend redundant carries
     t[1] += t[0] >> 51;
     t[2] += t[1] >> 51;
@@ -306,8 +303,6 @@ pub fn simd_sqr(v0_a: [u64; 4]) -> [u64; 4] {
         ss[i] += r0[i] + r1[i] + r2[i] + r3[i];
     });
 
-    println!("ss[0][0]: {:x}", ss[0][0]);
-
     let m = (ss[0][0] as u64).wrapping_mul(U51_NP0) & MASK51;
     let mp = smult_noinit(m, U51_P);
 
@@ -333,9 +328,6 @@ pub fn simd_sqr(v0_a: [u64; 4]) -> [u64; 4] {
 
     s[1] += s[0] >> 51;
     let s = [s[1], s[2], s[3], s[4], s[5]];
-    for (i, k) in s.iter().enumerate() {
-        println!("s[{i}]: {k:x}")
-    }
 
     // 1 bit reduction to go from R^-255 to R^-256. reduce_ct does the preparation
     // and the final shift is done as part of the conversion back to u256

From 4fa46ae352f6874e6f66a305e80e5bf6e3baf4fb Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Mon, 2 Feb 2026 19:14:37 +0800
Subject: [PATCH 16/21] rne/mono: optimize sqr doubling step

---
 skyscraper/bn254-multiplier/benches/bench.rs  | 8 ++++++++
 skyscraper/bn254-multiplier/examples/count.rs | 9 +++++----
 skyscraper/bn254-multiplier/src/rne/single.rs | 8 +-------
 3 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/skyscraper/bn254-multiplier/benches/bench.rs b/skyscraper/bn254-multiplier/benches/bench.rs
index 75219c12e..b13581da6 100644
--- a/skyscraper/bn254-multiplier/benches/bench.rs
+++ b/skyscraper/bn254-multiplier/benches/bench.rs
@@ -157,6 +157,14 @@ mod sqr {
             .bench_local_values(|a| rne::single::simd_sqr(a));
     }
 
+    #[divan::bench]
+    fn single_mul_sqr_b51(bencher: Bencher) {
+        bencher
+            //.counter(ItemsCount::new(1usize))
+            .with_inputs(|| rng().random())
+            .bench_local_values(|a| rne::single::simd_mul(a, a));
+    }
+
     #[divan::bench]
     fn ark_ff(bencher: Bencher) {
         use {ark_bn254::Fr, ark_ff::BigInt};
diff --git a/skyscraper/bn254-multiplier/examples/count.rs b/skyscraper/bn254-multiplier/examples/count.rs
index 5d908117b..f447288f5 100644
--- a/skyscraper/bn254-multiplier/examples/count.rs
+++ b/skyscraper/bn254-multiplier/examples/count.rs
@@ -51,10 +51,11 @@ fn diagonal() -> [usize; 12] {
         println!();
     }
 
-    // scalar doubling
-    // needs to chop off the 1 in 01 and the 8 in 89 and feed it back into 12 and 78
-    // respectively.
-    for i in 1..=8 {
+    // Since there is no data of the diagionals yet it is safe to double them. This
+    // means that t[8][1] = t[9] needs to be included.
+    // t[0][1] is not touched by the diagonalisation so no need to include t[0]
+
+    for i in 1..=9 {
         t[i] += t[i];
     }
 
diff --git a/skyscraper/bn254-multiplier/src/rne/single.rs b/skyscraper/bn254-multiplier/src/rne/single.rs
index 6b8b6071e..d63ca3a06 100644
--- a/skyscraper/bn254-multiplier/src/rne/single.rs
+++ b/skyscraper/bn254-multiplier/src/rne/single.rs
@@ -236,12 +236,6 @@ pub fn simd_sqr(v0_a: [u64; 4]) -> [u64; 4] {
         }
     });
 
-    ts[1] += simd_swizzle!(Simd::splat(0), ts[0], [3, 0]);
-    ts[0] = ts[0].bitand(Simd::from_array([-1, 0]));
-
-    ts[9] += simd_swizzle!(Simd::splat(0), ts[8], [3, 0]);
-    ts[8] = ts[8].bitand(Simd::from_array([-1, 0]));
-
     seq!( i in 1..=8 {
         ts[i] += ts[i];
     });
@@ -272,7 +266,7 @@ pub fn simd_sqr(v0_a: [u64; 4]) -> [u64; 4] {
     ts[2] += Simd::from_array([make_initial(3, 2), make_initial(4, 3)]);
     ts[4] += Simd::from_array([make_initial(10, 4), make_initial(9, 10)]);
     ts[6] += Simd::from_array([make_initial(10, 9), make_initial(7, 10)]);
-    ts[8] += Simd::from_array([make_initial(8, 7), make_initial(5, 7)]);
+    ts[8] += Simd::from_array([make_initial(8, 7), make_initial(5, 8)]);
 
     let mut t: [i64; 4] = [0; 4];
 

From 0117fb008fceee151726c9f14ea895aeba620287 Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Mon, 2 Feb 2026 19:14:40 +0800
Subject: [PATCH 17/21] rne/mono: use mul for sqr and let compiler optimise

---
 skyscraper/bn254-multiplier/src/rne/single.rs | 112 +-----------------
 1 file changed, 1 insertion(+), 111 deletions(-)

diff --git a/skyscraper/bn254-multiplier/src/rne/single.rs b/skyscraper/bn254-multiplier/src/rne/single.rs
index d63ca3a06..22a027544 100644
--- a/skyscraper/bn254-multiplier/src/rne/single.rs
+++ b/skyscraper/bn254-multiplier/src/rne/single.rs
@@ -218,117 +218,7 @@ pub fn u255_to_u256_shr_1(limbs: [u64; 5]) -> [u64; 4] {
 /// input must fit in 2^255-1; no runtime checking
 #[inline(always)]
 pub fn simd_sqr(v0_a: [u64; 4]) -> [u64; 4] {
-    let v0_a = u256_to_u255(v0_a);
-    let v0_b = v0_a;
-
-    let mut ts = [Simd::splat(0); 10];
-
-    seq!( i in 0..5 {
-        let ai: Simd<f64, 2> = i2f(Simd::splat(v0_a[i]));
-        for j in ((i + 1)..5).step_by(2) {
-            let l = v0_b[j];
-            let r = if j + 1 < 5 { v0_b[j + 1] } else { 0 };
-            let b01: Simd<f64, 2> = i2f(Simd::from_array([l, r]));
-            let p_hi = fma(ai, b01, Simd::splat(C1));
-            let p_lo = fma(ai, b01, Simd::splat(C2) - p_hi);
-            ts[i + j + 1] += p_hi.to_bits().cast();
-            ts[i + j] += p_lo.to_bits().cast();
-        }
-    });
-
-    seq!( i in 1..=8 {
-        ts[i] += ts[i];
-    });
-
-    // Diagonal
-    seq!( i in 0..2 {
-        let k = i * 2;
-        let ai: Simd<f64, 2> = i2f(Simd::from_array([v0_a[k], v0_a[k + 1]]));
-        let p_hi = fma(ai, ai, Simd::splat(C1));
-        let p_lo = fma(ai, ai, Simd::splat(C2) - p_hi);
-        let l_mask = Simd::from_array([0xffffffffffffffff_u64, 0]);
-        let r_mask = Simd::from_array([0, 0xffffffffffffffff_u64]);
-        ts[2 * (k + 1)] += p_hi.to_bits().bitand(r_mask).cast();
-        ts[2 * (k + 1) - 1] += p_lo.to_bits().bitand(r_mask).cast();
-
-        ts[2 * k + 1] += p_hi.to_bits().bitand(l_mask).cast();
-        ts[2 * k] += p_lo.to_bits().bitand(l_mask).cast();
-    });
-
-    let ai: Simd<f64, 2> = i2f(Simd::from_array([v0_a[4], 0]));
-    let p_hi = fma(ai, ai, Simd::splat(C1));
-    let p_lo = fma(ai, ai, Simd::splat(C2) - p_hi);
-    let l_mask = Simd::from_array([0xffffffffffffffff_u64, 0]);
-    ts[2 * 4 + 1] += p_hi.to_bits().bitand(l_mask).cast();
-    ts[2 * 4] += p_lo.to_bits().bitand(l_mask).cast();
-
-    ts[0] += Simd::from_array([make_initial(1, 0), make_initial(2, 1)]);
-    ts[2] += Simd::from_array([make_initial(3, 2), make_initial(4, 3)]);
-    ts[4] += Simd::from_array([make_initial(10, 4), make_initial(9, 10)]);
-    ts[6] += Simd::from_array([make_initial(10, 9), make_initial(7, 10)]);
-    ts[8] += Simd::from_array([make_initial(8, 7), make_initial(5, 8)]);
-
-    let mut t: [i64; 4] = [0; 4];
-
-    seq!( i in 0..2 {
-        let s = i * 2;
-        ts[s] += simd_swizzle!(Simd::splat(0), ts[s + 1], [0, 2]);
-        ts[s + 2] += simd_swizzle!(Simd::splat(0), ts[s + 1], [3, 0]);
-        t[s] = ts[s][0];
-        t[s + 1] = ts[s][1];
-    });
-
-    // sign extend redundant carries
-    t[1] += t[0] >> 51;
-    t[2] += t[1] >> 51;
-    t[3] += t[2] >> 51;
-
-    // Lift carry into SIMD to prevent extraction
-    ts[4] += Simd::from_array([t[3] >> 51, 0]);
-
-    let r0 = smult_noinit(t[0] as u64 & MASK51, RHO_4);
-    let r1 = smult_noinit(t[1] as u64 & MASK51, RHO_3);
-    let r2 = smult_noinit(t[2] as u64 & MASK51, RHO_2);
-    let r3 = smult_noinit(t[3] as u64 & MASK51, RHO_1);
-
-    let mut ss = [ts[4], ts[5], ts[6], ts[7], ts[8], ts[9]];
-
-    seq!( i in 0..6 {
-        ss[i] += r0[i] + r1[i] + r2[i] + r3[i];
-    });
-
-    let m = (ss[0][0] as u64).wrapping_mul(U51_NP0) & MASK51;
-    let mp = smult_noinit(m, U51_P);
-
-    seq!( i in 0..6 {
-        ss[i] += mp[i];
-    });
-
-    // Get rid of redundant SIMD form
-    ss[0] += simd_swizzle!(Simd::splat(0), ss[1], [0, 2]);
-    ss[2] += simd_swizzle!(Simd::splat(0), ss[1], [3, 0]);
-    ss[2] += simd_swizzle!(Simd::splat(0), ss[3], [0, 2]);
-    ss[4] += simd_swizzle!(Simd::splat(0), ss[3], [3, 0]);
-    ss[4] += simd_swizzle!(Simd::splat(0), ss[5], [0, 2]);
-
-    // Move to redundant scalar form
-    let mut s: [i64; 6] = [0; 6];
-    s[0] = ss[0][0];
-    s[0 + 1] = ss[0][1];
-    s[2] = ss[2][0];
-    s[2 + 1] = ss[2][1];
-    s[4] = ss[4][0];
-    s[4 + 1] = ss[4][1];
-
-    s[1] += s[0] >> 51;
-    let s = [s[1], s[2], s[3], s[4], s[5]];
-
-    // 1 bit reduction to go from R^-255 to R^-256. reduce_ct does the preparation
-    // and the final shift is done as part of the conversion back to u256
-    let reduced = reduce_ct(s);
-    let reduced = redundant_carry_scalar(reduced);
-    let u256_result = u255_to_u256_shr_1(reduced);
-    u256_result
+    simd_mul(v0_a, v0_a)
 }
 
 #[cfg(not(target_arch = "wasm32"))]

From e60d82672b41445abaedda8c5da498620c15b581 Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Mon, 2 Feb 2026 19:14:42 +0800
Subject: [PATCH 18/21] rne/mono: improve naming and add docs

---
 skyscraper/bn254-multiplier/benches/bench.rs  |  6 +-
 skyscraper/bn254-multiplier/examples/count.rs |  6 +-
 .../bn254-multiplier/src/rne/portable_simd.rs | 18 ++--
 skyscraper/bn254-multiplier/src/rne/single.rs | 89 +++++++++----------
 skyscraper/bn254-multiplier/src/test_utils.rs | 10 ++-
 5 files changed, 60 insertions(+), 69 deletions(-)

diff --git a/skyscraper/bn254-multiplier/benches/bench.rs b/skyscraper/bn254-multiplier/benches/bench.rs
index b13581da6..9ff580796 100644
--- a/skyscraper/bn254-multiplier/benches/bench.rs
+++ b/skyscraper/bn254-multiplier/benches/bench.rs
@@ -36,7 +36,7 @@ mod mul {
         bencher
             //.counter(ItemsCount::new(1usize))
             .with_inputs(|| rng().random())
-            .bench_local_values(|(a, b)| rne::single::simd_mul(a, b));
+            .bench_local_values(|(a, b)| rne::single::mul(a, b));
     }
 
     #[divan::bench]
@@ -154,7 +154,7 @@ mod sqr {
         bencher
             //.counter(ItemsCount::new(1usize))
             .with_inputs(|| rng().random())
-            .bench_local_values(|a| rne::single::simd_sqr(a));
+            .bench_local_values(|a| rne::single::sqr(a));
     }
 
     #[divan::bench]
@@ -162,7 +162,7 @@ mod sqr {
         bencher
             //.counter(ItemsCount::new(1usize))
             .with_inputs(|| rng().random())
-            .bench_local_values(|a| rne::single::simd_mul(a, a));
+            .bench_local_values(|a| rne::single::mul(a, a));
     }
 
     #[divan::bench]
diff --git a/skyscraper/bn254-multiplier/examples/count.rs b/skyscraper/bn254-multiplier/examples/count.rs
index f447288f5..f5fcafc35 100644
--- a/skyscraper/bn254-multiplier/examples/count.rs
+++ b/skyscraper/bn254-multiplier/examples/count.rs
@@ -1,6 +1,6 @@
 use bn254_multiplier::rne::{
     make_initial,
-    single::{simd_mul, simd_sqr},
+    single::{mul, sqr},
 };
 
 const LO: usize = 1;
@@ -13,7 +13,7 @@ fn main() {
     }
 
     let a = [0, 0, 0, 0];
-    let res = simd_sqr(a);
+    let res = sqr(a);
     for (i, k) in res.iter().enumerate() {
         println!("res[{i}]: {:x}", k)
     }
@@ -33,7 +33,7 @@ fn main() {
     }
 
     let a = [0, 0, 0, 0];
-    let res = simd_mul(a, a);
+    let res = mul(a, a);
     for (i, k) in res.iter().enumerate() {
         println!("res[{i}]: {:x}", k)
     }
diff --git a/skyscraper/bn254-multiplier/src/rne/portable_simd.rs b/skyscraper/bn254-multiplier/src/rne/portable_simd.rs
index cef071596..634cde91b 100644
--- a/skyscraper/bn254-multiplier/src/rne/portable_simd.rs
+++ b/skyscraper/bn254-multiplier/src/rne/portable_simd.rs
@@ -200,13 +200,13 @@ pub fn simd_mul(
 mod tests {
     use {
         super::*,
-        crate::{rne::simd_utils::u255_to_u256_simd, test_utils::ark_ff_reference},
+        crate::{
+            rne::simd_utils::u255_to_u256_simd,
+            test_utils::{ark_ff_reference, limbs5_51},
+        },
         ark_bn254::Fr,
         ark_ff::{BigInt, PrimeField},
-        proptest::{
-            prelude::{prop, Strategy},
-            prop_assert_eq, proptest,
-        },
+        proptest::{prop_assert_eq, proptest},
     };
 
     #[test]
@@ -248,12 +248,4 @@ mod tests {
                 prop_assert_eq!(b2, b2s);
         })
     }
-
-    fn limb51() -> impl Strategy<Value = u64> {
-        0u64..(1u64 << 51)
-    }
-
-    fn limbs5_51() -> impl Strategy<Value = [u64; 5]> {
-        prop::array::uniform5(limb51())
-    }
 }
diff --git a/skyscraper/bn254-multiplier/src/rne/single.rs b/skyscraper/bn254-multiplier/src/rne/single.rs
index 22a027544..f8591c192 100644
--- a/skyscraper/bn254-multiplier/src/rne/single.rs
+++ b/skyscraper/bn254-multiplier/src/rne/single.rs
@@ -1,7 +1,4 @@
 //! Portable SIMD Montgomery multiplication and squaring.
-//!
-//! Processes two independent field multiplications in parallel using 2-lane
-//! SIMD.
 
 use {
     crate::rne::{
@@ -10,17 +7,14 @@ use {
     },
     core::simd::{num::SimdFloat, Simd},
     seq_macro::seq,
-    std::{
-        ops::BitAnd,
-        simd::{num::SimdUint, simd_swizzle},
-    },
+    std::simd::{num::SimdUint, simd_swizzle},
 };
 
 /// Move redundant carries from lower limbs to the higher limbs such that all
 /// limbs except the last one is 51 bits. The most significant limb can be
 /// larger than 51 bits as the input can be bigger 2^255-1.
 #[inline(always)]
-fn redundant_carry_scalar<const N: usize>(t: [i64; N]) -> [u64; N] {
+fn redundant_carry<const N: usize>(t: [i64; N]) -> [u64; N] {
     let mut borrow = 0;
     let mut res = [0; N];
     for i in 0..t.len() - 1 {
@@ -89,7 +83,7 @@ pub fn smult_noinit(s: u64, v: [u64; 5]) -> [Simd<i64, 2>; 6] {
 
 #[inline(always)]
 pub fn reduce_ct(mut a: [i64; 5]) -> [i64; 5] {
-    // To reduce Check whether the least significant bit is set
+    // When input is odd add P to prepare for division
     let mask = -(a[0] & 1);
 
     seq!( i in 0..5 {
@@ -102,12 +96,12 @@ pub fn reduce_ct(mut a: [i64; 5]) -> [i64; 5] {
     a
 }
 
-/// Two parallel Montgomery multiplications: `(v0_a*v0_b, v1_a*v1_b)`.
+/// Montgomery multiplications: `(v0_a*v0_b, v1_a*v1_b)`.
 /// input must fit in 2^255-1; no runtime checking
 #[inline(always)]
-pub fn simd_mul(v0_a: [u64; 4], v0_b: [u64; 4]) -> [u64; 4] {
-    let v0_a = u256_to_u255(v0_a);
-    let v0_b = u256_to_u255(v0_b);
+pub fn mul(a: [u64; 4], b: [u64; 4]) -> [u64; 4] {
+    let a = u256_to_u255(a);
+    let b = u256_to_u255(b);
 
     let mut ts = [Simd::splat(0); 10];
     ts[0] = Simd::from_array([make_initial(1, 0), make_initial(2, 1)]);
@@ -116,22 +110,21 @@ pub fn simd_mul(v0_a: [u64; 4], v0_b: [u64; 4]) -> [u64; 4] {
     ts[6] = Simd::from_array([make_initial(9, 10), make_initial(8, 9)]);
     ts[8] = Simd::from_array([make_initial(7, 8), make_initial(6, 7)]);
 
-    // Offset multiplication to have less intermediate data
     seq!(i in 0..5{
-        let ai: Simd<f64, 2> = i2f(Simd::splat(v0_a[i]));
-        let b01: Simd<f64, 2> = i2f(Simd::from_array([v0_b[0], v0_b[1]]));
+        let ai: Simd<f64, 2> = i2f(Simd::splat(a[i]));
+        let b01: Simd<f64, 2> = i2f(Simd::from_array([b[0], b[1]]));
         let p_hi = fma(ai, b01, Simd::splat(C1));
         let p_lo = fma(ai, b01, Simd::splat(C2) - p_hi);
         ts[i+1] += p_hi.to_bits().cast();
         ts[i+0] += p_lo.to_bits().cast();
 
-        let b23: Simd<f64, 2> = i2f(Simd::from_array([v0_b[2], v0_b[3]]));
+        let b23: Simd<f64, 2> = i2f(Simd::from_array([b[2], b[3]]));
         let p_hi = fma(ai, b23, Simd::splat(C1));
         let p_lo = fma(ai, b23, Simd::splat(C2) - p_hi);
         ts[i+3] += p_hi.to_bits().cast();
         ts[i+2] += p_lo.to_bits().cast();
 
-        let b4 = Simd::from_array([i2f_scalar(v0_b[4]),0.]);
+        let b4 = Simd::from_array([i2f_scalar(b[4]),0.]);
         let p_hi = fma(ai, b4, Simd::splat(C1));
         let p_lo = fma(ai, b4, Simd::splat(C2) - p_hi);
         ts[i + 5] += p_hi.to_bits().cast();
@@ -141,12 +134,14 @@ pub fn simd_mul(v0_a: [u64; 4], v0_b: [u64; 4]) -> [u64; 4] {
 
     let mut t: [i64; 4] = [0; 4];
 
+    // Swizzle the words that are reduced. Delay the rest of the swizzling until
+    // after the reduction.
     seq!( i in 0..2 {
-        let s = i * 2;
-        ts[s] += simd_swizzle!(Simd::splat(0), ts[s + 1], [0, 2]);
-        ts[s + 2] += simd_swizzle!(Simd::splat(0), ts[s + 1], [3, 0]);
-        t[s] = ts[s][0];
-        t[s + 1] = ts[s][1];
+        let k = i * 2;
+        ts[k] += simd_swizzle!(Simd::splat(0), ts[k + 1], [0, 2]);
+        ts[k + 2] += simd_swizzle!(Simd::splat(0), ts[k + 1], [3, 0]);
+        t[k] = ts[k][0];
+        t[k + 1] = ts[k][1];
     });
 
     // sign extend redundant carries
@@ -197,7 +192,7 @@ pub fn simd_mul(v0_a: [u64; 4], v0_b: [u64; 4]) -> [u64; 4] {
     // 1 bit reduction to go from R^-255 to R^-256. reduce_ct does the preparation
     // and the final shift is done as part of the conversion back to u256
     let reduced = reduce_ct(s);
-    let reduced = redundant_carry_scalar(reduced);
+    let reduced = redundant_carry(reduced);
     let u256_result = u255_to_u256_shr_1(reduced);
     u256_result
 }
@@ -214,11 +209,12 @@ pub fn u255_to_u256_shr_1(limbs: [u64; 5]) -> [u64; 4] {
     ]
 }
 
-/// Two parallel Montgomery multiplications: `(v0_a*v0_b, v1_a*v1_b)`.
-/// input must fit in 2^255-1; no runtime checking
+/// Montgomery squaring: `(va*v0_b, v1_a*v1_b)`.
+/// input must fit in 2^255-1; no runtime checking. Output lives in Montgomery
+/// space R=256.
 #[inline(always)]
-pub fn simd_sqr(v0_a: [u64; 4]) -> [u64; 4] {
-    simd_mul(v0_a, v0_a)
+pub fn sqr(v0_a: [u64; 4]) -> [u64; 4] {
+    mul(v0_a, v0_a)
 }
 
 #[cfg(not(target_arch = "wasm32"))]
@@ -226,13 +222,10 @@ pub fn simd_sqr(v0_a: [u64; 4]) -> [u64; 4] {
 mod tests {
     use {
         super::*,
-        crate::{rne::simd_utils::u255_to_u256_simd, test_utils::ark_ff_reference},
+        crate::test_utils::{ark_ff_reference, limbs5_51},
         ark_bn254::Fr,
         ark_ff::{BigInt, PrimeField},
-        proptest::{
-            prelude::{prop, Strategy},
-            prop_assert_eq, proptest,
-        },
+        proptest::{prop_assert_eq, proptest},
     };
 
     #[test]
@@ -241,11 +234,9 @@ mod tests {
                 a in limbs5_51(),
                 b in limbs5_51(),
             )| {
-                let a: [Simd<u64,1>;_] = a.map(Simd::splat);
-                let b: [Simd<u64,1>;_] = b.map(Simd::splat);
-                let a = u255_to_u256_simd(a).map(|x|x[0]);
-                let b = u255_to_u256_simd(b).map(|x|x[0]);
-                let ab = simd_mul(a, b);
+                let a = u255_to_u256(a);
+                let b = u255_to_u256(b);
+                let ab = mul(a, b);
                 let ab_ref = ark_ff_reference(a, b);
                 let ab = Fr::new(BigInt(ab));
                 prop_assert_eq!(ab_ref, ab, "mismatch: l = {:X}, b = {:X}", ab_ref.into_bigint(), ab.into_bigint());
@@ -257,19 +248,19 @@ mod tests {
         proptest!(|(
                 a in limbs5_51(),
             )| {
-                let a: [Simd<u64,1>;_] = a.map(Simd::splat);
-                let a = u255_to_u256_simd(a).map(|x|x[0]);
-                let a2 = simd_mul(a, a);
-                let b2 = simd_sqr(a);
-                prop_assert_eq!(a2, b2);
+                let a = u255_to_u256(a);
+                prop_assert_eq!(mul(a,a), sqr(a));
         })
     }
 
-    fn limb51() -> impl Strategy<Value = u64> {
-        0u64..(1u64 << 51)
-    }
-
-    fn limbs5_51() -> impl Strategy<Value = [u64; 5]> {
-        prop::array::uniform5(limb51())
+    #[inline(always)]
+    pub fn u255_to_u256(limbs: [u64; 5]) -> [u64; 4] {
+        let [l0, l1, l2, l3, l4] = limbs;
+        [
+            l0 | (l1 << 51),
+            (l1 >> 13) | (l2 << 38),
+            (l2 >> 26) | (l3 << 25),
+            (l3 >> 39) | (l4 << 12),
+        ]
     }
 }
diff --git a/skyscraper/bn254-multiplier/src/test_utils.rs b/skyscraper/bn254-multiplier/src/test_utils.rs
index bfbdaab3a..6dceec140 100644
--- a/skyscraper/bn254-multiplier/src/test_utils.rs
+++ b/skyscraper/bn254-multiplier/src/test_utils.rs
@@ -6,7 +6,7 @@ use {
     ark_ff::{BigInt, Field},
     proptest::{
         collection,
-        prelude::{any, Strategy},
+        prelude::{any, prop, Strategy},
         proptest,
     },
 };
@@ -81,3 +81,11 @@ fn test_max_multiprecision_strategy() {
         );
     });
 }
+
+pub fn limb51() -> impl Strategy<Value = u64> {
+    0u64..(1u64 << 51)
+}
+
+pub fn limbs5_51() -> impl Strategy<Value = [u64; 5]> {
+    prop::array::uniform5(limb51())
+}

From 53ea1d123372889d2f1624e5f435ac17379b7df7 Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Mon, 2 Feb 2026 19:14:45 +0800
Subject: [PATCH 19/21] rne/mono: document mul algorithm

---
 skyscraper/bn254-multiplier/src/rne/single.rs | 100 +++++++++++++++---
 1 file changed, 86 insertions(+), 14 deletions(-)

diff --git a/skyscraper/bn254-multiplier/src/rne/single.rs b/skyscraper/bn254-multiplier/src/rne/single.rs
index f8591c192..d860656ab 100644
--- a/skyscraper/bn254-multiplier/src/rne/single.rs
+++ b/skyscraper/bn254-multiplier/src/rne/single.rs
@@ -1,4 +1,23 @@
-//! Portable SIMD Montgomery multiplication and squaring.
+//! Unbatched SIMD Montgomery multiplication and squaring for BN254.
+//!
+//! This module provides single-input Montgomery multiplication using relaxed
+//! SIMD FMA operations. Unlike [`portable_simd`](super::portable_simd) which
+//! batches two multiplications per call, this processes one at a time—faster
+//! on WASM when inputs aren't naturally paired.
+//!
+//! # Redundant Representations
+//!
+//! The code uses two distinct "redundant" representations:
+//!
+//! 1. **Redundant SIMD form**: FMA produces high/low product parts in separate
+//!    SIMD vectors with misaligned lanes. E.g., `ts[k] = [limb_k_lo, limb_{k+1}_lo]`
+//!    and `ts[k+1] = [limb_k_hi, limb_{k+1}_hi]`. Swizzle operations realign these
+//!    so adjacent limbs occupy consecutive positions.
+//!
+//! 2. **Redundant limb form**: Each 51-bit limb may temporarily exceed 51 bits.
+//!    The excess bits are carries (or borrows) that propagate to higher limbs.
+//!    - `u64` limbs: excess bits are positive carries
+//!    - `i64` limbs: excess bits may be negative, representing borrows from above
 
 use {
     crate::rne::{
@@ -10,9 +29,11 @@ use {
     std::simd::{num::SimdUint, simd_swizzle},
 };
 
-/// Move redundant carries from lower limbs to the higher limbs such that all
-/// limbs except the last one is 51 bits. The most significant limb can be
-/// larger than 51 bits as the input can be bigger 2^255-1.
+/// Propagate carries/borrows from redundant limb form to normalized form.
+///
+/// Input `i64` limbs may have excess bits (positive = carry, negative = borrow).
+/// Output `u64` limbs are normalized to exactly 51 bits, except the MSB which
+/// absorbs any remaining carry and may exceed 51 bits.
 #[inline(always)]
 fn redundant_carry<const N: usize>(t: [i64; N]) -> [u64; N] {
     let mut borrow = 0;
@@ -55,6 +76,11 @@ pub const fn i2f_scalar(a: u64) -> f64 {
     a - b
 }
 
+/// Multiply a 51-bit scalar `s` by a 5×51-bit vector `v`.
+///
+/// Returns 6 SIMD vectors in redundant form. "noinit" means no FMA anchor
+/// compensation is applied; the caller must initialize accumulators with
+/// `make_initial` biases.
 #[inline(always)]
 pub fn smult_noinit(s: u64, v: [u64; 5]) -> [Simd<i64, 2>; 6] {
     let mut t = [Simd::splat(0); 6];
@@ -81,9 +107,13 @@ pub fn smult_noinit(s: u64, v: [u64; 5]) -> [Simd<i64, 2>; 6] {
     t
 }
 
+/// Constant-time preparation for division by 2.
+///
+/// If the input is odd, adds P (which is also odd) to make it even.
+/// This ensures the subsequent right-shift is exact. "ct" = constant-time.
 #[inline(always)]
 pub fn reduce_ct(mut a: [i64; 5]) -> [i64; 5] {
-    // When input is odd add P to prepare for division
+    // When input is odd, add P to make it even
     let mask = -(a[0] & 1);
 
     seq!( i in 0..5 {
@@ -96,8 +126,42 @@ pub fn reduce_ct(mut a: [i64; 5]) -> [i64; 5] {
     a
 }
 
-/// Montgomery multiplications: `(v0_a*v0_b, v1_a*v1_b)`.
-/// input must fit in 2^255-1; no runtime checking
+/// Montgomery multiplication for BN254 scalar field.
+///
+/// Computes `a * b * R^{-256} mod P` where R = 2^256.
+///
+/// # Algorithm Overview
+///
+/// Uses floating-point FMA for fast 51×51-bit multiplications via the
+/// Renes-Costello-Batina technique. The algorithm proceeds in phases:
+///
+/// 1. **Limb conversion**: Convert 4×64-bit inputs to 5×51-bit representation.
+///
+/// 2. **Schoolbook multiplication**: Compute the 10-limb product `a × b` using
+///    FMA-based multiply-accumulate. Results are stored in "redundant SIMD
+///    form" where high/low parts of products occupy separate SIMD lanes.
+///
+/// 3. **Parallel reduction**: Instead of sequential CIOS reduction, multiply
+///    the lower 4 limbs by precomputed `RHO_i = R^i mod P` constants. This
+///    converts `t[i] * 2^{51*i}` into an equivalent value in the upper limbs,
+///    allowing all reductions to proceed in parallel.
+///
+/// 4. **Final Montgomery step**: Compute `m = (result[0] * -P^{-1}) mod 2^51`,
+///    then add `m * P` to cancel the lowest limb.
+///
+/// 5. **Bit adjustment**: The 5×51 = 255 bit representation requires a final
+///    division by 2 (after conditional addition of P for odd results) to
+///    produce the correct R^{-256} Montgomery form.
+///
+/// # Preconditions
+///
+/// - Both inputs must be < 2^255 (i.e., fit in 5×51-bit limbs)
+/// - Inputs should be in Montgomery form with R = 2^256
+///
+/// # Performance
+///
+/// Optimized for WASM with relaxed SIMD. Processes one multiplication at a time
+/// (vs. `simd_mul` which batches two).
 #[inline(always)]
 pub fn mul(a: [u64; 4], b: [u64; 4]) -> [u64; 4] {
     let a = u256_to_u255(a);
@@ -134,8 +198,11 @@ pub fn mul(a: [u64; 4], b: [u64; 4]) -> [u64; 4] {
 
     let mut t: [i64; 4] = [0; 4];
 
-    // Swizzle the words that are reduced. Delay the rest of the swizzling until
-    // after the reduction.
+    // Realign redundant SIMD form to scalar form for the lower 4 limbs.
+    // FMA produces: ts[k] = [limb_k_lo, limb_{k+1}_lo]
+    //               ts[k+1] = [limb_k_hi, limb_{k+1}_hi]
+    // But limb_k_hi belongs with limb_{k+1}_lo (adjacent in the product).
+    // Swizzle moves: ts[k+1][0] -> ts[k][0], ts[k+1][1] -> ts[k+2][0]
     seq!( i in 0..2 {
         let k = i * 2;
         ts[k] += simd_swizzle!(Simd::splat(0), ts[k + 1], [0, 2]);
@@ -144,7 +211,7 @@ pub fn mul(a: [u64; 4], b: [u64; 4]) -> [u64; 4] {
         t[k + 1] = ts[k][1];
     });
 
-    // sign extend redundant carries
+    // Propagate carries/borrows through redundant limb form (i64 allows negative excess)
     t[1] += t[0] >> 51;
     t[2] += t[1] >> 51;
     t[3] += t[2] >> 51;
@@ -152,6 +219,8 @@ pub fn mul(a: [u64; 4], b: [u64; 4]) -> [u64; 4] {
     // Lift carry into SIMD to prevent extraction
     ts[4] += Simd::from_array([t[3] >> 51, 0]);
 
+    // Parallel reduction: t[i] * RHO_{4-i} ≡ t[i] * 2^{51*i} * R^{-1} (mod P)
+    // This replaces sequential CIOS with independent multiplications.
     let r0 = smult_noinit(t[0] as u64 & MASK51, RHO_4);
     let r1 = smult_noinit(t[1] as u64 & MASK51, RHO_3);
     let r2 = smult_noinit(t[2] as u64 & MASK51, RHO_2);
@@ -163,6 +232,8 @@ pub fn mul(a: [u64; 4], b: [u64; 4]) -> [u64; 4] {
         ss[i] += r0[i] + r1[i] + r2[i] + r3[i];
     });
 
+    // Final Montgomery reduction: m = ss[0] * (-P^{-1}) mod 2^51, then add m*P
+    // to make the lowest limb zero (it gets shifted out).
     let m = (ss[0][0] as u64).wrapping_mul(U51_NP0) & MASK51;
     let mp = smult_noinit(m, U51_P);
 
@@ -170,7 +241,7 @@ pub fn mul(a: [u64; 4], b: [u64; 4]) -> [u64; 4] {
         ss[i] += mp[i];
     });
 
-    // Get rid of redundant SIMD form
+    // Realign from redundant SIMD form (misaligned lanes) to aligned lanes
     seq!( i in 0..2 {
         let s = i * 2;
         ss[s] += simd_swizzle!(Simd::splat(0), ss[s + 1], [0, 2]);
@@ -178,7 +249,7 @@ pub fn mul(a: [u64; 4], b: [u64; 4]) -> [u64; 4] {
     });
     ss[5 - 1] += simd_swizzle!(Simd::splat(0), ss[5], [0, 2]);
 
-    // Move to redundant scalar form
+    // Extract to redundant limb form (i64 with carry/borrow in upper bits)
     let mut s: [i64; 6] = [0; 6];
     seq!(i in 0..3 {
         let k = i * 2;
@@ -186,11 +257,12 @@ pub fn mul(a: [u64; 4], b: [u64; 4]) -> [u64; 4] {
         s[k + 1] = ss[k][1];
     });
 
+    // Propagate carry/borrow from s[0] and discard it (absorbed by Montgomery reduction)
     s[1] += s[0] >> 51;
     let s = [s[1], s[2], s[3], s[4], s[5]];
 
-    // 1 bit reduction to go from R^-255 to R^-256. reduce_ct does the preparation
-    // and the final shift is done as part of the conversion back to u256
+    // Bit adjustment: 5×51 = 255 bits, but we need R^{-256}. Divide by 2 via
+    // reduce_ct (adds P if odd) followed by right-shift in u255_to_u256_shr_1.
     let reduced = reduce_ct(s);
     let reduced = redundant_carry(reduced);
     let u256_result = u255_to_u256_shr_1(reduced);

From 19c2035d705d6c0139f8e34c4566b26df3f07451 Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Mon, 2 Feb 2026 19:14:50 +0800
Subject: [PATCH 20/21] =?UTF-8?q?rne:=20rename=20single=E2=86=92mono,=20po?=
 =?UTF-8?q?rtable=5Fsimd=E2=86=92batched?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 skyscraper/bn254-multiplier/benches/bench.rs  |  18 +--
 skyscraper/bn254-multiplier/examples/count.rs |   6 +-
 .../src/rne/{portable_simd.rs => batched.rs}  |   0
 skyscraper/bn254-multiplier/src/rne/mod.rs    |   6 +-
 .../src/rne/{single.rs => mono.rs}            | 113 +++++++++---------
 5 files changed, 71 insertions(+), 72 deletions(-)
 rename skyscraper/bn254-multiplier/src/rne/{portable_simd.rs => batched.rs} (100%)
 rename skyscraper/bn254-multiplier/src/rne/{single.rs => mono.rs} (78%)

diff --git a/skyscraper/bn254-multiplier/benches/bench.rs b/skyscraper/bn254-multiplier/benches/bench.rs
index 9ff580796..5577cec51 100644
--- a/skyscraper/bn254-multiplier/benches/bench.rs
+++ b/skyscraper/bn254-multiplier/benches/bench.rs
@@ -32,11 +32,11 @@ mod mul {
     }
 
     #[divan::bench]
-    fn single_b51(bencher: Bencher) {
+    fn mono_mul_b51(bencher: Bencher) {
         bencher
             //.counter(ItemsCount::new(1usize))
             .with_inputs(|| rng().random())
-            .bench_local_values(|(a, b)| rne::single::mul(a, b));
+            .bench_local_values(|(a, b)| rne::mono::mul(a, b));
     }
 
     #[divan::bench]
@@ -45,7 +45,7 @@ mod mul {
             //.counter(ItemsCount::new(2usize))
             .with_inputs(|| rng().random())
             .bench_local_values(|(a, b, c, d)| {
-                bn254_multiplier::rne::portable_simd::simd_mul(a, b, c, d)
+                bn254_multiplier::rne::batched::simd_mul(a, b, c, d)
             });
     }
 
@@ -150,19 +150,11 @@ mod sqr {
     }
 
     #[divan::bench]
-    fn single_sqr_b51(bencher: Bencher) {
+    fn mono_sqr_b51(bencher: Bencher) {
         bencher
             //.counter(ItemsCount::new(1usize))
             .with_inputs(|| rng().random())
-            .bench_local_values(|a| rne::single::sqr(a));
-    }
-
-    #[divan::bench]
-    fn single_mul_sqr_b51(bencher: Bencher) {
-        bencher
-            //.counter(ItemsCount::new(1usize))
-            .with_inputs(|| rng().random())
-            .bench_local_values(|a| rne::single::mul(a, a));
+            .bench_local_values(|a| rne::mono::sqr(a));
     }
 
     #[divan::bench]
diff --git a/skyscraper/bn254-multiplier/examples/count.rs b/skyscraper/bn254-multiplier/examples/count.rs
index f5fcafc35..6f5c7b243 100644
--- a/skyscraper/bn254-multiplier/examples/count.rs
+++ b/skyscraper/bn254-multiplier/examples/count.rs
@@ -1,6 +1,8 @@
+//! Helper script to calculate the magic numbers to cancel out the anchors used
+//! in FMA multiplication.
 use bn254_multiplier::rne::{
-    make_initial,
-    single::{mul, sqr},
+    mono::{mul, sqr},
+    simd_utils::make_initial,
 };
 
 const LO: usize = 1;
diff --git a/skyscraper/bn254-multiplier/src/rne/portable_simd.rs b/skyscraper/bn254-multiplier/src/rne/batched.rs
similarity index 100%
rename from skyscraper/bn254-multiplier/src/rne/portable_simd.rs
rename to skyscraper/bn254-multiplier/src/rne/batched.rs
diff --git a/skyscraper/bn254-multiplier/src/rne/mod.rs b/skyscraper/bn254-multiplier/src/rne/mod.rs
index cee3ce350..a22af985a 100644
--- a/skyscraper/bn254-multiplier/src/rne/mod.rs
+++ b/skyscraper/bn254-multiplier/src/rne/mod.rs
@@ -22,9 +22,9 @@
 //! Point Arithmetic on the GPU, 2018 IEEE 25th Symposium on Computer Arithmetic
 //! (ARITH) by Emmart, Zheng and Weems; which uses RTZ.
 
+pub mod batched;
 pub mod constants;
-pub mod portable_simd;
+pub mod mono;
 pub mod simd_utils;
-pub mod single;
 
-pub use {constants::*, portable_simd::*, simd_utils::*};
+pub use {batched::*, constants::*, mono::*};
diff --git a/skyscraper/bn254-multiplier/src/rne/single.rs b/skyscraper/bn254-multiplier/src/rne/mono.rs
similarity index 78%
rename from skyscraper/bn254-multiplier/src/rne/single.rs
rename to skyscraper/bn254-multiplier/src/rne/mono.rs
index d860656ab..a6a77bcfd 100644
--- a/skyscraper/bn254-multiplier/src/rne/single.rs
+++ b/skyscraper/bn254-multiplier/src/rne/mono.rs
@@ -1,23 +1,9 @@
-//! Unbatched SIMD Montgomery multiplication and squaring for BN254.
+//! SIMD Montgomery multiplication and squaring for BN254.
 //!
-//! This module provides single-input Montgomery multiplication using relaxed
-//! SIMD FMA operations. Unlike [`portable_simd`](super::portable_simd) which
-//! batches two multiplications per call, this processes one at a time—faster
-//! on WASM when inputs aren't naturally paired.
-//!
-//! # Redundant Representations
-//!
-//! The code uses two distinct "redundant" representations:
-//!
-//! 1. **Redundant SIMD form**: FMA produces high/low product parts in separate
-//!    SIMD vectors with misaligned lanes. E.g., `ts[k] = [limb_k_lo, limb_{k+1}_lo]`
-//!    and `ts[k+1] = [limb_k_hi, limb_{k+1}_hi]`. Swizzle operations realign these
-//!    so adjacent limbs occupy consecutive positions.
-//!
-//! 2. **Redundant limb form**: Each 51-bit limb may temporarily exceed 51 bits.
-//!    The excess bits are carries (or borrows) that propagate to higher limbs.
-//!    - `u64` limbs: excess bits are positive carries
-//!    - `i64` limbs: excess bits may be negative, representing borrows from above
+//! This module provides single-input Montgomery multiplication using (relaxed)
+//! SIMD FMA operations. Unlike [`batched`](super::batched) which processes two
+//! multiplications per call, this handles one at a time— this variant is the
+//! fastest on WASM.
 
 use {
     crate::rne::{
@@ -31,9 +17,9 @@ use {
 
 /// Propagate carries/borrows from redundant limb form to normalized form.
 ///
-/// Input `i64` limbs may have excess bits (positive = carry, negative = borrow).
-/// Output `u64` limbs are normalized to exactly 51 bits, except the MSB which
-/// absorbs any remaining carry and may exceed 51 bits.
+/// Input `i64` limbs may have excess bits (positive = carry, negative =
+/// borrow). Output `u64` limbs are normalized to exactly 51 bits, except the
+/// MSB which absorbs any remaining carry and may exceed 51 bits.
 #[inline(always)]
 fn redundant_carry<const N: usize>(t: [i64; N]) -> [u64; N] {
     let mut borrow = 0;
@@ -78,7 +64,7 @@ pub const fn i2f_scalar(a: u64) -> f64 {
 
 /// Multiply a 51-bit scalar `s` by a 5×51-bit vector `v`.
 ///
-/// Returns 6 SIMD vectors in redundant form. "noinit" means no FMA anchor
+/// Returns vector redundant SIMD and carry form. "noinit" means no FMA anchor
 /// compensation is applied; the caller must initialize accumulators with
 /// `make_initial` biases.
 #[inline(always)]
@@ -129,30 +115,7 @@ pub fn reduce_ct(mut a: [i64; 5]) -> [i64; 5] {
 /// Montgomery multiplication for BN254 scalar field.
 ///
 /// Computes `a * b * R^{-256} mod P` where R = 2^256.
-///
-/// # Algorithm Overview
-///
-/// Uses floating-point FMA for fast 51×51-bit multiplications via the
-/// Renes-Costello-Batina technique. The algorithm proceeds in phases:
-///
-/// 1. **Limb conversion**: Convert 4×64-bit inputs to 5×51-bit representation.
-///
-/// 2. **Schoolbook multiplication**: Compute the 10-limb product `a × b` using
-///    FMA-based multiply-accumulate. Results are stored in "redundant SIMD
-///    form" where high/low parts of products occupy separate SIMD lanes.
-///
-/// 3. **Parallel reduction**: Instead of sequential CIOS reduction, multiply
-///    the lower 4 limbs by precomputed `RHO_i = R^i mod P` constants. This
-///    converts `t[i] * 2^{51*i}` into an equivalent value in the upper limbs,
-///    allowing all reductions to proceed in parallel.
-///
-/// 4. **Final Montgomery step**: Compute `m = (result[0] * -P^{-1}) mod 2^51`,
-///    then add `m * P` to cancel the lowest limb.
-///
-/// 5. **Bit adjustment**: The 5×51 = 255 bit representation requires a final
-///    division by 2 (after conditional addition of P for odd results) to
-///    produce the correct R^{-256} Montgomery form.
-///
+
 /// # Preconditions
 ///
 /// - Both inputs must be < 2^255 (i.e., fit in 5×51-bit limbs)
@@ -164,6 +127,43 @@ pub fn reduce_ct(mut a: [i64; 5]) -> [i64; 5] {
 /// (vs. `simd_mul` which batches two).
 #[inline(always)]
 pub fn mul(a: [u64; 4], b: [u64; 4]) -> [u64; 4] {
+    // # Algorithm Overview
+    // Uses floating-point FMA for fast 51×51-bit multiplications via the
+    // Renes-Costello-Batina technique. The algorithm proceeds in phases:
+    //
+    // 1. **Limb conversion**: Convert 4×64-bit inputs to 5×51-bit representation.
+    //
+    // 2. **Schoolbook multiplication**: Compute the 10-limb product `a × b` using
+    //    FMA-based multiply-accumulate. Results are stored in "redundant SIMD form"
+    //    where high/low parts of products occupy separate SIMD lanes.
+    //
+    // 3. **Parallel reduction**: Instead of sequential CIOS reduction, multiply the
+    //    lower 4 limbs by precomputed `RHO_i = R^i mod P` constants. This converts
+    //    `t[i] * 2^{51*i}` into an equivalent value in the upper limbs, allowing
+    //    all reductions to proceed in parallel.
+    //
+    // 4. **Final Montgomery step**: Compute `m = (result[0] * -P^{-1}) mod 2^51`,
+    //    then add `m * P` to cancel the lowest limb.
+    //
+    // 5. **Bit adjustment**: The 5×51 = 255 bit representation requires a final
+    //    division by 2 (after conditional addition of P for odd results) to produce
+    //    the correct R^{-256} Montgomery form.
+    //
+    // # Redundant Representations
+    //
+    // The code uses two distinct "redundant" representations:
+    //
+    // 1. **Redundant SIMD form**: FMA produces high/low product parts in separate
+    //    SIMD vectors with misaligned lanes. E.g., `ts[k] = [limb_k_lo,
+    //    limb_{k+1}_lo]` and `ts[k+1] = [limb_k_hi, limb_{k+1}_hi]`. Swizzle
+    //    operations realign these so adjacent limbs occupy consecutive positions.
+    //
+    // 2. **Redundant limb form**: Each 51-bit limb may temporarily exceed 51 bits.
+    //    The excess bits are carries (or borrows) that propagate to higher limbs.
+    //    - `u64` limbs: excess bits are positive carries
+    //    - `i64` limbs: excess bits may be negative, representing borrows from
+    //      above
+
     let a = u256_to_u255(a);
     let b = u256_to_u255(b);
 
@@ -211,7 +211,8 @@ pub fn mul(a: [u64; 4], b: [u64; 4]) -> [u64; 4] {
         t[k + 1] = ts[k][1];
     });
 
-    // Propagate carries/borrows through redundant limb form (i64 allows negative excess)
+    // Propagate carries/borrows through redundant limb form (i64 allows negative
+    // excess)
     t[1] += t[0] >> 51;
     t[2] += t[1] >> 51;
     t[3] += t[2] >> 51;
@@ -257,7 +258,8 @@ pub fn mul(a: [u64; 4], b: [u64; 4]) -> [u64; 4] {
         s[k + 1] = ss[k][1];
     });
 
-    // Propagate carry/borrow from s[0] and discard it (absorbed by Montgomery reduction)
+    // Propagate carry/borrow from s[0] and discard it (absorbed by Montgomery
+    // reduction)
     s[1] += s[0] >> 51;
     let s = [s[1], s[2], s[3], s[4], s[5]];
 
@@ -281,12 +283,15 @@ pub fn u255_to_u256_shr_1(limbs: [u64; 5]) -> [u64; 4] {
     ]
 }
 
-/// Montgomery squaring: `(va*v0_b, v1_a*v1_b)`.
-/// input must fit in 2^255-1; no runtime checking. Output lives in Montgomery
-/// space R=256.
+/// Montgomery squaring: a²
+///
+/// Input and output are in Montgomery form R=256.
+///
+/// Precondition:
+/// - a < 2^255; no runtime check.
 #[inline(always)]
-pub fn sqr(v0_a: [u64; 4]) -> [u64; 4] {
-    mul(v0_a, v0_a)
+pub fn sqr(a: [u64; 4]) -> [u64; 4] {
+    mul(a, a)
 }
 
 #[cfg(not(target_arch = "wasm32"))]

From 14d0523ec3889323df7b87fe549ef91762b05b84 Mon Sep 17 00:00:00 2001
From: Aditya Bisht <adityabisht64@gmail.com>
Date: Mon, 9 Feb 2026 14:31:09 +0530
Subject: [PATCH 21/21] rne/mono: fix split doc comment and add parens for
 consistency

---
 skyscraper/bn254-multiplier/src/rne/mono.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/skyscraper/bn254-multiplier/src/rne/mono.rs b/skyscraper/bn254-multiplier/src/rne/mono.rs
index a6a77bcfd..a4a9e41e1 100644
--- a/skyscraper/bn254-multiplier/src/rne/mono.rs
+++ b/skyscraper/bn254-multiplier/src/rne/mono.rs
@@ -44,7 +44,7 @@ pub fn u256_to_u255(limbs: [u64; 4]) -> [u64; 5] {
         ((l0 >> 51) | (l1 << 13)) & MASK51,
         ((l1 >> 38) | (l2 << 26)) & MASK51,
         ((l2 >> 25) | (l3 << 39)) & MASK51,
-        l3 >> 12 & MASK51,
+        (l3 >> 12) & MASK51,
     ]
 }
 
@@ -115,7 +115,7 @@ pub fn reduce_ct(mut a: [i64; 5]) -> [i64; 5] {
 /// Montgomery multiplication for BN254 scalar field.
 ///
 /// Computes `a * b * R^{-256} mod P` where R = 2^256.
-
+///
 /// # Preconditions
 ///
 /// - Both inputs must be < 2^255 (i.e., fit in 5×51-bit limbs)