From c1e372932607b7c6d2f33e6256738a5af7c910f7 Mon Sep 17 00:00:00 2001 From: Xander van der Goot Date: Mon, 2 Feb 2026 19:13:49 +0800 Subject: [PATCH 01/21] rne/mono: scaffold schoolbook multiplication --- skyscraper/bn254-multiplier/benches/bench.rs | 10 +++++++++- skyscraper/bn254-multiplier/src/rne/mod.rs | 1 + 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/skyscraper/bn254-multiplier/benches/bench.rs b/skyscraper/bn254-multiplier/benches/bench.rs index 7d27d2563..72f252e39 100644 --- a/skyscraper/bn254-multiplier/benches/bench.rs +++ b/skyscraper/bn254-multiplier/benches/bench.rs @@ -7,7 +7,7 @@ use { // #[divan::bench_group] mod mul { - use super::*; + use {super::*, bn254_multiplier::rne}; #[divan::bench] fn scalar_mul(bencher: Bencher) { @@ -31,6 +31,14 @@ mod mul { .bench_local_values(|(a, b)| a * b); } + #[divan::bench] + fn single_b51(bencher: Bencher) { + bencher + //.counter(ItemsCount::new(1usize)) + .with_inputs(|| rng().random()) + .bench_local_values(|(a, b)| rne::single::simd_mul(a, b)); + } + #[divan::bench] fn simd_mul_51b(bencher: Bencher) { bencher diff --git a/skyscraper/bn254-multiplier/src/rne/mod.rs b/skyscraper/bn254-multiplier/src/rne/mod.rs index 415090bd7..cee3ce350 100644 --- a/skyscraper/bn254-multiplier/src/rne/mod.rs +++ b/skyscraper/bn254-multiplier/src/rne/mod.rs @@ -25,5 +25,6 @@ pub mod constants; pub mod portable_simd; pub mod simd_utils; +pub mod single; pub use {constants::*, portable_simd::*, simd_utils::*}; From ed4d4d89c8a567d4a1b02a0913a65b1a8007b219 Mon Sep 17 00:00:00 2001 From: Xander van der Goot Date: Mon, 2 Feb 2026 19:13:57 +0800 Subject: [PATCH 02/21] rne/mono: implement single-input mul function --- skyscraper/bn254-multiplier/src/rne/single.rs | 335 ++++++++++++++++++ 1 file changed, 335 insertions(+) create mode 100644 skyscraper/bn254-multiplier/src/rne/single.rs diff --git a/skyscraper/bn254-multiplier/src/rne/single.rs b/skyscraper/bn254-multiplier/src/rne/single.rs new file mode 100644 index 000000000..1fe990e1d --- /dev/null +++ b/skyscraper/bn254-multiplier/src/rne/single.rs @@ -0,0 +1,335 @@ +//! Portable SIMD Montgomery multiplication and squaring. +//! +//! Processes two independent field multiplications in parallel using 2-lane +//! SIMD. + +use { + crate::rne::{ + constants::*, + simd_utils::{ + addv_simd, fma, i2f, make_initial, reduce_ct_simd, smult_noinit_simd, + transpose_simd_to_u256, transpose_u256_to_simd, u255_to_u256_shr_1_simd, + u256_to_u255_simd, + }, + }, + core::{ + ops::BitAnd, + simd::{num::SimdFloat, Simd}, + }, + seq_macro::seq, + std::simd::{ + num::{SimdInt, SimdUint}, + simd_swizzle, + }, +}; + +/// Two parallel Montgomery squarings: `(v0², v1²)`. +/// input must fit in 2^255-1; no runtime checking +#[inline] +pub fn simd_sqr(v0_a: [u64; 4], v1_a: [u64; 4]) -> ([u64; 4], [u64; 4]) { + let v0_a = u256_to_u255_simd(transpose_u256_to_simd([v0_a, v1_a])); + + let mut t: [Simd; 10] = [Simd::splat(0); 10]; + + for i in 0..5 { + let avi: Simd = i2f(v0_a[i]); + for j in (i + 1)..5 { + let bvj: Simd = i2f(v0_a[j]); + let p_hi = fma(avi, bvj, Simd::splat(C1)); + let p_lo = fma(avi, bvj, Simd::splat(C2) - p_hi); + t[i + j + 1] += p_hi.to_bits().cast(); + t[i + j] += p_lo.to_bits().cast(); + } + } + + // Most shifting operations are more expensive addition thus for multiplying by + // 2 we use addition. + for i in 1..=8 { + t[i] += t[i]; + } + + for i in 0..5 { + let avi: Simd = i2f(v0_a[i]); + let p_hi = fma(avi, avi, Simd::splat(C1)); + let p_lo = fma(avi, avi, Simd::splat(C2) - p_hi); + t[i + i + 1] += p_hi.to_bits().cast(); + t[i + i] += p_lo.to_bits().cast(); + } + + t[0] += Simd::splat(make_initial(1, 0)); + t[9] += Simd::splat(make_initial(0, 6)); + t[1] += Simd::splat(make_initial(2, 1)); + t[8] += Simd::splat(make_initial(6, 7)); + t[2] += Simd::splat(make_initial(3, 2)); + t[7] += Simd::splat(make_initial(7, 8)); + t[3] += Simd::splat(make_initial(4, 3)); + t[6] += Simd::splat(make_initial(8, 9)); + t[4] += Simd::splat(make_initial(10, 4)); + t[5] += Simd::splat(make_initial(9, 10)); + + t[1] += t[0] >> 51; + t[2] += t[1] >> 51; + t[3] += t[2] >> 51; + t[4] += t[3] >> 51; + + let r0 = smult_noinit_simd(t[0].cast().bitand(Simd::splat(MASK51)), RHO_4); + let r1 = smult_noinit_simd(t[1].cast().bitand(Simd::splat(MASK51)), RHO_3); + let r2 = smult_noinit_simd(t[2].cast().bitand(Simd::splat(MASK51)), RHO_2); + let r3 = smult_noinit_simd(t[3].cast().bitand(Simd::splat(MASK51)), RHO_1); + + let s = [ + r0[0] + r1[0] + r2[0] + r3[0] + t[4], + r0[1] + r1[1] + r2[1] + r3[1] + t[5], + r0[2] + r1[2] + r2[2] + r3[2] + t[6], + r0[3] + r1[3] + r2[3] + r3[3] + t[7], + r0[4] + r1[4] + r2[4] + r3[4] + t[8], + r0[5] + r1[5] + r2[5] + r3[5] + t[9], + ]; + + // The upper bits of s will not affect the lower 51 bits of the product and + // therefore we only have to bitmask once. + let m = (s[0].cast() * Simd::splat(U51_NP0)).bitand(Simd::splat(MASK51)); + let mp = smult_noinit_simd(m, U51_P); + + let mut addi = addv_simd(s, mp); + // Apply carries before dropping the last limb + addi[1] += addi[0] >> 51; + let addi = [addi[1], addi[2], addi[3], addi[4], addi[5]]; + + // 1 bit reduction to go from R^-255 to R^-256. reduce_ct does the preparation + // and the final shift is done as part of the conversion back to u256 + let reduced = reduce_ct_simd(addi); + let reduced = redundant_carry(reduced); + let u256_result = u255_to_u256_shr_1_simd(reduced); + let v = transpose_simd_to_u256(u256_result); + (v[0], v[1]) +} + +/// Move redundant carries from lower limbs to the higher limbs such that all +/// limbs except the last one is 51 bits. The most significant limb can be +/// larger than 51 bits as the input can be bigger 2^255-1. +#[inline(always)] +fn redundant_carry(t: [Simd; N]) -> [Simd; N] +where + std::simd::LaneCount: std::simd::SupportedLaneCount, +{ + let mut borrow = Simd::splat(0); + let mut res = [Simd::splat(0); N]; + for i in 0..t.len() - 1 { + let tmp = t[i] + borrow; + res[i] = (tmp.cast()).bitand(Simd::splat(MASK51)); + borrow = tmp >> 51; + } + + res[N - 1] = (t[N - 1] + borrow).cast(); + res +} + +/// Convert 4×64-bit to 5×51-bit limb representation. +/// Input must fit in 255 bits; no runtime checking. +#[inline(always)] +pub fn u256_to_u255(limbs: [u64; 4]) -> [u64; 5] { + let [l0, l1, l2, l3] = limbs; + [ + (l0) & MASK51, + ((l0 >> 51) | (l1 << 13)) & MASK51, + ((l1 >> 38) | (l2 << 26)) & MASK51, + ((l2 >> 25) | (l3 << 39)) & MASK51, + l3 >> 12 & MASK51, + ] +} + +pub fn i2f_scalar(a: u64) -> f64 { + // This function has no target gating as we want to verify this function with + // kani and proptest on a different platform than wasm + + // By adding 2^52 represented as float (0x1p52) -> 0x433 << 52, we align the + // 52bit number fully in the mantissa. This can be done with a simple or. Then + // to convert a to it's floating point number we subtract this again. This way + // we only pay for the conversion of the lower bits and not the full 64 bits. + let exponent = 0x433 << 52; + let a: f64 = f64::from_bits(a | exponent); + let b: f64 = f64::from_bits(exponent); + a - b +} + +#[inline(always)] +pub fn smult_noinit(s: u64, v: [u64; 5]) -> [Simd; 6] { + let mut t = [Simd::splat(0); 6]; + let s: Simd = Simd::splat(i2f_scalar(s)); + + let v01 = Simd::from_array([v[0] as f64, v[1] as f64]); + let p_hi = fma(s, v01, Simd::splat(C1)); + let p_lo = fma(s, v01, Simd::splat(C2) - p_hi); + t[1] += p_hi.to_bits().cast(); + t[0] += p_lo.to_bits().cast(); + + let v23 = Simd::from_array([v[2] as f64, v[3] as f64]); + let p_hi = fma(s, v23, Simd::splat(C1)); + let p_lo_0 = fma(s, v23, Simd::splat(C2) - p_hi); + t[2] += p_hi.to_bits().cast(); + t[3] += p_lo_0.to_bits().cast(); + + let p_hi_2 = fma(s, Simd::splat(v[2] as f64), Simd::splat(C1)); + let p_lo_2 = fma(s, Simd::splat(v[2] as f64), Simd::splat(C2) - p_hi_2); + t[3] += p_hi_2.to_bits().cast(); + t[2] += p_lo_2.to_bits().cast(); + + let p_hi_3 = fma(s, Simd::splat(v[3] as f64), Simd::splat(C1)); + let p_lo_3 = fma(s, Simd::splat(v[3] as f64), Simd::splat(C2) - p_hi_3); + t[4] += p_hi_3.to_bits().cast(); + t[3] += p_lo_3.to_bits().cast(); + + let p_hi_4 = fma(s, Simd::splat(v[4] as f64), Simd::splat(C1)); + let p_lo_4 = fma(s, Simd::splat(v[4] as f64), Simd::splat(C2) - p_hi_4); + t[5] += p_hi_4.to_bits().cast(); + t[4] += p_lo_4.to_bits().cast(); + + t +} + +/// Two parallel Montgomery multiplications: `(v0_a*v0_b, v1_a*v1_b)`. +/// input must fit in 2^255-1; no runtime checking +#[inline(always)] +pub fn simd_mul(v0_a: [u64; 4], v0_b: [u64; 4]) -> ([u64; 4], [u64; 4]) { + let v0_a = u256_to_u255(v0_a); + let v0_b = u256_to_u255(v0_b); + + let mut t: [i64; 10] = [0; 10]; + t[0] = make_initial(1, 0); + t[9] = make_initial(0, 6); + t[1] = make_initial(2, 1); + t[8] = make_initial(6, 7); + t[2] = make_initial(3, 2); + t[7] = make_initial(7, 8); + t[3] = make_initial(4, 3); + t[6] = make_initial(8, 9); + t[4] = make_initial(10, 4); + t[5] = make_initial(9, 10); + + let mut ts = [Simd::splat(0); 9]; + + // Offset multiplication to have less intermediate data + seq!(i in 0..5{ + let ai: Simd = i2f(Simd::splat(v0_a[i])); + let b01: Simd = i2f(Simd::from_array([v0_b[0], v0_b[1]])); + let p_hi = fma(ai, b01, Simd::splat(C1)); + let p_lo = fma(ai, b01, Simd::splat(C2) - p_hi); + ts[i+1] += p_hi.to_bits().cast(); + ts[i+0] += p_lo.to_bits().cast(); + + let b23: Simd = i2f(Simd::from_array([v0_b[2], v0_b[3]])); + let p_hi = fma(ai, b23, Simd::splat(C1)); + let p_lo = fma(ai, b23, Simd::splat(C2) - p_hi); + ts[i+3] += p_hi.to_bits().cast(); + ts[i+2] += p_lo.to_bits().cast(); + + let b4 = Simd::splat(i2f_scalar(v0_b[4])); + let p_hi = fma(ai, b4, Simd::splat(C1)); + let p_lo = fma(ai, b4, Simd::splat(C2) - p_hi); + t[i + 5] = t[i + 5].wrapping_add(p_hi[0].to_bits() as i64); + t[i + 4] = t[i + 4].wrapping_add(p_lo[0].to_bits() as i64); + + }); + + for s in (1..9).step_by(2) { + ts[s - 1] += simd_swizzle!(Simd::splat(0), ts[s], [0, 2]); + ts[s + 1] += simd_swizzle!(Simd::splat(0), ts[s], [3, 0]); + } + + for s in (0..10).step_by(2) { + t[s] = t[s].wrapping_add(ts[s][0]); + t[s + 1] = t[s + 1].wrapping_add(ts[s][1]); + } + + // sign extend redundant carries + t[1] += t[0] >> 51; + t[2] += t[1] >> 51; + t[3] += t[2] >> 51; + t[4] += t[3] >> 51; + + let r0 = smult_noinit_simd(Simd::splat(t[0] as u64 & MASK51), RHO_4); + let r1 = smult_noinit_simd(Simd::splat(t[1] as u64 & MASK51), RHO_3); + let r2 = smult_noinit_simd(Simd::splat(t[2] as u64 & MASK51), RHO_2); + let r3 = smult_noinit_simd(Simd::splat(t[3] as u64 & MASK51), RHO_1); + + let s = [ + r0[0] + r1[0] + r2[0] + r3[0] + Simd::splat(t[4]), + r0[1] + r1[1] + r2[1] + r3[1] + Simd::splat(t[5]), + r0[2] + r1[2] + r2[2] + r3[2] + Simd::splat(t[6]), + r0[3] + r1[3] + r2[3] + r3[3] + Simd::splat(t[7]), + r0[4] + r1[4] + r2[4] + r3[4] + Simd::splat(t[8]), + r0[5] + r1[5] + r2[5] + r3[5] + Simd::splat(t[9]), + ]; + + let m = (s[0].cast() * Simd::splat(U51_NP0)).bitand(Simd::splat(MASK51)); + let mp = smult_noinit_simd(m, U51_P); + + let mut addi = addv_simd(s, mp); + addi[1] += addi[0] >> 51; + let addi = [addi[1], addi[2], addi[3], addi[4], addi[5]]; + + // 1 bit reduction to go from R^-255 to R^-256. reduce_ct does the preparation + // and the final shift is done as part of the conversion back to u256 + let reduced = reduce_ct_simd(addi); + let reduced = redundant_carry(reduced); + let u256_result = u255_to_u256_shr_1_simd(reduced); + let v = transpose_simd_to_u256(u256_result); + (v[0], v[1]) +} + +#[cfg(not(target_arch = "wasm32"))] +#[cfg(test)] +mod tests { + use { + super::*, + crate::{rne::simd_utils::u255_to_u256_simd, test_utils::ark_ff_reference}, + ark_bn254::Fr, + ark_ff::{BigInt, PrimeField}, + proptest::{ + prelude::{prop, Strategy}, + prop_assert_eq, proptest, + }, + }; + + #[test] + fn test_simd_mul() { + proptest!(|( + a in limbs5_51(), + b in limbs5_51(), + )| { + let a: [Simd;_] = a.map(Simd::splat); + let b: [Simd;_] = b.map(Simd::splat); + let a = u255_to_u256_simd(a).map(|x|x[0]); + let b = u255_to_u256_simd(b).map(|x|x[0]); + let (ab, _bc) = simd_mul(a, b); + let ab_ref = ark_ff_reference(a, b); + let ab = Fr::new(BigInt(ab)); + prop_assert_eq!(ab_ref, ab, "mismatch: l = {:X}, b = {:X}", ab_ref.into_bigint(), ab.into_bigint()); + }) + } + + // #[test] + // fn test_simd_sqr() { + // proptest!(|( + // a in limbs5_51(), + // b in limbs5_51(), + // )| { + // let a: [Simd;_] = a.map(Simd::splat); + // let b: [Simd;_] = b.map(Simd::splat); + // let a = u255_to_u256_simd(a).map(|x|x[0]); + // let b = u255_to_u256_simd(b).map(|x|x[0]); + // let (a2, _b2) = simd_mul(a, b, b); + // let (a2s, _b2s) = simd_sqr(a, b); + // prop_assert_eq!(a2, a2s); + // }) + // } + + fn limb51() -> impl Strategy { + 0u64..(1u64 << 51) + } + + fn limbs5_51() -> impl Strategy { + prop::array::uniform5(limb51()) + } +} From e565c21b285ba58f1e4873ed4fa6c46622e39be2 Mon Sep 17 00:00:00 2001 From: Xander van der Goot Date: Mon, 2 Feb 2026 19:14:00 +0800 Subject: [PATCH 03/21] =?UTF-8?q?rne/mono:=205=C3=975=20multiplication=20u?= =?UTF-8?q?sing=206-wide=20SIMD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- skyscraper/bn254-multiplier/src/rne/single.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/skyscraper/bn254-multiplier/src/rne/single.rs b/skyscraper/bn254-multiplier/src/rne/single.rs index 1fe990e1d..5a691ea7d 100644 --- a/skyscraper/bn254-multiplier/src/rne/single.rs +++ b/skyscraper/bn254-multiplier/src/rne/single.rs @@ -197,15 +197,15 @@ pub fn simd_mul(v0_a: [u64; 4], v0_b: [u64; 4]) -> ([u64; 4], [u64; 4]) { let mut t: [i64; 10] = [0; 10]; t[0] = make_initial(1, 0); - t[9] = make_initial(0, 6); + t[9] = make_initial(1, 6); t[1] = make_initial(2, 1); - t[8] = make_initial(6, 7); + t[8] = make_initial(7, 7); t[2] = make_initial(3, 2); - t[7] = make_initial(7, 8); + t[7] = make_initial(8, 8); t[3] = make_initial(4, 3); - t[6] = make_initial(8, 9); + t[6] = make_initial(9, 9); t[4] = make_initial(10, 4); - t[5] = make_initial(9, 10); + t[5] = make_initial(10, 10); let mut ts = [Simd::splat(0); 9]; @@ -224,11 +224,11 @@ pub fn simd_mul(v0_a: [u64; 4], v0_b: [u64; 4]) -> ([u64; 4], [u64; 4]) { ts[i+3] += p_hi.to_bits().cast(); ts[i+2] += p_lo.to_bits().cast(); - let b4 = Simd::splat(i2f_scalar(v0_b[4])); + let b4 = Simd::from_array([i2f_scalar(v0_b[4]),0.]); let p_hi = fma(ai, b4, Simd::splat(C1)); let p_lo = fma(ai, b4, Simd::splat(C2) - p_hi); t[i + 5] = t[i + 5].wrapping_add(p_hi[0].to_bits() as i64); - t[i + 4] = t[i + 4].wrapping_add(p_lo[0].to_bits() as i64); + ts[i + 4] += p_lo.to_bits().cast(); }); From d5f84ecbc0a0c68a75ab770cc7c857e9e1635aa9 Mon Sep 17 00:00:00 2001 From: Xander van der Goot Date: Mon, 2 Feb 2026 19:14:03 +0800 Subject: [PATCH 04/21] rne/mono: complete schoolbook phase --- skyscraper/bn254-multiplier/src/rne/single.rs | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/skyscraper/bn254-multiplier/src/rne/single.rs b/skyscraper/bn254-multiplier/src/rne/single.rs index 5a691ea7d..d2f394eaa 100644 --- a/skyscraper/bn254-multiplier/src/rne/single.rs +++ b/skyscraper/bn254-multiplier/src/rne/single.rs @@ -197,17 +197,17 @@ pub fn simd_mul(v0_a: [u64; 4], v0_b: [u64; 4]) -> ([u64; 4], [u64; 4]) { let mut t: [i64; 10] = [0; 10]; t[0] = make_initial(1, 0); - t[9] = make_initial(1, 6); + t[9] = make_initial(1, 7); t[1] = make_initial(2, 1); - t[8] = make_initial(7, 7); + t[8] = make_initial(7, 8); t[2] = make_initial(3, 2); - t[7] = make_initial(8, 8); + t[7] = make_initial(8, 9); t[3] = make_initial(4, 3); - t[6] = make_initial(9, 9); + t[6] = make_initial(9, 10); t[4] = make_initial(10, 4); t[5] = make_initial(10, 10); - let mut ts = [Simd::splat(0); 9]; + let mut ts = [Simd::splat(0); 10]; // Offset multiplication to have less intermediate data seq!(i in 0..5{ @@ -227,7 +227,7 @@ pub fn simd_mul(v0_a: [u64; 4], v0_b: [u64; 4]) -> ([u64; 4], [u64; 4]) { let b4 = Simd::from_array([i2f_scalar(v0_b[4]),0.]); let p_hi = fma(ai, b4, Simd::splat(C1)); let p_lo = fma(ai, b4, Simd::splat(C2) - p_hi); - t[i + 5] = t[i + 5].wrapping_add(p_hi[0].to_bits() as i64); + ts[i + 5] += p_hi.to_bits().cast(); ts[i + 4] += p_lo.to_bits().cast(); }); @@ -236,6 +236,7 @@ pub fn simd_mul(v0_a: [u64; 4], v0_b: [u64; 4]) -> ([u64; 4], [u64; 4]) { ts[s - 1] += simd_swizzle!(Simd::splat(0), ts[s], [0, 2]); ts[s + 1] += simd_swizzle!(Simd::splat(0), ts[s], [3, 0]); } + ts[9 - 1] += simd_swizzle!(Simd::splat(0), ts[9], [0, 2]); for s in (0..10).step_by(2) { t[s] = t[s].wrapping_add(ts[s][0]); From 8493700ca57ca012b2bdb980ecd0cc152b196331 Mon Sep 17 00:00:00 2001 From: Xander van der Goot Date: Mon, 2 Feb 2026 19:14:06 +0800 Subject: [PATCH 05/21] rne/mono: add SIMD initialization and RHO swizzle --- skyscraper/bn254-multiplier/src/rne/single.rs | 53 +++++++++++-------- 1 file changed, 31 insertions(+), 22 deletions(-) diff --git a/skyscraper/bn254-multiplier/src/rne/single.rs b/skyscraper/bn254-multiplier/src/rne/single.rs index d2f394eaa..57912c503 100644 --- a/skyscraper/bn254-multiplier/src/rne/single.rs +++ b/skyscraper/bn254-multiplier/src/rne/single.rs @@ -195,19 +195,12 @@ pub fn simd_mul(v0_a: [u64; 4], v0_b: [u64; 4]) -> ([u64; 4], [u64; 4]) { let v0_a = u256_to_u255(v0_a); let v0_b = u256_to_u255(v0_b); - let mut t: [i64; 10] = [0; 10]; - t[0] = make_initial(1, 0); - t[9] = make_initial(1, 7); - t[1] = make_initial(2, 1); - t[8] = make_initial(7, 8); - t[2] = make_initial(3, 2); - t[7] = make_initial(8, 9); - t[3] = make_initial(4, 3); - t[6] = make_initial(9, 10); - t[4] = make_initial(10, 4); - t[5] = make_initial(10, 10); - let mut ts = [Simd::splat(0); 10]; + ts[0] = Simd::from_array([make_initial(1, 0), make_initial(2, 1)]); + ts[2] = Simd::from_array([make_initial(3, 2), make_initial(4, 3)]); + ts[4] = Simd::from_array([make_initial(10, 4), make_initial(10, 10)]); + ts[6] = Simd::from_array([make_initial(9, 10), make_initial(8, 9)]); + ts[8] = Simd::from_array([make_initial(7, 8), make_initial(1, 7)]); // Offset multiplication to have less intermediate data seq!(i in 0..5{ @@ -232,28 +225,44 @@ pub fn simd_mul(v0_a: [u64; 4], v0_b: [u64; 4]) -> ([u64; 4], [u64; 4]) { }); - for s in (1..9).step_by(2) { - ts[s - 1] += simd_swizzle!(Simd::splat(0), ts[s], [0, 2]); - ts[s + 1] += simd_swizzle!(Simd::splat(0), ts[s], [3, 0]); - } - ts[9 - 1] += simd_swizzle!(Simd::splat(0), ts[9], [0, 2]); + let mut t: [i64; 10] = [0; 10]; - for s in (0..10).step_by(2) { - t[s] = t[s].wrapping_add(ts[s][0]); - t[s + 1] = t[s + 1].wrapping_add(ts[s][1]); - } + seq!( i in 0..2 { + let s = i * 2; + ts[s] += simd_swizzle!(Simd::splat(0), ts[s + 1], [0, 2]); + ts[s + 2] += simd_swizzle!(Simd::splat(0), ts[s + 1], [3, 0]); + t[s] = ts[s][0]; + t[s + 1] = ts[s][1]; + }); // sign extend redundant carries t[1] += t[0] >> 51; t[2] += t[1] >> 51; t[3] += t[2] >> 51; - t[4] += t[3] >> 51; + + // Lift carry into SIMD to prevent extraction + ts[4] += Simd::from_array([t[3] >> 51, 0]); let r0 = smult_noinit_simd(Simd::splat(t[0] as u64 & MASK51), RHO_4); let r1 = smult_noinit_simd(Simd::splat(t[1] as u64 & MASK51), RHO_3); let r2 = smult_noinit_simd(Simd::splat(t[2] as u64 & MASK51), RHO_2); let r3 = smult_noinit_simd(Simd::splat(t[3] as u64 & MASK51), RHO_1); + seq!( i in 2..4 { + let s = i * 2; + ts[s] += simd_swizzle!(Simd::splat(0), ts[s + 1], [0, 2]); + ts[s + 2] += simd_swizzle!(Simd::splat(0), ts[s + 1], [3, 0]); + }); + ts[9 - 1] += simd_swizzle!(Simd::splat(0), ts[9], [0, 2]); + + // After this point only the even ts matter + + seq!(i in 2..5 { + let s = i * 2; + t[s] = ts[s][0]; + t[s + 1] = ts[s][1]; + }); + let s = [ r0[0] + r1[0] + r2[0] + r3[0] + Simd::splat(t[4]), r0[1] + r1[1] + r2[1] + r3[1] + Simd::splat(t[5]), From df6219e3b258645e4e85e656d4f6151c22e3f62c Mon Sep 17 00:00:00 2001 From: Xander van der Goot Date: Mon, 2 Feb 2026 19:14:08 +0800 Subject: [PATCH 06/21] rne/mono: swizzle after Montgomery reduction --- skyscraper/bn254-multiplier/src/rne/single.rs | 69 +++++++++---------- 1 file changed, 33 insertions(+), 36 deletions(-) diff --git a/skyscraper/bn254-multiplier/src/rne/single.rs b/skyscraper/bn254-multiplier/src/rne/single.rs index 57912c503..e0f92f1f8 100644 --- a/skyscraper/bn254-multiplier/src/rne/single.rs +++ b/skyscraper/bn254-multiplier/src/rne/single.rs @@ -166,24 +166,15 @@ pub fn smult_noinit(s: u64, v: [u64; 5]) -> [Simd; 6] { let v23 = Simd::from_array([v[2] as f64, v[3] as f64]); let p_hi = fma(s, v23, Simd::splat(C1)); - let p_lo_0 = fma(s, v23, Simd::splat(C2) - p_hi); - t[2] += p_hi.to_bits().cast(); - t[3] += p_lo_0.to_bits().cast(); + let p_lo = fma(s, v23, Simd::splat(C2) - p_hi); + t[3] += p_hi.to_bits().cast(); + t[2] += p_lo.to_bits().cast(); - let p_hi_2 = fma(s, Simd::splat(v[2] as f64), Simd::splat(C1)); - let p_lo_2 = fma(s, Simd::splat(v[2] as f64), Simd::splat(C2) - p_hi_2); - t[3] += p_hi_2.to_bits().cast(); - t[2] += p_lo_2.to_bits().cast(); - - let p_hi_3 = fma(s, Simd::splat(v[3] as f64), Simd::splat(C1)); - let p_lo_3 = fma(s, Simd::splat(v[3] as f64), Simd::splat(C2) - p_hi_3); - t[4] += p_hi_3.to_bits().cast(); - t[3] += p_lo_3.to_bits().cast(); - - let p_hi_4 = fma(s, Simd::splat(v[4] as f64), Simd::splat(C1)); - let p_lo_4 = fma(s, Simd::splat(v[4] as f64), Simd::splat(C2) - p_hi_4); - t[5] += p_hi_4.to_bits().cast(); - t[4] += p_lo_4.to_bits().cast(); + let v45 = Simd::from_array([v[4] as f64, 0.]); + let p_hi = fma(s, v45, Simd::splat(C1)); + let p_lo = fma(s, v45, Simd::splat(C2) - p_hi); + t[5] += Simd::from_array([p_hi[0].to_bits() as i64, 0]); + t[4] += Simd::from_array([p_lo[0].to_bits() as i64, 0]); t } @@ -225,7 +216,7 @@ pub fn simd_mul(v0_a: [u64; 4], v0_b: [u64; 4]) -> ([u64; 4], [u64; 4]) { }); - let mut t: [i64; 10] = [0; 10]; + let mut t: [i64; 4] = [0; 4]; seq!( i in 0..2 { let s = i * 2; @@ -243,33 +234,39 @@ pub fn simd_mul(v0_a: [u64; 4], v0_b: [u64; 4]) -> ([u64; 4], [u64; 4]) { // Lift carry into SIMD to prevent extraction ts[4] += Simd::from_array([t[3] >> 51, 0]); - let r0 = smult_noinit_simd(Simd::splat(t[0] as u64 & MASK51), RHO_4); - let r1 = smult_noinit_simd(Simd::splat(t[1] as u64 & MASK51), RHO_3); - let r2 = smult_noinit_simd(Simd::splat(t[2] as u64 & MASK51), RHO_2); - let r3 = smult_noinit_simd(Simd::splat(t[3] as u64 & MASK51), RHO_1); + let r0 = smult_noinit(t[0] as u64 & MASK51, RHO_4); + let r1 = smult_noinit(t[1] as u64 & MASK51, RHO_3); + let r2 = smult_noinit(t[2] as u64 & MASK51, RHO_2); + let r3 = smult_noinit(t[3] as u64 & MASK51, RHO_1); - seq!( i in 2..4 { - let s = i * 2; - ts[s] += simd_swizzle!(Simd::splat(0), ts[s + 1], [0, 2]); - ts[s + 2] += simd_swizzle!(Simd::splat(0), ts[s + 1], [3, 0]); + let mut ss = [ts[4], ts[5], ts[6], ts[7], ts[8], ts[9]]; + + seq!( i in 0..6 { + ss[i] += r0[i] + r1[i] + r2[i] + r3[i]; }); - ts[9 - 1] += simd_swizzle!(Simd::splat(0), ts[9], [0, 2]); + seq!( i in 0..2 { + let s = i * 2; + ss[s] += simd_swizzle!(Simd::splat(0), ss[s + 1], [0, 2]); + ss[s + 2] += simd_swizzle!(Simd::splat(0), ss[s + 1], [3, 0]); + }); + ss[5 - 1] += simd_swizzle!(Simd::splat(0), ss[5], [0, 2]); // After this point only the even ts matter - seq!(i in 2..5 { + let mut t = [0; 6]; + seq!(i in 0..3 { let s = i * 2; - t[s] = ts[s][0]; - t[s + 1] = ts[s][1]; + t[s] = ss[s][0]; + t[s + 1] = ss[s][1]; }); let s = [ - r0[0] + r1[0] + r2[0] + r3[0] + Simd::splat(t[4]), - r0[1] + r1[1] + r2[1] + r3[1] + Simd::splat(t[5]), - r0[2] + r1[2] + r2[2] + r3[2] + Simd::splat(t[6]), - r0[3] + r1[3] + r2[3] + r3[3] + Simd::splat(t[7]), - r0[4] + r1[4] + r2[4] + r3[4] + Simd::splat(t[8]), - r0[5] + r1[5] + r2[5] + r3[5] + Simd::splat(t[9]), + Simd::splat(t[0]), + Simd::splat(t[1]), + Simd::splat(t[2]), + Simd::splat(t[3]), + Simd::splat(t[4]), + Simd::splat(t[5]), ]; let m = (s[0].cast() * Simd::splat(U51_NP0)).bitand(Simd::splat(MASK51)); From d7319144c5711ebd5ceab8b9aeb352f277a67925 Mon Sep 17 00:00:00 2001 From: Xander van der Goot Date: Mon, 2 Feb 2026 19:14:11 +0800 Subject: [PATCH 07/21] rne/mono: swizzle after m*P Montgomery step --- skyscraper/bn254-multiplier/src/rne/single.rs | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/skyscraper/bn254-multiplier/src/rne/single.rs b/skyscraper/bn254-multiplier/src/rne/single.rs index e0f92f1f8..d571da019 100644 --- a/skyscraper/bn254-multiplier/src/rne/single.rs +++ b/skyscraper/bn254-multiplier/src/rne/single.rs @@ -245,6 +245,13 @@ pub fn simd_mul(v0_a: [u64; 4], v0_b: [u64; 4]) -> ([u64; 4], [u64; 4]) { ss[i] += r0[i] + r1[i] + r2[i] + r3[i]; }); + let m = (ss[0][0] as u64).wrapping_mul(U51_NP0) & MASK51; + let mp = smult_noinit(m, U51_P); + + seq!( i in 0..6 { + ss[i] += mp[i]; + }); + seq!( i in 0..2 { let s = i * 2; ss[s] += simd_swizzle!(Simd::splat(0), ss[s + 1], [0, 2]); @@ -253,32 +260,25 @@ pub fn simd_mul(v0_a: [u64; 4], v0_b: [u64; 4]) -> ([u64; 4], [u64; 4]) { ss[5 - 1] += simd_swizzle!(Simd::splat(0), ss[5], [0, 2]); // After this point only the even ts matter - let mut t = [0; 6]; + let mut s: [i64; 6] = [0; 6]; seq!(i in 0..3 { - let s = i * 2; - t[s] = ss[s][0]; - t[s + 1] = ss[s][1]; + let k = i * 2; + s[k] = ss[k][0]; + s[k + 1] = ss[k][1]; }); + s[1] += s[0] >> 51; let s = [ - Simd::splat(t[0]), - Simd::splat(t[1]), - Simd::splat(t[2]), - Simd::splat(t[3]), - Simd::splat(t[4]), - Simd::splat(t[5]), + Simd::splat(s[1]), + Simd::splat(s[2]), + Simd::splat(s[3]), + Simd::splat(s[4]), + Simd::splat(s[5]), ]; - let m = (s[0].cast() * Simd::splat(U51_NP0)).bitand(Simd::splat(MASK51)); - let mp = smult_noinit_simd(m, U51_P); - - let mut addi = addv_simd(s, mp); - addi[1] += addi[0] >> 51; - let addi = [addi[1], addi[2], addi[3], addi[4], addi[5]]; - // 1 bit reduction to go from R^-255 to R^-256. reduce_ct does the preparation // and the final shift is done as part of the conversion back to u256 - let reduced = reduce_ct_simd(addi); + let reduced = reduce_ct_simd(s); let reduced = redundant_carry(reduced); let u256_result = u255_to_u256_shr_1_simd(reduced); let v = transpose_simd_to_u256(u256_result); From 7c35215fa0b89f137400b57a1dde01e347ea1b4b Mon Sep 17 00:00:00 2001 From: Xander van der Goot Date: Mon, 2 Feb 2026 19:14:14 +0800 Subject: [PATCH 08/21] rne/mono: 51-bit limb SIMD implementation --- .../bn254-multiplier/src/rne/simd_utils.rs | 12 ++-- skyscraper/bn254-multiplier/src/rne/single.rs | 68 ++++++++++++++----- 2 files changed, 59 insertions(+), 21 deletions(-) diff --git a/skyscraper/bn254-multiplier/src/rne/simd_utils.rs b/skyscraper/bn254-multiplier/src/rne/simd_utils.rs index 93acd71b0..17157f105 100644 --- a/skyscraper/bn254-multiplier/src/rne/simd_utils.rs +++ b/skyscraper/bn254-multiplier/src/rne/simd_utils.rs @@ -1,17 +1,21 @@ //! SIMD utilities for RNE Montgomery multiplication. use { - crate::rne::constants::{C1, C2, C3, MASK51, U51_P}, + crate::rne::{ + constants::{C1, C2, C3, MASK51}, + U51_P, + }, core::{ - array, ops::BitAnd, simd::{ - cmp::SimdPartialEq, num::{SimdFloat, SimdInt, SimdUint}, Simd, }, }, - std::simd::{LaneCount, SupportedLaneCount}, + std::{ + array, + simd::{cmp::SimdPartialEq, LaneCount, SupportedLaneCount}, + }, }; #[inline(always)] /// On WASM there is no single specialised instruction to cast an integer to a diff --git a/skyscraper/bn254-multiplier/src/rne/single.rs b/skyscraper/bn254-multiplier/src/rne/single.rs index d571da019..06621fba4 100644 --- a/skyscraper/bn254-multiplier/src/rne/single.rs +++ b/skyscraper/bn254-multiplier/src/rne/single.rs @@ -105,9 +105,6 @@ pub fn simd_sqr(v0_a: [u64; 4], v1_a: [u64; 4]) -> ([u64; 4], [u64; 4]) { (v[0], v[1]) } -/// Move redundant carries from lower limbs to the higher limbs such that all -/// limbs except the last one is 51 bits. The most significant limb can be -/// larger than 51 bits as the input can be bigger 2^255-1. #[inline(always)] fn redundant_carry(t: [Simd; N]) -> [Simd; N] where @@ -125,6 +122,23 @@ where res } +/// Move redundant carries from lower limbs to the higher limbs such that all +/// limbs except the last one is 51 bits. The most significant limb can be +/// larger than 51 bits as the input can be bigger 2^255-1. +#[inline(always)] +fn redundant_carry_scalar(t: [i64; N]) -> [u64; N] { + let mut borrow = 0; + let mut res = [0; N]; + for i in 0..t.len() - 1 { + let tmp = t[i] + borrow; + res[i] = (tmp as u64) & MASK51; + borrow = tmp >> 51; + } + + res[N - 1] = (t[N - 1] + borrow) as u64; + res +} + /// Convert 4×64-bit to 5×51-bit limb representation. /// Input must fit in 255 bits; no runtime checking. #[inline(always)] @@ -179,10 +193,25 @@ pub fn smult_noinit(s: u64, v: [u64; 5]) -> [Simd; 6] { t } +#[inline(always)] +pub fn reduce_ct(mut a: [i64; 5]) -> [i64; 5] { + // To reduce Check whether the least significant bit is set + let mask = -(a[0] & 1); + + seq!( i in 0..5 { + a[i] += U51_P[i] as i64 & mask; + }); + + // Check that final result is even + debug_assert!(a[0] & 1 == 0); + + a +} + /// Two parallel Montgomery multiplications: `(v0_a*v0_b, v1_a*v1_b)`. /// input must fit in 2^255-1; no runtime checking #[inline(always)] -pub fn simd_mul(v0_a: [u64; 4], v0_b: [u64; 4]) -> ([u64; 4], [u64; 4]) { +pub fn simd_mul(v0_a: [u64; 4], v0_b: [u64; 4]) -> [u64; 4] { let v0_a = u256_to_u255(v0_a); let v0_b = u256_to_u255(v0_b); @@ -268,21 +297,26 @@ pub fn simd_mul(v0_a: [u64; 4], v0_b: [u64; 4]) -> ([u64; 4], [u64; 4]) { }); s[1] += s[0] >> 51; - let s = [ - Simd::splat(s[1]), - Simd::splat(s[2]), - Simd::splat(s[3]), - Simd::splat(s[4]), - Simd::splat(s[5]), - ]; + let s = [s[1], s[2], s[3], s[4], s[5]]; // 1 bit reduction to go from R^-255 to R^-256. reduce_ct does the preparation // and the final shift is done as part of the conversion back to u256 - let reduced = reduce_ct_simd(s); - let reduced = redundant_carry(reduced); - let u256_result = u255_to_u256_shr_1_simd(reduced); - let v = transpose_simd_to_u256(u256_result); - (v[0], v[1]) + let reduced = reduce_ct(s); + let reduced = redundant_carry_scalar(reduced); + let u256_result = u255_to_u256_shr_1(reduced); + u256_result +} + +/// Convert 5×51-bit to 4×64-bit with simultaneous division by 2. +#[inline(always)] +pub fn u255_to_u256_shr_1(limbs: [u64; 5]) -> [u64; 4] { + let [l0, l1, l2, l3, l4] = limbs; + [ + (l0 >> 1) | (l1 << 50), + (l1 >> 14) | (l2 << 37), + (l2 >> 27) | (l3 << 24), + (l3 >> 40) | (l4 << 11), + ] } #[cfg(not(target_arch = "wasm32"))] @@ -309,7 +343,7 @@ mod tests { let b: [Simd;_] = b.map(Simd::splat); let a = u255_to_u256_simd(a).map(|x|x[0]); let b = u255_to_u256_simd(b).map(|x|x[0]); - let (ab, _bc) = simd_mul(a, b); + let ab = simd_mul(a, b); let ab_ref = ark_ff_reference(a, b); let ab = Fr::new(BigInt(ab)); prop_assert_eq!(ab_ref, ab, "mismatch: l = {:X}, b = {:X}", ab_ref.into_bigint(), ab.into_bigint()); From 9696d04ecde00ff8db2f2c4df4a16727af665150 Mon Sep 17 00:00:00 2001 From: Xander van der Goot Date: Mon, 2 Feb 2026 19:14:16 +0800 Subject: [PATCH 09/21] rne/mono: extraction-free reduction phase --- skyscraper/bn254-multiplier/src/rne/single.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/skyscraper/bn254-multiplier/src/rne/single.rs b/skyscraper/bn254-multiplier/src/rne/single.rs index 06621fba4..7e4de29b1 100644 --- a/skyscraper/bn254-multiplier/src/rne/single.rs +++ b/skyscraper/bn254-multiplier/src/rne/single.rs @@ -187,8 +187,8 @@ pub fn smult_noinit(s: u64, v: [u64; 5]) -> [Simd; 6] { let v45 = Simd::from_array([v[4] as f64, 0.]); let p_hi = fma(s, v45, Simd::splat(C1)); let p_lo = fma(s, v45, Simd::splat(C2) - p_hi); - t[5] += Simd::from_array([p_hi[0].to_bits() as i64, 0]); - t[4] += Simd::from_array([p_lo[0].to_bits() as i64, 0]); + t[5] += p_hi.to_bits().cast(); + t[4] += p_lo.to_bits().cast(); t } @@ -220,7 +220,7 @@ pub fn simd_mul(v0_a: [u64; 4], v0_b: [u64; 4]) -> [u64; 4] { ts[2] = Simd::from_array([make_initial(3, 2), make_initial(4, 3)]); ts[4] = Simd::from_array([make_initial(10, 4), make_initial(10, 10)]); ts[6] = Simd::from_array([make_initial(9, 10), make_initial(8, 9)]); - ts[8] = Simd::from_array([make_initial(7, 8), make_initial(1, 7)]); + ts[8] = Simd::from_array([make_initial(7, 8), make_initial(6, 7)]); // Offset multiplication to have less intermediate data seq!(i in 0..5{ @@ -281,14 +281,15 @@ pub fn simd_mul(v0_a: [u64; 4], v0_b: [u64; 4]) -> [u64; 4] { ss[i] += mp[i]; }); + // Get rid of redundant SIMD form seq!( i in 0..2 { let s = i * 2; ss[s] += simd_swizzle!(Simd::splat(0), ss[s + 1], [0, 2]); ss[s + 2] += simd_swizzle!(Simd::splat(0), ss[s + 1], [3, 0]); }); ss[5 - 1] += simd_swizzle!(Simd::splat(0), ss[5], [0, 2]); - // After this point only the even ts matter + // Move to redundant scalar form let mut s: [i64; 6] = [0; 6]; seq!(i in 0..3 { let k = i * 2; From 70deb8c1703d04f06aeb5442862871d47597364d Mon Sep 17 00:00:00 2001 From: Xander van der Goot Date: Mon, 2 Feb 2026 19:14:19 +0800 Subject: [PATCH 10/21] rne/mono: WIP checkpoint --- skyscraper/bn254-multiplier/benches/bench.rs | 8 + skyscraper/bn254-multiplier/src/rne/single.rs | 245 +++++++++--------- 2 files changed, 125 insertions(+), 128 deletions(-) diff --git a/skyscraper/bn254-multiplier/benches/bench.rs b/skyscraper/bn254-multiplier/benches/bench.rs index 72f252e39..75219c12e 100644 --- a/skyscraper/bn254-multiplier/benches/bench.rs +++ b/skyscraper/bn254-multiplier/benches/bench.rs @@ -149,6 +149,14 @@ mod sqr { .bench_local_values(|(a, b)| rne::simd_sqr(a, b)); } + #[divan::bench] + fn single_sqr_b51(bencher: Bencher) { + bencher + //.counter(ItemsCount::new(1usize)) + .with_inputs(|| rng().random()) + .bench_local_values(|a| rne::single::simd_sqr(a)); + } + #[divan::bench] fn ark_ff(bencher: Bencher) { use {ark_bn254::Fr, ark_ff::BigInt}; diff --git a/skyscraper/bn254-multiplier/src/rne/single.rs b/skyscraper/bn254-multiplier/src/rne/single.rs index 7e4de29b1..0e77b6e4a 100644 --- a/skyscraper/bn254-multiplier/src/rne/single.rs +++ b/skyscraper/bn254-multiplier/src/rne/single.rs @@ -6,122 +6,13 @@ use { crate::rne::{ constants::*, - simd_utils::{ - addv_simd, fma, i2f, make_initial, reduce_ct_simd, smult_noinit_simd, - transpose_simd_to_u256, transpose_u256_to_simd, u255_to_u256_shr_1_simd, - u256_to_u255_simd, - }, - }, - core::{ - ops::BitAnd, - simd::{num::SimdFloat, Simd}, + simd_utils::{fma, i2f, make_initial}, }, + core::simd::{num::SimdFloat, Simd}, seq_macro::seq, - std::simd::{ - num::{SimdInt, SimdUint}, - simd_swizzle, - }, + std::simd::{num::SimdUint, simd_swizzle}, }; -/// Two parallel Montgomery squarings: `(v0², v1²)`. -/// input must fit in 2^255-1; no runtime checking -#[inline] -pub fn simd_sqr(v0_a: [u64; 4], v1_a: [u64; 4]) -> ([u64; 4], [u64; 4]) { - let v0_a = u256_to_u255_simd(transpose_u256_to_simd([v0_a, v1_a])); - - let mut t: [Simd; 10] = [Simd::splat(0); 10]; - - for i in 0..5 { - let avi: Simd = i2f(v0_a[i]); - for j in (i + 1)..5 { - let bvj: Simd = i2f(v0_a[j]); - let p_hi = fma(avi, bvj, Simd::splat(C1)); - let p_lo = fma(avi, bvj, Simd::splat(C2) - p_hi); - t[i + j + 1] += p_hi.to_bits().cast(); - t[i + j] += p_lo.to_bits().cast(); - } - } - - // Most shifting operations are more expensive addition thus for multiplying by - // 2 we use addition. - for i in 1..=8 { - t[i] += t[i]; - } - - for i in 0..5 { - let avi: Simd = i2f(v0_a[i]); - let p_hi = fma(avi, avi, Simd::splat(C1)); - let p_lo = fma(avi, avi, Simd::splat(C2) - p_hi); - t[i + i + 1] += p_hi.to_bits().cast(); - t[i + i] += p_lo.to_bits().cast(); - } - - t[0] += Simd::splat(make_initial(1, 0)); - t[9] += Simd::splat(make_initial(0, 6)); - t[1] += Simd::splat(make_initial(2, 1)); - t[8] += Simd::splat(make_initial(6, 7)); - t[2] += Simd::splat(make_initial(3, 2)); - t[7] += Simd::splat(make_initial(7, 8)); - t[3] += Simd::splat(make_initial(4, 3)); - t[6] += Simd::splat(make_initial(8, 9)); - t[4] += Simd::splat(make_initial(10, 4)); - t[5] += Simd::splat(make_initial(9, 10)); - - t[1] += t[0] >> 51; - t[2] += t[1] >> 51; - t[3] += t[2] >> 51; - t[4] += t[3] >> 51; - - let r0 = smult_noinit_simd(t[0].cast().bitand(Simd::splat(MASK51)), RHO_4); - let r1 = smult_noinit_simd(t[1].cast().bitand(Simd::splat(MASK51)), RHO_3); - let r2 = smult_noinit_simd(t[2].cast().bitand(Simd::splat(MASK51)), RHO_2); - let r3 = smult_noinit_simd(t[3].cast().bitand(Simd::splat(MASK51)), RHO_1); - - let s = [ - r0[0] + r1[0] + r2[0] + r3[0] + t[4], - r0[1] + r1[1] + r2[1] + r3[1] + t[5], - r0[2] + r1[2] + r2[2] + r3[2] + t[6], - r0[3] + r1[3] + r2[3] + r3[3] + t[7], - r0[4] + r1[4] + r2[4] + r3[4] + t[8], - r0[5] + r1[5] + r2[5] + r3[5] + t[9], - ]; - - // The upper bits of s will not affect the lower 51 bits of the product and - // therefore we only have to bitmask once. - let m = (s[0].cast() * Simd::splat(U51_NP0)).bitand(Simd::splat(MASK51)); - let mp = smult_noinit_simd(m, U51_P); - - let mut addi = addv_simd(s, mp); - // Apply carries before dropping the last limb - addi[1] += addi[0] >> 51; - let addi = [addi[1], addi[2], addi[3], addi[4], addi[5]]; - - // 1 bit reduction to go from R^-255 to R^-256. reduce_ct does the preparation - // and the final shift is done as part of the conversion back to u256 - let reduced = reduce_ct_simd(addi); - let reduced = redundant_carry(reduced); - let u256_result = u255_to_u256_shr_1_simd(reduced); - let v = transpose_simd_to_u256(u256_result); - (v[0], v[1]) -} - -#[inline(always)] -fn redundant_carry(t: [Simd; N]) -> [Simd; N] -where - std::simd::LaneCount: std::simd::SupportedLaneCount, -{ - let mut borrow = Simd::splat(0); - let mut res = [Simd::splat(0); N]; - for i in 0..t.len() - 1 { - let tmp = t[i] + borrow; - res[i] = (tmp.cast()).bitand(Simd::splat(MASK51)); - borrow = tmp >> 51; - } - - res[N - 1] = (t[N - 1] + borrow).cast(); - res -} - /// Move redundant carries from lower limbs to the higher limbs such that all /// limbs except the last one is 51 bits. The most significant limb can be /// larger than 51 bits as the input can be bigger 2^255-1. @@ -153,7 +44,7 @@ pub fn u256_to_u255(limbs: [u64; 4]) -> [u64; 5] { ] } -pub fn i2f_scalar(a: u64) -> f64 { +pub const fn i2f_scalar(a: u64) -> f64 { // This function has no target gating as we want to verify this function with // kani and proptest on a different platform than wasm @@ -320,6 +211,107 @@ pub fn u255_to_u256_shr_1(limbs: [u64; 5]) -> [u64; 4] { ] } +/// Two parallel Montgomery multiplications: `(v0_a*v0_b, v1_a*v1_b)`. +/// input must fit in 2^255-1; no runtime checking +#[inline(always)] +pub fn simd_sqr(v0_a: [u64; 4]) -> [u64; 4] { + let v0_a = u256_to_u255(v0_a); + let v0_b = v0_a; + + let mut ts = [Simd::splat(0); 10]; + + // Offset multiplication to have less intermediate data + seq!(i in 0..5{ + let ai: Simd = i2f(Simd::splat(v0_a[i])); + let b01: Simd = i2f(Simd::from_array([v0_b[0], v0_b[1]])); + let p_hi = fma(ai, b01, Simd::splat(C1)); + let p_lo = fma(ai, b01, Simd::splat(C2) - p_hi); + ts[i+1] += p_hi.to_bits().cast(); + ts[i+0] += p_lo.to_bits().cast(); + + let b23: Simd = i2f(Simd::from_array([v0_b[2], v0_b[3]])); + let p_hi = fma(ai, b23, Simd::splat(C1)); + let p_lo = fma(ai, b23, Simd::splat(C2) - p_hi); + ts[i+3] += p_hi.to_bits().cast(); + ts[i+2] += p_lo.to_bits().cast(); + + let b4 = Simd::from_array([i2f_scalar(v0_b[4]),0.]); + let p_hi = fma(ai, b4, Simd::splat(C1)); + let p_lo = fma(ai, b4, Simd::splat(C2) - p_hi); + ts[i + 5] += p_hi.to_bits().cast(); + ts[i + 4] += p_lo.to_bits().cast(); + + }); + + ts[0] += Simd::from_array([make_initial(1, 0), make_initial(2, 1)]); + ts[2] += Simd::from_array([make_initial(3, 2), make_initial(4, 3)]); + ts[4] += Simd::from_array([make_initial(10, 4), make_initial(10, 10)]); + ts[6] += Simd::from_array([make_initial(9, 10), make_initial(8, 9)]); + ts[8] += Simd::from_array([make_initial(7, 8), make_initial(6, 7)]); + + let mut t: [i64; 4] = [0; 4]; + + seq!( i in 0..2 { + let s = i * 2; + ts[s] += simd_swizzle!(Simd::splat(0), ts[s + 1], [0, 2]); + ts[s + 2] += simd_swizzle!(Simd::splat(0), ts[s + 1], [3, 0]); + t[s] = ts[s][0]; + t[s + 1] = ts[s][1]; + }); + + // sign extend redundant carries + t[1] += t[0] >> 51; + t[2] += t[1] >> 51; + t[3] += t[2] >> 51; + + // Lift carry into SIMD to prevent extraction + ts[4] += Simd::from_array([t[3] >> 51, 0]); + + let r0 = smult_noinit(t[0] as u64 & MASK51, RHO_4); + let r1 = smult_noinit(t[1] as u64 & MASK51, RHO_3); + let r2 = smult_noinit(t[2] as u64 & MASK51, RHO_2); + let r3 = smult_noinit(t[3] as u64 & MASK51, RHO_1); + + let mut ss = [ts[4], ts[5], ts[6], ts[7], ts[8], ts[9]]; + + seq!( i in 0..6 { + ss[i] += r0[i] + r1[i] + r2[i] + r3[i]; + }); + + let m = (ss[0][0] as u64).wrapping_mul(U51_NP0) & MASK51; + let mp = smult_noinit(m, U51_P); + + seq!( i in 0..6 { + ss[i] += mp[i]; + }); + + // Get rid of redundant SIMD form + seq!( i in 0..2 { + let s = i * 2; + ss[s] += simd_swizzle!(Simd::splat(0), ss[s + 1], [0, 2]); + ss[s + 2] += simd_swizzle!(Simd::splat(0), ss[s + 1], [3, 0]); + }); + ss[5 - 1] += simd_swizzle!(Simd::splat(0), ss[5], [0, 2]); + + // Move to redundant scalar form + let mut s: [i64; 6] = [0; 6]; + seq!(i in 0..3 { + let k = i * 2; + s[k] = ss[k][0]; + s[k + 1] = ss[k][1]; + }); + + s[1] += s[0] >> 51; + let s = [s[1], s[2], s[3], s[4], s[5]]; + + // 1 bit reduction to go from R^-255 to R^-256. reduce_ct does the preparation + // and the final shift is done as part of the conversion back to u256 + let reduced = reduce_ct(s); + let reduced = redundant_carry_scalar(reduced); + let u256_result = u255_to_u256_shr_1(reduced); + u256_result +} + #[cfg(not(target_arch = "wasm32"))] #[cfg(test)] mod tests { @@ -351,21 +343,18 @@ mod tests { }) } - // #[test] - // fn test_simd_sqr() { - // proptest!(|( - // a in limbs5_51(), - // b in limbs5_51(), - // )| { - // let a: [Simd;_] = a.map(Simd::splat); - // let b: [Simd;_] = b.map(Simd::splat); - // let a = u255_to_u256_simd(a).map(|x|x[0]); - // let b = u255_to_u256_simd(b).map(|x|x[0]); - // let (a2, _b2) = simd_mul(a, b, b); - // let (a2s, _b2s) = simd_sqr(a, b); - // prop_assert_eq!(a2, a2s); - // }) - // } + #[test] + fn test_simd_sqr() { + proptest!(|( + a in limbs5_51(), + )| { + let a: [Simd;_] = a.map(Simd::splat); + let a = u255_to_u256_simd(a).map(|x|x[0]); + let a2 = simd_mul(a, a); + let b2 = simd_sqr(a); + prop_assert_eq!(a2, b2); + }) + } fn limb51() -> impl Strategy { 0u64..(1u64 << 51) From 15c69f70868d3afe39631c25f33d0f0864016d7e Mon Sep 17 00:00:00 2001 From: Xander van der Goot Date: Mon, 2 Feb 2026 19:14:22 +0800 Subject: [PATCH 11/21] rne/mono: sqr diagonal and off-diagonal terms --- skyscraper/bn254-multiplier/src/rne/single.rs | 106 +++++++++++------- 1 file changed, 64 insertions(+), 42 deletions(-) diff --git a/skyscraper/bn254-multiplier/src/rne/single.rs b/skyscraper/bn254-multiplier/src/rne/single.rs index 0e77b6e4a..446f312eb 100644 --- a/skyscraper/bn254-multiplier/src/rne/single.rs +++ b/skyscraper/bn254-multiplier/src/rne/single.rs @@ -10,7 +10,10 @@ use { }, core::simd::{num::SimdFloat, Simd}, seq_macro::seq, - std::simd::{num::SimdUint, simd_swizzle}, + std::{ + ops::BitAnd, + simd::{num::SimdUint, simd_swizzle}, + }, }; /// Move redundant carries from lower limbs to the higher limbs such that all @@ -220,34 +223,44 @@ pub fn simd_sqr(v0_a: [u64; 4]) -> [u64; 4] { let mut ts = [Simd::splat(0); 10]; - // Offset multiplication to have less intermediate data - seq!(i in 0..5{ + for i in 0..5 { let ai: Simd = i2f(Simd::splat(v0_a[i])); - let b01: Simd = i2f(Simd::from_array([v0_b[0], v0_b[1]])); - let p_hi = fma(ai, b01, Simd::splat(C1)); - let p_lo = fma(ai, b01, Simd::splat(C2) - p_hi); - ts[i+1] += p_hi.to_bits().cast(); - ts[i+0] += p_lo.to_bits().cast(); - - let b23: Simd = i2f(Simd::from_array([v0_b[2], v0_b[3]])); - let p_hi = fma(ai, b23, Simd::splat(C1)); - let p_lo = fma(ai, b23, Simd::splat(C2) - p_hi); - ts[i+3] += p_hi.to_bits().cast(); - ts[i+2] += p_lo.to_bits().cast(); + for j in ((i + 1)..5).step_by(2) { + let l = v0_b[j]; + let r = if j + 1 < 5 { v0_b[j + 1] } else { 0 }; + let b01: Simd = i2f(Simd::from_array([l, r])); + let p_hi = fma(ai, b01, Simd::splat(C1)); + let p_lo = fma(ai, b01, Simd::splat(C2) - p_hi); + ts[i + j + 1] += p_hi.to_bits().cast(); + ts[i + j] += p_lo.to_bits().cast(); + } + } - let b4 = Simd::from_array([i2f_scalar(v0_b[4]),0.]); - let p_hi = fma(ai, b4, Simd::splat(C1)); - let p_lo = fma(ai, b4, Simd::splat(C2) - p_hi); - ts[i + 5] += p_hi.to_bits().cast(); - ts[i + 4] += p_lo.to_bits().cast(); + for i in (0..4).step_by(2) { + let ai: Simd = i2f(Simd::from_array([v0_a[i], v0_a[i + 1]])); + let p_hi = fma(ai, ai, Simd::splat(C1)); + let p_lo = fma(ai, ai, Simd::splat(C2) - p_hi); + let l_mask = Simd::from_array([0xffffffffffffffff_u64, 0]); + let r_mask = Simd::from_array([0, 0xffffffffffffffff_u64]); + ts[2 * (i + 1)] += p_hi.to_bits().bitand(r_mask).cast(); + ts[2 * (i + 1) - 1] += p_lo.to_bits().bitand(r_mask).cast(); + + ts[2 * i + 1] += p_hi.to_bits().bitand(l_mask).cast(); + ts[2 * i] += p_lo.to_bits().bitand(l_mask).cast(); + } - }); + let ai: Simd = i2f(Simd::from_array([v0_a[4], 0])); + let p_hi = fma(ai, ai, Simd::splat(C1)); + let p_lo = fma(ai, ai, Simd::splat(C2) - p_hi); + let l_mask = Simd::from_array([0xffffffffffffffff_u64, 0]); + ts[2 * 4 + 1] += p_hi.to_bits().bitand(l_mask).cast(); + ts[2 * 4] += p_lo.to_bits().bitand(l_mask).cast(); - ts[0] += Simd::from_array([make_initial(1, 0), make_initial(2, 1)]); - ts[2] += Simd::from_array([make_initial(3, 2), make_initial(4, 3)]); - ts[4] += Simd::from_array([make_initial(10, 4), make_initial(10, 10)]); - ts[6] += Simd::from_array([make_initial(9, 10), make_initial(8, 9)]); - ts[8] += Simd::from_array([make_initial(7, 8), make_initial(6, 7)]); + ts[0] += Simd::from_array([make_initial(1, 0), make_initial(1, 1)]); + ts[2] += Simd::from_array([make_initial(2, 1), make_initial(2, 2)]); + ts[4] += Simd::from_array([make_initial(3, 2), make_initial(2, 3)]); + ts[6] += Simd::from_array([make_initial(3, 2), make_initial(1, 3)]); + ts[8] += Simd::from_array([make_initial(2, 1), make_initial(0, 2)]); let mut t: [i64; 4] = [0; 4]; @@ -259,6 +272,10 @@ pub fn simd_sqr(v0_a: [u64; 4]) -> [u64; 4] { t[s + 1] = ts[s][1]; }); + for (i, k) in t.iter().enumerate() { + println!("t[{i}]: {k:x}") + } + // sign extend redundant carries t[1] += t[0] >> 51; t[2] += t[1] >> 51; @@ -274,35 +291,40 @@ pub fn simd_sqr(v0_a: [u64; 4]) -> [u64; 4] { let mut ss = [ts[4], ts[5], ts[6], ts[7], ts[8], ts[9]]; - seq!( i in 0..6 { - ss[i] += r0[i] + r1[i] + r2[i] + r3[i]; - }); + // seq!( i in 0..6 { + // ss[i] += r0[i] + r1[i] + r2[i] + r3[i]; + // }); + + println!("ss[0][0]: {:x}", ss[0][0]); let m = (ss[0][0] as u64).wrapping_mul(U51_NP0) & MASK51; let mp = smult_noinit(m, U51_P); - seq!( i in 0..6 { - ss[i] += mp[i]; - }); + // seq!( i in 0..6 { + // ss[i] += mp[i]; + // }); // Get rid of redundant SIMD form - seq!( i in 0..2 { - let s = i * 2; - ss[s] += simd_swizzle!(Simd::splat(0), ss[s + 1], [0, 2]); - ss[s + 2] += simd_swizzle!(Simd::splat(0), ss[s + 1], [3, 0]); - }); - ss[5 - 1] += simd_swizzle!(Simd::splat(0), ss[5], [0, 2]); + ss[0] += simd_swizzle!(Simd::splat(0), ss[1], [0, 2]); + ss[2] += simd_swizzle!(Simd::splat(0), ss[1], [3, 0]); + ss[2] += simd_swizzle!(Simd::splat(0), ss[3], [0, 2]); + ss[4] += simd_swizzle!(Simd::splat(0), ss[3], [3, 0]); + ss[4] += simd_swizzle!(Simd::splat(0), ss[5], [0, 2]); // Move to redundant scalar form let mut s: [i64; 6] = [0; 6]; - seq!(i in 0..3 { - let k = i * 2; - s[k] = ss[k][0]; - s[k + 1] = ss[k][1]; - }); + s[0] = ss[0][0]; + s[0 + 1] = ss[0][1]; + s[2] = ss[2][0]; + s[2 + 1] = ss[2][1]; + s[4] = ss[4][0]; + s[4 + 1] = ss[4][1]; s[1] += s[0] >> 51; let s = [s[1], s[2], s[3], s[4], s[5]]; + for (i, k) in s.iter().enumerate() { + println!("s[{i}]: {k:x}") + } // 1 bit reduction to go from R^-255 to R^-256. reduce_ct does the preparation // and the final shift is done as part of the conversion back to u256 From b6b32b20cf4c1b259129572e793c2afa7bbb5599 Mon Sep 17 00:00:00 2001 From: Xander van der Goot Date: Mon, 2 Feb 2026 19:14:25 +0800 Subject: [PATCH 12/21] examples: add count.rs for debugging --- skyscraper/bn254-multiplier/examples/count.rs | 102 ++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 skyscraper/bn254-multiplier/examples/count.rs diff --git a/skyscraper/bn254-multiplier/examples/count.rs b/skyscraper/bn254-multiplier/examples/count.rs new file mode 100644 index 000000000..b204c8ce5 --- /dev/null +++ b/skyscraper/bn254-multiplier/examples/count.rs @@ -0,0 +1,102 @@ +use bn254_multiplier::rne::{ + make_initial, + single::{simd_mul, simd_sqr}, +}; + +const LO: usize = 1; +const HI: usize = 1 << 16; + +fn main() { + let t = diagonal(); + for (i, x) in t.iter().enumerate() { + println!("t[{i}]: lo: {}\t hi: {}", x & (HI - 1), x >> 16); + } + + let a = [0, 0, 0, 0]; + let res = simd_sqr(a); + for (i, k) in res.iter().enumerate() { + println!("res[{i}]: {:x}", k) + } + + println!("\nFULL"); + + let t = full(); + for (i, x) in t.iter().enumerate() { + let lo = x & (HI - 1); + let hi = x >> 16; + println!( + "t[{i}]: lo: {}\t hi: {}\t init: {:x}", + lo, + hi, + make_initial(lo as u64, hi as u64) + ); + } + + let a = [0, 0, 0, 0]; + let res = simd_mul(a, a); + for (i, k) in res.iter().enumerate() { + println!("res[{i}]: {:x}", k) + } +} + +fn diagonal() -> [usize; 12] { + let mut t = [0; 12]; + for i in 0..5 { + for j in ((i + 1)..5).step_by(2) { + println!("i: {i} j: {} {}", j, j + 1); + t[i + j + 2] += HI; + t[i + j + 1] += LO + HI; + t[i + j] += LO; + } + println!(); + } + + // scalar doubling + // needs to chop off the 1 in 01 and the 8 in 89 and feed it back into 12 and 78 + // respectively. + // for i in 1..=8 { + // t[i] += t[i]; + // } + + for i in (0..4).step_by(2) { + t[2 * (i + 1) + 1] += HI; + t[2 * (i + 1)] += LO; + t[2 * i + 1] += HI; + t[2 * i] += LO; + } + t[2 * 4 + 1] += HI; + t[2 * 4] += LO; + + // let i = 4; + // for _k in 0..5 { + // for j in 0..3 { + // t[i + 2 * j + 2] += HI; + // t[i + 2 * j + 1] += LO + HI; + // t[i + 2 * j] += LO; + // } + // } + + t +} + +fn full() -> [usize; 11] { + let mut t = [0; 11]; + for i in 0..5 { + for j in 0..3 { + t[i + 2 * j + 2] += HI; + t[i + 2 * j + 1] += LO + HI; + t[i + 2 * j] += LO; + } + } + + let i = 4; + for _k in 0..5 { + for j in 0..3 { + t[i + 2 * j + 2] += HI; + t[i + 2 * j + 1] += LO + HI; + t[i + 2 * j] += LO; + } + } + + t +} From a5820434557a4e2f98ef0d5e5ced915c9aba78d0 Mon Sep 17 00:00:00 2001 From: Xander van der Goot Date: Mon, 2 Feb 2026 19:14:28 +0800 Subject: [PATCH 13/21] rne/mono: add sqr reduction --- skyscraper/bn254-multiplier/examples/count.rs | 16 ++++++++-------- skyscraper/bn254-multiplier/src/rne/single.rs | 18 +++++++++--------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/skyscraper/bn254-multiplier/examples/count.rs b/skyscraper/bn254-multiplier/examples/count.rs index b204c8ce5..5eea1af65 100644 --- a/skyscraper/bn254-multiplier/examples/count.rs +++ b/skyscraper/bn254-multiplier/examples/count.rs @@ -67,14 +67,14 @@ fn diagonal() -> [usize; 12] { t[2 * 4 + 1] += HI; t[2 * 4] += LO; - // let i = 4; - // for _k in 0..5 { - // for j in 0..3 { - // t[i + 2 * j + 2] += HI; - // t[i + 2 * j + 1] += LO + HI; - // t[i + 2 * j] += LO; - // } - // } + let i = 4; + for _k in 0..5 { + for j in 0..3 { + t[i + 2 * j + 2] += HI; + t[i + 2 * j + 1] += LO + HI; + t[i + 2 * j] += LO; + } + } t } diff --git a/skyscraper/bn254-multiplier/src/rne/single.rs b/skyscraper/bn254-multiplier/src/rne/single.rs index 446f312eb..c945f5fb8 100644 --- a/skyscraper/bn254-multiplier/src/rne/single.rs +++ b/skyscraper/bn254-multiplier/src/rne/single.rs @@ -258,9 +258,9 @@ pub fn simd_sqr(v0_a: [u64; 4]) -> [u64; 4] { ts[0] += Simd::from_array([make_initial(1, 0), make_initial(1, 1)]); ts[2] += Simd::from_array([make_initial(2, 1), make_initial(2, 2)]); - ts[4] += Simd::from_array([make_initial(3, 2), make_initial(2, 3)]); - ts[6] += Simd::from_array([make_initial(3, 2), make_initial(1, 3)]); - ts[8] += Simd::from_array([make_initial(2, 1), make_initial(0, 2)]); + ts[4] += Simd::from_array([make_initial(8, 2), make_initial(7, 8)]); + ts[6] += Simd::from_array([make_initial(8, 7), make_initial(6, 8)]); + ts[8] += Simd::from_array([make_initial(7, 6), make_initial(5, 7)]); let mut t: [i64; 4] = [0; 4]; @@ -291,18 +291,18 @@ pub fn simd_sqr(v0_a: [u64; 4]) -> [u64; 4] { let mut ss = [ts[4], ts[5], ts[6], ts[7], ts[8], ts[9]]; - // seq!( i in 0..6 { - // ss[i] += r0[i] + r1[i] + r2[i] + r3[i]; - // }); + seq!( i in 0..6 { + ss[i] += r0[i] + r1[i] + r2[i] + r3[i]; + }); println!("ss[0][0]: {:x}", ss[0][0]); let m = (ss[0][0] as u64).wrapping_mul(U51_NP0) & MASK51; let mp = smult_noinit(m, U51_P); - // seq!( i in 0..6 { - // ss[i] += mp[i]; - // }); + seq!( i in 0..6 { + ss[i] += mp[i]; + }); // Get rid of redundant SIMD form ss[0] += simd_swizzle!(Simd::splat(0), ss[1], [0, 2]); From 74bc28a71b95248c0f64b849454cf3a80d788bc9 Mon Sep 17 00:00:00 2001 From: Xander van der Goot Date: Mon, 2 Feb 2026 19:14:31 +0800 Subject: [PATCH 14/21] rne/mono: working sqr implementation --- skyscraper/bn254-multiplier/examples/count.rs | 6 +++--- skyscraper/bn254-multiplier/src/rne/single.rs | 21 ++++++++++++++----- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/skyscraper/bn254-multiplier/examples/count.rs b/skyscraper/bn254-multiplier/examples/count.rs index 5eea1af65..5d908117b 100644 --- a/skyscraper/bn254-multiplier/examples/count.rs +++ b/skyscraper/bn254-multiplier/examples/count.rs @@ -54,9 +54,9 @@ fn diagonal() -> [usize; 12] { // scalar doubling // needs to chop off the 1 in 01 and the 8 in 89 and feed it back into 12 and 78 // respectively. - // for i in 1..=8 { - // t[i] += t[i]; - // } + for i in 1..=8 { + t[i] += t[i]; + } for i in (0..4).step_by(2) { t[2 * (i + 1) + 1] += HI; diff --git a/skyscraper/bn254-multiplier/src/rne/single.rs b/skyscraper/bn254-multiplier/src/rne/single.rs index c945f5fb8..f027f6b68 100644 --- a/skyscraper/bn254-multiplier/src/rne/single.rs +++ b/skyscraper/bn254-multiplier/src/rne/single.rs @@ -236,6 +236,17 @@ pub fn simd_sqr(v0_a: [u64; 4]) -> [u64; 4] { } } + ts[1] += simd_swizzle!(Simd::splat(0), ts[0], [3, 0]); + ts[0] = ts[0].bitand(Simd::from_array([-1, 0])); + + ts[9] += simd_swizzle!(Simd::splat(0), ts[8], [3, 0]); + ts[8] = ts[8].bitand(Simd::from_array([-1, 0])); + + for i in 1..=8 { + ts[i] += ts[i] + } + + // Diagonal for i in (0..4).step_by(2) { let ai: Simd = i2f(Simd::from_array([v0_a[i], v0_a[i + 1]])); let p_hi = fma(ai, ai, Simd::splat(C1)); @@ -256,11 +267,11 @@ pub fn simd_sqr(v0_a: [u64; 4]) -> [u64; 4] { ts[2 * 4 + 1] += p_hi.to_bits().bitand(l_mask).cast(); ts[2 * 4] += p_lo.to_bits().bitand(l_mask).cast(); - ts[0] += Simd::from_array([make_initial(1, 0), make_initial(1, 1)]); - ts[2] += Simd::from_array([make_initial(2, 1), make_initial(2, 2)]); - ts[4] += Simd::from_array([make_initial(8, 2), make_initial(7, 8)]); - ts[6] += Simd::from_array([make_initial(8, 7), make_initial(6, 8)]); - ts[8] += Simd::from_array([make_initial(7, 6), make_initial(5, 7)]); + ts[0] += Simd::from_array([make_initial(1, 0), make_initial(2, 1)]); + ts[2] += Simd::from_array([make_initial(3, 2), make_initial(4, 3)]); + ts[4] += Simd::from_array([make_initial(10, 4), make_initial(9, 10)]); + ts[6] += Simd::from_array([make_initial(10, 9), make_initial(7, 10)]); + ts[8] += Simd::from_array([make_initial(8, 7), make_initial(5, 7)]); let mut t: [i64; 4] = [0; 4]; From 3c08555e19101bf7bc8cbfe516ed7905cde0fbb9 Mon Sep 17 00:00:00 2001 From: Xander van der Goot Date: Mon, 2 Feb 2026 19:14:34 +0800 Subject: [PATCH 15/21] rne/mono: use seq_macro for loop unrolling --- skyscraper/bn254-multiplier/src/rne/single.rs | 34 +++++++------------ 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/skyscraper/bn254-multiplier/src/rne/single.rs b/skyscraper/bn254-multiplier/src/rne/single.rs index f027f6b68..6b8b6071e 100644 --- a/skyscraper/bn254-multiplier/src/rne/single.rs +++ b/skyscraper/bn254-multiplier/src/rne/single.rs @@ -223,7 +223,7 @@ pub fn simd_sqr(v0_a: [u64; 4]) -> [u64; 4] { let mut ts = [Simd::splat(0); 10]; - for i in 0..5 { + seq!( i in 0..5 { let ai: Simd = i2f(Simd::splat(v0_a[i])); for j in ((i + 1)..5).step_by(2) { let l = v0_b[j]; @@ -234,7 +234,7 @@ pub fn simd_sqr(v0_a: [u64; 4]) -> [u64; 4] { ts[i + j + 1] += p_hi.to_bits().cast(); ts[i + j] += p_lo.to_bits().cast(); } - } + }); ts[1] += simd_swizzle!(Simd::splat(0), ts[0], [3, 0]); ts[0] = ts[0].bitand(Simd::from_array([-1, 0])); @@ -242,23 +242,24 @@ pub fn simd_sqr(v0_a: [u64; 4]) -> [u64; 4] { ts[9] += simd_swizzle!(Simd::splat(0), ts[8], [3, 0]); ts[8] = ts[8].bitand(Simd::from_array([-1, 0])); - for i in 1..=8 { - ts[i] += ts[i] - } + seq!( i in 1..=8 { + ts[i] += ts[i]; + }); // Diagonal - for i in (0..4).step_by(2) { - let ai: Simd = i2f(Simd::from_array([v0_a[i], v0_a[i + 1]])); + seq!( i in 0..2 { + let k = i * 2; + let ai: Simd = i2f(Simd::from_array([v0_a[k], v0_a[k + 1]])); let p_hi = fma(ai, ai, Simd::splat(C1)); let p_lo = fma(ai, ai, Simd::splat(C2) - p_hi); let l_mask = Simd::from_array([0xffffffffffffffff_u64, 0]); let r_mask = Simd::from_array([0, 0xffffffffffffffff_u64]); - ts[2 * (i + 1)] += p_hi.to_bits().bitand(r_mask).cast(); - ts[2 * (i + 1) - 1] += p_lo.to_bits().bitand(r_mask).cast(); + ts[2 * (k + 1)] += p_hi.to_bits().bitand(r_mask).cast(); + ts[2 * (k + 1) - 1] += p_lo.to_bits().bitand(r_mask).cast(); - ts[2 * i + 1] += p_hi.to_bits().bitand(l_mask).cast(); - ts[2 * i] += p_lo.to_bits().bitand(l_mask).cast(); - } + ts[2 * k + 1] += p_hi.to_bits().bitand(l_mask).cast(); + ts[2 * k] += p_lo.to_bits().bitand(l_mask).cast(); + }); let ai: Simd = i2f(Simd::from_array([v0_a[4], 0])); let p_hi = fma(ai, ai, Simd::splat(C1)); @@ -283,10 +284,6 @@ pub fn simd_sqr(v0_a: [u64; 4]) -> [u64; 4] { t[s + 1] = ts[s][1]; }); - for (i, k) in t.iter().enumerate() { - println!("t[{i}]: {k:x}") - } - // sign extend redundant carries t[1] += t[0] >> 51; t[2] += t[1] >> 51; @@ -306,8 +303,6 @@ pub fn simd_sqr(v0_a: [u64; 4]) -> [u64; 4] { ss[i] += r0[i] + r1[i] + r2[i] + r3[i]; }); - println!("ss[0][0]: {:x}", ss[0][0]); - let m = (ss[0][0] as u64).wrapping_mul(U51_NP0) & MASK51; let mp = smult_noinit(m, U51_P); @@ -333,9 +328,6 @@ pub fn simd_sqr(v0_a: [u64; 4]) -> [u64; 4] { s[1] += s[0] >> 51; let s = [s[1], s[2], s[3], s[4], s[5]]; - for (i, k) in s.iter().enumerate() { - println!("s[{i}]: {k:x}") - } // 1 bit reduction to go from R^-255 to R^-256. reduce_ct does the preparation // and the final shift is done as part of the conversion back to u256 From 4fa46ae352f6874e6f66a305e80e5bf6e3baf4fb Mon Sep 17 00:00:00 2001 From: Xander van der Goot Date: Mon, 2 Feb 2026 19:14:37 +0800 Subject: [PATCH 16/21] rne/mono: optimize sqr doubling step --- skyscraper/bn254-multiplier/benches/bench.rs | 8 ++++++++ skyscraper/bn254-multiplier/examples/count.rs | 9 +++++---- skyscraper/bn254-multiplier/src/rne/single.rs | 8 +------- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/skyscraper/bn254-multiplier/benches/bench.rs b/skyscraper/bn254-multiplier/benches/bench.rs index 75219c12e..b13581da6 100644 --- a/skyscraper/bn254-multiplier/benches/bench.rs +++ b/skyscraper/bn254-multiplier/benches/bench.rs @@ -157,6 +157,14 @@ mod sqr { .bench_local_values(|a| rne::single::simd_sqr(a)); } + #[divan::bench] + fn single_mul_sqr_b51(bencher: Bencher) { + bencher + //.counter(ItemsCount::new(1usize)) + .with_inputs(|| rng().random()) + .bench_local_values(|a| rne::single::simd_mul(a, a)); + } + #[divan::bench] fn ark_ff(bencher: Bencher) { use {ark_bn254::Fr, ark_ff::BigInt}; diff --git a/skyscraper/bn254-multiplier/examples/count.rs b/skyscraper/bn254-multiplier/examples/count.rs index 5d908117b..f447288f5 100644 --- a/skyscraper/bn254-multiplier/examples/count.rs +++ b/skyscraper/bn254-multiplier/examples/count.rs @@ -51,10 +51,11 @@ fn diagonal() -> [usize; 12] { println!(); } - // scalar doubling - // needs to chop off the 1 in 01 and the 8 in 89 and feed it back into 12 and 78 - // respectively. - for i in 1..=8 { + // Since there is no data of the diagionals yet it is safe to double them. This + // means that t[8][1] = t[9] needs to be included. + // t[0][1] is not touched by the diagonalisation so no need to include t[0] + + for i in 1..=9 { t[i] += t[i]; } diff --git a/skyscraper/bn254-multiplier/src/rne/single.rs b/skyscraper/bn254-multiplier/src/rne/single.rs index 6b8b6071e..d63ca3a06 100644 --- a/skyscraper/bn254-multiplier/src/rne/single.rs +++ b/skyscraper/bn254-multiplier/src/rne/single.rs @@ -236,12 +236,6 @@ pub fn simd_sqr(v0_a: [u64; 4]) -> [u64; 4] { } }); - ts[1] += simd_swizzle!(Simd::splat(0), ts[0], [3, 0]); - ts[0] = ts[0].bitand(Simd::from_array([-1, 0])); - - ts[9] += simd_swizzle!(Simd::splat(0), ts[8], [3, 0]); - ts[8] = ts[8].bitand(Simd::from_array([-1, 0])); - seq!( i in 1..=8 { ts[i] += ts[i]; }); @@ -272,7 +266,7 @@ pub fn simd_sqr(v0_a: [u64; 4]) -> [u64; 4] { ts[2] += Simd::from_array([make_initial(3, 2), make_initial(4, 3)]); ts[4] += Simd::from_array([make_initial(10, 4), make_initial(9, 10)]); ts[6] += Simd::from_array([make_initial(10, 9), make_initial(7, 10)]); - ts[8] += Simd::from_array([make_initial(8, 7), make_initial(5, 7)]); + ts[8] += Simd::from_array([make_initial(8, 7), make_initial(5, 8)]); let mut t: [i64; 4] = [0; 4]; From 0117fb008fceee151726c9f14ea895aeba620287 Mon Sep 17 00:00:00 2001 From: Xander van der Goot Date: Mon, 2 Feb 2026 19:14:40 +0800 Subject: [PATCH 17/21] rne/mono: use mul for sqr and let compiler optimise --- skyscraper/bn254-multiplier/src/rne/single.rs | 112 +----------------- 1 file changed, 1 insertion(+), 111 deletions(-) diff --git a/skyscraper/bn254-multiplier/src/rne/single.rs b/skyscraper/bn254-multiplier/src/rne/single.rs index d63ca3a06..22a027544 100644 --- a/skyscraper/bn254-multiplier/src/rne/single.rs +++ b/skyscraper/bn254-multiplier/src/rne/single.rs @@ -218,117 +218,7 @@ pub fn u255_to_u256_shr_1(limbs: [u64; 5]) -> [u64; 4] { /// input must fit in 2^255-1; no runtime checking #[inline(always)] pub fn simd_sqr(v0_a: [u64; 4]) -> [u64; 4] { - let v0_a = u256_to_u255(v0_a); - let v0_b = v0_a; - - let mut ts = [Simd::splat(0); 10]; - - seq!( i in 0..5 { - let ai: Simd = i2f(Simd::splat(v0_a[i])); - for j in ((i + 1)..5).step_by(2) { - let l = v0_b[j]; - let r = if j + 1 < 5 { v0_b[j + 1] } else { 0 }; - let b01: Simd = i2f(Simd::from_array([l, r])); - let p_hi = fma(ai, b01, Simd::splat(C1)); - let p_lo = fma(ai, b01, Simd::splat(C2) - p_hi); - ts[i + j + 1] += p_hi.to_bits().cast(); - ts[i + j] += p_lo.to_bits().cast(); - } - }); - - seq!( i in 1..=8 { - ts[i] += ts[i]; - }); - - // Diagonal - seq!( i in 0..2 { - let k = i * 2; - let ai: Simd = i2f(Simd::from_array([v0_a[k], v0_a[k + 1]])); - let p_hi = fma(ai, ai, Simd::splat(C1)); - let p_lo = fma(ai, ai, Simd::splat(C2) - p_hi); - let l_mask = Simd::from_array([0xffffffffffffffff_u64, 0]); - let r_mask = Simd::from_array([0, 0xffffffffffffffff_u64]); - ts[2 * (k + 1)] += p_hi.to_bits().bitand(r_mask).cast(); - ts[2 * (k + 1) - 1] += p_lo.to_bits().bitand(r_mask).cast(); - - ts[2 * k + 1] += p_hi.to_bits().bitand(l_mask).cast(); - ts[2 * k] += p_lo.to_bits().bitand(l_mask).cast(); - }); - - let ai: Simd = i2f(Simd::from_array([v0_a[4], 0])); - let p_hi = fma(ai, ai, Simd::splat(C1)); - let p_lo = fma(ai, ai, Simd::splat(C2) - p_hi); - let l_mask = Simd::from_array([0xffffffffffffffff_u64, 0]); - ts[2 * 4 + 1] += p_hi.to_bits().bitand(l_mask).cast(); - ts[2 * 4] += p_lo.to_bits().bitand(l_mask).cast(); - - ts[0] += Simd::from_array([make_initial(1, 0), make_initial(2, 1)]); - ts[2] += Simd::from_array([make_initial(3, 2), make_initial(4, 3)]); - ts[4] += Simd::from_array([make_initial(10, 4), make_initial(9, 10)]); - ts[6] += Simd::from_array([make_initial(10, 9), make_initial(7, 10)]); - ts[8] += Simd::from_array([make_initial(8, 7), make_initial(5, 8)]); - - let mut t: [i64; 4] = [0; 4]; - - seq!( i in 0..2 { - let s = i * 2; - ts[s] += simd_swizzle!(Simd::splat(0), ts[s + 1], [0, 2]); - ts[s + 2] += simd_swizzle!(Simd::splat(0), ts[s + 1], [3, 0]); - t[s] = ts[s][0]; - t[s + 1] = ts[s][1]; - }); - - // sign extend redundant carries - t[1] += t[0] >> 51; - t[2] += t[1] >> 51; - t[3] += t[2] >> 51; - - // Lift carry into SIMD to prevent extraction - ts[4] += Simd::from_array([t[3] >> 51, 0]); - - let r0 = smult_noinit(t[0] as u64 & MASK51, RHO_4); - let r1 = smult_noinit(t[1] as u64 & MASK51, RHO_3); - let r2 = smult_noinit(t[2] as u64 & MASK51, RHO_2); - let r3 = smult_noinit(t[3] as u64 & MASK51, RHO_1); - - let mut ss = [ts[4], ts[5], ts[6], ts[7], ts[8], ts[9]]; - - seq!( i in 0..6 { - ss[i] += r0[i] + r1[i] + r2[i] + r3[i]; - }); - - let m = (ss[0][0] as u64).wrapping_mul(U51_NP0) & MASK51; - let mp = smult_noinit(m, U51_P); - - seq!( i in 0..6 { - ss[i] += mp[i]; - }); - - // Get rid of redundant SIMD form - ss[0] += simd_swizzle!(Simd::splat(0), ss[1], [0, 2]); - ss[2] += simd_swizzle!(Simd::splat(0), ss[1], [3, 0]); - ss[2] += simd_swizzle!(Simd::splat(0), ss[3], [0, 2]); - ss[4] += simd_swizzle!(Simd::splat(0), ss[3], [3, 0]); - ss[4] += simd_swizzle!(Simd::splat(0), ss[5], [0, 2]); - - // Move to redundant scalar form - let mut s: [i64; 6] = [0; 6]; - s[0] = ss[0][0]; - s[0 + 1] = ss[0][1]; - s[2] = ss[2][0]; - s[2 + 1] = ss[2][1]; - s[4] = ss[4][0]; - s[4 + 1] = ss[4][1]; - - s[1] += s[0] >> 51; - let s = [s[1], s[2], s[3], s[4], s[5]]; - - // 1 bit reduction to go from R^-255 to R^-256. reduce_ct does the preparation - // and the final shift is done as part of the conversion back to u256 - let reduced = reduce_ct(s); - let reduced = redundant_carry_scalar(reduced); - let u256_result = u255_to_u256_shr_1(reduced); - u256_result + simd_mul(v0_a, v0_a) } #[cfg(not(target_arch = "wasm32"))] From e60d82672b41445abaedda8c5da498620c15b581 Mon Sep 17 00:00:00 2001 From: Xander van der Goot Date: Mon, 2 Feb 2026 19:14:42 +0800 Subject: [PATCH 18/21] rne/mono: improve naming and add docs --- skyscraper/bn254-multiplier/benches/bench.rs | 6 +- skyscraper/bn254-multiplier/examples/count.rs | 6 +- .../bn254-multiplier/src/rne/portable_simd.rs | 18 ++-- skyscraper/bn254-multiplier/src/rne/single.rs | 89 +++++++++---------- skyscraper/bn254-multiplier/src/test_utils.rs | 10 ++- 5 files changed, 60 insertions(+), 69 deletions(-) diff --git a/skyscraper/bn254-multiplier/benches/bench.rs b/skyscraper/bn254-multiplier/benches/bench.rs index b13581da6..9ff580796 100644 --- a/skyscraper/bn254-multiplier/benches/bench.rs +++ b/skyscraper/bn254-multiplier/benches/bench.rs @@ -36,7 +36,7 @@ mod mul { bencher //.counter(ItemsCount::new(1usize)) .with_inputs(|| rng().random()) - .bench_local_values(|(a, b)| rne::single::simd_mul(a, b)); + .bench_local_values(|(a, b)| rne::single::mul(a, b)); } #[divan::bench] @@ -154,7 +154,7 @@ mod sqr { bencher //.counter(ItemsCount::new(1usize)) .with_inputs(|| rng().random()) - .bench_local_values(|a| rne::single::simd_sqr(a)); + .bench_local_values(|a| rne::single::sqr(a)); } #[divan::bench] @@ -162,7 +162,7 @@ mod sqr { bencher //.counter(ItemsCount::new(1usize)) .with_inputs(|| rng().random()) - .bench_local_values(|a| rne::single::simd_mul(a, a)); + .bench_local_values(|a| rne::single::mul(a, a)); } #[divan::bench] diff --git a/skyscraper/bn254-multiplier/examples/count.rs b/skyscraper/bn254-multiplier/examples/count.rs index f447288f5..f5fcafc35 100644 --- a/skyscraper/bn254-multiplier/examples/count.rs +++ b/skyscraper/bn254-multiplier/examples/count.rs @@ -1,6 +1,6 @@ use bn254_multiplier::rne::{ make_initial, - single::{simd_mul, simd_sqr}, + single::{mul, sqr}, }; const LO: usize = 1; @@ -13,7 +13,7 @@ fn main() { } let a = [0, 0, 0, 0]; - let res = simd_sqr(a); + let res = sqr(a); for (i, k) in res.iter().enumerate() { println!("res[{i}]: {:x}", k) } @@ -33,7 +33,7 @@ fn main() { } let a = [0, 0, 0, 0]; - let res = simd_mul(a, a); + let res = mul(a, a); for (i, k) in res.iter().enumerate() { println!("res[{i}]: {:x}", k) } diff --git a/skyscraper/bn254-multiplier/src/rne/portable_simd.rs b/skyscraper/bn254-multiplier/src/rne/portable_simd.rs index cef071596..634cde91b 100644 --- a/skyscraper/bn254-multiplier/src/rne/portable_simd.rs +++ b/skyscraper/bn254-multiplier/src/rne/portable_simd.rs @@ -200,13 +200,13 @@ pub fn simd_mul( mod tests { use { super::*, - crate::{rne::simd_utils::u255_to_u256_simd, test_utils::ark_ff_reference}, + crate::{ + rne::simd_utils::u255_to_u256_simd, + test_utils::{ark_ff_reference, limbs5_51}, + }, ark_bn254::Fr, ark_ff::{BigInt, PrimeField}, - proptest::{ - prelude::{prop, Strategy}, - prop_assert_eq, proptest, - }, + proptest::{prop_assert_eq, proptest}, }; #[test] @@ -248,12 +248,4 @@ mod tests { prop_assert_eq!(b2, b2s); }) } - - fn limb51() -> impl Strategy { - 0u64..(1u64 << 51) - } - - fn limbs5_51() -> impl Strategy { - prop::array::uniform5(limb51()) - } } diff --git a/skyscraper/bn254-multiplier/src/rne/single.rs b/skyscraper/bn254-multiplier/src/rne/single.rs index 22a027544..f8591c192 100644 --- a/skyscraper/bn254-multiplier/src/rne/single.rs +++ b/skyscraper/bn254-multiplier/src/rne/single.rs @@ -1,7 +1,4 @@ //! Portable SIMD Montgomery multiplication and squaring. -//! -//! Processes two independent field multiplications in parallel using 2-lane -//! SIMD. use { crate::rne::{ @@ -10,17 +7,14 @@ use { }, core::simd::{num::SimdFloat, Simd}, seq_macro::seq, - std::{ - ops::BitAnd, - simd::{num::SimdUint, simd_swizzle}, - }, + std::simd::{num::SimdUint, simd_swizzle}, }; /// Move redundant carries from lower limbs to the higher limbs such that all /// limbs except the last one is 51 bits. The most significant limb can be /// larger than 51 bits as the input can be bigger 2^255-1. #[inline(always)] -fn redundant_carry_scalar(t: [i64; N]) -> [u64; N] { +fn redundant_carry(t: [i64; N]) -> [u64; N] { let mut borrow = 0; let mut res = [0; N]; for i in 0..t.len() - 1 { @@ -89,7 +83,7 @@ pub fn smult_noinit(s: u64, v: [u64; 5]) -> [Simd; 6] { #[inline(always)] pub fn reduce_ct(mut a: [i64; 5]) -> [i64; 5] { - // To reduce Check whether the least significant bit is set + // When input is odd add P to prepare for division let mask = -(a[0] & 1); seq!( i in 0..5 { @@ -102,12 +96,12 @@ pub fn reduce_ct(mut a: [i64; 5]) -> [i64; 5] { a } -/// Two parallel Montgomery multiplications: `(v0_a*v0_b, v1_a*v1_b)`. +/// Montgomery multiplications: `(v0_a*v0_b, v1_a*v1_b)`. /// input must fit in 2^255-1; no runtime checking #[inline(always)] -pub fn simd_mul(v0_a: [u64; 4], v0_b: [u64; 4]) -> [u64; 4] { - let v0_a = u256_to_u255(v0_a); - let v0_b = u256_to_u255(v0_b); +pub fn mul(a: [u64; 4], b: [u64; 4]) -> [u64; 4] { + let a = u256_to_u255(a); + let b = u256_to_u255(b); let mut ts = [Simd::splat(0); 10]; ts[0] = Simd::from_array([make_initial(1, 0), make_initial(2, 1)]); @@ -116,22 +110,21 @@ pub fn simd_mul(v0_a: [u64; 4], v0_b: [u64; 4]) -> [u64; 4] { ts[6] = Simd::from_array([make_initial(9, 10), make_initial(8, 9)]); ts[8] = Simd::from_array([make_initial(7, 8), make_initial(6, 7)]); - // Offset multiplication to have less intermediate data seq!(i in 0..5{ - let ai: Simd = i2f(Simd::splat(v0_a[i])); - let b01: Simd = i2f(Simd::from_array([v0_b[0], v0_b[1]])); + let ai: Simd = i2f(Simd::splat(a[i])); + let b01: Simd = i2f(Simd::from_array([b[0], b[1]])); let p_hi = fma(ai, b01, Simd::splat(C1)); let p_lo = fma(ai, b01, Simd::splat(C2) - p_hi); ts[i+1] += p_hi.to_bits().cast(); ts[i+0] += p_lo.to_bits().cast(); - let b23: Simd = i2f(Simd::from_array([v0_b[2], v0_b[3]])); + let b23: Simd = i2f(Simd::from_array([b[2], b[3]])); let p_hi = fma(ai, b23, Simd::splat(C1)); let p_lo = fma(ai, b23, Simd::splat(C2) - p_hi); ts[i+3] += p_hi.to_bits().cast(); ts[i+2] += p_lo.to_bits().cast(); - let b4 = Simd::from_array([i2f_scalar(v0_b[4]),0.]); + let b4 = Simd::from_array([i2f_scalar(b[4]),0.]); let p_hi = fma(ai, b4, Simd::splat(C1)); let p_lo = fma(ai, b4, Simd::splat(C2) - p_hi); ts[i + 5] += p_hi.to_bits().cast(); @@ -141,12 +134,14 @@ pub fn simd_mul(v0_a: [u64; 4], v0_b: [u64; 4]) -> [u64; 4] { let mut t: [i64; 4] = [0; 4]; + // Swizzle the words that are reduced. Delay the rest of the swizzling until + // after the reduction. seq!( i in 0..2 { - let s = i * 2; - ts[s] += simd_swizzle!(Simd::splat(0), ts[s + 1], [0, 2]); - ts[s + 2] += simd_swizzle!(Simd::splat(0), ts[s + 1], [3, 0]); - t[s] = ts[s][0]; - t[s + 1] = ts[s][1]; + let k = i * 2; + ts[k] += simd_swizzle!(Simd::splat(0), ts[k + 1], [0, 2]); + ts[k + 2] += simd_swizzle!(Simd::splat(0), ts[k + 1], [3, 0]); + t[k] = ts[k][0]; + t[k + 1] = ts[k][1]; }); // sign extend redundant carries @@ -197,7 +192,7 @@ pub fn simd_mul(v0_a: [u64; 4], v0_b: [u64; 4]) -> [u64; 4] { // 1 bit reduction to go from R^-255 to R^-256. reduce_ct does the preparation // and the final shift is done as part of the conversion back to u256 let reduced = reduce_ct(s); - let reduced = redundant_carry_scalar(reduced); + let reduced = redundant_carry(reduced); let u256_result = u255_to_u256_shr_1(reduced); u256_result } @@ -214,11 +209,12 @@ pub fn u255_to_u256_shr_1(limbs: [u64; 5]) -> [u64; 4] { ] } -/// Two parallel Montgomery multiplications: `(v0_a*v0_b, v1_a*v1_b)`. -/// input must fit in 2^255-1; no runtime checking +/// Montgomery squaring: `(va*v0_b, v1_a*v1_b)`. +/// input must fit in 2^255-1; no runtime checking. Output lives in Montgomery +/// space R=256. #[inline(always)] -pub fn simd_sqr(v0_a: [u64; 4]) -> [u64; 4] { - simd_mul(v0_a, v0_a) +pub fn sqr(v0_a: [u64; 4]) -> [u64; 4] { + mul(v0_a, v0_a) } #[cfg(not(target_arch = "wasm32"))] @@ -226,13 +222,10 @@ pub fn simd_sqr(v0_a: [u64; 4]) -> [u64; 4] { mod tests { use { super::*, - crate::{rne::simd_utils::u255_to_u256_simd, test_utils::ark_ff_reference}, + crate::test_utils::{ark_ff_reference, limbs5_51}, ark_bn254::Fr, ark_ff::{BigInt, PrimeField}, - proptest::{ - prelude::{prop, Strategy}, - prop_assert_eq, proptest, - }, + proptest::{prop_assert_eq, proptest}, }; #[test] @@ -241,11 +234,9 @@ mod tests { a in limbs5_51(), b in limbs5_51(), )| { - let a: [Simd;_] = a.map(Simd::splat); - let b: [Simd;_] = b.map(Simd::splat); - let a = u255_to_u256_simd(a).map(|x|x[0]); - let b = u255_to_u256_simd(b).map(|x|x[0]); - let ab = simd_mul(a, b); + let a = u255_to_u256(a); + let b = u255_to_u256(b); + let ab = mul(a, b); let ab_ref = ark_ff_reference(a, b); let ab = Fr::new(BigInt(ab)); prop_assert_eq!(ab_ref, ab, "mismatch: l = {:X}, b = {:X}", ab_ref.into_bigint(), ab.into_bigint()); @@ -257,19 +248,19 @@ mod tests { proptest!(|( a in limbs5_51(), )| { - let a: [Simd;_] = a.map(Simd::splat); - let a = u255_to_u256_simd(a).map(|x|x[0]); - let a2 = simd_mul(a, a); - let b2 = simd_sqr(a); - prop_assert_eq!(a2, b2); + let a = u255_to_u256(a); + prop_assert_eq!(mul(a,a), sqr(a)); }) } - fn limb51() -> impl Strategy { - 0u64..(1u64 << 51) - } - - fn limbs5_51() -> impl Strategy { - prop::array::uniform5(limb51()) + #[inline(always)] + pub fn u255_to_u256(limbs: [u64; 5]) -> [u64; 4] { + let [l0, l1, l2, l3, l4] = limbs; + [ + l0 | (l1 << 51), + (l1 >> 13) | (l2 << 38), + (l2 >> 26) | (l3 << 25), + (l3 >> 39) | (l4 << 12), + ] } } diff --git a/skyscraper/bn254-multiplier/src/test_utils.rs b/skyscraper/bn254-multiplier/src/test_utils.rs index bfbdaab3a..6dceec140 100644 --- a/skyscraper/bn254-multiplier/src/test_utils.rs +++ b/skyscraper/bn254-multiplier/src/test_utils.rs @@ -6,7 +6,7 @@ use { ark_ff::{BigInt, Field}, proptest::{ collection, - prelude::{any, Strategy}, + prelude::{any, prop, Strategy}, proptest, }, }; @@ -81,3 +81,11 @@ fn test_max_multiprecision_strategy() { ); }); } + +pub fn limb51() -> impl Strategy { + 0u64..(1u64 << 51) +} + +pub fn limbs5_51() -> impl Strategy { + prop::array::uniform5(limb51()) +} From 53ea1d123372889d2f1624e5f435ac17379b7df7 Mon Sep 17 00:00:00 2001 From: Xander van der Goot Date: Mon, 2 Feb 2026 19:14:45 +0800 Subject: [PATCH 19/21] rne/mono: document mul algorithm --- skyscraper/bn254-multiplier/src/rne/single.rs | 100 +++++++++++++++--- 1 file changed, 86 insertions(+), 14 deletions(-) diff --git a/skyscraper/bn254-multiplier/src/rne/single.rs b/skyscraper/bn254-multiplier/src/rne/single.rs index f8591c192..d860656ab 100644 --- a/skyscraper/bn254-multiplier/src/rne/single.rs +++ b/skyscraper/bn254-multiplier/src/rne/single.rs @@ -1,4 +1,23 @@ -//! Portable SIMD Montgomery multiplication and squaring. +//! Unbatched SIMD Montgomery multiplication and squaring for BN254. +//! +//! This module provides single-input Montgomery multiplication using relaxed +//! SIMD FMA operations. Unlike [`portable_simd`](super::portable_simd) which +//! batches two multiplications per call, this processes one at a time—faster +//! on WASM when inputs aren't naturally paired. +//! +//! # Redundant Representations +//! +//! The code uses two distinct "redundant" representations: +//! +//! 1. **Redundant SIMD form**: FMA produces high/low product parts in separate +//! SIMD vectors with misaligned lanes. E.g., `ts[k] = [limb_k_lo, limb_{k+1}_lo]` +//! and `ts[k+1] = [limb_k_hi, limb_{k+1}_hi]`. Swizzle operations realign these +//! so adjacent limbs occupy consecutive positions. +//! +//! 2. **Redundant limb form**: Each 51-bit limb may temporarily exceed 51 bits. +//! The excess bits are carries (or borrows) that propagate to higher limbs. +//! - `u64` limbs: excess bits are positive carries +//! - `i64` limbs: excess bits may be negative, representing borrows from above use { crate::rne::{ @@ -10,9 +29,11 @@ use { std::simd::{num::SimdUint, simd_swizzle}, }; -/// Move redundant carries from lower limbs to the higher limbs such that all -/// limbs except the last one is 51 bits. The most significant limb can be -/// larger than 51 bits as the input can be bigger 2^255-1. +/// Propagate carries/borrows from redundant limb form to normalized form. +/// +/// Input `i64` limbs may have excess bits (positive = carry, negative = borrow). +/// Output `u64` limbs are normalized to exactly 51 bits, except the MSB which +/// absorbs any remaining carry and may exceed 51 bits. #[inline(always)] fn redundant_carry(t: [i64; N]) -> [u64; N] { let mut borrow = 0; @@ -55,6 +76,11 @@ pub const fn i2f_scalar(a: u64) -> f64 { a - b } +/// Multiply a 51-bit scalar `s` by a 5×51-bit vector `v`. +/// +/// Returns 6 SIMD vectors in redundant form. "noinit" means no FMA anchor +/// compensation is applied; the caller must initialize accumulators with +/// `make_initial` biases. #[inline(always)] pub fn smult_noinit(s: u64, v: [u64; 5]) -> [Simd; 6] { let mut t = [Simd::splat(0); 6]; @@ -81,9 +107,13 @@ pub fn smult_noinit(s: u64, v: [u64; 5]) -> [Simd; 6] { t } +/// Constant-time preparation for division by 2. +/// +/// If the input is odd, adds P (which is also odd) to make it even. +/// This ensures the subsequent right-shift is exact. "ct" = constant-time. #[inline(always)] pub fn reduce_ct(mut a: [i64; 5]) -> [i64; 5] { - // When input is odd add P to prepare for division + // When input is odd, add P to make it even let mask = -(a[0] & 1); seq!( i in 0..5 { @@ -96,8 +126,42 @@ pub fn reduce_ct(mut a: [i64; 5]) -> [i64; 5] { a } -/// Montgomery multiplications: `(v0_a*v0_b, v1_a*v1_b)`. -/// input must fit in 2^255-1; no runtime checking +/// Montgomery multiplication for BN254 scalar field. +/// +/// Computes `a * b * R^{-256} mod P` where R = 2^256. +/// +/// # Algorithm Overview +/// +/// Uses floating-point FMA for fast 51×51-bit multiplications via the +/// Renes-Costello-Batina technique. The algorithm proceeds in phases: +/// +/// 1. **Limb conversion**: Convert 4×64-bit inputs to 5×51-bit representation. +/// +/// 2. **Schoolbook multiplication**: Compute the 10-limb product `a × b` using +/// FMA-based multiply-accumulate. Results are stored in "redundant SIMD +/// form" where high/low parts of products occupy separate SIMD lanes. +/// +/// 3. **Parallel reduction**: Instead of sequential CIOS reduction, multiply +/// the lower 4 limbs by precomputed `RHO_i = R^i mod P` constants. This +/// converts `t[i] * 2^{51*i}` into an equivalent value in the upper limbs, +/// allowing all reductions to proceed in parallel. +/// +/// 4. **Final Montgomery step**: Compute `m = (result[0] * -P^{-1}) mod 2^51`, +/// then add `m * P` to cancel the lowest limb. +/// +/// 5. **Bit adjustment**: The 5×51 = 255 bit representation requires a final +/// division by 2 (after conditional addition of P for odd results) to +/// produce the correct R^{-256} Montgomery form. +/// +/// # Preconditions +/// +/// - Both inputs must be < 2^255 (i.e., fit in 5×51-bit limbs) +/// - Inputs should be in Montgomery form with R = 2^256 +/// +/// # Performance +/// +/// Optimized for WASM with relaxed SIMD. Processes one multiplication at a time +/// (vs. `simd_mul` which batches two). #[inline(always)] pub fn mul(a: [u64; 4], b: [u64; 4]) -> [u64; 4] { let a = u256_to_u255(a); @@ -134,8 +198,11 @@ pub fn mul(a: [u64; 4], b: [u64; 4]) -> [u64; 4] { let mut t: [i64; 4] = [0; 4]; - // Swizzle the words that are reduced. Delay the rest of the swizzling until - // after the reduction. + // Realign redundant SIMD form to scalar form for the lower 4 limbs. + // FMA produces: ts[k] = [limb_k_lo, limb_{k+1}_lo] + // ts[k+1] = [limb_k_hi, limb_{k+1}_hi] + // But limb_k_hi belongs with limb_{k+1}_lo (adjacent in the product). + // Swizzle moves: ts[k+1][0] -> ts[k][0], ts[k+1][1] -> ts[k+2][0] seq!( i in 0..2 { let k = i * 2; ts[k] += simd_swizzle!(Simd::splat(0), ts[k + 1], [0, 2]); @@ -144,7 +211,7 @@ pub fn mul(a: [u64; 4], b: [u64; 4]) -> [u64; 4] { t[k + 1] = ts[k][1]; }); - // sign extend redundant carries + // Propagate carries/borrows through redundant limb form (i64 allows negative excess) t[1] += t[0] >> 51; t[2] += t[1] >> 51; t[3] += t[2] >> 51; @@ -152,6 +219,8 @@ pub fn mul(a: [u64; 4], b: [u64; 4]) -> [u64; 4] { // Lift carry into SIMD to prevent extraction ts[4] += Simd::from_array([t[3] >> 51, 0]); + // Parallel reduction: t[i] * RHO_{4-i} ≡ t[i] * 2^{51*i} * R^{-1} (mod P) + // This replaces sequential CIOS with independent multiplications. let r0 = smult_noinit(t[0] as u64 & MASK51, RHO_4); let r1 = smult_noinit(t[1] as u64 & MASK51, RHO_3); let r2 = smult_noinit(t[2] as u64 & MASK51, RHO_2); @@ -163,6 +232,8 @@ pub fn mul(a: [u64; 4], b: [u64; 4]) -> [u64; 4] { ss[i] += r0[i] + r1[i] + r2[i] + r3[i]; }); + // Final Montgomery reduction: m = ss[0] * (-P^{-1}) mod 2^51, then add m*P + // to make the lowest limb zero (it gets shifted out). let m = (ss[0][0] as u64).wrapping_mul(U51_NP0) & MASK51; let mp = smult_noinit(m, U51_P); @@ -170,7 +241,7 @@ pub fn mul(a: [u64; 4], b: [u64; 4]) -> [u64; 4] { ss[i] += mp[i]; }); - // Get rid of redundant SIMD form + // Realign from redundant SIMD form (misaligned lanes) to aligned lanes seq!( i in 0..2 { let s = i * 2; ss[s] += simd_swizzle!(Simd::splat(0), ss[s + 1], [0, 2]); @@ -178,7 +249,7 @@ pub fn mul(a: [u64; 4], b: [u64; 4]) -> [u64; 4] { }); ss[5 - 1] += simd_swizzle!(Simd::splat(0), ss[5], [0, 2]); - // Move to redundant scalar form + // Extract to redundant limb form (i64 with carry/borrow in upper bits) let mut s: [i64; 6] = [0; 6]; seq!(i in 0..3 { let k = i * 2; @@ -186,11 +257,12 @@ pub fn mul(a: [u64; 4], b: [u64; 4]) -> [u64; 4] { s[k + 1] = ss[k][1]; }); + // Propagate carry/borrow from s[0] and discard it (absorbed by Montgomery reduction) s[1] += s[0] >> 51; let s = [s[1], s[2], s[3], s[4], s[5]]; - // 1 bit reduction to go from R^-255 to R^-256. reduce_ct does the preparation - // and the final shift is done as part of the conversion back to u256 + // Bit adjustment: 5×51 = 255 bits, but we need R^{-256}. Divide by 2 via + // reduce_ct (adds P if odd) followed by right-shift in u255_to_u256_shr_1. let reduced = reduce_ct(s); let reduced = redundant_carry(reduced); let u256_result = u255_to_u256_shr_1(reduced); From 19c2035d705d6c0139f8e34c4566b26df3f07451 Mon Sep 17 00:00:00 2001 From: Xander van der Goot Date: Mon, 2 Feb 2026 19:14:50 +0800 Subject: [PATCH 20/21] =?UTF-8?q?rne:=20rename=20single=E2=86=92mono,=20po?= =?UTF-8?q?rtable=5Fsimd=E2=86=92batched?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- skyscraper/bn254-multiplier/benches/bench.rs | 18 +-- skyscraper/bn254-multiplier/examples/count.rs | 6 +- .../src/rne/{portable_simd.rs => batched.rs} | 0 skyscraper/bn254-multiplier/src/rne/mod.rs | 6 +- .../src/rne/{single.rs => mono.rs} | 113 +++++++++--------- 5 files changed, 71 insertions(+), 72 deletions(-) rename skyscraper/bn254-multiplier/src/rne/{portable_simd.rs => batched.rs} (100%) rename skyscraper/bn254-multiplier/src/rne/{single.rs => mono.rs} (78%) diff --git a/skyscraper/bn254-multiplier/benches/bench.rs b/skyscraper/bn254-multiplier/benches/bench.rs index 9ff580796..5577cec51 100644 --- a/skyscraper/bn254-multiplier/benches/bench.rs +++ b/skyscraper/bn254-multiplier/benches/bench.rs @@ -32,11 +32,11 @@ mod mul { } #[divan::bench] - fn single_b51(bencher: Bencher) { + fn mono_mul_b51(bencher: Bencher) { bencher //.counter(ItemsCount::new(1usize)) .with_inputs(|| rng().random()) - .bench_local_values(|(a, b)| rne::single::mul(a, b)); + .bench_local_values(|(a, b)| rne::mono::mul(a, b)); } #[divan::bench] @@ -45,7 +45,7 @@ mod mul { //.counter(ItemsCount::new(2usize)) .with_inputs(|| rng().random()) .bench_local_values(|(a, b, c, d)| { - bn254_multiplier::rne::portable_simd::simd_mul(a, b, c, d) + bn254_multiplier::rne::batched::simd_mul(a, b, c, d) }); } @@ -150,19 +150,11 @@ mod sqr { } #[divan::bench] - fn single_sqr_b51(bencher: Bencher) { + fn mono_sqr_b51(bencher: Bencher) { bencher //.counter(ItemsCount::new(1usize)) .with_inputs(|| rng().random()) - .bench_local_values(|a| rne::single::sqr(a)); - } - - #[divan::bench] - fn single_mul_sqr_b51(bencher: Bencher) { - bencher - //.counter(ItemsCount::new(1usize)) - .with_inputs(|| rng().random()) - .bench_local_values(|a| rne::single::mul(a, a)); + .bench_local_values(|a| rne::mono::sqr(a)); } #[divan::bench] diff --git a/skyscraper/bn254-multiplier/examples/count.rs b/skyscraper/bn254-multiplier/examples/count.rs index f5fcafc35..6f5c7b243 100644 --- a/skyscraper/bn254-multiplier/examples/count.rs +++ b/skyscraper/bn254-multiplier/examples/count.rs @@ -1,6 +1,8 @@ +//! Helper script to calculate the magic numbers to cancel out the anchors used +//! in FMA multiplication. use bn254_multiplier::rne::{ - make_initial, - single::{mul, sqr}, + mono::{mul, sqr}, + simd_utils::make_initial, }; const LO: usize = 1; diff --git a/skyscraper/bn254-multiplier/src/rne/portable_simd.rs b/skyscraper/bn254-multiplier/src/rne/batched.rs similarity index 100% rename from skyscraper/bn254-multiplier/src/rne/portable_simd.rs rename to skyscraper/bn254-multiplier/src/rne/batched.rs diff --git a/skyscraper/bn254-multiplier/src/rne/mod.rs b/skyscraper/bn254-multiplier/src/rne/mod.rs index cee3ce350..a22af985a 100644 --- a/skyscraper/bn254-multiplier/src/rne/mod.rs +++ b/skyscraper/bn254-multiplier/src/rne/mod.rs @@ -22,9 +22,9 @@ //! Point Arithmetic on the GPU, 2018 IEEE 25th Symposium on Computer Arithmetic //! (ARITH) by Emmart, Zheng and Weems; which uses RTZ. +pub mod batched; pub mod constants; -pub mod portable_simd; +pub mod mono; pub mod simd_utils; -pub mod single; -pub use {constants::*, portable_simd::*, simd_utils::*}; +pub use {batched::*, constants::*, mono::*}; diff --git a/skyscraper/bn254-multiplier/src/rne/single.rs b/skyscraper/bn254-multiplier/src/rne/mono.rs similarity index 78% rename from skyscraper/bn254-multiplier/src/rne/single.rs rename to skyscraper/bn254-multiplier/src/rne/mono.rs index d860656ab..a6a77bcfd 100644 --- a/skyscraper/bn254-multiplier/src/rne/single.rs +++ b/skyscraper/bn254-multiplier/src/rne/mono.rs @@ -1,23 +1,9 @@ -//! Unbatched SIMD Montgomery multiplication and squaring for BN254. +//! SIMD Montgomery multiplication and squaring for BN254. //! -//! This module provides single-input Montgomery multiplication using relaxed -//! SIMD FMA operations. Unlike [`portable_simd`](super::portable_simd) which -//! batches two multiplications per call, this processes one at a time—faster -//! on WASM when inputs aren't naturally paired. -//! -//! # Redundant Representations -//! -//! The code uses two distinct "redundant" representations: -//! -//! 1. **Redundant SIMD form**: FMA produces high/low product parts in separate -//! SIMD vectors with misaligned lanes. E.g., `ts[k] = [limb_k_lo, limb_{k+1}_lo]` -//! and `ts[k+1] = [limb_k_hi, limb_{k+1}_hi]`. Swizzle operations realign these -//! so adjacent limbs occupy consecutive positions. -//! -//! 2. **Redundant limb form**: Each 51-bit limb may temporarily exceed 51 bits. -//! The excess bits are carries (or borrows) that propagate to higher limbs. -//! - `u64` limbs: excess bits are positive carries -//! - `i64` limbs: excess bits may be negative, representing borrows from above +//! This module provides single-input Montgomery multiplication using (relaxed) +//! SIMD FMA operations. Unlike [`batched`](super::batched) which processes two +//! multiplications per call, this handles one at a time— this variant is the +//! fastest on WASM. use { crate::rne::{ @@ -31,9 +17,9 @@ use { /// Propagate carries/borrows from redundant limb form to normalized form. /// -/// Input `i64` limbs may have excess bits (positive = carry, negative = borrow). -/// Output `u64` limbs are normalized to exactly 51 bits, except the MSB which -/// absorbs any remaining carry and may exceed 51 bits. +/// Input `i64` limbs may have excess bits (positive = carry, negative = +/// borrow). Output `u64` limbs are normalized to exactly 51 bits, except the +/// MSB which absorbs any remaining carry and may exceed 51 bits. #[inline(always)] fn redundant_carry(t: [i64; N]) -> [u64; N] { let mut borrow = 0; @@ -78,7 +64,7 @@ pub const fn i2f_scalar(a: u64) -> f64 { /// Multiply a 51-bit scalar `s` by a 5×51-bit vector `v`. /// -/// Returns 6 SIMD vectors in redundant form. "noinit" means no FMA anchor +/// Returns vector redundant SIMD and carry form. "noinit" means no FMA anchor /// compensation is applied; the caller must initialize accumulators with /// `make_initial` biases. #[inline(always)] @@ -129,30 +115,7 @@ pub fn reduce_ct(mut a: [i64; 5]) -> [i64; 5] { /// Montgomery multiplication for BN254 scalar field. /// /// Computes `a * b * R^{-256} mod P` where R = 2^256. -/// -/// # Algorithm Overview -/// -/// Uses floating-point FMA for fast 51×51-bit multiplications via the -/// Renes-Costello-Batina technique. The algorithm proceeds in phases: -/// -/// 1. **Limb conversion**: Convert 4×64-bit inputs to 5×51-bit representation. -/// -/// 2. **Schoolbook multiplication**: Compute the 10-limb product `a × b` using -/// FMA-based multiply-accumulate. Results are stored in "redundant SIMD -/// form" where high/low parts of products occupy separate SIMD lanes. -/// -/// 3. **Parallel reduction**: Instead of sequential CIOS reduction, multiply -/// the lower 4 limbs by precomputed `RHO_i = R^i mod P` constants. This -/// converts `t[i] * 2^{51*i}` into an equivalent value in the upper limbs, -/// allowing all reductions to proceed in parallel. -/// -/// 4. **Final Montgomery step**: Compute `m = (result[0] * -P^{-1}) mod 2^51`, -/// then add `m * P` to cancel the lowest limb. -/// -/// 5. **Bit adjustment**: The 5×51 = 255 bit representation requires a final -/// division by 2 (after conditional addition of P for odd results) to -/// produce the correct R^{-256} Montgomery form. -/// + /// # Preconditions /// /// - Both inputs must be < 2^255 (i.e., fit in 5×51-bit limbs) @@ -164,6 +127,43 @@ pub fn reduce_ct(mut a: [i64; 5]) -> [i64; 5] { /// (vs. `simd_mul` which batches two). #[inline(always)] pub fn mul(a: [u64; 4], b: [u64; 4]) -> [u64; 4] { + // # Algorithm Overview + // Uses floating-point FMA for fast 51×51-bit multiplications via the + // Renes-Costello-Batina technique. The algorithm proceeds in phases: + // + // 1. **Limb conversion**: Convert 4×64-bit inputs to 5×51-bit representation. + // + // 2. **Schoolbook multiplication**: Compute the 10-limb product `a × b` using + // FMA-based multiply-accumulate. Results are stored in "redundant SIMD form" + // where high/low parts of products occupy separate SIMD lanes. + // + // 3. **Parallel reduction**: Instead of sequential CIOS reduction, multiply the + // lower 4 limbs by precomputed `RHO_i = R^i mod P` constants. This converts + // `t[i] * 2^{51*i}` into an equivalent value in the upper limbs, allowing + // all reductions to proceed in parallel. + // + // 4. **Final Montgomery step**: Compute `m = (result[0] * -P^{-1}) mod 2^51`, + // then add `m * P` to cancel the lowest limb. + // + // 5. **Bit adjustment**: The 5×51 = 255 bit representation requires a final + // division by 2 (after conditional addition of P for odd results) to produce + // the correct R^{-256} Montgomery form. + // + // # Redundant Representations + // + // The code uses two distinct "redundant" representations: + // + // 1. **Redundant SIMD form**: FMA produces high/low product parts in separate + // SIMD vectors with misaligned lanes. E.g., `ts[k] = [limb_k_lo, + // limb_{k+1}_lo]` and `ts[k+1] = [limb_k_hi, limb_{k+1}_hi]`. Swizzle + // operations realign these so adjacent limbs occupy consecutive positions. + // + // 2. **Redundant limb form**: Each 51-bit limb may temporarily exceed 51 bits. + // The excess bits are carries (or borrows) that propagate to higher limbs. + // - `u64` limbs: excess bits are positive carries + // - `i64` limbs: excess bits may be negative, representing borrows from + // above + let a = u256_to_u255(a); let b = u256_to_u255(b); @@ -211,7 +211,8 @@ pub fn mul(a: [u64; 4], b: [u64; 4]) -> [u64; 4] { t[k + 1] = ts[k][1]; }); - // Propagate carries/borrows through redundant limb form (i64 allows negative excess) + // Propagate carries/borrows through redundant limb form (i64 allows negative + // excess) t[1] += t[0] >> 51; t[2] += t[1] >> 51; t[3] += t[2] >> 51; @@ -257,7 +258,8 @@ pub fn mul(a: [u64; 4], b: [u64; 4]) -> [u64; 4] { s[k + 1] = ss[k][1]; }); - // Propagate carry/borrow from s[0] and discard it (absorbed by Montgomery reduction) + // Propagate carry/borrow from s[0] and discard it (absorbed by Montgomery + // reduction) s[1] += s[0] >> 51; let s = [s[1], s[2], s[3], s[4], s[5]]; @@ -281,12 +283,15 @@ pub fn u255_to_u256_shr_1(limbs: [u64; 5]) -> [u64; 4] { ] } -/// Montgomery squaring: `(va*v0_b, v1_a*v1_b)`. -/// input must fit in 2^255-1; no runtime checking. Output lives in Montgomery -/// space R=256. +/// Montgomery squaring: a² +/// +/// Input and output are in Montgomery form R=256. +/// +/// Precondition: +/// - a < 2^255; no runtime check. #[inline(always)] -pub fn sqr(v0_a: [u64; 4]) -> [u64; 4] { - mul(v0_a, v0_a) +pub fn sqr(a: [u64; 4]) -> [u64; 4] { + mul(a, a) } #[cfg(not(target_arch = "wasm32"))] From 14d0523ec3889323df7b87fe549ef91762b05b84 Mon Sep 17 00:00:00 2001 From: Aditya Bisht Date: Mon, 9 Feb 2026 14:31:09 +0530 Subject: [PATCH 21/21] rne/mono: fix split doc comment and add parens for consistency --- skyscraper/bn254-multiplier/src/rne/mono.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/skyscraper/bn254-multiplier/src/rne/mono.rs b/skyscraper/bn254-multiplier/src/rne/mono.rs index a6a77bcfd..a4a9e41e1 100644 --- a/skyscraper/bn254-multiplier/src/rne/mono.rs +++ b/skyscraper/bn254-multiplier/src/rne/mono.rs @@ -44,7 +44,7 @@ pub fn u256_to_u255(limbs: [u64; 4]) -> [u64; 5] { ((l0 >> 51) | (l1 << 13)) & MASK51, ((l1 >> 38) | (l2 << 26)) & MASK51, ((l2 >> 25) | (l3 << 39)) & MASK51, - l3 >> 12 & MASK51, + (l3 >> 12) & MASK51, ] } @@ -115,7 +115,7 @@ pub fn reduce_ct(mut a: [i64; 5]) -> [i64; 5] { /// Montgomery multiplication for BN254 scalar field. /// /// Computes `a * b * R^{-256} mod P` where R = 2^256. - +/// /// # Preconditions /// /// - Both inputs must be < 2^255 (i.e., fit in 5×51-bit limbs)