diff --git a/packages/simd/README.md b/packages/simd/README.md index 6ba52cdad0..557ac434b8 100644 --- a/packages/simd/README.md +++ b/packages/simd/README.md @@ -32,25 +32,37 @@ See [/assembly](https://github.com/thi-ng/umbrella/tree/feature/simd/packages/simd/assembly) for sources: +- `abs4_f32` - `add4_f32` +- `addn4_f32` +- `clamp4_f32` - `div4_f32` (*) +- `divn4_f32` (*) - `dot2_f32_aos` (2x vec2 per iteration) - `dot4_f32_aos` - `dot4_f32_soa` - `invsqrt4_f32` (*) - `madd4_f32` - `maddn4_f32` +- `max4_f32` +- `min4_f32` +- `msub4_f32` +- `msubn4_f32` - `mul4_f32` -- `mul_m23v2_aos` -- `mul_m23v2_aos_single` (2x vec2 per iteration) +- `muln4_f32` +- `mul_m23v2_aos` (2x vec2 per iteration) - `mul_m44v4_aos` -- `mul_m44v4_aos_single` +- `neg4_f32` +- `normalize4_f32_aos` - `sqrt4_f32` (*) - `sub4_f32` +- `subn4_f32` (*) Missing native implementation, waiting on... -Also see [src/api.ts](https://github.com/thi-ng/umbrella/tree/feature/simd/packages/simd/src/api.ts) for documentation about the exposed TS/JS API... +Also see +[src/api.ts](https://github.com/thi-ng/umbrella/tree/feature/simd/packages/simd/src/api.ts) +for documentation about the exposed TS/JS API... 
## Status diff --git a/packages/simd/assembly/abs.ts b/packages/simd/assembly/abs.ts new file mode 100644 index 0000000000..9780b8dbfc --- /dev/null +++ b/packages/simd/assembly/abs.ts @@ -0,0 +1,17 @@ +export function abs4_f32( + out: usize, + a: usize, + num: usize, + so: usize, + sa: usize +): usize { + so <<= 2; + sa <<= 2; + const res = out; + for (; num-- > 0; ) { + v128.store(out, f32x4.abs(v128.load(a))); + out += so; + a += sa; + } + return res; +} diff --git a/packages/simd/assembly/add.ts b/packages/simd/assembly/add.ts index c9b130538a..c40de7fff7 100644 --- a/packages/simd/assembly/add.ts +++ b/packages/simd/assembly/add.ts @@ -3,9 +3,9 @@ export function add4_f32( a: usize, b: usize, num: usize, - so: usize = 4, - sa: usize = 4, - sb: usize = 4 + so: usize, + sa: usize, + sb: usize ): usize { so <<= 2; sa <<= 2; diff --git a/packages/simd/assembly/addn.ts b/packages/simd/assembly/addn.ts new file mode 100644 index 0000000000..2699de96c6 --- /dev/null +++ b/packages/simd/assembly/addn.ts @@ -0,0 +1,19 @@ +export function addn4_f32( + out: usize, + a: usize, + n: f32, + num: usize, + so: usize, + sa: usize +): usize { + so <<= 2; + sa <<= 2; + const res = out; + const vn = f32x4.splat(n); + for (; num-- > 0; ) { + v128.store(out, f32x4.add(v128.load(a), vn)); + out += so; + a += sa; + } + return res; +} diff --git a/packages/simd/assembly/clamp.ts b/packages/simd/assembly/clamp.ts new file mode 100644 index 0000000000..64e3a4ce38 --- /dev/null +++ b/packages/simd/assembly/clamp.ts @@ -0,0 +1,49 @@ +/** + * Takes three vec4 buffers, clamps `a` componentwise to `min(max(a, b), + * c)` and stores results in `out`. Both AOS / SOA layouts are + * supported, as long as all buffers are using the same layout. + * + * All strides must be multiples of 4. All pointers must be aligned to + * multiples of 16. Returns `out` pointer. + * + * Set `sb` and `sc` to 0 for clamping all `a` vectors against same + * bounds. 
+ * + * @param out + * @param a + * @param b + * @param c + * @param num number of vec4 + * @param so out element stride + * @param sa A element stride + * @param sb B element stride + * @param sc C element stride + */ +export function clamp4_f32( + out: usize, + a: usize, + b: usize, + c: usize, + num: usize, + so: usize, + sa: usize, + sb: usize, + sc: usize +): usize { + const res = out; + so <<= 2; + sa <<= 2; + sb <<= 2; + sc <<= 2; + for (; num-- > 0; ) { + v128.store( + out, + f32x4.min(f32x4.max(v128.load(a), v128.load(b)), v128.load(c)) + ); + out += so; + a += sa; + b += sb; + c += sc; + } + return res; +} diff --git a/packages/simd/assembly/divn.ts b/packages/simd/assembly/divn.ts new file mode 100644 index 0000000000..dd4dde38a3 --- /dev/null +++ b/packages/simd/assembly/divn.ts @@ -0,0 +1,19 @@ +export function divn4_f32( + out: usize, + a: usize, + n: f32, + num: usize, + so: usize, + sa: usize +): usize { + so <<= 2; + sa <<= 2; + const res = out; + const vn = f32x4.splat(n); + for (; num-- > 0; ) { + v128.store(out, f32x4.div(v128.load(a), vn)); + out += so; + a += sa; + } + return res; +} diff --git a/packages/simd/assembly/index.ts b/packages/simd/assembly/index.ts index c0e9493b7b..391efa5f20 100644 --- a/packages/simd/assembly/index.ts +++ b/packages/simd/assembly/index.ts @@ -1,14 +1,27 @@ +export * from "./abs"; export * from "./add"; +export * from "./addn"; +export * from "./clamp"; // TODO waiting for native impl // export * from "./div"; +// export * from "./divn"; export * from "./dot"; export * from "./madd"; export * from "./maddn"; +export * from "./max"; +export * from "./min"; +export * from "./msub"; +export * from "./msubn"; export * from "./mul"; +export * from "./muln"; export * from "./mulv"; -export * from "./sub"; +export * from "./neg"; +export * from "./normalize"; // TODO waiting for native impl // export * from "./sqrt"; + +export * from "./sub"; +export * from "./subn"; diff --git a/packages/simd/assembly/maddn.ts b/packages/simd/assembly/maddn.ts index 
503888240c..58dab925ee 100644 --- a/packages/simd/assembly/maddn.ts +++ b/packages/simd/assembly/maddn.ts @@ -1,7 +1,7 @@ export function maddn4_f32( out: usize, a: usize, - b: f32, + n: f32, c: usize, num: usize, so: usize, @@ -12,9 +12,9 @@ export function maddn4_f32( so <<= 2; sa <<= 2; sc <<= 2; - const vb = v128.splat(b); + const vn = f32x4.splat(n); for (; num-- > 0; ) { - v128.store(out, f32x4.add(f32x4.mul(v128.load(a), vb), v128.load(c))); + v128.store(out, f32x4.add(f32x4.mul(v128.load(a), vn), v128.load(c))); out += so; a += sa; c += sc; diff --git a/packages/simd/assembly/max.ts b/packages/simd/assembly/max.ts new file mode 100644 index 0000000000..cba8083fe8 --- /dev/null +++ b/packages/simd/assembly/max.ts @@ -0,0 +1,21 @@ +export function max4_f32( + out: usize, + a: usize, + b: usize, + num: usize, + so: usize, + sa: usize, + sb: usize +): usize { + so <<= 2; + sa <<= 2; + sb <<= 2; + const res = out; + for (; num-- > 0; ) { + v128.store(out, f32x4.max(v128.load(a), v128.load(b))); + out += so; + a += sa; + b += sb; + } + return res; +} diff --git a/packages/simd/assembly/min.ts b/packages/simd/assembly/min.ts new file mode 100644 index 0000000000..ec34b21a9a --- /dev/null +++ b/packages/simd/assembly/min.ts @@ -0,0 +1,21 @@ +export function min4_f32( + out: usize, + a: usize, + b: usize, + num: usize, + so: usize, + sa: usize, + sb: usize +): usize { + so <<= 2; + sa <<= 2; + sb <<= 2; + const res = out; + for (; num-- > 0; ) { + v128.store(out, f32x4.min(v128.load(a), v128.load(b))); + out += so; + a += sa; + b += sb; + } + return res; +} diff --git a/packages/simd/assembly/msub.ts b/packages/simd/assembly/msub.ts new file mode 100644 index 0000000000..c5f8039d7d --- /dev/null +++ b/packages/simd/assembly/msub.ts @@ -0,0 +1,46 @@ +/** + * Takes three vec4 buffers, computes componentwise a * b - c and stores + * results in `out`. Both AOS / SOA layouts are supported, as long as + * all buffers are using the same layout. 
+ * + * All strides must be multiples of 4. All pointers must be aligned to + * multiples of 16. Returns `out` pointer. + * + * @param out + * @param a + * @param b + * @param c + * @param num number of vec4 + * @param so out element stride + * @param sa A element stride + * @param sb B element stride + * @param sc C element stride + */ +export function msub4_f32( + out: usize, + a: usize, + b: usize, + c: usize, + num: usize, + so: usize, + sa: usize, + sb: usize, + sc: usize +): usize { + const res = out; + so <<= 2; + sa <<= 2; + sb <<= 2; + sc <<= 2; + for (; num-- > 0; ) { + v128.store( + out, + f32x4.sub(f32x4.mul(v128.load(a), v128.load(b)), v128.load(c)) + ); + out += so; + a += sa; + b += sb; + c += sc; + } + return res; +} diff --git a/packages/simd/assembly/msubn.ts b/packages/simd/assembly/msubn.ts new file mode 100644 index 0000000000..619eeb8980 --- /dev/null +++ b/packages/simd/assembly/msubn.ts @@ -0,0 +1,23 @@ +export function msubn4_f32( + out: usize, + a: usize, + n: f32, + c: usize, + num: usize, + so: usize, + sa: usize, + sc: usize +): usize { + const res = out; + so <<= 2; + sa <<= 2; + sc <<= 2; + const vn = f32x4.splat(n); + for (; num-- > 0; ) { + v128.store(out, f32x4.sub(f32x4.mul(v128.load(a), vn), v128.load(c))); + out += so; + a += sa; + c += sc; + } + return res; +} diff --git a/packages/simd/assembly/muln.ts b/packages/simd/assembly/muln.ts new file mode 100644 index 0000000000..fe51e65e26 --- /dev/null +++ b/packages/simd/assembly/muln.ts @@ -0,0 +1,19 @@ +export function muln4_f32( + out: usize, + a: usize, + n: f32, + num: usize, + so: usize, + sa: usize +): usize { + so <<= 2; + sa <<= 2; + const res = out; + const vn = f32x4.splat(n); + for (; num-- > 0; ) { + v128.store(out, f32x4.mul(v128.load(a), vn)); + out += so; + a += sa; + } + return res; +} diff --git a/packages/simd/assembly/mulv.ts b/packages/simd/assembly/mulv.ts index d0e5ada9cd..80468d2b35 100644 --- a/packages/simd/assembly/mulv.ts +++ 
b/packages/simd/assembly/mulv.ts @@ -10,9 +10,9 @@ export function mul_m23v2_aos( so <<= 2; sa <<= 2; num >>= 1; - const m = v128.load(mat); - const m1 = v128.shuffle(m, m, 0, 1, 0, 1); - const m2 = v128.shuffle(m, m, 2, 3, 2, 3); + const m0 = v128.load(mat); + const m1 = v128.shuffle(m0, m0, 0, 1, 0, 1); + const m2 = v128.shuffle(m0, m0, 2, 3, 2, 3); let m3 = v128.load(mat, 16); m3 = v128.shuffle(m3, m3, 0, 1, 0, 1); for (; num-- > 0; ) { @@ -34,34 +34,6 @@ export function mul_m23v2_aos( return res; } -export function mul_m23v2_aos_single( - out: usize, - mat: usize, - vec: usize -): usize { - const m = v128.load(mat); - const m2 = v128.load(mat, 16); - // v1xv1xv2xv2x * m.0101 + v1yv1yv2yv2y * m.2323 + m.4545 - const v = v128.load(vec); - v128.store( - out, - f32x4.add( - f32x4.add( - f32x4.mul( - v128.shuffle(v, v, 0, 0, 2, 2), - v128.shuffle(m, m, 0, 1, 0, 1) - ), - f32x4.mul( - v128.shuffle(v, v, 1, 1, 3, 3), - v128.shuffle(m, m, 2, 3, 2, 3) - ) - ), - v128.shuffle(m2, m2, 0, 1, 0, 1) - ) - ); - return out; -} - export function mul_m44v4_aos( out: usize, mat: usize, @@ -70,45 +42,33 @@ export function mul_m44v4_aos( so: usize, sa: usize ): usize { - const res = out; so <<= 2; sa <<= 2; + const res = out; + const m0 = v128.load(mat); + const m1 = v128.load(mat, 16); + const m2 = v128.load(mat, 32); + const m3 = v128.load(mat, 48); for (; num-- > 0; ) { - mul_m44v4_aos_single(out, mat, vec); - out += so; - vec += sa; - } - return res; -} - -export function mul_m44v4_aos_single( - out: usize, - mat: usize, - vec: usize -): usize { - // v.xxxx * m.0123 + v.yyyy * m.4567 + v.zzzz * m.89ab + v.wwww * m.cdef - const v = v128.load(vec); - v128.store( - out, - f32x4.add( - f32x4.add( - f32x4.mul(v128.shuffle(v, v, 0, 0, 0, 0), v128.load(mat)), - f32x4.mul( - v128.shuffle(v, v, 1, 1, 1, 1), - v128.load(mat, 16) - ) - ), + const v = v128.load(vec); + // v.xxxx * m.0123 + v.yyyy * m.4567 + v.zzzz * m.89ab + v.wwww * m.cdef + // TODO ryg's shuffle opt: + // 
https://fgiesen.wordpress.com/2015/02/05/a-small-note-on-simd-matrix-vector-multiplication/ + v128.store( + out, f32x4.add( - f32x4.mul( - v128.shuffle(v, v, 2, 2, 2, 2), - v128.load(mat, 32) + f32x4.add( + f32x4.mul(v128.shuffle(v, v, 0, 0, 0, 0), m0), + f32x4.mul(v128.shuffle(v, v, 1, 1, 1, 1), m1) ), - f32x4.mul( - v128.shuffle(v, v, 3, 3, 3, 3), - v128.load(mat, 48) + f32x4.add( + f32x4.mul(v128.shuffle(v, v, 2, 2, 2, 2), m2), + f32x4.mul(v128.shuffle(v, v, 3, 3, 3, 3), m3) ) ) - ) - ); - return out; + ); + out += so; + vec += sa; + } + return res; } diff --git a/packages/simd/assembly/neg.ts b/packages/simd/assembly/neg.ts new file mode 100644 index 0000000000..ab0fde6f9a --- /dev/null +++ b/packages/simd/assembly/neg.ts @@ -0,0 +1,17 @@ +export function neg4_f32( + out: usize, + a: usize, + num: usize, + so: usize, + sa: usize +): usize { + so <<= 2; + sa <<= 2; + const res = out; + for (; num-- > 0; ) { + v128.store(out, f32x4.neg(v128.load(a))); + out += so; + a += sa; + } + return res; +} diff --git a/packages/simd/assembly/normalize.ts b/packages/simd/assembly/normalize.ts new file mode 100644 index 0000000000..8c7f57d744 --- /dev/null +++ b/packages/simd/assembly/normalize.ts @@ -0,0 +1,36 @@ +/** + * Normalizes `num` vec4 items read from `a` to length `norm` and stores + * them in `out`. Items whose squared magnitude does not exceed + * `f32.EPSILON` are stored unchanged. + * + * Strides `so` / `sa` are multiplied by 4 before being added to the + * pointers. Returns the initial `out` pointer. + */ +export function normalize4_f32_aos( + out: usize, + a: usize, + num: usize, + norm: f32, + so: usize, + sa: usize +): usize { + so <<= 2; + sa <<= 2; + const res = out; + for (; num-- > 0; ) { + const v = v128.load(a); + let m = f32x4.mul(v, v); + m = f32x4.add(m, v128.shuffle(m, m, 2, 3, 0, 1)); + // first two lanes of the folded vector sum to the squared magnitude + const mag = f32x4.extract_lane(m, 0) + f32x4.extract_lane(m, 1); + v128.store( + out, + mag > f32.EPSILON + ? 
f32x4.mul(v, f32x4.splat(norm / sqrt(mag))) + : v + ); + out += so; + a += sa; + } + return res; +} diff --git a/packages/simd/assembly/subn.ts b/packages/simd/assembly/subn.ts new file mode 100644 index 0000000000..888919b5f5 --- /dev/null +++ b/packages/simd/assembly/subn.ts @@ -0,0 +1,19 @@ +export function subn4_f32( + out: usize, + a: usize, + n: f32, + num: usize, + so: usize, + sa: usize +): usize { + so <<= 2; + sa <<= 2; + const res = out; + const vn = f32x4.splat(n); + for (; num-- > 0; ) { + v128.store(out, f32x4.sub(v128.load(a), vn)); + out += so; + a += sa; + } + return res; +} diff --git a/packages/simd/src/api.ts b/packages/simd/src/api.ts index 6562786663..ceafdf8303 100644 --- a/packages/simd/src/api.ts +++ b/packages/simd/src/api.ts @@ -1,47 +1,22 @@ export interface SIMD { - /** - * WASM memory instance given to `init()`. - */ - memory: WebAssembly.Memory; - /** - * Float64 view of WASM memory. - */ - f64: Float64Array; - /** - * Float32 view of WASM memory. - */ - f32: Float32Array; - /** - * Uint32 view of WASM memory. - */ - u32: Uint32Array; - /** - * Int32 view of WASM memory. - */ - i32: Int32Array; - /** - * Uint16 of WASM memory. - */ - u16: Uint16Array; - /** - * Int16 view of WASM memory. - */ - i16: Int16Array; - /** - * Uint8 view of WASM memory. - */ - u8: Uint8Array; - /** - * Int8 view of WASM memory. 
- */ - i8: Int8Array; + // prettier-ignore + abs4_f32(out: number, a: number, num: number, so: number, sa: number): number; // prettier-ignore add4_f32(out: number, a: number, b: number, num: number, so: number, sa: number, sb: number): number; + // prettier-ignore + addn4_f32(out: number, a: number, n: number, num: number, so: number, sa: number): number; + + // prettier-ignore + clamp4_f32(out: number, a: number, b: number, c: number, num: number, so: number, sa: number, sb: number, sc: number): number; + // prettier-ignore div4_f32(out: number, a: number, b: number, num: number, so: number, sa: number, sb: number): number; + // prettier-ignore + divn4_f32(out: number, a: number, n: number, num: number, so: number, sa: number): number; + /** * Takes two densely packed vec2 AOS buffers `a` and `b`, computes * their 2D dot products and stores results in `out`. Computes two @@ -134,18 +109,35 @@ export interface SIMD { // prettier-ignore maddn4_f32(out: number, a: number, b: number, c: number, num: number, so: number, sa: number, sc: number): number; + // prettier-ignore + max4_f32(out: number, a: number, b: number, num: number, so: number, sa: number, sb: number): number; + + // prettier-ignore + min4_f32(out: number, a: number, b: number, num: number, so: number, sa: number, sb: number): number; + // prettier-ignore mul4_f32(out: number, a: number, b: number, num: number, so: number, sa: number, sb: number): number; // prettier-ignore - mul_m23v2_aos(out: number, mat: number, vec: number, num: number, so: number, sv: number): number; + muln4_f32(out: number, a: number, n: number, num: number, so: number, sa: number): number; - mul_m23v2_aos_single(out: number, mat: number, vec: number): number; + // prettier-ignore + mul_m23v2_aos(out: number, mat: number, vec: number, num: number, so: number, sv: number): number; // prettier-ignore mul_m44v4_aos(out: number, mat: number, vec: number, num: number, so: number, sv: number): number; - mul_m44v4_aos_single(out: 
number, mat: number, vec: number): number; + // prettier-ignore + msub4_f32(out: number, a: number, b: number, c: number, num: number, so: number, sa: number, sb: number, sc: number): number; + + // prettier-ignore + msubn4_f32(out: number, a: number, n: number, c: number, num: number, so: number, sa: number, sc: number): number; + + // prettier-ignore + neg4_f32(out: number, a: number, num: number, so: number, sa: number): number; + + // prettier-ignore + normalize4_f32_aos(out: number, a: number, num: number, norm: number, so: number, sa: number): number; /** * FIXME waiting for native impl of SIMD instr @@ -163,4 +155,44 @@ export interface SIMD { // prettier-ignore sub4_f32(out: number, a: number, b: number, num: number, so: number, sa: number, sb: number): number; + + // prettier-ignore + subn4_f32(out: number, a: number, n: number, num: number, so: number, sa: number): number; + + /** + * WASM memory instance given to `init()`. + */ + memory: WebAssembly.Memory; + /** + * Float64 view of WASM memory. + */ + f64: Float64Array; + /** + * Float32 view of WASM memory. + */ + f32: Float32Array; + /** + * Uint32 view of WASM memory. + */ + u32: Uint32Array; + /** + * Int32 view of WASM memory. + */ + i32: Int32Array; + /** + * Uint16 view of WASM memory. + */ + u16: Uint16Array; + /** + * Int16 view of WASM memory. + */ + i16: Int16Array; + /** + * Uint8 view of WASM memory. + */ + u8: Uint8Array; + /** + * Int8 view of WASM memory. + */ + i8: Int8Array; }