Average & Stddev Ryu
32: 26.908 5.560
64: 30.465 5.866
Average & Stddev Ryu
32: 32.423 6.206
64: 42.308 7.505
I checked the disassembly for the Zig benchmark and noticed that the memcpy LLVM emits (avx on my machine) is being inlined when copying from a fixed digit table. The interesting thing here is that these std.mem.copy invocations are only two bytes in length and should be statically known by the compiler.
std.mem.copy(u8, result[index + olength - i - 1 ..], DIGIT_TABLE[c0 .. c0 + 2]);
56a0: c5 fc 10 84 2b 20 fe vmovups -0x1e0(%rbx,%rbp,1),%ymm0
56a7: ff ff
56a9: c5 fc 10 8c 2b 40 fe vmovups -0x1c0(%rbx,%rbp,1),%ymm1
56b0: ff ff
56b2: c5 fc 10 94 2b 60 fe vmovups -0x1a0(%rbx,%rbp,1),%ymm2
56b9: ff ff
56bb: c5 fc 10 9c 2b 80 fe vmovups -0x180(%rbx,%rbp,1),%ymm3
56c2: ff ff
56c4: c5 fc 11 84 2f 20 fe vmovups %ymm0,-0x1e0(%rdi,%rbp,1)
56cb: ff ff
56cd: c5 fc 11 8c 2f 40 fe vmovups %ymm1,-0x1c0(%rdi,%rbp,1)
56d4: ff ff
56d6: c5 fc 11 94 2f 60 fe vmovups %ymm2,-0x1a0(%rdi,%rbp,1)
56dd: ff ff
56df: c5 fc 11 9c 2f 80 fe vmovups %ymm3,-0x180(%rdi,%rbp,1)
56e6: ff ff
56e8: c5 fc 10 84 2b a0 fe vmovups -0x160(%rbx,%rbp,1),%ymm0
56ef: ff ff
56f1: c5 fc 10 8c 2b c0 fe vmovups -0x140(%rbx,%rbp,1),%ymm1
56f8: ff ff
56fa: c5 fc 10 94 2b e0 fe vmovups -0x120(%rbx,%rbp,1),%ymm2
5701: ff ff
5703: c5 fc 10 9c 2b 00 ff vmovups -0x100(%rbx,%rbp,1),%ymm3
570a: ff ff
570c: c5 fc 11 84 2f a0 fe vmovups %ymm0,-0x160(%rdi,%rbp,1)
5713: ff ff
5715: c5 fc 11 8c 2f c0 fe vmovups %ymm1,-0x140(%rdi,%rbp,1)
571c: ff ff
571e: c5 fc 11 94 2f e0 fe vmovups %ymm2,-0x120(%rdi,%rbp,1)
5725: ff ff
5727: c5 fc 11 9c 2f 00 ff vmovups %ymm3,-0x100(%rdi,%rbp,1)
572e: ff ff
5730: c5 fc 10 84 2b 20 ff vmovups -0xe0(%rbx,%rbp,1),%ymm0
5737: ff ff
5739: c5 fc 10 8c 2b 40 ff vmovups -0xc0(%rbx,%rbp,1),%ymm1
5740: ff ff
5742: c5 fc 10 94 2b 60 ff vmovups -0xa0(%rbx,%rbp,1),%ymm2
5749: ff ff
574b: c5 fc 10 5c 2b 80 vmovups -0x80(%rbx,%rbp,1),%ymm3
5751: c5 fc 11 84 2f 20 ff vmovups %ymm0,-0xe0(%rdi,%rbp,1)
5758: ff ff
575a: c5 fc 11 8c 2f 40 ff vmovups %ymm1,-0xc0(%rdi,%rbp,1)
5761: ff ff
5763: c5 fc 11 94 2f 60 ff vmovups %ymm2,-0xa0(%rdi,%rbp,1)
576a: ff ff
576c: c5 fc 11 5c 2f 80 vmovups %ymm3,-0x80(%rdi,%rbp,1)
5772: c5 fe 6f 44 2b a0 vmovdqu -0x60(%rbx,%rbp,1),%ymm0
5778: c5 fc 10 4c 2b c0 vmovups -0x40(%rbx,%rbp,1),%ymm1
577e: c5 fc 10 54 2b e0 vmovups -0x20(%rbx,%rbp,1),%ymm2
5784: c5 fc 10 1c 2b vmovups (%rbx,%rbp,1),%ymm3
5789: c5 fe 7f 44 2f a0 vmovdqu %ymm0,-0x60(%rdi,%rbp,1)
578f: c5 fc 11 4c 2f c0 vmovups %ymm1,-0x40(%rdi,%rbp,1)
5795: c5 fc 11 54 2f e0 vmovups %ymm2,-0x20(%rdi,%rbp,1)
579b: c5 fc 11 1c 2f vmovups %ymm3,(%rdi,%rbp,1)
57a0: 48 81 c5 00 02 00 00 add $0x200,%rbp
57a7: 48 83 c0 04 add $0x4,%rax
57ab: 0f 85 ef fe ff ff jne 56a0 <ryu64+0x8f0>
diff --git a/src/ryu32.zig b/src/ryu32.zig
index 176a3a8..cfafad4 100644
--- a/src/ryu32.zig
+++ b/src/ryu32.zig
@@ -317,15 +317,20 @@ fn decimalToBuffer(v: Decimal32, sign: bool, result: []u8) usize {
const c0 = (c % 100) << 1;
const c1 = (c / 100) << 1;
- std.mem.copy(u8, result[index + olength - i - 1 ..], DIGIT_TABLE[c0 .. c0 + 2]);
- std.mem.copy(u8, result[index + olength - i - 3 ..], DIGIT_TABLE[c1 .. c1 + 2]);
+ result[index + olength - i - 1 + 0] = DIGIT_TABLE[c0 + 0];
+ result[index + olength - i - 1 + 1] = DIGIT_TABLE[c0 + 1];
+ result[index + olength - i - 3 + 0] = DIGIT_TABLE[c1 + 0];
+ result[index + olength - i - 3 + 1] = DIGIT_TABLE[c1 + 1];
+
i += 4;
}
if (output >= 100) {
const c = (output % 100) << 1;
output /= 100;
- std.mem.copy(u8, result[index + olength - i - 1 ..], DIGIT_TABLE[c .. c + 2]);
+ result[index + olength - i - 1 + 0] = DIGIT_TABLE[c + 0];
+ result[index + olength - i - 1 + 1] = DIGIT_TABLE[c + 1];
+
i += 2;
}
if (output >= 10) {
@@ -357,7 +362,9 @@ fn decimalToBuffer(v: Decimal32, sign: bool, result: []u8) usize {
var expu = @intCast(usize, exp);
if (exp >= 10) {
- std.mem.copy(u8, result[index..], DIGIT_TABLE[2 * expu .. 2 * expu + 2]);
+ result[index + 0] = DIGIT_TABLE[2 * expu + 0];
+ result[index + 1] = DIGIT_TABLE[2 * expu + 1];
+
index += 2;
} else {
result[index] = @intCast(u8, '0' + expu);
Average & Stddev Ryu
32: 26.908 5.560
64: 30.465 5.866
Average & Stddev Ryu
32: 28.352 5.948
64: 36.163 6.815
I'm not sure if this is a failed upstream LLVM optimization or we aren't emitting good enough information from Zig. I expect this would have a fair bit of performance improvement in many tight code spots since std.mem.copy is recommended for any size slice and I would expect it to produce as good assembly as the hand-unrolled for statically known sizes.
I've ported ryu to zig but noticed slightly slower performance than expected. This issue will refer to https://github.com/tiehuis/zig-ryu/tree/724f66d81cf598d53df8eca2a93bf2eb8c7cdb1d.
First, the benchmarks of that tree (best of 5,
build.sh in repo changed to use clang++). C Reference
Zig Reference
I checked the disassembly for the Zig benchmark and noticed that the memcpy LLVM emits (avx on my machine) is being inlined when copying from a fixed digit table. The interesting thing here is that these
std.mem.copy invocations are only two bytes in length and should be statically known by the compiler. For example, the following expands to the assembly
Now, we can easily see this is non-optimal as if we replace the two-byte
std.mem.copy with explicit byte assignments with the following patch (likewise for ryu64.zig), we get much better performance. C Reference
Zig with explicit copies
I'm not sure if this is a failed upstream LLVM optimization or we aren't emitting good enough information from Zig. I expect this would have a fair bit of performance improvement in many tight code spots since
std.mem.copy is recommended for any size slice, and I would expect it to produce as good assembly as the hand-unrolled version for statically known sizes.