Skip to content

non-optimal mem.copy on small buffers #1329

@tiehuis

Description

@tiehuis

I've ported ryu to zig but noticed slightly slower performance than expected. This issue will refer to https://github.com/tiehuis/zig-ryu/tree/724f66d81cf598d53df8eca2a93bf2eb8c7cdb1d.

First, the benchmarks of that tree (best of 5, build.sh in repo changed to use clang++).

C Reference

    Average & Stddev Ryu
32:   26.908    5.560
64:   30.465    5.866

Zig Reference

    Average & Stddev Ryu
32:   32.423    6.206
64:   42.308    7.505

I checked the disassembly for the Zig benchmark and noticed that the memcpy LLVM emits (AVX on my machine) is being inlined when copying from a fixed digit table. The interesting thing here is that these std.mem.copy invocations copy only two bytes each, and that length should be statically known by the compiler.

For example, the following call expands to the assembly below:

std.mem.copy(u8, result[index + olength - i - 1 ..], DIGIT_TABLE[c0 .. c0 + 2]);
   56a0:       c5 fc 10 84 2b 20 fe    vmovups -0x1e0(%rbx,%rbp,1),%ymm0
    56a7:       ff ff 
    56a9:       c5 fc 10 8c 2b 40 fe    vmovups -0x1c0(%rbx,%rbp,1),%ymm1
    56b0:       ff ff 
    56b2:       c5 fc 10 94 2b 60 fe    vmovups -0x1a0(%rbx,%rbp,1),%ymm2
    56b9:       ff ff 
    56bb:       c5 fc 10 9c 2b 80 fe    vmovups -0x180(%rbx,%rbp,1),%ymm3
    56c2:       ff ff 
    56c4:       c5 fc 11 84 2f 20 fe    vmovups %ymm0,-0x1e0(%rdi,%rbp,1)
    56cb:       ff ff 
    56cd:       c5 fc 11 8c 2f 40 fe    vmovups %ymm1,-0x1c0(%rdi,%rbp,1)
    56d4:       ff ff 
    56d6:       c5 fc 11 94 2f 60 fe    vmovups %ymm2,-0x1a0(%rdi,%rbp,1)
    56dd:       ff ff 
    56df:       c5 fc 11 9c 2f 80 fe    vmovups %ymm3,-0x180(%rdi,%rbp,1)
    56e6:       ff ff 
    56e8:       c5 fc 10 84 2b a0 fe    vmovups -0x160(%rbx,%rbp,1),%ymm0
    56ef:       ff ff 
    56f1:       c5 fc 10 8c 2b c0 fe    vmovups -0x140(%rbx,%rbp,1),%ymm1
    56f8:       ff ff 
    56fa:       c5 fc 10 94 2b e0 fe    vmovups -0x120(%rbx,%rbp,1),%ymm2
    5701:       ff ff 
    5703:       c5 fc 10 9c 2b 00 ff    vmovups -0x100(%rbx,%rbp,1),%ymm3
    570a:       ff ff 
    570c:       c5 fc 11 84 2f a0 fe    vmovups %ymm0,-0x160(%rdi,%rbp,1)
    5713:       ff ff 
    5715:       c5 fc 11 8c 2f c0 fe    vmovups %ymm1,-0x140(%rdi,%rbp,1)
    571c:       ff ff 
    571e:       c5 fc 11 94 2f e0 fe    vmovups %ymm2,-0x120(%rdi,%rbp,1)
    5725:       ff ff 
    5727:       c5 fc 11 9c 2f 00 ff    vmovups %ymm3,-0x100(%rdi,%rbp,1)
    572e:       ff ff 
    5730:       c5 fc 10 84 2b 20 ff    vmovups -0xe0(%rbx,%rbp,1),%ymm0
    5737:       ff ff 
    5739:       c5 fc 10 8c 2b 40 ff    vmovups -0xc0(%rbx,%rbp,1),%ymm1
    5740:       ff ff 
    5742:       c5 fc 10 94 2b 60 ff    vmovups -0xa0(%rbx,%rbp,1),%ymm2
    5749:       ff ff 
    574b:       c5 fc 10 5c 2b 80       vmovups -0x80(%rbx,%rbp,1),%ymm3
    5751:       c5 fc 11 84 2f 20 ff    vmovups %ymm0,-0xe0(%rdi,%rbp,1)
    5758:       ff ff 
    575a:       c5 fc 11 8c 2f 40 ff    vmovups %ymm1,-0xc0(%rdi,%rbp,1)
    5761:       ff ff 
    5763:       c5 fc 11 94 2f 60 ff    vmovups %ymm2,-0xa0(%rdi,%rbp,1)
    576a:       ff ff 
    576c:       c5 fc 11 5c 2f 80       vmovups %ymm3,-0x80(%rdi,%rbp,1)
    5772:       c5 fe 6f 44 2b a0       vmovdqu -0x60(%rbx,%rbp,1),%ymm0
    5778:       c5 fc 10 4c 2b c0       vmovups -0x40(%rbx,%rbp,1),%ymm1
    577e:       c5 fc 10 54 2b e0       vmovups -0x20(%rbx,%rbp,1),%ymm2
    5784:       c5 fc 10 1c 2b          vmovups (%rbx,%rbp,1),%ymm3
    5789:       c5 fe 7f 44 2f a0       vmovdqu %ymm0,-0x60(%rdi,%rbp,1)
    578f:       c5 fc 11 4c 2f c0       vmovups %ymm1,-0x40(%rdi,%rbp,1)
    5795:       c5 fc 11 54 2f e0       vmovups %ymm2,-0x20(%rdi,%rbp,1)
    579b:       c5 fc 11 1c 2f          vmovups %ymm3,(%rdi,%rbp,1)
    57a0:       48 81 c5 00 02 00 00    add    $0x200,%rbp
    57a7:       48 83 c0 04             add    $0x4,%rax
    57ab:       0f 85 ef fe ff ff       jne    56a0 <ryu64+0x8f0>

Now, we can easily see this is non-optimal. If we replace the two-byte std.mem.copy calls with explicit byte assignments using the following patch (and likewise for ryu64.zig), we get much better performance:

diff --git a/src/ryu32.zig b/src/ryu32.zig
index 176a3a8..cfafad4 100644
--- a/src/ryu32.zig
+++ b/src/ryu32.zig
@@ -317,15 +317,20 @@ fn decimalToBuffer(v: Decimal32, sign: bool, result: []u8) usize {
         const c0 = (c % 100) << 1;
         const c1 = (c / 100) << 1;
 
-        std.mem.copy(u8, result[index + olength - i - 1 ..], DIGIT_TABLE[c0 .. c0 + 2]);
-        std.mem.copy(u8, result[index + olength - i - 3 ..], DIGIT_TABLE[c1 .. c1 + 2]);
+        result[index + olength - i - 1 + 0] = DIGIT_TABLE[c0 + 0];
+        result[index + olength - i - 1 + 1] = DIGIT_TABLE[c0 + 1];
+        result[index + olength - i - 3 + 0] = DIGIT_TABLE[c1 + 0];
+        result[index + olength - i - 3 + 1] = DIGIT_TABLE[c1 + 1];
+
         i += 4;
     }
     if (output >= 100) {
         const c = (output % 100) << 1;
         output /= 100;
 
-        std.mem.copy(u8, result[index + olength - i - 1 ..], DIGIT_TABLE[c .. c + 2]);
+        result[index + olength - i - 1 + 0] = DIGIT_TABLE[c + 0];
+        result[index + olength - i - 1 + 1] = DIGIT_TABLE[c + 1];
+
         i += 2;
     }
     if (output >= 10) {
@@ -357,7 +362,9 @@ fn decimalToBuffer(v: Decimal32, sign: bool, result: []u8) usize {
     var expu = @intCast(usize, exp);
 
     if (exp >= 10) {
-        std.mem.copy(u8, result[index..], DIGIT_TABLE[2 * expu .. 2 * expu + 2]);
+        result[index + 0] = DIGIT_TABLE[2 * expu + 0];
+        result[index + 1] = DIGIT_TABLE[2 * expu + 1];
+
         index += 2;
     } else {
         result[index] = @intCast(u8, '0' + expu);

C Reference

    Average & Stddev Ryu
32:   26.908    5.560
64:   30.465    5.866

Zig with explicit copies

    Average & Stddev Ryu
32:   28.352    5.948
64:   36.163    6.815

I'm not sure whether this is a failed upstream LLVM optimization or whether we aren't emitting good enough information from Zig. I expect fixing this would yield a fair bit of performance improvement in many tight code spots, since std.mem.copy is recommended for slices of any size, and I would expect it to produce assembly as good as the hand-unrolled version when the copy length is statically known.

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions