Skip to content

Commit

Permalink
perf(encode): refactor to use chunks exact (thrpt ~4%)
Browse files Browse the repository at this point in the history
  • Loading branch information
uhmarcel committed Nov 20, 2022
1 parent 9eae60d commit 597988e
Show file tree
Hide file tree
Showing 2 changed files with 131 additions and 127 deletions.
224 changes: 116 additions & 108 deletions benches/baseline.md
Original file line number Diff line number Diff line change
@@ -1,164 +1,172 @@
# Profiling Report
```diff

encode/3 time: [33.538 ns 33.626 ns 33.727 ns]
thrpt: [84.830 MiB/s 85.084 MiB/s 85.308 MiB/s]
encode/3 time: [33.819 ns 33.847 ns 33.874 ns]
thrpt: [84.460 MiB/s 84.528 MiB/s 84.597 MiB/s]
change:
time: [-0.8900% -0.6116% -0.2926%] (p = 0.00 < 0.05)
thrpt: [+0.2935% +0.6154% +0.8980%]
time: [+0.4183% +0.6705% +0.9191%] (p = 0.00 < 0.05)
thrpt: [-0.9107% -0.6660% -0.4166%]
Change within noise threshold.
Found 2 outliers among 100 measurements (2.00%)
2 (2.00%) high mild
Found 16 outliers among 100 measurements (16.00%)
4 (4.00%) low severe
2 (2.00%) low mild
7 (7.00%) high mild
3 (3.00%) high severe

encode/50 time: [58.352 ns 58.490 ns 58.637 ns]
thrpt: [813.20 MiB/s 815.24 MiB/s 817.17 MiB/s]
encode/50 time: [46.523 ns 46.592 ns 46.660 ns]
thrpt: [1021.9 MiB/s 1023.4 MiB/s 1.0009 GiB/s]
change:
time: [-1.1272% -0.8774% -0.6427%] (p = 0.00 < 0.05)
thrpt: [+0.6468% +0.8851% +1.1400%]
Change within noise threshold.
Found 3 outliers among 100 measurements (3.00%)
2 (2.00%) high mild
1 (1.00%) high severe
+ time: [-20.575% -20.383% -20.196%] (p = 0.00 < 0.05)
+ thrpt: [+25.307% +25.601% +25.905%]
+ Performance has improved.
Found 2 outliers among 100 measurements (2.00%)
2 (2.00%) low mild

encode/100 time: [74.960 ns 75.106 ns 75.275 ns]
thrpt: [1.2372 GiB/s 1.2400 GiB/s 1.2424 GiB/s]
encode/100 time: [65.344 ns 65.477 ns 65.636 ns]
thrpt: [1.4189 GiB/s 1.4224 GiB/s 1.4253 GiB/s]
change:
time: [-0.9910% -0.7815% -0.5546%] (p = 0.00 < 0.05)
thrpt: [+0.5577% +0.7877% +1.0009%]
Change within noise threshold.
+ time: [-13.333% -13.047% -12.780%] (p = 0.00 < 0.05)
+ thrpt: [+14.653% +15.005% +15.384%]
+ Performance has improved.
Found 5 outliers among 100 measurements (5.00%)
5 (5.00%) high mild

encode/500 time: [219.73 ns 220.25 ns 220.75 ns]
thrpt: [2.1094 GiB/s 2.1142 GiB/s 2.1192 GiB/s]
encode/500 time: [220.04 ns 221.31 ns 222.76 ns]
thrpt: [2.0904 GiB/s 2.1041 GiB/s 2.1163 GiB/s]
change:
+ time: [-2.2033% -1.9038% -1.6263%] (p = 0.00 < 0.05)
+ thrpt: [+1.6531% +1.9407% +2.2529%]
+ time: [-3.1167% -2.5568% -1.9598%] (p = 0.00 < 0.05)
+ thrpt: [+1.9990% +2.6238% +3.2170%]
+ Performance has improved.
Found 4 outliers among 100 measurements (4.00%)
2 (2.00%) high mild
2 (2.00%) high severe
Found 20 outliers among 100 measurements (20.00%)
9 (9.00%) low mild
7 (7.00%) high mild
4 (4.00%) high severe

encode/3072 time: [1.0716 µs 1.0761 µs 1.0824 µs]
thrpt: [2.6432 GiB/s 2.6586 GiB/s 2.6700 GiB/s]
encode/3072 time: [1.0243 µs 1.0266 µs 1.0289 µs]
thrpt: [2.7808 GiB/s 2.7869 GiB/s 2.7930 GiB/s]
change:
time: [-1.0267% -0.7870% -0.4710%] (p = 0.00 < 0.05)
thrpt: [+0.4733% +0.7932% +1.0373%]
Change within noise threshold.
Found 2 outliers among 100 measurements (2.00%)
1 (1.00%) high mild
1 (1.00%) high severe
+ time: [-4.1981% -3.9159% -3.6448%] (p = 0.00 < 0.05)
+ thrpt: [+3.7827% +4.0755% +4.3821%]
+ Performance has improved.
Found 5 outliers among 100 measurements (5.00%)
5 (5.00%) high mild

encode/1048576 time: [343.10 µs 344.10 µs 345.03 µs]
thrpt: [2.8304 GiB/s 2.8380 GiB/s 2.8463 GiB/s]
encode/1048576 time: [339.25 µs 339.75 µs 340.27 µs]
thrpt: [2.8699 GiB/s 2.8743 GiB/s 2.8786 GiB/s]
change:
time: [-1.0872% -0.7844% -0.4280%] (p = 0.00 < 0.05)
thrpt: [+0.4299% +0.7906% +1.0991%]
Change within noise threshold.
Found 3 outliers among 100 measurements (3.00%)
+ time: [-2.2247% -1.7897% -1.4088%] (p = 0.00 < 0.05)
+ thrpt: [+1.4289% +1.8223% +2.2753%]
+ Performance has improved.
Found 15 outliers among 100 measurements (15.00%)
2 (2.00%) low severe
8 (8.00%) low mild
3 (3.00%) high mild
2 (2.00%) high severe

encode/5242880 time: [1.9936 ms 1.9986 ms 2.0038 ms]
thrpt: [2.4368 GiB/s 2.4431 GiB/s 2.4492 GiB/s]
encode/5242880 time: [1.9869 ms 1.9930 ms 2.0006 ms]
thrpt: [2.4406 GiB/s 2.4500 GiB/s 2.4575 GiB/s]
change:
time: [-1.4849% -1.1372% -0.8000%] (p = 0.00 < 0.05)
thrpt: [+0.8065% +1.1502% +1.5073%]
time: [-1.0978% -0.6490% -0.1858%] (p = 0.01 < 0.05)
thrpt: [+0.1862% +0.6532% +1.1100%]
Change within noise threshold.
Found 4 outliers among 100 measurements (4.00%)
2 (2.00%) high mild
2 (2.00%) high severe

encode/10485760 time: [4.0803 ms 4.0869 ms 4.0941 ms]
thrpt: [2.3853 GiB/s 2.3895 GiB/s 2.3934 GiB/s]
encode/10485760 time: [4.1619 ms 4.1677 ms 4.1734 ms]
thrpt: [2.3399 GiB/s 2.3432 GiB/s 2.3464 GiB/s]
change:
+ time: [-1.5443% -1.2906% -1.0378%] (p = 0.00 < 0.05)
+ thrpt: [+1.0487% +1.3075% +1.5685%]
+ Performance has improved.
Found 2 outliers among 100 measurements (2.00%)
2 (2.00%) high mild
time: [+0.8847% +1.1634% +1.4275%] (p = 0.00 < 0.05)
thrpt: [-1.4074% -1.1500% -0.8769%]
Change within noise threshold.



decode/3 time: [30.665 ns 30.689 ns 30.712 ns]
thrpt: [93.158 MiB/s 93.228 MiB/s 93.299 MiB/s]
decode/3 time: [30.721 ns 30.739 ns 30.758 ns]
thrpt: [93.016 MiB/s 93.076 MiB/s 93.129 MiB/s]
change:
time: [-1.0012% -0.8312% -0.6729%] (p = 0.00 < 0.05)
thrpt: [+0.6775% +0.8382% +1.0113%]
time: [-0.7527% -0.5785% -0.3978%] (p = 0.00 < 0.05)
thrpt: [+0.3994% +0.5819% +0.7584%]
Change within noise threshold.
Found 21 outliers among 100 measurements (21.00%)
7 (7.00%) low severe
3 (3.00%) high mild
11 (11.00%) high severe
Found 9 outliers among 100 measurements (9.00%)
2 (2.00%) high mild
7 (7.00%) high severe

decode/50 time: [45.478 ns 45.507 ns 45.539 ns]
thrpt: [1.0226 GiB/s 1.0233 GiB/s 1.0239 GiB/s]
decode/50 time: [45.579 ns 45.609 ns 45.640 ns]
thrpt: [1.0203 GiB/s 1.0210 GiB/s 1.0217 GiB/s]
change:
+ time: [-5.4341% -5.3153% -5.1984%] (p = 0.00 < 0.05)
+ thrpt: [+5.4834% +5.6136% +5.7463%]
+ time: [-5.8084% -5.3462% -5.0350%] (p = 0.00 < 0.05)
+ thrpt: [+5.3019% +5.6482% +6.1666%]
+ Performance has improved.
Found 18 outliers among 100 measurements (18.00%)
3 (3.00%) low severe
2 (2.00%) low mild
13 (13.00%) high severe
Found 6 outliers among 100 measurements (6.00%)
1 (1.00%) low severe
1 (1.00%) low mild
1 (1.00%) high mild
3 (3.00%) high severe

decode/100 time: [60.366 ns 60.488 ns 60.623 ns]
thrpt: [1.5363 GiB/s 1.5397 GiB/s 1.5428 GiB/s]
decode/100 time: [60.543 ns 60.745 ns 61.003 ns]
thrpt: [1.5267 GiB/s 1.5332 GiB/s 1.5383 GiB/s]
change:
+ time: [-6.5171% -6.3238% -6.1317%] (p = 0.00 < 0.05)
+ thrpt: [+6.5323% +6.7507% +6.9714%]
+ time: [-7.4295% -6.9413% -6.4610%] (p = 0.00 < 0.05)
+ thrpt: [+6.9073% +7.4590% +8.0258%]
+ Performance has improved.
Found 2 outliers among 100 measurements (2.00%)
1 (1.00%) low mild
1 (1.00%) high mild
Found 17 outliers among 100 measurements (17.00%)
7 (7.00%) high mild
10 (10.00%) high severe

decode/500 time: [204.10 ns 204.35 ns 204.64 ns]
thrpt: [2.2756 GiB/s 2.2787 GiB/s 2.2816 GiB/s]
decode/500 time: [205.07 ns 205.34 ns 205.62 ns]
thrpt: [2.2647 GiB/s 2.2677 GiB/s 2.2707 GiB/s]
change:
+ time: [-10.938% -10.654% -10.369%] (p = 0.00 < 0.05)
+ thrpt: [+11.568% +11.925% +12.281%]
+ time: [-10.905% -10.754% -10.600%] (p = 0.00 < 0.05)
+ thrpt: [+11.857% +12.050% +12.240%]
+ Performance has improved.
Found 10 outliers among 100 measurements (10.00%)
1 (1.00%) low severe
4 (4.00%) low mild
4 (4.00%) high mild
Found 6 outliers among 100 measurements (6.00%)
3 (3.00%) low mild
2 (2.00%) high mild
1 (1.00%) high severe

decode/3072 time: [992.64 ns 994.64 ns 996.86 ns]
thrpt: [2.8700 GiB/s 2.8764 GiB/s 2.8822 GiB/s]
decode/3072 time: [988.01 ns 989.24 ns 990.89 ns]
thrpt: [2.8873 GiB/s 2.8921 GiB/s 2.8957 GiB/s]
change:
+ time: [-13.163% -12.973% -12.772%] (p = 0.00 < 0.05)
+ thrpt: [+14.642% +14.906% +15.158%]
+ time: [-15.152% -14.993% -14.827%] (p = 0.00 < 0.05)
+ thrpt: [+17.409% +17.638% +17.857%]
+ Performance has improved.
Found 6 outliers among 100 measurements (6.00%)
4 (4.00%) high mild
2 (2.00%) high severe
Found 18 outliers among 100 measurements (18.00%)
2 (2.00%) low severe
4 (4.00%) low mild
6 (6.00%) high mild
6 (6.00%) high severe

decode/1048576 time: [321.77 µs 322.06 µs 322.40 µs]
thrpt: [3.0291 GiB/s 3.0322 GiB/s 3.0350 GiB/s]
decode/1048576 time: [322.66 µs 323.88 µs 326.30 µs]
thrpt: [2.9928 GiB/s 3.0152 GiB/s 3.0266 GiB/s]
change:
+ time: [-12.295% -12.111% -11.926%] (p = 0.00 < 0.05)
+ thrpt: [+13.540% +13.780% +14.019%]
+ time: [-13.349% -12.933% -12.471%] (p = 0.00 < 0.05)
+ thrpt: [+14.248% +14.854% +15.406%]
+ Performance has improved.
Found 10 outliers among 100 measurements (10.00%)
Found 12 outliers among 100 measurements (12.00%)
1 (1.00%) low severe
1 (1.00%) low mild
5 (5.00%) high mild
3 (3.00%) high severe
5 (5.00%) high severe

decode/5242880 time: [1.7669 ms 1.7690 ms 1.7712 ms]
thrpt: [2.7567 GiB/s 2.7602 GiB/s 2.7634 GiB/s]
decode/5242880 time: [1.7719 ms 1.7778 ms 1.7868 ms]
thrpt: [2.7327 GiB/s 2.7465 GiB/s 2.7556 GiB/s]
change:
+ time: [-11.926% -11.800% -11.674%] (p = 0.00 < 0.05)
+ thrpt: [+13.216% +13.378% +13.541%]
+ time: [-12.407% -12.070% -11.644%] (p = 0.00 < 0.05)
+ thrpt: [+13.178% +13.727% +14.164%]
+ Performance has improved.
Found 13 outliers among 100 measurements (13.00%)
10 (10.00%) high mild
3 (3.00%) high severe
Found 10 outliers among 100 measurements (10.00%)
1 (1.00%) high mild
9 (9.00%) high severe

decode/10485760 time: [3.5955 ms 3.6021 ms 3.6094 ms]
thrpt: [2.7056 GiB/s 2.7111 GiB/s 2.7161 GiB/s]
decode/10485760 time: [3.5972 ms 3.6043 ms 3.6119 ms]
thrpt: [2.7038 GiB/s 2.7095 GiB/s 2.7148 GiB/s]
change:
+ time: [-11.655% -11.439% -11.217%] (p = 0.00 < 0.05)
+ thrpt: [+12.634% +12.917% +13.193%]
+ time: [-12.009% -11.800% -11.586%] (p = 0.00 < 0.05)
+ thrpt: [+13.104% +13.379% +13.648%]
+ Performance has improved.
Found 3 outliers among 100 measurements (3.00%)
2 (2.00%) high mild
1 (1.00%) high severe
3 (3.00%) high mild



Expand Down
34 changes: 15 additions & 19 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,22 +13,23 @@ const DECODE_CHUNK_SIZE: usize = 2;

pub fn encode(bytes: &[u8]) -> String {
let mut buffer = vec![0; ((bytes.len() / 3) + 1) * 4];
let mut in_index = 0;
let mut out_index = 0;

while in_index < bytes.len().saturating_sub(ENCODE_CHUNK_SIZE * 4) {
let in_u128 = read_u128(bytes, in_index);
let chunk = &mut buffer[out_index..out_index + (ENCODE_CHUNK_SIZE * 4)];
let offset = 8 * (ENCODE_CHUNK_SIZE * 4 - 1) + 2;
let total_chunks = bytes.len() / (ENCODE_CHUNK_SIZE * 3);
let in_chunks = bytes.chunks_exact(ENCODE_CHUNK_SIZE * 3);
let out_chunks = buffer.chunks_exact_mut(ENCODE_CHUNK_SIZE * 4);
let offset = 8 * (ENCODE_CHUNK_SIZE * 3 - 1);

for (i, item) in chunk.iter_mut().enumerate() {
*item = encode_byte(((in_u128 >> (offset - 6 * i)) & SIX_BIT_MASK) as u8);
for (in_chunk, out_chunk) in zip(in_chunks, out_chunks) {
let in_u128 = read_u128_partial(in_chunk);

for (i, out_byte) in out_chunk.iter_mut().enumerate() {
*out_byte = encode_byte(((in_u128 >> (2 + offset - 6 * i)) & SIX_BIT_MASK) as u8);
}
out_index += ENCODE_CHUNK_SIZE * 4;
in_index += ENCODE_CHUNK_SIZE * 3;
}

let acc = read_u128_partial(bytes, in_index);
let in_index = ENCODE_CHUNK_SIZE * total_chunks * 3;
let mut out_index = ENCODE_CHUNK_SIZE * total_chunks * 4;
let acc = read_u128_partial(&bytes[in_index..]);
let mut acc_bits = 8 * (bytes.len() - in_index);

while acc_bits >= 6 {
Expand Down Expand Up @@ -118,16 +119,11 @@ fn decode_byte(byte: u8) -> u8 {
}

#[inline(always)]
fn read_u128(bytes: &[u8], from: usize) -> u128 {
u128::from_be_bytes(bytes[from..from + 16].try_into().unwrap())
}

#[inline(always)]
fn read_u128_partial(bytes: &[u8], from: usize) -> u128 {
let size = min(bytes.len() - from, 16);
fn read_u128_partial(bytes: &[u8]) -> u128 {
let size = min(bytes.len(), 16);
let mut buffer = [0u8; 16];

buffer[16 - size..].copy_from_slice(&bytes[from..from + size]);
buffer[16 - size..].copy_from_slice(&bytes[..size]);

u128::from_be_bytes(buffer)
}
Expand Down

0 comments on commit 597988e

Please sign in to comment.