Skip to content

Commit

Permalink
perf: optimize parallel batch size (thrpt +95% on large input)
Browse files (browse the repository at this point in the history)
  • Loading branch information
uhmarcel committed Nov 22, 2022
1 parent 0050529 commit cf14f61
Show file tree
Hide file tree
Showing 7 changed files with 224 additions and 229 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "rbase64"
version = "1.4.1"
version = "1.4.2"
edition = "2021"
description = "A simple base64 encoder / decoder CLI tool made in Rust"
authors = ["Marcel Riera <marcel.riera@outlook.com>"]
Expand Down
367 changes: 182 additions & 185 deletions benches/baseline.md

Large diffs are not rendered by default.

11 changes: 5 additions & 6 deletions src/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,17 @@ pub const ENCODE_MAP: &[u8; 64] =
b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
pub const DECODE_MAP: &[u8; 256] = &construct_decode_map();

pub const SIX_BIT_MASK: u64 = 0x3f;
pub const BYTE_MASK: u64 = 0xff;
pub const INVALID_BYTE: u8 = 0x40;

pub const ENC_CHUNK_SIZE: usize = 2;
pub const DEC_CHUNK_SIZE: usize = 2;

pub const ENC_U128_OFFSET: usize = (ENC_CHUNK_SIZE * 3 - 1) * 8;
pub const DEC_U64_OFFSET: usize = (DEC_CHUNK_SIZE * 4 - 1) * 6;
pub const SIX_BIT_MASK: u64 = 0x3f;
pub const BYTE_MASK: u64 = 0xff;
pub const INVALID_BYTE: u8 = 0x40;

#[cfg(feature = "parallel")]
pub const PARALLEL_THRESHOLD_BYTES: usize = 2 << 16; // 128 KiB
#[cfg(feature = "parallel")]
pub const PARALLEL_BATCH_SIZE: usize = 256;

const fn construct_decode_map() -> [u8; 256] {
let mut map = [INVALID_BYTE; 256];
Expand Down
35 changes: 17 additions & 18 deletions src/decode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,44 +3,42 @@ use std::iter::zip;

#[inline(always)]
#[cfg(not(feature = "parallel"))]
pub(crate) fn decode_u64_chunks(input: &[u8], buffer: &mut [u8], total_chunks: usize) {
decode_u64_chunks_sync(input, buffer, total_chunks);
pub(crate) fn decode_u64_chunks(input: &[u8], buffer: &mut [u8]) {
decode_u64_chunks_sync(input, buffer);
}

#[inline(always)]
#[cfg(feature = "parallel")]
pub(crate) fn decode_u64_chunks(input: &[u8], buffer: &mut [u8], total_chunks: usize) {
pub(crate) fn decode_u64_chunks(input: &[u8], buffer: &mut [u8]) {
if input.len() < PARALLEL_THRESHOLD_BYTES {
decode_u64_chunks_sync(input, buffer, total_chunks);
decode_u64_chunks_sync(input, buffer);
} else {
decode_u64_chunks_parallel(input, buffer, total_chunks);
decode_u64_chunks_parallel(input, buffer);
};
}

#[inline(always)]
fn decode_u64_chunks_sync(input: &[u8], buffer: &mut [u8], total_chunks: usize) {
fn decode_u64_chunks_sync(input: &[u8], buffer: &mut [u8]) {
let in_chunks = input.chunks_exact(DEC_CHUNK_SIZE * 4);
let out_chunks = buffer.chunks_exact_mut(DEC_CHUNK_SIZE * 3);

for (in_chunk, out_chunk) in zip(in_chunks, out_chunks).take(total_chunks) {
for (in_chunk, out_chunk) in zip(in_chunks, out_chunks) {
decode_u64(in_chunk, out_chunk);
}
}

#[inline(always)]
#[cfg(feature = "parallel")]
fn decode_u64_chunks_parallel(input: &[u8], buffer: &mut [u8], total_chunks: usize) {
fn decode_u64_chunks_parallel(input: &[u8], buffer: &mut [u8]) {
use rayon::prelude::*;

let in_chunks = input.par_chunks_exact(DEC_CHUNK_SIZE * 4);
let out_chunks = buffer.par_chunks_exact_mut(DEC_CHUNK_SIZE * 3);
let batch_size = PARALLEL_BATCH_SIZE * DEC_CHUNK_SIZE;
let in_batch = input.par_chunks(batch_size * 4);
let out_batch = buffer.par_chunks_mut(batch_size * 3);

in_chunks
.zip(out_chunks)
.take(total_chunks)
.for_each(|(in_chunk, out_chunk)| {
decode_u64(in_chunk, out_chunk);
});
in_batch.zip(out_batch).for_each(|(in_chunk, out_chunk)| {
decode_u64_chunks_sync(in_chunk, out_chunk);
});
}

#[inline(always)]
Expand All @@ -67,13 +65,14 @@ pub(crate) fn decode_u64_remainder(input: &[u8], buffer: &mut [u8]) -> usize {
#[inline(always)]
fn decode_u64(input: &[u8], buffer: &mut [u8]) {
let mut in_u64 = 0u64;
let offset = (DEC_CHUNK_SIZE * 4 - 1) * 6;

input.iter().enumerate().for_each(|(i, in_byte)| {
in_u64 |= (decode_byte(*in_byte) as u64) << (DEC_U64_OFFSET - 6 * i + 2) as u64;
in_u64 |= (decode_byte(*in_byte) as u64) << (2 + offset - (6 * i)) as u64;
});

buffer.iter_mut().enumerate().for_each(|(i, out_byte)| {
*out_byte = ((in_u64 >> (DEC_U64_OFFSET - (8 * i))) & BYTE_MASK) as u8;
*out_byte = ((in_u64 >> (offset - (i * 8))) & BYTE_MASK) as u8;
});
}

Expand Down
18 changes: 11 additions & 7 deletions src/encode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,15 @@ fn encode_u64_chunks_sync(input: &[u8], buffer: &mut [u8]) {
fn encode_u64_chunks_parallel(input: &[u8], buffer: &mut [u8]) {
use rayon::prelude::*;

let in_chunks = input.par_chunks_exact(ENC_CHUNK_SIZE * 3);
let out_chunks = buffer.par_chunks_exact_mut(ENC_CHUNK_SIZE * 4);

in_chunks.zip(out_chunks).for_each(|(in_chunk, out_chunk)| {
encode_u64(in_chunk, out_chunk);
});
let batch_size = PARALLEL_BATCH_SIZE * ENC_CHUNK_SIZE;
let in_batches = input.par_chunks(batch_size * 3);
let out_batches = buffer.par_chunks_mut(batch_size * 4);

in_batches
.zip(out_batches)
.for_each(|(in_batch, out_batch)| {
encode_u64_chunks_sync(in_batch, out_batch);
});
}

#[inline(always)]
Expand Down Expand Up @@ -68,9 +71,10 @@ pub(crate) fn encode_u64_remainder(input: &[u8], buffer: &mut [u8]) -> usize {
#[inline(always)]
fn encode_u64(input: &[u8], buffer: &mut [u8]) {
let in_u64 = read_u64_partial(input);
let offset = (ENC_CHUNK_SIZE * 3 - 1) * 8;

buffer.iter_mut().enumerate().for_each(|(i, out_b)| {
*out_b = encode_byte(((in_u64 >> (2 + ENC_U128_OFFSET - 6 * i)) & SIX_BIT_MASK) as u8);
*out_b = encode_byte(((in_u64 >> (2 + offset - (i * 6))) & SIX_BIT_MASK) as u8);
});
}

Expand Down
18 changes: 7 additions & 11 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,21 +23,17 @@ pub fn encode(input: &[u8]) -> String {

pub fn decode(encoded: &str) -> Vec<u8> {
let input = encoded.as_bytes();
let mut buffer = vec![0; ((encoded.len() + 3) / 4) * 3];
let mut buffer = vec![0; ((input.len() + 3) / 4) * 3];

let total_chunks = input
.len()
.saturating_sub(DEC_CHUNK_SIZE)
.saturating_div(DEC_CHUNK_SIZE * 4);
let total_chunks = input.len().saturating_sub(2) / (DEC_CHUNK_SIZE * 4);
let in_limit = total_chunks * DEC_CHUNK_SIZE * 4;
let out_limit = total_chunks * DEC_CHUNK_SIZE * 3;

decode::decode_u64_chunks(input, &mut buffer, total_chunks);
decode::decode_u64_chunks(&input[..in_limit], &mut buffer);

let bytes_rem = decode::decode_u64_remainder(
&input[DEC_CHUNK_SIZE * total_chunks * 4..],
&mut buffer[DEC_CHUNK_SIZE * total_chunks * 3..],
);
let bytes_rem = decode::decode_u64_remainder(&input[in_limit..], &mut buffer[out_limit..]);

buffer.truncate(3 * DEC_CHUNK_SIZE * total_chunks + bytes_rem);
buffer.truncate(out_limit + bytes_rem);
buffer
}

Expand Down

0 comments on commit cf14f61

Please sign in to comment.