New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
util: compare raw and encoded keys without explicit decoding #5613
Changes from 5 commits
a72cde0
3944db5
1953617
ac24c25
d8820fe
fd5784c
29dc972
4215ba4
5741c61
83d0e43
04819fb
a064fde
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -182,6 +182,15 @@ impl Key { | |
ts_encoded_key[..user_key_len] == user_key[..] | ||
} | ||
} | ||
|
||
/// Returns whether the encoded key is encoded from `raw_key`. | ||
/// | ||
/// # Panics | ||
/// | ||
/// Panics if `self` is not a valid encoded key. | ||
pub fn is_encoded_from(&self, raw_key: &[u8]) -> bool { | ||
bytes::is_encoded_from(&self.0, raw_key, false) | ||
} | ||
} | ||
|
||
impl Clone for Key { | ||
|
@@ -274,4 +283,52 @@ mod tests { | |
assert_eq!(false, eq(b"axcdefghijk87654321", b"abcdefghijk")); | ||
assert_eq!(false, eq(b"abcdeffhijk87654321", b"abcdefghijk")); | ||
} | ||
|
||
#[test] | ||
fn test_is_encoded_from() { | ||
for raw_len in 0..=24 { | ||
let raw: Vec<u8> = (0..raw_len).collect(); | ||
let encoded = Key::from_raw(&raw); | ||
assert!(encoded.is_encoded_from(&raw)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should test negative cases too. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added. PTAL again Tests are deliberately repeated in tikv_util::codec and keys, in case we change the implementation in keys to components/codec. |
||
|
||
let encoded_len = encoded.as_encoded().len(); | ||
|
||
// Should fail if we modify one byte in raw | ||
for i in 0..raw.len() { | ||
let mut invalid_raw = raw.clone(); | ||
invalid_raw[i] = raw[i].wrapping_add(1); | ||
assert!(!encoded.is_encoded_from(&invalid_raw)); | ||
} | ||
|
||
// Should fail if we modify one byte in encoded | ||
for i in 0..encoded_len { | ||
let mut invalid_encoded = encoded.clone(); | ||
invalid_encoded.0[i] = encoded.0[i].wrapping_add(1); | ||
assert!(!invalid_encoded.is_encoded_from(&raw)); | ||
} | ||
|
||
// Should panic if encoded length is not a multiple of 9 | ||
let res = panic_hook::recover_safe(|| { | ||
let mut invalid_encoded = encoded.clone(); | ||
invalid_encoded.0.pop(); | ||
invalid_encoded.is_encoded_from(&raw) | ||
}); | ||
assert!(res.is_err()); | ||
|
||
// Should panic if encoded has less or more chunks | ||
let shorter_encoded = Key::from_encoded_slice(&encoded.0[..encoded_len - 9]); | ||
assert!(!shorter_encoded.is_encoded_from(&raw)); | ||
let mut longer_encoded = encoded.as_encoded().clone(); | ||
longer_encoded.extend(&[0, 0, 0, 0, 0, 0, 0, 0, 0xFF]); | ||
let longer_encoded = Key::from_encoded(longer_encoded); | ||
assert!(!longer_encoded.is_encoded_from(&raw)); | ||
|
||
// Should fail if raw is longer or shorter | ||
let shorter_raw = &raw[..raw.len() - 1]; | ||
assert!(!encoded.is_encoded_from(shorter_raw)); | ||
let mut longer_raw = raw.to_vec(); | ||
longer_raw.push(0); | ||
assert!(!encoded.is_encoded_from(&longer_raw)); | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -278,6 +278,57 @@ pub fn decode_bytes_in_place(data: &mut Vec<u8>, desc: bool) -> Result<()> { | |
} | ||
} | ||
|
||
/// Returns whether `encoded` bytes is encoded from `raw`. | ||
/// | ||
/// # Panics | ||
/// | ||
/// Panics if `encoded` is not valid | ||
pub fn is_encoded_from(encoded: &[u8], raw: &[u8], desc: bool) -> bool { | ||
let check_single_chunk = |encoded: &[u8], raw: &[u8]| { | ||
let len = raw.len(); | ||
let pad = (ENC_GROUP_SIZE - len) as u8; | ||
if desc { | ||
encoded[..len] | ||
.iter() | ||
.zip(raw) | ||
.all(|(&enc, &raw)| enc == !raw) | ||
&& encoded[len..encoded.len() - 1].iter().all(|&v| v == 0xff) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ENC_MARKER |
||
&& encoded[ENC_GROUP_SIZE] == !(ENC_MARKER - pad) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How about comparing it first? |
||
} else { | ||
&encoded[..len] == raw | ||
&& encoded[len..encoded.len() - 1].iter().all(|&v| v == 0) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. !ENC_MARKER |
||
&& encoded[ENC_GROUP_SIZE] == (ENC_MARKER - pad) | ||
} | ||
}; | ||
|
||
let mut rev_encoded_chunks = encoded.rchunks_exact(ENC_GROUP_SIZE + 1); | ||
// Valid encoded bytes must has complete chunks | ||
assert!(rev_encoded_chunks.remainder().is_empty()); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it would be better to accept invalid bytes but also return false. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Encoded bytes are all generated and used inside TiKV. Therefore, it's either a bug or data corruption if encoded bytes are invalid. So I choose to panic here. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. However as a utility function it should not hold that knowledge. You can't prevent this utility function not to be used elsewhere as a common way to compare some user passed-in bytes or TiDB given bytes. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. makes sense. |
||
|
||
// Bytes are compared in reverse order because in TiKV, if two keys are different, the last | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually this is a TiDB feature instead of TiKV. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, yes...Anyway, I still think it's all right to depend on this feature. |
||
// a few bytes are more likely to be different. | ||
|
||
let raw_chunks = raw.chunks_exact(ENC_GROUP_SIZE); | ||
// Check the last chunk first | ||
match rev_encoded_chunks.next() { | ||
Some(encoded_chunk) if check_single_chunk(encoded_chunk, raw_chunks.remainder()) => {} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what will happen if There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Then the encoded chunk must be |
||
_ => return false, | ||
} | ||
|
||
// The count of the remaining chunks must be the same. Using `size_hint` here is both safe and | ||
// efficient because chunk iterators implement trait `TrustedLen`. | ||
if rev_encoded_chunks.size_hint() != raw_chunks.size_hint() { | ||
return false; | ||
} | ||
|
||
for (encoded_chunk, raw_chunk) in rev_encoded_chunks.zip(raw_chunks.rev()) { | ||
if !check_single_chunk(encoded_chunk, raw_chunk) { | ||
return false; | ||
} | ||
} | ||
true | ||
} | ||
|
||
#[cfg(test)] | ||
mod tests { | ||
use super::*; | ||
|
@@ -392,6 +443,93 @@ mod tests { | |
} | ||
} | ||
|
||
#[test] | ||
fn test_is_encoded_from() { | ||
for raw_len in 0..=24 { | ||
let raw: Vec<u8> = (1..=raw_len).collect(); | ||
for &desc in &[true, false] { | ||
let encoded = encode_order_bytes(&raw, desc); | ||
assert!( | ||
is_encoded_from(&encoded, &raw, desc), | ||
"Encoded: {:?}, Raw: {:?}, desc: {}", | ||
encoded, | ||
raw, | ||
desc | ||
); | ||
|
||
// Should fail if we modify one byte in raw | ||
for i in 0..raw.len() { | ||
let mut invalid_raw = raw.clone(); | ||
invalid_raw[i] = raw[i].wrapping_add(1); | ||
assert!( | ||
!is_encoded_from(&encoded, &invalid_raw, desc), | ||
"Encoded: {:?}, Raw: {:?}, desc: {}", | ||
encoded, | ||
invalid_raw, | ||
desc | ||
); | ||
} | ||
|
||
// Should fail if we modify one byte in encoded | ||
for i in 0..encoded.len() { | ||
let mut invalid_encoded = encoded.clone(); | ||
invalid_encoded[i] = encoded[i].wrapping_add(1); | ||
assert!( | ||
!is_encoded_from(&invalid_encoded, &raw, desc), | ||
"Encoded: {:?}, Raw: {:?}, desc: {}", | ||
invalid_encoded, | ||
raw, | ||
desc | ||
); | ||
} | ||
|
||
// Should panic if encoded length is not a multiple of 9 | ||
let res = panic_hook::recover_safe(|| { | ||
is_encoded_from(&encoded[..encoded.len() - 1], &raw, desc) | ||
}); | ||
assert!(res.is_err()); | ||
|
||
// Should panic if encoded has less or more chunks | ||
youjiali1995 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
let shorter_encoded = &encoded[..encoded.len() - ENC_GROUP_SIZE - 1]; | ||
assert!( | ||
!is_encoded_from(shorter_encoded, &raw, desc), | ||
"Encoded: {:?}, Raw: {:?}, desc: {}", | ||
shorter_encoded, | ||
raw, | ||
desc | ||
); | ||
let mut longer_encoded = encoded.clone(); | ||
longer_encoded.extend(&[0, 0, 0, 0, 0, 0, 0, 0, 0xFF]); | ||
assert!( | ||
!is_encoded_from(&longer_encoded, &raw, desc), | ||
"Encoded: {:?}, Raw: {:?}, desc: {}", | ||
longer_encoded, | ||
raw, | ||
desc | ||
); | ||
|
||
// Should fail if raw is longer or shorter | ||
let shorter_raw = &raw[..raw.len() - 1]; | ||
assert!( | ||
!is_encoded_from(&encoded, shorter_raw, desc), | ||
"Encoded: {:?}, Raw: {:?}, desc: {}", | ||
encoded, | ||
shorter_raw, | ||
desc | ||
); | ||
let mut longer_raw = raw.to_vec(); | ||
longer_raw.push(0); | ||
assert!( | ||
!is_encoded_from(&encoded, &longer_raw, desc), | ||
"Encoded: {:?}, Raw: {:?}, desc: {}", | ||
encoded, | ||
longer_raw, | ||
desc | ||
); | ||
} | ||
} | ||
} | ||
|
||
#[test] | ||
fn test_encode_bytes_compare() { | ||
let pairs: Vec<(&[u8], &[u8], _)> = vec![ | ||
|
@@ -524,4 +662,22 @@ mod tests { | |
decode_bytes_in_place(&mut encoded, false).unwrap(); | ||
}); | ||
} | ||
|
||
#[bench] | ||
fn bench_is_encoded_from(b: &mut Bencher) { | ||
let key = [b'x'; 10000]; | ||
let encoded = encode_bytes(&key); | ||
b.iter(|| { | ||
assert!(is_encoded_from(&encoded, &key, false)); | ||
}); | ||
} | ||
|
||
#[bench] | ||
fn bench_is_encoded_from_small(b: &mut Bencher) { | ||
let key = [b'x'; 30]; | ||
let encoded = encode_bytes(&key); | ||
b.iter(|| { | ||
assert!(is_encoded_from(&encoded, &key, false)); | ||
}); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Seems it never panic.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed. r? @breeswish