From fee9d283d476449d367fcdfd866c6b72ebf8d6ac Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Wed, 23 Nov 2022 12:03:52 +0100 Subject: [PATCH 01/99] starting work on typed dictionaries --- src/structure/mod.rs | 1 + src/structure/pfc.rs | 3 +- src/structure/tfc.rs | 133 +++++++++++++++++++++++++++++++++++++++++ src/structure/vbyte.rs | 64 ++++++++++++++++++-- 4 files changed, 194 insertions(+), 7 deletions(-) create mode 100644 src/structure/tfc.rs diff --git a/src/structure/mod.rs b/src/structure/mod.rs index 08353122..78906419 100644 --- a/src/structure/mod.rs +++ b/src/structure/mod.rs @@ -9,6 +9,7 @@ pub mod bititer; pub mod logarray; //pub mod mapped_dict; pub mod pfc; +pub mod tfc; pub mod util; pub mod vbyte; pub mod wavelettree; diff --git a/src/structure/pfc.rs b/src/structure/pfc.rs index d49ee008..892f4bae 100644 --- a/src/structure/pfc.rs +++ b/src/structure/pfc.rs @@ -805,8 +805,7 @@ impl Decoder for PfcDecoder { false => { // This is in the middle of some block. we expect a vbyte followed by some 0-delimited cstring let last = self.last.as_ref().unwrap(); - let (prefix_len, vbyte_len) = vbyte::decode(&bytes).expect("expected vbyte"); - bytes.advance(vbyte_len); + let (prefix_len, vbyte_len) = vbyte::decode_buf(bytes).expect("expected vbyte"); let b = bytes.split_to(pos - vbyte_len); bytes.advance(1); let mut full = BytesMut::with_capacity(prefix_len as usize + b.len()); diff --git a/src/structure/tfc.rs b/src/structure/tfc.rs new file mode 100644 index 00000000..8f934246 --- /dev/null +++ b/src/structure/tfc.rs @@ -0,0 +1,133 @@ +use bytes::{Bytes, Buf, BytesMut, BufMut}; + +use crate::structure::{vbyte::{self,encode_array}, util::find_common_prefix}; + +const BLOCK_SIZE: usize = 8; + +pub struct TfcBlock { + data: Bytes +} + +#[derive(Debug, PartialEq)] +pub struct TfcBlockHeader { + size: u8, + sizes: [u64;BLOCK_SIZE], + shareds: [u64;BLOCK_SIZE-1] +} + +#[derive(Debug)] +pub enum TfcError { + InvalidCoding, + NotEnoughData, +} + +impl From for TfcError { + fn from(e: vbyte::DecodeError) -> Self { + match e { + vbyte::DecodeError::UnexpectedEndOfBuffer => Self::NotEnoughData, + _ => Self::InvalidCoding + } + } +} + +impl TfcBlockHeader { + fn parse(buf: &mut B) -> Result { + let size = buf.get_u8(); + let mut sizes = [0;BLOCK_SIZE]; + let mut shareds = [0;BLOCK_SIZE-1]; + + + let (first_size, _) = vbyte::decode_buf(buf)?; + sizes[0] = first_size; + + for i in 0..(size-1) as usize { + let (shared, _) = vbyte::decode_buf(buf)?; + let (size, _) = vbyte::decode_buf(buf)?; + + sizes[i+1] = size; + shareds[i] = shared; + } + + Ok(Self { + size, + sizes, + shareds + }) + } +} + +pub struct TfcBlockBuilder { +} + +fn build_block_unchecked(buf: &mut B, slices: &[&[u8]]) { + let slices_len = slices.len(); + debug_assert!(slices_len <= BLOCK_SIZE && slices_len != 0); + buf.put_u8(slices_len as u8); + + let first = slices[0]; + let (vbyte, vbyte_len) = encode_array(first.len() as u64); + buf.put_slice(&vbyte[..vbyte_len]); + + let mut last = first; + + let mut suffixes: Vec<&[u8]> = Vec::with_capacity(slices.len()); + suffixes.push(last); + for i in 1..slices.len() { + let cur = slices[i]; + let common_prefix = find_common_prefix(last, cur); + let (vbyte, vbyte_len) = encode_array(common_prefix as u64); + buf.put_slice(&vbyte[..vbyte_len]); + + let suffix_len = cur.len() - common_prefix; + let (vbyte, vbyte_len) = encode_array(suffix_len as u64); + buf.put_slice(&vbyte[..vbyte_len]); + suffixes.push(&cur[common_prefix..]); + last = cur; + } + + for suffix in suffixes { + 
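+        // the first element of `suffixes` is the complete head string; the
+        // rest have had their prefix shared with the previous entry stripped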
buf.put_slice(suffix); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use bytes::Buf; + #[test] + fn blah() { + let slice = b"asdfasfd"; + let mut argh = slice as &[u8]; + let first = argh.get_u8(); + let second = argh.get_u8(); + + panic!("{} {} {:?}", first, second, argh); + } + + #[test] + fn build_and_parse_block() { + let strings: [&[u8];5] = [ + b"aaaaaa", + b"aabb", + b"cccc", + b"cdef", + b"cdff" + ]; + + let mut buf = BytesMut::new(); + build_block_unchecked(&mut buf, &strings); + let mut bytes: Bytes = buf.freeze(); + + let header = TfcBlockHeader::parse(&mut bytes).unwrap(); + + let expected = TfcBlockHeader { + size: 5, + sizes: [6, 2, 4, 3, 2, 0, 0, 0], + shareds: [2, 0, 1, 2, 0, 0, 0] + }; + + assert_eq!(expected, header); + + assert_eq!(b"aaaaaabbccccdefff", &bytes[..]); + } +} diff --git a/src/structure/vbyte.rs b/src/structure/vbyte.rs index fe261403..5ba74e5f 100644 --- a/src/structure/vbyte.rs +++ b/src/structure/vbyte.rs @@ -17,6 +17,10 @@ use futures::io; use tokio::io::{AsyncWrite, AsyncWriteExt}; +use std::io::Write; + +use bytes::Buf; + /// The maximum number of bytes required for any `u64` in a variable-byte encoding. pub const MAX_ENCODING_LEN: usize = 10; @@ -74,19 +78,40 @@ fn max_byte_too_large(shift: u32, byte: u8) -> bool { /// This function expects the encoded value to start at the beginning of the slice; and the slice /// must be large enough to include all of the encoded bytes of one value. Decoding stops at the /// end of the encoded value, so it doesn't matter if the slice is longer. -pub fn decode(buf: &[u8]) -> Result<(u64, usize), DecodeError> { +pub fn decode(mut buf: &[u8]) -> Result<(u64, usize), DecodeError> { + decode_buf(&mut buf) +} + +/// Decodes a `u64` from a variable-byte-encoded slice. +/// +/// On success, this function returns `Ok` with the decoded value and encoding length. Otherwise, +/// the slice data is invalid, and the function returns `Err` with the corresponding `DecodeError` +/// giving the reason. +/// +/// This function expects the encoded value to start at the beginning of the slice; and the slice +/// must be large enough to include all of the encoded bytes of one value. Decoding stops at the +/// end of the encoded value, so it doesn't matter if the slice is longer. +pub fn decode_buf(buf: &mut B) -> Result<(u64, usize), DecodeError> { // This will be the decoded result. let mut num: u64 = 0; // This is how many bits we shift `num` by on each iteration in increments of 7. let mut shift: u32 = 0; // Loop through each 8-bit byte value with its index. - for (i, &b) in buf.iter().enumerate() { + let mut count = 0; + loop { + if !buf.has_remaining() { + return Err(DecodeError::UnexpectedEndOfBuffer); + } + + let b = buf.get_u8(); + count += 1; + if is_last_encoded_byte(b) { return if max_byte_too_large(shift, b) { Err(DecodeError::EncodedValueTooLarge) } else { // Return the result (clearing the msb) and the encoding length. - Ok((num | ((clear_msb(b) as u64) << shift), i + 1)) + Ok((num | ((clear_msb(b) as u64) << shift), count)) }; } // This is not the last byte. Update the result. @@ -100,8 +125,6 @@ pub fn decode(buf: &[u8]) -> Result<(u64, usize), DecodeError> { return Err(DecodeError::UnexpectedEncodingLen); } } - // We have reached the end of the buffer without encountering the last encoded byte. - Err(DecodeError::UnexpectedEndOfBuffer) } /// Returns `true` if more than 7 bits remain to be encoded. 
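
A quick round-trip sketch of how the new `decode_buf` relates to `decode` (not part of the
patch; it assumes only that the `bytes` crate implements `Buf` for `&[u8]`, which it does):

    let (buf, len) = vbyte::encode_array(624485);
    assert_eq!((624485, len), vbyte::decode(&buf[..len]).unwrap());

    // decode_buf consumes from any Buf, advancing it past the encoded value
    let mut cursor: &[u8] = &buf[..len];
    let (val, _) = vbyte::decode_buf(&mut cursor).unwrap();
    assert_eq!(624485, val);
    assert!(cursor.is_empty());
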
@@ -163,6 +186,37 @@ pub fn encode_vec(num: u64) -> Vec<u8> {
     vec
 }
 
+/// Encodes a `u64` with a variable-byte encoding in an array.
+///
+/// The array is always length 10. Additionally, the actual size of the vbyte is returned.
+pub fn encode_array(num: u64) -> ([u8;10],usize) {
+    // Use a fixed-size array large enough for any encoded `u64`.
+    let mut buf = [0;10];
+    // Safety: `buf` is `MAX_ENCODING_LEN` bytes, enough for the encoded bytes of any `num`.
+    let size = unsafe { encode_unchecked(&mut buf, num) };
+    (buf, size)
+}
+
+/*
+pub fn encode_into_writer<W: Write>(writer: &mut W, mut num: u64) -> std::io::Result<usize> {
+    let mut i = 0;
+    // Loop through all 7-bit strings of the number.
+    while more_than_7bits_remain(num) {
+        // This is not the last encoded byte.
+        let b = clear_msb(num as u8);
+        writer.write_u8(b)?;
+        // Get the next 7 bits.
+        num >>= 7;
+        i+=1;
+    }
+    // This is the last encoded byte.
+    let b = set_msb(num as u8);
+    // Return the encoding length.
+    writer.write_u8(b)?;
+    Ok(i + 1)
+}
+*/
+
 /// Encodes a `u64` with a variable-byte encoding in a `Vec` and writes that `Vec` to the
 /// destination `dest` in a future.
 pub async fn write_async<A: AsyncWrite + Unpin>(dest: &mut A, num: u64) -> io::Result<usize>

From fde3b53c8b28e3f0bfb240000e9106a41784eb91 Mon Sep 17 00:00:00 2001
From: Matthijs van Otterdijk
Date: Wed, 23 Nov 2022 16:22:24 +0100
Subject: [PATCH 02/99] tfc block entry retrieval

---
 src/structure/tfc.rs   | 204 ++++++++++++++++++++++++++++++++---------
 src/structure/vbyte.rs |   8 +-
 2 files changed, 164 insertions(+), 48 deletions(-)

diff --git a/src/structure/tfc.rs b/src/structure/tfc.rs
index 8f934246..36c00148 100644
--- a/src/structure/tfc.rs
+++ b/src/structure/tfc.rs
@@ -1,65 +1,151 @@
-use bytes::{Bytes, Buf, BytesMut, BufMut};
+use bytes::{Buf, BufMut, Bytes, BytesMut};
 
-use crate::structure::{vbyte::{self,encode_array}, util::find_common_prefix};
+use crate::structure::{
+    util::find_common_prefix,
+    vbyte::{self, encode_array},
+};
 
 const BLOCK_SIZE: usize = 8;
 
-pub struct TfcBlock {
-    data: Bytes
-}
-
-#[derive(Debug, PartialEq)]
-pub struct TfcBlockHeader {
-    size: u8,
-    sizes: [u64;BLOCK_SIZE],
-    shareds: [u64;BLOCK_SIZE-1]
-}
-
 #[derive(Debug)]
 pub enum TfcError {
     InvalidCoding,
     NotEnoughData,
 }
 
+#[derive(Debug, PartialEq)]
+pub struct TfcBlockHeader {
+    num_entries: u8,
+    buffer_length: usize,
+    sizes: [usize; BLOCK_SIZE],
+    shareds: [usize; BLOCK_SIZE - 1],
+}
+
 impl From<vbyte::DecodeError> for TfcError {
     fn from(e: vbyte::DecodeError) -> Self {
         match e {
             vbyte::DecodeError::UnexpectedEndOfBuffer => Self::NotEnoughData,
-            _ => Self::InvalidCoding
+            _ => Self::InvalidCoding,
         }
     }
 }
 
 impl TfcBlockHeader {
-    fn parse<B:Buf>(buf: &mut B) -> Result<Self,TfcError> {
-        let size = buf.get_u8();
-        let mut sizes = [0;BLOCK_SIZE];
-        let mut shareds = [0;BLOCK_SIZE-1];
-
+    fn parse<B: Buf>(buf: &mut B) -> Result<Self, TfcError> {
+        let num_entries = buf.get_u8();
+        let mut sizes = [0_usize; BLOCK_SIZE];
+        let mut shareds = [0_usize; BLOCK_SIZE - 1];
 
         let (first_size, _) = vbyte::decode_buf(buf)?;
-        sizes[0] = first_size;
+        sizes[0] = first_size as usize;
 
-        for i in 0..(size-1) as usize {
+        for i in 0..(num_entries - 1) as usize {
             let (shared, _) = vbyte::decode_buf(buf)?;
             let (size, _) = vbyte::decode_buf(buf)?;
 
-            sizes[i+1] = size;
-            shareds[i] = shared;
+            sizes[i + 1] = size as usize;
+            shareds[i] = shared as usize;
         }
 
+        let buffer_length = sizes.iter().sum();
+
         Ok(Self {
-            size,
+            num_entries,
+            buffer_length,
             sizes,
-            shareds
+            shareds,
         })
     }
 }
 
-pub struct TfcBlockBuilder {
+#[derive(Debug)]
+pub struct TfcEntry<'a>(Vec<&'a [u8]>);
+
+impl<'a> TfcEntry<'a> {
+    fn as_vec(&self) -> Vec<u8> {
+        let mut v =
Vec::with_capacity(self.0.iter().map(|s| s.len()).sum()); + + for slice in self.0.iter() { + v.extend_from_slice(slice); + } + + v + } } -fn build_block_unchecked(buf: &mut B, slices: &[&[u8]]) { +pub struct TfcBlock { + header: TfcBlockHeader, + data: Bytes, +} + +impl TfcBlock { + pub fn parse(bytes: &mut Bytes) -> Result { + let header = TfcBlockHeader::parse(bytes)?; + if bytes.remaining() < header.buffer_length { + return Err(TfcError::NotEnoughData); + } + + let data = bytes.split_to(header.buffer_length); + + Ok(Self { header, data }) + } + + pub fn is_incomplete(&self) -> bool { + self.header.num_entries != BLOCK_SIZE as u8 + } + + pub fn entry(&self, index: usize) -> TfcEntry { + if index == 0 { + return TfcEntry(vec![&self.data[..self.header.sizes[0]]]); + } + + let mut v = Vec::with_capacity(7); + let mut last = self.header.shareds[index - 1]; + if last != 0 { + v.push(last); + } + if last != 0 { + for i in (0..index - 1).rev() { + let shared = self.header.shareds[i]; + if shared == 0 { + break; + } + + if shared < last { + v.push(shared); + last = shared; + } else { + v.push(last); + } + } + } + + let start = index - v.len(); + + let mut taken = 0; + let mut slices = Vec::with_capacity(v.len() + 1); + + let mut offset = self.header.sizes.iter().take(start).sum(); + for (ix, shared) in v.iter().rev().enumerate() { + let have_to_take = shared - taken; + let cur_offset = offset; + offset += self.header.sizes[start + ix]; + if have_to_take == 0 { + continue; + } + let slice = &self.data[cur_offset..cur_offset + have_to_take]; + slices.push(slice); + taken += have_to_take; + } + + let suffix_size = self.header.sizes[index]; + slices.push(&self.data[offset..offset + suffix_size]); + + TfcEntry(slices) + } +} + +fn build_block_unchecked(buf: &mut B, slices: &[&[u8]]) { let slices_len = slices.len(); debug_assert!(slices_len <= BLOCK_SIZE && slices_len != 0); buf.put_u8(slices_len as u8); @@ -77,7 +163,7 @@ fn build_block_unchecked(buf: &mut B, slices: &[&[u8]]) { let common_prefix = find_common_prefix(last, cur); let (vbyte, vbyte_len) = encode_array(common_prefix as u64); buf.put_slice(&vbyte[..vbyte_len]); - + let suffix_len = cur.len() - common_prefix; let (vbyte, vbyte_len) = encode_array(suffix_len as u64); buf.put_slice(&vbyte[..vbyte_len]); @@ -104,30 +190,60 @@ mod tests { panic!("{} {} {:?}", first, second, argh); } - #[test] - fn build_and_parse_block() { - let strings: [&[u8];5] = [ - b"aaaaaa", - b"aabb", - b"cccc", - b"cdef", - b"cdff" - ]; - + fn build_incomplete_block(strings: &[&[u8]]) -> TfcBlock { let mut buf = BytesMut::new(); build_block_unchecked(&mut buf, &strings); + let mut bytes: Bytes = buf.freeze(); - let header = TfcBlockHeader::parse(&mut bytes).unwrap(); + TfcBlock::parse(&mut bytes).unwrap() + } + + #[test] + fn build_and_parse_block() { + let strings: [&[u8]; 5] = [b"aaaaaa", b"aabb", b"cccc", b"cdef", b"cdff"]; + + let block = build_incomplete_block(&strings); - let expected = TfcBlockHeader { - size: 5, + let expected_header = TfcBlockHeader { + num_entries: 5, + buffer_length: 17, sizes: [6, 2, 4, 3, 2, 0, 0, 0], - shareds: [2, 0, 1, 2, 0, 0, 0] + shareds: [2, 0, 1, 2, 0, 0, 0], }; - assert_eq!(expected, header); + assert_eq!(expected_header, block.header); + + let expected_bytes = b"aaaaaabbccccdefff"; + assert_eq!(expected_bytes, &block.data[..]); + } + + #[test] + fn entry_in_block() { + let strings: [&[u8]; 5] = [b"aaaaaa", b"aabb", b"cccc", b"cdef", b"cdff"]; + let block = build_incomplete_block(&strings); + + for (ix, string) in 
strings.iter().enumerate() {
+            assert_eq!(*string, &block.entry(ix).as_vec()[..]);
+        }
+    }
+
+    #[test]
+    fn entry_in_complete_block() {
+        let strings: [&[u8]; 8] = [
+            b"aaaaaa",
+            b"aabb",
+            b"cccc",
+            b"cdef",
+            b"cdff",
+            b"cdffasdf",
+            b"cdffeeee",
+            b"ceeeeeeeeeeeeeee",
+        ];
+        let block = build_incomplete_block(&strings);
+
+        for (ix, string) in strings.iter().enumerate() {
+            assert_eq!(*string, &block.entry(ix).as_vec()[..]);
+        }
+    }
 }
diff --git a/src/structure/vbyte.rs b/src/structure/vbyte.rs
index 5ba74e5f..9ad5bce5 100644
--- a/src/structure/vbyte.rs
+++ b/src/structure/vbyte.rs
@@ -91,7 +91,7 @@ pub fn decode(mut buf: &[u8]) -> Result<(u64, usize), DecodeError> {
 /// This function expects the encoded value to start at the beginning of the slice; and the slice
 /// must be large enough to include all of the encoded bytes of one value. Decoding stops at the
 /// end of the encoded value, so it doesn't matter if the slice is longer.
-pub fn decode_buf<B:Buf>(buf: &mut B) -> Result<(u64, usize), DecodeError> {
+pub fn decode_buf<B: Buf>(buf: &mut B) -> Result<(u64, usize), DecodeError> {
     // This will be the decoded result.
     let mut num: u64 = 0;
     // This is how many bits we shift `num` by on each iteration in increments of 7.
@@ -105,7 +105,7 @@ pub fn decode_buf<B:Buf>(buf: &mut B) -> Result<(u64, usize), DecodeError> {
 
         let b = buf.get_u8();
         count += 1;
-        
+
         if is_last_encoded_byte(b) {
@@ -189,9 +189,9 @@ pub fn encode_vec(num: u64) -> Vec<u8> {
 /// Encodes a `u64` with a variable-byte encoding in an array.
 ///
 /// The array is always length 10. Additionally, the actual size of the vbyte is returned.
-pub fn encode_array(num: u64) -> ([u8;10],usize) {
+pub fn encode_array(num: u64) -> ([u8; 10], usize) {
     // Use a fixed-size array large enough for any encoded `u64`.
-    let mut buf = [0;10];
+    let mut buf = [0; 10];
     // Safety: `buf` is `MAX_ENCODING_LEN` bytes, enough for the encoded bytes of any `num`.
let size = unsafe { encode_unchecked(&mut buf, num) }; (buf, size) From c54eddca2a2702d2a534cdf1acb0e02a151919c9 Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Wed, 23 Nov 2022 17:13:44 +0100 Subject: [PATCH 03/99] buf implementation for tfc dict entry --- src/structure/tfc.rs | 159 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 158 insertions(+), 1 deletion(-) diff --git a/src/structure/tfc.rs b/src/structure/tfc.rs index 36c00148..4f874888 100644 --- a/src/structure/tfc.rs +++ b/src/structure/tfc.rs @@ -58,7 +58,7 @@ impl TfcBlockHeader { } } -#[derive(Debug)] +#[derive(Clone, Debug)] pub struct TfcEntry<'a>(Vec<&'a [u8]>); impl<'a> TfcEntry<'a> { @@ -71,6 +71,118 @@ impl<'a> TfcEntry<'a> { v } + + fn as_buf(&self) -> TfcEntryBuf { + TfcEntryBuf { + entry: self, + slice_ix: 0, + pos_in_slice: 0 + } + } + + fn into_buf(self) -> OwnedTfcEntryBuf<'a> { + OwnedTfcEntryBuf { + entry: self, + slice_ix: 0, + pos_in_slice: 0 + } + } + + fn len(&self) -> usize { + self.0.iter().map(|s|s.len()).sum() + } +} + +pub struct TfcEntryBuf<'a>{ + entry: &'a TfcEntry<'a>, + slice_ix: usize, + pos_in_slice: usize +} + +fn calculate_remaining<'a>(entry: &TfcEntry<'a>, slice_ix: usize, pos_in_slice: usize) -> usize { + let total: usize = entry.0.iter().skip(slice_ix).map(|s|s.len()).sum(); + total - pos_in_slice +} + +fn calculate_chunk<'a>(entry: &'a TfcEntry<'a>, slice_ix: usize, pos_in_slice: usize) -> &[u8] { + if slice_ix >= entry.0.len() { + &[] + } + else { + let slice = entry.0[slice_ix]; + &slice[pos_in_slice..] + } +} + +fn calculate_advance<'a>(entry: &'a TfcEntry<'a>, slice_ix: &mut usize, pos_in_slice: &mut usize, mut cnt: usize) { + if *slice_ix < entry.0.len() { + let slice = entry.0[*slice_ix]; + let remaining_in_slice = slice.len() - *pos_in_slice; + + if remaining_in_slice > cnt { + // we remain in the slice we're at. 
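+            // the next chunk() call will then return the rest of this slice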
+ *pos_in_slice += cnt; + } + else { + // we are starting at the next slice + cnt -= remaining_in_slice; + *slice_ix += 1; + + loop { + if entry.0.len() >= *slice_ix { + // past the end + *pos_in_slice = 0; + break; + } + + let slice_len = entry.0[*slice_ix].len(); + + if cnt < slice_len { + // this is our slice + *pos_in_slice = cnt; + break; + } + + // not our slice, so advance to next + cnt -= entry.0.len(); + *slice_ix += 1; + } + } + } +} + +impl<'a> Buf for TfcEntryBuf<'a> { + fn remaining(&self) -> usize { + calculate_remaining(self.entry, self.slice_ix, self.pos_in_slice) + } + + fn chunk(&self) -> &[u8] { + calculate_chunk(self.entry, self.slice_ix, self.pos_in_slice) + } + + fn advance(&mut self, cnt: usize) { + calculate_advance(self.entry, &mut self.slice_ix, &mut self.pos_in_slice, cnt) + } +} + +pub struct OwnedTfcEntryBuf<'a>{ + entry: TfcEntry<'a>, + slice_ix: usize, + pos_in_slice: usize +} + +impl<'a> Buf for OwnedTfcEntryBuf<'a> { + fn remaining(&self) -> usize { + calculate_remaining(&self.entry, self.slice_ix, self.pos_in_slice) + } + + fn chunk(&self) -> &[u8] { + calculate_chunk(&self.entry, self.slice_ix, self.pos_in_slice) + } + + fn advance(&mut self, cnt: usize) { + calculate_advance(&self.entry, &mut self.slice_ix, &mut self.pos_in_slice, cnt) + } } pub struct TfcBlock { @@ -246,4 +358,49 @@ mod tests { assert_eq!(*string, &block.entry(ix).as_vec()[..]); } } + + #[test] + fn entry_buf_in_complete_block() { + let strings: [&[u8]; 8] = [ + b"aaaaaa", + b"aabb", + b"cccc", + b"cdef", + b"cdff", + b"cdffasdf", + b"cdffeeee", + b"ceeeeeeeeeeeeeee", + ]; + let block = build_incomplete_block(&strings); + + for (ix, string) in strings.iter().enumerate() { + let entry = block.entry(ix); + let mut buf = entry.as_buf(); + let len = buf.remaining(); + let bytes = buf.copy_to_bytes(len); + assert_eq!(*string, &bytes[..]); + } + } + + #[test] + fn entry_owned_buf_in_complete_block() { + let strings: [&[u8]; 8] = [ + b"aaaaaa", + b"aabb", + b"cccc", + b"cdef", + b"cdff", + b"cdffasdf", + b"cdffeeee", + b"ceeeeeeeeeeeeeee", + ]; + let block = build_incomplete_block(&strings); + + for (ix, string) in strings.iter().enumerate() { + let mut buf = block.entry(ix).into_buf(); + let len = buf.remaining(); + let bytes = buf.copy_to_bytes(len); + assert_eq!(*string, &bytes[..]); + } + } } From 1da44ba44533fd68b1634725cde6bd4f22a9ea54 Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Wed, 23 Nov 2022 17:14:45 +0100 Subject: [PATCH 04/99] remove dummy test --- src/structure/tfc.rs | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/structure/tfc.rs b/src/structure/tfc.rs index 4f874888..33be3baf 100644 --- a/src/structure/tfc.rs +++ b/src/structure/tfc.rs @@ -292,15 +292,6 @@ fn build_block_unchecked(buf: &mut B, slices: &[&[u8]]) { mod tests { use super::*; use bytes::Buf; - #[test] - fn blah() { - let slice = b"asdfasfd"; - let mut argh = slice as &[u8]; - let first = argh.get_u8(); - let second = argh.get_u8(); - - panic!("{} {} {:?}", first, second, argh); - } fn build_incomplete_block(strings: &[&[u8]]) -> TfcBlock { let mut buf = BytesMut::new(); From 6ad263ee06dcc97ef41cae88d626df3c88529a35 Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Wed, 23 Nov 2022 18:19:01 +0100 Subject: [PATCH 05/99] replicate all pfc comparison logic --- src/structure/tfc.rs | 255 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 220 insertions(+), 35 deletions(-) diff --git a/src/structure/tfc.rs b/src/structure/tfc.rs index 33be3baf..24b7f9ba 100644 --- 
a/src/structure/tfc.rs +++ b/src/structure/tfc.rs @@ -1,3 +1,6 @@ +use std::cmp::Ordering; +use std::hash::{Hash, Hasher}; + use bytes::{Buf, BufMut, Bytes, BytesMut}; use crate::structure::{ @@ -59,11 +62,34 @@ impl TfcBlockHeader { } #[derive(Clone, Debug)] -pub struct TfcEntry<'a>(Vec<&'a [u8]>); +pub struct TfcDictEntry(Vec); + +impl TfcDictEntry { + pub fn new(parts: Vec) -> Self { + Self(parts) + } + + pub fn new_optimized(parts: Vec) -> Self { + let mut entry = Self::new(parts); + entry.optimize(); + + entry + } -impl<'a> TfcEntry<'a> { - fn as_vec(&self) -> Vec { - let mut v = Vec::with_capacity(self.0.iter().map(|s| s.len()).sum()); + fn to_bytes(&self) -> Bytes { + if self.0.len() == 1 { + self.0[0].clone() + } else { + let mut buf = BytesMut::with_capacity(self.len()); + for slice in self.0.iter() { + buf.extend_from_slice(&slice[..]); + } + + buf.freeze() + } + } + fn to_vec(&self) -> Vec { + let mut v = Vec::with_capacity(self.len()); for slice in self.0.iter() { v.extend_from_slice(slice); @@ -76,58 +102,216 @@ impl<'a> TfcEntry<'a> { TfcEntryBuf { entry: self, slice_ix: 0, - pos_in_slice: 0 + pos_in_slice: 0, } } - fn into_buf(self) -> OwnedTfcEntryBuf<'a> { + fn into_buf(self) -> OwnedTfcEntryBuf { OwnedTfcEntryBuf { entry: self, slice_ix: 0, - pos_in_slice: 0 + pos_in_slice: 0, } } fn len(&self) -> usize { - self.0.iter().map(|s|s.len()).sum() + self.0.iter().map(|s| s.len()).sum() + } + + /// optimize size + /// + /// For short strings, a list of pointers may be much less + /// efficient than a copy of the string. This will copy the + /// underlying string if that is the case. + pub fn optimize(&mut self) { + let overhead_size = std::mem::size_of::() * self.0.len(); + + if std::mem::size_of::() + self.len() < overhead_size { + let mut bytes = BytesMut::with_capacity(self.len()); + for part in self.0.iter() { + bytes.extend(part); + } + + self.0 = vec![bytes.freeze()]; + } + } + + pub fn buf_eq(&self, mut b: B) -> bool { + if self.len() != b.remaining() { + false + } else if self.len() == 0 { + true + } else { + let mut it = self.0.iter(); + let mut part = it.next().unwrap(); + loop { + let slice = b.chunk(); + + match part.len().cmp(&slice.len()) { + Ordering::Less => { + if part.as_ref() != &slice[..part.len()] { + return false; + } + } + Ordering::Equal => { + if part != slice { + return false; + } + + assert!(it.next().is_none()); + return true; + } + Ordering::Greater => { + panic!("This should never happen because it'd mean our entry is larger than the buffer passed in, but we already checked to make sure that is not the case"); + } + } + + b.advance(part.len()); + part = it.next().unwrap(); + } + } + } +} + +impl PartialEq for TfcDictEntry { + fn eq(&self, other: &Self) -> bool { + // unequal length, so can't be equal + if self.len() != other.len() { + return false; + } + + self.cmp(other) == Ordering::Equal + } +} + +impl Eq for TfcDictEntry {} + +impl Hash for TfcDictEntry { + fn hash(&self, state: &mut H) { + for part in self.0.iter() { + state.write(part); + } + } +} + +impl Ord for TfcDictEntry { + fn cmp(&self, other: &Self) -> Ordering { + // both are empty, so equal + if self.len() == 0 && other.len() == 0 { + return Ordering::Equal; + } + + let mut it1 = self.0.iter(); + let mut it2 = other.0.iter(); + let mut part1 = it1.next().unwrap().clone(); + let mut part2 = it2.next().unwrap().clone(); + + loop { + match part1.len().cmp(&part2.len()) { + Ordering::Equal => { + match part1.cmp(&part2) { + Ordering::Less => return Ordering::Less, + Ordering::Greater => 
return Ordering::Greater,
+                        Ordering::Equal => {}
+                    }
+
+                    let p1_next = it1.next();
+                    let p2_next = it2.next();
+
+                    if let (Some(p1), Some(p2)) = (p1_next, p2_next) {
+                        part1 = p1.clone();
+                        part2 = p2.clone();
+                    } else if p1_next.is_none() && p2_next.is_none() {
+                        // done! everything has been compared equally and nothing remains.
+                        return Ordering::Equal;
+                    } else if p1_next.is_none() {
+                        // the left side is a prefix of the right side
+
+                        return Ordering::Less;
+                    } else {
+                        return Ordering::Greater;
+                    }
+                }
+                Ordering::Less => {
+                    let part2_slice = part2.slice(0..part1.len());
+                    match part1.cmp(&part2_slice) {
+                        Ordering::Less => return Ordering::Less,
+                        Ordering::Greater => return Ordering::Greater,
+                        Ordering::Equal => {}
+                    }
+
+                    part2 = part2.slice(part1.len()..);
+                    let part1_option = it1.next();
+                    if part1_option.is_none() {
+                        return Ordering::Less;
+                    }
+                    part1 = part1_option.unwrap().clone();
+                }
+                Ordering::Greater => {
+                    let part1_slice = part1.slice(0..part2.len());
+                    match part1_slice.cmp(&part2) {
+                        Ordering::Less => return Ordering::Less,
+                        Ordering::Greater => return Ordering::Greater,
+                        Ordering::Equal => {}
+                    }
+
+                    part1 = part1.slice(part2.len()..);
+                    let part2_option = it2.next();
+                    if part2_option.is_none() {
+                        return Ordering::Greater;
+                    }
+                    part2 = part2_option.unwrap().clone();
+                }
+            }
+        }
+    }
+}
 
-pub struct TfcEntryBuf<'a>{
-    entry: &'a TfcEntry<'a>,
+impl PartialOrd for TfcDictEntry {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+#[derive(Clone)]
+pub struct TfcEntryBuf<'a> {
+    entry: &'a TfcDictEntry,
     slice_ix: usize,
-    pos_in_slice: usize
+    pos_in_slice: usize,
 }
 
-fn calculate_remaining<'a>(entry: &TfcEntry<'a>, slice_ix: usize, pos_in_slice: usize) -> usize {
-    let total: usize = entry.0.iter().skip(slice_ix).map(|s|s.len()).sum();
+fn calculate_remaining<'a>(entry: &TfcDictEntry, slice_ix: usize, pos_in_slice: usize) -> usize {
+    let total: usize = entry.0.iter().skip(slice_ix).map(|s| s.len()).sum();
     total - pos_in_slice
 }
 
-fn calculate_chunk<'a>(entry: &'a TfcEntry<'a>, slice_ix: usize, pos_in_slice: usize) -> &[u8] {
+fn calculate_chunk<'a>(entry: &'a TfcDictEntry, slice_ix: usize, pos_in_slice: usize) -> &[u8] {
     if slice_ix >= entry.0.len() {
         &[]
-    }
-    else {
-        let slice = entry.0[slice_ix];
+    } else {
+        let slice = &entry.0[slice_ix];
         &slice[pos_in_slice..]
     }
 }
 
-fn calculate_advance<'a>(entry: &'a TfcEntry<'a>, slice_ix: &mut usize, pos_in_slice: &mut usize, mut cnt: usize) {
+fn calculate_advance<'a>(
+    entry: &'a TfcDictEntry,
+    slice_ix: &mut usize,
+    pos_in_slice: &mut usize,
+    mut cnt: usize,
+) {
     if *slice_ix < entry.0.len() {
-        let slice = entry.0[*slice_ix];
+        let slice = &entry.0[*slice_ix];
        let remaining_in_slice = slice.len() - *pos_in_slice;
 
         if remaining_in_slice > cnt {
-            // we remain in the slice we're at. 
+            // we remain in the slice we're at.
*pos_in_slice += cnt; - } - else { + } else { // we are starting at the next slice cnt -= remaining_in_slice; *slice_ix += 1; - + loop { if entry.0.len() >= *slice_ix { // past the end @@ -165,13 +349,13 @@ impl<'a> Buf for TfcEntryBuf<'a> { } } -pub struct OwnedTfcEntryBuf<'a>{ - entry: TfcEntry<'a>, +pub struct OwnedTfcEntryBuf { + entry: TfcDictEntry, slice_ix: usize, - pos_in_slice: usize + pos_in_slice: usize, } -impl<'a> Buf for OwnedTfcEntryBuf<'a> { +impl Buf for OwnedTfcEntryBuf { fn remaining(&self) -> usize { calculate_remaining(&self.entry, self.slice_ix, self.pos_in_slice) } @@ -206,9 +390,10 @@ impl TfcBlock { self.header.num_entries != BLOCK_SIZE as u8 } - pub fn entry(&self, index: usize) -> TfcEntry { + pub fn entry(&self, index: usize) -> TfcDictEntry { if index == 0 { - return TfcEntry(vec![&self.data[..self.header.sizes[0]]]); + let b = self.data.slice(..self.header.sizes[0]); + return TfcDictEntry(vec![b]); } let mut v = Vec::with_capacity(7); @@ -237,7 +422,7 @@ impl TfcBlock { let mut taken = 0; let mut slices = Vec::with_capacity(v.len() + 1); - let mut offset = self.header.sizes.iter().take(start).sum(); + let mut offset: usize = self.header.sizes.iter().take(start).sum(); for (ix, shared) in v.iter().rev().enumerate() { let have_to_take = shared - taken; let cur_offset = offset; @@ -245,15 +430,15 @@ impl TfcBlock { if have_to_take == 0 { continue; } - let slice = &self.data[cur_offset..cur_offset + have_to_take]; + let slice = self.data.slice(cur_offset..cur_offset + have_to_take); slices.push(slice); taken += have_to_take; } let suffix_size = self.header.sizes[index]; - slices.push(&self.data[offset..offset + suffix_size]); + slices.push(self.data.slice(offset..offset + suffix_size)); - TfcEntry(slices) + TfcDictEntry(slices) } } @@ -327,7 +512,7 @@ mod tests { let block = build_incomplete_block(&strings); for (ix, string) in strings.iter().enumerate() { - assert_eq!(*string, &block.entry(ix).as_vec()[..]); + assert_eq!(*string, &block.entry(ix).to_vec()[..]); } } @@ -346,7 +531,7 @@ mod tests { let block = build_incomplete_block(&strings); for (ix, string) in strings.iter().enumerate() { - assert_eq!(*string, &block.entry(ix).as_vec()[..]); + assert_eq!(*string, &block.entry(ix).to_vec()[..]); } } From e97e15797d3a477176750223c4ab13e889a7a68e Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Wed, 23 Nov 2022 18:20:26 +0100 Subject: [PATCH 06/99] also optimize dict entries --- src/structure/tfc.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/structure/tfc.rs b/src/structure/tfc.rs index 24b7f9ba..cf8e518d 100644 --- a/src/structure/tfc.rs +++ b/src/structure/tfc.rs @@ -393,7 +393,7 @@ impl TfcBlock { pub fn entry(&self, index: usize) -> TfcDictEntry { if index == 0 { let b = self.data.slice(..self.header.sizes[0]); - return TfcDictEntry(vec![b]); + return TfcDictEntry::new(vec![b]); } let mut v = Vec::with_capacity(7); @@ -438,7 +438,7 @@ impl TfcBlock { let suffix_size = self.header.sizes[index]; slices.push(self.data.slice(offset..offset + suffix_size)); - TfcDictEntry(slices) + TfcDictEntry::new_optimized(slices) } } From 72653d8f25a90d73b095a750a276518275c18997 Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Thu, 24 Nov 2022 10:37:52 +0100 Subject: [PATCH 07/99] move tfc head to the start of the block --- src/structure/tfc.rs | 64 +++++++++++++++++++++++++++++++------------- 1 file changed, 46 insertions(+), 18 deletions(-) diff --git a/src/structure/tfc.rs b/src/structure/tfc.rs index cf8e518d..583e4bdc 
100644 --- a/src/structure/tfc.rs +++ b/src/structure/tfc.rs @@ -18,9 +18,10 @@ pub enum TfcError { #[derive(Debug, PartialEq)] pub struct TfcBlockHeader { + head: Bytes, num_entries: u8, buffer_length: usize, - sizes: [usize; BLOCK_SIZE], + sizes: [usize; BLOCK_SIZE - 1], shareds: [usize; BLOCK_SIZE - 1], } @@ -34,25 +35,27 @@ impl From for TfcError { } impl TfcBlockHeader { - fn parse(buf: &mut B) -> Result { - let num_entries = buf.get_u8(); - let mut sizes = [0_usize; BLOCK_SIZE]; + fn parse(buf: &mut Bytes) -> Result { + let mut sizes = [0_usize; BLOCK_SIZE - 1]; let mut shareds = [0_usize; BLOCK_SIZE - 1]; - let (first_size, _) = vbyte::decode_buf(buf)?; - sizes[0] = first_size as usize; + + let head = buf.split_to(first_size as usize); + + let num_entries = buf.get_u8(); for i in 0..(num_entries - 1) as usize { let (shared, _) = vbyte::decode_buf(buf)?; let (size, _) = vbyte::decode_buf(buf)?; - sizes[i + 1] = size as usize; + sizes[i] = size as usize; shareds[i] = shared as usize; } let buffer_length = sizes.iter().sum(); Ok(Self { + head, num_entries, buffer_length, sizes, @@ -392,8 +395,7 @@ impl TfcBlock { pub fn entry(&self, index: usize) -> TfcDictEntry { if index == 0 { - let b = self.data.slice(..self.header.sizes[0]); - return TfcDictEntry::new(vec![b]); + return TfcDictEntry::new(vec![self.header.head.clone()]); } let mut v = Vec::with_capacity(7); @@ -417,25 +419,45 @@ impl TfcBlock { } } + let start = index - v.len(); let mut taken = 0; let mut slices = Vec::with_capacity(v.len() + 1); - let mut offset: usize = self.header.sizes.iter().take(start).sum(); + let mut offset: usize; + if start == 0 { + offset = 0; + } + else { + offset = self.header.sizes.iter().take(start - 1).sum(); + } for (ix, shared) in v.iter().rev().enumerate() { let have_to_take = shared - taken; let cur_offset = offset; - offset += self.header.sizes[start + ix]; + + if !(ix == 0 && start == 0) { + // the head slice does not contribute to the offset + offset += self.header.sizes[start + ix - 1]; + } + if have_to_take == 0 { continue; } - let slice = self.data.slice(cur_offset..cur_offset + have_to_take); + + let slice; + if ix == 0 && start == 0 { + // the slice has to come out of the header + slice = self.header.head.slice(..have_to_take); + } + else { + slice = self.data.slice(cur_offset..cur_offset + have_to_take); + } slices.push(slice); taken += have_to_take; } - let suffix_size = self.header.sizes[index]; + let suffix_size = self.header.sizes[index-1]; slices.push(self.data.slice(offset..offset + suffix_size)); TfcDictEntry::new_optimized(slices) @@ -445,11 +467,15 @@ impl TfcBlock { fn build_block_unchecked(buf: &mut B, slices: &[&[u8]]) { let slices_len = slices.len(); debug_assert!(slices_len <= BLOCK_SIZE && slices_len != 0); - buf.put_u8(slices_len as u8); let first = slices[0]; let (vbyte, vbyte_len) = encode_array(first.len() as u64); + + // write the head first buf.put_slice(&vbyte[..vbyte_len]); + buf.put_slice(slices[0]); + + buf.put_u8(slices_len as u8); let mut last = first; @@ -468,7 +494,8 @@ fn build_block_unchecked(buf: &mut B, slices: &[&[u8]]) { last = cur; } - for suffix in suffixes { + // write the rest of the slices + for suffix in suffixes.into_iter().skip(1) { buf.put_slice(suffix); } } @@ -494,15 +521,16 @@ mod tests { let block = build_incomplete_block(&strings); let expected_header = TfcBlockHeader { + head: Bytes::copy_from_slice(b"aaaaaa"), num_entries: 5, - buffer_length: 17, - sizes: [6, 2, 4, 3, 2, 0, 0, 0], + buffer_length: 11, + sizes: [2, 4, 3, 2, 0, 0, 0], 
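+            // how many prefix bytes each entry shares with the entry before it
+            // (e.g. b"aabb" shares 2 bytes with the head b"aaaaaa")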
shareds: [2, 0, 1, 2, 0, 0, 0], }; assert_eq!(expected_header, block.header); - let expected_bytes = b"aaaaaabbccccdefff"; + let expected_bytes = b"bbccccdefff"; assert_eq!(expected_bytes, &block.data[..]); } From 169448c6ce05d350f9cdebb0c1b62e8f42467bfe Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Thu, 24 Nov 2022 12:16:00 +0100 Subject: [PATCH 08/99] lookup slice in tfc block --- src/structure/tfc.rs | 163 ++++++++++++++++++++++++++++++++++++++---- src/structure/util.rs | 15 ++++ 2 files changed, 165 insertions(+), 13 deletions(-) diff --git a/src/structure/tfc.rs b/src/structure/tfc.rs index 583e4bdc..647edc95 100644 --- a/src/structure/tfc.rs +++ b/src/structure/tfc.rs @@ -8,6 +8,8 @@ use crate::structure::{ vbyte::{self, encode_array}, }; +use super::util::find_common_prefix_ord; + const BLOCK_SIZE: usize = 8; #[derive(Debug)] @@ -419,7 +421,6 @@ impl TfcBlock { } } - let start = index - v.len(); let mut taken = 0; @@ -428,8 +429,7 @@ impl TfcBlock { let mut offset: usize; if start == 0 { offset = 0; - } - else { + } else { offset = self.header.sizes.iter().take(start - 1).sum(); } for (ix, shared) in v.iter().rev().enumerate() { @@ -449,19 +449,74 @@ impl TfcBlock { if ix == 0 && start == 0 { // the slice has to come out of the header slice = self.header.head.slice(..have_to_take); - } - else { + } else { slice = self.data.slice(cur_offset..cur_offset + have_to_take); } slices.push(slice); taken += have_to_take; } - let suffix_size = self.header.sizes[index-1]; + let suffix_size = self.header.sizes[index - 1]; slices.push(self.data.slice(offset..offset + suffix_size)); TfcDictEntry::new_optimized(slices) } + + fn suffixes<'a>(&'a self) -> impl Iterator + 'a { + let head = Some(self.header.head.clone()); + let mut offset = 0; + let tail = self.header.sizes.iter().map(move |s| { + let slice = self.data.slice(offset..*s + offset); + offset += s; + + slice + }); + + head.into_iter().chain(tail) + } + + pub fn id(&self, slice: &[u8]) -> IdLookupResult { + let (mut common_prefix, ordering) = find_common_prefix_ord(slice, &self.header.head); + match ordering { + Ordering::Equal => return IdLookupResult::Found(0), + Ordering::Less => return IdLookupResult::NotFound, + // We have to traverse the block + Ordering::Greater => {} + } + + for (ix, (shared, suffix)) in self + .header + .shareds + .iter() + .zip(self.suffixes().skip(1)) + .enumerate() + { + if *shared < common_prefix { + return IdLookupResult::Closest(ix as u64); + } else if *shared > common_prefix { + continue; + } + + let (new_common_prefix, ordering) = + find_common_prefix_ord(&slice[common_prefix..], &suffix[..]); + match ordering { + Ordering::Equal => return IdLookupResult::Found(ix as u64 + 1), + Ordering::Less => return IdLookupResult::Closest(ix as u64), + Ordering::Greater => { + common_prefix += new_common_prefix; + } + } + } + + IdLookupResult::Closest(self.header.num_entries as u64 - 1) + } +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum IdLookupResult { + Found(u64), + Closest(u64), + NotFound, } fn build_block_unchecked(buf: &mut B, slices: &[&[u8]]) { @@ -500,16 +555,25 @@ fn build_block_unchecked(buf: &mut B, slices: &[&[u8]]) { } } +pub fn block_head(mut block: Bytes) -> Result { + let (size, _) = vbyte::decode_buf(&mut block)?; + Ok(block.split_to(size as usize)) +} + #[cfg(test)] mod tests { use super::*; use bytes::Buf; - fn build_incomplete_block(strings: &[&[u8]]) -> TfcBlock { + fn build_block_bytes(strings: &[&[u8]]) -> Bytes { let mut buf = BytesMut::new(); 
build_block_unchecked(&mut buf, &strings); - let mut bytes: Bytes = buf.freeze(); + buf.freeze() + } + + fn build_block(strings: &[&[u8]]) -> TfcBlock { + let mut bytes = build_block_bytes(strings); TfcBlock::parse(&mut bytes).unwrap() } @@ -518,7 +582,7 @@ mod tests { fn build_and_parse_block() { let strings: [&[u8]; 5] = [b"aaaaaa", b"aabb", b"cccc", b"cdef", b"cdff"]; - let block = build_incomplete_block(&strings); + let block = build_block(&strings); let expected_header = TfcBlockHeader { head: Bytes::copy_from_slice(b"aaaaaa"), @@ -537,7 +601,7 @@ mod tests { #[test] fn entry_in_block() { let strings: [&[u8]; 5] = [b"aaaaaa", b"aabb", b"cccc", b"cdef", b"cdff"]; - let block = build_incomplete_block(&strings); + let block = build_block(&strings); for (ix, string) in strings.iter().enumerate() { assert_eq!(*string, &block.entry(ix).to_vec()[..]); @@ -556,7 +620,7 @@ mod tests { b"cdffeeee", b"ceeeeeeeeeeeeeee", ]; - let block = build_incomplete_block(&strings); + let block = build_block(&strings); for (ix, string) in strings.iter().enumerate() { assert_eq!(*string, &block.entry(ix).to_vec()[..]); @@ -575,7 +639,7 @@ mod tests { b"cdffeeee", b"ceeeeeeeeeeeeeee", ]; - let block = build_incomplete_block(&strings); + let block = build_block(&strings); for (ix, string) in strings.iter().enumerate() { let entry = block.entry(ix); @@ -598,7 +662,7 @@ mod tests { b"cdffeeee", b"ceeeeeeeeeeeeeee", ]; - let block = build_incomplete_block(&strings); + let block = build_block(&strings); for (ix, string) in strings.iter().enumerate() { let mut buf = block.entry(ix).into_buf(); @@ -607,4 +671,77 @@ mod tests { assert_eq!(*string, &bytes[..]); } } + + #[test] + fn head_from_complete_block() { + let strings: [&[u8]; 8] = [ + b"aaaaaa", + b"aabb", + b"cccc", + b"cdef", + b"cdff", + b"cdffasdf", + b"cdffeeee", + b"ceeeeeeeeeeeeeee", + ]; + let block = build_block_bytes(&strings); + let head = block_head(block).unwrap(); + + assert_eq!(b"aaaaaa", &head[..]); + } + + #[test] + fn slices_iter() { + let strings: [&[u8]; 8] = [ + b"aaaaaa", + b"aabb", + b"cccc", + b"cdef", + b"cdff", + b"cdffasdf", + b"cdffeeee", + b"ceeeeeeeeeeeeeee", + ]; + let block = build_block(&strings); + + let expected_slices: Vec<&[u8]> = vec![ + b"aaaaaa", + b"bb", + b"cccc", + b"def", + b"ff", + b"asdf", + b"eeee", + b"eeeeeeeeeeeeeee", + ]; + + let expected_bytes: Vec<_> = expected_slices + .into_iter() + .map(|b| Bytes::from(b)) + .collect(); + + let actual: Vec<_> = block.suffixes().collect(); + + assert_eq!(expected_bytes, actual); + } + + #[test] + fn block_id_lookup() { + let strings: [&[u8]; 8] = [ + b"aaaaaa", + b"aabb", + b"cccc", + b"cdef", + b"cdff", + b"cdffasdf", + b"cdffeeee", + b"ceeeeeeeeeeeeeee", + ]; + let block = build_block(&strings); + + for (ix, string) in strings.iter().enumerate() { + let index = block.id(string); + assert_eq!(IdLookupResult::Found(ix as u64), index); + } + } } diff --git a/src/structure/util.rs b/src/structure/util.rs index 2b674275..6bb0f721 100644 --- a/src/structure/util.rs +++ b/src/structure/util.rs @@ -1,6 +1,7 @@ use futures::io::Result; use futures::stream::{Peekable, Stream, StreamExt}; use futures::task::{Context, Poll}; +use std::cmp::Ordering; use std::marker::Unpin; use std::pin::Pin; use tokio::io::{AsyncWrite, AsyncWriteExt}; @@ -18,6 +19,20 @@ pub fn find_common_prefix(b1: &[u8], b2: &[u8]) -> usize { common } +pub fn find_common_prefix_ord(b1: &[u8], b2: &[u8]) -> (usize, Ordering) { + let common_prefix = find_common_prefix(b1, b2); + + if common_prefix == b1.len() && 
b1.len() == b2.len() { + (common_prefix, Ordering::Equal) + } else if b1.len() == common_prefix { + (common_prefix, Ordering::Less) + } else if b2.len() == common_prefix { + (common_prefix, Ordering::Greater) + } else { + (common_prefix, b1[common_prefix].cmp(&b2[common_prefix])) + } +} + pub async fn write_nul_terminated_bytes( w: &mut W, bytes: &[u8], From 1c8c4a2f78fa35456ae2749412e4dfbd8e25646f Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Thu, 24 Nov 2022 12:18:33 +0100 Subject: [PATCH 09/99] test for close matches in tfc --- src/structure/tfc.rs | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/src/structure/tfc.rs b/src/structure/tfc.rs index 647edc95..28073028 100644 --- a/src/structure/tfc.rs +++ b/src/structure/tfc.rs @@ -744,4 +744,32 @@ mod tests { assert_eq!(IdLookupResult::Found(ix as u64), index); } } + + #[test] + fn block_id_lookup_nonmatches() { + let strings: [&[u8]; 8] = [ + b"aaaaaa", + b"aabb", + b"cccc", + b"cdef", + b"cdff", + b"cdffasdf", + b"cdffeeee", + b"ceeeeeeeeeeeeeee", + ]; + let block = build_block(&strings); + + assert_eq!(IdLookupResult::NotFound, + block.id(b"aa")); + + assert_eq!(IdLookupResult::Closest(0), + block.id(b"aaab")); + + assert_eq!(IdLookupResult::Closest(1), + block.id(b"aabba")); + + assert_eq!(IdLookupResult::Closest(7), + block.id(b"f")); + + } } From 080764339e604654f1e944f924d42834fd8a5623 Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Thu, 24 Nov 2022 14:05:33 +0100 Subject: [PATCH 10/99] implement tfcdict --- Cargo.toml | 1 + src/structure/logarray.rs | 100 ++++++++++++++++++- src/structure/{tfc.rs => tfc/block.rs} | 31 ++++-- src/structure/tfc/dict.rs | 131 +++++++++++++++++++++++++ src/structure/tfc/mod.rs | 2 + 5 files changed, 254 insertions(+), 11 deletions(-) rename src/structure/{tfc.rs => tfc/block.rs} (97%) create mode 100644 src/structure/tfc/dict.rs create mode 100644 src/structure/tfc/mod.rs diff --git a/Cargo.toml b/Cargo.toml index 75c68d60..dd626ad9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -25,6 +25,7 @@ flate2 = "1.0" rayon = "1.4" thiserror = "1.0" async-trait = "0.1" +itertools = "0.10" [dev-dependencies] tempfile = "3.1" diff --git a/src/structure/logarray.rs b/src/structure/logarray.rs index 2bd6b627..48d8a15e 100644 --- a/src/structure/logarray.rs +++ b/src/structure/logarray.rs @@ -52,7 +52,7 @@ use super::util; use crate::storage::*; use byteorder::{BigEndian, ByteOrder}; -use bytes::{Bytes, BytesMut}; +use bytes::{Bytes, BytesMut, BufMut}; use futures::stream::{Stream, StreamExt}; use std::{cmp::Ordering, convert::TryFrom, error, fmt, io}; use tokio::io::{AsyncReadExt, AsyncWriteExt}; @@ -311,6 +311,104 @@ impl LogArray { } } +/// write a logarray directly to an AsyncWrite +pub struct LogArrayBufBuilder<'a, B: BufMut> { + /// Destination of the log array data + buf: &'a mut B, + /// Bit width of an element + width: u8, + /// Storage for the next word to be written to the buffer + current: u64, + /// Bit offset in `current` for the msb of the next encoded element + offset: u8, + /// Number of elements written to the buffer + count: u32, +} + +impl<'a, B: BufMut> LogArrayBufBuilder<'a, B> { + pub fn new(buf: &'a mut B, width: u8) -> Self { + Self { + buf, + width, + // Zero is needed for bitwise OR-ing new values. + current: 0, + // Start at the beginning of `current`. + offset: 0, + // No elements have been written. 
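+            // (finalize() later writes this count into the trailing control word)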
+ count: 0, + } + } + + pub fn count(&self) -> u32 { + self.count + } + + pub fn push(&mut self, val: u64) { + // This is the minimum number of leading zeros that a decoded value should have. + let leading_zeros = 64 - self.width; + + // If `val` does not fit in the `width`, return an error. + if val.leading_zeros() < u32::from(leading_zeros) { + panic!("expected value ({}) to fit in {} bits", val, self.width); + } + + // Otherwise, push `val` onto the log array. + // Advance the element count since we know we're going to write `val`. + self.count += 1; + + // Write the first part of `val` to `current`, putting the msb of `val` at the `offset` + // bit. This may be either the upper bits of `val` only or all of it. We check later. + self.current |= val << leading_zeros >> self.offset; + + // Increment `offset` past `val`. + self.offset += self.width; + + // Check if the new `offset` is larger than 64. + if self.offset >= 64 { + // We have filled `current`, so write it to the destination. + //util::write_u64(&mut self.file, self.current).await?; + self.buf.put_u64(self.current); + // Wrap the offset with the word size. + self.offset -= 64; + + // Initialize the new `current`. + self.current = if self.offset == 0 { + // Zero is needed for bitwise OR-ing new values. + 0 + } else { + // This is the second part of `val`: the lower bits. + val << 64 - self.offset + }; + } + } + + pub fn push_vec(&mut self, vals: Vec) { + for val in vals { + self.push(val); + } + } + + fn finalize_data(&mut self) { + if u64::from(self.count) * u64::from(self.width) & 0b11_1111 != 0 { + self.buf.put_u64(self.current); + } + } + + pub fn finalize(mut self) { + let len = self.count; + let width = self.width; + + // Write the final data word. + self.finalize_data(); + + // Write the control word. 
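+        // (layout: 4-byte big-endian element count, 1-byte bit width, 3 zero bytes)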
+ let mut buf = [0; 8]; + BigEndian::write_u32(&mut buf, len); + buf[4] = width; + self.buf.put_slice(&buf); + } +} + /// write a logarray directly to an AsyncWrite pub struct LogArrayFileBuilder { /// Destination of the log array data diff --git a/src/structure/tfc.rs b/src/structure/tfc/block.rs similarity index 97% rename from src/structure/tfc.rs rename to src/structure/tfc/block.rs index 28073028..77d51b0e 100644 --- a/src/structure/tfc.rs +++ b/src/structure/tfc/block.rs @@ -4,13 +4,11 @@ use std::hash::{Hash, Hasher}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use crate::structure::{ - util::find_common_prefix, + util::{find_common_prefix, find_common_prefix_ord}, vbyte::{self, encode_array}, }; -use super::util::find_common_prefix_ord; - -const BLOCK_SIZE: usize = 8; +pub const BLOCK_SIZE: usize = 8; #[derive(Debug)] pub enum TfcError { @@ -81,7 +79,7 @@ impl TfcDictEntry { entry } - fn to_bytes(&self) -> Bytes { + pub fn to_bytes(&self) -> Bytes { if self.0.len() == 1 { self.0[0].clone() } else { @@ -93,7 +91,7 @@ impl TfcDictEntry { buf.freeze() } } - fn to_vec(&self) -> Vec { + pub fn to_vec(&self) -> Vec { let mut v = Vec::with_capacity(self.len()); for slice in self.0.iter() { @@ -103,7 +101,7 @@ impl TfcDictEntry { v } - fn as_buf(&self) -> TfcEntryBuf { + pub fn as_buf(&self) -> TfcEntryBuf { TfcEntryBuf { entry: self, slice_ix: 0, @@ -111,7 +109,7 @@ impl TfcDictEntry { } } - fn into_buf(self) -> OwnedTfcEntryBuf { + pub fn into_buf(self) -> OwnedTfcEntryBuf { OwnedTfcEntryBuf { entry: self, slice_ix: 0, @@ -119,7 +117,7 @@ impl TfcDictEntry { } } - fn len(&self) -> usize { + pub fn len(&self) -> usize { self.0.iter().map(|s| s.len()).sum() } @@ -391,6 +389,10 @@ impl TfcBlock { Ok(Self { header, data }) } + pub fn num_entries(&self) -> u8 { + self.header.num_entries + } + pub fn is_incomplete(&self) -> bool { self.header.num_entries != BLOCK_SIZE as u8 } @@ -519,7 +521,8 @@ pub enum IdLookupResult { NotFound, } -fn build_block_unchecked(buf: &mut B, slices: &[&[u8]]) { +pub(crate) fn build_block_unchecked(buf: &mut B, slices: &[&[u8]]) -> usize { + let mut size = 0; let slices_len = slices.len(); debug_assert!(slices_len <= BLOCK_SIZE && slices_len != 0); @@ -529,8 +532,10 @@ fn build_block_unchecked(buf: &mut B, slices: &[&[u8]]) { // write the head first buf.put_slice(&vbyte[..vbyte_len]); buf.put_slice(slices[0]); + size += vbyte_len + slices[0].len(); buf.put_u8(slices_len as u8); + size += 1; let mut last = first; @@ -541,10 +546,13 @@ fn build_block_unchecked(buf: &mut B, slices: &[&[u8]]) { let common_prefix = find_common_prefix(last, cur); let (vbyte, vbyte_len) = encode_array(common_prefix as u64); buf.put_slice(&vbyte[..vbyte_len]); + size += vbyte_len; let suffix_len = cur.len() - common_prefix; let (vbyte, vbyte_len) = encode_array(suffix_len as u64); buf.put_slice(&vbyte[..vbyte_len]); + size += vbyte_len; + suffixes.push(&cur[common_prefix..]); last = cur; } @@ -552,7 +560,10 @@ fn build_block_unchecked(buf: &mut B, slices: &[&[u8]]) { // write the rest of the slices for suffix in suffixes.into_iter().skip(1) { buf.put_slice(suffix); + size += suffix.len(); } + + size } pub fn block_head(mut block: Bytes) -> Result { diff --git a/src/structure/tfc/dict.rs b/src/structure/tfc/dict.rs new file mode 100644 index 00000000..1936a3e8 --- /dev/null +++ b/src/structure/tfc/dict.rs @@ -0,0 +1,131 @@ +use itertools::Itertools; +use bytes::{BufMut, Bytes}; +use crate::structure::{util::calculate_width, LogArrayBufBuilder, LogArray}; + +use super::block::*; + +fn 
build_dict_unchecked<'a, B1:BufMut, B2:BufMut,I:Iterator>(array_buf: &mut B1, data_buf: &mut B2, iter: I) { + let chunk_iter = iter.chunks(BLOCK_SIZE); + let mut offsets = Vec::new(); + + let mut offset = 0; + for chunk in &chunk_iter { + let slices: Vec<&[u8]> = chunk.collect(); + let size = build_block_unchecked(data_buf, &slices); + offset += size; + offsets.push(offset as u64); + } + + offsets.pop(); + + let largest_element = offsets.last().cloned().unwrap_or(0); + let width = calculate_width(largest_element); + let mut array_builder = LogArrayBufBuilder::new(array_buf, width); + + array_builder.push_vec(offsets); + array_builder.finalize(); +} + +pub struct TfcDict { + offsets: LogArray, + data: Bytes +} + +impl TfcDict { + pub fn from_parts(offsets: Bytes, data: Bytes) -> Self { + let offsets = LogArray::parse(offsets).unwrap(); + Self { + offsets, data + } + } + + pub fn block_bytes(&self, block_index:usize) -> Bytes { + let offset: usize; + if block_index == 0 { + offset = 0; + } + else { + offset = self.offsets.entry(block_index-1) as usize; + } + + let block_bytes; + if block_index == self.offsets.len() { + block_bytes = self.data.slice(offset..); + } + else { + let end = self.offsets.entry(block_index) as usize; + block_bytes = self.data.slice(offset..end); + } + + block_bytes + } + + pub fn block(&self, block_index: usize) -> TfcBlock { + let mut block_bytes = self.block_bytes(block_index); + TfcBlock::parse(&mut block_bytes).unwrap() + } + + pub fn block_head(&self, block_index: usize) -> Bytes { + let block_bytes = self.block_bytes(block_index); + block_head(block_bytes).unwrap() + } + + pub fn num_blocks(&self) -> usize { + self.offsets.len() + 1 + } + + pub fn entry(&self, index: u64) -> TfcDictEntry { + let block = self.block((index / 8) as usize); + block.entry((index % 8) as usize) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use bytes::BytesMut; + + #[test] + fn build_dict_of_two_blocks() { + let strings: Vec<&[u8]> = vec![ + b"aaaaaaaa", + b"bbbbbbbb", + b"bbbcccdaaaa", + b"f", + b"fafasdfas", + b"gafovp", + b"gdfasfa", + b"gdfbbbbbb", + + b"hello", + b"iguana", + b"illusion", + b"illustrated", + b"jetengine", + b"jetplane", + + ]; + + let mut array_buf = BytesMut::new(); + let mut data_buf = BytesMut::new(); + build_dict_unchecked(&mut array_buf, &mut data_buf, strings.clone().into_iter()); + + let array_bytes = array_buf.freeze(); + let data_bytes = data_buf.freeze(); + let dict =TfcDict::from_parts(array_bytes, data_bytes); + + assert_eq!(2, dict.num_blocks()); + assert_eq!(b"aaaaaaaa", &dict.block_head(0)[..]); + assert_eq!(b"hello", &dict.block_head(1)[..]); + + let block0 = dict.block(0); + let block1 = dict.block(1); + + assert_eq!(8, block0.num_entries()); + assert_eq!(6, block1.num_entries()); + + for (ix, s) in strings.into_iter().enumerate() { + assert_eq!(s, &dict.entry(ix as u64).to_bytes()[..]); + } + } +} diff --git a/src/structure/tfc/mod.rs b/src/structure/tfc/mod.rs new file mode 100644 index 00000000..01c22bfa --- /dev/null +++ b/src/structure/tfc/mod.rs @@ -0,0 +1,2 @@ +pub mod block; +pub mod dict; From dfa4834157f2c615c6c8a63e7ef2f4bcc46f7a9b Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Thu, 24 Nov 2022 14:44:26 +0100 Subject: [PATCH 11/99] look up id of entry in tfcdict --- src/structure/logarray.rs | 2 +- src/structure/tfc/block.rs | 38 +++++++--- src/structure/tfc/dict.rs | 140 ++++++++++++++++++++++++++++++++----- src/structure/vbyte.rs | 2 - 4 files changed, 153 insertions(+), 29 deletions(-) diff --git 
a/src/structure/logarray.rs b/src/structure/logarray.rs index 48d8a15e..629dafa7 100644 --- a/src/structure/logarray.rs +++ b/src/structure/logarray.rs @@ -52,7 +52,7 @@ use super::util; use crate::storage::*; use byteorder::{BigEndian, ByteOrder}; -use bytes::{Bytes, BytesMut, BufMut}; +use bytes::{BufMut, Bytes, BytesMut}; use futures::stream::{Stream, StreamExt}; use std::{cmp::Ordering, convert::TryFrom, error, fmt, io}; use tokio::io::{AsyncReadExt, AsyncWriteExt}; diff --git a/src/structure/tfc/block.rs b/src/structure/tfc/block.rs index 77d51b0e..4b5fc07b 100644 --- a/src/structure/tfc/block.rs +++ b/src/structure/tfc/block.rs @@ -521,6 +521,29 @@ pub enum IdLookupResult { NotFound, } +impl IdLookupResult { + pub fn offset(self, offset: u64) -> Self { + match self { + Self::Found(i) => Self::Found(i + offset), + Self::Closest(i) => Self::Closest(i + offset), + Self::NotFound => Self::NotFound, + } + } + + pub fn default(self, previous_block_num: usize, mut previous_block: Bytes) -> Self { + match self { + Self::NotFound => { + // we should move num elements to start of block so we don't hae to parse a full header + let previous_header = TfcBlockHeader::parse(&mut previous_block).unwrap(); + let id = previous_header.num_entries as u64 - 1 + previous_block_num as u64; + + Self::Closest(id) + } + _ => self, + } + } +} + pub(crate) fn build_block_unchecked(buf: &mut B, slices: &[&[u8]]) -> usize { let mut size = 0; let slices_len = slices.len(); @@ -552,7 +575,7 @@ pub(crate) fn build_block_unchecked(buf: &mut B, slices: &[&[u8]]) -> let (vbyte, vbyte_len) = encode_array(suffix_len as u64); buf.put_slice(&vbyte[..vbyte_len]); size += vbyte_len; - + suffixes.push(&cur[common_prefix..]); last = cur; } @@ -770,17 +793,12 @@ mod tests { ]; let block = build_block(&strings); - assert_eq!(IdLookupResult::NotFound, - block.id(b"aa")); - - assert_eq!(IdLookupResult::Closest(0), - block.id(b"aaab")); + assert_eq!(IdLookupResult::NotFound, block.id(b"aa")); - assert_eq!(IdLookupResult::Closest(1), - block.id(b"aabba")); + assert_eq!(IdLookupResult::Closest(0), block.id(b"aaab")); - assert_eq!(IdLookupResult::Closest(7), - block.id(b"f")); + assert_eq!(IdLookupResult::Closest(1), block.id(b"aabba")); + assert_eq!(IdLookupResult::Closest(7), block.id(b"f")); } } diff --git a/src/structure/tfc/dict.rs b/src/structure/tfc/dict.rs index 1936a3e8..53d7cef0 100644 --- a/src/structure/tfc/dict.rs +++ b/src/structure/tfc/dict.rs @@ -1,10 +1,16 @@ -use itertools::Itertools; +use std::cmp::Ordering; + +use crate::structure::{util::calculate_width, LogArray, LogArrayBufBuilder}; use bytes::{BufMut, Bytes}; -use crate::structure::{util::calculate_width, LogArrayBufBuilder, LogArray}; +use itertools::Itertools; use super::block::*; -fn build_dict_unchecked<'a, B1:BufMut, B2:BufMut,I:Iterator>(array_buf: &mut B1, data_buf: &mut B2, iter: I) { +fn build_dict_unchecked<'a, B1: BufMut, B2: BufMut, I: Iterator>( + array_buf: &mut B1, + data_buf: &mut B2, + iter: I, +) { let chunk_iter = iter.chunks(BLOCK_SIZE); let mut offsets = Vec::new(); @@ -28,31 +34,27 @@ fn build_dict_unchecked<'a, B1:BufMut, B2:BufMut,I:Iterator>(arra pub struct TfcDict { offsets: LogArray, - data: Bytes + data: Bytes, } impl TfcDict { pub fn from_parts(offsets: Bytes, data: Bytes) -> Self { let offsets = LogArray::parse(offsets).unwrap(); - Self { - offsets, data - } + Self { offsets, data } } - pub fn block_bytes(&self, block_index:usize) -> Bytes { + pub fn block_bytes(&self, block_index: usize) -> Bytes { let offset: usize; if 
block_index == 0 { offset = 0; - } - else { - offset = self.offsets.entry(block_index-1) as usize; + } else { + offset = self.offsets.entry(block_index - 1) as usize; } let block_bytes; if block_index == self.offsets.len() { block_bytes = self.data.slice(offset..); - } - else { + } else { let end = self.offsets.entry(block_index) as usize; block_bytes = self.data.slice(offset..end); } @@ -78,6 +80,48 @@ impl TfcDict { let block = self.block((index / 8) as usize); block.entry((index % 8) as usize) } + + pub fn id(&self, slice: &[u8]) -> IdLookupResult { + // let's binary search + let mut min = 0; + let mut max = self.offsets.len(); + let mut mid: usize; + + while min <= max { + mid = (min + max) / 2; + + let head_slice = self.block_head(mid); + + match slice.cmp(&head_slice[..]) { + Ordering::Less => { + if mid == 0 { + // we checked the first block and determined that the string should be in the previous block, if it exists. + // but since this is the first block, the string doesn't exist. + return IdLookupResult::NotFound; + } + max = mid - 1; + } + Ordering::Greater => min = mid + 1, + Ordering::Equal => return IdLookupResult::Found((mid * BLOCK_SIZE) as u64), // what luck! turns out the string we were looking for was the block head + } + } + + let found = max; + + // we found the block the string should be part of. + let block = self.block(found); + let block_id = block.id(slice); + let result = block_id.offset((found * BLOCK_SIZE) as u64); + if found != 0 { + // the default value will fill in the last index of the + // previous block if the entry was not found in the + // current block. This is only possible if the block as + // not the very first one. + result.default(found - 1, self.block_bytes(found - 1)) + } else { + result + } + } } #[cfg(test)] @@ -96,14 +140,12 @@ mod tests { b"gafovp", b"gdfasfa", b"gdfbbbbbb", - b"hello", b"iguana", b"illusion", b"illustrated", b"jetengine", b"jetplane", - ]; let mut array_buf = BytesMut::new(); @@ -112,7 +154,7 @@ mod tests { let array_bytes = array_buf.freeze(); let data_bytes = data_buf.freeze(); - let dict =TfcDict::from_parts(array_bytes, data_bytes); + let dict = TfcDict::from_parts(array_bytes, data_bytes); assert_eq!(2, dict.num_blocks()); assert_eq!(b"aaaaaaaa", &dict.block_head(0)[..]); @@ -128,4 +170,70 @@ mod tests { assert_eq!(s, &dict.entry(ix as u64).to_bytes()[..]); } } + + #[test] + fn lookup_entries_by_slice() { + let strings: Vec<&[u8]> = vec![ + b"aaaaaaaa", + b"bbbbbbbb", + b"bbbcccdaaaa", + b"f", + b"fafasdfas", + b"gafovp", + b"gdfasfa", + b"gdfbbbbbb", + b"hello", + b"iguana", + b"illusion", + b"illustrated", + b"jetengine", + b"jetplane", + ]; + + let mut array_buf = BytesMut::new(); + let mut data_buf = BytesMut::new(); + build_dict_unchecked(&mut array_buf, &mut data_buf, strings.clone().into_iter()); + + let array_bytes = array_buf.freeze(); + let data_bytes = data_buf.freeze(); + let dict = TfcDict::from_parts(array_bytes, data_bytes); + + for (ix, s) in strings.into_iter().enumerate() { + assert_eq!(IdLookupResult::Found(ix as u64), dict.id(s)); + } + } + + #[test] + fn lookup_nonmatching_entries_by_slice() { + let strings: Vec<&[u8]> = vec![ + b"aaaaaaaa", + b"bbbbbbbb", + b"bbbcccdaaaa", + b"f", + b"fafasdfas", + b"gafovp", + b"gdfasfa", + b"gdfbbbbbb", + b"hello", + b"iguana", + b"illusion", + b"illustrated", + b"jetengine", + b"jetplane", + ]; + + let mut array_buf = BytesMut::new(); + let mut data_buf = BytesMut::new(); + build_dict_unchecked(&mut array_buf, &mut data_buf, strings.clone().into_iter()); + + 
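// freeze the builders into immutable Bytes so the dict can parse them +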
let array_bytes = array_buf.freeze(); + let data_bytes = data_buf.freeze(); + let dict = TfcDict::from_parts(array_bytes, data_bytes); + + assert_eq!(IdLookupResult::NotFound, dict.id(b"a")); + assert_eq!(IdLookupResult::Closest(0), dict.id(b"ab")); + assert_eq!(IdLookupResult::Closest(7), dict.id(b"hallo")); + assert_eq!(IdLookupResult::Closest(8), dict.id(b"hello!")); + assert_eq!(IdLookupResult::Closest(13), dict.id(b"zebra")); + } } diff --git a/src/structure/vbyte.rs b/src/structure/vbyte.rs index 9ad5bce5..a2396d8d 100644 --- a/src/structure/vbyte.rs +++ b/src/structure/vbyte.rs @@ -17,8 +17,6 @@ use futures::io; use tokio::io::{AsyncWrite, AsyncWriteExt}; -use std::io::Write; - use bytes::Buf; /// The maximum number of bytes required for any `u64` in a variable-byte encoding. From de19515c3f65e037e0fe999b6f63a8383a665103 Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Thu, 24 Nov 2022 15:05:58 +0100 Subject: [PATCH 12/99] move block size to start for easier search --- src/structure/tfc/block.rs | 22 +++++++++------------- src/structure/tfc/dict.rs | 18 +++++++++++++++--- 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/src/structure/tfc/block.rs b/src/structure/tfc/block.rs index 4b5fc07b..6f1dd5d2 100644 --- a/src/structure/tfc/block.rs +++ b/src/structure/tfc/block.rs @@ -36,14 +36,14 @@ impl From for TfcError { impl TfcBlockHeader { fn parse(buf: &mut Bytes) -> Result { + let num_entries = buf.get_u8(); + let mut sizes = [0_usize; BLOCK_SIZE - 1]; let mut shareds = [0_usize; BLOCK_SIZE - 1]; let (first_size, _) = vbyte::decode_buf(buf)?; let head = buf.split_to(first_size as usize); - let num_entries = buf.get_u8(); - for i in 0..(num_entries - 1) as usize { let (shared, _) = vbyte::decode_buf(buf)?; let (size, _) = vbyte::decode_buf(buf)?; @@ -530,14 +530,10 @@ impl IdLookupResult { } } - pub fn default(self, previous_block_num: usize, mut previous_block: Bytes) -> Self { + pub fn default(self, default: u64) -> Self { match self { Self::NotFound => { - // we should move num elements to start of block so we don't hae to parse a full header - let previous_header = TfcBlockHeader::parse(&mut previous_block).unwrap(); - let id = previous_header.num_entries as u64 - 1 + previous_block_num as u64; - - Self::Closest(id) + Self::Closest(default) } _ => self, } @@ -549,6 +545,9 @@ pub(crate) fn build_block_unchecked(buf: &mut B, slices: &[&[u8]]) -> let slices_len = slices.len(); debug_assert!(slices_len <= BLOCK_SIZE && slices_len != 0); + buf.put_u8(slices_len as u8); + size += 1; + let first = slices[0]; let (vbyte, vbyte_len) = encode_array(first.len() as u64); @@ -557,13 +556,9 @@ pub(crate) fn build_block_unchecked(buf: &mut B, slices: &[&[u8]]) -> buf.put_slice(slices[0]); size += vbyte_len + slices[0].len(); - buf.put_u8(slices_len as u8); - size += 1; - let mut last = first; let mut suffixes: Vec<&[u8]> = Vec::with_capacity(slices.len()); - suffixes.push(last); for i in 1..slices.len() { let cur = slices[i]; let common_prefix = find_common_prefix(last, cur); @@ -581,7 +576,7 @@ pub(crate) fn build_block_unchecked(buf: &mut B, slices: &[&[u8]]) -> } // write the rest of the slices - for suffix in suffixes.into_iter().skip(1) { + for suffix in suffixes.into_iter() { buf.put_slice(suffix); size += suffix.len(); } @@ -590,6 +585,7 @@ pub(crate) fn build_block_unchecked(buf: &mut B, slices: &[&[u8]]) -> } pub fn block_head(mut block: Bytes) -> Result { + block.advance(1); let (size, _) = vbyte::decode_buf(&mut block)?; Ok(block.split_to(size as usize)) } 
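To make the layout this patch settles on concrete: a block now starts with its entry count, then the vbyte-encoded length of the head entry, the head itself, one (shared, suffix-length) vbyte pair per remaining entry, and finally the concatenated suffixes. The sketch below is illustrative only and is not part of any patch; it assumes the vbyte coding from src/structure/vbyte.rs, where a value under 128 is a single byte with its most significant bit set.

    fn main() {
        // Two entries, "hello" and "help", sharing the 3-byte prefix "hel".
        let block: Vec<u8> = vec![
            2,           // entry count, now the very first byte of the block
            0b1000_0101, // vbyte(5): length of the head entry "hello"
            b'h', b'e', b'l', b'l', b'o', // the head entry, stored in full
            0b1000_0011, // vbyte(3): bytes shared with the previous entry
            0b1000_0001, // vbyte(1): length of this entry's suffix
            b'p',        // the suffix; "hel" + "p" reconstructs "help"
        ];

        // block_head above skips a single byte (the count) to reach the head,
        // and reading a block's entry count is now a one-byte fetch.
        let head_len = (block[1] & 0x7f) as usize;
        assert_eq!(b"hello", &block[2..2 + head_len]);

        // The second entry is rebuilt from the shared prefix plus its suffix.
        let mut entry = block[2..2 + 3].to_vec();
        entry.push(block[block.len() - 1]);
        assert_eq!(b"help".to_vec(), entry);
    }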
diff --git a/src/structure/tfc/dict.rs b/src/structure/tfc/dict.rs index 53d7cef0..0221095f 100644 --- a/src/structure/tfc/dict.rs +++ b/src/structure/tfc/dict.rs @@ -43,7 +43,7 @@ impl TfcDict { Self { offsets, data } } - pub fn block_bytes(&self, block_index: usize) -> Bytes { + fn block_offset(&self, block_index: usize) -> usize { let offset: usize; if block_index == 0 { offset = 0; @@ -51,6 +51,11 @@ impl TfcDict { offset = self.offsets.entry(block_index - 1) as usize; } + offset + } + + pub fn block_bytes(&self, block_index: usize) -> Bytes { + let offset = self.block_offset(block_index); let block_bytes; if block_index == self.offsets.len() { block_bytes = self.data.slice(offset..); @@ -72,6 +77,12 @@ impl TfcDict { block_head(block_bytes).unwrap() } + pub fn block_num_elements(&self, block_index: usize) -> u8 { + let offset = self.block_offset(block_index); + + self.data[offset] + } + pub fn num_blocks(&self) -> usize { self.offsets.len() + 1 } @@ -111,13 +122,14 @@ impl TfcDict { // we found the block the string should be part of. let block = self.block(found); let block_id = block.id(slice); - let result = block_id.offset((found * BLOCK_SIZE) as u64); + let offset = (found * BLOCK_SIZE) as u64; + let result = block_id.offset(offset); if found != 0 { // the default value will fill in the last index of the // previous block if the entry was not found in the // current block. This is only possible if the block as // not the very first one. - result.default(found - 1, self.block_bytes(found - 1)) + result.default(self.block_num_elements(found-1) as u64 + offset - 1) } else { result } From 07fe95ea380e60762282efd89c88c22e7a41360e Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Thu, 24 Nov 2022 15:37:39 +0100 Subject: [PATCH 13/99] renamed TfcDict and related types to SizedDict and related --- src/structure/tfc/block.rs | 76 +++++++++++++++++++------------------- src/structure/tfc/dict.rs | 16 ++++---- 2 files changed, 46 insertions(+), 46 deletions(-) diff --git a/src/structure/tfc/block.rs b/src/structure/tfc/block.rs index 6f1dd5d2..10adee23 100644 --- a/src/structure/tfc/block.rs +++ b/src/structure/tfc/block.rs @@ -11,13 +11,13 @@ use crate::structure::{ pub const BLOCK_SIZE: usize = 8; #[derive(Debug)] -pub enum TfcError { +pub enum SizedDictError { InvalidCoding, NotEnoughData, } #[derive(Debug, PartialEq)] -pub struct TfcBlockHeader { +pub struct SizedBlockHeader { head: Bytes, num_entries: u8, buffer_length: usize, @@ -25,7 +25,7 @@ pub struct TfcBlockHeader { shareds: [usize; BLOCK_SIZE - 1], } -impl From for TfcError { +impl From for SizedDictError { fn from(e: vbyte::DecodeError) -> Self { match e { vbyte::DecodeError::UnexpectedEndOfBuffer => Self::NotEnoughData, @@ -34,8 +34,8 @@ impl From for TfcError { } } -impl TfcBlockHeader { - fn parse(buf: &mut Bytes) -> Result { +impl SizedBlockHeader { + fn parse(buf: &mut Bytes) -> Result { let num_entries = buf.get_u8(); let mut sizes = [0_usize; BLOCK_SIZE - 1]; @@ -65,9 +65,9 @@ impl TfcBlockHeader { } #[derive(Clone, Debug)] -pub struct TfcDictEntry(Vec); +pub struct SizedDictEntry(Vec); -impl TfcDictEntry { +impl SizedDictEntry { pub fn new(parts: Vec) -> Self { Self(parts) } @@ -101,16 +101,16 @@ impl TfcDictEntry { v } - pub fn as_buf(&self) -> TfcEntryBuf { - TfcEntryBuf { + pub fn as_buf(&self) -> SizedDictEntryBuf { + SizedDictEntryBuf { entry: self, slice_ix: 0, pos_in_slice: 0, } } - pub fn into_buf(self) -> OwnedTfcEntryBuf { - OwnedTfcEntryBuf { + pub fn into_buf(self) -> OwnedSizedDictEntryBuf { + 
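// like as_buf, but consuming the entry so the returned reader owns its data +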
OwnedSizedDictEntryBuf { entry: self, slice_ix: 0, pos_in_slice: 0, @@ -176,7 +176,7 @@ impl TfcDictEntry { } } -impl PartialEq for TfcDictEntry { +impl PartialEq for SizedDictEntry { fn eq(&self, other: &Self) -> bool { // unequal length, so can't be equal if self.len() != other.len() { @@ -187,9 +187,9 @@ impl PartialEq for TfcDictEntry { } } -impl Eq for TfcDictEntry {} +impl Eq for SizedDictEntry {} -impl Hash for TfcDictEntry { +impl Hash for SizedDictEntry { fn hash(&self, state: &mut H) { for part in self.0.iter() { state.write(part); @@ -197,7 +197,7 @@ impl Hash for TfcDictEntry { } } -impl Ord for TfcDictEntry { +impl Ord for SizedDictEntry { fn cmp(&self, other: &Self) -> Ordering { // both are empty, so equal if self.len() == 0 && other.len() == 0 { @@ -270,25 +270,25 @@ impl Ord for TfcDictEntry { } } -impl PartialOrd for TfcDictEntry { +impl PartialOrd for SizedDictEntry { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } #[derive(Clone)] -pub struct TfcEntryBuf<'a> { - entry: &'a TfcDictEntry, +pub struct SizedDictEntryBuf<'a> { + entry: &'a SizedDictEntry, slice_ix: usize, pos_in_slice: usize, } -fn calculate_remaining<'a>(entry: &TfcDictEntry, slice_ix: usize, pos_in_slice: usize) -> usize { +fn calculate_remaining<'a>(entry: &SizedDictEntry, slice_ix: usize, pos_in_slice: usize) -> usize { let total: usize = entry.0.iter().skip(slice_ix).map(|s| s.len()).sum(); total - pos_in_slice } -fn calculate_chunk<'a>(entry: &'a TfcDictEntry, slice_ix: usize, pos_in_slice: usize) -> &[u8] { +fn calculate_chunk<'a>(entry: &'a SizedDictEntry, slice_ix: usize, pos_in_slice: usize) -> &[u8] { if slice_ix >= entry.0.len() { &[] } else { @@ -298,7 +298,7 @@ fn calculate_chunk<'a>(entry: &'a TfcDictEntry, slice_ix: usize, pos_in_slice: u } fn calculate_advance<'a>( - entry: &'a TfcDictEntry, + entry: &'a SizedDictEntry, slice_ix: &mut usize, pos_in_slice: &mut usize, mut cnt: usize, @@ -338,7 +338,7 @@ fn calculate_advance<'a>( } } -impl<'a> Buf for TfcEntryBuf<'a> { +impl<'a> Buf for SizedDictEntryBuf<'a> { fn remaining(&self) -> usize { calculate_remaining(self.entry, self.slice_ix, self.pos_in_slice) } @@ -352,13 +352,13 @@ impl<'a> Buf for TfcEntryBuf<'a> { } } -pub struct OwnedTfcEntryBuf { - entry: TfcDictEntry, +pub struct OwnedSizedDictEntryBuf { + entry: SizedDictEntry, slice_ix: usize, pos_in_slice: usize, } -impl Buf for OwnedTfcEntryBuf { +impl Buf for OwnedSizedDictEntryBuf { fn remaining(&self) -> usize { calculate_remaining(&self.entry, self.slice_ix, self.pos_in_slice) } @@ -372,16 +372,16 @@ impl Buf for OwnedTfcEntryBuf { } } -pub struct TfcBlock { - header: TfcBlockHeader, +pub struct SizedDictBlock { + header: SizedBlockHeader, data: Bytes, } -impl TfcBlock { - pub fn parse(bytes: &mut Bytes) -> Result { - let header = TfcBlockHeader::parse(bytes)?; +impl SizedDictBlock { + pub fn parse(bytes: &mut Bytes) -> Result { + let header = SizedBlockHeader::parse(bytes)?; if bytes.remaining() < header.buffer_length { - return Err(TfcError::NotEnoughData); + return Err(SizedDictError::NotEnoughData); } let data = bytes.split_to(header.buffer_length); @@ -397,9 +397,9 @@ impl TfcBlock { self.header.num_entries != BLOCK_SIZE as u8 } - pub fn entry(&self, index: usize) -> TfcDictEntry { + pub fn entry(&self, index: usize) -> SizedDictEntry { if index == 0 { - return TfcDictEntry::new(vec![self.header.head.clone()]); + return SizedDictEntry::new(vec![self.header.head.clone()]); } let mut v = Vec::with_capacity(7); @@ -461,7 +461,7 @@ impl TfcBlock { let 
suffix_size = self.header.sizes[index - 1]; slices.push(self.data.slice(offset..offset + suffix_size)); - TfcDictEntry::new_optimized(slices) + SizedDictEntry::new_optimized(slices) } fn suffixes<'a>(&'a self) -> impl Iterator + 'a { @@ -584,7 +584,7 @@ pub(crate) fn build_block_unchecked(buf: &mut B, slices: &[&[u8]]) -> size } -pub fn block_head(mut block: Bytes) -> Result { +pub fn block_head(mut block: Bytes) -> Result { block.advance(1); let (size, _) = vbyte::decode_buf(&mut block)?; Ok(block.split_to(size as usize)) @@ -602,10 +602,10 @@ mod tests { buf.freeze() } - fn build_block(strings: &[&[u8]]) -> TfcBlock { + fn build_block(strings: &[&[u8]]) -> SizedDictBlock { let mut bytes = build_block_bytes(strings); - TfcBlock::parse(&mut bytes).unwrap() + SizedDictBlock::parse(&mut bytes).unwrap() } #[test] @@ -614,7 +614,7 @@ mod tests { let block = build_block(&strings); - let expected_header = TfcBlockHeader { + let expected_header = SizedBlockHeader { head: Bytes::copy_from_slice(b"aaaaaa"), num_entries: 5, buffer_length: 11, diff --git a/src/structure/tfc/dict.rs b/src/structure/tfc/dict.rs index 0221095f..0aef6b89 100644 --- a/src/structure/tfc/dict.rs +++ b/src/structure/tfc/dict.rs @@ -32,12 +32,12 @@ fn build_dict_unchecked<'a, B1: BufMut, B2: BufMut, I: Iterator array_builder.finalize(); } -pub struct TfcDict { +pub struct SizedDict { offsets: LogArray, data: Bytes, } -impl TfcDict { +impl SizedDict { pub fn from_parts(offsets: Bytes, data: Bytes) -> Self { let offsets = LogArray::parse(offsets).unwrap(); Self { offsets, data } @@ -67,9 +67,9 @@ impl TfcDict { block_bytes } - pub fn block(&self, block_index: usize) -> TfcBlock { + pub fn block(&self, block_index: usize) -> SizedDictBlock { let mut block_bytes = self.block_bytes(block_index); - TfcBlock::parse(&mut block_bytes).unwrap() + SizedDictBlock::parse(&mut block_bytes).unwrap() } pub fn block_head(&self, block_index: usize) -> Bytes { @@ -87,7 +87,7 @@ impl TfcDict { self.offsets.len() + 1 } - pub fn entry(&self, index: u64) -> TfcDictEntry { + pub fn entry(&self, index: u64) -> SizedDictEntry { let block = self.block((index / 8) as usize); block.entry((index % 8) as usize) } @@ -166,7 +166,7 @@ mod tests { let array_bytes = array_buf.freeze(); let data_bytes = data_buf.freeze(); - let dict = TfcDict::from_parts(array_bytes, data_bytes); + let dict = SizedDict::from_parts(array_bytes, data_bytes); assert_eq!(2, dict.num_blocks()); assert_eq!(b"aaaaaaaa", &dict.block_head(0)[..]); @@ -208,7 +208,7 @@ mod tests { let array_bytes = array_buf.freeze(); let data_bytes = data_buf.freeze(); - let dict = TfcDict::from_parts(array_bytes, data_bytes); + let dict = SizedDict::from_parts(array_bytes, data_bytes); for (ix, s) in strings.into_iter().enumerate() { assert_eq!(IdLookupResult::Found(ix as u64), dict.id(s)); @@ -240,7 +240,7 @@ mod tests { let array_bytes = array_buf.freeze(); let data_bytes = data_buf.freeze(); - let dict = TfcDict::from_parts(array_bytes, data_bytes); + let dict = SizedDict::from_parts(array_bytes, data_bytes); assert_eq!(IdLookupResult::NotFound, dict.id(b"a")); assert_eq!(IdLookupResult::Closest(0), dict.id(b"ab")); From 1124d43823341268a81b7fef7930e593a7a65e25 Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Thu, 24 Nov 2022 16:42:51 +0100 Subject: [PATCH 14/99] typed dictionary segments --- src/structure/tfc/block.rs | 4 +- src/structure/tfc/dict.rs | 9 ++- src/structure/tfc/mod.rs | 1 + src/structure/tfc/typed.rs | 156 +++++++++++++++++++++++++++++++++++++ 4 files changed, 163 
insertions(+), 7 deletions(-) create mode 100644 src/structure/tfc/typed.rs diff --git a/src/structure/tfc/block.rs b/src/structure/tfc/block.rs index 10adee23..0d204c4f 100644 --- a/src/structure/tfc/block.rs +++ b/src/structure/tfc/block.rs @@ -532,9 +532,7 @@ impl IdLookupResult { pub fn default(self, default: u64) -> Self { match self { - Self::NotFound => { - Self::Closest(default) - } + Self::NotFound => Self::Closest(default), _ => self, } } diff --git a/src/structure/tfc/dict.rs b/src/structure/tfc/dict.rs index 0aef6b89..e9d3c3e3 100644 --- a/src/structure/tfc/dict.rs +++ b/src/structure/tfc/dict.rs @@ -6,7 +6,7 @@ use itertools::Itertools; use super::block::*; -fn build_dict_unchecked<'a, B1: BufMut, B2: BufMut, I: Iterator>( +pub fn build_dict_unchecked, I: Iterator>( array_buf: &mut B1, data_buf: &mut B2, iter: I, @@ -16,8 +16,9 @@ fn build_dict_unchecked<'a, B1: BufMut, B2: BufMut, I: Iterator let mut offset = 0; for chunk in &chunk_iter { - let slices: Vec<&[u8]> = chunk.collect(); - let size = build_block_unchecked(data_buf, &slices); + let slices: Vec = chunk.collect(); + let borrows: Vec<&[u8]> = slices.iter().map(|s| s.as_ref()).collect(); + let size = build_block_unchecked(data_buf, &borrows); offset += size; offsets.push(offset as u64); } @@ -129,7 +130,7 @@ impl SizedDict { // previous block if the entry was not found in the // current block. This is only possible if the block as // not the very first one. - result.default(self.block_num_elements(found-1) as u64 + offset - 1) + result.default(self.block_num_elements(found - 1) as u64 + offset - 1) } else { result } diff --git a/src/structure/tfc/mod.rs b/src/structure/tfc/mod.rs index 01c22bfa..5813c5f4 100644 --- a/src/structure/tfc/mod.rs +++ b/src/structure/tfc/mod.rs @@ -1,2 +1,3 @@ pub mod block; pub mod dict; +pub mod typed; diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs new file mode 100644 index 00000000..ed83b7f6 --- /dev/null +++ b/src/structure/tfc/typed.rs @@ -0,0 +1,156 @@ +use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; +use bytes::{Buf, BufMut, Bytes, BytesMut}; +use std::marker::PhantomData; + +use crate::structure::MonotonicLogArray; + +use super::{ + block::IdLookupResult, + dict::{build_dict_unchecked, SizedDict}, +}; + +pub struct TypedDict { + types_present: MonotonicLogArray, + type_offsets: Option, + data: Bytes, +} + +pub struct TypedDictSegment { + dict: SizedDict, + _x: PhantomData, +} + +impl TypedDictSegment { + pub fn from_parts(offsets: Bytes, data: Bytes) -> Self { + let dict = SizedDict::from_parts(offsets, data); + Self { + dict, + _x: Default::default(), + } + } + + pub fn get(&self, index: u64) -> T { + let entry = self.dict.entry(index); + T::from_lexical(entry.into_buf()) + } + + pub fn id(&self, val: &T) -> IdLookupResult { + let slice = val.to_lexical(); + self.dict.id(&slice[..]) + } +} + +pub enum Datatype { + String, + UInt64, +} + +pub trait TdbDataType { + fn datatype() -> Datatype; + + fn to_lexical(&self) -> Bytes; + + fn from_lexical(b: B) -> Self; +} + +impl TdbDataType for String { + fn datatype() -> Datatype { + Datatype::String + } + + fn to_lexical(&self) -> Bytes { + Bytes::copy_from_slice(self.as_bytes()) + } + + fn from_lexical(mut b: B) -> Self { + let mut vec = vec![0; b.remaining()]; + b.copy_to_slice(&mut vec); + String::from_utf8(vec).unwrap() + } +} + +impl TdbDataType for u64 { + fn datatype() -> Datatype { + Datatype::UInt64 + } + + fn to_lexical(&self) -> Bytes { + let mut buf = BytesMut::new().writer(); + 
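// big-endian, so the encoded bytes sort in the same order as the values +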
buf.write_u64::(*self).unwrap(); + + buf.into_inner().freeze() + } + + fn from_lexical(b: B) -> Self { + b.reader().read_u64::().unwrap() + } +} + +pub fn build_segment>( + array_buf: &mut B1, + data_buf: &mut B2, + iter: I, +) { + let slices = iter.map(|val| val.to_lexical()); + + build_dict_unchecked(array_buf, data_buf, slices); +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn build_and_parse_string_dictionary() { + let strings: Vec<_> = [ + "aaaaaaaa", + "bbbbbbbb", + "bbbcccdaaaa", + "f", + "fafasdfas", + "gafovp", + "gdfasfa", + "gdfbbbbbb", + "hello", + "iguana", + "illusion", + "illustrated", + "jetengine", + "jetplane", + ] + .iter() + .map(|s| s.to_string()) + .collect(); + + let mut offsets = BytesMut::new(); + let mut data = BytesMut::new(); + + build_segment(&mut offsets, &mut data, strings.clone().into_iter()); + + let segment = TypedDictSegment::from_parts(offsets.freeze(), data.freeze()); + + for (ix, s) in strings.into_iter().enumerate() { + assert_eq!(IdLookupResult::Found(ix as u64), segment.id(&s)); + assert_eq!(s, segment.get(ix as u64)); + } + } + + #[test] + fn build_and_parse_u64_dictionary() { + let nums: Vec<_> = vec![ + 2, 5, 42, 2324, 256463, 256464, 1234567, 803050303, 999999999, 9999999999, + ]; + + let mut offsets = BytesMut::new(); + let mut data = BytesMut::new(); + + build_segment(&mut offsets, &mut data, nums.clone().into_iter()); + + let segment = TypedDictSegment::from_parts(offsets.freeze(), data.freeze()); + + for (ix, s) in nums.into_iter().enumerate() { + assert_eq!(IdLookupResult::Found(ix as u64), segment.id(&s)); + assert_eq!(s, segment.get(ix as u64)); + } + } +} From a2df5f37144b9f39aa3c4d1e2ce11b8e9fb3d3e7 Mon Sep 17 00:00:00 2001 From: Gavin Mendel-Gleason Date: Fri, 25 Nov 2022 15:23:09 +0100 Subject: [PATCH 15/99] Add decimals and bigints --- Cargo.toml | 1 + src/structure/tfc/mod.rs | 2 + src/structure/tfc/typed.rs | 236 ++++++++++++++++++++++++++++++++++++- 3 files changed, 236 insertions(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index dd626ad9..ccc95d7c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,6 +26,7 @@ rayon = "1.4" thiserror = "1.0" async-trait = "0.1" itertools = "0.10" +rug = "1.16" [dev-dependencies] tempfile = "3.1" diff --git a/src/structure/tfc/mod.rs b/src/structure/tfc/mod.rs index 5813c5f4..4b0b89cd 100644 --- a/src/structure/tfc/mod.rs +++ b/src/structure/tfc/mod.rs @@ -1,3 +1,5 @@ pub mod block; +pub mod decimal; pub mod dict; +pub mod integer; pub mod typed; diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index ed83b7f6..40146dd6 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -1,12 +1,14 @@ +use crate::structure::MonotonicLogArray; use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; +use rug::Integer; use std::marker::PhantomData; -use crate::structure::MonotonicLogArray; - use super::{ block::IdLookupResult, + decimal::{decimal_to_storage, storage_to_decimal}, dict::{build_dict_unchecked, SizedDict}, + integer::{bigint_to_storage, storage_to_bigint}, }; pub struct TypedDict { @@ -43,6 +45,13 @@ impl TypedDictSegment { pub enum Datatype { String, UInt64, + UInt32, + Int64, + Int32, + Float32, + Float64, + Decimal, + BigInt, } pub trait TdbDataType { @@ -86,6 +95,135 @@ impl TdbDataType for u64 { } } +const I64_BYTE_MASK: u64 = 0b1000_0000 << (7 * 8); +impl TdbDataType for i64 { + fn datatype() -> Datatype { + Datatype::Int64 + } + + fn to_lexical(&self) -> Bytes { + let sign_flip = 
I64_BYTE_MASK ^ (*self as u64); + let mut buf = BytesMut::new().writer(); + buf.write_u64::(sign_flip).unwrap(); + buf.into_inner().freeze() + } + + fn from_lexical(b: B) -> Self { + let i = b.reader().read_u64::().unwrap(); + (I64_BYTE_MASK ^ i) as i64 + } +} + +const I32_BYTE_MASK: u32 = 0b1000_0000 << (3 * 8); +impl TdbDataType for i32 { + fn datatype() -> Datatype { + Datatype::Int32 + } + + fn to_lexical(&self) -> Bytes { + let sign_flip = I32_BYTE_MASK ^ (*self as u32); + let mut buf = BytesMut::new().writer(); + buf.write_u32::(sign_flip).unwrap(); + buf.into_inner().freeze() + } + + fn from_lexical(b: B) -> Self { + let i = b.reader().read_u32::().unwrap(); + (I32_BYTE_MASK ^ i) as i32 + } +} + +const F32_SIGN_MASK: u32 = 0x8000_0000; +const F32_COMPLEMENT: u32 = 0xffff_ffff; +impl TdbDataType for f32 { + fn datatype() -> Datatype { + Datatype::Float32 + } + + fn to_lexical(&self) -> Bytes { + let f = *self; + let g: u32; + if f.signum() == -1.0 { + g = f.to_bits() ^ F32_COMPLEMENT; + } else { + g = f.to_bits() ^ F32_SIGN_MASK; + }; + let mut buf = BytesMut::new().writer(); + buf.write_u32::(g).unwrap(); + buf.into_inner().freeze() + } + + fn from_lexical(b: B) -> Self { + let i = b.reader().read_u32::().unwrap(); + if i & F32_SIGN_MASK > 0 { + f32::from_bits(i ^ F32_SIGN_MASK) + } else { + f32::from_bits(i ^ F32_COMPLEMENT) + } + } +} + +const F64_SIGN_MASK: u64 = 0x8000_0000_0000_0000; +const F64_COMPLEMENT: u64 = 0xffff_ffff_ffff_ffff; +impl TdbDataType for f64 { + fn datatype() -> Datatype { + Datatype::Float64 + } + + fn to_lexical(&self) -> Bytes { + let f = *self; + let g: u64; + if f.signum() == -1.0 { + g = f.to_bits() ^ F64_COMPLEMENT; + } else { + g = f.to_bits() ^ F64_SIGN_MASK; + }; + let mut buf = BytesMut::new().writer(); + buf.write_u64::(g).unwrap(); + buf.into_inner().freeze() + } + + fn from_lexical(b: B) -> Self { + let i = b.reader().read_u64::().unwrap(); + if i & F64_SIGN_MASK > 0 { + f64::from_bits(i ^ F64_SIGN_MASK) + } else { + f64::from_bits(i ^ F64_COMPLEMENT) + } + } +} + +impl TdbDataType for Integer { + fn datatype() -> Datatype { + Datatype::Float64 + } + + fn to_lexical(&self) -> Bytes { + Bytes::from(bigint_to_storage(self.clone())) + } + + fn from_lexical(mut b: B) -> Self { + storage_to_bigint(&mut b) + } +} + +#[derive(PartialEq, Debug)] +pub struct Decimal(String); + +impl TdbDataType for Decimal { + fn datatype() -> Datatype { + Datatype::Decimal + } + + fn to_lexical(&self) -> Bytes { + Bytes::from(decimal_to_storage(&self.0)) + } + + fn from_lexical(mut b: B) -> Self { + Decimal(storage_to_decimal(&mut b)) + } +} + pub fn build_segment>( array_buf: &mut B1, data_buf: &mut B2, @@ -137,7 +275,7 @@ mod tests { #[test] fn build_and_parse_u64_dictionary() { - let nums: Vec<_> = vec![ + let nums: Vec = vec![ 2, 5, 42, 2324, 256463, 256464, 1234567, 803050303, 999999999, 9999999999, ]; @@ -153,4 +291,96 @@ mod tests { assert_eq!(s, segment.get(ix as u64)); } } + + use std::fmt::Debug; + + fn cycle(d: D) + where + D: TdbDataType + PartialEq + Debug, + { + let j = D::from_lexical(d.to_lexical()); + assert_eq!(d, j) + } + + #[test] + fn cycle_i64() { + cycle(-1_i64); + cycle(-23423423_i64); + cycle(0_i64); + cycle(i64::MAX); + cycle(i64::MIN); + cycle(324323_i64); + } + + #[test] + fn cycle_i32() { + cycle(-1_i32); + cycle(-23423423_i32); + cycle(0_i32); + cycle(i32::MAX); + cycle(i32::MIN); + cycle(324323_i32); + } + + #[test] + fn cycle_f32() { + cycle(-1_f32); + cycle(-23423423_f32); + cycle(0_f32); + cycle(324323_f32); + cycle(324323.2343_f32); + 
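// the same magnitude negated, which takes the complement branch +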
cycle(-324323.2343_f32); + cycle(f32::MAX); + cycle(f32::MIN); + cycle(f32::NEG_INFINITY); + cycle(f32::INFINITY); + + let j = f32::from_lexical(f32::NAN.to_lexical()); + assert!(j.is_nan()) + } + + #[test] + fn cycle_f64() { + cycle(-1_f64); + cycle(-23423423_f64); + cycle(0_f64); + cycle(-0_f64); + cycle(324323_f64); + cycle(324323.2343_f64); + cycle(-324323.2343_f64); + cycle(f64::MAX); + cycle(f64::MIN); + cycle(f64::NEG_INFINITY); + cycle(f64::INFINITY); + + let j = f64::from_lexical(f64::NAN.to_lexical()); + assert!(j.is_nan()) + } + + fn int(s: &str) -> Integer { + s.parse::().unwrap() + } + + #[test] + fn cycle_integer() { + cycle(int("-1")); + cycle(int("-12342343")); + cycle(int("0")); + cycle(int("234239847938724")); + cycle(int("983423984793872423423423432312698")); + cycle(int("-983423984793872423423423432312698")); + } + + #[test] + fn cycle_decimal() { + cycle(Decimal("-1".to_string())); + cycle(Decimal("-12342343".to_string())); + cycle(Decimal("0".to_string())); + cycle(Decimal("-0.1".to_string())); + cycle(Decimal("-0.0".to_string())); + cycle(Decimal("-0.1239343".to_string())); + cycle(Decimal("234239847938724.23423421".to_string())); + cycle(Decimal("983423984793872423423423432312698".to_string())); + cycle(Decimal("-983423984793872423423423432312698".to_string())); + } } From 1b917104ac6937a6e0329695e54d2f13bd89f9f2 Mon Sep 17 00:00:00 2001 From: Gavin Mendel-Gleason Date: Fri, 25 Nov 2022 15:43:04 +0100 Subject: [PATCH 16/99] Implementation of decimal and integer --- src/structure/tfc/decimal.rs | 119 ++++++++++++++++++++++++++++++ src/structure/tfc/integer.rs | 121 +++++++++++++++++++++++++++++++ 2 files changed, 240 insertions(+) create mode 100644 src/structure/tfc/decimal.rs create mode 100644 src/structure/tfc/integer.rs diff --git a/src/structure/tfc/decimal.rs b/src/structure/tfc/decimal.rs new file mode 100644 index 00000000..7fa0143d --- /dev/null +++ b/src/structure/tfc/decimal.rs @@ -0,0 +1,119 @@ +use bytes::Buf; +use rug::Integer; + +use crate::structure::tfc::integer; + +use super::integer::{bigint_to_storage, storage_to_bigint_and_sign, NEGATIVE_ZERO}; + +fn encode_fraction(fraction: Option<&str>) -> Vec { + if let Some(f) = fraction { + if f.is_empty() { + return vec![0x00]; // a "false zero" so we don't represent it at all. + } + let len = f.len(); + let size = len / 2 + usize::from(len % 2 != 0); + let mut bcd = Vec::with_capacity(size); + for i in 0..size { + let last = if i * 2 + 2 > len { + i * 2 + 1 + } else { + i * 2 + 2 + }; + let two = &f[2 * i..last]; + let mut this_int = centary_decimal_encode(two); + this_int <<= 1; + if i != size - 1 { + this_int |= 1 // add continuation bit.
+ } + bcd.push(this_int) + } + bcd + } else { + vec![0x00] // a "false zero" so we don't represent no fraction as a fraction + } +} + +fn centary_decimal_encode(s: &str) -> u8 { + if s.len() == 1 { + let i = s.parse::().unwrap(); + i * 11 + 1 + } else { + let i = s.parse::().unwrap(); + let o = i / 10 + 1; + i + o + 1 + } +} + +fn centary_decimal_decode(i: u8) -> String { + let j = i - 1; + if j % 11 == 0 { + let num = j / 11; + format!("{num:}") + } else { + let d = j / 11; + let num = j - d - 1; + format!("{num:02}") + } +} + +fn decode_fraction(fraction_buf: &mut B, is_pos: bool) -> String { + let mut first_byte = fraction_buf.chunk()[0]; + if !is_pos { + first_byte = !first_byte; + } + if first_byte == 0x00 { + "".to_string() + } else { + let mut s = String::new(); + while fraction_buf.has_remaining() { + let mut byte = fraction_buf.get_u8(); + if !is_pos { + byte = !byte; + } + let num = byte >> 1; + let res = centary_decimal_decode(num); + s.push_str(&res); + if res.len() == 1 || byte & 1 == 0 { + break; + } + } + s + } +} + +pub fn decimal_to_storage(decimal: &str) -> Vec { + let mut parts = decimal.split('.'); + let bigint = parts.next().unwrap_or(decimal); + let fraction = parts.next(); + let integer_part = bigint.parse::().unwrap(); + let is_neg = decimal.starts_with('-'); + let prefix = bigint_to_storage(integer_part.clone()); + let mut prefix = if integer_part == 0 && is_neg { + vec![NEGATIVE_ZERO] // negative zero + } else { + prefix + }; + let suffix = if is_neg { + let mut suffix = encode_fraction(fraction); + for i in 0..suffix.len() { + suffix[i] = !suffix[i] + } + suffix + } else { + encode_fraction(fraction) + }; + prefix.extend(suffix); + prefix +} + +pub fn storage_to_decimal(bytes: &mut B) -> String { + let (int, is_pos) = storage_to_bigint_and_sign(bytes); + let fraction = decode_fraction(bytes, is_pos); + let decimal = if fraction.is_empty() { + format!("{int:}") + } else { + let sign = if int == 0 && !is_pos { "-" } else { "" }; + format!("{sign:}{int:}.{fraction:}") + }; + decimal +} diff --git a/src/structure/tfc/integer.rs b/src/structure/tfc/integer.rs new file mode 100644 index 00000000..25617add --- /dev/null +++ b/src/structure/tfc/integer.rs @@ -0,0 +1,121 @@ +use bytes::Buf; +use rug::Integer; + +const TERMINAL: u8 = 0; +const FIRST_SIGN: u8 = 0b1000_0000u8; +const FIRST_TERMINAL: u8 = 0b0000_0000u8; +const CONTINUATION: u8 = 0b1000_0000u8; +const FIRST_CONTINUATION: u8 = 0b0100_0000u8; +const BASE_MASK: u8 = !CONTINUATION; +const FIRST_MASK: u8 = !(FIRST_SIGN | FIRST_CONTINUATION); +const FIRST_MAX: u8 = FIRST_CONTINUATION; +pub const NEGATIVE_ZERO: u8 = 0b0111_1111; + +// Leave in reverse order for the convenience of the caller +fn size_encode(size: u32) -> Vec { + if size == 0 { + return vec![NEGATIVE_ZERO]; // just the positive sign bit (allows negative zero) + } + let mut remainder = size; + let mut v = vec![]; + let mut last = true; + while remainder > 0 { + if remainder >= CONTINUATION as u32 { + let continued = if last { TERMINAL } else { CONTINUATION }; + let byte = continued | ((remainder & BASE_MASK as u32) as u8); + v.push(byte); + } else if remainder >= FIRST_MAX as u32 { + // special case where we fit in 7 bits but not 6 + // and we need a zero padded initial byte. 
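+ // (that padded first byte carries only the sign and continuation flags)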
+ let continued = if last { TERMINAL } else { CONTINUATION }; + let byte = continued | ((remainder & BASE_MASK as u32) as u8); + v.push(byte); + let byte = FIRST_SIGN | FIRST_CONTINUATION; + v.push(byte) + } else { + let continued = if last { + FIRST_TERMINAL + } else { + FIRST_CONTINUATION + }; + let byte = FIRST_SIGN | continued | ((remainder & FIRST_MASK as u32) as u8); + v.push(byte) + } + remainder >>= 7; + last = false; + } + v +} + +fn size_decode(v: &mut B) -> (bool, u32, usize) { + let mut size: u32 = 0; + let mut sign = true; + let mut i = 0; + while v.has_remaining() { + let vi = v.get_u8(); + if i == 0 { + sign = vi & FIRST_SIGN != 0; + let vi = if sign { vi } else { !vi }; + let val = (vi & FIRST_MASK) as u32; + if vi & FIRST_CONTINUATION == 0 { + return (sign, val, i + 1); + } else { + size += val + } + } else { + let vi = if sign { vi } else { !vi }; + let val = (vi & BASE_MASK) as u32; + if vi & CONTINUATION == 0 { + return (sign, size + val, i + 1); + } else { + size += val + } + } + size <<= 7; + i += 1; + } + (sign, size, i) +} + +pub fn bigint_to_storage(bigint: Integer) -> Vec { + let is_neg = bigint < 0; + let mut int = bigint.abs(); + let size = int.significant_bits() + 1; + let num_bytes = (size / 8) + u32::from(size % 8 != 0); + let size_bytes = size_encode(num_bytes); + let mut number_vec = Vec::with_capacity(size_bytes.len() + num_bytes as usize + 1); + for _ in 0..num_bytes { + let byte = int.to_u8_wrapping(); + number_vec.push(byte); + int >>= 8; + } + number_vec.extend(size_bytes); + if is_neg { + for i in 0..number_vec.len() { + number_vec[i] = !number_vec[i] + } + } + number_vec.reverse(); + number_vec +} + +pub fn storage_to_bigint_and_sign(bytes: &mut B) -> (Integer, bool) { + let (is_pos, size, _) = size_decode(bytes); + let mut int = Integer::new(); + if size == 0 { + return (int, is_pos); + } + for _ in 0..size { + int <<= 8; + let b = bytes.get_u8(); + int += if is_pos { b } else { !b }; + } + if !is_pos { + int = -int; + } + (int, is_pos) +} + +pub fn storage_to_bigint(bytes: &mut B) -> Integer { + storage_to_bigint_and_sign(bytes).0 +} From b1ddd01966097774d037e38302de31145461a22d Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Fri, 25 Nov 2022 15:46:37 +0100 Subject: [PATCH 17/99] only enable rug features we need --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index ccc95d7c..96e14b42 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,7 +26,7 @@ rayon = "1.4" thiserror = "1.0" async-trait = "0.1" itertools = "0.10" -rug = "1.16" +rug = {version="1.16", default-features=false, features=["integer","rational"]} [dev-dependencies] tempfile = "3.1" From 699bfbc197577c53667111483d41873e3c88b9f5 Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Fri, 25 Nov 2022 16:10:49 +0100 Subject: [PATCH 18/99] refactor segment building to use one continuous offset array --- src/structure/tfc/dict.rs | 25 ++++++++++++++++--------- src/structure/tfc/typed.rs | 24 ++++++++++++++++++------ 2 files changed, 34 insertions(+), 15 deletions(-) diff --git a/src/structure/tfc/dict.rs b/src/structure/tfc/dict.rs index e9d3c3e3..7f0eb358 100644 --- a/src/structure/tfc/dict.rs +++ b/src/structure/tfc/dict.rs @@ -6,13 +6,12 @@ use itertools::Itertools; use super::block::*; -pub fn build_dict_unchecked, I: Iterator>( - array_buf: &mut B1, - data_buf: &mut B2, +pub fn build_dict_unchecked, I: Iterator>( + offsets: &mut Vec, + data_buf: &mut B, iter: I, ) { let chunk_iter = iter.chunks(BLOCK_SIZE); - let mut 
offsets = Vec::new(); let mut offset = 0; for chunk in &chunk_iter { @@ -22,12 +21,14 @@ pub fn build_dict_unchecked, I: Iterator< offset += size; offsets.push(offset as u64); } - +} +pub fn build_offset_logarray(buf: &mut B, mut offsets: Vec) { + // the last offset doesn't matter as it's implied by the total size offsets.pop(); let largest_element = offsets.last().cloned().unwrap_or(0); let width = calculate_width(largest_element); - let mut array_builder = LogArrayBufBuilder::new(array_buf, width); + let mut array_builder = LogArrayBufBuilder::new(buf, width); array_builder.push_vec(offsets); array_builder.finalize(); @@ -142,6 +143,12 @@ mod tests { use super::*; use bytes::BytesMut; + fn build_dict_and_offsets, I: Iterator>(array_buf: &mut B1, data_buf: &mut B2, vals: I) { + let mut offsets = Vec::new(); + build_dict_unchecked(&mut offsets, data_buf, vals); + build_offset_logarray(array_buf, offsets); + } + #[test] fn build_dict_of_two_blocks() { let strings: Vec<&[u8]> = vec![ @@ -163,7 +170,7 @@ mod tests { let mut array_buf = BytesMut::new(); let mut data_buf = BytesMut::new(); - build_dict_unchecked(&mut array_buf, &mut data_buf, strings.clone().into_iter()); + build_dict_and_offsets(&mut array_buf, &mut data_buf, strings.clone().into_iter()); let array_bytes = array_buf.freeze(); let data_bytes = data_buf.freeze(); @@ -205,7 +212,7 @@ mod tests { let mut array_buf = BytesMut::new(); let mut data_buf = BytesMut::new(); - build_dict_unchecked(&mut array_buf, &mut data_buf, strings.clone().into_iter()); + build_dict_and_offsets(&mut array_buf, &mut data_buf, strings.clone().into_iter()); let array_bytes = array_buf.freeze(); let data_bytes = data_buf.freeze(); @@ -237,7 +244,7 @@ mod tests { let mut array_buf = BytesMut::new(); let mut data_buf = BytesMut::new(); - build_dict_unchecked(&mut array_buf, &mut data_buf, strings.clone().into_iter()); + build_dict_and_offsets(&mut array_buf, &mut data_buf, strings.clone().into_iter()); let array_bytes = array_buf.freeze(); let data_bytes = data_buf.freeze(); diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index 40146dd6..2e823eb9 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -224,20 +224,32 @@ impl TdbDataType for Decimal { } } -pub fn build_segment>( - array_buf: &mut B1, - data_buf: &mut B2, +pub fn build_segment>( + offsets: &mut Vec, + data_buf: &mut B, iter: I, ) { let slices = iter.map(|val| val.to_lexical()); - build_dict_unchecked(array_buf, data_buf, slices); + build_dict_unchecked(offsets, data_buf, slices); } #[cfg(test)] mod tests { + use crate::structure::tfc::dict::build_offset_logarray; + use super::*; + fn build_segment_and_offsets>( + array_buf: &mut B1, + data_buf: &mut B2, + iter: I, + ) { + let mut offsets = Vec::new(); + build_segment(&mut offsets, data_buf, iter); + build_offset_logarray(array_buf, offsets); + } + #[test] fn build_and_parse_string_dictionary() { let strings: Vec<_> = [ @@ -263,7 +275,7 @@ mod tests { let mut offsets = BytesMut::new(); let mut data = BytesMut::new(); - build_segment(&mut offsets, &mut data, strings.clone().into_iter()); + build_segment_and_offsets(&mut offsets, &mut data, strings.clone().into_iter()); let segment = TypedDictSegment::from_parts(offsets.freeze(), data.freeze()); @@ -282,7 +294,7 @@ mod tests { let mut offsets = BytesMut::new(); let mut data = BytesMut::new(); - build_segment(&mut offsets, &mut data, nums.clone().into_iter()); + build_segment_and_offsets(&mut offsets, &mut data, nums.clone().into_iter()); let segment = 
TypedDictSegment::from_parts(offsets.freeze(), data.freeze()); From ab5e12f8ca1ed0f7f9a998690f1050a6bc362de0 Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Fri, 25 Nov 2022 17:05:02 +0100 Subject: [PATCH 19/99] write multiple segments into one go --- src/structure/tfc/typed.rs | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index 2e823eb9..6e86f5fa 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -1,13 +1,14 @@ -use crate::structure::MonotonicLogArray; +use crate::structure::{MonotonicLogArray, util::calculate_width, LogArrayBufBuilder}; use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use rug::Integer; use std::marker::PhantomData; +use itertools::*; use super::{ block::IdLookupResult, decimal::{decimal_to_storage, storage_to_decimal}, - dict::{build_dict_unchecked, SizedDict}, + dict::{build_dict_unchecked, SizedDict, build_offset_logarray}, integer::{bigint_to_storage, storage_to_bigint}, }; @@ -234,6 +236,34 @@ pub fn build_segment>( build_dict_unchecked(offsets, data_buf, slices); } +pub fn build_multiple_segments, I: Iterator>(used_types: &mut B1, type_offsets: &mut B2, block_offsets: &mut B3, data: &mut B4, iter: I) { + let mut types: Vec<(Datatype, u64)> = Vec::new(); + let mut offsets = Vec::with_capacity(iter.size_hint().0); + for (key, group) in iter.group_by(|v|v.0).into_iter() { + let start_offset = offsets.len(); + types.push((key, start_offset as u64)); + build_dict_unchecked(&mut offsets, data, group.map(|v|v.1)); + } + offsets.pop(); + build_offset_logarray(block_offsets, offsets); + + let largest = types.last().unwrap(); + + let types_width = calculate_width(largest.0 as u64); + let type_offsets_width = calculate_width(largest.1); + + let mut types_builder = LogArrayBufBuilder::new(used_types, types_width); + let mut type_offsets_builder = LogArrayBufBuilder::new(type_offsets, type_offsets_width); + + for (t,o) in types { + types_builder.push(t as u64); + type_offsets_builder.push(o); + } + + types_builder.finalize(); + type_offsets_builder.finalize(); +} + #[cfg(test)] mod tests { use crate::structure::tfc::dict::build_offset_logarray; From 71df537d3a2b10706c1bcef87de5a12d134e4dc1 Mon Sep 17 00:00:00 2001 From: Gavin Mendel-Gleason Date: Sat, 26 Nov 2022 11:05:16 +0100 Subject: [PATCH 20/99] Working test, added start offset parameter, Need to fix offsets --- src/structure/tfc/dict.rs | 15 ++-- src/structure/tfc/typed.rs | 154 +++++++++++++++++++++++++++++-------- 2 files changed, 133 insertions(+), 36 deletions(-) diff --git a/src/structure/tfc/dict.rs b/src/structure/tfc/dict.rs index 7f0eb358..2f3c0a6e 100644 --- a/src/structure/tfc/dict.rs +++ b/src/structure/tfc/dict.rs @@ -7,19 +7,20 @@ use itertools::Itertools; use super::block::*; pub fn build_dict_unchecked, I: Iterator>( + start_offset: u64, offsets: &mut Vec, data_buf: &mut B, iter: I, ) { let chunk_iter = iter.chunks(BLOCK_SIZE); - let mut offset = 0; + let mut offset = start_offset; for chunk in &chunk_iter { let slices: Vec = chunk.collect(); let borrows: Vec<&[u8]> = slices.iter().map(|s| s.as_ref()).collect(); let size = build_block_unchecked(data_buf, &borrows); - offset += size; - offsets.push(offset as u64); + offset += size as
u64; + offsets.push(offset); } } pub fn build_offset_logarray(buf: &mut B, mut offsets: Vec) { @@ -143,9 +144,13 @@ mod tests { use super::*; use bytes::BytesMut; - fn build_dict_and_offsets, I: Iterator>(array_buf: &mut B1, data_buf: &mut B2, vals: I) { + fn build_dict_and_offsets, I: Iterator>( + array_buf: &mut B1, + data_buf: &mut B2, + vals: I, + ) { let mut offsets = Vec::new(); - build_dict_unchecked(&mut offsets, data_buf, vals); + build_dict_unchecked(0, &mut offsets, data_buf, vals); build_offset_logarray(array_buf, offsets); } diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index 6e86f5fa..a1d9e2d5 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -1,14 +1,14 @@ -use crate::structure::{MonotonicLogArray, util::calculate_width, LogArrayBufBuilder}; +use crate::structure::{util::calculate_width, LogArrayBufBuilder, MonotonicLogArray}; use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; +use itertools::*; use rug::Integer; use std::marker::PhantomData; -use itertools::*; use super::{ block::IdLookupResult, decimal::{decimal_to_storage, storage_to_decimal}, - dict::{build_dict_unchecked, SizedDict, build_offset_logarray}, + dict::{build_dict_unchecked, build_offset_logarray, SizedDict}, integer::{bigint_to_storage, storage_to_bigint}, }; @@ -17,6 +17,17 @@ pub struct TypedDict { type_offsets: Option, data: Bytes, } +/* +impl TypedDict { + pub fn id(&self, slice: &[u8], dt: Datatype) -> IdLookupResult { + if let Some(i) = self.types_present.index_of(dt as u64) { + let offset = types_offsets[i]; + + } else { + IdLookupResult::NotFound + } + } +}*/ pub struct TypedDictSegment { dict: SizedDict, @@ -46,10 +57,10 @@ impl TypedDictSegment { #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] pub enum Datatype { String = 0, - UInt64, UInt32, - Int64, Int32, + UInt64, + Int64, Float32, Float64, Decimal, @@ -80,58 +91,75 @@ impl TdbDataType for String { } } -impl TdbDataType for u64 { +impl TdbDataType for u32 { fn datatype() -> Datatype { Datatype::UInt64 } fn to_lexical(&self) -> Bytes { let mut buf = BytesMut::new().writer(); - buf.write_u64::(*self).unwrap(); + buf.write_u32::(*self).unwrap(); buf.into_inner().freeze() } fn from_lexical(b: B) -> Self { - b.reader().read_u64::().unwrap() + b.reader().read_u32::().unwrap() } } -const I64_BYTE_MASK: u64 = 0b1000_0000 << (7 * 8); -impl TdbDataType for i64 { +const I32_BYTE_MASK: u32 = 0b1000_0000 << (3 * 8); +impl TdbDataType for i32 { fn datatype() -> Datatype { - Datatype::Int64 + Datatype::Int32 } fn to_lexical(&self) -> Bytes { - let sign_flip = I64_BYTE_MASK ^ (*self as u64); + let sign_flip = I32_BYTE_MASK ^ (*self as u32); let mut buf = BytesMut::new().writer(); - buf.write_u64::(sign_flip).unwrap(); + buf.write_u32::(sign_flip).unwrap(); buf.into_inner().freeze() } fn from_lexical(b: B) -> Self { - let i = b.reader().read_u64::().unwrap(); - (I64_BYTE_MASK ^ i) as i64 + let i = b.reader().read_u32::().unwrap(); + (I32_BYTE_MASK ^ i) as i32 } } -const I32_BYTE_MASK: u32 = 0b1000_0000 << (3 * 8); -impl TdbDataType for i32 { +impl TdbDataType for u64 { fn datatype() -> Datatype { - Datatype::Int32 + Datatype::UInt64 } fn to_lexical(&self) -> Bytes { - let sign_flip = I32_BYTE_MASK ^ (*self as u32); let mut buf = BytesMut::new().writer(); - buf.write_u32::(sign_flip).unwrap(); + buf.write_u64::(*self).unwrap(); + buf.into_inner().freeze() } fn from_lexical(b: B) -> Self { - let i = b.reader().read_u32::().unwrap(); - (I32_BYTE_MASK ^ 
i) as i32 + b.reader().read_u64::().unwrap() + } +} + +const I64_BYTE_MASK: u64 = 0b1000_0000 << (7 * 8); +impl TdbDataType for i64 { + fn datatype() -> Datatype { + Datatype::Int64 + } + + fn to_lexical(&self) -> Bytes { + let sign_flip = I64_BYTE_MASK ^ (*self as u64); + let mut buf = BytesMut::new().writer(); + buf.write_u64::(sign_flip).unwrap(); + buf.into_inner().freeze() + } + + fn from_lexical(b: B) -> Self { + let i = b.reader().read_u64::().unwrap(); + (I64_BYTE_MASK ^ i) as i64 } } @@ -233,20 +261,33 @@ pub fn build_segment>( ) { let slices = iter.map(|val| val.to_lexical()); - build_dict_unchecked(offsets, data_buf, slices); + build_dict_unchecked(0, offsets, data_buf, slices); } -pub fn build_multiple_segments, I: Iterator>(used_types: &mut B1, type_offsets: &mut B2, block_offsets: &mut B3, data: &mut B4, iter: I) { +pub fn build_multiple_segments< + B1: BufMut, + B2: BufMut, + B3: BufMut, + B4: BufMut, + R: AsRef<[u8]>, + I: Iterator, +>( + used_types: &mut B1, + type_offsets: &mut B2, + block_offsets: &mut B3, + data: &mut B4, + iter: I, +) { let mut types: Vec<(Datatype, u64)> = Vec::new(); let mut offsets = Vec::with_capacity(iter.size_hint().0); - for (key, group) in iter.group_by(|v|v.0).into_iter() { - let start_offset = offsets.len(); + for (key, group) in iter.group_by(|v| v.0).into_iter() { + let start_offset = offsets.last().map(|t| *t).unwrap_or(0_u64); types.push((key, start_offset as u64)); - build_dict_unchecked(&mut offsets, data, group.map(|v|v.1)); + build_dict_unchecked(start_offset, &mut offsets, data, group.map(|v| v.1)); } offsets.pop(); build_offset_logarray(block_offsets, offsets); - + eprintln!("types: {types:?}"); let largest = types.last().unwrap(); let types_width = calculate_width(largest.0 as u64); @@ -254,8 +295,8 @@ pub fn build_multiple_segments(t: T) -> (Datatype, Bytes) { + (T::datatype(), t.to_lexical()) + } + + #[test] + fn test_multi_segment() { + let mut vec: Vec<(Datatype, Bytes)> = vec![ + make_entry(Decimal("-1".to_string())), + make_entry("asdf".to_string()), + make_entry(Decimal("-12342343.2348973".to_string())), + make_entry("Batty".to_string()), + make_entry("Batman".to_string()), + make_entry(-3_i64), + make_entry(Decimal("2348973".to_string())), + make_entry(4.389832_f32), + make_entry("apple".to_string()), + make_entry(23434.389832_f32), + make_entry("apply".to_string()), + make_entry(-500_i32), + make_entry(20_u32), + ]; + vec.sort(); + let mut used_types = Vec::new(); + let mut type_offsets = Vec::new(); + let mut block_offsets = Vec::new(); + let mut data = BytesMut::new(); + build_multiple_segments( + &mut used_types, + &mut type_offsets, + &mut block_offsets, + &mut data, + vec.clone().into_iter(), + ); + eprintln!("used_types : {used_types:?}"); + eprintln!("type_offsets : {type_offsets:?}"); + eprintln!("block_offsets : {block_offsets:?}"); + eprintln!("data : {data:?}"); + + let used_types_vec: Vec = LogArray::parse(Bytes::from(used_types)) + .unwrap() + .iter() + .collect(); + + let expected_types_vec: Vec = vec.iter().map(|x| x.0 as u64).dedup().collect(); + assert_eq!(used_types_vec, expected_types_vec); + + eprintln!("expected_types_vec: {expected_types_vec:?}"); + + panic!(); + } } From 46b9a284b1aaa376e4daaa10cc51e9b7aff65cf2 Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Sat, 26 Nov 2022 11:09:37 +0100 Subject: [PATCH 21/99] fix offsets --- src/structure/tfc/typed.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index 
a1d9e2d5..ab48c678 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -282,7 +282,8 @@ pub fn build_multiple_segments< let mut offsets = Vec::with_capacity(iter.size_hint().0); for (key, group) in iter.group_by(|v| v.0).into_iter() { let start_offset = offsets.last().map(|t| *t).unwrap_or(0_u64); - types.push((key, start_offset as u64)); + let start_type_offset = offsets.len(); + types.push((key, start_type_offset as u64)); build_dict_unchecked(start_offset, &mut offsets, data, group.map(|v| v.1)); } offsets.pop(); From 1b8393090956a3e85ba5faba46e5a8bbba5ce14b Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Sat, 26 Nov 2022 17:15:10 +0100 Subject: [PATCH 22/99] typed dict retrieval --- Cargo.toml | 2 + src/structure/logarray.rs | 24 ++++ src/structure/tfc/block.rs | 1 + src/structure/tfc/dict.rs | 67 +++++---- src/structure/tfc/typed.rs | 280 +++++++++++++++++++++++++++++-------- 5 files changed, 287 insertions(+), 87 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 96e14b42..51034a67 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,6 +27,8 @@ thiserror = "1.0" async-trait = "0.1" itertools = "0.10" rug = {version="1.16", default-features=false, features=["integer","rational"]} +num-derive = "0.3" +num-traits = "0.2" [dev-dependencies] tempfile = "3.1" diff --git a/src/structure/logarray.rs b/src/structure/logarray.rs index 629dafa7..bb0d99c5 100644 --- a/src/structure/logarray.rs +++ b/src/structure/logarray.rs @@ -58,6 +58,8 @@ use std::{cmp::Ordering, convert::TryFrom, error, fmt, io}; use tokio::io::{AsyncReadExt, AsyncWriteExt}; use tokio_util::codec::{Decoder, FramedRead}; +use itertools::Itertools; + // Static assertion: We expect the system architecture bus width to be >= 32 bits. If it is not, // the following line will cause a compiler error. (Ignore the unrelated error message itself.) const _: usize = 0 - !(std::mem::size_of::() >= 32 >> 3) as usize; @@ -86,6 +88,12 @@ pub struct LogArray { input_buf: Bytes, } +impl std::fmt::Debug for LogArray { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "LogArray([{}])", self.iter().format(", ")) + } +} + /// An error that occurred during a log array operation. 
#[derive(Debug, PartialEq)] pub enum LogArrayError { @@ -681,6 +689,12 @@ pub async fn logarray_stream_entries( #[derive(Clone)] pub struct MonotonicLogArray(LogArray); +impl std::fmt::Debug for MonotonicLogArray { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "MonotonicLogArray([{}])", self.iter().format(", ")) + } +} + impl MonotonicLogArray { pub fn from_logarray(logarray: LogArray) -> MonotonicLogArray { if cfg!(debug_assertions) { @@ -702,6 +716,12 @@ impl MonotonicLogArray { MonotonicLogArray(logarray) } + pub fn parse(bytes: Bytes) -> Result { + let logarray = LogArray::parse(bytes)?; + + Ok(Self::from_logarray(logarray)) + } + pub fn len(&self) -> usize { self.0.len() } @@ -750,6 +770,10 @@ impl MonotonicLogArray { (min + max) / 2 + 1 } + + pub fn slice(&self, offset: usize, len: usize) -> MonotonicLogArray { + Self(self.0.slice(offset, len)) + } } impl From for MonotonicLogArray { diff --git a/src/structure/tfc/block.rs b/src/structure/tfc/block.rs index 0d204c4f..7335091a 100644 --- a/src/structure/tfc/block.rs +++ b/src/structure/tfc/block.rs @@ -372,6 +372,7 @@ impl Buf for OwnedSizedDictEntryBuf { } } +#[derive(Debug)] pub struct SizedDictBlock { header: SizedBlockHeader, data: Bytes, diff --git a/src/structure/tfc/dict.rs b/src/structure/tfc/dict.rs index 2f3c0a6e..7a85cddc 100644 --- a/src/structure/tfc/dict.rs +++ b/src/structure/tfc/dict.rs @@ -1,6 +1,6 @@ use std::cmp::Ordering; -use crate::structure::{util::calculate_width, LogArray, LogArrayBufBuilder}; +use crate::structure::{util::calculate_width, LogArrayBufBuilder, MonotonicLogArray}; use bytes::{BufMut, Bytes}; use itertools::Itertools; @@ -35,15 +35,21 @@ pub fn build_offset_logarray(buf: &mut B, mut offsets: Vec) { array_builder.finalize(); } +#[derive(Debug)] pub struct SizedDict { - offsets: LogArray, - data: Bytes, + offsets: MonotonicLogArray, + pub(crate) data: Bytes, + dict_offset: u64, } impl SizedDict { - pub fn from_parts(offsets: Bytes, data: Bytes) -> Self { - let offsets = LogArray::parse(offsets).unwrap(); - Self { offsets, data } + pub fn parse(offsets: Bytes, data: Bytes, dict_offset: u64) -> Self { + let offsets = MonotonicLogArray::parse(offsets).unwrap(); + Self::from_parts(offsets, data, dict_offset) + } + + pub fn from_parts(offsets: MonotonicLogArray, data: Bytes, dict_offset: u64) -> Self { + Self { offsets, data, dict_offset } } fn block_offset(&self, block_index: usize) -> usize { @@ -51,21 +57,26 @@ impl SizedDict { if block_index == 0 { offset = 0; } else { - offset = self.offsets.entry(block_index - 1) as usize; + offset = (self.offsets.entry(block_index - 1) - self.dict_offset) as usize; } offset } pub fn block_bytes(&self, block_index: usize) -> Bytes { + dbg!(block_index); let offset = self.block_offset(block_index); let block_bytes; - if block_index == self.offsets.len() { + dbg!(block_index); + dbg!(self.offsets.len()); + //if block_index == self.offsets.len() { + dbg!(offset..); block_bytes = self.data.slice(offset..); - } else { - let end = self.offsets.entry(block_index) as usize; - block_bytes = self.data.slice(offset..end); - } + //} else { + // let end = self.block_offset(block_index+1); + // dbg!(offset..end); + // block_bytes = self.data.slice(offset..end); + //} block_bytes } @@ -91,8 +102,8 @@ impl SizedDict { } pub fn entry(&self, index: u64) -> SizedDictEntry { - let block = self.block((index / 8) as usize); - block.entry((index % 8) as usize) + let block = self.block(((index - 1) / 8) as usize); + block.entry(((index-1) % 8) as 
usize) } pub fn id(&self, slice: &[u8]) -> IdLookupResult { @@ -116,7 +127,7 @@ impl SizedDict { max = mid - 1; } Ordering::Greater => min = mid + 1, - Ordering::Equal => return IdLookupResult::Found((mid * BLOCK_SIZE) as u64), // what luck! turns out the string we were looking for was the block head + Ordering::Equal => return IdLookupResult::Found((mid * BLOCK_SIZE + 1) as u64), // what luck! turns out the string we were looking for was the block head } } @@ -125,8 +136,9 @@ impl SizedDict { // we found the block the string should be part of. let block = self.block(found); let block_id = block.id(slice); - let offset = (found * BLOCK_SIZE) as u64; - let result = block_id.offset(offset); + let offset = (found * BLOCK_SIZE) as u64 + 1; + let result = block_id.offset(offset).default(offset-1); + /* if found != 0 { // the default value will fill in the last index of the // previous block if the entry was not found in the @@ -136,6 +148,9 @@ impl SizedDict { } else { result } + */ + + result } } @@ -179,7 +194,7 @@ mod tests { let array_bytes = array_buf.freeze(); let data_bytes = data_buf.freeze(); - let dict = SizedDict::from_parts(array_bytes, data_bytes); + let dict = SizedDict::parse(array_bytes, data_bytes, 0); assert_eq!(2, dict.num_blocks()); assert_eq!(b"aaaaaaaa", &dict.block_head(0)[..]); @@ -192,7 +207,7 @@ mod tests { assert_eq!(6, block1.num_entries()); for (ix, s) in strings.into_iter().enumerate() { - assert_eq!(s, &dict.entry(ix as u64).to_bytes()[..]); + assert_eq!(s, &dict.entry((ix+1) as u64).to_bytes()[..]); } } @@ -221,10 +236,10 @@ mod tests { let array_bytes = array_buf.freeze(); let data_bytes = data_buf.freeze(); - let dict = SizedDict::from_parts(array_bytes, data_bytes); + let dict = SizedDict::parse(array_bytes, data_bytes, 0); for (ix, s) in strings.into_iter().enumerate() { - assert_eq!(IdLookupResult::Found(ix as u64), dict.id(s)); + assert_eq!(IdLookupResult::Found((ix+1) as u64), dict.id(s)); } } @@ -253,12 +268,12 @@ mod tests { let array_bytes = array_buf.freeze(); let data_bytes = data_buf.freeze(); - let dict = SizedDict::from_parts(array_bytes, data_bytes); + let dict = SizedDict::parse(array_bytes, data_bytes, 0); assert_eq!(IdLookupResult::NotFound, dict.id(b"a")); - assert_eq!(IdLookupResult::Closest(0), dict.id(b"ab")); - assert_eq!(IdLookupResult::Closest(7), dict.id(b"hallo")); - assert_eq!(IdLookupResult::Closest(8), dict.id(b"hello!")); - assert_eq!(IdLookupResult::Closest(13), dict.id(b"zebra")); + assert_eq!(IdLookupResult::Closest(1), dict.id(b"ab")); + assert_eq!(IdLookupResult::Closest(8), dict.id(b"hallo")); + assert_eq!(IdLookupResult::Closest(9), dict.id(b"hello!")); + assert_eq!(IdLookupResult::Closest(14), dict.id(b"zebra")); } } diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index ab48c678..74aab1f7 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -1,33 +1,171 @@ -use crate::structure::{util::calculate_width, LogArrayBufBuilder, MonotonicLogArray}; +use crate::structure::{util::calculate_width, LogArrayBufBuilder, MonotonicLogArray, tfc::block::BLOCK_SIZE}; use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use itertools::*; +use num_derive::FromPrimitive; +use num_traits::FromPrimitive; use rug::Integer; use std::marker::PhantomData; use super::{ - block::IdLookupResult, + block::{IdLookupResult, SizedDictEntry}, decimal::{decimal_to_storage, storage_to_decimal}, dict::{build_dict_unchecked, build_offset_logarray, SizedDict}, 
integer::{bigint_to_storage, storage_to_bigint}, }; +#[derive(Debug)] pub struct TypedDict { types_present: MonotonicLogArray, - type_offsets: Option, + type_offsets: MonotonicLogArray, + block_offsets: MonotonicLogArray, + type_id_offsets: Vec, data: Bytes, } -/* + impl TypedDict { - pub fn id(&self, slice: &[u8], dt: Datatype) -> IdLookupResult { + pub fn from_parts(types_present: Bytes, type_offsets: Bytes, block_offsets: Bytes, data: Bytes) -> Self { + let types_present = MonotonicLogArray::parse(types_present).unwrap(); + let type_offsets = MonotonicLogArray::parse(type_offsets).unwrap(); + let block_offsets = MonotonicLogArray::parse(block_offsets).unwrap(); + + let mut tally: u64 = 0; + let mut type_id_offsets = Vec::with_capacity(types_present.len()-1); + dbg!(&type_offsets); + for type_offset in type_offsets.iter() { + let last_block_len; + if type_offset == 0 { + last_block_len = data[0]; + } + else { + let last_block_offset_of_previous_type = block_offsets.entry(type_offset as usize - 1); + dbg!(last_block_offset_of_previous_type); + last_block_len = data[last_block_offset_of_previous_type as usize]; + } + let gap = BLOCK_SIZE as u8 - last_block_len; + dbg!(gap); + tally += gap as u64; + dbg!(tally); + type_id_offsets.push((type_offset + 1)*8 - tally); + } + + dbg!(&type_id_offsets); + + Self { + types_present, + type_offsets, + block_offsets, + type_id_offsets, + data, + } + } + + pub fn id(&self, v:&T) -> IdLookupResult { + let (datatype, bytes) = v.make_entry(); + + self.id_slice(datatype, bytes.as_ref()) + } + + pub fn get(&self, id:u64) -> T { + let (datatype, slice) = self.entry(id); + datatype.cast(slice.into_buf()) + } + + fn inner_type_segment(&self, i: usize) -> (SizedDict, u64) { + dbg!(i); + let type_offset; + let block_offset; + let id_offset; + if i == 0 { + type_offset = 0; + block_offset = 0; + id_offset = 0; + } + else { + type_offset = self.type_offsets.entry(i-1) as usize; + id_offset = self.type_id_offsets[type_offset]; + block_offset = self.block_offsets.entry(type_offset as usize) as usize; + } + dbg!(type_offset); + dbg!(block_offset); + + let len; + if i == self.types_present.len()-1 { + eprintln!("last type"); + if i == 0 { + len = self.block_offsets.len() - type_offset; + } + else { + len = self.block_offsets.len() - type_offset - 1; + } + } + else { + let next_offset = self.type_offsets.entry(i) as usize; + if i == 0 { + len = next_offset - type_offset; + } + else { + len = next_offset - type_offset - 1; + } + + } + dbg!(len); + dbg!(self.data.len()); + + let logarray_slice = self.block_offsets.slice(type_offset+1, len); + let data_slice = self.data.slice(block_offset..); + dbg!(data_slice.len()); + + (SizedDict::from_parts(logarray_slice, data_slice, type_offset as u64), id_offset as u64) + } + + pub fn type_segment(&self, dt: Datatype) -> Option<(SizedDict, u64)> { if let Some(i) = self.types_present.index_of(dt as u64) { - let offset = types_offsets[i]; + Some(self.inner_type_segment(i)) + } else { + None + } + } + + pub fn id_slice(&self, dt: Datatype, slice: &[u8]) -> IdLookupResult { + if let Some((dict, offset)) = self.type_segment(dt) { + dbg!(&dict.data); + let result = dict.id(slice) + .offset(offset); + if offset != 0 { + result.default(offset) + } + else { + result + } } else { IdLookupResult::NotFound } } -}*/ + + fn type_index_for_id(&self, id: u64) -> usize { + for (ix, offset) in self.type_id_offsets.iter().enumerate() { + if *offset > (id-1) { + return ix; + } + } + + self.type_id_offsets.len() + } + + fn type_for_type_index(&self, 
type_index: usize) -> Datatype { + FromPrimitive::from_u64(self.types_present.entry(type_index)).unwrap() + } + + pub fn entry(&self, id: u64) -> (Datatype, SizedDictEntry) { + let type_index = self.type_index_for_id(id); + + let (dict, offset) = self.inner_type_segment(type_index); + let dt = self.type_for_type_index(type_index); + (dt, dict.entry(id - offset)) + } +} pub struct TypedDictSegment { dict: SizedDict, @@ -35,8 +173,8 @@ pub struct TypedDictSegment { } impl TypedDictSegment { - pub fn from_parts(offsets: Bytes, data: Bytes) -> Self { - let dict = SizedDict::from_parts(offsets, data); + pub fn parse(offsets: Bytes, data: Bytes, dict_offset: u64) -> Self { + let dict = SizedDict::parse(offsets, data, dict_offset); Self { dict, _x: Default::default(), @@ -54,7 +192,7 @@ impl TypedDictSegment { } } -#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, FromPrimitive)] pub enum Datatype { String = 0, UInt32, @@ -67,12 +205,26 @@ pub enum Datatype { BigInt, } +impl Datatype { + pub fn cast(self, b: B) -> T { + if T::datatype() != self { + panic!("not the right datatype"); + } + + T::from_lexical(b) + } +} + pub trait TdbDataType { fn datatype() -> Datatype; fn to_lexical(&self) -> Bytes; fn from_lexical(b: B) -> Self; + + fn make_entry(&self) -> (Datatype, Bytes) { + (Self::datatype(), self.to_lexical()) + } } impl TdbDataType for String { @@ -93,7 +245,7 @@ impl TdbDataType for String { impl TdbDataType for u32 { fn datatype() -> Datatype { - Datatype::UInt64 + Datatype::UInt32 } fn to_lexical(&self) -> Bytes { @@ -272,34 +424,40 @@ pub fn build_multiple_segments< R: AsRef<[u8]>, I: Iterator, >( - used_types: &mut B1, - type_offsets: &mut B2, - block_offsets: &mut B3, - data: &mut B4, + used_types_buf: &mut B1, + type_offsets_buf: &mut B2, + block_offsets_buf: &mut B3, + data_buf: &mut B4, iter: I, ) { - let mut types: Vec<(Datatype, u64)> = Vec::new(); + let mut types: Vec = Vec::new(); + let mut type_offsets: Vec = Vec::new(); let mut offsets = Vec::with_capacity(iter.size_hint().0); for (key, group) in iter.group_by(|v| v.0).into_iter() { let start_offset = offsets.last().map(|t| *t).unwrap_or(0_u64); let start_type_offset = offsets.len(); - types.push((key, start_type_offset as u64)); - build_dict_unchecked(start_offset, &mut offsets, data, group.map(|v| v.1)); + types.push(key); + type_offsets.push(start_type_offset as u64); + build_dict_unchecked(start_offset, &mut offsets, data_buf, group.map(|v| v.1)); } - offsets.pop(); - build_offset_logarray(block_offsets, offsets); + + build_offset_logarray(block_offsets_buf, offsets); eprintln!("types: {types:?}"); - let largest = types.last().unwrap(); + let largest_type = types.last().unwrap(); + let largest_type_offset = type_offsets.last().unwrap(); - let types_width = calculate_width(largest.0 as u64); - let type_offsets_width = calculate_width(largest.1); + let types_width = calculate_width(*largest_type as u64); + let type_offsets_width = calculate_width(*largest_type_offset); - let mut types_builder = LogArrayBufBuilder::new(used_types, types_width); - let mut type_offsets_builder = LogArrayBufBuilder::new(type_offsets, type_offsets_width); + let mut types_builder = LogArrayBufBuilder::new(used_types_buf, types_width); + let mut type_offsets_builder = LogArrayBufBuilder::new(type_offsets_buf, type_offsets_width); - for (t, o) in types { + for t in types { types_builder.push(t as u64); - type_offsets_builder.push(o); + } + + for o in 
type_offsets.into_iter().skip(1) { + type_offsets_builder.push(o - 1); } types_builder.finalize(); @@ -308,7 +466,7 @@ pub fn build_multiple_segments< #[cfg(test)] mod tests { - use crate::structure::{tfc::dict::build_offset_logarray, LogArray}; + use crate::structure::{tfc::dict::build_offset_logarray}; use super::*; @@ -349,11 +507,11 @@ mod tests { build_segment_and_offsets(&mut offsets, &mut data, strings.clone().into_iter()); - let segment = TypedDictSegment::from_parts(offsets.freeze(), data.freeze()); + let segment = TypedDictSegment::parse(offsets.freeze(), data.freeze(), 0); for (ix, s) in strings.into_iter().enumerate() { - assert_eq!(IdLookupResult::Found(ix as u64), segment.id(&s)); - assert_eq!(s, segment.get(ix as u64)); + assert_eq!(IdLookupResult::Found((ix+1) as u64), segment.id(&s)); + assert_eq!(s, segment.get((ix+1) as u64)); } } @@ -368,11 +526,11 @@ mod tests { build_segment_and_offsets(&mut offsets, &mut data, nums.clone().into_iter()); - let segment = TypedDictSegment::from_parts(offsets.freeze(), data.freeze()); + let segment = TypedDictSegment::parse(offsets.freeze(), data.freeze(), 0); for (ix, s) in nums.into_iter().enumerate() { - assert_eq!(IdLookupResult::Found(ix as u64), segment.id(&s)); - assert_eq!(s, segment.get(ix as u64)); + assert_eq!(IdLookupResult::Found((ix+1) as u64), segment.id(&s)); + assert_eq!(s, segment.get((ix+1) as u64)); } } @@ -468,31 +626,27 @@ mod tests { cycle(Decimal("-983423984793872423423423432312698".to_string())); } - fn make_entry(t: T) -> (Datatype, Bytes) { - (T::datatype(), t.to_lexical()) - } - #[test] fn test_multi_segment() { let mut vec: Vec<(Datatype, Bytes)> = vec![ - make_entry(Decimal("-1".to_string())), - make_entry("asdf".to_string()), - make_entry(Decimal("-12342343.2348973".to_string())), - make_entry("Batty".to_string()), - make_entry("Batman".to_string()), - make_entry(-3_i64), - make_entry(Decimal("2348973".to_string())), - make_entry(4.389832_f32), - make_entry("apple".to_string()), - make_entry(23434.389832_f32), - make_entry("apply".to_string()), - make_entry(-500_i32), - make_entry(20_u32), + Decimal("-1".to_string()).make_entry(), + "asdf".to_string().make_entry(), + Decimal("-12342343.2348973".to_string()).make_entry(), + "Batty".to_string().make_entry(), + "Batman".to_string().make_entry(), + (-3_i64).make_entry(), + Decimal("2348973".to_string()).make_entry(), + 4.389832_f32.make_entry(), + "apple".to_string().make_entry(), + 23434.389832_f32.make_entry(), + "apply".to_string().make_entry(), + (-500_i32).make_entry(), + 20_u32.make_entry(), ]; vec.sort(); - let mut used_types = Vec::new(); - let mut type_offsets = Vec::new(); - let mut block_offsets = Vec::new(); + let mut used_types = BytesMut::new(); + let mut type_offsets = BytesMut::new(); + let mut block_offsets = BytesMut::new(); let mut data = BytesMut::new(); build_multiple_segments( &mut used_types, @@ -506,15 +660,19 @@ mod tests { eprintln!("block_offsets : {block_offsets:?}"); eprintln!("data : {data:?}"); - let used_types_vec: Vec = LogArray::parse(Bytes::from(used_types)) - .unwrap() - .iter() - .collect(); + let dict = TypedDict::from_parts(used_types.freeze(), type_offsets.freeze(), block_offsets.freeze(), data.freeze()); + eprintln!("{dict:?}"); + + let id = dict.id(&"Batty".to_string()); + assert_eq!(IdLookupResult::Found(2), id); + assert_eq!(IdLookupResult::Found(6), dict.id(&20_u32)); + assert_eq!(IdLookupResult::Found(7), dict.id(&(-500_i32))); - let expected_types_vec: Vec = vec.iter().map(|x| x.0 as u64).dedup().collect(); - 
assert_eq!(used_types_vec, expected_types_vec); + for i in 1..vec.len()+1 { + eprintln!("!!!!!!!!!!!! {i} {:?}", dict.entry(i as u64)); + } - eprintln!("expected_types_vec: {expected_types_vec:?}"); + assert_eq!(Decimal("-12342343.2348973".to_string()), dict.get(11)); panic!(); } From 593d227ce5181437c54b492669670680c0115c4b Mon Sep 17 00:00:00 2001 From: Gavin Mendel-Gleason Date: Sat, 26 Nov 2022 23:05:56 +0100 Subject: [PATCH 23/99] Two significant changes and some formatting. typed.rs L93: don't use type_offset but index. typed.rs L123 use block_offsets in from_parts. --- src/structure/tfc/dict.rs | 18 +++-- src/structure/tfc/typed.rs | 151 +++++++++++++++++++++++++++---------- 2 files changed, 124 insertions(+), 45 deletions(-) diff --git a/src/structure/tfc/dict.rs b/src/structure/tfc/dict.rs index 7a85cddc..8283bfc4 100644 --- a/src/structure/tfc/dict.rs +++ b/src/structure/tfc/dict.rs @@ -49,7 +49,11 @@ impl SizedDict { } pub fn from_parts(offsets: MonotonicLogArray, data: Bytes, dict_offset: u64) -> Self { - Self { offsets, data, dict_offset } + Self { + offsets, + data, + dict_offset, + } } fn block_offset(&self, block_index: usize) -> usize { @@ -70,8 +74,8 @@ impl SizedDict { dbg!(block_index); dbg!(self.offsets.len()); //if block_index == self.offsets.len() { - dbg!(offset..); - block_bytes = self.data.slice(offset..); + dbg!(offset..); + block_bytes = dbg!(self.data.slice(offset..)); //} else { // let end = self.block_offset(block_index+1); // dbg!(offset..end); @@ -103,7 +107,7 @@ impl SizedDict { pub fn entry(&self, index: u64) -> SizedDictEntry { let block = self.block(((index - 1) / 8) as usize); - block.entry(((index-1) % 8) as usize) + block.entry(((index - 1) % 8) as usize) } pub fn id(&self, slice: &[u8]) -> IdLookupResult { @@ -137,7 +141,7 @@ impl SizedDict { let block = self.block(found); let block_id = block.id(slice); let offset = (found * BLOCK_SIZE) as u64 + 1; - let result = block_id.offset(offset).default(offset-1); + let result = block_id.offset(offset).default(offset - 1); /* if found != 0 { // the default value will fill in the last index of the @@ -207,7 +211,7 @@ mod tests { assert_eq!(6, block1.num_entries()); for (ix, s) in strings.into_iter().enumerate() { - assert_eq!(s, &dict.entry((ix+1) as u64).to_bytes()[..]); + assert_eq!(s, &dict.entry((ix + 1) as u64).to_bytes()[..]); } } @@ -239,7 +243,7 @@ mod tests { let dict = SizedDict::parse(array_bytes, data_bytes, 0); for (ix, s) in strings.into_iter().enumerate() { - assert_eq!(IdLookupResult::Found((ix+1) as u64), dict.id(s)); + assert_eq!(IdLookupResult::Found((ix + 1) as u64), dict.id(s)); } } diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index 74aab1f7..75e82104 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -1,4 +1,6 @@ -use crate::structure::{util::calculate_width, LogArrayBufBuilder, MonotonicLogArray, tfc::block::BLOCK_SIZE}; +use crate::structure::{ + tfc::block::BLOCK_SIZE, util::calculate_width, LogArrayBufBuilder, MonotonicLogArray, +}; use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use itertools::*; @@ -24,21 +26,26 @@ pub struct TypedDict { } impl TypedDict { - pub fn from_parts(types_present: Bytes, type_offsets: Bytes, block_offsets: Bytes, data: Bytes) -> Self { + pub fn from_parts( + types_present: Bytes, + type_offsets: Bytes, + block_offsets: Bytes, + data: Bytes, + ) -> Self { let types_present = MonotonicLogArray::parse(types_present).unwrap(); let type_offsets = 
MonotonicLogArray::parse(type_offsets).unwrap(); let block_offsets = MonotonicLogArray::parse(block_offsets).unwrap(); let mut tally: u64 = 0; - let mut type_id_offsets = Vec::with_capacity(types_present.len()-1); + let mut type_id_offsets = Vec::with_capacity(types_present.len() - 1); dbg!(&type_offsets); for type_offset in type_offsets.iter() { let last_block_len; if type_offset == 0 { last_block_len = data[0]; - } - else { - let last_block_offset_of_previous_type = block_offsets.entry(type_offset as usize - 1); + } else { + let last_block_offset_of_previous_type = + block_offsets.entry(type_offset as usize - 1); dbg!(last_block_offset_of_previous_type); last_block_len = data[last_block_offset_of_previous_type as usize]; } @@ -46,7 +53,7 @@ impl TypedDict { dbg!(gap); tally += gap as u64; dbg!(tally); - type_id_offsets.push((type_offset + 1)*8 - tally); + type_id_offsets.push((type_offset + 1) * 8 - tally); } dbg!(&type_id_offsets); @@ -60,13 +67,14 @@ impl TypedDict { } } - pub fn id(&self, v:&T) -> IdLookupResult { + pub fn id(&self, v: &T) -> IdLookupResult { let (datatype, bytes) = v.make_entry(); self.id_slice(datatype, bytes.as_ref()) } - pub fn get(&self, id:u64) -> T { + pub fn get(&self, id: u64) -> T { + eprintln!("id: {id}"); let (datatype, slice) = self.entry(id); datatype.cast(slice.into_buf()) } @@ -80,43 +88,41 @@ impl TypedDict { type_offset = 0; block_offset = 0; id_offset = 0; - } - else { - type_offset = self.type_offsets.entry(i-1) as usize; - id_offset = self.type_id_offsets[type_offset]; + } else { + type_offset = self.type_offsets.entry(i - 1) as usize; + id_offset = dbg!(self.type_id_offsets[i - 1]); block_offset = self.block_offsets.entry(type_offset as usize) as usize; } dbg!(type_offset); dbg!(block_offset); let len; - if i == self.types_present.len()-1 { + if i == self.types_present.len() - 1 { eprintln!("last type"); if i == 0 { len = self.block_offsets.len() - type_offset; - } - else { + } else { len = self.block_offsets.len() - type_offset - 1; } - } - else { + } else { let next_offset = self.type_offsets.entry(i) as usize; if i == 0 { len = next_offset - type_offset; - } - else { + } else { len = next_offset - type_offset - 1; } - } dbg!(len); dbg!(self.data.len()); - let logarray_slice = self.block_offsets.slice(type_offset+1, len); + let logarray_slice = self.block_offsets.slice(type_offset + 1, len); let data_slice = self.data.slice(block_offset..); dbg!(data_slice.len()); - (SizedDict::from_parts(logarray_slice, data_slice, type_offset as u64), id_offset as u64) + ( + SizedDict::from_parts(logarray_slice, data_slice, block_offset as u64), + id_offset as u64, + ) } pub fn type_segment(&self, dt: Datatype) -> Option<(SizedDict, u64)> { @@ -130,13 +136,11 @@ impl TypedDict { pub fn id_slice(&self, dt: Datatype, slice: &[u8]) -> IdLookupResult { if let Some((dict, offset)) = self.type_segment(dt) { dbg!(&dict.data); - let result = dict.id(slice) - .offset(offset); + let result = dict.id(slice).offset(offset); if offset != 0 { result.default(offset) - } - else { + } else { result } } else { @@ -146,7 +150,7 @@ impl TypedDict { fn type_index_for_id(&self, id: u64) -> usize { for (ix, offset) in self.type_id_offsets.iter().enumerate() { - if *offset > (id-1) { + if *offset > (id - 1) { return ix; } } @@ -159,10 +163,11 @@ impl TypedDict { } pub fn entry(&self, id: u64) -> (Datatype, SizedDictEntry) { + eprintln!("entry(id): {id}"); let type_index = self.type_index_for_id(id); - let (dict, offset) = self.inner_type_segment(type_index); - let dt = 
self.type_for_type_index(type_index); + let (dict, offset) = dbg!(self.inner_type_segment(type_index)); + let dt = dbg!(self.type_for_type_index(type_index)); (dt, dict.entry(id - offset)) } } @@ -206,7 +211,7 @@ pub enum Datatype { } impl Datatype { - pub fn cast(self, b: B) -> T { + pub fn cast(self, b: B) -> T { if T::datatype() != self { panic!("not the right datatype"); } @@ -466,7 +471,7 @@ pub fn build_multiple_segments< #[cfg(test)] mod tests { - use crate::structure::{tfc::dict::build_offset_logarray}; + use crate::structure::tfc::dict::build_offset_logarray; use super::*; @@ -510,8 +515,8 @@ mod tests { let segment = TypedDictSegment::parse(offsets.freeze(), data.freeze(), 0); for (ix, s) in strings.into_iter().enumerate() { - assert_eq!(IdLookupResult::Found((ix+1) as u64), segment.id(&s)); - assert_eq!(s, segment.get((ix+1) as u64)); + assert_eq!(IdLookupResult::Found((ix + 1) as u64), segment.id(&s)); + assert_eq!(s, segment.get((ix + 1) as u64)); } } @@ -529,8 +534,8 @@ mod tests { let segment = TypedDictSegment::parse(offsets.freeze(), data.freeze(), 0); for (ix, s) in nums.into_iter().enumerate() { - assert_eq!(IdLookupResult::Found((ix+1) as u64), segment.id(&s)); - assert_eq!(s, segment.get((ix+1) as u64)); + assert_eq!(IdLookupResult::Found((ix + 1) as u64), segment.id(&s)); + assert_eq!(s, segment.get((ix + 1) as u64)); } } @@ -660,7 +665,12 @@ mod tests { eprintln!("block_offsets : {block_offsets:?}"); eprintln!("data : {data:?}"); - let dict = TypedDict::from_parts(used_types.freeze(), type_offsets.freeze(), block_offsets.freeze(), data.freeze()); + let dict = TypedDict::from_parts( + used_types.freeze(), + type_offsets.freeze(), + block_offsets.freeze(), + data.freeze(), + ); eprintln!("{dict:?}"); let id = dict.id(&"Batty".to_string()); @@ -668,7 +678,7 @@ mod tests { assert_eq!(IdLookupResult::Found(6), dict.id(&20_u32)); assert_eq!(IdLookupResult::Found(7), dict.id(&(-500_i32))); - for i in 1..vec.len()+1 { + for i in 1..vec.len() + 1 { eprintln!("!!!!!!!!!!!! 
{i} {:?}", dict.entry(i as u64)); } @@ -676,4 +686,69 @@ mod tests { panic!(); } + + #[test] + fn test_full_blocks() { + let mut vec: Vec<(Datatype, Bytes)> = vec![ + "fdsa".to_string().make_entry(), + "a".to_string().make_entry(), + "bc".to_string().make_entry(), + "bcd".to_string().make_entry(), + "z".to_string().make_entry(), + "Batty".to_string().make_entry(), + "Batman".to_string().make_entry(), + "apple".to_string().make_entry(), + (-500_i32).make_entry(), + 20_u32.make_entry(), + 22_u32.make_entry(), + 23_u32.make_entry(), + 24_u32.make_entry(), + 25_u32.make_entry(), + 26_u32.make_entry(), + 27_u32.make_entry(), + 28_u32.make_entry(), + 3000_u32.make_entry(), + (-3_i64).make_entry(), + Decimal("-12342343.2348973".to_string()).make_entry(), + Decimal("234.8973".to_string()).make_entry(), + Decimal("0.2348973".to_string()).make_entry(), + Decimal("23423423.8973".to_string()).make_entry(), + Decimal("3.3".to_string()).make_entry(), + Decimal("0.001".to_string()).make_entry(), + Decimal("-0.001".to_string()).make_entry(), + Decimal("2".to_string()).make_entry(), + Decimal("0".to_string()).make_entry(), + 4.389832_f32.make_entry(), + 23434.389832_f32.make_entry(), + ]; + vec.sort(); + let mut used_types = BytesMut::new(); + let mut type_offsets = BytesMut::new(); + let mut block_offsets = BytesMut::new(); + let mut data = BytesMut::new(); + build_multiple_segments( + &mut used_types, + &mut type_offsets, + &mut block_offsets, + &mut data, + vec.clone().into_iter(), + ); + + let dict = TypedDict::from_parts( + used_types.freeze(), + type_offsets.freeze(), + block_offsets.freeze(), + data.freeze(), + ); + eprintln!("{dict:?}"); + + for i in 1..vec.len() + 1 { + eprintln!("!!!!!!!!!!!! {i} {:?}", dict.entry(i as u64)); + } + + assert_eq!("Batman".to_string(), dict.get::(1)); + assert_eq!("fdsa".to_string(), dict.get::(7)); + assert_eq!(26_u32, dict.get::(14)); + assert_eq!(Decimal("234.8973".to_string()), dict.get(29)); + } } From 4ab91b38f489c88742caf286af43bb3287def117 Mon Sep 17 00:00:00 2001 From: Gavin Mendel-Gleason Date: Sat, 26 Nov 2022 23:35:45 +0100 Subject: [PATCH 24/99] Remove dbgs and add better tests --- src/structure/tfc/block.rs | 2 +- src/structure/tfc/dict.rs | 12 +--------- src/structure/tfc/typed.rs | 47 ++++++++++++++++++-------------------- 3 files changed, 24 insertions(+), 37 deletions(-) diff --git a/src/structure/tfc/block.rs b/src/structure/tfc/block.rs index 7335091a..caba2706 100644 --- a/src/structure/tfc/block.rs +++ b/src/structure/tfc/block.rs @@ -65,7 +65,7 @@ impl SizedBlockHeader { } #[derive(Clone, Debug)] -pub struct SizedDictEntry(Vec); +pub struct SizedDictEntry(pub Vec); impl SizedDictEntry { pub fn new(parts: Vec) -> Self { diff --git a/src/structure/tfc/dict.rs b/src/structure/tfc/dict.rs index 8283bfc4..78626bff 100644 --- a/src/structure/tfc/dict.rs +++ b/src/structure/tfc/dict.rs @@ -68,19 +68,9 @@ impl SizedDict { } pub fn block_bytes(&self, block_index: usize) -> Bytes { - dbg!(block_index); let offset = self.block_offset(block_index); let block_bytes; - dbg!(block_index); - dbg!(self.offsets.len()); - //if block_index == self.offsets.len() { - dbg!(offset..); - block_bytes = dbg!(self.data.slice(offset..)); - //} else { - // let end = self.block_offset(block_index+1); - // dbg!(offset..end); - // block_bytes = self.data.slice(offset..end); - //} + block_bytes = self.data.slice(offset..); block_bytes } diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index 75e82104..1303c5e4 100644 --- a/src/structure/tfc/typed.rs +++ 
b/src/structure/tfc/typed.rs @@ -38,7 +38,6 @@ impl TypedDict { let mut tally: u64 = 0; let mut type_id_offsets = Vec::with_capacity(types_present.len() - 1); - dbg!(&type_offsets); for type_offset in type_offsets.iter() { let last_block_len; if type_offset == 0 { @@ -46,18 +45,13 @@ impl TypedDict { } else { let last_block_offset_of_previous_type = block_offsets.entry(type_offset as usize - 1); - dbg!(last_block_offset_of_previous_type); last_block_len = data[last_block_offset_of_previous_type as usize]; } let gap = BLOCK_SIZE as u8 - last_block_len; - dbg!(gap); tally += gap as u64; - dbg!(tally); type_id_offsets.push((type_offset + 1) * 8 - tally); } - dbg!(&type_id_offsets); - Self { types_present, type_offsets, @@ -74,13 +68,11 @@ impl TypedDict { } pub fn get(&self, id: u64) -> T { - eprintln!("id: {id}"); let (datatype, slice) = self.entry(id); datatype.cast(slice.into_buf()) } fn inner_type_segment(&self, i: usize) -> (SizedDict, u64) { - dbg!(i); let type_offset; let block_offset; let id_offset; @@ -90,15 +82,12 @@ impl TypedDict { id_offset = 0; } else { type_offset = self.type_offsets.entry(i - 1) as usize; - id_offset = dbg!(self.type_id_offsets[i - 1]); + id_offset = self.type_id_offsets[i - 1]; block_offset = self.block_offsets.entry(type_offset as usize) as usize; } - dbg!(type_offset); - dbg!(block_offset); let len; if i == self.types_present.len() - 1 { - eprintln!("last type"); if i == 0 { len = self.block_offsets.len() - type_offset; } else { @@ -112,12 +101,9 @@ impl TypedDict { len = next_offset - type_offset - 1; } } - dbg!(len); - dbg!(self.data.len()); let logarray_slice = self.block_offsets.slice(type_offset + 1, len); let data_slice = self.data.slice(block_offset..); - dbg!(data_slice.len()); ( SizedDict::from_parts(logarray_slice, data_slice, block_offset as u64), @@ -135,7 +121,6 @@ impl TypedDict { pub fn id_slice(&self, dt: Datatype, slice: &[u8]) -> IdLookupResult { if let Some((dict, offset)) = self.type_segment(dt) { - dbg!(&dict.data); let result = dict.id(slice).offset(offset); if offset != 0 { @@ -163,11 +148,10 @@ impl TypedDict { } pub fn entry(&self, id: u64) -> (Datatype, SizedDictEntry) { - eprintln!("entry(id): {id}"); let type_index = self.type_index_for_id(id); - let (dict, offset) = dbg!(self.inner_type_segment(type_index)); - let dt = dbg!(self.type_for_type_index(type_index)); + let (dict, offset) = self.inner_type_segment(type_index); + let dt = self.type_for_type_index(type_index); (dt, dict.entry(id - offset)) } } @@ -671,7 +655,6 @@ mod tests { block_offsets.freeze(), data.freeze(), ); - eprintln!("{dict:?}"); let id = dict.id(&"Batty".to_string()); assert_eq!(IdLookupResult::Found(2), id); @@ -679,12 +662,11 @@ mod tests { assert_eq!(IdLookupResult::Found(7), dict.id(&(-500_i32))); for i in 1..vec.len() + 1 { - eprintln!("!!!!!!!!!!!! {i} {:?}", dict.entry(i as u64)); + let (t, s) = dict.entry(i as u64); + assert_eq!(vec[i - 1], (t, s.0.into_iter().flatten().collect())); } assert_eq!(Decimal("-12342343.2348973".to_string()), dict.get(11)); - - panic!(); } #[test] @@ -740,15 +722,30 @@ mod tests { block_offsets.freeze(), data.freeze(), ); - eprintln!("{dict:?}"); for i in 1..vec.len() + 1 { - eprintln!("!!!!!!!!!!!! 
{i} {:?}", dict.entry(i as u64)); + let (t, s) = dict.entry(i as u64); + assert_eq!(vec[i - 1], (t, s.0.into_iter().flatten().collect())); } assert_eq!("Batman".to_string(), dict.get::(1)); assert_eq!("fdsa".to_string(), dict.get::(7)); assert_eq!(26_u32, dict.get::(14)); assert_eq!(Decimal("234.8973".to_string()), dict.get(29)); + + assert_eq!(IdLookupResult::NotFound, dict.id(&"AAAA".to_string())); + assert_eq!(IdLookupResult::Closest(2), dict.id(&"Baz".to_string())); + assert_eq!(IdLookupResult::Found(17), dict.id(&3000_u32)); + assert_eq!( + IdLookupResult::Found(23), + dict.id(&Decimal("-0.001".to_string())) + ); + assert_eq!( + IdLookupResult::Closest(23), + dict.id(&Decimal("-0.0001".to_string())) + ); + assert_eq!(IdLookupResult::Found(16), dict.id(&28_u32)); + assert_eq!(IdLookupResult::Closest(16), dict.id(&29_u32)); + assert_eq!(IdLookupResult::Closest(17), dict.id(&3001_u32)); } } From d2959e6dae6a2c2d0f8674daa4bedc64181151bb Mon Sep 17 00:00:00 2001 From: Gavin Mendel-Gleason Date: Sat, 26 Nov 2022 23:43:55 +0100 Subject: [PATCH 25/99] Fix bigint naming, add tests --- src/structure/tfc/typed.rs | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index 1303c5e4..2228a380 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -366,7 +366,7 @@ impl TdbDataType for f64 { impl TdbDataType for Integer { fn datatype() -> Datatype { - Datatype::Float64 + Datatype::BigInt } fn to_lexical(&self) -> Bytes { @@ -644,10 +644,6 @@ mod tests { &mut data, vec.clone().into_iter(), ); - eprintln!("used_types : {used_types:?}"); - eprintln!("type_offsets : {type_offsets:?}"); - eprintln!("block_offsets : {block_offsets:?}"); - eprintln!("data : {data:?}"); let dict = TypedDict::from_parts( used_types.freeze(), @@ -702,6 +698,7 @@ mod tests { Decimal("0".to_string()).make_entry(), 4.389832_f32.make_entry(), 23434.389832_f32.make_entry(), + int("239487329872343987").make_entry(), ]; vec.sort(); let mut used_types = BytesMut::new(); @@ -735,7 +732,9 @@ mod tests { assert_eq!(IdLookupResult::NotFound, dict.id(&"AAAA".to_string())); assert_eq!(IdLookupResult::Closest(2), dict.id(&"Baz".to_string())); + assert_eq!(IdLookupResult::Found(17), dict.id(&3000_u32)); + assert_eq!( IdLookupResult::Found(23), dict.id(&Decimal("-0.001".to_string())) @@ -744,8 +743,21 @@ mod tests { IdLookupResult::Closest(23), dict.id(&Decimal("-0.0001".to_string())) ); + assert_eq!(IdLookupResult::Found(16), dict.id(&28_u32)); assert_eq!(IdLookupResult::Closest(16), dict.id(&29_u32)); assert_eq!(IdLookupResult::Closest(17), dict.id(&3001_u32)); + + assert_eq!(IdLookupResult::Closest(17), dict.id(&3001_u32)); + + assert_eq!(IdLookupResult::Closest(30), dict.id(&int("0"))); + assert_eq!( + IdLookupResult::Found(31), + dict.id(&int("239487329872343987")) + ); + assert_eq!( + IdLookupResult::Closest(31), + dict.id(&int("99999999999999999999999999")) + ); } } From 8dfa62b91b91afec4dcd11bfd435f658a3a0f982 Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Mon, 28 Nov 2022 14:05:05 +0100 Subject: [PATCH 26/99] make versions of the bitindex generator that work with bufs --- src/structure/bitarray.rs | 56 ++++++++++++++++++++++++++------------- src/structure/bitindex.rs | 48 +++++++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+), 18 deletions(-) diff --git a/src/structure/bitarray.rs b/src/structure/bitarray.rs index a46cdfdd..43ea1dba 100644 --- a/src/structure/bitarray.rs +++ 
b/src/structure/bitarray.rs @@ -34,7 +34,7 @@ use super::util; use crate::storage::*; use crate::structure::bititer::BitIter; use byteorder::{BigEndian, ByteOrder}; -use bytes::{Bytes, BytesMut}; +use bytes::{Buf, Bytes, BytesMut}; use futures::io; use futures::stream::{Stream, StreamExt, TryStreamExt}; use std::{convert::TryFrom, error, fmt}; @@ -291,24 +291,25 @@ impl Decoder for BitArrayBlockDecoder { /// Decode the next block of the bit array. fn decode(&mut self, bytes: &mut BytesMut) -> Result, io::Error> { - // If there isn't a full word available in the buffer, stop. - if bytes.len() < 8 { - return Ok(None); - } + Ok(decode_next_bitarray_block(bytes, &mut self.readahead)) + } +} - // Read the next word. If `self.readahead` was `Some`, return that value; otherwise, - // recurse to read a second word and then return the first word. - // - // This trick means that we don't return the last word in the buffer, which is the control - // word. The consequence is that we read an extra word at the beginning of the decoding - // process. - match self - .readahead - .replace(BigEndian::read_u64(&bytes.split_to(8))) - { - Some(word) => Ok(Some(word)), - None => self.decode(bytes), - } +fn decode_next_bitarray_block(bytes: &mut B, readahead: &mut Option) -> Option { + // If there isn't a full word available in the buffer, stop. + if bytes.remaining() < 8 { + return None; + } + + // Read the next word. If `readahead` was `Some`, return that value; otherwise, + // recurse to read a second word and then return the first word. + // + // This trick means that we don't return the last word in the buffer, which is the control + // word. The consequence is that we read an extra word at the beginning of the decoding + // process. + match readahead.replace(bytes.get_u64()) { + Some(word) => Some(word), + None => decode_next_bitarray_block(bytes, readahead), } } @@ -316,6 +317,25 @@ pub fn bitarray_stream_blocks(r: R) -> FramedRead(b: &mut B) -> BitArrayBlockIterator { + BitArrayBlockIterator { + buf: b, + readahead: None, + } +} + +pub struct BitArrayBlockIterator<'a, B: Buf> { + buf: &'a mut B, + readahead: Option, +} + +impl<'a, B: Buf> Iterator for BitArrayBlockIterator<'a, B> { + type Item = u64; + fn next(&mut self) -> Option { + decode_next_bitarray_block(self.buf, &mut self.readahead) + } +} + /// Read the length (number of bits) from a `FileLoad`. pub(crate) async fn bitarray_len_from_file(f: F) -> io::Result { BitArrayError::validate_input_buf_size(f.size().await?)?; diff --git a/src/structure/bitindex.rs b/src/structure/bitindex.rs index 097d36ab..76b4c117 100644 --- a/src/structure/bitindex.rs +++ b/src/structure/bitindex.rs @@ -1,6 +1,9 @@ //! Logic for building and using an index over a bitarray which provides rank and select. 
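The `decode_next_bitarray_block` function above captures the codec's readahead trick in a `Buf`-generic form: hold one word back, and only emit it once a successor has been read, so the trailing control word of the bitarray is never yielded as data. A minimal self-contained sketch of the same pattern, with illustrative names that are not part of this crate's API:

    use bytes::Buf;

    /// Yield every big-endian u64 in `buf` except the last one, by keeping
    /// one word of readahead and only emitting a word once its successor
    /// has been read.
    fn words_except_last<B: Buf>(buf: &mut B) -> Vec<u64> {
        let mut readahead: Option<u64> = None;
        let mut words = Vec::new();
        while buf.remaining() >= 8 {
            // `replace` hands back the previously buffered word, if any.
            if let Some(prev) = readahead.replace(buf.get_u64()) {
                words.push(prev);
            }
        }
        // `readahead` still holds the final (control) word; it is dropped here.
        words
    }

    fn main() {
        let data: Vec<u8> = [1u64, 2, 3].iter().flat_map(|w| w.to_be_bytes()).collect();
        assert_eq!(words_except_last(&mut &data[..]), vec![1, 2]);
    }
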
use byteorder::{BigEndian, ByteOrder}; +use bytes::Buf; +use bytes::BufMut; use bytes::Bytes; +use itertools::Itertools; use super::bitarray::*; use super::logarray::*; @@ -434,6 +437,51 @@ pub async fn build_bitindex< Ok(()) } +pub fn build_bitindex_from_block_iter<'a, I: Iterator, B1: BufMut, B2: BufMut>( + blocks_iter: &'a mut I, + blocks: &mut B1, + sblocks: &mut B2, +) { + // the following widths are unoptimized, but should always be large enough + let mut blocks_builder = + LogArrayBufBuilder::new(blocks, 64 - (SBLOCK_SIZE * 64).leading_zeros() as u8); + let mut sblocks_builder = LogArrayBufBuilder::new(sblocks, 64); + + // we chunk block_stream into blocks of SBLOCK size for further processing + let mut sblock_rank = 0; + let chunks = blocks_iter.chunks(SBLOCK_SIZE); + let mut iter = chunks.into_iter(); + while let Some(chunk) = iter.next() { + let chunk: Vec<_> = chunk.collect(); + let mut block_ranks = Vec::with_capacity(chunk.len()); + for num in chunk { + block_ranks.push(num.count_ones() as u64); + } + + let mut sblock_subrank = block_ranks.iter().sum(); + sblock_rank += sblock_subrank; + + for block_rank in block_ranks { + blocks_builder.push(sblock_subrank); + sblock_subrank -= block_rank; + } + + sblocks_builder.push(sblock_rank); + } + + blocks_builder.finalize(); + sblocks_builder.finalize(); +} + +pub fn build_bitindex_from_buf( + bitarray: &mut B1, + blocks: &mut B2, + sblocks: &mut B3, +) { + let mut iter = bitarray_iter_blocks(bitarray); + build_bitindex_from_block_iter(&mut iter, blocks, sblocks) +} + #[cfg(test)] mod tests { use super::*; From 4e61d03965670b40c4c1a289b7f50e8f1b6b7912 Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Mon, 28 Nov 2022 14:33:44 +0100 Subject: [PATCH 27/99] precalculate typed dict len --- src/structure/logarray.rs | 2 +- src/structure/tfc/typed.rs | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/structure/logarray.rs b/src/structure/logarray.rs index bb0d99c5..895999fa 100644 --- a/src/structure/logarray.rs +++ b/src/structure/logarray.rs @@ -716,7 +716,7 @@ impl MonotonicLogArray { MonotonicLogArray(logarray) } - pub fn parse(bytes: Bytes) -> Result { + pub fn parse(bytes: Bytes) -> Result { let logarray = LogArray::parse(bytes)?; Ok(Self::from_logarray(logarray)) diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index 2228a380..b05f31e0 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -22,6 +22,7 @@ pub struct TypedDict { type_offsets: MonotonicLogArray, block_offsets: MonotonicLogArray, type_id_offsets: Vec, + num_entries: usize, data: Bytes, } @@ -52,11 +53,16 @@ impl TypedDict { type_id_offsets.push((type_offset + 1) * 8 - tally); } + let last_gap = + BLOCK_SIZE - data[block_offsets.entry(block_offsets.len() - 1) as usize] as usize; + let num_entries = (block_offsets.len() + 1) * BLOCK_SIZE - tally as usize - last_gap; + Self { types_present, type_offsets, block_offsets, type_id_offsets, + num_entries, data, } } @@ -154,6 +160,10 @@ impl TypedDict { let dt = self.type_for_type_index(type_index); (dt, dict.entry(id - offset)) } + + pub fn num_entries(&self) -> usize { + self.num_entries + } } pub struct TypedDictSegment { @@ -652,6 +662,8 @@ mod tests { data.freeze(), ); + assert_eq!(13, dict.num_entries()); + let id = dict.id(&"Batty".to_string()); assert_eq!(IdLookupResult::Found(2), id); assert_eq!(IdLookupResult::Found(6), dict.id(&20_u32)); @@ -720,6 +732,8 @@ mod tests { data.freeze(), ); + assert_eq!(31, dict.num_entries()); + for i in 
1..vec.len() + 1 { let (t, s) = dict.entry(i as u64); assert_eq!(vec[i - 1], (t, s.0.into_iter().flatten().collect())); From 75675ebf456c1088dd71232f5f5a946f4e4c2020 Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Mon, 28 Nov 2022 15:26:41 +0100 Subject: [PATCH 28/99] block and dict iterators --- src/structure/tfc/block.rs | 93 +++++++++++++++++++++++++++++++++++++- 1 file changed, 92 insertions(+), 1 deletion(-) diff --git a/src/structure/tfc/block.rs b/src/structure/tfc/block.rs index caba2706..184d7ed1 100644 --- a/src/structure/tfc/block.rs +++ b/src/structure/tfc/block.rs @@ -16,7 +16,7 @@ pub enum SizedDictError { NotEnoughData, } -#[derive(Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq)] pub struct SizedBlockHeader { head: Bytes, num_entries: u8, @@ -513,6 +513,56 @@ impl SizedDictBlock { IdLookupResult::Closest(self.header.num_entries as u64 - 1) } + + pub fn iter<'a>(&'a self) -> SizedDictBlockIterator<'a> { + SizedDictBlockIterator { + header: &self.header, + data: self.data.clone(), + ix: 0, + last: None, + } + } +} + +pub struct SizedDictBlockIterator<'a> { + header: &'a SizedBlockHeader, + data: Bytes, + ix: usize, + last: Option>, +} + +impl<'a> Iterator for SizedDictBlockIterator<'a> { + type Item = SizedDictEntry; + + fn next(&mut self) -> Option { + if let Some(last) = self.last.as_mut() { + if self.ix >= self.header.num_entries as usize - 1 { + return None; + } + let size = self.header.sizes[self.ix]; + let mut shared = self.header.shareds[self.ix]; + for rope_index in 0..last.len() { + let x = &mut last[rope_index]; + if x.len() < shared { + shared -= x.len(); + continue; + } + + x.truncate(shared); + last.truncate(rope_index + 1); + break; + } + + last.push(self.data.split_to(size)); + self.ix += 1; + + Some(SizedDictEntry::new(last.clone())) + } else { + let result = vec![self.header.head.clone()]; + self.last = Some(result.clone()); + Some(SizedDictEntry::new(result)) + } + } } #[derive(Clone, Copy, Debug, PartialEq, Eq)] @@ -796,4 +846,45 @@ mod tests { assert_eq!(IdLookupResult::Closest(7), block.id(b"f")); } + + #[test] + fn enumerate_complete_block() { + let strings: [&[u8]; 8] = [ + b"aaaaaa", + b"aabb", + b"cccc", + b"cdef", + b"cdff", + b"cdffasdf", + b"cdffeeee", + b"ceeeeeeeeeeeeeee", + ]; + let block = build_block(&strings); + + let result: Vec = block.iter().map(|e| e.to_bytes()).collect(); + assert_eq!( + strings + .iter() + .cloned() + .map(Bytes::from_static) + .collect::>(), + result + ); + } + + #[test] + fn enumerate_incomplete_block() { + let strings: [&[u8]; 6] = [b"aaaaaa", b"aabb", b"cccc", b"cdef", b"cdff", b"cdffasdf"]; + let block = build_block(&strings); + + let result: Vec = block.iter().map(|e| e.to_bytes()).collect(); + assert_eq!( + strings + .iter() + .cloned() + .map(Bytes::from_static) + .collect::>(), + result + ); + } } From b342ae8a15a990bfd17542a325497d4697e99103 Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Mon, 28 Nov 2022 16:19:10 +0100 Subject: [PATCH 29/99] full iterator over the entire typed dict --- src/structure/tfc/block.rs | 25 +++++-- src/structure/tfc/dict.rs | 82 ++++++++++++++++++++++- src/structure/tfc/typed.rs | 129 ++++++++++++++++++++++++++++++++++++- 3 files changed, 225 insertions(+), 11 deletions(-) diff --git a/src/structure/tfc/block.rs b/src/structure/tfc/block.rs index 184d7ed1..0d25e8c1 100644 --- a/src/structure/tfc/block.rs +++ b/src/structure/tfc/block.rs @@ -1,3 +1,4 @@ +use std::borrow::Cow; use std::cmp::Ordering; use std::hash::{Hash, Hasher}; @@ -514,9 +515,18 @@ impl 
SizedDictBlock { IdLookupResult::Closest(self.header.num_entries as u64 - 1) } - pub fn iter<'a>(&'a self) -> SizedDictBlockIterator<'a> { - SizedDictBlockIterator { - header: &self.header, + pub fn iter<'a>(&'a self) -> SizedBlockIterator<'a> { + SizedBlockIterator { + header: Cow::Borrowed(&self.header), + data: self.data.clone(), + ix: 0, + last: None, + } + } + + pub fn into_iter(self) -> OwnedSizedBlockIterator { + SizedBlockIterator { + header: Cow::Owned(self.header), data: self.data.clone(), ix: 0, last: None, @@ -524,14 +534,17 @@ impl SizedDictBlock { } } -pub struct SizedDictBlockIterator<'a> { - header: &'a SizedBlockHeader, +type OwnedSizedBlockIterator = SizedBlockIterator<'static>; + +#[derive(Clone)] +pub struct SizedBlockIterator<'a> { + header: Cow<'a, SizedBlockHeader>, data: Bytes, ix: usize, last: Option>, } -impl<'a> Iterator for SizedDictBlockIterator<'a> { +impl<'a> Iterator for SizedBlockIterator<'a> { type Item = SizedDictEntry; fn next(&mut self) -> Option { diff --git a/src/structure/tfc/dict.rs b/src/structure/tfc/dict.rs index 78626bff..046c8a35 100644 --- a/src/structure/tfc/dict.rs +++ b/src/structure/tfc/dict.rs @@ -1,4 +1,4 @@ -use std::cmp::Ordering; +use std::{cmp::Ordering, borrow::Cow}; use crate::structure::{util::calculate_width, LogArrayBufBuilder, MonotonicLogArray}; use bytes::{BufMut, Bytes}; @@ -35,7 +35,7 @@ pub fn build_offset_logarray(buf: &mut B, mut offsets: Vec) { array_builder.finalize(); } -#[derive(Debug)] +#[derive(Clone, Debug)] pub struct SizedDict { offsets: MonotonicLogArray, pub(crate) data: Bytes, @@ -146,7 +146,85 @@ impl SizedDict { result } + + pub fn block_iter<'a>(&'a self) -> SizedDictBlockIterator<'a> { + SizedDictBlockIterator { + dict: Cow::Borrowed(self), + index: 0 + } + } + + pub fn into_block_iter(self) -> OwnedSizedDictBlockIterator { + SizedDictBlockIterator { + dict: Cow::Owned(self), + index: 0 + } + } + + pub fn iter<'a>(&'a self) -> impl Iterator+'a+Clone { + self.block_iter() + .flat_map(|b|b.into_iter()) + } + + pub fn into_iter(self) -> impl Iterator+Clone { + self.into_block_iter() + .flat_map(|b|b.into_iter()) + } +} + +type OwnedSizedDictBlockIterator = SizedDictBlockIterator<'static>; + +#[derive(Clone)] +pub struct SizedDictBlockIterator<'a> { + dict: Cow<'a, SizedDict>, + index: usize, +} + +impl<'a> Iterator for SizedDictBlockIterator<'a> { + type Item = SizedDictBlock; + + fn next(&mut self) -> Option { + if self.index >= self.dict.num_blocks() { + return None; + } + + let block = self.dict.block(self.index); + self.index += 1; + + Some(block) + } +} + +/* +pub struct SizedDictIterator<'a> { + dict: SizedDictBlockIterator<'a>, + block: Option>, +} + +impl<'a> Iterator for SizedDictIterator<'a> { + type Item = SizedDictEntry; + + fn next(&mut self) -> Option { + if let Some(entry) = self.block.as_ref().and_then(|b|b.next()) { + Some(entry) + } + else { + let next_block = self.dict.next(); + if next_block.is_none() { + return None; + } + let next_block = next_block.unwrap(); + let next_block_iter = next_block.iter(); + + let result = next_block_iter.next(); + + self.block = Some(next_block_iter); + + result + } + } } +*/ #[cfg(test)] mod tests { diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index b05f31e0..9aa328df 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -7,16 +7,16 @@ use itertools::*; use num_derive::FromPrimitive; use num_traits::FromPrimitive; use rug::Integer; -use std::marker::PhantomData; +use std::{marker::PhantomData, borrow::Cow}; use 
super::{ - block::{IdLookupResult, SizedDictEntry}, + block::{IdLookupResult, SizedDictEntry, SizedDictBlock}, decimal::{decimal_to_storage, storage_to_decimal}, dict::{build_dict_unchecked, build_offset_logarray, SizedDict}, integer::{bigint_to_storage, storage_to_bigint}, }; -#[derive(Debug)] +#[derive(Clone, Debug)] pub struct TypedDict { types_present: MonotonicLogArray, type_offsets: MonotonicLogArray, @@ -164,8 +164,71 @@ impl TypedDict { pub fn num_entries(&self) -> usize { self.num_entries } + + pub fn segment_iter<'a>(&'a self) -> DictSegmentIterator<'a> { + DictSegmentIterator { + dict: Cow::Borrowed(&self), + type_index: 0 + } + } + + pub fn into_segment_iter(self) -> OwnedDictSegmentIterator { + DictSegmentIterator { + dict: Cow::Owned(self), + type_index: 0 + } + } + + pub fn block_iter<'a>(&'a self) -> impl Iterator+'a+Clone { + self.segment_iter() + .flat_map(|(datatype, segment)| segment.into_block_iter() + .map(move |block| (datatype, block))) + } + + pub fn into_block_iter(self) -> impl Iterator+Clone { + self.into_segment_iter() + .flat_map(|(datatype, segment)| segment.into_block_iter() + .map(move |block| (datatype, block))) + } + + pub fn iter<'a>(&'a self) -> impl Iterator+'a+Clone { + self.block_iter() + .flat_map(|(datatype, segment)| segment.into_iter() + .map(move |entry| (datatype, entry))) + } + + pub fn into_iter(self) -> impl Iterator+Clone { + self.into_block_iter() + .flat_map(|(datatype, segment)| segment.into_iter() + .map(move |entry| (datatype, entry))) + } +} + +type OwnedDictSegmentIterator = DictSegmentIterator<'static>; + +#[derive(Clone)] +pub struct DictSegmentIterator<'a> { + dict: Cow<'a, TypedDict>, + type_index: usize, } +impl<'a> Iterator for DictSegmentIterator<'a> { + type Item = (Datatype, SizedDict); + + fn next(&mut self) -> Option<(Datatype, SizedDict)> { + if self.type_index >= self.dict.types_present.len() { + return None; + } + + let (segment, _) = self.dict.inner_type_segment(self.type_index); + let datatype = self.dict.type_for_type_index(self.type_index); + self.type_index += 1; + + Some((datatype, segment)) + } +} + + pub struct TypedDictSegment { dict: SizedDict, _x: PhantomData, @@ -774,4 +837,64 @@ mod tests { dict.id(&int("99999999999999999999999999")) ); } + + #[test] + fn iterate_full_blocks() { + let mut vec: Vec<(Datatype, Bytes)> = vec![ + "fdsa".to_string().make_entry(), + "a".to_string().make_entry(), + "bc".to_string().make_entry(), + "bcd".to_string().make_entry(), + "z".to_string().make_entry(), + "Batty".to_string().make_entry(), + "Batman".to_string().make_entry(), + "apple".to_string().make_entry(), + (-500_i32).make_entry(), + 20_u32.make_entry(), + 22_u32.make_entry(), + 23_u32.make_entry(), + 24_u32.make_entry(), + 25_u32.make_entry(), + 26_u32.make_entry(), + 27_u32.make_entry(), + 28_u32.make_entry(), + 3000_u32.make_entry(), + (-3_i64).make_entry(), + Decimal("-12342343.2348973".to_string()).make_entry(), + Decimal("234.8973".to_string()).make_entry(), + Decimal("0.2348973".to_string()).make_entry(), + Decimal("23423423.8973".to_string()).make_entry(), + Decimal("3.3".to_string()).make_entry(), + Decimal("0.001".to_string()).make_entry(), + Decimal("-0.001".to_string()).make_entry(), + Decimal("2".to_string()).make_entry(), + Decimal("0".to_string()).make_entry(), + 4.389832_f32.make_entry(), + 23434.389832_f32.make_entry(), + int("239487329872343987").make_entry(), + ]; + vec.sort(); + let mut used_types = BytesMut::new(); + let mut type_offsets = BytesMut::new(); + let mut block_offsets = BytesMut::new(); 
+ let mut data = BytesMut::new(); + build_multiple_segments( + &mut used_types, + &mut type_offsets, + &mut block_offsets, + &mut data, + vec.clone().into_iter(), + ); + + let dict = TypedDict::from_parts( + used_types.freeze(), + type_offsets.freeze(), + block_offsets.freeze(), + data.freeze(), + ); + + let actual: Vec<_> = dict.iter().map(|(dt,e)|(dt, e.to_bytes())).collect(); + + assert_eq!(vec, actual); + } } From 73cf21414539965140c1917bc9051611328a07ca Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Mon, 28 Nov 2022 17:10:55 +0100 Subject: [PATCH 30/99] prereserve a vector with the right size on block iteration --- src/structure/tfc/block.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/structure/tfc/block.rs b/src/structure/tfc/block.rs index 0d25e8c1..b566de0c 100644 --- a/src/structure/tfc/block.rs +++ b/src/structure/tfc/block.rs @@ -571,8 +571,10 @@ impl<'a> Iterator for SizedBlockIterator<'a> { Some(SizedDictEntry::new(last.clone())) } else { - let result = vec![self.header.head.clone()]; - self.last = Some(result.clone()); + let mut last = Vec::with_capacity(BLOCK_SIZE); + last.push(self.header.head.clone()); + let result = last.clone(); + self.last = Some(last); Some(SizedDictEntry::new(result)) } } From 1185e233cb32eca2f7931d88189db5f906ee45cb Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Mon, 28 Nov 2022 17:32:07 +0100 Subject: [PATCH 31/99] refactor entry buf code to reuse code smarter --- src/structure/tfc/block.rs | 139 ++++++++++++++++--------------------- 1 file changed, 60 insertions(+), 79 deletions(-) diff --git a/src/structure/tfc/block.rs b/src/structure/tfc/block.rs index b566de0c..5b35f58b 100644 --- a/src/structure/tfc/block.rs +++ b/src/structure/tfc/block.rs @@ -104,15 +104,15 @@ impl SizedDictEntry { pub fn as_buf(&self) -> SizedDictEntryBuf { SizedDictEntryBuf { - entry: self, + entry: Cow::Borrowed(self), slice_ix: 0, pos_in_slice: 0, } } pub fn into_buf(self) -> OwnedSizedDictEntryBuf { - OwnedSizedDictEntryBuf { - entry: self, + SizedDictEntryBuf { + entry: Cow::Owned(self), slice_ix: 0, pos_in_slice: 0, } @@ -279,100 +279,81 @@ impl PartialOrd for SizedDictEntry { #[derive(Clone)] pub struct SizedDictEntryBuf<'a> { - entry: &'a SizedDictEntry, + entry: Cow<'a, SizedDictEntry>, slice_ix: usize, pos_in_slice: usize, } -fn calculate_remaining<'a>(entry: &SizedDictEntry, slice_ix: usize, pos_in_slice: usize) -> usize { - let total: usize = entry.0.iter().skip(slice_ix).map(|s| s.len()).sum(); - total - pos_in_slice -} - -fn calculate_chunk<'a>(entry: &'a SizedDictEntry, slice_ix: usize, pos_in_slice: usize) -> &[u8] { - if slice_ix >= entry.0.len() { - &[] - } else { - let slice = &entry.0[slice_ix]; - &slice[pos_in_slice..] - } -} - -fn calculate_advance<'a>( - entry: &'a SizedDictEntry, - slice_ix: &mut usize, - pos_in_slice: &mut usize, - mut cnt: usize, -) { - if *slice_ix < entry.0.len() { - let slice = &entry.0[*slice_ix]; - let remaining_in_slice = slice.len() - *pos_in_slice; - - if remaining_in_slice > cnt { - // we remain in the slice we're at. 
- *pos_in_slice += cnt; - } else { - // we are starting at the next slice - cnt -= remaining_in_slice; - *slice_ix += 1; - - loop { - if entry.0.len() >= *slice_ix { - // past the end - *pos_in_slice = 0; - break; - } - - let slice_len = entry.0[*slice_ix].len(); - - if cnt < slice_len { - // this is our slice - *pos_in_slice = cnt; - break; - } - - // not our slice, so advance to next - cnt -= entry.0.len(); - *slice_ix += 1; - } - } - } -} - impl<'a> Buf for SizedDictEntryBuf<'a> { fn remaining(&self) -> usize { - calculate_remaining(self.entry, self.slice_ix, self.pos_in_slice) + { + let pos_in_slice = self.pos_in_slice; + let total: usize = self + .entry + .0 + .iter() + .skip(self.slice_ix) + .map(|s| s.len()) + .sum(); + total - pos_in_slice + } } fn chunk(&self) -> &[u8] { - calculate_chunk(self.entry, self.slice_ix, self.pos_in_slice) + { + let pos_in_slice = self.pos_in_slice; + if self.slice_ix >= self.entry.0.len() { + &[] + } else { + let slice = &self.entry.0[self.slice_ix]; + &slice[pos_in_slice..] + } + } } fn advance(&mut self, cnt: usize) { - calculate_advance(self.entry, &mut self.slice_ix, &mut self.pos_in_slice, cnt) - } -} - -pub struct OwnedSizedDictEntryBuf { - entry: SizedDictEntry, - slice_ix: usize, - pos_in_slice: usize, -} + { + let pos_in_slice: &mut usize = &mut self.pos_in_slice; + let mut cnt = cnt; + if self.slice_ix < self.entry.0.len() { + let slice = &self.entry.0[self.slice_ix]; + let remaining_in_slice = slice.len() - *pos_in_slice; + + if remaining_in_slice > cnt { + // we remain in the slice we're at. + *pos_in_slice += cnt; + } else { + // we are starting at the next slice + cnt -= remaining_in_slice; + self.slice_ix += 1; + + loop { + if self.entry.0.len() >= self.slice_ix { + // past the end + *pos_in_slice = 0; + break; + } -impl Buf for OwnedSizedDictEntryBuf { - fn remaining(&self) -> usize { - calculate_remaining(&self.entry, self.slice_ix, self.pos_in_slice) - } + let slice_len = self.entry.0[self.slice_ix].len(); - fn chunk(&self) -> &[u8] { - calculate_chunk(&self.entry, self.slice_ix, self.pos_in_slice) - } + if cnt < slice_len { + // this is our slice + *pos_in_slice = cnt; + break; + } - fn advance(&mut self, cnt: usize) { - calculate_advance(&self.entry, &mut self.slice_ix, &mut self.pos_in_slice, cnt) + // not our slice, so advance to next + cnt -= self.entry.0.len(); + self.slice_ix += 1; + } + } + } + } } } +type OwnedSizedDictEntryBuf = SizedDictEntryBuf<'static>; + #[derive(Debug)] pub struct SizedDictBlock { header: SizedBlockHeader, From e23b065c7f696c0c8c3e9dad7a853ccb8ce7c800 Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Mon, 28 Nov 2022 17:32:16 +0100 Subject: [PATCH 32/99] reformat --- src/structure/tfc/dict.rs | 16 +++++++------- src/structure/tfc/typed.rs | 43 ++++++++++++++++++++------------------ 2 files changed, 30 insertions(+), 29 deletions(-) diff --git a/src/structure/tfc/dict.rs b/src/structure/tfc/dict.rs index 046c8a35..c47b65f9 100644 --- a/src/structure/tfc/dict.rs +++ b/src/structure/tfc/dict.rs @@ -1,4 +1,4 @@ -use std::{cmp::Ordering, borrow::Cow}; +use std::{borrow::Cow, cmp::Ordering}; use crate::structure::{util::calculate_width, LogArrayBufBuilder, MonotonicLogArray}; use bytes::{BufMut, Bytes}; @@ -150,25 +150,23 @@ impl SizedDict { pub fn block_iter<'a>(&'a self) -> SizedDictBlockIterator<'a> { SizedDictBlockIterator { dict: Cow::Borrowed(self), - index: 0 + index: 0, } } pub fn into_block_iter(self) -> OwnedSizedDictBlockIterator { SizedDictBlockIterator { dict: Cow::Owned(self), - index: 
0 + index: 0, } } - pub fn iter<'a>(&'a self) -> impl Iterator+'a+Clone { - self.block_iter() - .flat_map(|b|b.into_iter()) + pub fn iter<'a>(&'a self) -> impl Iterator + 'a + Clone { + self.block_iter().flat_map(|b| b.into_iter()) } - pub fn into_iter(self) -> impl Iterator+Clone { - self.into_block_iter() - .flat_map(|b|b.into_iter()) + pub fn into_iter(self) -> impl Iterator + Clone { + self.into_block_iter().flat_map(|b| b.into_iter()) } } diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index 9aa328df..e00f8a29 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -7,10 +7,10 @@ use itertools::*; use num_derive::FromPrimitive; use num_traits::FromPrimitive; use rug::Integer; -use std::{marker::PhantomData, borrow::Cow}; +use std::{borrow::Cow, marker::PhantomData}; use super::{ - block::{IdLookupResult, SizedDictEntry, SizedDictBlock}, + block::{IdLookupResult, SizedDictBlock, SizedDictEntry}, decimal::{decimal_to_storage, storage_to_decimal}, dict::{build_dict_unchecked, build_offset_logarray, SizedDict}, integer::{bigint_to_storage, storage_to_bigint}, @@ -168,39 +168,43 @@ impl TypedDict { pub fn segment_iter<'a>(&'a self) -> DictSegmentIterator<'a> { DictSegmentIterator { dict: Cow::Borrowed(&self), - type_index: 0 + type_index: 0, } } pub fn into_segment_iter(self) -> OwnedDictSegmentIterator { DictSegmentIterator { dict: Cow::Owned(self), - type_index: 0 + type_index: 0, } } - pub fn block_iter<'a>(&'a self) -> impl Iterator+'a+Clone { - self.segment_iter() - .flat_map(|(datatype, segment)| segment.into_block_iter() - .map(move |block| (datatype, block))) + pub fn block_iter<'a>( + &'a self, + ) -> impl Iterator + 'a + Clone { + self.segment_iter().flat_map(|(datatype, segment)| { + segment + .into_block_iter() + .map(move |block| (datatype, block)) + }) } - pub fn into_block_iter(self) -> impl Iterator+Clone { - self.into_segment_iter() - .flat_map(|(datatype, segment)| segment.into_block_iter() - .map(move |block| (datatype, block))) + pub fn into_block_iter(self) -> impl Iterator + Clone { + self.into_segment_iter().flat_map(|(datatype, segment)| { + segment + .into_block_iter() + .map(move |block| (datatype, block)) + }) } - pub fn iter<'a>(&'a self) -> impl Iterator+'a+Clone { + pub fn iter<'a>(&'a self) -> impl Iterator + 'a + Clone { self.block_iter() - .flat_map(|(datatype, segment)| segment.into_iter() - .map(move |entry| (datatype, entry))) + .flat_map(|(datatype, segment)| segment.into_iter().map(move |entry| (datatype, entry))) } - pub fn into_iter(self) -> impl Iterator+Clone { + pub fn into_iter(self) -> impl Iterator + Clone { self.into_block_iter() - .flat_map(|(datatype, segment)| segment.into_iter() - .map(move |entry| (datatype, entry))) + .flat_map(|(datatype, segment)| segment.into_iter().map(move |entry| (datatype, entry))) } } @@ -228,7 +232,6 @@ impl<'a> Iterator for DictSegmentIterator<'a> { } } - pub struct TypedDictSegment { dict: SizedDict, _x: PhantomData, @@ -893,7 +896,7 @@ mod tests { data.freeze(), ); - let actual: Vec<_> = dict.iter().map(|(dt,e)|(dt, e.to_bytes())).collect(); + let actual: Vec<_> = dict.iter().map(|(dt, e)| (dt, e.to_bytes())).collect(); assert_eq!(vec, actual); } From 6411aa382c815b9086440b7eb82e557a1a3a64f8 Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Tue, 29 Nov 2022 14:23:59 +0100 Subject: [PATCH 33/99] work --- src/layer/id_map.rs | 57 +++++++++++++++++++++------------- src/layer/internal/base.rs | 6 ++-- src/layer/internal/child.rs | 6 ++-- src/layer/internal/mod.rs 
| 6 ++-- src/storage/cache.rs | 8 ++--- src/storage/consts.rs | 10 ++++-- src/storage/delta.rs | 12 ++++---- src/storage/file.rs | 32 +++++++++++++++++++ src/storage/layer.rs | 34 ++++++++++++-------- src/structure/mod.rs | 1 + src/structure/tfc/block.rs | 2 +- src/structure/tfc/decimal.rs | 2 -- src/structure/tfc/dict.rs | 7 +++++ src/structure/tfc/file.rs | 60 ++++++++++++++++++++++++++++++++++++ src/structure/tfc/mod.rs | 4 +++ src/structure/tfc/typed.rs | 15 +++++++++ 16 files changed, 203 insertions(+), 59 deletions(-) create mode 100644 src/structure/tfc/file.rs diff --git a/src/layer/id_map.rs b/src/layer/id_map.rs index 43994103..345e5f19 100644 --- a/src/layer/id_map.rs +++ b/src/layer/id_map.rs @@ -75,9 +75,9 @@ pub async fn memory_construct_idmaps_upto( } pub async fn construct_idmaps_from_structures( - node_dicts: &[PfcDict], - predicate_dicts: &[PfcDict], - value_dicts: &[PfcDict], + node_dicts: Vec, + predicate_dicts: Vec, + value_dicts: Vec, node_value_idmaps: &[IdMap], predicate_idmaps: &[IdMap], idmap_files: IdMapFiles, @@ -86,49 +86,54 @@ pub async fn construct_idmaps_from_structures debug_assert!(node_dicts.len() == value_dicts.len()); debug_assert!(node_dicts.len() == node_value_idmaps.len()); debug_assert!(node_dicts.len() == predicate_idmaps.len()); + let len = node_dicts.len(); - let mut node_iters = Vec::with_capacity(node_dicts.len()); + let mut node_iters = Vec::with_capacity(len); let mut node_offset = 0; - for (ix, dict) in node_dicts.iter().enumerate() { + let node_entries_len: Vec<_> = node_dicts.iter().map(|d|d.num_entries()).collect(); + for (ix, dict) in node_dicts.into_iter().enumerate() { let idmap = node_value_idmaps[ix].clone(); + let num_entries = dict.num_entries(); node_iters.push( - dict.entries() + dict.into_iter() .enumerate() .map(move |(i, e)| (idmap.inner_to_outer(i as u64) + node_offset as u64, e)), ); - node_offset += dict.len() + value_dicts[ix].len(); + node_offset += num_entries + value_dicts[ix].num_entries(); } - let mut value_iters = Vec::with_capacity(node_dicts.len()); + let mut value_iters = Vec::with_capacity(len); let mut value_offset = 0; - for (ix, dict) in value_dicts.iter().enumerate() { + for (ix, dict) in value_dicts.into_iter().enumerate() { let idmap = node_value_idmaps[ix].clone(); - let node_count = node_dicts[ix].len(); - value_iters.push(dict.entries().enumerate().map(move |(i, e)| { + let node_count = node_entries_len[ix]; + let num_entries = dict.num_entries(); + value_iters.push(dict.into_iter().enumerate().map(move |(i, e)| { ( idmap.inner_to_outer(i as u64 + node_count as u64) + value_offset as u64, e, ) })); - value_offset += node_count + dict.len(); + value_offset += node_count + num_entries; } - let mut predicate_iters = Vec::with_capacity(node_dicts.len()); + let mut predicate_iters = Vec::with_capacity(len); let mut predicate_offset = 0; - for (ix, dict) in predicate_dicts.iter().enumerate() { + for (ix, dict) in predicate_dicts.into_iter().enumerate() { let idmap = predicate_idmaps[ix].clone(); + let num_entries = dict.num_entries(); predicate_iters.push( - dict.entries() + dict.into_iter() .enumerate() .map(move |(i, e)| (idmap.inner_to_outer(i as u64) + predicate_offset as u64, e)), ); - predicate_offset += dict.len(); + predicate_offset += num_entries; } - let entry_comparator = |vals: &[Option<&(u64, PfcDictEntry)>]| { + let entry_comparator = |vals: &[Option<&(u64, SizedDictEntry)>]| { vals.iter() .enumerate() .filter(|(_, x)| x.is_some()) @@ -136,8 +141,16 @@ pub async fn 
construct_idmaps_from_structures .map(|x| x.0) }; - let sorted_node_iter = sorted_iterator(node_iters, entry_comparator); - let sorted_value_iter = sorted_iterator(value_iters, entry_comparator); + let typed_entry_comparator = |vals: &[Option<&(u64, (Datatype, SizedDictEntry))>]| { + vals.iter() + .enumerate() + .filter(|(_, x)| x.is_some()) + .min_by(|(_, x), (_, y)| x.unwrap().1.cmp(&y.unwrap().1)) + .map(|x| x.0) + }; + + let sorted_node_iter = sorted_iterator(node_iters, entry_comparator).map(|(i,s)|(i, (Datatype::String, s))); + let sorted_value_iter = sorted_iterator(value_iters, typed_entry_comparator); let sorted_node_value_iter = sorted_node_iter.chain(sorted_value_iter).map(|(id, _)| id); let sorted_predicate_iter = sorted_iterator(predicate_iters, entry_comparator).map(|(id, _)| id); @@ -193,9 +206,9 @@ async fn construct_idmaps_from_layers( .collect(); construct_idmaps_from_structures( - &node_dicts, - &predicate_dicts, - &value_dicts, + node_dicts, + predicate_dicts, + value_dicts, &node_value_idmaps, &predicate_idmaps, idmap_files, diff --git a/src/layer/internal/base.rs b/src/layer/internal/base.rs index 1c37c107..1a18df01 100644 --- a/src/layer/internal/base.rs +++ b/src/layer/internal/base.rs @@ -22,9 +22,9 @@ use std::pin::Pin; #[derive(Clone)] pub struct BaseLayer { pub(super) name: [u32; 5], - pub(super) node_dictionary: PfcDict, - pub(super) predicate_dictionary: PfcDict, - pub(super) value_dictionary: PfcDict, + pub(super) node_dictionary: StringDict, + pub(super) predicate_dictionary: StringDict, + pub(super) value_dictionary: TypedDict, pub(super) node_value_idmap: IdMap, pub(super) predicate_idmap: IdMap, diff --git a/src/layer/internal/child.rs b/src/layer/internal/child.rs index 4c911747..c688e66b 100644 --- a/src/layer/internal/child.rs +++ b/src/layer/internal/child.rs @@ -25,9 +25,9 @@ pub struct ChildLayer { pub(super) name: [u32; 5], pub(super) parent: Arc, - pub(super) node_dictionary: PfcDict, - pub(super) predicate_dictionary: PfcDict, - pub(super) value_dictionary: PfcDict, + pub(super) node_dictionary: StringDict, + pub(super) predicate_dictionary: StringDict, + pub(super) value_dictionary: TypedDict, pub(super) node_value_idmap: IdMap, pub(super) predicate_idmap: IdMap, diff --git a/src/layer/internal/mod.rs b/src/layer/internal/mod.rs index 246af53f..169f6462 100644 --- a/src/layer/internal/mod.rs +++ b/src/layer/internal/mod.rs @@ -64,7 +64,7 @@ impl InternalLayer { count } - pub fn node_dictionary(&self) -> &PfcDict { + pub fn node_dictionary(&self) -> &StringDict { match self { Base(base) => &base.node_dictionary, Child(child) => &child.node_dictionary, @@ -72,7 +72,7 @@ impl InternalLayer { } } - pub fn predicate_dictionary(&self) -> &PfcDict { + pub fn predicate_dictionary(&self) -> &StringDict { match self { Base(base) => &base.predicate_dictionary, Child(child) => &child.predicate_dictionary, @@ -80,7 +80,7 @@ impl InternalLayer { } } - pub fn value_dictionary(&self) -> &PfcDict { + pub fn value_dictionary(&self) -> &TypedDict { match self { Base(base) => &base.value_dictionary, Child(child) => &child.value_dictionary, diff --git a/src/storage/cache.rs b/src/storage/cache.rs index 01f96db8..2d00302b 100644 --- a/src/storage/cache.rs +++ b/src/storage/cache.rs @@ -1,6 +1,6 @@ use super::layer::*; use crate::layer::*; -use crate::structure::PfcDict; +use crate::structure::{StringDict, TypedDict}; use async_trait::async_trait; use std::collections::HashMap; use std::io; @@ -135,7 +135,7 @@ impl LayerStore for CachedLayerStore { } } - async fn 
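The offset bookkeeping in the idmap construction above assigns every merged dictionary a contiguous id range: within a layer, node ids come first, then value ids, and each successive layer starts where the previous one ended. A small self-contained model of that mapping (names hypothetical; `inner_to_outer` is reduced to a plain permutation table):

fn outer_id(inner: usize, permutation: &[u64], layer_offset: u64) -> u64 {
    // permutation[i] is the sorted position of inner id i within its layer
    permutation[inner] + layer_offset
}

fn main() {
    let permutation = vec![2, 0, 1];
    let layer_offset = 10; // ids consumed by earlier layers' nodes and values
    assert_eq!(outer_id(0, &permutation, layer_offset), 12);
    assert_eq!(outer_id(1, &permutation, layer_offset), 10);
}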
get_node_dictionary(&self, name: [u32; 5]) -> io::Result> { + async fn get_node_dictionary(&self, name: [u32; 5]) -> io::Result> { // is layer in cache? if so, we can use the cached version if let Some(layer) = self.cache.get_layer_from_cache(name) { // unless it is a rollup @@ -147,7 +147,7 @@ impl LayerStore for CachedLayerStore { self.inner.get_node_dictionary(name).await } - async fn get_predicate_dictionary(&self, name: [u32; 5]) -> io::Result> { + async fn get_predicate_dictionary(&self, name: [u32; 5]) -> io::Result> { // is layer in cache? if so, we can use the cached version if let Some(layer) = self.cache.get_layer_from_cache(name) { // unless it is a rollup @@ -159,7 +159,7 @@ impl LayerStore for CachedLayerStore { self.inner.get_predicate_dictionary(name).await } - async fn get_value_dictionary(&self, name: [u32; 5]) -> io::Result> { + async fn get_value_dictionary(&self, name: [u32; 5]) -> io::Result> { // is layer in cache? if so, we can use the cached version if let Some(layer) = self.cache.get_layer_from_cache(name) { // unless it is a rollup diff --git a/src/storage/consts.rs b/src/storage/consts.rs index 61e1da41..de296ebb 100644 --- a/src/storage/consts.rs +++ b/src/storage/consts.rs @@ -5,6 +5,8 @@ pub struct Filenames { pub predicate_dictionary_blocks: &'static str, pub predicate_dictionary_offsets: &'static str, + pub value_dictionary_types_present: &'static str, + pub value_dictionary_type_offsets: &'static str, pub value_dictionary_blocks: &'static str, pub value_dictionary_offsets: &'static str, @@ -87,13 +89,15 @@ pub struct Filenames { } pub const FILENAMES: Filenames = Filenames { - node_dictionary_blocks: "node_dictionary_blocks.pfc", + node_dictionary_blocks: "node_dictionary_blocks.tfc", node_dictionary_offsets: "node_dictionary_offsets.logarray", - predicate_dictionary_blocks: "predicate_dictionary_blocks.pfc", + predicate_dictionary_blocks: "predicate_dictionary_blocks.tfc", predicate_dictionary_offsets: "predicate_dictionary_offsets.logarray", - value_dictionary_blocks: "value_dictionary_blocks.pfc", + value_dictionary_types_present: "value_dictionary_types.logarray", + value_dictionary_type_offsets: "value_dictionary_type_offsets.logarray", + value_dictionary_blocks: "value_dictionary_blocks.tfc", value_dictionary_offsets: "value_dictionary_offsets.logarray", node_value_idmap_bits: "node_value_idmap_bits.bitarray", diff --git a/src/storage/delta.rs b/src/storage/delta.rs index 87bf5f14..ad6e49ae 100644 --- a/src/storage/delta.rs +++ b/src/storage/delta.rs @@ -48,7 +48,7 @@ async fn get_node_dicts_from_disk( store: &S, name: [u32; 5], upto: [u32; 5], -) -> io::Result> { +) -> io::Result> { let mut result = Vec::new(); walk_backwards_from_disk_upto!(store, name, upto, current, { let dict = store @@ -67,7 +67,7 @@ async fn get_predicate_dicts_from_disk( store: &S, name: [u32; 5], upto: [u32; 5], -) -> io::Result> { +) -> io::Result> { let mut result = Vec::new(); walk_backwards_from_disk_upto!(store, name, upto, current, { let dict = store @@ -86,7 +86,7 @@ async fn get_value_dicts_from_disk( store: &S, name: [u32; 5], upto: [u32; 5], -) -> io::Result> { +) -> io::Result> { let mut result = Vec::new(); walk_backwards_from_disk_upto!(store, name, upto, current, { let dict = store @@ -208,9 +208,9 @@ async fn dictionary_rollup_upto ChildLayerFiles { } } +#[derive(Clone)] +pub struct TypedDictionaryMaps { + pub types_present_map: Bytes, + pub type_offsets_map: Bytes, + pub blocks_map: Bytes, + pub offsets_map: Bytes, +} + +#[derive(Clone)] +pub struct 
TypedDictionaryFiles { + pub types_present_file: F, + pub type_offsets_file: F, + pub blocks_file: F, + pub offsets_file: F, +} + +impl TypedDictionaryFiles { + pub async fn map_all(&self) -> io::Result { + let types_present_map = self.types_present_file.map().await?; + let type_offsets_map = self.type_offsets_file.map().await?; + let blocks_map = self.blocks_file.map().await?; + let offsets_map = self.offsets_file.map().await?; + + Ok(TypedDictionaryMaps { + types_present_map, + type_offsets_map, + blocks_map, + offsets_map, + }) + } +} + #[derive(Clone)] pub struct DictionaryMaps { pub blocks_map: Bytes, diff --git a/src/storage/layer.rs b/src/storage/layer.rs index db6806b7..bd17c109 100644 --- a/src/storage/layer.rs +++ b/src/storage/layer.rs @@ -10,10 +10,12 @@ use crate::layer::{ OptInternalLayerTriplePredicateIterator, OptInternalLayerTripleSubjectIterator, RollupLayer, SimpleLayerBuilder, }; +use crate::structure::StringDict; +use crate::structure::TypedDict; use crate::structure::bitarray::bitarray_len_from_file; use crate::structure::logarray::logarray_file_get_length_and_width; use crate::structure::{ - dict_file_get_count, util, AdjacencyList, BitIndex, LogArray, MonotonicLogArray, PfcDict, + dict_file_get_count, util, AdjacencyList, BitIndex, LogArray, MonotonicLogArray, WaveletTree, }; @@ -75,11 +77,11 @@ pub trait LayerStore: 'static + Packable + Send + Sync { async fn get_layer_parent_name(&self, name: [u32; 5]) -> io::Result>; - async fn get_node_dictionary(&self, name: [u32; 5]) -> io::Result>; + async fn get_node_dictionary(&self, name: [u32; 5]) -> io::Result>; - async fn get_predicate_dictionary(&self, name: [u32; 5]) -> io::Result>; + async fn get_predicate_dictionary(&self, name: [u32; 5]) -> io::Result>; - async fn get_value_dictionary(&self, name: [u32; 5]) -> io::Result>; + async fn get_value_dictionary(&self, name: [u32; 5]) -> io::Result>; async fn get_node_count(&self, name: [u32; 5]) -> io::Result>; @@ -738,9 +740,15 @@ pub trait PersistentLayerStore: 'static + Send + Sync + Clone { async fn value_dictionary_files( &self, layer: [u32; 5], - ) -> io::Result> { + ) -> io::Result> { // does layer exist? if self.directory_exists(layer).await? { + let types_present_file = self + .get_file(layer, FILENAMES.value_dictionary_types_present) + .await?; + let type_offsets_file = self + .get_file(layer, FILENAMES.value_dictionary_type_offsets) + .await?; let blocks_file = self .get_file(layer, FILENAMES.value_dictionary_blocks) .await?; @@ -748,7 +756,9 @@ pub trait PersistentLayerStore: 'static + Send + Sync + Clone { .get_file(layer, FILENAMES.value_dictionary_offsets) .await?; - Ok(DictionaryFiles { + Ok(TypedDictionaryFiles { + types_present_file, + type_offsets_file, blocks_file, offsets_file, }) @@ -1546,34 +1556,34 @@ impl io::Result> { + async fn get_node_dictionary(&self, name: [u32; 5]) -> io::Result> { if self.directory_exists(name).await? { let files = self.node_dictionary_files(name).await?; let maps = files.map_all().await?; - Ok(Some(PfcDict::parse(maps.blocks_map, maps.offsets_map)?)) + Ok(Some(StringDict::parse(maps.blocks_map, maps.offsets_map)?)) } else { Ok(None) } } - async fn get_predicate_dictionary(&self, name: [u32; 5]) -> io::Result> { + async fn get_predicate_dictionary(&self, name: [u32; 5]) -> io::Result> { if self.directory_exists(name).await? 
{ let files = self.predicate_dictionary_files(name).await?; let maps = files.map_all().await?; - Ok(Some(PfcDict::parse(maps.blocks_map, maps.offsets_map)?)) + Ok(Some(StringDict::parse(maps.blocks_map, maps.offsets_map)?)) } else { Ok(None) } } - async fn get_value_dictionary(&self, name: [u32; 5]) -> io::Result> { + async fn get_value_dictionary(&self, name: [u32; 5]) -> io::Result> { if self.directory_exists(name).await? { let files = self.value_dictionary_files(name).await?; let maps = files.map_all().await?; - Ok(Some(PfcDict::parse(maps.blocks_map, maps.offsets_map)?)) + Ok(Some(TypedDict::from_parts(maps.blocks_map, maps.offsets_map)?)) } else { Ok(None) } diff --git a/src/structure/mod.rs b/src/structure/mod.rs index 78906419..b4ada408 100644 --- a/src/structure/mod.rs +++ b/src/structure/mod.rs @@ -19,4 +19,5 @@ pub use bitarray::*; pub use bitindex::*; pub use logarray::*; pub use pfc::*; +pub use tfc::*; pub use wavelettree::*; diff --git a/src/structure/tfc/block.rs b/src/structure/tfc/block.rs index 5b35f58b..fbc63d9d 100644 --- a/src/structure/tfc/block.rs +++ b/src/structure/tfc/block.rs @@ -352,7 +352,7 @@ impl<'a> Buf for SizedDictEntryBuf<'a> { } } -type OwnedSizedDictEntryBuf = SizedDictEntryBuf<'static>; +pub type OwnedSizedDictEntryBuf = SizedDictEntryBuf<'static>; #[derive(Debug)] pub struct SizedDictBlock { diff --git a/src/structure/tfc/decimal.rs b/src/structure/tfc/decimal.rs index 7fa0143d..b7acaf6c 100644 --- a/src/structure/tfc/decimal.rs +++ b/src/structure/tfc/decimal.rs @@ -1,8 +1,6 @@ use bytes::Buf; use rug::Integer; -use crate::structure::tfc::integer; - use super::integer::{bigint_to_storage, storage_to_bigint_and_sign, NEGATIVE_ZERO}; fn encode_fraction(fraction: Option<&str>) -> Vec { diff --git a/src/structure/tfc/dict.rs b/src/structure/tfc/dict.rs index c47b65f9..301c393b 100644 --- a/src/structure/tfc/dict.rs +++ b/src/structure/tfc/dict.rs @@ -168,6 +168,13 @@ impl SizedDict { pub fn into_iter(self) -> impl Iterator + Clone { self.into_block_iter().flat_map(|b| b.into_iter()) } + + pub fn num_entries(&self) -> usize { + let num_blocks = self.num_blocks(); + let last_block_size = self.block_num_elements(num_blocks - 1); + + (num_blocks-1) * BLOCK_SIZE + last_block_size as usize + } } type OwnedSizedDictBlockIterator = SizedDictBlockIterator<'static>; diff --git a/src/structure/tfc/file.rs b/src/structure/tfc/file.rs new file mode 100644 index 00000000..15a75c2d --- /dev/null +++ b/src/structure/tfc/file.rs @@ -0,0 +1,60 @@ +use bytes::BytesMut; + +use crate::{storage::*, structure::util::sorted_iterator}; + +use super::{*, dict::{build_dict_unchecked, build_offset_logarray}}; + +pub struct StringDictFileBuilder { + /// the file that this builder writes the pfc blocks to + blocks_file: W, + /// the file that this builder writes the block offsets to + block_offsets_file: W, + + strings: Vec, +} + +impl StringDictFileBuilder { + pub fn new(blocks_file: W, block_offsets_file: W) -> Self { + Self { + blocks_file, + block_offsets_file, + strings: Vec::new() + } + } +} + +pub async fn merge_string_dictionaries< + 'a, + F: 'static + FileLoad + FileStore, + I: Iterator, +>( + dictionaries: I, + dict_files: DictionaryFiles, +) -> io::Result<()> { + let iterators: Vec<_> = dictionaries.map(|d| d.iter()).collect(); + + let pick_fn = |vals: &[Option<&SizedDictEntry>]| { + vals.iter() + .enumerate() + .filter(|(_, v)| v.is_some()) + .min_by(|(_, x), (_, y)| x.cmp(y)) + .map(|(ix, _)| ix) + }; + + let sorted_iterator = sorted_iterator(iterators, pick_fn); + + let 
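The `num_entries` calculation above relies on the invariant that every block except the last one is full. A worked example of that arithmetic under the crate's `BLOCK_SIZE` of 8 (a sketch, not the crate's API):

const BLOCK_SIZE: usize = 8;

// Mirrors the shape of SizedDict::num_entries: all blocks but the last
// hold exactly BLOCK_SIZE entries.
fn num_entries(num_blocks: usize, last_block_size: usize) -> usize {
    (num_blocks - 1) * BLOCK_SIZE + last_block_size
}

fn main() {
    // three blocks, the last holding 5 entries: 2 * 8 + 5 = 21
    assert_eq!(num_entries(3, 5), 21);
}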
blocks_file_writer = dict_files.blocks_file.open_write().await?; + let offsets_file_writer = dict_files.offsets_file.open_write().await?; + + let mut offsets = Vec::new(); + let mut offsets_buf = BytesMut::new(); + let mut data_buf = BytesMut::new(); + build_dict_unchecked(0, &mut offsets, &mut data_buf, sorted_iterator); + build_offset_logarray(&mut offsets_buf, offsets); + + + + + builder.add_all_entries(sorted_iterator).await?; + builder.finalize().await +} diff --git a/src/structure/tfc/mod.rs b/src/structure/tfc/mod.rs index 4b0b89cd..d6297508 100644 --- a/src/structure/tfc/mod.rs +++ b/src/structure/tfc/mod.rs @@ -3,3 +3,7 @@ pub mod decimal; pub mod dict; pub mod integer; pub mod typed; +pub mod file; + +pub use typed::*; +pub use block::{SizedDictEntry, SizedDictEntryBuf, OwnedSizedDictEntryBuf}; diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index e00f8a29..8f87bce1 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -232,6 +232,7 @@ impl<'a> Iterator for DictSegmentIterator<'a> { } } +#[derive(Clone)] pub struct TypedDictSegment { dict: SizedDict, _x: PhantomData, @@ -255,8 +256,22 @@ impl TypedDictSegment { let slice = val.to_lexical(); self.dict.id(&slice[..]) } + + pub fn num_entries(&self) -> usize { + self.dict.num_entries() + } + + pub fn iter<'a>(&'a self) -> impl Iterator+'a+Clone { + self.dict.iter() + } + + pub fn into_iter(self) -> impl Iterator+Clone { + self.dict.into_iter() + } } +pub type StringDict = TypedDictSegment; + #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, FromPrimitive)] pub enum Datatype { String = 0, From e25f46b09a936065071199a0d1a3a649954a4e39 Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Tue, 29 Nov 2022 15:49:57 +0100 Subject: [PATCH 34/99] more work --- src/storage/delta.rs | 20 ++++++----- src/storage/file.rs | 8 ++--- src/structure/tfc/file.rs | 72 ++++++++++++++++++++++++++++++++++----- src/structure/util.rs | 12 ++++--- 4 files changed, 86 insertions(+), 26 deletions(-) diff --git a/src/storage/delta.rs b/src/storage/delta.rs index ad6e49ae..e93e80e1 100644 --- a/src/storage/delta.rs +++ b/src/storage/delta.rs @@ -1,5 +1,7 @@ use std::io; +use tfc::file::{merge_string_dictionaries, merge_typed_dictionaries}; + use crate::layer::builder::{build_indexes, TripleFileBuilder}; use crate::layer::*; use crate::storage::*; @@ -199,13 +201,13 @@ async fn dictionary_rollup_upto( .into_iter() .map(|l| l.value_dictionary()); - merge_dictionaries(node_dicts, files.node_dictionary_files.clone()).await?; - merge_dictionaries(predicate_dicts, files.predicate_dictionary_files.clone()).await?; - merge_dictionaries(value_dicts, files.value_dictionary_files.clone()).await?; + merge_string_dictionaries(node_dicts, files.node_dictionary_files.clone()).await?; + merge_string_dictionaries(predicate_dicts, files.predicate_dictionary_files.clone()).await?; + merge_typed_dictionaries(value_dicts, files.value_dictionary_files.clone()).await?; memory_construct_idmaps(layer, files.id_map_files.clone()).await } @@ -260,9 +262,9 @@ async fn memory_dictionary_rollup_upto( .into_iter() .map(|l| l.value_dictionary()); - merge_dictionaries(node_dicts, files.node_dictionary_files.clone()).await?; - merge_dictionaries(predicate_dicts, files.predicate_dictionary_files.clone()).await?; - merge_dictionaries(value_dicts, files.value_dictionary_files.clone()).await?; + merge_string_dictionaries(node_dicts, files.node_dictionary_files.clone()).await?; + merge_string_dictionaries(predicate_dicts, 
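The dictionary merge above leans on a pick function applied to the peeked head of each input iterator. A self-contained sketch of that selection step, simplified to plain integers (the real code compares `SizedDictEntry` values, but the shape is the same):

// Given the current head of every input, return the index of the smallest one.
fn pick_min<T: Ord>(heads: &[Option<&T>]) -> Option<usize> {
    heads
        .iter()
        .enumerate()
        .filter(|(_, head)| head.is_some())
        .min_by(|(_, x), (_, y)| x.cmp(y))
        .map(|(ix, _)| ix)
}

fn main() {
    let a = 3;
    let b = 1;
    let heads = [Some(&a), None, Some(&b)];
    assert_eq!(pick_min(&heads), Some(2)); // input 2 holds the smallest head
}

Because exhausted inputs are filtered out before comparison, the merge naturally drops inputs as they run dry while the remaining ones keep contributing in sorted order.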
files.predicate_dictionary_files.clone()).await?; + merge_typed_dictionaries(value_dicts, files.value_dictionary_files.clone()).await?; memory_construct_idmaps_upto(layer, upto, files.id_map_files.clone()).await } diff --git a/src/storage/file.rs b/src/storage/file.rs index dd077cd6..6b04a923 100644 --- a/src/storage/file.rs +++ b/src/storage/file.rs @@ -91,7 +91,7 @@ impl LayerFiles { pub struct BaseLayerFiles { pub node_dictionary_files: DictionaryFiles, pub predicate_dictionary_files: DictionaryFiles, - pub value_dictionary_files: DictionaryFiles, + pub value_dictionary_files: TypedDictionaryFiles, pub id_map_files: IdMapFiles, @@ -110,7 +110,7 @@ pub struct BaseLayerFiles { pub struct BaseLayerMaps { pub node_dictionary_maps: DictionaryMaps, pub predicate_dictionary_maps: DictionaryMaps, - pub value_dictionary_maps: DictionaryMaps, + pub value_dictionary_maps: TypedDictionaryMaps, pub id_map_maps: IdMapMaps, @@ -165,7 +165,7 @@ impl BaseLayerFiles { pub struct ChildLayerFiles { pub node_dictionary_files: DictionaryFiles, pub predicate_dictionary_files: DictionaryFiles, - pub value_dictionary_files: DictionaryFiles, + pub value_dictionary_files: TypedDictionaryFiles, pub id_map_files: IdMapFiles, @@ -189,7 +189,7 @@ pub struct ChildLayerFiles StringDictFileBuilder { } pub async fn merge_string_dictionaries< - 'a, + 'a, F: 'static + FileLoad + FileStore, - I: Iterator, + I: Iterator+'a, >( dictionaries: I, dict_files: DictionaryFiles, ) -> io::Result<()> { - let iterators: Vec<_> = dictionaries.map(|d| d.iter()).collect(); + let iterators: Vec<_> = dictionaries.map(|d|d.iter()).collect(); let pick_fn = |vals: &[Option<&SizedDictEntry>]| { vals.iter() @@ -41,10 +43,10 @@ pub async fn merge_string_dictionaries< .map(|(ix, _)| ix) }; - let sorted_iterator = sorted_iterator(iterators, pick_fn); + let sorted_iterator = sorted_iterator(iterators, pick_fn).map(|elt|elt.to_bytes()); - let blocks_file_writer = dict_files.blocks_file.open_write().await?; - let offsets_file_writer = dict_files.offsets_file.open_write().await?; + let mut blocks_file_writer = dict_files.blocks_file.open_write().await?; + let mut offsets_file_writer = dict_files.offsets_file.open_write().await?; let mut offsets = Vec::new(); let mut offsets_buf = BytesMut::new(); @@ -53,8 +55,62 @@ pub async fn merge_string_dictionaries< build_offset_logarray(&mut offsets_buf, offsets); + blocks_file_writer.write_all(data_buf.as_ref()).await?; + blocks_file_writer.flush().await?; + blocks_file_writer.sync_all().await?; + offsets_file_writer.write_all(offsets_buf.as_ref()).await?; + offsets_file_writer.flush().await?; + offsets_file_writer.sync_all().await?; + Ok(()) +} + +pub async fn merge_typed_dictionaries< + 'a, + F: 'static + FileLoad + FileStore, + I: Iterator+'a, +>( + dictionaries: I, + dict_files: TypedDictionaryFiles, +) -> io::Result<()> { + let iterators: Vec<_> = dictionaries.map(|d|d.iter()).collect(); + + let pick_fn = |vals: &[Option<&(Datatype, SizedDictEntry)>]| { + vals.iter() + .enumerate() + .filter(|(_, v)| v.is_some()) + .min_by(|(_, x), (_, y)| x.cmp(y)) + .map(|(ix, _)| ix) + }; + + let sorted_iterator = sorted_iterator(iterators, pick_fn).map(|(dt, elt)|(dt,elt.to_bytes())); + + let mut types_present_file_writer = dict_files.types_present_file.open_write().await?; + let mut type_offsets_file_writer = dict_files.type_offsets_file.open_write().await?; + let mut blocks_file_writer = dict_files.blocks_file.open_write().await?; + let mut offsets_file_writer = dict_files.offsets_file.open_write().await?; + + let 
mut types_present_buf = BytesMut::new(); + let mut type_offsets_buf = BytesMut::new(); + let mut offsets_buf = BytesMut::new(); + let mut data_buf = BytesMut::new(); + build_multiple_segments(&mut types_present_buf, &mut type_offsets_buf, &mut offsets_buf, &mut data_buf, sorted_iterator); + + types_present_file_writer.write_all(types_present_buf.as_ref()).await?; + types_present_file_writer.flush().await?; + types_present_file_writer.sync_all().await?; + + type_offsets_file_writer.write_all(type_offsets_buf.as_ref()).await?; + type_offsets_file_writer.flush().await?; + type_offsets_file_writer.sync_all().await?; + + blocks_file_writer.write_all(data_buf.as_ref()).await?; + blocks_file_writer.flush().await?; + blocks_file_writer.sync_all().await?; + + offsets_file_writer.write_all(offsets_buf.as_ref()).await?; + offsets_file_writer.flush().await?; + offsets_file_writer.sync_all().await?; - builder.add_all_entries(sorted_iterator).await?; - builder.finalize().await + Ok(()) } diff --git a/src/structure/util.rs b/src/structure/util.rs index 6bb0f721..d47f86d9 100644 --- a/src/structure/util.rs +++ b/src/structure/util.rs @@ -123,7 +123,7 @@ pub fn sorted_stream< struct SortedIterator< T, - I: 'static + Iterator + Send, + I: Iterator + Send, F: 'static + Fn(&[Option<&T>]) -> Option, > { iters: Vec>, @@ -131,8 +131,9 @@ struct SortedIterator< } impl< + 'a, T, - I: 'static + Iterator + Send, + I: 'a + Iterator + Send, F: 'static + Fn(&[Option<&T>]) -> Option, > Iterator for SortedIterator { @@ -154,13 +155,14 @@ impl< } pub fn sorted_iterator< - T, - I: 'static + Iterator + Send, + 'a, + T: 'a, + I: 'a + Iterator + Send, F: 'static + Fn(&[Option<&T>]) -> Option, >( iters: Vec, pick_fn: F, -) -> impl Iterator { +) -> impl Iterator+'a { let peekable_iters = iters .into_iter() .map(std::iter::Iterator::peekable) From 1982937408b3237a3d65215fdd50db2c7a5cb7f7 Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Tue, 29 Nov 2022 16:31:53 +0100 Subject: [PATCH 35/99] some builder logic around sizeddict --- src/structure/logarray.rs | 47 ++++++++++++++++- src/structure/tfc/dict.rs | 103 +++++++++++++++++++++++++++++++++++++- 2 files changed, 148 insertions(+), 2 deletions(-) diff --git a/src/structure/logarray.rs b/src/structure/logarray.rs index 895999fa..2fa7479e 100644 --- a/src/structure/logarray.rs +++ b/src/structure/logarray.rs @@ -49,7 +49,7 @@ //! //! 
* length: the number of elements in the log array
 
-use super::util;
+use super::util::{self, calculate_width};
 use crate::storage::*;
 use byteorder::{BigEndian, ByteOrder};
 use bytes::{BufMut, Bytes, BytesMut};
@@ -417,6 +417,51 @@ impl<'a, B: BufMut> LogArrayBufBuilder<'a, B> {
     }
 }
 
+pub struct LateLogArrayBufBuilder<'a, B: BufMut> {
+    /// Destination of the log array data
+    buf: &'a mut B,
+    vals: Vec<u64>,
+    width: u8
+}
+
+impl<'a, B: BufMut> LateLogArrayBufBuilder<'a, B> {
+    pub fn new(buf: &'a mut B) -> Self {
+        Self {
+            buf,
+            vals: Vec::new(),
+            width: 0
+        }
+    }
+
+    pub fn count(&self) -> u32 {
+        self.vals.len() as u32
+    }
+
+    pub fn push(&mut self, val: u64) {
+        self.vals.push(val);
+        let width = calculate_width(val);
+        if self.width < width {
+            self.width = width;
+        }
+    }
+
+    pub fn push_vec(&mut self, vals: Vec<u64>) {
+        for val in vals {
+            self.push(val)
+        }
+    }
+
+    pub fn pop(&mut self) -> Option<u64> {
+        self.vals.pop()
+    }
+
+    pub fn finalize(self) {
+        let mut builder = LogArrayBufBuilder::new(self.buf, self.width);
+        builder.push_vec(self.vals);
+        builder.finalize();
+    }
+}
+
 /// write a logarray directly to an AsyncWrite
 pub struct LogArrayFileBuilder {
     /// Destination of the log array data
diff --git a/src/structure/tfc/dict.rs b/src/structure/tfc/dict.rs
index c47b65f9..46abdce6 100644
--- a/src/structure/tfc/dict.rs
+++ b/src/structure/tfc/dict.rs
@@ -1,6 +1,6 @@
 use std::{borrow::Cow, cmp::Ordering};
 
-use crate::structure::{util::calculate_width, LogArrayBufBuilder, MonotonicLogArray};
+use crate::structure::{util::calculate_width, LogArrayBufBuilder, MonotonicLogArray, LateLogArrayBufBuilder};
 use bytes::{BufMut, Bytes};
 use itertools::Itertools;
 
@@ -23,6 +23,58 @@ pub fn build_dict_unchecked<B: BufMut, R: AsRef<[u8]>, I: Iterator<Item = R>>(
         offsets.push(offset);
     }
 }
+
+struct SizedDictBufBuilder<'a, B1:BufMut, B2:BufMut> {
+    block_offset: u64,
+    id_offset: u64,
+    offsets: LateLogArrayBufBuilder<'a, B2>,
+    data_buf: &'a mut B1,
+    current_block: Vec<Bytes>,
+}
+
+impl<'a, B1:BufMut, B2:BufMut> SizedDictBufBuilder<'a, B1, B2> {
+    pub fn new(block_offset: u64, id_offset: u64, offsets: LateLogArrayBufBuilder<'a, B2>, data_buf: &'a mut B1) -> Self {
+        Self {
+            block_offset,
+            id_offset, offsets, data_buf,
+            current_block: Vec::with_capacity(8)
+        }
+    }
+
+    pub fn add(&mut self, value: Bytes) -> u64 {
+        self.current_block.push(value);
+        self.id_offset += 1;
+        if self.current_block.len() == BLOCK_SIZE {
+            let current_block: Vec<&[u8]> = self.current_block.iter().map(|e|e.as_ref()).collect();
+            let size = build_block_unchecked(self.data_buf, &current_block);
+            self.block_offset += size as u64;
+            self.offsets.push(self.block_offset);
+
+            self.current_block.truncate(0);
+        }
+
+        self.id_offset
+    }
+
+    pub fn add_entry(&mut self, e: &SizedDictEntry) -> u64 {
+        self.add(e.to_bytes())
+    }
+
+    pub fn add_all<I: Iterator<Item = Bytes>>(&mut self, it: I) -> Vec<u64> {
+        it.map(|val| self.add(val)).collect()
+    }
+
+    pub fn finalize(mut self) -> LateLogArrayBufBuilder<'a, B2> {
+        if self.current_block.len() > 0 {
+            let current_block: Vec<&[u8]> = self.current_block.iter().map(|e|e.as_ref()).collect();
+            let size = build_block_unchecked(self.data_buf, &current_block);
+            self.offsets.push(self.block_offset + size as u64);
+        }
+
+        self.offsets
+    }
+}
+
 pub fn build_offset_logarray<B: BufMut>(buf: &mut B, mut offsets: Vec<u64>) {
     // the last offset doesn't matter as it's implied by the total size
     offsets.pop();
@@ -281,6 +333,55 @@ mod tests {
         }
     }
 
+    #[test]
+    fn build_dict_of_two_blocks_with_builder() {
+        let strings: Vec<&[u8]> = vec![
+            b"aaaaaaaa",
+            b"bbbbbbbb",
+            b"bbbcccdaaaa",
+            b"f",
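The point of the late builder added above is that the bit width is only fixed at finalize time, once every value has been seen. A sketch of that width computation (assuming, as `calculate_width` does elsewhere in this crate, that even the value 0 occupies one bit):

fn required_width(vals: &[u64]) -> u8 {
    vals.iter()
        .map(|&v| (64 - v.leading_zeros()).max(1) as u8)
        .max()
        .unwrap_or(1)
}

fn main() {
    // 1 needs 1 bit, 5 needs 3 bits, 100 needs 7 bits,
    // so the whole array must be stored at 7 bits per value.
    assert_eq!(required_width(&[1, 5, 100]), 7);
}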
b"fafasdfas", + b"gafovp", + b"gdfasfa", + b"gdfbbbbbb", + b"hello", + b"iguana", + b"illusion", + b"illustrated", + b"jetengine", + b"jetplane", + ]; + + let mut array_buf = BytesMut::new(); + let mut data_buf = BytesMut::new(); + + let logarray_builder = LateLogArrayBufBuilder::new(&mut array_buf); + + let mut builder = SizedDictBufBuilder::new(0, 0, logarray_builder, &mut data_buf); + builder.add_all(strings.clone().into_iter().map(|v|Bytes::from_static(v))); + let mut logarray_builder = builder.finalize(); + logarray_builder.pop(); + logarray_builder.finalize(); + + let array_bytes = array_buf.freeze(); + let data_bytes = data_buf.freeze(); + let dict = SizedDict::parse(array_bytes, data_bytes, 0); + + assert_eq!(2, dict.num_blocks()); + assert_eq!(b"aaaaaaaa", &dict.block_head(0)[..]); + assert_eq!(b"hello", &dict.block_head(1)[..]); + + let block0 = dict.block(0); + let block1 = dict.block(1); + + assert_eq!(8, block0.num_entries()); + assert_eq!(6, block1.num_entries()); + + for (ix, s) in strings.into_iter().enumerate() { + assert_eq!(s, &dict.entry((ix + 1) as u64).to_bytes()[..]); + } + } + #[test] fn lookup_entries_by_slice() { let strings: Vec<&[u8]> = vec![ From 26d655ed7626e9580f78396589a000d959533e8d Mon Sep 17 00:00:00 2001 From: Gavin Mendel-Gleason Date: Tue, 29 Nov 2022 20:58:43 +0100 Subject: [PATCH 36/99] Adding builder, doesn't work because of buf borrow --- src/structure/logarray.rs | 4 ++ src/structure/tfc/dict.rs | 21 ++++-- src/structure/tfc/typed.rs | 142 ++++++++++++++++++++++++++++++++++++- 3 files changed, 159 insertions(+), 8 deletions(-) diff --git a/src/structure/logarray.rs b/src/structure/logarray.rs index 2fa7479e..6fa51562 100644 --- a/src/structure/logarray.rs +++ b/src/structure/logarray.rs @@ -451,6 +451,10 @@ impl<'a, B: BufMut> LateLogArrayBufBuilder<'a, B> { } } + pub fn last(&mut self) -> Option { + self.vals.last().copied() + } + pub fn pop(&mut self) -> Option { self.vals.pop() } diff --git a/src/structure/tfc/dict.rs b/src/structure/tfc/dict.rs index 46abdce6..12c608ca 100644 --- a/src/structure/tfc/dict.rs +++ b/src/structure/tfc/dict.rs @@ -24,16 +24,16 @@ pub fn build_dict_unchecked, I: Iterator>( } } -struct SizedDictBufBuilder<'a, B1:BufMut, B2:BufMut> { +pub struct SizedDictBufBuilder<'a, B1:BufMut, B2:BufMut> { block_offset: u64, id_offset: u64, - offsets: LateLogArrayBufBuilder<'a, B2>, - data_buf: &'a mut B1, + offsets: LateLogArrayBufBuilder<'a, B1>, + data_buf: &'a mut B2, current_block: Vec, } impl<'a, B1:BufMut, B2:BufMut> SizedDictBufBuilder<'a, B1, B2> { - pub fn new(block_offset: u64, id_offset: u64, offsets: LateLogArrayBufBuilder<'a, B2>, data_buf: &'a mut B1) -> Self { + pub fn new(block_offset: u64, id_offset: u64, offsets: LateLogArrayBufBuilder<'a, B1>, data_buf: &'a mut B2) -> Self { Self { block_offset, id_offset, offsets, data_buf, @@ -41,6 +41,14 @@ impl<'a, B1:BufMut, B2:BufMut> SizedDictBufBuilder<'a, B1, B2> { } } + pub fn id_offset(&self) -> u64{ + self.id_offset + } + + pub fn block_offset(&self) -> u64{ + self.block_offset + } + pub fn add(&mut self, value: Bytes) -> u64 { self.current_block.push(value); self.id_offset += 1; @@ -64,11 +72,12 @@ impl<'a, B1:BufMut, B2:BufMut> SizedDictBufBuilder<'a, B1, B2> { it.map(|val| self.add(val)).collect() } - pub fn finalize(mut self) -> LateLogArrayBufBuilder<'a, B2> { + pub fn finalize(mut self) -> LateLogArrayBufBuilder<'a, B1> { if self.current_block.len() > 0 { let current_block: Vec<&[u8]> = self.current_block.iter().map(|e|e.as_ref()).collect(); let size = 
build_block_unchecked(self.data_buf, &current_block);
-            self.offsets.push(self.block_offset + size as u64);
+            self.block_offset += size as u64;
+            self.offsets.push(self.block_offset);
         }
 
         self.offsets
diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs
index e00f8a29..b904f931 100644
--- a/src/structure/tfc/typed.rs
+++ b/src/structure/tfc/typed.rs
@@ -1,5 +1,6 @@
 use crate::structure::{
-    tfc::block::BLOCK_SIZE, util::calculate_width, LogArrayBufBuilder, MonotonicLogArray,
+    tfc::block::BLOCK_SIZE, util::calculate_width, LateLogArrayBufBuilder, LogArrayBufBuilder,
+    MonotonicLogArray,
 };
 use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
 use bytes::{Buf, BufMut, Bytes, BytesMut};
@@ -12,7 +13,7 @@ use std::{borrow::Cow, marker::PhantomData};
 use super::{
     block::{IdLookupResult, SizedDictBlock, SizedDictEntry},
     decimal::{decimal_to_storage, storage_to_decimal},
-    dict::{build_dict_unchecked, build_offset_logarray, SizedDict},
+    dict::{build_dict_unchecked, build_offset_logarray, SizedDict, SizedDictBufBuilder},
     integer::{bigint_to_storage, storage_to_bigint},
 };
 
@@ -529,6 +530,79 @@ pub fn build_multiple_segments<
     type_offsets_builder.finalize();
 }
 
+struct TypedDictBufBuilder<'a, B1: BufMut, B2: BufMut, B3: BufMut, B4: BufMut> {
+    types_present_builder: LateLogArrayBufBuilder<'a, B1>,
+    type_offsets_builder: LateLogArrayBufBuilder<'a, B2>,
+    sized_dict_buf_builder: SizedDictBufBuilder<'a, B3, B4>,
+    data_buf: &'a mut B4,
+    current_datatype: Option<Datatype>,
+}
+
+impl<'a, B1: BufMut, B2: BufMut, B3: BufMut, B4: BufMut> TypedDictBufBuilder<'a, B1, B2, B3, B4> {
+    pub fn new(
+        used_types: &'a mut B1,
+        type_offsets: &'a mut B2,
+        block_offsets: &'a mut B3,
+        data_buf: &'a mut B4,
+    ) -> Self {
+        let types_present_builder = LateLogArrayBufBuilder::new(used_types);
+        let type_offsets_builder = LateLogArrayBufBuilder::new(type_offsets);
+        let block_offset_builder = LateLogArrayBufBuilder::new(block_offsets);
+        let sized_dict_buf_builder = SizedDictBufBuilder::new(0, 0, block_offset_builder, data_buf);
+        Self {
+            types_present_builder,
+            type_offsets_builder,
+            data_buf,
+            sized_dict_buf_builder,
+            current_datatype: None,
+        }
+    }
+
+    pub fn add(&mut self, dt: Datatype, value: Bytes) -> u64 {
+        if self.current_datatype == None {
+            self.current_datatype = Some(dt);
+            self.types_present_builder.push(dt as u64);
+        }
+
+        if self.current_datatype != Some(dt) {
+            let id_offset = self.sized_dict_buf_builder.id_offset();
+            let block_offset = self.sized_dict_buf_builder.block_offset();
+            let block_offset_builder = self.sized_dict_buf_builder.finalize();
+
+            self.types_present_builder.push(dt as u64);
+            self.type_offsets_builder.push(block_offset + 1);
+            self.sized_dict_buf_builder = SizedDictBufBuilder::new(
+                id_offset,
+                block_offset,
+                block_offset_builder,
+                self.data_buf,
+            );
+        }
+
+        self.sized_dict_buf_builder.add(value)
+    }
+
+    pub fn add_entry(&mut self, dt: Datatype, e: &SizedDictEntry) -> u64 {
+        self.add(dt, e.to_bytes())
+    }
+
+    pub fn add_all<I: Iterator<Item = (Datatype, Bytes)>>(&mut self, it: I) -> Vec<u64> {
+        it.map(|(dt, val)| self.add(dt, val)).collect()
+    }
+
+    pub fn finalize(mut self) {
+        if self.current_datatype == None {
+            panic!("There was nothing added to this dictionary!");
+        }
+        let block_offset_builder = self.sized_dict_buf_builder.finalize();
+        block_offset_builder.pop();
+        block_offset_builder.finalize();
+
+        self.types_present_builder.finalize();
+        self.type_offsets_builder.finalize();
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use crate::structure::tfc::dict::build_offset_logarray;
@@ -900,4 +974,68 @@ mod tests {
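What `add` above maintains can be modeled without the buffer machinery: one entry in `types_present` per run of equal datatypes, plus the position at which each run begins. A self-contained model with datatypes and offsets reduced to plain numbers (the patch's actual encoding differs in detail — the first type's offset is implicit and stored offsets are shifted by one — but the run structure is the same):

// For a sorted (datatype, block_offset) sequence, record each datatype once
// together with the offset at which its run of blocks starts.
fn type_runs(entries: &[(u64, u64)]) -> (Vec<u64>, Vec<u64>) {
    let mut types_present = Vec::new();
    let mut type_offsets = Vec::new();
    for &(dt, offset) in entries {
        if types_present.last() != Some(&dt) {
            types_present.push(dt);
            type_offsets.push(offset);
        }
    }
    (types_present, type_offsets)
}

fn main() {
    // three string blocks (type 0), then two integer blocks (type 1)
    let entries = [(0, 0), (0, 1), (0, 2), (1, 3), (1, 4)];
    assert_eq!(type_runs(&entries), (vec![0, 1], vec![0, 3]));
}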
assert_eq!(vec, actual); } + + fn test_incremental_builder() { + let mut vec: Vec<(Datatype, Bytes)> = vec![ + "fdsa".to_string().make_entry(), + "a".to_string().make_entry(), + "bc".to_string().make_entry(), + "bcd".to_string().make_entry(), + "z".to_string().make_entry(), + "Batty".to_string().make_entry(), + "Batman".to_string().make_entry(), + "apple".to_string().make_entry(), + (-500_i32).make_entry(), + 20_u32.make_entry(), + 22_u32.make_entry(), + 23_u32.make_entry(), + 24_u32.make_entry(), + 25_u32.make_entry(), + 26_u32.make_entry(), + 27_u32.make_entry(), + 28_u32.make_entry(), + 3000_u32.make_entry(), + (-3_i64).make_entry(), + Decimal("-12342343.2348973".to_string()).make_entry(), + Decimal("234.8973".to_string()).make_entry(), + Decimal("0.2348973".to_string()).make_entry(), + Decimal("23423423.8973".to_string()).make_entry(), + Decimal("3.3".to_string()).make_entry(), + Decimal("0.001".to_string()).make_entry(), + Decimal("-0.001".to_string()).make_entry(), + Decimal("2".to_string()).make_entry(), + Decimal("0".to_string()).make_entry(), + 4.389832_f32.make_entry(), + 23434.389832_f32.make_entry(), + int("239487329872343987").make_entry(), + ]; + vec.sort(); + + let mut used_types_buf = BytesMut::new(); + let mut type_offsets_buf = BytesMut::new(); + let mut block_offsets_buf = BytesMut::new(); + let mut data_buf = BytesMut::new(); + + let typed_builder = TypedDictBufBuilder::new( + &mut used_types_buf, + &mut type_offsets_buf, + &mut block_offsets_buf, + &mut data_buf, + ); + + vec.into_iter() + .map(|(dt, entry)| typed_builder.add(dt, entry)); + + typed_builder.finalize(); + + let used_types = used_types_buf.freeze(); + let type_offsets = type_offsets_buf.freeze(); + let block_offsets = block_offsets_buf.freeze(); + let data = data_buf.freeze(); + + let dict = TypedDict::from_parts(used_types, type_offsets, block_offsets, data); + + let res = dict.entry(0); + eprintln!("res: {res:?}"); + } } From b6f8df81979d3f3c571f7d27ae6a6c7bb703f4b8 Mon Sep 17 00:00:00 2001 From: Gavin Mendel-Gleason Date: Tue, 29 Nov 2022 21:20:02 +0100 Subject: [PATCH 37/99] Annoyed about a move --- src/structure/tfc/dict.rs | 47 +++++++++++++++++++++++--------------- src/structure/tfc/typed.rs | 23 ++++++++----------- 2 files changed, 37 insertions(+), 33 deletions(-) diff --git a/src/structure/tfc/dict.rs b/src/structure/tfc/dict.rs index 12c608ca..a102318d 100644 --- a/src/structure/tfc/dict.rs +++ b/src/structure/tfc/dict.rs @@ -1,6 +1,8 @@ use std::{borrow::Cow, cmp::Ordering}; -use crate::structure::{util::calculate_width, LogArrayBufBuilder, MonotonicLogArray, LateLogArrayBufBuilder}; +use crate::structure::{ + util::calculate_width, LateLogArrayBufBuilder, LogArrayBufBuilder, MonotonicLogArray, +}; use bytes::{BufMut, Bytes}; use itertools::Itertools; @@ -24,28 +26,35 @@ pub fn build_dict_unchecked, I: Iterator>( } } -pub struct SizedDictBufBuilder<'a, B1:BufMut, B2:BufMut> { +pub struct SizedDictBufBuilder<'a, B1: BufMut, B2: BufMut> { block_offset: u64, id_offset: u64, offsets: LateLogArrayBufBuilder<'a, B1>, - data_buf: &'a mut B2, + data_buf: B2, current_block: Vec, } -impl<'a, B1:BufMut, B2:BufMut> SizedDictBufBuilder<'a, B1, B2> { - pub fn new(block_offset: u64, id_offset: u64, offsets: LateLogArrayBufBuilder<'a, B1>, data_buf: &'a mut B2) -> Self { +impl<'a, B1: BufMut, B2: BufMut> SizedDictBufBuilder<'a, B1, B2> { + pub fn new( + block_offset: u64, + id_offset: u64, + offsets: LateLogArrayBufBuilder<'a, B1>, + data_buf: B2, + ) -> Self { Self { block_offset, - id_offset, offsets, 
data_buf,
-            current_block: Vec::with_capacity(8)
+            id_offset,
+            offsets,
+            data_buf,
+            current_block: Vec::with_capacity(8),
         }
     }
 
-    pub fn id_offset(&self) -> u64{
+    pub fn id_offset(&self) -> u64 {
         self.id_offset
     }
 
-    pub fn block_offset(&self) -> u64{
+    pub fn block_offset(&self) -> u64 {
         self.block_offset
     }
 
     pub fn add(&mut self, value: Bytes) -> u64 {
         self.current_block.push(value);
         self.id_offset += 1;
         if self.current_block.len() == BLOCK_SIZE {
-            let current_block: Vec<&[u8]> = self.current_block.iter().map(|e|e.as_ref()).collect();
-            let size = build_block_unchecked(self.data_buf, &current_block);
+            let current_block: Vec<&[u8]> = self.current_block.iter().map(|e| e.as_ref()).collect();
+            let size = build_block_unchecked(&mut self.data_buf, &current_block);
             self.block_offset += size as u64;
             self.offsets.push(self.block_offset);
 
@@ -81,15 +90,20 @@ impl<'a, B1: BufMut, B2: BufMut> SizedDictBufBuilder<'a, B1, B2> {
         it.map(|val| self.add(val)).collect()
     }
 
-    pub fn finalize(mut self) -> LateLogArrayBufBuilder<'a, B1> {
+    pub fn finalize(mut self) -> (LateLogArrayBufBuilder<'a, B1>, B2) {
         if self.current_block.len() > 0 {
-            let current_block: Vec<&[u8]> = self.current_block.iter().map(|e|e.as_ref()).collect();
-            let size = build_block_unchecked(self.data_buf, &current_block);
+            let current_block: Vec<&[u8]> = self.current_block.iter().map(|e| e.as_ref()).collect();
+            let size = build_block_unchecked(&mut self.data_buf, &current_block);
             self.block_offset += size as u64;
             self.offsets.push(self.block_offset);
         }
 
-        self.offsets
+        (self.offsets, self.data_buf)
     }
 }
 
@@ -362,13 +371,13 @@ mod tests {
         ];
 
         let mut array_buf = BytesMut::new();
-        let mut data_buf = BytesMut::new();
+        let data_buf = BytesMut::new();
 
         let logarray_builder = LateLogArrayBufBuilder::new(&mut array_buf);
 
-        let mut builder = SizedDictBufBuilder::new(0, 0, logarray_builder, &mut data_buf);
+        let mut builder = SizedDictBufBuilder::new(0, 0, logarray_builder, data_buf);
         builder.add_all(strings.clone().into_iter().map(|v| Bytes::from_static(v)));
-        let mut logarray_builder = builder.finalize();
+        let (mut logarray_builder, data_buf) = builder.finalize();
         logarray_builder.pop();
         logarray_builder.finalize();
 
diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs
index b904f931..fc0607e3 100644
--- a/src/structure/tfc/typed.rs
+++ b/src/structure/tfc/typed.rs
@@ -534,7 +534,6 @@ struct TypedDictBufBuilder<'a, B1: BufMut, B2: BufMut, B3: BufMut, B4: BufMut> {
     types_present_builder: LateLogArrayBufBuilder<'a, B1>,
     type_offsets_builder: LateLogArrayBufBuilder<'a, B2>,
     sized_dict_buf_builder: SizedDictBufBuilder<'a, B3, B4>,
-    data_buf: &'a mut B4,
     current_datatype: Option<Datatype>,
 }
 
@@ -543,7 +542,7 @@ impl<'a, B1: BufMut, B2: BufMut, B3: BufMut, B4: BufMut> TypedDictBufBuilder<'a,
         used_types: &'a mut B1,
         type_offsets: &'a mut B2,
         block_offsets: &'a mut B3,
-        data_buf: &'a mut B4,
+        data_buf: B4,
     ) -> Self {
         let types_present_builder = LateLogArrayBufBuilder::new(used_types);
         let type_offsets_builder = LateLogArrayBufBuilder::new(type_offsets);
@@ -552,7 +551,6 @@ impl<'a, B1: BufMut, B2: BufMut, B3: BufMut, B4: BufMut> TypedDictBufBuilder<'a,
         Self {
             types_present_builder,
             type_offsets_builder,
-            data_buf,
             sized_dict_buf_builder,
             current_datatype: None,
         }
     }
@@ -567,16 +565,12 @@ impl<'a, B1: BufMut, B2: BufMut, B3: BufMut, B4: BufMut> TypedDictBufBuilder<'a,
         if self.current_datatype != Some(dt) {
             let id_offset =
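The fix in this commit is an ownership change: the data buffer now moves into the builder by value and is handed back from `finalize`, instead of being mutably borrowed for the builder's whole lifetime. The shape of that API reduced to a toy (all names illustrative):

struct Builder<B> {
    buf: B, // owned, not borrowed
}

impl<B: Extend<u8>> Builder<B> {
    fn push(&mut self, byte: u8) {
        self.buf.extend(std::iter::once(byte));
    }
    fn finalize(self) -> B {
        self.buf // the buffer is returned to the caller
    }
}

fn main() {
    let mut b = Builder { buf: Vec::new() };
    b.push(1);
    let buf = b.finalize(); // caller regains ownership and can keep writing
    assert_eq!(buf, vec![1]);
}

Owning the buffer means the builder can be stored in another struct without a lifetime parameter tying it to the caller's stack frame, which is exactly the borrow problem the next commits wrestle with.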
self.sized_dict_buf_builder.id_offset(); let block_offset = self.sized_dict_buf_builder.block_offset(); - let block_offset_builder = self.sized_dict_buf_builder.finalize(); + let (block_offset_builder, data_buf) = self.sized_dict_buf_builder.finalize(); self.types_present_builder.push(dt as u64); self.type_offsets_builder.push(block_offset + 1); - self.sized_dict_buf_builder = SizedDictBufBuilder::new( - id_offset, - block_offset, - block_offset_builder, - self.data_buf, - ); + self.sized_dict_buf_builder = + SizedDictBufBuilder::new(id_offset, block_offset, block_offset_builder, data_buf); } self.sized_dict_buf_builder.add(value) @@ -590,16 +584,17 @@ impl<'a, B1: BufMut, B2: BufMut, B3: BufMut, B4: BufMut> TypedDictBufBuilder<'a, it.map(|(dt, val)| self.add(dt, val)).collect() } - pub fn finalize(mut self) { + pub fn finalize(self) -> B4 { if self.current_datatype == None { panic!("There was nothing added to this dictionary!"); } - let block_offset_builder = self.sized_dict_buf_builder.finalize(); + let (mut block_offset_builder, mut data_buf) = self.sized_dict_buf_builder.finalize(); block_offset_builder.pop(); block_offset_builder.finalize(); self.types_present_builder.finalize(); self.type_offsets_builder.finalize(); + data_buf } } @@ -1020,13 +1015,13 @@ mod tests { &mut used_types_buf, &mut type_offsets_buf, &mut block_offsets_buf, - &mut data_buf, + data_buf, ); vec.into_iter() .map(|(dt, entry)| typed_builder.add(dt, entry)); - typed_builder.finalize(); + let data_buf = typed_builder.finalize(); let used_types = used_types_buf.freeze(); let type_offsets = type_offsets_buf.freeze(); From 265121574e9b1a763d2bc45a55c276ace7d22a4f Mon Sep 17 00:00:00 2001 From: Gavin Mendel-Gleason Date: Tue, 29 Nov 2022 23:54:37 +0100 Subject: [PATCH 38/99] Fix borrow issues by adding option --- src/structure/tfc/dict.rs | 11 +++++--- src/structure/tfc/typed.rs | 54 +++++++++++++++++++++++++++----------- 2 files changed, 47 insertions(+), 18 deletions(-) diff --git a/src/structure/tfc/dict.rs b/src/structure/tfc/dict.rs index a102318d..d9c981c1 100644 --- a/src/structure/tfc/dict.rs +++ b/src/structure/tfc/dict.rs @@ -81,7 +81,7 @@ impl<'a, B1: BufMut, B2: BufMut> SizedDictBufBuilder<'a, B1, B2> { it.map(|val| self.add(val)).collect() } - pub fn finalize(mut self) -> (LateLogArrayBufBuilder<'a, B1>, B2) { + pub fn finalize(mut self) -> (LateLogArrayBufBuilder<'a, B1>, B2, u64, u64) { if self.current_block.len() > 0 { let current_block: Vec<&[u8]> = self.current_block.iter().map(|e| e.as_ref()).collect(); let size = build_block_unchecked(&mut self.data_buf, ¤t_block); @@ -89,7 +89,12 @@ impl<'a, B1: BufMut, B2: BufMut> SizedDictBufBuilder<'a, B1, B2> { self.offsets.push(self.block_offset); } - (self.offsets, self.data_buf) + ( + self.offsets, + self.data_buf, + self.block_offset, + self.id_offset, + ) } } @@ -377,7 +382,7 @@ mod tests { let mut builder = SizedDictBufBuilder::new(0, 0, logarray_builder, data_buf); builder.add_all(strings.clone().into_iter().map(|v| Bytes::from_static(v))); - let (mut logarray_builder, data_buf) = builder.finalize(); + let (mut logarray_builder, data_buf, _, _) = builder.finalize(); logarray_builder.pop(); logarray_builder.finalize(); diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index fc0607e3..ba57540a 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -533,7 +533,7 @@ pub fn build_multiple_segments< struct TypedDictBufBuilder<'a, B1: BufMut, B2: BufMut, B3: BufMut, B4: BufMut> { types_present_builder: 
LateLogArrayBufBuilder<'a, B1>, type_offsets_builder: LateLogArrayBufBuilder<'a, B2>, - sized_dict_buf_builder: SizedDictBufBuilder<'a, B3, B4>, + sized_dict_buf_builder: Option>, current_datatype: Option, } @@ -547,7 +547,12 @@ impl<'a, B1: BufMut, B2: BufMut, B3: BufMut, B4: BufMut> TypedDictBufBuilder<'a, let types_present_builder = LateLogArrayBufBuilder::new(used_types); let type_offsets_builder = LateLogArrayBufBuilder::new(type_offsets); let block_offset_builder = LateLogArrayBufBuilder::new(block_offsets); - let sized_dict_buf_builder = SizedDictBufBuilder::new(0, 0, block_offset_builder, data_buf); + let sized_dict_buf_builder = Some(SizedDictBufBuilder::new( + 0, + 0, + block_offset_builder, + data_buf, + )); Self { types_present_builder, type_offsets_builder, @@ -558,22 +563,30 @@ impl<'a, B1: BufMut, B2: BufMut, B3: BufMut, B4: BufMut> TypedDictBufBuilder<'a, pub fn add(&mut self, dt: Datatype, value: Bytes) -> u64 { if self.current_datatype == None { - self.current_datatype = Some(dt); + self.current_datatype = dbg!(Some(dt)); self.types_present_builder.push(dt as u64); } if self.current_datatype != Some(dt) { - let id_offset = self.sized_dict_buf_builder.id_offset(); - let block_offset = self.sized_dict_buf_builder.block_offset(); - let (block_offset_builder, data_buf) = self.sized_dict_buf_builder.finalize(); - + let (block_offset_builder, data_buf, block_offset, id_offset) = + self.sized_dict_buf_builder.take().unwrap().finalize(); + dbg!(dt); + dbg!(id_offset); + dbg!(block_offset); self.types_present_builder.push(dt as u64); self.type_offsets_builder.push(block_offset + 1); - self.sized_dict_buf_builder = - SizedDictBufBuilder::new(id_offset, block_offset, block_offset_builder, data_buf); + self.sized_dict_buf_builder = Some(SizedDictBufBuilder::new( + block_offset, + id_offset, + block_offset_builder, + data_buf, + )); } - self.sized_dict_buf_builder.add(value) + self.sized_dict_buf_builder + .as_mut() + .map(|s| s.add(value)) + .unwrap() } pub fn add_entry(&mut self, dt: Datatype, e: &SizedDictEntry) -> u64 { @@ -588,7 +601,8 @@ impl<'a, B1: BufMut, B2: BufMut, B3: BufMut, B4: BufMut> TypedDictBufBuilder<'a, if self.current_datatype == None { panic!("There was nothing added to this dictionary!"); } - let (mut block_offset_builder, mut data_buf) = self.sized_dict_buf_builder.finalize(); + let (mut block_offset_builder, data_buf, _, _) = + self.sized_dict_buf_builder.unwrap().finalize(); block_offset_builder.pop(); block_offset_builder.finalize(); @@ -970,6 +984,7 @@ mod tests { assert_eq!(vec, actual); } + #[test] fn test_incremental_builder() { let mut vec: Vec<(Datatype, Bytes)> = vec![ "fdsa".to_string().make_entry(), @@ -1009,17 +1024,22 @@ mod tests { let mut used_types_buf = BytesMut::new(); let mut type_offsets_buf = BytesMut::new(); let mut block_offsets_buf = BytesMut::new(); - let mut data_buf = BytesMut::new(); + let data_buf = BytesMut::new(); - let typed_builder = TypedDictBufBuilder::new( + let mut typed_builder = TypedDictBufBuilder::new( &mut used_types_buf, &mut type_offsets_buf, &mut block_offsets_buf, data_buf, ); - vec.into_iter() - .map(|(dt, entry)| typed_builder.add(dt, entry)); + let results: Vec = vec + .into_iter() + .map(|(dt, entry)| { + eprintln!("dt: {dt:?}"); + dbg!(typed_builder.add(dt, entry)) + }) + .collect(); let data_buf = typed_builder.finalize(); @@ -1032,5 +1052,9 @@ mod tests { let res = dict.entry(0); eprintln!("res: {res:?}"); + + let res = dict.entry(1); + eprintln!("res: {res:?}"); + panic!(); } } From 
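Wrapping the inner builder in `Option` enables the standard take-and-replace move used above: `finalize` consumes the builder by value, which a plain field never allows behind `&mut self`. A minimal sketch of the pattern (the `Vec<u8>` stands in for the by-value inner builder):

struct Outer {
    inner: Option<Vec<u8>>,
}

impl Outer {
    fn rotate(&mut self) -> usize {
        let inner = self.inner.take().unwrap(); // move the builder out
        let len = inner.len(); // consume it by value (finalize, here: len)
        self.inner = Some(Vec::new()); // install a fresh one
        len
    }
}

fn main() {
    let mut outer = Outer { inner: Some(vec![1, 2, 3]) };
    assert_eq!(outer.rotate(), 3);
    assert_eq!(outer.inner.as_ref().unwrap().len(), 0);
}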
102900c76dc6ad9745d1264f61b942b9d4f99676 Mon Sep 17 00:00:00 2001 From: Gavin Mendel-Gleason Date: Wed, 30 Nov 2022 01:13:18 +0100 Subject: [PATCH 39/99] Almost working --- src/structure/logarray.rs | 3 ++- src/structure/tfc/typed.rs | 11 +++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/structure/logarray.rs b/src/structure/logarray.rs index 6fa51562..18725890 100644 --- a/src/structure/logarray.rs +++ b/src/structure/logarray.rs @@ -420,7 +420,8 @@ impl<'a, B: BufMut> LogArrayBufBuilder<'a, B> { pub struct LateLogArrayBufBuilder<'a, B: BufMut> { /// Destination of the log array data buf: &'a mut B, - vals: Vec, + /// NOTE: remove pub + pub vals: Vec, width: u8 } diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index ba57540a..f273f1e7 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -45,6 +45,7 @@ impl TypedDict { if type_offset == 0 { last_block_len = data[0]; } else { + eprintln!("type_offset: {type_offset}"); let last_block_offset_of_previous_type = block_offsets.entry(type_offset as usize - 1); last_block_len = data[last_block_offset_of_previous_type as usize]; @@ -573,14 +574,19 @@ impl<'a, B1: BufMut, B2: BufMut, B3: BufMut, B4: BufMut> TypedDictBufBuilder<'a, dbg!(dt); dbg!(id_offset); dbg!(block_offset); + + dbg!(&self.types_present_builder.vals); + dbg!(&self.type_offsets_builder.vals); self.types_present_builder.push(dt as u64); - self.type_offsets_builder.push(block_offset + 1); + self.type_offsets_builder + .push(block_offset_builder.count() as u64 - 1); self.sized_dict_buf_builder = Some(SizedDictBufBuilder::new( block_offset, id_offset, block_offset_builder, data_buf, )); + self.current_datatype = Some(dt); } self.sized_dict_buf_builder @@ -608,6 +614,7 @@ impl<'a, B1: BufMut, B2: BufMut, B3: BufMut, B4: BufMut> TypedDictBufBuilder<'a, self.types_present_builder.finalize(); self.type_offsets_builder.finalize(); + data_buf } } @@ -1049,7 +1056,7 @@ mod tests { let data = data_buf.freeze(); let dict = TypedDict::from_parts(used_types, type_offsets, block_offsets, data); - + eprintln!("dict: {dict:?}"); let res = dict.entry(0); eprintln!("res: {res:?}"); From 724d668913b6ed3b5b14d569c73824bac1703c3d Mon Sep 17 00:00:00 2001 From: Gavin Mendel-Gleason Date: Wed, 30 Nov 2022 01:17:50 +0100 Subject: [PATCH 40/99] Working --- src/structure/tfc/typed.rs | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index f273f1e7..6600819b 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -1056,12 +1056,20 @@ mod tests { let data = data_buf.freeze(); let dict = TypedDict::from_parts(used_types, type_offsets, block_offsets, data); - eprintln!("dict: {dict:?}"); - let res = dict.entry(0); - eprintln!("res: {res:?}"); - - let res = dict.entry(1); - eprintln!("res: {res:?}"); + assert_eq!( + dict.entry(1), + ( + Datatype::String, + SizedDictEntry(vec![Bytes::from_static(b"Batman")]) + ) + ); + assert_eq!( + dict.entry(2), + ( + Datatype::String, + SizedDictEntry(vec![Bytes::from_static(b"Batty")]) + ) + ); panic!(); } } From 51744838ca0b70ce34f74788f913cac1f917fad2 Mon Sep 17 00:00:00 2001 From: Gavin Mendel-Gleason Date: Wed, 30 Nov 2022 09:04:36 +0100 Subject: [PATCH 41/99] Remove debug prints --- src/structure/tfc/typed.rs | 39 +++++++++++--------------------------- 1 file changed, 11 insertions(+), 28 deletions(-) diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index 
6600819b..a9877433 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -45,7 +45,6 @@ impl TypedDict { if type_offset == 0 { last_block_len = data[0]; } else { - eprintln!("type_offset: {type_offset}"); let last_block_offset_of_previous_type = block_offsets.entry(type_offset as usize - 1); last_block_len = data[last_block_offset_of_previous_type as usize]; @@ -509,7 +508,6 @@ pub fn build_multiple_segments< } build_offset_logarray(block_offsets_buf, offsets); - eprintln!("types: {types:?}"); let largest_type = types.last().unwrap(); let largest_type_offset = type_offsets.last().unwrap(); @@ -564,19 +562,13 @@ impl<'a, B1: BufMut, B2: BufMut, B3: BufMut, B4: BufMut> TypedDictBufBuilder<'a, pub fn add(&mut self, dt: Datatype, value: Bytes) -> u64 { if self.current_datatype == None { - self.current_datatype = dbg!(Some(dt)); + self.current_datatype = Some(dt); self.types_present_builder.push(dt as u64); } if self.current_datatype != Some(dt) { let (block_offset_builder, data_buf, block_offset, id_offset) = self.sized_dict_buf_builder.take().unwrap().finalize(); - dbg!(dt); - dbg!(id_offset); - dbg!(block_offset); - - dbg!(&self.types_present_builder.vals); - dbg!(&self.type_offsets_builder.vals); self.types_present_builder.push(dt as u64); self.type_offsets_builder .push(block_offset_builder.count() as u64 - 1); @@ -991,6 +983,10 @@ mod tests { assert_eq!(vec, actual); } + fn convert_entry(e: (Datatype, SizedDictEntry)) -> (Datatype, Bytes) { + (e.0, e.1.to_bytes()) + } + #[test] fn test_incremental_builder() { let mut vec: Vec<(Datatype, Bytes)> = vec![ @@ -1041,11 +1037,9 @@ mod tests { ); let results: Vec = vec + .clone() .into_iter() - .map(|(dt, entry)| { - eprintln!("dt: {dt:?}"); - dbg!(typed_builder.add(dt, entry)) - }) + .map(|(dt, entry)| typed_builder.add(dt, entry)) .collect(); let data_buf = typed_builder.finalize(); @@ -1056,20 +1050,9 @@ mod tests { let data = data_buf.freeze(); let dict = TypedDict::from_parts(used_types, type_offsets, block_offsets, data); - assert_eq!( - dict.entry(1), - ( - Datatype::String, - SizedDictEntry(vec![Bytes::from_static(b"Batman")]) - ) - ); - assert_eq!( - dict.entry(2), - ( - Datatype::String, - SizedDictEntry(vec![Bytes::from_static(b"Batty")]) - ) - ); - panic!(); + + for i in 0..vec.len() { + assert_eq!(vec[i], convert_entry(dict.entry(i as u64 + 1))) + } } } From df22829b4cf4bb5364b9a0bbe5b9bcc097221cfc Mon Sep 17 00:00:00 2001 From: Gavin Mendel-Gleason Date: Wed, 30 Nov 2022 11:15:22 +0100 Subject: [PATCH 42/99] Adding suffixless blocks --- src/structure/tfc/block.rs | 95 ++++++++++++++++++++++++++++++++------ src/structure/tfc/dict.rs | 14 ++++-- src/structure/tfc/typed.rs | 59 +++++++++++++++++++---- 3 files changed, 139 insertions(+), 29 deletions(-) diff --git a/src/structure/tfc/block.rs b/src/structure/tfc/block.rs index 5b35f58b..9b6b337f 100644 --- a/src/structure/tfc/block.rs +++ b/src/structure/tfc/block.rs @@ -37,8 +37,8 @@ impl From for SizedDictError { impl SizedBlockHeader { fn parse(buf: &mut Bytes) -> Result { - let num_entries = buf.get_u8(); - + let cw = buf.get_u8(); + let (record_size, num_entries) = parse_block_control_word(cw); let mut sizes = [0_usize; BLOCK_SIZE - 1]; let mut shareds = [0_usize; BLOCK_SIZE - 1]; let (first_size, _) = vbyte::decode_buf(buf)?; @@ -47,8 +47,12 @@ impl SizedBlockHeader { for i in 0..(num_entries - 1) as usize { let (shared, _) = vbyte::decode_buf(buf)?; - let (size, _) = vbyte::decode_buf(buf)?; - + let size = if record_size == None { + let (size, _) = 
vbyte::decode_buf(buf)?; + size + } else { + record_size.unwrap() as u64 - shared + }; sizes[i] = size as usize; shareds[i] = shared as usize; } @@ -533,7 +537,7 @@ impl<'a> Iterator for SizedBlockIterator<'a> { if self.ix >= self.header.num_entries as usize - 1 { return None; } - let size = self.header.sizes[self.ix]; + let size = dbg!(self.header.sizes[self.ix]); let mut shared = self.header.shareds[self.ix]; for rope_index in 0..last.len() { let x = &mut last[rope_index]; @@ -585,12 +589,60 @@ impl IdLookupResult { } } -pub(crate) fn build_block_unchecked(buf: &mut B, slices: &[&[u8]]) -> usize { +pub fn parse_block_control_records(cw: u8) -> u8 { + parse_block_control_word(cw).1 +} + +pub fn parse_block_control_word(cw: u8) -> (Option, u8) { + let records = (cw & ((1 << 3) - 1)) + 1; + let record_size = record_size_decoding(cw); + (record_size, records) +} + +// None => 0 +// Some(1) => 1 +// Some(2) => 2 +// +// Some(4) => 3 +// Some(8) => 4 + +// MSB = 1, fake ids block. +// +// id => byte = id - id_offset +// two more bits + +fn record_size_decoding(enc: u8) -> Option { + match enc >> 3 { + 0 => None, + 3 => Some(4), + 4 => Some(8), + _ => panic!("Ok, this is not known"), + } +} + +fn record_size_encoding(record_size: Option) -> u8 { + match record_size { + None => 0, + Some(4) => 3 << 3, + Some(8) => 4 << 3, + _ => panic!("This is really bad!"), + } +} + +fn create_block_control_word(record_size: Option, records: u8) -> u8 { + records - 1 + record_size_encoding(record_size) +} + +pub(crate) fn build_block_unchecked( + record_size: Option, + buf: &mut B, + slices: &[&[u8]], +) -> usize { let mut size = 0; let slices_len = slices.len(); debug_assert!(slices_len <= BLOCK_SIZE && slices_len != 0); - - buf.put_u8(slices_len as u8); + let cw = dbg!(create_block_control_word(record_size, slices_len as u8)); + buf.put_u8(cw as u8); size += 1; let first = slices[0]; @@ -611,11 +663,14 @@ pub(crate) fn build_block_unchecked(buf: &mut B, slices: &[&[u8]]) -> buf.put_slice(&vbyte[..vbyte_len]); size += vbyte_len; - let suffix_len = cur.len() - common_prefix; - let (vbyte, vbyte_len) = encode_array(suffix_len as u64); - buf.put_slice(&vbyte[..vbyte_len]); - size += vbyte_len; - + if record_size == None { + let suffix_len = cur.len() - common_prefix; + let (vbyte, vbyte_len) = encode_array(suffix_len as u64); + buf.put_slice(&vbyte[..vbyte_len]); + size += vbyte_len; + } else { + eprintln!("Fixed width: {record_size:?}"); + } suffixes.push(&cur[common_prefix..]); last = cur; } @@ -642,7 +697,7 @@ mod tests { fn build_block_bytes(strings: &[&[u8]]) -> Bytes { let mut buf = BytesMut::new(); - build_block_unchecked(&mut buf, &strings); + build_block_unchecked(None, &mut buf, &strings); buf.freeze() } @@ -883,4 +938,16 @@ mod tests { result ); } + + #[test] + fn control_word_round_trip() { + let cw = create_block_control_word(None, 3); + assert_eq!(parse_block_control_word(cw), (None, 3)); + + let cw = create_block_control_word(Some(8), 5); + assert_eq!(parse_block_control_word(cw), (Some(8), 5)); + + let cw = create_block_control_word(Some(12), 6); + assert_eq!(parse_block_control_word(cw), (Some(12), 6)) + } } diff --git a/src/structure/tfc/dict.rs b/src/structure/tfc/dict.rs index d9c981c1..557641e7 100644 --- a/src/structure/tfc/dict.rs +++ b/src/structure/tfc/dict.rs @@ -9,6 +9,7 @@ use itertools::Itertools; use super::block::*; pub fn build_dict_unchecked, I: Iterator>( + record_size: Option, start_offset: u64, offsets: &mut Vec, data_buf: &mut B, @@ -20,13 +21,14 @@ pub fn 
build_dict_unchecked<B: BufMut, R: AsRef<[u8]>, I: Iterator<Item = R>>( for chunk in &chunk_iter { let slices: Vec<R> = chunk.collect(); let borrows: Vec<&[u8]> = slices.iter().map(|s| s.as_ref()).collect(); - let size = build_block_unchecked(data_buf, &borrows); + let size = build_block_unchecked(record_size, data_buf, &borrows); offset += size as u64; offsets.push(offset); } } pub struct SizedDictBufBuilder<'a, B1: BufMut, B2: BufMut> { + pub(crate) record_size: Option<u8>, block_offset: u64, id_offset: u64, offsets: LateLogArrayBufBuilder<'a, B1>, @@ -36,12 +38,14 @@ pub struct SizedDictBufBuilder<'a, B1: BufMut, B2: BufMut> { impl<'a, B1: BufMut, B2: BufMut> SizedDictBufBuilder<'a, B1, B2> { pub fn new( + record_size: Option<u8>, block_offset: u64, id_offset: u64, offsets: LateLogArrayBufBuilder<'a, B1>, data_buf: B2, ) -> Self { Self { + record_size, block_offset, id_offset, offsets, @@ -63,7 +67,7 @@ impl<'a, B1: BufMut, B2: BufMut> SizedDictBufBuilder<'a, B1, B2> { self.id_offset += 1; if self.current_block.len() == BLOCK_SIZE { let current_block: Vec<&[u8]> = self.current_block.iter().map(|e| e.as_ref()).collect(); - let size = build_block_unchecked(&mut self.data_buf, &current_block); + let size = build_block_unchecked(self.record_size, &mut self.data_buf, &current_block); self.block_offset += size as u64; self.offsets.push(self.block_offset); @@ -84,7 +88,7 @@ impl<'a, B1: BufMut, B2: BufMut> SizedDictBufBuilder<'a, B1, B2> { pub fn finalize(mut self) -> (LateLogArrayBufBuilder<'a, B1>, B2, u64, u64) { if self.current_block.len() > 0 { let current_block: Vec<&[u8]> = self.current_block.iter().map(|e| e.as_ref()).collect(); - let size = build_block_unchecked(&mut self.data_buf, &current_block); + let size = build_block_unchecked(self.record_size, &mut self.data_buf, &current_block); self.block_offset += size as u64; self.offsets.push(self.block_offset); } @@ -310,7 +314,7 @@ mod tests { vals: I, ) { let mut offsets = Vec::new(); - build_dict_unchecked(0, &mut offsets, data_buf, vals); + build_dict_unchecked(None, 0, &mut offsets, data_buf, vals); build_offset_logarray(array_buf, offsets); } @@ -380,7 +384,7 @@ mod tests { let logarray_builder = LateLogArrayBufBuilder::new(&mut array_buf); - let mut builder = SizedDictBufBuilder::new(0, 0, logarray_builder, data_buf); + let mut builder = SizedDictBufBuilder::new(None, 0, 0, logarray_builder, data_buf); builder.add_all(strings.clone().into_iter().map(|v| Bytes::from_static(v))); let (mut logarray_builder, data_buf, _, _) = builder.finalize(); logarray_builder.pop(); diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index a9877433..edd699b0 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -1,6 +1,7 @@ use crate::structure::{ - tfc::block::BLOCK_SIZE, util::calculate_width, LateLogArrayBufBuilder, LogArrayBufBuilder, - MonotonicLogArray, + tfc::block::{parse_block_control_records, BLOCK_SIZE}, + util::calculate_width, + LateLogArrayBufBuilder, LogArrayBufBuilder, MonotonicLogArray, }; use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; @@ -43,12 +44,14 @@ impl TypedDict { for type_offset in type_offsets.iter() { let last_block_len; if type_offset == 0 { - last_block_len = data[0]; + last_block_len = parse_block_control_records(data[0]); } else { let last_block_offset_of_previous_type = block_offsets.entry(type_offset as usize - 1); - last_block_len = data[last_block_offset_of_previous_type as usize]; + last_block_len = parse_block_control_records(data[last_block_offset_of_previous_type as usize]); } + 
eprintln!("last_block_len: {last_block_len}"); let gap = BLOCK_SIZE as u8 - last_block_len; tally += gap as u64; type_id_offsets.push((type_offset + 1) * 8 - tally); @@ -279,6 +282,20 @@ impl Datatype { T::from_lexical(b) } + + pub fn record_size(&self) -> Option { + match self { + Datatype::String => None, + Datatype::UInt32 => Some(4), + Datatype::Int32 => Some(4), + Datatype::UInt64 => Some(8), + Datatype::Int64 => Some(8), + Datatype::Float32 => Some(4), + Datatype::Float64 => Some(8), + Datatype::Decimal => None, + Datatype::BigInt => None, + } + } } pub trait TdbDataType { @@ -473,13 +490,13 @@ impl TdbDataType for Decimal { } pub fn build_segment>( + record_size: Option, offsets: &mut Vec, data_buf: &mut B, iter: I, ) { let slices = iter.map(|val| val.to_lexical()); - - build_dict_unchecked(0, offsets, data_buf, slices); + build_dict_unchecked(record_size, 0, offsets, data_buf, slices); } pub fn build_multiple_segments< @@ -504,7 +521,13 @@ pub fn build_multiple_segments< let start_type_offset = offsets.len(); types.push(key); type_offsets.push(start_type_offset as u64); - build_dict_unchecked(start_offset, &mut offsets, data_buf, group.map(|v| v.1)); + build_dict_unchecked( + key.record_size(), + start_offset, + &mut offsets, + data_buf, + group.map(|v| v.1), + ); } build_offset_logarray(block_offsets_buf, offsets); @@ -547,6 +570,7 @@ impl<'a, B1: BufMut, B2: BufMut, B3: BufMut, B4: BufMut> TypedDictBufBuilder<'a, let type_offsets_builder = LateLogArrayBufBuilder::new(type_offsets); let block_offset_builder = LateLogArrayBufBuilder::new(block_offsets); let sized_dict_buf_builder = Some(SizedDictBufBuilder::new( + None, 0, 0, block_offset_builder, @@ -564,6 +588,9 @@ impl<'a, B1: BufMut, B2: BufMut, B3: BufMut, B4: BufMut> TypedDictBufBuilder<'a, if self.current_datatype == None { self.current_datatype = Some(dt); self.types_present_builder.push(dt as u64); + self.sized_dict_buf_builder + .as_mut() + .map(|b| b.record_size = dt.record_size()); } if self.current_datatype != Some(dt) { @@ -573,6 +600,7 @@ impl<'a, B1: BufMut, B2: BufMut, B3: BufMut, B4: BufMut> TypedDictBufBuilder<'a, self.type_offsets_builder .push(block_offset_builder.count() as u64 - 1); self.sized_dict_buf_builder = Some(SizedDictBufBuilder::new( + dt.record_size(), block_offset, id_offset, block_offset_builder, @@ -618,12 +646,13 @@ mod tests { use super::*; fn build_segment_and_offsets>( + dt: Datatype, array_buf: &mut B1, data_buf: &mut B2, iter: I, ) { let mut offsets = Vec::new(); - build_segment(&mut offsets, data_buf, iter); + build_segment(dt.record_size(), &mut offsets, data_buf, iter); build_offset_logarray(array_buf, offsets); } @@ -652,7 +681,12 @@ mod tests { let mut offsets = BytesMut::new(); let mut data = BytesMut::new(); - build_segment_and_offsets(&mut offsets, &mut data, strings.clone().into_iter()); + build_segment_and_offsets( + Datatype::String, + &mut offsets, + &mut data, + strings.clone().into_iter(), + ); let segment = TypedDictSegment::parse(offsets.freeze(), data.freeze(), 0); @@ -671,7 +705,12 @@ mod tests { let mut offsets = BytesMut::new(); let mut data = BytesMut::new(); - build_segment_and_offsets(&mut offsets, &mut data, nums.clone().into_iter()); + build_segment_and_offsets( + Datatype::UInt64, + &mut offsets, + &mut data, + nums.clone().into_iter(), + ); let segment = TypedDictSegment::parse(offsets.freeze(), data.freeze(), 0); From 8ed589f12e74419051831dfe06f825d7cdf52d9b Mon Sep 17 00:00:00 2001 From: Gavin Mendel-Gleason Date: Wed, 30 Nov 2022 11:20:20 +0100 Subject: [PATCH 
43/99] Remove extraneous --- src/structure/tfc/block.rs | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/src/structure/tfc/block.rs b/src/structure/tfc/block.rs index 9b6b337f..3800b43b 100644 --- a/src/structure/tfc/block.rs +++ b/src/structure/tfc/block.rs @@ -599,18 +599,6 @@ pub fn parse_block_control_word(cw: u8) -> (Option, u8) { (record_size, records) } -// None => 0 -// Some(1) => 1 -// Some(2) => 2 -// -// Some(4) => 3 -// Some(8) => 4 - -// MSB = 1, fake ids block. -// -// id => byte = id - id_offset -// two more bits - fn record_size_decoding(enc: u8) -> Option { match enc >> 3 { 0 => None, From c209e25d4573a01a639f92b307ef553c44ed83cd Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Wed, 30 Nov 2022 12:17:54 +0100 Subject: [PATCH 44/99] refactor progress --- src/layer/builder.rs | 92 ++++++++++++++++++++----------------- src/layer/internal/base.rs | 65 +++++++++++++------------- src/layer/internal/child.rs | 65 +++++++++++++------------- src/layer/internal/mod.rs | 13 +++--- src/storage/cache.rs | 6 +-- src/storage/delta.rs | 2 +- src/storage/file.rs | 40 +++++++++++++++- src/structure/logarray.rs | 14 +++--- src/structure/tfc/dict.rs | 10 ++-- src/structure/tfc/file.rs | 2 +- src/structure/tfc/typed.rs | 26 +++++------ 11 files changed, 193 insertions(+), 142 deletions(-) diff --git a/src/layer/builder.rs b/src/layer/builder.rs index 5c78c91f..822a3bc4 100644 --- a/src/layer/builder.rs +++ b/src/layer/builder.rs @@ -1,39 +1,41 @@ use std::io; +use bytes::{BytesMut, Bytes}; use futures::stream::TryStreamExt; use rayon::prelude::*; +use tfc::dict::SizedDictBufBuilder; use super::layer::*; use crate::storage::*; use crate::structure::util; use crate::structure::*; -pub struct DictionarySetFileBuilder { - node_dictionary_builder: PfcDictFileBuilder, - predicate_dictionary_builder: PfcDictFileBuilder, - value_dictionary_builder: PfcDictFileBuilder, +pub struct DictionarySetFileBuilder { + node_files: DictionaryFiles, + predicate_files: DictionaryFiles, + value_files: TypedDictionaryFiles, + node_dictionary_builder: SizedDictBufBuilder, + predicate_dictionary_builder: SizedDictBufBuilder, + value_dictionary_builder: TypedDictBufBuilder, } impl DictionarySetFileBuilder { pub async fn from_files( node_files: DictionaryFiles, predicate_files: DictionaryFiles, - value_files: DictionaryFiles, + value_files: TypedDictionaryFiles, ) -> io::Result { - let node_dictionary_builder = PfcDictFileBuilder::new( - node_files.blocks_file.open_write().await?, - node_files.offsets_file.open_write().await?, - ); - let predicate_dictionary_builder = PfcDictFileBuilder::new( - predicate_files.blocks_file.open_write().await?, - predicate_files.offsets_file.open_write().await?, - ); - let value_dictionary_builder = PfcDictFileBuilder::new( - value_files.blocks_file.open_write().await?, - value_files.offsets_file.open_write().await?, - ); + let node_dictionary_builder = SizedDictBufBuilder::new(None, 0, 0, LateLogArrayBufBuilder::new(BytesMut::new()), BytesMut::new()); + let predicate_dictionary_builder = SizedDictBufBuilder::new(None, 0, 0, LateLogArrayBufBuilder::new(BytesMut::new()), BytesMut::new()); + let value_dictionary_builder = TypedDictBufBuilder::new(BytesMut::new(), + BytesMut::new(), + BytesMut::new(), + BytesMut::new()); Ok(Self { + node_files, + predicate_files, + value_files, node_dictionary_builder, predicate_dictionary_builder, value_dictionary_builder, @@ -43,91 +45,99 @@ impl DictionarySetFileBuilder { /// Add a node string. 
/// /// Panics if the given node string is not a lexical successor of the previous node string. - pub async fn add_node(&mut self, node: &str) -> io::Result { - let id = self.node_dictionary_builder.add(node).await?; + pub fn add_node(&mut self, node: &str) -> u64 { + let id = self.node_dictionary_builder.add(Bytes::copy_from_slice(node.as_bytes())); - Ok(id) + id } /// Add a predicate string. /// /// Panics if the given predicate string is not a lexical successor of the previous node string. - pub async fn add_predicate(&mut self, predicate: &str) -> io::Result { - let id = self.predicate_dictionary_builder.add(predicate).await?; + pub fn add_predicate(&mut self, predicate: &str) -> u64 { + let id = self.predicate_dictionary_builder.add(Bytes::copy_from_slice(predicate.as_bytes())); - Ok(id) + id } /// Add a value string. /// /// Panics if the given value string is not a lexical successor of the previous value string. - pub async fn add_value(&mut self, value: &str) -> io::Result { - let id = self.value_dictionary_builder.add(value).await?; + pub fn add_value(&mut self, value: &str) -> u64 { + let id = self.value_dictionary_builder.add(Datatype::String, + Bytes::copy_from_slice(value.as_bytes())); - Ok(id) + id } /// Add nodes from an iterable. /// /// Panics if the nodes are not in lexical order, or if previous added nodes are a lexical succesor of any of these nodes. - pub async fn add_nodes + Unpin + Send + Sync>( + pub fn add_nodes + Unpin + Send + Sync>( &mut self, nodes: I, - ) -> io::Result> + ) -> Vec where ::IntoIter: Unpin + Send + Sync, { let mut ids = Vec::new(); for node in nodes { - let id = self.add_node(&node).await?; + let id = self.add_node(&node); ids.push(id); } - Ok(ids) + ids } /// Add predicates from an iterable. /// /// Panics if the predicates are not in lexical order, or if previous added predicates are a lexical succesor of any of these predicates. - pub async fn add_predicates + Unpin + Send + Sync>( + pub fn add_predicates + Unpin + Send + Sync>( &mut self, predicates: I, - ) -> io::Result> + ) -> Vec where ::IntoIter: Unpin + Send + Sync, { let mut ids = Vec::new(); for predicate in predicates { - let id = self.add_predicate(&predicate).await?; + let id = self.add_predicate(&predicate); ids.push(id); } - Ok(ids) + ids } /// Add values from an iterable. /// /// Panics if the values are not in lexical order, or if previous added values are a lexical succesor of any of these values. 
- pub async fn add_values + Unpin + Send + Sync>( + pub fn add_values + Unpin + Send + Sync>( &mut self, values: I, - ) -> io::Result> + ) -> Vec where ::IntoIter: Unpin + Send + Sync, { let mut ids = Vec::new(); for value in values { - let id = self.add_value(&value).await?; + let id = self.add_value(&value); ids.push(id); } - Ok(ids) + ids } pub async fn finalize(self) -> io::Result<()> { - self.node_dictionary_builder.finalize().await?; - self.predicate_dictionary_builder.finalize().await?; - self.value_dictionary_builder.finalize().await?; + let (node_offsets_builder, mut node_data_buf, _, _) = self.node_dictionary_builder.finalize(); + let mut node_offsets_buf = node_offsets_builder.finalize(); + let (predicate_offsets_builder, mut predicate_data_buf, _, _) = self.predicate_dictionary_builder.finalize(); + let mut predicate_offsets_buf = predicate_offsets_builder.finalize(); + let (mut value_types_present_buf, mut value_type_offsets_buf, mut value_offsets_buf, mut value_data_buf) = self.value_dictionary_builder.finalize(); + + self.node_files.write_all_from_bufs(&mut node_data_buf, &mut node_offsets_buf).await?; + self.predicate_files.write_all_from_bufs(&mut predicate_data_buf, &mut predicate_offsets_buf).await?; + + self.value_files.write_all_from_bufs(&mut value_types_present_buf, &mut value_type_offsets_buf, &mut value_offsets_buf, &mut value_data_buf).await?; Ok(()) } diff --git a/src/layer/internal/base.rs b/src/layer/internal/base.rs index 1a18df01..5d81317a 100644 --- a/src/layer/internal/base.rs +++ b/src/layer/internal/base.rs @@ -49,27 +49,28 @@ impl BaseLayer { } pub fn load(name: [u32; 5], maps: BaseLayerMaps) -> InternalLayer { - let node_dictionary = PfcDict::parse( + let node_dictionary = TypedDictSegment::parse( maps.node_dictionary_maps.blocks_map, maps.node_dictionary_maps.offsets_map, - ) - .unwrap(); - let predicate_dictionary = PfcDict::parse( + 0, + ); + let predicate_dictionary = TypedDictSegment::parse( maps.predicate_dictionary_maps.blocks_map, maps.predicate_dictionary_maps.offsets_map, - ) - .unwrap(); - let value_dictionary = PfcDict::parse( + 0, + ); + let value_dictionary = TypedDict::from_parts( + maps.value_dictionary_maps.types_present_map, + maps.value_dictionary_maps.type_offsets_map, maps.value_dictionary_maps.blocks_map, maps.value_dictionary_maps.offsets_map, - ) - .unwrap(); + ); let node_value_idmap = match maps.id_map_maps.node_value_idmap_maps { None => IdMap::default(), Some(maps) => IdMap::from_maps( maps, - util::calculate_width((node_dictionary.len() + value_dictionary.len()) as u64), + util::calculate_width((node_dictionary.num_entries() + value_dictionary.num_entries()) as u64), ), }; @@ -77,7 +78,7 @@ impl BaseLayer { None => IdMap::default(), Some(map) => IdMap::from_maps( map, - util::calculate_width(predicate_dictionary.len() as u64), + util::calculate_width(predicate_dictionary.num_entries() as u64), ), }; @@ -170,76 +171,76 @@ impl BaseLayerFileBuilder { /// Add a node string. /// /// Panics if the given node string is not a lexical successor of the previous node string. - pub async fn add_node(&mut self, node: &str) -> io::Result { - let id = self.builder.add_node(node).await?; + pub fn add_node(&mut self, node: &str) -> u64 { + let id = self.builder.add_node(node); - Ok(id) + id } /// Add a predicate string. /// /// Panics if the given predicate string is not a lexical successor of the previous node string. 
- pub async fn add_predicate(&mut self, predicate: &str) -> io::Result { - let id = self.builder.add_predicate(predicate).await?; + pub fn add_predicate(&mut self, predicate: &str) -> u64 { + let id = self.builder.add_predicate(predicate); - Ok(id) + id } /// Add a value string. /// /// Panics if the given value string is not a lexical successor of the previous value string. - pub async fn add_value(&mut self, value: &str) -> io::Result { - let id = self.builder.add_value(value).await?; + pub fn add_value(&mut self, value: &str) -> u64 { + let id = self.builder.add_value(value); - Ok(id) + id } /// Add nodes from an iterable. /// /// Panics if the nodes are not in lexical order, or if previous added nodes are a lexical succesor of any of these nodes. - pub async fn add_nodes + Send>( + pub fn add_nodes + Send>( &mut self, nodes: I, - ) -> io::Result> + ) -> Vec where ::IntoIter: Unpin + Send + Sync, I: Unpin + Sync, { - let ids = self.builder.add_nodes(nodes).await?; + let ids = self.builder.add_nodes(nodes); - Ok(ids) + ids } /// Add predicates from an iterable. /// /// Panics if the predicates are not in lexical order, or if previous added predicates are a lexical succesor of any of these predicates. - pub async fn add_predicates + Send>( + pub fn add_predicates + Send>( &mut self, predicates: I, - ) -> io::Result> + ) -> Vec where ::IntoIter: Unpin + Send + Sync, I: Unpin + Sync, { - let ids = self.builder.add_predicates(predicates).await?; + let ids = self.builder.add_predicates(predicates); - Ok(ids) + ids } /// Add values from an iterable. /// /// Panics if the values are not in lexical order, or if previous added values are a lexical succesor of any of these values. - pub async fn add_values + Send>( + pub fn add_values + Send>( &mut self, values: I, - ) -> io::Result> + ) -> Vec where ::IntoIter: Unpin + Send + Sync, I: Unpin + Sync, { - let ids = self.builder.add_values(values).await?; + let ids = self.builder.add_values(values); - Ok(ids) + ids } /// Turn this builder into a phase 2 builder that will take triple data. 
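[A quick orientation sketch of the calling convention this refactor is moving toward: every add_* call is now synchronous and only appends to in-memory BytesMut buffers, and the single await point that touches the filesystem is finalize. This is illustration only, assuming the DictionarySetFileBuilder API exactly as changed above; the F: 'static + FileLoad + FileStore bound and the example URIs are assumptions, not part of this patch.

    // Sketch, not part of the patch. Imports follow the surrounding module
    // (std::io, crate::storage::*); `F` stands for any file backend.
    async fn build_dictionaries<F: 'static + FileLoad + FileStore>(
        node_files: DictionaryFiles<F>,
        predicate_files: DictionaryFiles<F>,
        value_files: TypedDictionaryFiles<F>,
    ) -> io::Result<()> {
        let mut builder =
            DictionarySetFileBuilder::from_files(node_files, predicate_files, value_files)
                .await?;
        // Ids come back immediately; no per-entry awaits or file writes.
        let _node = builder.add_node("http://example.org/node");
        let _predicate = builder.add_predicate("http://example.org/predicate");
        let _value = builder.add_value("some value");
        // All accumulated buffers are flushed to the files in one step.
        builder.finalize().await
    }
]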
diff --git a/src/layer/internal/child.rs b/src/layer/internal/child.rs index c688e66b..effd0fc1 100644 --- a/src/layer/internal/child.rs +++ b/src/layer/internal/child.rs @@ -62,21 +62,22 @@ impl ChildLayer { } pub fn load(name: [u32; 5], parent: Arc, maps: ChildLayerMaps) -> InternalLayer { - let node_dictionary = PfcDict::parse( + let node_dictionary = TypedDictSegment::parse( maps.node_dictionary_maps.blocks_map, maps.node_dictionary_maps.offsets_map, - ) - .unwrap(); - let predicate_dictionary = PfcDict::parse( + 0 + ); + let predicate_dictionary = TypedDictSegment::parse( maps.predicate_dictionary_maps.blocks_map, maps.predicate_dictionary_maps.offsets_map, - ) - .unwrap(); - let value_dictionary = PfcDict::parse( + 0, + ); + let value_dictionary = TypedDict::from_parts( + maps.value_dictionary_maps.types_present_map, + maps.value_dictionary_maps.type_offsets_map, maps.value_dictionary_maps.blocks_map, maps.value_dictionary_maps.offsets_map, - ) - .unwrap(); + ); let parent_node_value_count = parent.node_and_value_count(); let parent_predicate_count = parent.predicate_count(); @@ -85,7 +86,7 @@ impl ChildLayer { None => IdMap::default(), Some(maps) => IdMap::from_maps( maps, - util::calculate_width((node_dictionary.len() + value_dictionary.len()) as u64), + util::calculate_width((node_dictionary.num_entries() + value_dictionary.num_entries()) as u64), ), }; @@ -93,7 +94,7 @@ impl ChildLayer { None => IdMap::default(), Some(map) => IdMap::from_maps( map, - util::calculate_width(predicate_dictionary.len() as u64), + util::calculate_width(predicate_dictionary.num_entries() as u64), ), }; @@ -233,10 +234,10 @@ impl ChildLayerFileBuil /// Does nothing if the node already exists in the parent, and /// panics if the given node string is not a lexical successor of /// the previous node string. - pub async fn add_node(&mut self, node: &str) -> io::Result { + pub fn add_node(&mut self, node: &str) -> u64 { match self.parent.subject_id(node) { - None => self.builder.add_node(node).await, - Some(id) => Ok(id), + None => self.builder.add_node(node), + Some(id) => id, } } @@ -245,10 +246,10 @@ impl ChildLayerFileBuil /// Does nothing if the predicate already exists in the paretn, and /// panics if the given predicate string is not a lexical successor of /// the previous predicate string. - pub async fn add_predicate(&mut self, predicate: &str) -> io::Result { + pub fn add_predicate(&mut self, predicate: &str) -> u64 { match self.parent.predicate_id(predicate) { - None => self.builder.add_predicate(predicate).await, - Some(id) => Ok(id), + None => self.builder.add_predicate(predicate), + Some(id) => id, } } @@ -257,10 +258,10 @@ impl ChildLayerFileBuil /// Does nothing if the value already exists in the paretn, and /// panics if the given value string is not a lexical successor of /// the previous value string. - pub async fn add_value(&mut self, value: &str) -> io::Result { + pub fn add_value(&mut self, value: &str) -> u64 { match self.parent.object_value_id(value) { - None => self.builder.add_value(value).await, - Some(id) => Ok(id), + None => self.builder.add_value(value), + Some(id) => id, } } @@ -270,21 +271,21 @@ impl ChildLayerFileBuil /// added nodes are a lexical succesor of any of these /// nodes. Skips any nodes that are already part of the base /// layer. 
- pub async fn add_nodes + Send>( + pub fn add_nodes + Send>( &mut self, nodes: I, - ) -> io::Result> + ) -> Vec where ::IntoIter: Send, { // TODO bulk check node existence let mut result = Vec::new(); for node in nodes { - let id = self.add_node(&node).await?; + let id = self.add_node(&node); result.push(id); } - Ok(result) + result } /// Add predicates from an iterable. @@ -293,21 +294,21 @@ impl ChildLayerFileBuil /// previous added predicates are a lexical succesor of any of /// these predicates. Skips any predicates that are already part /// of the base layer. - pub async fn add_predicates + Send>( + pub fn add_predicates + Send>( &mut self, predicates: I, - ) -> io::Result> + ) -> Vec where ::IntoIter: Send, { // TODO bulk check predicate existence let mut result = Vec::new(); for predicate in predicates { - let id = self.add_predicate(&predicate).await?; + let id = self.add_predicate(&predicate); result.push(id); } - Ok(result) + result } /// Add values from an iterable. @@ -316,21 +317,21 @@ impl ChildLayerFileBuil /// added values are a lexical succesor of any of these /// values. Skips any nodes that are already part of the base /// layer. - pub async fn add_values + Send>( + pub fn add_values + Send>( &mut self, values: I, - ) -> io::Result> + ) -> Vec where ::IntoIter: Send, { // TODO bulk check predicate existence let mut result = Vec::new(); for value in values { - let id = self.add_value(&value).await?; + let id = self.add_value(&value); result.push(id); } - Ok(result) + result } /// Turn this builder into a phase 2 builder that will take triple data. diff --git a/src/layer/internal/mod.rs b/src/layer/internal/mod.rs index 169f6462..ca07b5c0 100644 --- a/src/layer/internal/mod.rs +++ b/src/layer/internal/mod.rs @@ -27,6 +27,7 @@ pub enum InternalLayer { } use InternalLayer::*; +use tfc::block::IdLookupResult; impl InternalLayer { pub fn name(&self) -> [u32; 5] { @@ -221,14 +222,14 @@ impl InternalLayer { } pub fn predicate_dict_len(&self) -> usize { - self.predicate_dictionary().len() + self.predicate_dictionary().num_entries() } - pub fn predicate_dict_id(&self, predicate: &str) -> Option { + pub fn predicate_dict_id(&self, predicate: &str) -> IdLookupResult { self.predicate_dictionary().id(predicate) } - pub fn node_dict_id(&self, subject: &str) -> Option { + pub fn node_dict_id(&self, subject: &str) -> IdLookupResult { self.node_dictionary().id(subject) } @@ -237,15 +238,15 @@ impl InternalLayer { } pub fn node_dict_len(&self) -> usize { - self.node_dictionary().len() + self.node_dictionary().num_entries() } - pub fn value_dict_id(&self, value: &str) -> Option { + pub fn value_dict_id(&self, value: &str) -> IdLookupResult { self.value_dictionary().id(value) } pub fn value_dict_len(&self) -> usize { - self.value_dictionary().len() + self.value_dictionary().num_entries() } pub fn value_dict_get(&self, id: usize) -> Option { diff --git a/src/storage/cache.rs b/src/storage/cache.rs index 2d00302b..abb5bca0 100644 --- a/src/storage/cache.rs +++ b/src/storage/cache.rs @@ -176,7 +176,7 @@ impl LayerStore for CachedLayerStore { if let Some(layer) = self.cache.get_layer_from_cache(name) { // unless it is a rollup if !layer.is_rollup() { - return Ok(Some(layer.node_dictionary().len() as u64)); + return Ok(Some(layer.node_dictionary().num_entries() as u64)); } } @@ -188,7 +188,7 @@ impl LayerStore for CachedLayerStore { if let Some(layer) = self.cache.get_layer_from_cache(name) { // unless it is a rollup if !layer.is_rollup() { - return Ok(Some(layer.predicate_dictionary().len() as 
u64)); + return Ok(Some(layer.predicate_dictionary().num_entries() as u64)); } } @@ -200,7 +200,7 @@ impl LayerStore for CachedLayerStore { if let Some(layer) = self.cache.get_layer_from_cache(name) { // unless it is a rollup if !layer.is_rollup() { - return Ok(Some(layer.value_dictionary().len() as u64)); + return Ok(Some(layer.value_dictionary().num_entries() as u64)); } } diff --git a/src/storage/delta.rs b/src/storage/delta.rs index e93e80e1..16fe6353 100644 --- a/src/storage/delta.rs +++ b/src/storage/delta.rs @@ -207,7 +207,7 @@ async fn dictionary_rollup_upto TypedDictionaryFiles { offsets_map, }) } + + pub async fn write_all_from_bufs(&self, types_present_buf: &mut B1, type_offsets_buf: &mut B2, blocks_buf: &mut B3, offsets_buf: &mut B4) -> io::Result<()> { + let mut types_present_writer = self.types_present_file.open_write().await?; + let mut type_offsets_writer = self.type_offsets_file.open_write().await?; + let mut blocks_writer = self.blocks_file.open_write().await?; + let mut offsets_writer = self.offsets_file.open_write().await?; + + types_present_writer.write_all_buf(types_present_buf).await?; + type_offsets_writer.write_all_buf(type_offsets_buf).await?; + blocks_writer.write_all_buf(blocks_buf).await?; + offsets_writer.write_all_buf(offsets_buf).await?; + + blocks_writer.flush(); + blocks_writer.sync_all(); + + offsets_writer.flush(); + offsets_writer.sync_all(); + + Ok(()) + } } #[derive(Clone)] @@ -315,6 +335,22 @@ impl DictionaryFiles { offsets_map, }) } + + pub async fn write_all_from_bufs(&self, blocks_buf: &mut B1, offsets_buf: &mut B2) -> io::Result<()> { + let mut blocks_writer = self.blocks_file.open_write().await?; + let mut offsets_writer = self.offsets_file.open_write().await?; + + blocks_writer.write_all_buf(blocks_buf).await?; + offsets_writer.write_all_buf(offsets_buf).await?; + + blocks_writer.flush(); + blocks_writer.sync_all(); + + offsets_writer.flush(); + offsets_writer.sync_all(); + + Ok(()) + } } #[derive(Clone)] diff --git a/src/structure/logarray.rs b/src/structure/logarray.rs index 18725890..83330736 100644 --- a/src/structure/logarray.rs +++ b/src/structure/logarray.rs @@ -417,16 +417,16 @@ impl<'a, B: BufMut> LogArrayBufBuilder<'a, B> { } } -pub struct LateLogArrayBufBuilder<'a, B: BufMut> { +pub struct LateLogArrayBufBuilder { /// Destination of the log array data - buf: &'a mut B, + buf: B, /// NOTE: remove pub pub vals: Vec, width: u8 } -impl<'a, B: BufMut> LateLogArrayBufBuilder<'a, B> { - pub fn new(buf: &'a mut B) -> Self { +impl LateLogArrayBufBuilder { + pub fn new(buf: B) -> Self { Self { buf, vals: Vec::new(), @@ -460,10 +460,12 @@ impl<'a, B: BufMut> LateLogArrayBufBuilder<'a, B> { self.vals.pop() } - pub fn finalize(self) { - let mut builder = LogArrayBufBuilder::new(self.buf, self.width); + pub fn finalize(self) -> B { + let mut builder = LogArrayBufBuilder::new(&mut self.buf, self.width); builder.push_vec(self.vals); builder.finalize(); + + self.buf } } diff --git a/src/structure/tfc/dict.rs b/src/structure/tfc/dict.rs index 63d8286b..2ae3b585 100644 --- a/src/structure/tfc/dict.rs +++ b/src/structure/tfc/dict.rs @@ -27,21 +27,21 @@ pub fn build_dict_unchecked, I: Iterator>( } } -pub struct SizedDictBufBuilder<'a, B1: BufMut, B2: BufMut> { +pub struct SizedDictBufBuilder { pub(crate) record_size: Option, block_offset: u64, id_offset: u64, - offsets: LateLogArrayBufBuilder<'a, B1>, + offsets: LateLogArrayBufBuilder, data_buf: B2, current_block: Vec, } -impl<'a, B1: BufMut, B2: BufMut> SizedDictBufBuilder<'a, B1, B2> { +impl 
<B1: BufMut, B2: BufMut> SizedDictBufBuilder<B1, B2> { pub fn new( record_size: Option<u8>, block_offset: u64, id_offset: u64, - offsets: LateLogArrayBufBuilder<'a, B1>, + offsets: LateLogArrayBufBuilder<B1>, data_buf: B2, ) -> Self { Self { @@ -85,7 +85,7 @@ impl<'a, B1: BufMut, B2: BufMut> SizedDictBufBuilder<'a, B1, B2> { it.map(|val| self.add(val)).collect() } - pub fn finalize(mut self) -> (LateLogArrayBufBuilder<'a, B1>, B2, u64, u64) { + pub fn finalize(mut self) -> (LateLogArrayBufBuilder<B1>, B2, u64, u64) { if self.current_block.len() > 0 { let current_block: Vec<&[u8]> = self.current_block.iter().map(|e| e.as_ref()).collect(); let size = build_block_unchecked(self.record_size, &mut self.data_buf, &current_block); diff --git a/src/structure/tfc/file.rs b/src/structure/tfc/file.rs index e9b206cd..93ea3185 100644 --- a/src/structure/tfc/file.rs +++ b/src/structure/tfc/file.rs @@ -51,7 +51,7 @@ pub async fn merge_string_dictionaries< let mut offsets = Vec::new(); let mut offsets_buf = BytesMut::new(); let mut data_buf = BytesMut::new(); - build_dict_unchecked(0, &mut offsets, &mut data_buf, sorted_iterator); + build_dict_unchecked(None, 0, &mut offsets, &mut data_buf, sorted_iterator); build_offset_logarray(&mut offsets_buf, offsets); diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index 13949f55..20f3f926 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -567,18 +567,18 @@ pub fn build_multiple_segments< type_offsets_builder.finalize(); } -struct TypedDictBufBuilder<'a, B1: BufMut, B2: BufMut, B3: BufMut, B4: BufMut> { - types_present_builder: LateLogArrayBufBuilder<'a, B1>, - type_offsets_builder: LateLogArrayBufBuilder<'a, B2>, - sized_dict_buf_builder: Option<SizedDictBufBuilder<'a, B3, B4>>, +pub struct TypedDictBufBuilder<B1: BufMut, B2: BufMut, B3: BufMut, B4: BufMut> { + types_present_builder: LateLogArrayBufBuilder<B1>, + type_offsets_builder: LateLogArrayBufBuilder<B2>, + sized_dict_buf_builder: Option<SizedDictBufBuilder<B3, B4>>, current_datatype: Option<Datatype>, } -impl<'a, B1: BufMut, B2: BufMut, B3: BufMut, B4: BufMut> TypedDictBufBuilder<'a, B1, B2, B3, B4> { +impl<B1: BufMut, B2: BufMut, B3: BufMut, B4: BufMut> TypedDictBufBuilder<B1, B2, B3, B4> { pub fn new( - used_types: &'a mut B1, - type_offsets: &'a mut B2, - block_offsets: &'a mut B3, + used_types: B1, + type_offsets: B2, + block_offsets: B3, data_buf: B4, ) -> Self { let types_present_builder = LateLogArrayBufBuilder::new(used_types); @@ -638,19 +638,19 @@ impl<'a, B1: BufMut, B2: BufMut, B3: BufMut, B4: BufMut> TypedDictBufBuilder<'a, it.map(|(dt, val)| self.add(dt, val)).collect() } - pub fn finalize(self) -> B4 { + pub fn finalize(self) -> (B1, B2, B3, B4) { if self.current_datatype == None { panic!("There was nothing added to this dictionary!"); } let (mut block_offset_builder, data_buf, _, _) = self.sized_dict_buf_builder.unwrap().finalize(); block_offset_builder.pop(); - block_offset_builder.finalize(); + let block_offsets_buf = block_offset_builder.finalize(); - self.types_present_builder.finalize(); - self.type_offsets_builder.finalize(); + let types_present_buf = self.types_present_builder.finalize(); + let type_offsets_buf = self.type_offsets_builder.finalize(); - data_buf + (types_present_buf, type_offsets_buf, block_offsets_buf, data_buf) } } From a3768a3169a69300e59cdec8bf46fccff2e65263 Mon Sep 17 00:00:00 2001 From: Gavin Mendel-Gleason Date: Wed, 30 Nov 2022 15:38:44 +0100 Subject: [PATCH 45/99] adding parse of control word logic --- src/structure/tfc/block.rs | 15 ++++++++++++--- src/structure/tfc/typed.rs | 6 ++++-- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/src/structure/tfc/block.rs b/src/structure/tfc/block.rs index 3800b43b..2b7bb71a 100644 --- 
a/src/structure/tfc/block.rs +++ b/src/structure/tfc/block.rs @@ -613,7 +613,10 @@ fn record_size_encoding(record_size: Option) -> u8 { None => 0, Some(4) => 3 << 3, Some(8) => 4 << 3, - _ => panic!("This is really bad!"), + _ => { + dbg!(record_size); + panic!("This is really bad!") + } } } @@ -929,13 +932,19 @@ mod tests { #[test] fn control_word_round_trip() { + let cw = create_block_control_word(None, 1); + assert_eq!(parse_block_control_word(cw), (None, 1)); + + let cw = create_block_control_word(None, 8); + assert_eq!(parse_block_control_word(cw), (None, 8)); + let cw = create_block_control_word(None, 3); assert_eq!(parse_block_control_word(cw), (None, 3)); let cw = create_block_control_word(Some(8), 5); assert_eq!(parse_block_control_word(cw), (Some(8), 5)); - let cw = create_block_control_word(Some(12), 6); - assert_eq!(parse_block_control_word(cw), (Some(12), 6)) + let cw = create_block_control_word(Some(4), 6); + assert_eq!(parse_block_control_word(cw), (Some(4), 6)) } } diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index edd699b0..18d09b61 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -57,8 +57,10 @@ impl TypedDict { type_id_offsets.push((type_offset + 1) * 8 - tally); } - let last_gap = - BLOCK_SIZE - data[block_offsets.entry(block_offsets.len() - 1) as usize] as usize; + let last_gap = BLOCK_SIZE + - parse_block_control_records( + data[block_offsets.entry(block_offsets.len() - 1) as usize], + ) as usize; let num_entries = (block_offsets.len() + 1) * BLOCK_SIZE - tally as usize - last_gap; Self { From 25d7d21bbbcf0f7fccd654e586cce7d713f81ea6 Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Wed, 30 Nov 2022 15:40:27 +0100 Subject: [PATCH 46/99] make lexical conversion more generic --- src/structure/tfc/typed.rs | 357 ++++++++++++++++++++----------------- 1 file changed, 191 insertions(+), 166 deletions(-) diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index 18d09b61..6f0affad 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -73,8 +73,8 @@ impl TypedDict { } } - pub fn id(&self, v: &T) -> IdLookupResult { - let (datatype, bytes) = v.make_entry(); + pub fn id>(&self, v: &Q) -> IdLookupResult { + let (datatype, bytes) = T::make_entry(v); self.id_slice(datatype, bytes.as_ref()) } @@ -257,8 +257,8 @@ impl TypedDictSegment { T::from_lexical(entry.into_buf()) } - pub fn id(&self, val: &T) -> IdLookupResult { - let slice = val.to_lexical(); + pub fn id>(&self, val: &Q) -> IdLookupResult { + let slice = T::to_lexical(val); self.dict.id(&slice[..]) } } @@ -302,13 +302,26 @@ impl Datatype { pub trait TdbDataType { fn datatype() -> Datatype; + fn from_lexical(b: B) -> Self; - fn to_lexical(&self) -> Bytes; + fn to_lexical(val: &T) -> Bytes + where T: ToLexical + ?Sized { + val.to_lexical() + } - fn from_lexical(b: B) -> Self; + fn make_entry(val: &T) -> (Datatype, Bytes) + where T: ToLexical + ?Sized{ + (Self::datatype(), val.to_lexical()) + } +} + +pub trait ToLexical { + fn to_lexical(&self) -> Bytes; +} - fn make_entry(&self) -> (Datatype, Bytes) { - (Self::datatype(), self.to_lexical()) +impl> ToLexical for T { + fn to_lexical(&self) -> Bytes { + Bytes::copy_from_slice(self.as_ref().as_bytes()) } } @@ -317,10 +330,6 @@ impl TdbDataType for String { Datatype::String } - fn to_lexical(&self) -> Bytes { - Bytes::copy_from_slice(self.as_bytes()) - } - fn from_lexical(mut b: B) -> Self { let mut vec = vec![0; b.remaining()]; b.copy_to_slice(&mut vec); @@ -333,16 +342,18 @@ impl 
TdbDataType for u32 { Datatype::UInt32 } + fn from_lexical(b: B) -> Self { + b.reader().read_u32::().unwrap() + } +} + +impl ToLexical for u32 { fn to_lexical(&self) -> Bytes { let mut buf = BytesMut::new().writer(); buf.write_u32::(*self).unwrap(); buf.into_inner().freeze() } - - fn from_lexical(b: B) -> Self { - b.reader().read_u32::().unwrap() - } } const I32_BYTE_MASK: u32 = 0b1000_0000 << (3 * 8); @@ -351,17 +362,19 @@ impl TdbDataType for i32 { Datatype::Int32 } + fn from_lexical(b: B) -> Self { + let i = b.reader().read_u32::().unwrap(); + (I32_BYTE_MASK ^ i) as i32 + } +} + +impl ToLexical for i32 { fn to_lexical(&self) -> Bytes { let sign_flip = I32_BYTE_MASK ^ (*self as u32); let mut buf = BytesMut::new().writer(); buf.write_u32::(sign_flip).unwrap(); buf.into_inner().freeze() } - - fn from_lexical(b: B) -> Self { - let i = b.reader().read_u32::().unwrap(); - (I32_BYTE_MASK ^ i) as i32 - } } impl TdbDataType for u64 { @@ -369,16 +382,18 @@ impl TdbDataType for u64 { Datatype::UInt64 } + fn from_lexical(b: B) -> Self { + b.reader().read_u64::().unwrap() + } +} + +impl ToLexical for u64 { fn to_lexical(&self) -> Bytes { let mut buf = BytesMut::new().writer(); buf.write_u64::(*self).unwrap(); buf.into_inner().freeze() } - - fn from_lexical(b: B) -> Self { - b.reader().read_u64::().unwrap() - } } const I64_BYTE_MASK: u64 = 0b1000_0000 << (7 * 8); @@ -387,17 +402,19 @@ impl TdbDataType for i64 { Datatype::Int64 } + fn from_lexical(b: B) -> Self { + let i = b.reader().read_u64::().unwrap(); + (I64_BYTE_MASK ^ i) as i64 + } +} + +impl ToLexical for i64 { fn to_lexical(&self) -> Bytes { let sign_flip = I64_BYTE_MASK ^ (*self as u64); let mut buf = BytesMut::new().writer(); buf.write_u64::(sign_flip).unwrap(); buf.into_inner().freeze() } - - fn from_lexical(b: B) -> Self { - let i = b.reader().read_u64::().unwrap(); - (I64_BYTE_MASK ^ i) as i64 - } } const F32_SIGN_MASK: u32 = 0x8000_0000; @@ -407,6 +424,17 @@ impl TdbDataType for f32 { Datatype::Float32 } + fn from_lexical(b: B) -> Self { + let i = b.reader().read_u32::().unwrap(); + if i & F32_SIGN_MASK > 0 { + f32::from_bits(i ^ F32_SIGN_MASK) + } else { + f32::from_bits(i ^ F32_COMPLEMENT) + } + } +} + +impl ToLexical for f32 { fn to_lexical(&self) -> Bytes { let f = *self; let g: u32; @@ -419,15 +447,6 @@ impl TdbDataType for f32 { buf.write_u32::(g).unwrap(); buf.into_inner().freeze() } - - fn from_lexical(b: B) -> Self { - let i = b.reader().read_u32::().unwrap(); - if i & F32_SIGN_MASK > 0 { - f32::from_bits(i ^ F32_SIGN_MASK) - } else { - f32::from_bits(i ^ F32_COMPLEMENT) - } - } } const F64_SIGN_MASK: u64 = 0x8000_0000_0000_0000; @@ -437,6 +456,17 @@ impl TdbDataType for f64 { Datatype::Float64 } + fn from_lexical(b: B) -> Self { + let i = b.reader().read_u64::().unwrap(); + if i & F64_SIGN_MASK > 0 { + f64::from_bits(i ^ F64_SIGN_MASK) + } else { + f64::from_bits(i ^ F64_COMPLEMENT) + } + } +} + +impl ToLexical for f64 { fn to_lexical(&self) -> Bytes { let f = *self; let g: u64; @@ -449,15 +479,6 @@ impl TdbDataType for f64 { buf.write_u64::(g).unwrap(); buf.into_inner().freeze() } - - fn from_lexical(b: B) -> Self { - let i = b.reader().read_u64::().unwrap(); - if i & F64_SIGN_MASK > 0 { - f64::from_bits(i ^ F64_SIGN_MASK) - } else { - f64::from_bits(i ^ F64_COMPLEMENT) - } - } } impl TdbDataType for Integer { @@ -465,15 +486,17 @@ impl TdbDataType for Integer { Datatype::BigInt } - fn to_lexical(&self) -> Bytes { - Bytes::from(bigint_to_storage(self.clone())) - } - fn from_lexical(mut b: B) -> Self { storage_to_bigint(&mut 
b) } } +impl ToLexical for Integer { + fn to_lexical(&self) -> Bytes { + Bytes::from(bigint_to_storage(self.clone())) + } +} + #[derive(PartialEq, Debug)] pub struct Decimal(String); @@ -482,16 +505,18 @@ impl TdbDataType for Decimal { Datatype::Decimal } - fn to_lexical(&self) -> Bytes { - Bytes::from(decimal_to_storage(&self.0)) - } - fn from_lexical(mut b: B) -> Self { Decimal(storage_to_decimal(&mut b)) } } -pub fn build_segment>( +impl ToLexical for Decimal { + fn to_lexical(&self) -> Bytes { + Bytes::from(decimal_to_storage(&self.0)) + } +} + +pub fn build_segment, I: Iterator>( record_size: Option, offsets: &mut Vec, data_buf: &mut B, @@ -647,7 +672,7 @@ mod tests { use super::*; - fn build_segment_and_offsets>( + fn build_segment_and_offsets, I: Iterator>( dt: Datatype, array_buf: &mut B1, data_buf: &mut B2, @@ -726,9 +751,9 @@ mod tests { fn cycle(d: D) where - D: TdbDataType + PartialEq + Debug, + D: TdbDataType + PartialEq + Debug + ToLexical, { - let j = D::from_lexical(d.to_lexical()); + let j = D::from_lexical(::to_lexical(&d)); assert_eq!(d, j) } @@ -817,19 +842,19 @@ mod tests { #[test] fn test_multi_segment() { let mut vec: Vec<(Datatype, Bytes)> = vec![ - Decimal("-1".to_string()).make_entry(), - "asdf".to_string().make_entry(), - Decimal("-12342343.2348973".to_string()).make_entry(), - "Batty".to_string().make_entry(), - "Batman".to_string().make_entry(), - (-3_i64).make_entry(), - Decimal("2348973".to_string()).make_entry(), - 4.389832_f32.make_entry(), - "apple".to_string().make_entry(), - 23434.389832_f32.make_entry(), - "apply".to_string().make_entry(), - (-500_i32).make_entry(), - 20_u32.make_entry(), + Decimal::make_entry(&Decimal("-1".to_string())), + String::make_entry(&"asdf"), + Decimal::make_entry(&Decimal("-12342343.2348973".to_string())), + String::make_entry(&"Batty"), + String::make_entry(&"Batman"), + i64::make_entry(&-3_i64), + Decimal::make_entry(&Decimal("2348973".to_string())), + f32::make_entry(&4.389832_f32), + String::make_entry(&"apple"), + f32::make_entry(&23434.389832_f32), + String::make_entry(&"apply"), + i32::make_entry(&-500_i32), + u32::make_entry(&20_u32), ]; vec.sort(); let mut used_types = BytesMut::new(); @@ -869,37 +894,37 @@ mod tests { #[test] fn test_full_blocks() { let mut vec: Vec<(Datatype, Bytes)> = vec![ - "fdsa".to_string().make_entry(), - "a".to_string().make_entry(), - "bc".to_string().make_entry(), - "bcd".to_string().make_entry(), - "z".to_string().make_entry(), - "Batty".to_string().make_entry(), - "Batman".to_string().make_entry(), - "apple".to_string().make_entry(), - (-500_i32).make_entry(), - 20_u32.make_entry(), - 22_u32.make_entry(), - 23_u32.make_entry(), - 24_u32.make_entry(), - 25_u32.make_entry(), - 26_u32.make_entry(), - 27_u32.make_entry(), - 28_u32.make_entry(), - 3000_u32.make_entry(), - (-3_i64).make_entry(), - Decimal("-12342343.2348973".to_string()).make_entry(), - Decimal("234.8973".to_string()).make_entry(), - Decimal("0.2348973".to_string()).make_entry(), - Decimal("23423423.8973".to_string()).make_entry(), - Decimal("3.3".to_string()).make_entry(), - Decimal("0.001".to_string()).make_entry(), - Decimal("-0.001".to_string()).make_entry(), - Decimal("2".to_string()).make_entry(), - Decimal("0".to_string()).make_entry(), - 4.389832_f32.make_entry(), - 23434.389832_f32.make_entry(), - int("239487329872343987").make_entry(), + String::make_entry(&"fdsa"), + String::make_entry(&"a"), + String::make_entry(&"bc"), + String::make_entry(&"bcd"), + String::make_entry(&"z"), + String::make_entry(&"Batty"), + 
String::make_entry(&"Batman"), + String::make_entry(&"apple"), + i32::make_entry(&-500_i32), + u32::make_entry(&20_u32), + u32::make_entry(&22_u32), + u32::make_entry(&23_u32), + u32::make_entry(&24_u32), + u32::make_entry(&25_u32), + u32::make_entry(&26_u32), + u32::make_entry(&27_u32), + u32::make_entry(&28_u32), + u32::make_entry(&3000_u32), + i64::make_entry(&-3_i64), + Decimal::make_entry(&Decimal("-12342343.2348973".to_string())), + Decimal::make_entry(&Decimal("234.8973".to_string())), + Decimal::make_entry(&Decimal("0.2348973".to_string())), + Decimal::make_entry(&Decimal("23423423.8973".to_string())), + Decimal::make_entry(&Decimal("3.3".to_string())), + Decimal::make_entry(&Decimal("0.001".to_string())), + Decimal::make_entry(&Decimal("-0.001".to_string())), + Decimal::make_entry(&Decimal("2".to_string())), + Decimal::make_entry(&Decimal("0".to_string())), + f32::make_entry(&4.389832_f32), + f32::make_entry(&23434.389832_f32), + Integer::make_entry(&int("239487329872343987")), ]; vec.sort(); let mut used_types = BytesMut::new(); @@ -967,37 +992,37 @@ mod tests { #[test] fn iterate_full_blocks() { let mut vec: Vec<(Datatype, Bytes)> = vec![ - "fdsa".to_string().make_entry(), - "a".to_string().make_entry(), - "bc".to_string().make_entry(), - "bcd".to_string().make_entry(), - "z".to_string().make_entry(), - "Batty".to_string().make_entry(), - "Batman".to_string().make_entry(), - "apple".to_string().make_entry(), - (-500_i32).make_entry(), - 20_u32.make_entry(), - 22_u32.make_entry(), - 23_u32.make_entry(), - 24_u32.make_entry(), - 25_u32.make_entry(), - 26_u32.make_entry(), - 27_u32.make_entry(), - 28_u32.make_entry(), - 3000_u32.make_entry(), - (-3_i64).make_entry(), - Decimal("-12342343.2348973".to_string()).make_entry(), - Decimal("234.8973".to_string()).make_entry(), - Decimal("0.2348973".to_string()).make_entry(), - Decimal("23423423.8973".to_string()).make_entry(), - Decimal("3.3".to_string()).make_entry(), - Decimal("0.001".to_string()).make_entry(), - Decimal("-0.001".to_string()).make_entry(), - Decimal("2".to_string()).make_entry(), - Decimal("0".to_string()).make_entry(), - 4.389832_f32.make_entry(), - 23434.389832_f32.make_entry(), - int("239487329872343987").make_entry(), + String::make_entry(&"fdsa"), + String::make_entry(&"a"), + String::make_entry(&"bc"), + String::make_entry(&"bcd"), + String::make_entry(&"z"), + String::make_entry(&"Batty"), + String::make_entry(&"Batman"), + String::make_entry(&"apple"), + i32::make_entry(&-500_i32), + u32::make_entry(&20_u32), + u32::make_entry(&22_u32), + u32::make_entry(&23_u32), + u32::make_entry(&24_u32), + u32::make_entry(&25_u32), + u32::make_entry(&26_u32), + u32::make_entry(&27_u32), + u32::make_entry(&28_u32), + u32::make_entry(&3000_u32), + i64::make_entry(&-3_i64), + Decimal::make_entry(&Decimal("-12342343.2348973".to_string())), + Decimal::make_entry(&Decimal("234.8973".to_string())), + Decimal::make_entry(&Decimal("0.2348973".to_string())), + Decimal::make_entry(&Decimal("23423423.8973".to_string())), + Decimal::make_entry(&Decimal("3.3".to_string())), + Decimal::make_entry(&Decimal("0.001".to_string())), + Decimal::make_entry(&Decimal("-0.001".to_string())), + Decimal::make_entry(&Decimal("2".to_string())), + Decimal::make_entry(&Decimal("0".to_string())), + f32::make_entry(&4.389832_f32), + f32::make_entry(&23434.389832_f32), + Integer::make_entry(&int("239487329872343987")), ]; vec.sort(); let mut used_types = BytesMut::new(); @@ -1031,37 +1056,37 @@ mod tests { #[test] fn test_incremental_builder() { let mut vec: 
Vec<(Datatype, Bytes)> = vec![ - "fdsa".to_string().make_entry(), - "a".to_string().make_entry(), - "bc".to_string().make_entry(), - "bcd".to_string().make_entry(), - "z".to_string().make_entry(), - "Batty".to_string().make_entry(), - "Batman".to_string().make_entry(), - "apple".to_string().make_entry(), - (-500_i32).make_entry(), - 20_u32.make_entry(), - 22_u32.make_entry(), - 23_u32.make_entry(), - 24_u32.make_entry(), - 25_u32.make_entry(), - 26_u32.make_entry(), - 27_u32.make_entry(), - 28_u32.make_entry(), - 3000_u32.make_entry(), - (-3_i64).make_entry(), - Decimal("-12342343.2348973".to_string()).make_entry(), - Decimal("234.8973".to_string()).make_entry(), - Decimal("0.2348973".to_string()).make_entry(), - Decimal("23423423.8973".to_string()).make_entry(), - Decimal("3.3".to_string()).make_entry(), - Decimal("0.001".to_string()).make_entry(), - Decimal("-0.001".to_string()).make_entry(), - Decimal("2".to_string()).make_entry(), - Decimal("0".to_string()).make_entry(), - 4.389832_f32.make_entry(), - 23434.389832_f32.make_entry(), - int("239487329872343987").make_entry(), + String::make_entry(&"fdsa"), + String::make_entry(&"a"), + String::make_entry(&"bc"), + String::make_entry(&"bcd"), + String::make_entry(&"z"), + String::make_entry(&"Batty"), + String::make_entry(&"Batman"), + String::make_entry(&"apple"), + i32::make_entry(&-500_i32), + u32::make_entry(&20_u32), + u32::make_entry(&22_u32), + u32::make_entry(&23_u32), + u32::make_entry(&24_u32), + u32::make_entry(&25_u32), + u32::make_entry(&26_u32), + u32::make_entry(&27_u32), + u32::make_entry(&28_u32), + u32::make_entry(&3000_u32), + i64::make_entry(&-3_i64), + Decimal::make_entry(&Decimal("-12342343.2348973".to_string())), + Decimal::make_entry(&Decimal("234.8973".to_string())), + Decimal::make_entry(&Decimal("0.2348973".to_string())), + Decimal::make_entry(&Decimal("23423423.8973".to_string())), + Decimal::make_entry(&Decimal("3.3".to_string())), + Decimal::make_entry(&Decimal("0.001".to_string())), + Decimal::make_entry(&Decimal("-0.001".to_string())), + Decimal::make_entry(&Decimal("2".to_string())), + Decimal::make_entry(&Decimal("0".to_string())), + f32::make_entry(&4.389832_f32), + f32::make_entry(&23434.389832_f32), + Integer::make_entry(&int("239487329872343987")), ]; vec.sort(); From 46d62b170df64c2d2cda41e4fc51c056bb22f3d2 Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Wed, 30 Nov 2022 16:03:28 +0100 Subject: [PATCH 47/99] back to something that compiles --- src/layer/internal/mod.rs | 19 +++-- src/layer/simple_builder.rs | 14 ++-- src/storage/file.rs | 2 +- src/storage/layer.rs | 152 +++++++++++++++++++----------------- src/structure/logarray.rs | 2 +- src/structure/tfc/block.rs | 15 ++++ src/structure/tfc/dict.rs | 7 +- src/structure/tfc/typed.rs | 19 +++-- 8 files changed, 131 insertions(+), 99 deletions(-) diff --git a/src/layer/internal/mod.rs b/src/layer/internal/mod.rs index ca07b5c0..52aab0a9 100644 --- a/src/layer/internal/mod.rs +++ b/src/layer/internal/mod.rs @@ -226,11 +226,11 @@ impl InternalLayer { } pub fn predicate_dict_id(&self, predicate: &str) -> IdLookupResult { - self.predicate_dictionary().id(predicate) + self.predicate_dictionary().id(&predicate) } pub fn node_dict_id(&self, subject: &str) -> IdLookupResult { - self.node_dictionary().id(subject) + self.node_dictionary().id(&subject) } pub fn node_dict_get(&self, id: usize) -> Option { @@ -242,7 +242,7 @@ impl InternalLayer { } pub fn value_dict_id(&self, value: &str) -> IdLookupResult { - self.value_dictionary().id(value) + 
self.value_dictionary().id(&value) } pub fn value_dict_len(&self) -> usize { @@ -542,12 +542,12 @@ impl Layer for InternalLayer { fn node_and_value_count(&self) -> usize { self.parent_node_value_count() - + self.node_dictionary().len() - + self.value_dictionary().len() + + self.node_dictionary().num_entries() + + self.value_dictionary().num_entries() } fn predicate_count(&self) -> usize { - self.parent_predicate_count() + self.predicate_dictionary().len() + self.parent_predicate_count() + self.predicate_dictionary().num_entries() } fn subject_id<'a>(&'a self, subject: &str) -> Option { @@ -555,6 +555,7 @@ impl Layer for InternalLayer { ( layer .node_dict_id(subject) + .into_option() .map(|id| layer.node_value_id_map().inner_to_outer(id)), layer.immediate_parent(), ) @@ -572,6 +573,7 @@ impl Layer for InternalLayer { ( layer .predicate_dict_id(predicate) + .into_option() .map(|id| layer.predicate_id_map().inner_to_outer(id)), layer.immediate_parent(), ) @@ -589,6 +591,7 @@ impl Layer for InternalLayer { ( layer .node_dict_id(object) + .into_option() .map(|id| layer.node_value_id_map().inner_to_outer(id)), layer.immediate_parent(), ) @@ -604,7 +607,9 @@ impl Layer for InternalLayer { fn object_value_id<'a>(&'a self, object: &str) -> Option { let to_result = |layer: &'a InternalLayer| { ( - layer.value_dict_id(object).map(|i| { + layer.value_dict_id(object) + .into_option() + .map(|i| { layer .node_value_id_map() .inner_to_outer(i + layer.node_dict_len() as u64) diff --git a/src/layer/simple_builder.rs b/src/layer/simple_builder.rs index 8a9672ad..8eaaf4fb 100644 --- a/src/layer/simple_builder.rs +++ b/src/layer/simple_builder.rs @@ -193,11 +193,10 @@ impl LayerBuilder for SimpleLayerBuil let mut builder = ChildLayerFileBuilder::from_files(parent.clone(), &files).await?; - let node_ids = builder.add_nodes(unresolved_nodes.clone()).await?; + let node_ids = builder.add_nodes(unresolved_nodes.clone()); let predicate_ids = builder - .add_predicates(unresolved_predicates.clone()) - .await?; - let value_ids = builder.add_values(unresolved_values.clone()).await?; + .add_predicates(unresolved_predicates.clone()); + let value_ids = builder.add_values(unresolved_values.clone()); let mut builder = builder.into_phase2().await?; @@ -240,11 +239,10 @@ impl LayerBuilder for SimpleLayerBuil let files = files.into_base(); let mut builder = BaseLayerFileBuilder::from_files(&files).await?; - let node_ids = builder.add_nodes(unresolved_nodes.clone()).await?; + let node_ids = builder.add_nodes(unresolved_nodes.clone()); let predicate_ids = builder - .add_predicates(unresolved_predicates.clone()) - .await?; - let value_ids = builder.add_values(unresolved_values.clone()).await?; + .add_predicates(unresolved_predicates.clone()); + let value_ids = builder.add_values(unresolved_values.clone()); let mut builder = builder.into_phase2().await?; diff --git a/src/storage/file.rs b/src/storage/file.rs index 7f6be220..02ce1104 100644 --- a/src/storage/file.rs +++ b/src/storage/file.rs @@ -65,7 +65,7 @@ impl LayerFiles { } } - pub fn value_dictionary_files(&self) -> &DictionaryFiles { + pub fn value_dictionary_files(&self) -> &TypedDictionaryFiles { match self { Self::Base(b) => &b.value_dictionary_files, Self::Child(c) => &c.value_dictionary_files, diff --git a/src/storage/layer.rs b/src/storage/layer.rs index bd17c109..29af5973 100644 --- a/src/storage/layer.rs +++ b/src/storage/layer.rs @@ -365,6 +365,8 @@ pub trait PersistentLayerStore: 'static + Send + Sync + Clone { FILENAMES.node_dictionary_offsets, 
FILENAMES.predicate_dictionary_blocks, FILENAMES.predicate_dictionary_offsets, + FILENAMES.value_dictionary_types_present, + FILENAMES.value_dictionary_type_offsets, FILENAMES.value_dictionary_blocks, FILENAMES.value_dictionary_offsets, FILENAMES.node_value_idmap_bits, @@ -407,55 +409,57 @@ pub trait PersistentLayerStore: 'static + Send + Sync + Clone { blocks_file: files[2].clone(), offsets_file: files[3].clone(), }, - value_dictionary_files: DictionaryFiles { - blocks_file: files[4].clone(), - offsets_file: files[5].clone(), + value_dictionary_files: TypedDictionaryFiles { + types_present_file: files[4].clone(), + type_offsets_file: files[5].clone(), + blocks_file: files[6].clone(), + offsets_file: files[7].clone(), }, id_map_files: IdMapFiles { node_value_idmap_files: BitIndexFiles { - bits_file: files[6].clone(), - blocks_file: files[7].clone(), - sblocks_file: files[8].clone(), + bits_file: files[8].clone(), + blocks_file: files[9].clone(), + sblocks_file: files[10].clone(), }, predicate_idmap_files: BitIndexFiles { - bits_file: files[9].clone(), - blocks_file: files[10].clone(), - sblocks_file: files[11].clone(), + bits_file: files[11].clone(), + blocks_file: files[12].clone(), + sblocks_file: files[13].clone(), }, }, - subjects_file: files[12].clone(), - objects_file: files[13].clone(), + subjects_file: files[14].clone(), + objects_file: files[15].clone(), s_p_adjacency_list_files: AdjacencyListFiles { bitindex_files: BitIndexFiles { - bits_file: files[14].clone(), - blocks_file: files[15].clone(), - sblocks_file: files[16].clone(), + bits_file: files[16].clone(), + blocks_file: files[17].clone(), + sblocks_file: files[18].clone(), }, - nums_file: files[17].clone(), + nums_file: files[19].clone(), }, sp_o_adjacency_list_files: AdjacencyListFiles { bitindex_files: BitIndexFiles { - bits_file: files[18].clone(), - blocks_file: files[19].clone(), - sblocks_file: files[20].clone(), + bits_file: files[20].clone(), + blocks_file: files[21].clone(), + sblocks_file: files[22].clone(), }, - nums_file: files[21].clone(), + nums_file: files[23].clone(), }, o_ps_adjacency_list_files: AdjacencyListFiles { bitindex_files: BitIndexFiles { - bits_file: files[22].clone(), - blocks_file: files[23].clone(), - sblocks_file: files[24].clone(), + bits_file: files[24].clone(), + blocks_file: files[25].clone(), + sblocks_file: files[26].clone(), }, - nums_file: files[25].clone(), + nums_file: files[27].clone(), }, predicate_wavelet_tree_files: BitIndexFiles { - bits_file: files[26].clone(), - blocks_file: files[27].clone(), - sblocks_file: files[28].clone(), + bits_file: files[28].clone(), + blocks_file: files[29].clone(), + sblocks_file: files[30].clone(), }, }) } @@ -466,6 +470,8 @@ pub trait PersistentLayerStore: 'static + Send + Sync + Clone { FILENAMES.node_dictionary_offsets, FILENAMES.predicate_dictionary_blocks, FILENAMES.predicate_dictionary_offsets, + FILENAMES.value_dictionary_types_present, + FILENAMES.value_dictionary_type_offsets, FILENAMES.value_dictionary_blocks, FILENAMES.value_dictionary_offsets, FILENAMES.node_value_idmap_bits, @@ -524,86 +530,88 @@ pub trait PersistentLayerStore: 'static + Send + Sync + Clone { blocks_file: files[2].clone(), offsets_file: files[3].clone(), }, - value_dictionary_files: DictionaryFiles { - blocks_file: files[4].clone(), - offsets_file: files[5].clone(), + value_dictionary_files: TypedDictionaryFiles { + types_present_file: files[4].clone(), + type_offsets_file: files[5].clone(), + blocks_file: files[6].clone(), + offsets_file: files[7].clone(), }, 
id_map_files: IdMapFiles { node_value_idmap_files: BitIndexFiles { - bits_file: files[6].clone(), - blocks_file: files[7].clone(), - sblocks_file: files[8].clone(), + bits_file: files[8].clone(), + blocks_file: files[9].clone(), + sblocks_file: files[10].clone(), }, predicate_idmap_files: BitIndexFiles { - bits_file: files[9].clone(), - blocks_file: files[10].clone(), - sblocks_file: files[11].clone(), + bits_file: files[11].clone(), + blocks_file: files[12].clone(), + sblocks_file: files[13].clone(), }, }, - pos_subjects_file: files[12].clone(), - pos_objects_file: files[13].clone(), - neg_subjects_file: files[14].clone(), - neg_objects_file: files[15].clone(), + pos_subjects_file: files[14].clone(), + pos_objects_file: files[15].clone(), + neg_subjects_file: files[16].clone(), + neg_objects_file: files[17].clone(), pos_s_p_adjacency_list_files: AdjacencyListFiles { bitindex_files: BitIndexFiles { - bits_file: files[16].clone(), - blocks_file: files[17].clone(), - sblocks_file: files[18].clone(), + bits_file: files[18].clone(), + blocks_file: files[19].clone(), + sblocks_file: files[20].clone(), }, - nums_file: files[19].clone(), + nums_file: files[21].clone(), }, pos_sp_o_adjacency_list_files: AdjacencyListFiles { bitindex_files: BitIndexFiles { - bits_file: files[20].clone(), - blocks_file: files[21].clone(), - sblocks_file: files[22].clone(), + bits_file: files[22].clone(), + blocks_file: files[23].clone(), + sblocks_file: files[24].clone(), }, - nums_file: files[23].clone(), + nums_file: files[25].clone(), }, pos_o_ps_adjacency_list_files: AdjacencyListFiles { bitindex_files: BitIndexFiles { - bits_file: files[24].clone(), - blocks_file: files[25].clone(), - sblocks_file: files[26].clone(), + bits_file: files[26].clone(), + blocks_file: files[27].clone(), + sblocks_file: files[28].clone(), }, - nums_file: files[27].clone(), + nums_file: files[29].clone(), }, neg_s_p_adjacency_list_files: AdjacencyListFiles { bitindex_files: BitIndexFiles { - bits_file: files[28].clone(), - blocks_file: files[29].clone(), - sblocks_file: files[30].clone(), + bits_file: files[30].clone(), + blocks_file: files[31].clone(), + sblocks_file: files[32].clone(), }, - nums_file: files[31].clone(), + nums_file: files[33].clone(), }, neg_sp_o_adjacency_list_files: AdjacencyListFiles { bitindex_files: BitIndexFiles { - bits_file: files[32].clone(), - blocks_file: files[33].clone(), - sblocks_file: files[34].clone(), + bits_file: files[34].clone(), + blocks_file: files[35].clone(), + sblocks_file: files[36].clone(), }, - nums_file: files[35].clone(), + nums_file: files[37].clone(), }, neg_o_ps_adjacency_list_files: AdjacencyListFiles { bitindex_files: BitIndexFiles { - bits_file: files[36].clone(), - blocks_file: files[37].clone(), - sblocks_file: files[38].clone(), + bits_file: files[38].clone(), + blocks_file: files[39].clone(), + sblocks_file: files[40].clone(), }, - nums_file: files[39].clone(), + nums_file: files[41].clone(), }, pos_predicate_wavelet_tree_files: BitIndexFiles { - bits_file: files[40].clone(), - blocks_file: files[41].clone(), - sblocks_file: files[42].clone(), + bits_file: files[42].clone(), + blocks_file: files[43].clone(), + sblocks_file: files[44].clone(), }, neg_predicate_wavelet_tree_files: BitIndexFiles { - bits_file: files[43].clone(), - blocks_file: files[44].clone(), - sblocks_file: files[45].clone(), + bits_file: files[45].clone(), + blocks_file: files[46].clone(), + sblocks_file: files[47].clone(), }, }) } @@ -1561,7 +1569,7 @@ impl LateLogArrayBufBuilder { self.vals.pop() } - pub 
fn finalize(self) -> B {
+    pub fn finalize(mut self) -> B {
         let mut builder = LogArrayBufBuilder::new(&mut self.buf, self.width);
         builder.push_vec(self.vals);
         builder.finalize();
diff --git a/src/structure/tfc/block.rs b/src/structure/tfc/block.rs
index a45365a2..07077ac4 100644
--- a/src/structure/tfc/block.rs
+++ b/src/structure/tfc/block.rs
@@ -587,6 +587,21 @@ impl IdLookupResult {
             _ => self,
         }
     }
+
+    pub fn map<F: Fn(u64) -> u64>(self, f: F) -> Self {
+        match self {
+            Self::Found(i) => Self::Found(f(i)),
+            Self::Closest(i) => Self::Closest(f(i)),
+            Self::NotFound => Self::NotFound
+        }
+    }
+
+    pub fn into_option(self) -> Option<u64> {
+        match self {
+            Self::Found(i) => Some(i),
+            _ => None
+        }
+    }
 }
 
 pub fn parse_block_control_records(cw: u8) -> u8 {
diff --git a/src/structure/tfc/dict.rs b/src/structure/tfc/dict.rs
index 2ae3b585..9e2707ff 100644
--- a/src/structure/tfc/dict.rs
+++ b/src/structure/tfc/dict.rs
@@ -174,9 +174,12 @@ impl SizedDict {
         self.offsets.len() + 1
     }
 
-    pub fn entry(&self, index: u64) -> SizedDictEntry {
+    pub fn entry(&self, index: usize) -> Option<SizedDictEntry> {
+        if index > self.num_entries() {
+            return None;
+        }
         let block = self.block(((index - 1) / 8) as usize);
-        block.entry(((index - 1) % 8) as usize)
+        Some(block.entry(((index - 1) % 8) as usize))
     }
 
     pub fn id(&self, slice: &[u8]) -> IdLookupResult {
diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs
index a9b9fc8c..1bdbdc58 100644
--- a/src/structure/tfc/typed.rs
+++ b/src/structure/tfc/typed.rs
@@ -79,9 +79,9 @@ impl TypedDict {
         self.id_slice(datatype, bytes.as_ref())
     }
 
-    pub fn get<T: TdbDataType>(&self, id: u64) -> T {
-        let (datatype, slice) = self.entry(id);
-        datatype.cast(slice.into_buf())
+    pub fn get<T: TdbDataType>(&self, id: usize) -> Option<T> {
+        let result = self.entry(id);
+        result.map(|(datatype, slice)| datatype.cast(slice.into_buf()))
     }
 
     fn inner_type_segment(&self, i: usize) -> (SizedDict, u64) {
@@ -159,12 +159,15 @@ impl TypedDict {
         FromPrimitive::from_u64(self.types_present.entry(type_index)).unwrap()
     }
 
-    pub fn entry(&self, id: u64) -> (Datatype, SizedDictEntry) {
-        let type_index = self.type_index_for_id(id);
+    pub fn entry(&self, id: usize) -> Option<(Datatype, SizedDictEntry)> {
+        if id > self.num_entries() {
+            return None;
+        }
+        let type_index = self.type_index_for_id(id as u64);
         let (dict, offset) = self.inner_type_segment(type_index);
         let dt = self.type_for_type_index(type_index);
 
-        (dt, dict.entry(id - offset))
+        dict.entry(id - offset as usize).map(|e| (dt, e))
     }
 
     pub fn num_entries(&self) -> usize {
@@ -253,9 +256,9 @@ impl<T: TdbDataType> TypedDictSegment<T> {
         }
     }
 
-    pub fn get(&self, index: u64) -> T {
+    pub fn get(&self, index: usize) -> Option<T> {
         let entry = self.dict.entry(index);
-        T::from_lexical(entry.into_buf())
+        entry.map(|e|T::from_lexical(e.into_buf()))
     }
 
     pub fn id<Q: ToLexical<T>>(&self, val: &Q) -> IdLookupResult {

From a09a67e856aa3d05d3fed1b3f00f631336cdc45d Mon Sep 17 00:00:00 2001
From: Gavin Mendel-Gleason
Date: Wed, 30 Nov 2022 16:51:48 +0100
Subject: [PATCH 48/99] Builds

---
 src/layer/internal/base.rs             | 16 ++---
 src/layer/internal/child.rs            | 18 +++---
 src/layer/internal/mod.rs              | 21 ++----
 src/layer/internal/object_iterator.rs  | 15 +----
 src/layer/internal/subject_iterator.rs | 60 ++++--------------
 src/storage/memory.rs                  |  8 ++-
 src/structure/tfc/dict.rs              | 10 +--
 src/structure/tfc/typed.rs             | 88 +++++++++++++++-----------
 8 files changed, 100 insertions(+), 136 deletions(-)

diff --git a/src/layer/internal/base.rs b/src/layer/internal/base.rs
index 5d81317a..14bea2cc 100644
--- a/src/layer/internal/base.rs
+++ b/src/layer/internal/base.rs
@@
-70,7 +70,9 @@ impl BaseLayer { None => IdMap::default(), Some(maps) => IdMap::from_maps( maps, - util::calculate_width((node_dictionary.num_entries() + value_dictionary.num_entries()) as u64), + util::calculate_width( + (node_dictionary.num_entries() + value_dictionary.num_entries()) as u64, + ), ), }; @@ -452,15 +454,9 @@ pub mod tests { let mut builder = BaseLayerFileBuilder::from_files(&base_layer_files).await?; - builder - .add_nodes(nodes.into_iter().map(|s| s.to_string())) - .await?; - builder - .add_predicates(predicates.into_iter().map(|s| s.to_string())) - .await?; - builder - .add_values(values.into_iter().map(|s| s.to_string())) - .await?; + builder.add_nodes(nodes.into_iter().map(|s| s.to_string())); + builder.add_predicates(predicates.into_iter().map(|s| s.to_string())); + builder.add_values(values.into_iter().map(|s| s.to_string())); let mut builder = builder.into_phase2().await?; diff --git a/src/layer/internal/child.rs b/src/layer/internal/child.rs index effd0fc1..978fe965 100644 --- a/src/layer/internal/child.rs +++ b/src/layer/internal/child.rs @@ -65,7 +65,7 @@ impl ChildLayer { let node_dictionary = TypedDictSegment::parse( maps.node_dictionary_maps.blocks_map, maps.node_dictionary_maps.offsets_map, - 0 + 0, ); let predicate_dictionary = TypedDictSegment::parse( maps.predicate_dictionary_maps.blocks_map, @@ -86,7 +86,9 @@ impl ChildLayer { None => IdMap::default(), Some(maps) => IdMap::from_maps( maps, - util::calculate_width((node_dictionary.num_entries() + value_dictionary.num_entries()) as u64), + util::calculate_width( + (node_dictionary.num_entries() + value_dictionary.num_entries()) as u64, + ), ), }; @@ -944,9 +946,9 @@ pub mod tests { let mut b = ChildLayerFileBuilder::from_files(parent.clone(), &child_files) .await .unwrap(); - b.add_node("foo").await.unwrap(); - b.add_predicate("bar").await.unwrap(); - b.add_value("baz").await.unwrap(); + b.add_node("foo"); + b.add_predicate("bar"); + b.add_value("baz"); let b = b.into_phase2().await.unwrap(); b.finalize().await.unwrap(); @@ -982,9 +984,9 @@ pub mod tests { let mut b = ChildLayerFileBuilder::from_files(parent.clone(), &child_files) .await .unwrap(); - b.add_node("foo").await.unwrap(); - b.add_predicate("bar").await.unwrap(); - b.add_value("baz").await.unwrap(); + b.add_node("foo"); + b.add_predicate("bar"); + b.add_value("baz"); let b = b.into_phase2().await.unwrap(); b.finalize().await.unwrap(); diff --git a/src/layer/internal/mod.rs b/src/layer/internal/mod.rs index 52aab0a9..650de6a2 100644 --- a/src/layer/internal/mod.rs +++ b/src/layer/internal/mod.rs @@ -26,8 +26,8 @@ pub enum InternalLayer { Rollup(RollupLayer), } -use InternalLayer::*; use tfc::block::IdLookupResult; +use InternalLayer::*; impl InternalLayer { pub fn name(&self) -> [u32; 5] { @@ -607,9 +607,7 @@ impl Layer for InternalLayer { fn object_value_id<'a>(&'a self, object: &str) -> Option { let to_result = |layer: &'a InternalLayer| { ( - layer.value_dict_id(object) - .into_option() - .map(|i| { + layer.value_dict_id(object).into_option().map(|i| { layer .node_value_id_map() .inner_to_outer(i + layer.node_dict_len() as u64) @@ -1109,18 +1107,9 @@ mod tests { let values = vec!["chicken", "cow", "dog", "pig", "zebra"]; let mut builder = BaseLayerFileBuilder::from_files(&files).await.unwrap(); - builder - .add_nodes(nodes.into_iter().map(|s| s.to_string())) - .await - .unwrap(); - builder - .add_predicates(predicates.into_iter().map(|s| s.to_string())) - .await - .unwrap(); - builder - .add_values(values.into_iter().map(|s| s.to_string())) - 
.await - .unwrap(); + builder.add_nodes(nodes.into_iter().map(|s| s.to_string())); + builder.add_predicates(predicates.into_iter().map(|s| s.to_string())); + builder.add_values(values.into_iter().map(|s| s.to_string())); let mut builder = builder.into_phase2().await.unwrap(); builder.add_triple(3, 3, 3).await.unwrap(); builder.finalize().await.unwrap(); diff --git a/src/layer/internal/object_iterator.rs b/src/layer/internal/object_iterator.rs index 1d3b9753..85a1fd98 100644 --- a/src/layer/internal/object_iterator.rs +++ b/src/layer/internal/object_iterator.rs @@ -242,18 +242,9 @@ mod tests { .await .unwrap(); - builder - .add_nodes(nodes.into_iter().map(|s| s.to_string())) - .await - .unwrap(); - builder - .add_predicates(predicates.into_iter().map(|s| s.to_string())) - .await - .unwrap(); - builder - .add_values(values.into_iter().map(|s| s.to_string())) - .await - .unwrap(); + builder.add_nodes(nodes.into_iter().map(|s| s.to_string())); + builder.add_predicates(predicates.into_iter().map(|s| s.to_string())); + builder.add_values(values.into_iter().map(|s| s.to_string())); let mut builder = builder.into_phase2().await.unwrap(); builder.add_triple(1, 1, 2).await.unwrap(); diff --git a/src/layer/internal/subject_iterator.rs b/src/layer/internal/subject_iterator.rs index a02be96d..db54274f 100644 --- a/src/layer/internal/subject_iterator.rs +++ b/src/layer/internal/subject_iterator.rs @@ -494,18 +494,9 @@ mod tests { let predicates = vec!["abcde", "fghij", "klmno", "lll"]; let values = vec!["chicken", "cow", "dog", "pig", "zebra"]; - builder - .add_nodes(nodes.into_iter().map(|s| s.to_string())) - .await - .unwrap(); - builder - .add_predicates(predicates.into_iter().map(|s| s.to_string())) - .await - .unwrap(); - builder - .add_values(values.into_iter().map(|s| s.to_string())) - .await - .unwrap(); + builder.add_nodes(nodes.into_iter().map(|s| s.to_string())); + builder.add_predicates(predicates.into_iter().map(|s| s.to_string())); + builder.add_values(values.into_iter().map(|s| s.to_string())); let mut builder = builder.into_phase2().await.unwrap(); builder.add_triple(1, 1, 1).await.unwrap(); builder.add_triple(3, 2, 5).await.unwrap(); @@ -536,18 +527,9 @@ mod tests { let predicates = vec!["abcde", "fghij", "klmno", "lll"]; let values = vec!["chicken", "cow", "dog", "pig", "zebra"]; - builder - .add_nodes(nodes.into_iter().map(|s| s.to_string())) - .await - .unwrap(); - builder - .add_predicates(predicates.into_iter().map(|s| s.to_string())) - .await - .unwrap(); - builder - .add_values(values.into_iter().map(|s| s.to_string())) - .await - .unwrap(); + builder.add_nodes(nodes.into_iter().map(|s| s.to_string())); + builder.add_predicates(predicates.into_iter().map(|s| s.to_string())); + builder.add_values(values.into_iter().map(|s| s.to_string())); let mut builder = builder.into_phase2().await.unwrap(); builder.add_triple(1, 1, 1).await.unwrap(); builder.add_triple(3, 2, 5).await.unwrap(); @@ -621,18 +603,9 @@ mod tests { let predicates = vec!["abcde", "fghij", "klmno", "lll"]; let values = vec!["chicken", "cow", "dog", "pig", "zebra"]; - builder - .add_nodes(nodes.into_iter().map(|s| s.to_string())) - .await - .unwrap(); - builder - .add_predicates(predicates.into_iter().map(|s| s.to_string())) - .await - .unwrap(); - builder - .add_values(values.into_iter().map(|s| s.to_string())) - .await - .unwrap(); + builder.add_nodes(nodes.into_iter().map(|s| s.to_string())); + builder.add_predicates(predicates.into_iter().map(|s| s.to_string())); + builder.add_values(values.into_iter().map(|s| 
s.to_string())); let mut builder = builder.into_phase2().await.unwrap(); builder.add_triple(3, 2, 5).await.unwrap(); builder.add_triple(3, 3, 5).await.unwrap(); @@ -663,18 +636,9 @@ mod tests { let predicates = vec!["abcde", "fghij", "klmno", "lll", "xyz", "yyy"]; let values = vec!["chicken", "cow", "dog", "pig", "zebra"]; - builder - .add_nodes(nodes.into_iter().map(|s| s.to_string())) - .await - .unwrap(); - builder - .add_predicates(predicates.into_iter().map(|s| s.to_string())) - .await - .unwrap(); - builder - .add_values(values.into_iter().map(|s| s.to_string())) - .await - .unwrap(); + builder.add_nodes(nodes.into_iter().map(|s| s.to_string())); + builder.add_predicates(predicates.into_iter().map(|s| s.to_string())); + builder.add_values(values.into_iter().map(|s| s.to_string())); let mut builder = builder.into_phase2().await.unwrap(); builder.add_triple(1, 1, 1).await.unwrap(); builder.add_triple(3, 2, 4).await.unwrap(); diff --git a/src/storage/memory.rs b/src/storage/memory.rs index 4d4d2354..0928e6b9 100644 --- a/src/storage/memory.rs +++ b/src/storage/memory.rs @@ -326,7 +326,9 @@ pub fn base_layer_memory_files() -> BaseLayerFiles { blocks_file: MemoryBackedStore::new(), offsets_file: MemoryBackedStore::new(), }, - value_dictionary_files: DictionaryFiles { + value_dictionary_files: TypedDictionaryFiles { + types_present_file: MemoryBackedStore::new(), + type_offsets_file: MemoryBackedStore::new(), blocks_file: MemoryBackedStore::new(), offsets_file: MemoryBackedStore::new(), }, @@ -390,7 +392,9 @@ pub fn child_layer_memory_files() -> ChildLayerFiles { blocks_file: MemoryBackedStore::new(), offsets_file: MemoryBackedStore::new(), }, - value_dictionary_files: DictionaryFiles { + value_dictionary_files: TypedDictionaryFiles { + types_present_file: MemoryBackedStore::new(), + type_offsets_file: MemoryBackedStore::new(), blocks_file: MemoryBackedStore::new(), offsets_file: MemoryBackedStore::new(), }, diff --git a/src/structure/tfc/dict.rs b/src/structure/tfc/dict.rs index 9e2707ff..4ea1b6c9 100644 --- a/src/structure/tfc/dict.rs +++ b/src/structure/tfc/dict.rs @@ -167,7 +167,7 @@ impl SizedDict { pub fn block_num_elements(&self, block_index: usize) -> u8 { let offset = self.block_offset(block_index); - self.data[offset] + parse_block_control_records(self.data[offset]) } pub fn num_blocks(&self) -> usize { @@ -176,6 +176,8 @@ impl SizedDict { pub fn entry(&self, index: usize) -> Option { if index > self.num_entries() { + dbg!(index); + dbg!(self.num_entries()); return None; } let block = self.block(((index - 1) / 8) as usize); @@ -255,7 +257,7 @@ impl SizedDict { let num_blocks = self.num_blocks(); let last_block_size = self.block_num_elements(num_blocks - 1); - (num_blocks-1) * BLOCK_SIZE + last_block_size as usize + (num_blocks - 1) * BLOCK_SIZE + last_block_size as usize } } @@ -366,7 +368,7 @@ mod tests { assert_eq!(6, block1.num_entries()); for (ix, s) in strings.into_iter().enumerate() { - assert_eq!(s, &dict.entry((ix + 1) as u64).to_bytes()[..]); + assert_eq!(s, &dict.entry(ix + 1).unwrap().to_bytes()[..]); } } @@ -415,7 +417,7 @@ mod tests { assert_eq!(6, block1.num_entries()); for (ix, s) in strings.into_iter().enumerate() { - assert_eq!(s, &dict.entry((ix + 1) as u64).to_bytes()[..]); + assert_eq!(s, &dict.entry(ix + 1).unwrap().to_bytes()[..]); } } diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index 1bdbdc58..850d3d6a 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -161,13 +161,16 @@ impl TypedDict { pub fn entry(&self, 
id: usize) -> Option<(Datatype, SizedDictEntry)> {
         if id > self.num_entries() {
+            dbg!(self.num_entries());
             return None;
         }
         let type_index = self.type_index_for_id(id as u64);
         let (dict, offset) = self.inner_type_segment(type_index);
+        dbg!(offset);
+        dbg!(type_index);
         let dt = self.type_for_type_index(type_index);
 
-        dict.entry(id - offset as usize).map(|e| (dt, e))
+        dbg!(dict.entry(id - offset as usize).map(|e| (dt, e)))
     }
 
     pub fn num_entries(&self) -> usize {
@@ -258,7 +261,7 @@ impl<T: TdbDataType> TypedDictSegment<T> {
     pub fn get(&self, index: usize) -> Option<T> {
         let entry = self.dict.entry(index);
-        entry.map(|e|T::from_lexical(e.into_buf()))
+        entry.map(|e| T::from_lexical(e.into_buf()))
     }
 
     pub fn id<Q: ToLexical<T>>(&self, val: &Q) -> IdLookupResult {
@@ -270,11 +273,11 @@ impl<T: TdbDataType> TypedDictSegment<T> {
         self.dict.num_entries()
     }
 
-    pub fn iter<'a>(&'a self) -> impl Iterator<Item = SizedDictEntry>+'a+Clone {
+    pub fn iter<'a>(&'a self) -> impl Iterator<Item = SizedDictEntry> + 'a + Clone {
         self.dict.iter()
     }
 
-    pub fn into_iter(self) -> impl Iterator<Item = SizedDictEntry>+Clone {
+    pub fn into_iter(self) -> impl Iterator<Item = SizedDictEntry> + Clone {
         self.dict.into_iter()
     }
 }
@@ -323,21 +326,25 @@ pub trait TdbDataType {
     fn from_lexical<B: Buf>(b: B) -> Self;
 
     fn to_lexical<T>(val: &T) -> Bytes
-    where T: ToLexical + ?Sized {
+    where
+        T: ToLexical<Self> + ?Sized,
+    {
         val.to_lexical()
     }
 
     fn make_entry<T>(val: &T) -> (Datatype, Bytes)
-    where T: ToLexical + ?Sized{
+    where
+        T: ToLexical<Self> + ?Sized,
+    {
         (Self::datatype(), val.to_lexical())
     }
 }
 
-pub trait ToLexical {
+pub trait ToLexical<T> {
     fn to_lexical(&self) -> Bytes;
 }
 
-impl<T: AsRef<str>> ToLexical for T {
+impl<T: AsRef<str>> ToLexical<String> for T {
     fn to_lexical(&self) -> Bytes {
         Bytes::copy_from_slice(self.as_ref().as_bytes())
     }
@@ -605,12 +612,7 @@ pub struct TypedDictBufBuilder<B1: BufMut, B2: BufMut, B3: BufMut, B4: BufMut> {
 }
 
 impl<B1: BufMut, B2: BufMut, B3: BufMut, B4: BufMut> TypedDictBufBuilder<B1, B2, B3, B4> {
-    pub fn new(
-        used_types: B1,
-        type_offsets: B2,
-        block_offsets: B3,
-        data_buf: B4,
-    ) -> Self {
+    pub fn new(used_types: B1, type_offsets: B2, block_offsets: B3, data_buf: B4) -> Self {
         let types_present_builder = LateLogArrayBufBuilder::new(used_types);
         let type_offsets_builder = LateLogArrayBufBuilder::new(type_offsets);
         let block_offset_builder = LateLogArrayBufBuilder::new(block_offsets);
@@ -680,7 +682,12 @@ impl<B1: BufMut, B2: BufMut, B3: BufMut, B4: BufMut> TypedDictBufBuilder<B1, B2, B3, B4>
-    fn build_segment_and_offsets<B1: BufMut, B2: BufMut, T: TdbDataType, Q: ToLexical<T>, I: Iterator<Item = Q>>(
+    fn build_segment_and_offsets<
+        B1: BufMut,
+        B2: BufMut,
+        T: TdbDataType,
+        Q: ToLexical<T>,
+        I: Iterator<Item = Q>,
+    >(
         dt: Datatype,
         array_buf: &mut B1,
         data_buf: &mut B2,
@@ -737,7 +750,7 @@ mod tests {
 
         for (ix, s) in strings.into_iter().enumerate() {
             assert_eq!(IdLookupResult::Found((ix + 1) as u64), segment.id(&s));
-            assert_eq!(s, segment.get((ix + 1) as u64));
+            assert_eq!(s, segment.get(ix + 1).unwrap());
         }
     }
 
@@ -761,7 +774,7 @@ mod tests {
 
         for (ix, s) in nums.into_iter().enumerate() {
             assert_eq!(IdLookupResult::Found((ix + 1) as u64), segment.id(&s));
-            assert_eq!(s, segment.get((ix + 1) as u64));
+            assert_eq!(s, segment.get(ix + 1).unwrap());
         }
     }
 
@@ -902,11 +915,14 @@ mod tests {
         assert_eq!(IdLookupResult::Found(7), dict.id(&(-500_i32)));
 
         for i in 1..vec.len() + 1 {
-            let (t, s) = dict.entry(i as u64);
+            let (t, s) = dict.entry(i).unwrap();
             assert_eq!(vec[i - 1], (t, s.0.into_iter().flatten().collect()));
         }
 
-        assert_eq!(Decimal("-12342343.2348973".to_string()), dict.get(11));
+        assert_eq!(
+            Decimal("-12342343.2348973".to_string()),
+            dict.get(11).unwrap()
+        );
     }
 
     #[test]
@@ -967,14 +983,14 @@ mod tests {
         assert_eq!(31, dict.num_entries());
 
         for i in 1..vec.len() + 1 {
-            let (t, s) = dict.entry(i as u64);
+            let (t, s) = dict.entry(i).unwrap();
             assert_eq!(vec[i - 1], (t, s.0.into_iter().flatten().collect()));
         }
 
-        assert_eq!("Batman".to_string(), dict.get::<String>(1));
-        assert_eq!("fdsa".to_string(), dict.get::<String>(7));
-        assert_eq!(26_u32, dict.get::<u32>(14));
-        assert_eq!(Decimal("234.8973".to_string()), dict.get(29));
+        assert_eq!("Batman".to_string(), dict.get::<String>(1).unwrap());
+        assert_eq!("fdsa".to_string(), dict.get::<String>(7).unwrap());
+        assert_eq!(26_u32, dict.get::<u32>(14).unwrap());
+        assert_eq!(Decimal("234.8973".to_string()), dict.get(29).unwrap());
 
         assert_eq!(IdLookupResult::NotFound, dict.id(&"AAAA".to_string()));
         assert_eq!(IdLookupResult::Closest(2), dict.id(&"Baz".to_string()));
@@ -1114,9 +1130,9 @@ mod tests {
         let data_buf = BytesMut::new();
 
         let mut typed_builder = TypedDictBufBuilder::new(
-            &mut used_types_buf,
-            &mut type_offsets_buf,
-            &mut block_offsets_buf,
+            used_types_buf,
+            type_offsets_buf,
+            block_offsets_buf,
             data_buf,
         );
 
@@ -1126,17 +1142,17 @@ mod tests {
             .map(|(dt, entry)| typed_builder.add(dt, entry))
             .collect();
 
-        let data_buf = typed_builder.finalize();
-
-        let used_types = used_types_buf.freeze();
-        let type_offsets = type_offsets_buf.freeze();
-        let block_offsets = block_offsets_buf.freeze();
-        let data = data_buf.freeze();
+        let (used_types, type_offsets, block_offsets, data) = typed_builder.finalize();
 
-        let dict = TypedDict::from_parts(used_types, type_offsets, block_offsets, data);
+        let dict = TypedDict::from_parts(
+            used_types.freeze(),
+            type_offsets.freeze(),
+            block_offsets.freeze(),
+            data.freeze(),
+        );
 
         for i in 0..vec.len() {
-            assert_eq!(vec[i], convert_entry(dict.entry(i as u64 + 1)))
+            assert_eq!(vec[i], convert_entry(dict.entry(i + 1).unwrap()))
        }
    }
}

From c802006c2de65dc7b202e6095e99c7bab523e0c4 Mon Sep 17 00:00:00 2001
From: Gavin Mendel-Gleason
Date: Wed, 30 Nov 2022 16:58:12 +0100
Subject: [PATCH 49/99] No warnings

---
 src/storage/file.rs        | 42 ++++++++++++++++++--------
 src/structure/tfc/file.rs  | 61 +++++++++++++++++---------------------
 src/structure/tfc/typed.rs |  8 ++---
 3 files changed, 61 insertions(+), 50 deletions(-)

diff --git a/src/storage/file.rs b/src/storage/file.rs
index 02ce1104..29e2c337 100644
--- a/src/storage/file.rs
+++ b/src/storage/file.rs
@@ -2,7 +2,7 @@
 
 use std::io;
 
-use bytes::{Bytes, Buf};
+use bytes::{Buf, Bytes};
 use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
 
 use async_trait::async_trait;
@@ -291,22 +291,36 @@ impl<F: 'static + FileLoad + FileStore> TypedDictionaryFiles<F> {
         })
     }
 
-    pub async fn write_all_from_bufs<B1: Buf, B2: Buf, B3: Buf, B4: Buf>(&self, types_present_buf: &mut B1, type_offsets_buf: &mut B2, blocks_buf: &mut B3, offsets_buf: &mut B4) -> io::Result<()> {
+    pub async fn write_all_from_bufs<B1: Buf, B2: Buf, B3: Buf, B4: Buf>(
+        &self,
+        types_present_buf: &mut B1,
+        type_offsets_buf: &mut B2,
+        blocks_buf: &mut B3,
+        offsets_buf: &mut B4,
+    ) -> io::Result<()> {
         let mut types_present_writer = self.types_present_file.open_write().await?;
         let mut type_offsets_writer = self.type_offsets_file.open_write().await?;
         let mut blocks_writer = self.blocks_file.open_write().await?;
         let mut offsets_writer = self.offsets_file.open_write().await?;
 
-        types_present_writer.write_all_buf(types_present_buf).await?;
+        types_present_writer
+            .write_all_buf(types_present_buf)
+            .await?;
         type_offsets_writer.write_all_buf(type_offsets_buf).await?;
         blocks_writer.write_all_buf(blocks_buf).await?;
         offsets_writer.write_all_buf(offsets_buf).await?;
 
-        blocks_writer.flush();
-        blocks_writer.sync_all();
+        types_present_writer.flush().await?;
+        types_present_writer.sync_all().await?;
 
-        offsets_writer.flush();
-        offsets_writer.sync_all();
+        type_offsets_writer.flush().await?;
+        type_offsets_writer.sync_all().await?;
+
+        blocks_writer.flush().await?;
+        blocks_writer.sync_all().await?;
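+        // Note: flush() and sync_all() return futures that do nothing
+        // until awaited. The un-awaited calls removed above were the
+        // unused-future warnings this commit clears, and they also meant
+        // the dictionary buffers were never guaranteed to reach disk.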
+ offsets_writer.flush().await?; + offsets_writer.sync_all().await?; Ok(()) } @@ -336,18 +350,22 @@ impl DictionaryFiles { }) } - pub async fn write_all_from_bufs(&self, blocks_buf: &mut B1, offsets_buf: &mut B2) -> io::Result<()> { + pub async fn write_all_from_bufs( + &self, + blocks_buf: &mut B1, + offsets_buf: &mut B2, + ) -> io::Result<()> { let mut blocks_writer = self.blocks_file.open_write().await?; let mut offsets_writer = self.offsets_file.open_write().await?; blocks_writer.write_all_buf(blocks_buf).await?; offsets_writer.write_all_buf(offsets_buf).await?; - blocks_writer.flush(); - blocks_writer.sync_all(); + blocks_writer.flush().await?; + blocks_writer.sync_all().await?; - offsets_writer.flush(); - offsets_writer.sync_all(); + offsets_writer.flush().await?; + offsets_writer.sync_all().await?; Ok(()) } diff --git a/src/structure/tfc/file.rs b/src/structure/tfc/file.rs index 93ea3185..9b2e5528 100644 --- a/src/structure/tfc/file.rs +++ b/src/structure/tfc/file.rs @@ -1,39 +1,23 @@ use bytes::BytesMut; -use tokio::io::AsyncWriteExt; use std::io; +use tokio::io::AsyncWriteExt; use crate::{storage::*, structure::util::sorted_iterator}; -use super::{*, dict::{build_dict_unchecked, build_offset_logarray}}; - -pub struct StringDictFileBuilder { - /// the file that this builder writes the pfc blocks to - blocks_file: W, - /// the file that this builder writes the block offsets to - block_offsets_file: W, - - strings: Vec, -} - -impl StringDictFileBuilder { - pub fn new(blocks_file: W, block_offsets_file: W) -> Self { - Self { - blocks_file, - block_offsets_file, - strings: Vec::new() - } - } -} +use super::{ + dict::{build_dict_unchecked, build_offset_logarray}, + *, +}; pub async fn merge_string_dictionaries< - 'a, + 'a, F: 'static + FileLoad + FileStore, - I: Iterator+'a, + I: Iterator + 'a, >( dictionaries: I, dict_files: DictionaryFiles, ) -> io::Result<()> { - let iterators: Vec<_> = dictionaries.map(|d|d.iter()).collect(); + let iterators: Vec<_> = dictionaries.map(|d| d.iter()).collect(); let pick_fn = |vals: &[Option<&SizedDictEntry>]| { vals.iter() @@ -43,7 +27,7 @@ pub async fn merge_string_dictionaries< .map(|(ix, _)| ix) }; - let sorted_iterator = sorted_iterator(iterators, pick_fn).map(|elt|elt.to_bytes()); + let sorted_iterator = sorted_iterator(iterators, pick_fn).map(|elt| elt.to_bytes()); let mut blocks_file_writer = dict_files.blocks_file.open_write().await?; let mut offsets_file_writer = dict_files.offsets_file.open_write().await?; @@ -54,7 +38,6 @@ pub async fn merge_string_dictionaries< build_dict_unchecked(None, 0, &mut offsets, &mut data_buf, sorted_iterator); build_offset_logarray(&mut offsets_buf, offsets); - blocks_file_writer.write_all(data_buf.as_ref()).await?; blocks_file_writer.flush().await?; blocks_file_writer.sync_all().await?; @@ -66,14 +49,14 @@ pub async fn merge_string_dictionaries< } pub async fn merge_typed_dictionaries< - 'a, + 'a, F: 'static + FileLoad + FileStore, - I: Iterator+'a, + I: Iterator + 'a, >( dictionaries: I, dict_files: TypedDictionaryFiles, ) -> io::Result<()> { - let iterators: Vec<_> = dictionaries.map(|d|d.iter()).collect(); + let iterators: Vec<_> = dictionaries.map(|d| d.iter()).collect(); let pick_fn = |vals: &[Option<&(Datatype, SizedDictEntry)>]| { vals.iter() @@ -83,7 +66,7 @@ pub async fn merge_typed_dictionaries< .map(|(ix, _)| ix) }; - let sorted_iterator = sorted_iterator(iterators, pick_fn).map(|(dt, elt)|(dt,elt.to_bytes())); + let sorted_iterator = sorted_iterator(iterators, pick_fn).map(|(dt, elt)| (dt, 
elt.to_bytes())); let mut types_present_file_writer = dict_files.types_present_file.open_write().await?; let mut type_offsets_file_writer = dict_files.type_offsets_file.open_write().await?; @@ -94,13 +77,23 @@ pub async fn merge_typed_dictionaries< let mut type_offsets_buf = BytesMut::new(); let mut offsets_buf = BytesMut::new(); let mut data_buf = BytesMut::new(); - build_multiple_segments(&mut types_present_buf, &mut type_offsets_buf, &mut offsets_buf, &mut data_buf, sorted_iterator); - - types_present_file_writer.write_all(types_present_buf.as_ref()).await?; + build_multiple_segments( + &mut types_present_buf, + &mut type_offsets_buf, + &mut offsets_buf, + &mut data_buf, + sorted_iterator, + ); + + types_present_file_writer + .write_all(types_present_buf.as_ref()) + .await?; types_present_file_writer.flush().await?; types_present_file_writer.sync_all().await?; - type_offsets_file_writer.write_all(type_offsets_buf.as_ref()).await?; + type_offsets_file_writer + .write_all(type_offsets_buf.as_ref()) + .await?; type_offsets_file_writer.flush().await?; type_offsets_file_writer.sync_all().await?; diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index 850d3d6a..5377e7d0 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -1124,9 +1124,9 @@ mod tests { ]; vec.sort(); - let mut used_types_buf = BytesMut::new(); - let mut type_offsets_buf = BytesMut::new(); - let mut block_offsets_buf = BytesMut::new(); + let used_types_buf = BytesMut::new(); + let type_offsets_buf = BytesMut::new(); + let block_offsets_buf = BytesMut::new(); let data_buf = BytesMut::new(); let mut typed_builder = TypedDictBufBuilder::new( @@ -1136,7 +1136,7 @@ mod tests { data_buf, ); - let results: Vec = vec + let _results: Vec = vec .clone() .into_iter() .map(|(dt, entry)| typed_builder.add(dt, entry)) From 4837d73186c5ab9a203ac1dbcd936ef1d75f9d19 Mon Sep 17 00:00:00 2001 From: Gavin Mendel-Gleason Date: Thu, 1 Dec 2022 15:49:45 +0100 Subject: [PATCH 50/99] WIP: To avoid data loss, checking in this debugging code --- src/layer/simple_builder.rs | 7 ++--- src/storage/layer.rs | 28 ++++++++++++----- src/structure/logarray.rs | 31 ++++++++++++++----- src/structure/tfc/typed.rs | 60 +++++++++++++++++++++++++++++++------ src/structure/util.rs | 19 ++++++------ 5 files changed, 108 insertions(+), 37 deletions(-) diff --git a/src/layer/simple_builder.rs b/src/layer/simple_builder.rs index 8eaaf4fb..8ae41b05 100644 --- a/src/layer/simple_builder.rs +++ b/src/layer/simple_builder.rs @@ -62,6 +62,7 @@ pub struct SimpleLayerBuilder { impl SimpleLayerBuilder { /// Construct a layer builder for a base layer pub fn new(name: [u32; 5], files: BaseLayerFiles) -> Self { + eprintln!("Trying to make a new layer file"); Self { name, parent: None, @@ -194,8 +195,7 @@ impl LayerBuilder for SimpleLayerBuil ChildLayerFileBuilder::from_files(parent.clone(), &files).await?; let node_ids = builder.add_nodes(unresolved_nodes.clone()); - let predicate_ids = builder - .add_predicates(unresolved_predicates.clone()); + let predicate_ids = builder.add_predicates(unresolved_predicates.clone()); let value_ids = builder.add_values(unresolved_values.clone()); let mut builder = builder.into_phase2().await?; @@ -240,8 +240,7 @@ impl LayerBuilder for SimpleLayerBuil let mut builder = BaseLayerFileBuilder::from_files(&files).await?; let node_ids = builder.add_nodes(unresolved_nodes.clone()); - let predicate_ids = builder - .add_predicates(unresolved_predicates.clone()); + let predicate_ids = 
builder.add_predicates(unresolved_predicates.clone()); let value_ids = builder.add_values(unresolved_values.clone()); let mut builder = builder.into_phase2().await?; diff --git a/src/storage/layer.rs b/src/storage/layer.rs index 29af5973..bac60e72 100644 --- a/src/storage/layer.rs +++ b/src/storage/layer.rs @@ -10,13 +10,12 @@ use crate::layer::{ OptInternalLayerTriplePredicateIterator, OptInternalLayerTripleSubjectIterator, RollupLayer, SimpleLayerBuilder, }; -use crate::structure::StringDict; -use crate::structure::TypedDict; use crate::structure::bitarray::bitarray_len_from_file; use crate::structure::logarray::logarray_file_get_length_and_width; +use crate::structure::StringDict; +use crate::structure::TypedDict; use crate::structure::{ - dict_file_get_count, util, AdjacencyList, BitIndex, LogArray, MonotonicLogArray, - WaveletTree, + dict_file_get_count, util, AdjacencyList, BitIndex, LogArray, MonotonicLogArray, WaveletTree, }; use std::convert::TryInto; @@ -1569,7 +1568,11 @@ impl, )> { let mut builder = store.create_base_layer().await?; - let name = builder.name(); + let name = dbg!(builder.name()); for t in BASE_TRIPLES.iter() { builder.add_string_triple(t.clone()); } diff --git a/src/structure/logarray.rs b/src/structure/logarray.rs index d9ba9ca9..d71cde75 100644 --- a/src/structure/logarray.rs +++ b/src/structure/logarray.rs @@ -296,6 +296,8 @@ impl LogArray { /// /// Panics if `index` + `length` is >= the length of the log array. pub fn slice(&self, offset: usize, len: usize) -> LogArray { + dbg!(len); + dbg!(offset); let offset = u32::try_from(offset) .unwrap_or_else(|_| panic!("expected 32-bit slice offset ({})", offset)); let len = @@ -352,8 +354,10 @@ impl<'a, B: BufMut> LogArrayBufBuilder<'a, B> { } pub fn push(&mut self, val: u64) { + eprintln!("push"); + dbg!(val); // This is the minimum number of leading zeros that a decoded value should have. - let leading_zeros = 64 - self.width; + let leading_zeros = u64::BITS - self.width as u32; // If `val` does not fit in the `width`, return an error. if val.leading_zeros() < u32::from(leading_zeros) { @@ -404,7 +408,7 @@ impl<'a, B: BufMut> LogArrayBufBuilder<'a, B> { pub fn finalize(mut self) { let len = self.count; - let width = self.width; + let width = dbg!(self.width); // Write the final data word. 
self.finalize_data(); @@ -422,7 +426,7 @@ pub struct LateLogArrayBufBuilder { buf: B, /// NOTE: remove pub pub vals: Vec, - width: u8 + width: u8, } impl LateLogArrayBufBuilder { @@ -430,7 +434,7 @@ impl LateLogArrayBufBuilder { Self { buf, vals: Vec::new(), - width: 0 + width: 0, } } @@ -440,7 +444,7 @@ impl LateLogArrayBufBuilder { pub fn push(&mut self, val: u64) { self.vals.push(val); - let width = calculate_width(val); + let width = dbg!(calculate_width(val)); if self.width < width { self.width = width; } @@ -461,10 +465,13 @@ impl LateLogArrayBufBuilder { } pub fn finalize(mut self) -> B { + /*if self.width == 0 { + self.width = 1 + }*/ let mut builder = LogArrayBufBuilder::new(&mut self.buf, self.width); - builder.push_vec(self.vals); + builder.push_vec(dbg!(self.vals)); builder.finalize(); - + eprintln!("Finalized logarray"); self.buf } } @@ -930,6 +937,16 @@ mod tests { assert!(MonotonicLogArray::from_logarray(logarray).is_empty()); } + #[test] + pub fn late_logarray_just_zero() { + let buf = BytesMut::new(); + let mut builder = LateLogArrayBufBuilder::new(buf); + builder.push(0); + let logarray_buf = builder.finalize().freeze(); + let logarray = LogArray::parse(logarray_buf).unwrap(); + assert_eq!(logarray.entry(0_usize), 0_u64); + } + #[tokio::test] #[should_panic(expected = "expected value (8) to fit in 3 bits")] async fn log_array_file_builder_panic() { diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index 5377e7d0..52893a3b 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -56,11 +56,15 @@ impl TypedDict { tally += gap as u64; type_id_offsets.push((type_offset + 1) * 8 - tally); } - - let last_gap = BLOCK_SIZE - - parse_block_control_records( - data[block_offsets.entry(block_offsets.len() - 1) as usize], - ) as usize; + dbg!(block_offsets.len()); + let last_gap = if block_offsets.len() == 0 { + 1 + } else { + BLOCK_SIZE + - parse_block_control_records( + data[block_offsets.entry(block_offsets.len() - 1) as usize], + ) as usize + }; let num_entries = (block_offsets.len() + 1) * BLOCK_SIZE - tally as usize - last_gap; Self { @@ -88,6 +92,7 @@ impl TypedDict { let type_offset; let block_offset; let id_offset; + dbg!(i); if i == 0 { type_offset = 0; block_offset = 0; @@ -113,7 +118,7 @@ impl TypedDict { len = next_offset - type_offset - 1; } } - + dbg!(type_offset + 1); let logarray_slice = self.block_offsets.slice(type_offset + 1, len); let data_slice = self.data.slice(block_offset..); @@ -286,7 +291,7 @@ pub type StringDict = TypedDictSegment; #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, FromPrimitive)] pub enum Datatype { - String = 0, + String = 1, UInt32, Int32, UInt64, @@ -663,6 +668,7 @@ impl TypedDictBufBuilder u64 { + eprintln!("Adding entry: {dt:?},{e:?}"); self.add(dt, e.to_bytes()) } @@ -671,17 +677,21 @@ impl TypedDictBufBuilder (B1, B2, B3, B4) { + eprintln!("Finalizing now"); if self.current_datatype == None { panic!("There was nothing added to this dictionary!"); } let (mut block_offset_builder, data_buf, _, _) = self.sized_dict_buf_builder.unwrap().finalize(); + eprintln!("a"); block_offset_builder.pop(); let block_offsets_buf = block_offset_builder.finalize(); - + eprintln!("b"); + dbg!(&self.types_present_builder.vals); let types_present_buf = self.types_present_builder.finalize(); + eprintln!("c"); let type_offsets_buf = self.type_offsets_builder.finalize(); - + eprintln!("Finalized..."); ( types_present_buf, type_offsets_buf, @@ -1087,6 +1097,38 @@ mod tests { (e.0, e.1.to_bytes()) } + #[test] + 
fn test_one_string() { + let vec: Vec<(Datatype, Bytes)> = vec![String::make_entry(&"fdsa")]; + let used_types_buf = BytesMut::new(); + let type_offsets_buf = BytesMut::new(); + let block_offsets_buf = BytesMut::new(); + let data_buf = BytesMut::new(); + + let mut typed_builder = TypedDictBufBuilder::new( + used_types_buf, + type_offsets_buf, + block_offsets_buf, + data_buf, + ); + + let _results: Vec = vec + .clone() + .into_iter() + .map(|(dt, entry)| typed_builder.add(dt, entry)) + .collect(); + + let (used_types, type_offsets, block_offsets, data) = typed_builder.finalize(); + + let dict = TypedDict::from_parts( + used_types.freeze(), + type_offsets.freeze(), + block_offsets.freeze(), + data.freeze(), + ); + assert_eq!(vec[0], convert_entry(dict.entry(1).unwrap())) + } + #[test] fn test_incremental_builder() { let mut vec: Vec<(Datatype, Bytes)> = vec![ diff --git a/src/structure/util.rs b/src/structure/util.rs index d47f86d9..f349bb91 100644 --- a/src/structure/util.rs +++ b/src/structure/util.rs @@ -130,12 +130,8 @@ struct SortedIterator< pick_fn: F, } -impl< - 'a, - T, - I: 'a + Iterator + Send, - F: 'static + Fn(&[Option<&T>]) -> Option, - > Iterator for SortedIterator +impl<'a, T, I: 'a + Iterator + Send, F: 'static + Fn(&[Option<&T>]) -> Option> + Iterator for SortedIterator { type Item = T; @@ -155,14 +151,14 @@ impl< } pub fn sorted_iterator< - 'a, + 'a, T: 'a, I: 'a + Iterator + Send, F: 'static + Fn(&[Option<&T>]) -> Option, >( iters: Vec, pick_fn: F, -) -> impl Iterator+'a { +) -> impl Iterator + 'a { let peekable_iters = iters .into_iter() .map(std::iter::Iterator::peekable) @@ -187,7 +183,12 @@ pub fn assert_poll_next>(stream: Pin<&mut S>, cx: &mut Co } pub fn calculate_width(size: u64) -> u8 { - ((size + 1) as f32).log2().ceil() as u8 + let mut msb = u64::BITS - size.leading_zeros(); + // zero is a degenerate case, but needs to be represented with one bit. + if msb == 0 { + msb = 1 + }; + msb as u8 } #[cfg(test)] From 2fbd8602894477433aabb365d279f925355f1f49 Mon Sep 17 00:00:00 2001 From: Gavin Mendel-Gleason Date: Thu, 1 Dec 2022 17:27:20 +0100 Subject: [PATCH 51/99] Some debugging code. 
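Instruments the offset lookups in `SizedDict::block_offset` and
`TypedDict::inner_type_segment` with `dbg!` while chasing the block
offset arithmetic. For orientation, a rough sketch of the mapping
being traced (illustrative only; `locate` is not a real function in
this crate):

    // Entry ids are 1-based, and a block holds up to BLOCK_SIZE (8)
    // entries, so an id resolves to a block index and a slot within it:
    fn locate(index: usize) -> (usize, usize) {
        ((index - 1) / 8, (index - 1) % 8)
    }

    // Block 0 starts at data offset 0; block i starts at
    // offsets.entry(i - 1) - dict_offset, which is what block_offset
    // computes and what the dbg! output should confirm.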
--- src/structure/tfc/dict.rs | 2 +- src/structure/tfc/typed.rs | 22 ++++++++++++---------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/src/structure/tfc/dict.rs b/src/structure/tfc/dict.rs index 4ea1b6c9..1ac85dff 100644 --- a/src/structure/tfc/dict.rs +++ b/src/structure/tfc/dict.rs @@ -140,7 +140,7 @@ impl SizedDict { if block_index == 0 { offset = 0; } else { - offset = (self.offsets.entry(block_index - 1) - self.dict_offset) as usize; + offset = (dbg!(self.offsets.entry(block_index - 1)) - dbg!(self.dict_offset)) as usize; } offset diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index 52893a3b..eff19d1c 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -98,27 +98,30 @@ impl TypedDict { block_offset = 0; id_offset = 0; } else { - type_offset = self.type_offsets.entry(i - 1) as usize; - id_offset = self.type_id_offsets[i - 1]; - block_offset = self.block_offsets.entry(type_offset as usize) as usize; + type_offset = dbg!(self.type_offsets.entry(i - 1) as usize); + id_offset = dbg!(self.type_id_offsets[i - 1]); + block_offset = dbg!(self.block_offsets.entry(type_offset as usize) as usize); } let len; - if i == self.types_present.len() - 1 { + dbg!(&self.types_present); + dbg!(&self.type_id_offsets); + dbg!(&self.block_offsets); + if dbg!(i == dbg!(self.types_present.len()) - 1) { if i == 0 { len = self.block_offsets.len() - type_offset; } else { len = self.block_offsets.len() - type_offset - 1; } } else { - let next_offset = self.type_offsets.entry(i) as usize; + let next_offset = dbg!(self.type_offsets.entry(i) as usize); if i == 0 { len = next_offset - type_offset; } else { - len = next_offset - type_offset - 1; + len = dbg!(next_offset - type_offset - 1); } } - dbg!(type_offset + 1); + dbg!(type_offset); let logarray_slice = self.block_offsets.slice(type_offset + 1, len); let data_slice = self.data.slice(block_offset..); @@ -240,7 +243,6 @@ impl<'a> Iterator for DictSegmentIterator<'a> { if self.type_index >= self.dict.types_present.len() { return None; } - let (segment, _) = self.dict.inner_type_segment(self.type_index); let datatype = self.dict.type_for_type_index(self.type_index); self.type_index += 1; @@ -291,12 +293,12 @@ pub type StringDict = TypedDictSegment; #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, FromPrimitive)] pub enum Datatype { - String = 1, + String = 0, UInt32, Int32, + Float32, UInt64, Int64, - Float32, Float64, Decimal, BigInt, From 2c113eb2005858aebd7cbf014b3e478d0c7157a5 Mon Sep 17 00:00:00 2001 From: Gavin Mendel-Gleason Date: Thu, 1 Dec 2022 20:22:57 +0100 Subject: [PATCH 52/99] Add condition for empty slice --- src/structure/tfc/typed.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index eff19d1c..86c6ee6c 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -122,7 +122,13 @@ impl TypedDict { } } dbg!(type_offset); - let logarray_slice = self.block_offsets.slice(type_offset + 1, len); + let logarray_slice; + if len == 0 { + // any slice will do + logarray_slice = self.block_offsets.slice(0, 0); + } else { + logarray_slice = self.block_offsets.slice(type_offset + 1, len); + } let data_slice = self.data.slice(block_offset..); ( From 2b07fa6594696404a1d2f76da603b84cd90df499 Mon Sep 17 00:00:00 2001 From: Gavin Mendel-Gleason Date: Fri, 2 Dec 2022 17:53:46 +0100 Subject: [PATCH 53/99] Adding fixes for phase2 --- src/layer/internal/base.rs | 26 ++++++++--- 
src/structure/logarray.rs | 9 ++-- src/structure/tfc/block.rs | 10 ++--- src/structure/tfc/dict.rs | 4 +- src/structure/tfc/typed.rs | 89 +++++++++++++++++++++++++++----------- 5 files changed, 92 insertions(+), 46 deletions(-) diff --git a/src/layer/internal/base.rs b/src/layer/internal/base.rs index 14bea2cc..6d8e223a 100644 --- a/src/layer/internal/base.rs +++ b/src/layer/internal/base.rs @@ -256,17 +256,29 @@ impl BaseLayerFileBuilder { let predicate_dict_blocks_map = files.predicate_dictionary_files.blocks_file.map().await?; let predicate_dict_offsets_map = files.predicate_dictionary_files.offsets_file.map().await?; + let value_dict_types_present_map = files + .value_dictionary_files + .types_present_file + .map() + .await?; + let value_dict_type_offsets_map = + files.value_dictionary_files.type_offsets_file.map().await?; let value_dict_blocks_map = files.value_dictionary_files.blocks_file.map().await?; let value_dict_offsets_map = files.value_dictionary_files.offsets_file.map().await?; - let node_dict = PfcDict::parse(node_dict_blocks_map, node_dict_offsets_map)?; - let pred_dict = PfcDict::parse(predicate_dict_blocks_map, predicate_dict_offsets_map)?; - let val_dict = PfcDict::parse(value_dict_blocks_map, value_dict_offsets_map)?; + let node_dict = StringDict::parse(node_dict_blocks_map, node_dict_offsets_map, 0); + let pred_dict = StringDict::parse(predicate_dict_blocks_map, predicate_dict_offsets_map, 0); + let val_dict = TypedDict::from_parts( + value_dict_types_present_map, + value_dict_type_offsets_map, + value_dict_blocks_map, + value_dict_offsets_map, + ); // TODO: it is a bit silly to parse the dictionaries just for this. surely we can get the counts in an easier way? - let num_nodes = node_dict.len(); - let num_predicates = pred_dict.len(); - let num_values = val_dict.len(); + let num_nodes = node_dict.num_entries(); + let num_predicates = pred_dict.num_entries(); + let num_values = val_dict.num_entries(); BaseLayerFileBuilderPhase2::new(files, num_nodes, num_predicates, num_values).await } @@ -605,7 +617,7 @@ pub mod tests { let builder = builder.into_phase2().await.unwrap(); builder.finalize().await.unwrap(); - + eprintln!("Here"); let layer = BaseLayer::load_from_files([1, 2, 3, 4, 5], &base_layer_files) .await .unwrap(); diff --git a/src/structure/logarray.rs b/src/structure/logarray.rs index d71cde75..6429e459 100644 --- a/src/structure/logarray.rs +++ b/src/structure/logarray.rs @@ -354,8 +354,6 @@ impl<'a, B: BufMut> LogArrayBufBuilder<'a, B> { } pub fn push(&mut self, val: u64) { - eprintln!("push"); - dbg!(val); // This is the minimum number of leading zeros that a decoded value should have. let leading_zeros = u64::BITS - self.width as u32; @@ -408,7 +406,7 @@ impl<'a, B: BufMut> LogArrayBufBuilder<'a, B> { pub fn finalize(mut self) { let len = self.count; - let width = dbg!(self.width); + let width = self.width; // Write the final data word. 
self.finalize_data(); @@ -444,7 +442,7 @@ impl LateLogArrayBufBuilder { pub fn push(&mut self, val: u64) { self.vals.push(val); - let width = dbg!(calculate_width(val)); + let width = calculate_width(val); if self.width < width { self.width = width; } @@ -469,9 +467,8 @@ impl LateLogArrayBufBuilder { self.width = 1 }*/ let mut builder = LogArrayBufBuilder::new(&mut self.buf, self.width); - builder.push_vec(dbg!(self.vals)); + builder.push_vec(self.vals); builder.finalize(); - eprintln!("Finalized logarray"); self.buf } } diff --git a/src/structure/tfc/block.rs b/src/structure/tfc/block.rs index 07077ac4..7672a985 100644 --- a/src/structure/tfc/block.rs +++ b/src/structure/tfc/block.rs @@ -537,7 +537,7 @@ impl<'a> Iterator for SizedBlockIterator<'a> { if self.ix >= self.header.num_entries as usize - 1 { return None; } - let size = dbg!(self.header.sizes[self.ix]); + let size = self.header.sizes[self.ix]; let mut shared = self.header.shareds[self.ix]; for rope_index in 0..last.len() { let x = &mut last[rope_index]; @@ -588,18 +588,18 @@ impl IdLookupResult { } } - pub fn mapu64>(self, f: F) -> Self { + pub fn map u64>(self, f: F) -> Self { match self { Self::Found(i) => Self::Found(f(i)), Self::Closest(i) => Self::Closest(f(i)), - Self::NotFound => Self::NotFound + Self::NotFound => Self::NotFound, } } pub fn into_option(self) -> Option { match self { Self::Found(i) => Some(i), - _ => None + _ => None, } } } @@ -647,7 +647,7 @@ pub(crate) fn build_block_unchecked( let mut size = 0; let slices_len = slices.len(); debug_assert!(slices_len <= BLOCK_SIZE && slices_len != 0); - let cw = dbg!(create_block_control_word(record_size, slices_len as u8)); + let cw = create_block_control_word(record_size, slices_len as u8); buf.put_u8(cw as u8); size += 1; diff --git a/src/structure/tfc/dict.rs b/src/structure/tfc/dict.rs index 1ac85dff..542722e2 100644 --- a/src/structure/tfc/dict.rs +++ b/src/structure/tfc/dict.rs @@ -140,7 +140,7 @@ impl SizedDict { if block_index == 0 { offset = 0; } else { - offset = (dbg!(self.offsets.entry(block_index - 1)) - dbg!(self.dict_offset)) as usize; + offset = (self.offsets.entry(block_index - 1) - self.dict_offset) as usize; } offset @@ -176,8 +176,6 @@ impl SizedDict { pub fn entry(&self, index: usize) -> Option { if index > self.num_entries() { - dbg!(index); - dbg!(self.num_entries()); return None; } let block = self.block(((index - 1) / 8) as usize); diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index 86c6ee6c..40c3bfae 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -51,12 +51,12 @@ impl TypedDict { last_block_len = parse_block_control_records(data[last_block_offset_of_previous_type as usize]); } - eprintln!("last_block_len: {last_block_len}"); + let gap = BLOCK_SIZE as u8 - last_block_len; tally += gap as u64; type_id_offsets.push((type_offset + 1) * 8 - tally); } - dbg!(block_offsets.len()); + let last_gap = if block_offsets.len() == 0 { 1 } else { @@ -92,36 +92,33 @@ impl TypedDict { let type_offset; let block_offset; let id_offset; - dbg!(i); + if i == 0 { type_offset = 0; block_offset = 0; id_offset = 0; } else { - type_offset = dbg!(self.type_offsets.entry(i - 1) as usize); - id_offset = dbg!(self.type_id_offsets[i - 1]); - block_offset = dbg!(self.block_offsets.entry(type_offset as usize) as usize); + type_offset = self.type_offsets.entry(i - 1) as usize; + id_offset = self.type_id_offsets[i - 1]; + block_offset = self.block_offsets.entry(type_offset as usize) as usize; } let len; - 
dbg!(&self.types_present); - dbg!(&self.type_id_offsets); - dbg!(&self.block_offsets); - if dbg!(i == dbg!(self.types_present.len()) - 1) { + if i == self.types_present.len() - 1 { if i == 0 { len = self.block_offsets.len() - type_offset; } else { len = self.block_offsets.len() - type_offset - 1; } } else { - let next_offset = dbg!(self.type_offsets.entry(i) as usize); + let next_offset = self.type_offsets.entry(i) as usize; if i == 0 { len = next_offset - type_offset; } else { - len = dbg!(next_offset - type_offset - 1); + len = next_offset - type_offset - 1; } } - dbg!(type_offset); + let logarray_slice; if len == 0 { // any slice will do @@ -175,16 +172,13 @@ impl TypedDict { pub fn entry(&self, id: usize) -> Option<(Datatype, SizedDictEntry)> { if id > self.num_entries() { - dbg!(self.num_entries()); return None; } let type_index = self.type_index_for_id(id as u64); let (dict, offset) = self.inner_type_segment(type_index); - dbg!(offset); - dbg!(type_index); let dt = self.type_for_type_index(type_index); - dbg!(dict.entry(id - offset as usize).map(|e| (dt, e))) + dict.entry(id - offset as usize).map(|e| (dt, e)) } pub fn num_entries(&self) -> usize { @@ -265,6 +259,10 @@ pub struct TypedDictSegment { impl TypedDictSegment { pub fn parse(offsets: Bytes, data: Bytes, dict_offset: u64) -> Self { + let offsets2 = offsets.clone(); + let data2 = data.clone(); + dbg!(offsets2); + dbg!(data2); let dict = SizedDict::parse(offsets, data, dict_offset); Self { dict, @@ -676,7 +674,6 @@ impl TypedDictBufBuilder u64 { - eprintln!("Adding entry: {dt:?},{e:?}"); self.add(dt, e.to_bytes()) } @@ -685,21 +682,18 @@ impl TypedDictBufBuilder (B1, B2, B3, B4) { - eprintln!("Finalizing now"); + /* if self.current_datatype == None { panic!("There was nothing added to this dictionary!"); - } + }*/ let (mut block_offset_builder, data_buf, _, _) = self.sized_dict_buf_builder.unwrap().finalize(); - eprintln!("a"); + block_offset_builder.pop(); let block_offsets_buf = block_offset_builder.finalize(); - eprintln!("b"); - dbg!(&self.types_present_builder.vals); let types_present_buf = self.types_present_builder.finalize(); - eprintln!("c"); let type_offsets_buf = self.type_offsets_builder.finalize(); - eprintln!("Finalized..."); + ( types_present_buf, type_offsets_buf, @@ -1205,4 +1199,49 @@ mod tests { assert_eq!(vec[i], convert_entry(dict.entry(i + 1).unwrap())) } } + + #[test] + fn test_incremental_builder_small_dicts() { + let mut vec: Vec<(Datatype, Bytes)> = vec![ + String::make_entry(&"fdsa"), + i32::make_entry(&-500_i32), + u32::make_entry(&20_u32), + i64::make_entry(&-3_i64), + Decimal::make_entry(&Decimal("-12342343.2348973".to_string())), + f32::make_entry(&23434.389832_f32), + Integer::make_entry(&int("239487329872343987")), + ]; + vec.sort(); + + let used_types_buf = BytesMut::new(); + let type_offsets_buf = BytesMut::new(); + let block_offsets_buf = BytesMut::new(); + let data_buf = BytesMut::new(); + + let mut typed_builder = TypedDictBufBuilder::new( + used_types_buf, + type_offsets_buf, + block_offsets_buf, + data_buf, + ); + + let _results: Vec = vec + .clone() + .into_iter() + .map(|(dt, entry)| typed_builder.add(dt, entry)) + .collect(); + + let (used_types, type_offsets, block_offsets, data) = typed_builder.finalize(); + + let dict = TypedDict::from_parts( + used_types.freeze(), + type_offsets.freeze(), + block_offsets.freeze(), + data.freeze(), + ); + + for i in 0..vec.len() { + assert_eq!(vec[i], convert_entry(dict.entry(i + 1).unwrap())) + } + } } From 55f20d5e98c3595630a9b7015b1e31a516db7016 
Mon Sep 17 00:00:00 2001 From: Gavin Mendel-Gleason Date: Sat, 3 Dec 2022 00:45:43 +0100 Subject: [PATCH 54/99] 190 passing --- src/layer/builder.rs | 77 ++++++++++++++++++++++++++++--------- src/layer/internal/base.rs | 16 ++++---- src/layer/internal/child.rs | 42 ++++++++++++-------- src/storage/consts.rs | 4 +- src/storage/file.rs | 28 +++++++------- src/storage/layer.rs | 36 ++++++++--------- src/storage/memory.rs | 10 ++--- src/structure/tfc/block.rs | 2 +- src/structure/tfc/dict.rs | 24 +++++++++--- src/structure/tfc/file.rs | 15 ++++---- src/structure/tfc/typed.rs | 41 ++++++++++++++------ 11 files changed, 191 insertions(+), 104 deletions(-) diff --git a/src/layer/builder.rs b/src/layer/builder.rs index 822a3bc4..55df9d00 100644 --- a/src/layer/builder.rs +++ b/src/layer/builder.rs @@ -1,6 +1,6 @@ use std::io; -use bytes::{BytesMut, Bytes}; +use bytes::{Bytes, BytesMut}; use futures::stream::TryStreamExt; use rayon::prelude::*; use tfc::dict::SizedDictBufBuilder; @@ -25,12 +25,26 @@ impl DictionarySetFileBuilder { predicate_files: DictionaryFiles, value_files: TypedDictionaryFiles, ) -> io::Result { - let node_dictionary_builder = SizedDictBufBuilder::new(None, 0, 0, LateLogArrayBufBuilder::new(BytesMut::new()), BytesMut::new()); - let predicate_dictionary_builder = SizedDictBufBuilder::new(None, 0, 0, LateLogArrayBufBuilder::new(BytesMut::new()), BytesMut::new()); - let value_dictionary_builder = TypedDictBufBuilder::new(BytesMut::new(), - BytesMut::new(), - BytesMut::new(), - BytesMut::new()); + let node_dictionary_builder = SizedDictBufBuilder::new( + None, + 0, + 0, + LateLogArrayBufBuilder::new(BytesMut::new()), + BytesMut::new(), + ); + let predicate_dictionary_builder = SizedDictBufBuilder::new( + None, + 0, + 0, + LateLogArrayBufBuilder::new(BytesMut::new()), + BytesMut::new(), + ); + let value_dictionary_builder = TypedDictBufBuilder::new( + BytesMut::new(), + BytesMut::new(), + BytesMut::new(), + BytesMut::new(), + ); Ok(Self { node_files, @@ -46,7 +60,9 @@ impl DictionarySetFileBuilder { /// /// Panics if the given node string is not a lexical successor of the previous node string. pub fn add_node(&mut self, node: &str) -> u64 { - let id = self.node_dictionary_builder.add(Bytes::copy_from_slice(node.as_bytes())); + let id = self + .node_dictionary_builder + .add(Bytes::copy_from_slice(node.as_bytes())); id } @@ -55,7 +71,9 @@ impl DictionarySetFileBuilder { /// /// Panics if the given predicate string is not a lexical successor of the previous node string. pub fn add_predicate(&mut self, predicate: &str) -> u64 { - let id = self.predicate_dictionary_builder.add(Bytes::copy_from_slice(predicate.as_bytes())); + let id = self + .predicate_dictionary_builder + .add(Bytes::copy_from_slice(predicate.as_bytes())); id } @@ -64,8 +82,9 @@ impl DictionarySetFileBuilder { /// /// Panics if the given value string is not a lexical successor of the previous value string. 
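// Aside: a usage sketch of the typed builder constructed above. The four
// BytesMut buffers collect types-present, type-offsets, block-offsets and
// block data, in that order; ids appear to be handed out densely starting
// from 1, which is an inference from the round-trip tests in this series,
// not a documented guarantee.
use bytes::{Bytes, BytesMut};

fn first_value_id() -> u64 {
    let mut values = TypedDictBufBuilder::new(
        BytesMut::new(),
        BytesMut::new(),
        BytesMut::new(),
        BytesMut::new(),
    );
    values.add(Datatype::String, Bytes::copy_from_slice(b"quack"))
}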
pub fn add_value(&mut self, value: &str) -> u64 { - let id = self.value_dictionary_builder.add(Datatype::String, - Bytes::copy_from_slice(value.as_bytes())); + let id = self + .value_dictionary_builder + .add(Datatype::String, Bytes::copy_from_slice(value.as_bytes())); id } @@ -128,16 +147,38 @@ impl DictionarySetFileBuilder { } pub async fn finalize(self) -> io::Result<()> { - let (node_offsets_builder, mut node_data_buf, _, _) = self.node_dictionary_builder.finalize(); + let (mut node_offsets_builder, mut node_data_buf, _, _) = + self.node_dictionary_builder.finalize(); + // last offset is useless + node_offsets_builder.pop(); let mut node_offsets_buf = node_offsets_builder.finalize(); - let (predicate_offsets_builder, mut predicate_data_buf, _, _) = self.predicate_dictionary_builder.finalize(); + let (mut predicate_offsets_builder, mut predicate_data_buf, _, _) = + self.predicate_dictionary_builder.finalize(); + // last offset is useless + predicate_offsets_builder.pop(); let mut predicate_offsets_buf = predicate_offsets_builder.finalize(); - let (mut value_types_present_buf, mut value_type_offsets_buf, mut value_offsets_buf, mut value_data_buf) = self.value_dictionary_builder.finalize(); - - self.node_files.write_all_from_bufs(&mut node_data_buf, &mut node_offsets_buf).await?; - self.predicate_files.write_all_from_bufs(&mut predicate_data_buf, &mut predicate_offsets_buf).await?; + let ( + mut value_types_present_buf, + mut value_type_offsets_buf, + mut value_offsets_buf, + mut value_data_buf, + ) = self.value_dictionary_builder.finalize(); + + self.node_files + .write_all_from_bufs(&mut node_data_buf, &mut node_offsets_buf) + .await?; + self.predicate_files + .write_all_from_bufs(&mut predicate_data_buf, &mut predicate_offsets_buf) + .await?; - self.value_files.write_all_from_bufs(&mut value_types_present_buf, &mut value_type_offsets_buf, &mut value_offsets_buf, &mut value_data_buf).await?; + self.value_files + .write_all_from_bufs( + &mut value_types_present_buf, + &mut value_type_offsets_buf, + &mut value_offsets_buf, + &mut value_data_buf, + ) + .await?; Ok(()) } diff --git a/src/layer/internal/base.rs b/src/layer/internal/base.rs index 6d8e223a..4d6cbe69 100644 --- a/src/layer/internal/base.rs +++ b/src/layer/internal/base.rs @@ -49,21 +49,21 @@ impl BaseLayer { } pub fn load(name: [u32; 5], maps: BaseLayerMaps) -> InternalLayer { - let node_dictionary = TypedDictSegment::parse( - maps.node_dictionary_maps.blocks_map, + let node_dictionary = StringDict::parse( maps.node_dictionary_maps.offsets_map, + maps.node_dictionary_maps.blocks_map, 0, ); - let predicate_dictionary = TypedDictSegment::parse( - maps.predicate_dictionary_maps.blocks_map, + let predicate_dictionary = StringDict::parse( maps.predicate_dictionary_maps.offsets_map, + maps.predicate_dictionary_maps.blocks_map, 0, ); let value_dictionary = TypedDict::from_parts( maps.value_dictionary_maps.types_present_map, maps.value_dictionary_maps.type_offsets_map, - maps.value_dictionary_maps.blocks_map, maps.value_dictionary_maps.offsets_map, + maps.value_dictionary_maps.blocks_map, ); let node_value_idmap = match maps.id_map_maps.node_value_idmap_maps { @@ -266,13 +266,13 @@ impl BaseLayerFileBuilder { let value_dict_blocks_map = files.value_dictionary_files.blocks_file.map().await?; let value_dict_offsets_map = files.value_dictionary_files.offsets_file.map().await?; - let node_dict = StringDict::parse(node_dict_blocks_map, node_dict_offsets_map, 0); - let pred_dict = StringDict::parse(predicate_dict_blocks_map, 
predicate_dict_offsets_map, 0); + let node_dict = StringDict::parse(node_dict_offsets_map, node_dict_blocks_map, 0); + let pred_dict = StringDict::parse(predicate_dict_offsets_map, predicate_dict_blocks_map, 0); let val_dict = TypedDict::from_parts( value_dict_types_present_map, value_dict_type_offsets_map, - value_dict_blocks_map, value_dict_offsets_map, + value_dict_blocks_map, ); // TODO: it is a bit silly to parse the dictionaries just for this. surely we can get the counts in an easier way? diff --git a/src/layer/internal/child.rs b/src/layer/internal/child.rs index 978fe965..09744582 100644 --- a/src/layer/internal/child.rs +++ b/src/layer/internal/child.rs @@ -62,21 +62,21 @@ impl ChildLayer { } pub fn load(name: [u32; 5], parent: Arc, maps: ChildLayerMaps) -> InternalLayer { - let node_dictionary = TypedDictSegment::parse( - maps.node_dictionary_maps.blocks_map, + let node_dictionary = StringDict::parse( maps.node_dictionary_maps.offsets_map, + maps.node_dictionary_maps.blocks_map, 0, ); - let predicate_dictionary = TypedDictSegment::parse( - maps.predicate_dictionary_maps.blocks_map, + let predicate_dictionary = StringDict::parse( maps.predicate_dictionary_maps.offsets_map, + maps.predicate_dictionary_maps.blocks_map, 0, ); let value_dictionary = TypedDict::from_parts( maps.value_dictionary_maps.types_present_map, maps.value_dictionary_maps.type_offsets_map, - maps.value_dictionary_maps.blocks_map, maps.value_dictionary_maps.offsets_map, + maps.value_dictionary_maps.blocks_map, ); let parent_node_value_count = parent.node_and_value_count(); @@ -345,23 +345,35 @@ impl ChildLayerFileBuil } = self; builder.finalize().await?; - - let node_dict_blocks_map = files.node_dictionary_files.blocks_file.map().await?; + eprintln!("Into phase2"); let node_dict_offsets_map = files.node_dictionary_files.offsets_file.map().await?; - let predicate_dict_blocks_map = files.predicate_dictionary_files.blocks_file.map().await?; + let node_dict_blocks_map = files.node_dictionary_files.blocks_file.map().await?; let predicate_dict_offsets_map = files.predicate_dictionary_files.offsets_file.map().await?; - let value_dict_blocks_map = files.value_dictionary_files.blocks_file.map().await?; + let predicate_dict_blocks_map = files.predicate_dictionary_files.blocks_file.map().await?; + let value_dict_types_present_map = files + .value_dictionary_files + .types_present_file + .map() + .await?; + let value_dict_type_offsets_map = + files.value_dictionary_files.type_offsets_file.map().await?; let value_dict_offsets_map = files.value_dictionary_files.offsets_file.map().await?; + let value_dict_blocks_map = files.value_dictionary_files.blocks_file.map().await?; - let node_dict = PfcDict::parse(node_dict_blocks_map, node_dict_offsets_map)?; - let pred_dict = PfcDict::parse(predicate_dict_blocks_map, predicate_dict_offsets_map)?; - let val_dict = PfcDict::parse(value_dict_blocks_map, value_dict_offsets_map)?; + let node_dict = StringDict::parse(node_dict_offsets_map, node_dict_blocks_map, 0); + let pred_dict = StringDict::parse(predicate_dict_offsets_map, predicate_dict_blocks_map, 0); + let val_dict = TypedDict::from_parts( + value_dict_types_present_map, + value_dict_type_offsets_map, + value_dict_offsets_map, + value_dict_blocks_map, + ); // TODO: it is a bit silly to parse the dictionaries just for this. surely we can get the counts in an easier way? 
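// Aside: what the TODO above costs today, in numbers. num_entries() is
// reconstructed from block headers inside TypedDict::from_parts, so counting
// currently requires a parse. A worked instance of that arithmetic, with
// BLOCK_SIZE = 8: type A holds 3 entries (one block, gap 8 - 3 = 5) and
// type B holds 10 entries (blocks of 8 and 2 records, final gap 8 - 2 = 6).
// block_offsets then records 2 boundaries, tally = 5, B's ids start at
// type_id_offset = (0 + 1) * 8 - 5 = 3, and the total comes out right:
#[test]
fn worked_entry_count() {
    let (boundaries, tally, last_gap) = (2usize, 5usize, 6usize);
    assert_eq!((boundaries + 1) * BLOCK_SIZE - tally - last_gap, 3 + 10);
}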
- let num_nodes = node_dict.len(); - let num_predicates = pred_dict.len(); - let num_values = val_dict.len(); + let num_nodes = node_dict.num_entries(); + let num_predicates = pred_dict.num_entries(); + let num_values = val_dict.num_entries(); ChildLayerFileBuilderPhase2::new(parent, files, num_nodes, num_predicates, num_values).await } diff --git a/src/storage/consts.rs b/src/storage/consts.rs index de296ebb..6731d9f8 100644 --- a/src/storage/consts.rs +++ b/src/storage/consts.rs @@ -186,11 +186,13 @@ pub const FILENAMES: Filenames = Filenames { rollup: "rollup.hex", }; -pub const SHARED_REQUIRED_FILES: [&'static str; 6] = [ +pub const SHARED_REQUIRED_FILES: [&'static str; 8] = [ FILENAMES.node_dictionary_blocks, FILENAMES.node_dictionary_offsets, FILENAMES.predicate_dictionary_blocks, FILENAMES.predicate_dictionary_offsets, + FILENAMES.value_dictionary_types_present, + FILENAMES.value_dictionary_type_offsets, FILENAMES.value_dictionary_blocks, FILENAMES.value_dictionary_offsets, ]; diff --git a/src/storage/file.rs b/src/storage/file.rs index 29e2c337..89556ee0 100644 --- a/src/storage/file.rs +++ b/src/storage/file.rs @@ -280,14 +280,14 @@ impl TypedDictionaryFiles { pub async fn map_all(&self) -> io::Result { let types_present_map = self.types_present_file.map().await?; let type_offsets_map = self.type_offsets_file.map().await?; - let blocks_map = self.blocks_file.map().await?; let offsets_map = self.offsets_file.map().await?; + let blocks_map = self.blocks_file.map().await?; Ok(TypedDictionaryMaps { types_present_map, type_offsets_map, - blocks_map, offsets_map, + blocks_map, }) } @@ -295,20 +295,20 @@ impl TypedDictionaryFiles { &self, types_present_buf: &mut B1, type_offsets_buf: &mut B2, - blocks_buf: &mut B3, - offsets_buf: &mut B4, + offsets_buf: &mut B3, + blocks_buf: &mut B4, ) -> io::Result<()> { let mut types_present_writer = self.types_present_file.open_write().await?; let mut type_offsets_writer = self.type_offsets_file.open_write().await?; - let mut blocks_writer = self.blocks_file.open_write().await?; let mut offsets_writer = self.offsets_file.open_write().await?; + let mut blocks_writer = self.blocks_file.open_write().await?; types_present_writer .write_all_buf(types_present_buf) .await?; type_offsets_writer.write_all_buf(type_offsets_buf).await?; - blocks_writer.write_all_buf(blocks_buf).await?; offsets_writer.write_all_buf(offsets_buf).await?; + blocks_writer.write_all_buf(blocks_buf).await?; types_present_writer.flush().await?; types_present_writer.sync_all().await?; @@ -316,12 +316,12 @@ impl TypedDictionaryFiles { type_offsets_writer.flush().await?; type_offsets_writer.sync_all().await?; - blocks_writer.flush().await?; - blocks_writer.sync_all().await?; - offsets_writer.flush().await?; offsets_writer.sync_all().await?; + blocks_writer.flush().await?; + blocks_writer.sync_all().await?; + Ok(()) } } @@ -345,8 +345,8 @@ impl DictionaryFiles { let offsets_map = self.offsets_file.map().await?; Ok(DictionaryMaps { - blocks_map, offsets_map, + blocks_map, }) } @@ -355,18 +355,18 @@ impl DictionaryFiles { blocks_buf: &mut B1, offsets_buf: &mut B2, ) -> io::Result<()> { - let mut blocks_writer = self.blocks_file.open_write().await?; let mut offsets_writer = self.offsets_file.open_write().await?; + let mut blocks_writer = self.blocks_file.open_write().await?; blocks_writer.write_all_buf(blocks_buf).await?; offsets_writer.write_all_buf(offsets_buf).await?; - blocks_writer.flush().await?; - blocks_writer.sync_all().await?; - offsets_writer.flush().await?; 
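// Aside: the producing side of this method, matching the call site in
// layer/builder.rs in this same patch. Trait bounds on `F` are elided here
// for brevity; the tuple comes out of TypedDictBufBuilder::finalize() in
// exactly the order the parameters expect.
async fn persist_value_dict<F>(
    files: &TypedDictionaryFiles<F>,
    builder: TypedDictBufBuilder<BytesMut, BytesMut, BytesMut, BytesMut>,
) -> io::Result<()> {
    let (mut types_present, mut type_offsets, mut offsets, mut data) = builder.finalize();
    files
        .write_all_from_bufs(&mut types_present, &mut type_offsets, &mut offsets, &mut data)
        .await
}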
offsets_writer.sync_all().await?; + blocks_writer.flush().await?; + blocks_writer.sync_all().await?; + Ok(()) } } diff --git a/src/storage/layer.rs b/src/storage/layer.rs index bac60e72..44b290e7 100644 --- a/src/storage/layer.rs +++ b/src/storage/layer.rs @@ -366,8 +366,8 @@ pub trait PersistentLayerStore: 'static + Send + Sync + Clone { FILENAMES.predicate_dictionary_offsets, FILENAMES.value_dictionary_types_present, FILENAMES.value_dictionary_type_offsets, - FILENAMES.value_dictionary_blocks, FILENAMES.value_dictionary_offsets, + FILENAMES.value_dictionary_blocks, FILENAMES.node_value_idmap_bits, FILENAMES.node_value_idmap_bit_index_blocks, FILENAMES.node_value_idmap_bit_index_sblocks, @@ -411,8 +411,8 @@ pub trait PersistentLayerStore: 'static + Send + Sync + Clone { value_dictionary_files: TypedDictionaryFiles { types_present_file: files[4].clone(), type_offsets_file: files[5].clone(), - blocks_file: files[6].clone(), - offsets_file: files[7].clone(), + offsets_file: files[6].clone(), + blocks_file: files[7].clone(), }, id_map_files: IdMapFiles { @@ -471,8 +471,8 @@ pub trait PersistentLayerStore: 'static + Send + Sync + Clone { FILENAMES.predicate_dictionary_offsets, FILENAMES.value_dictionary_types_present, FILENAMES.value_dictionary_type_offsets, - FILENAMES.value_dictionary_blocks, FILENAMES.value_dictionary_offsets, + FILENAMES.value_dictionary_blocks, FILENAMES.node_value_idmap_bits, FILENAMES.node_value_idmap_bit_index_blocks, FILENAMES.node_value_idmap_bit_index_sblocks, @@ -532,8 +532,8 @@ pub trait PersistentLayerStore: 'static + Send + Sync + Clone { value_dictionary_files: TypedDictionaryFiles { types_present_file: files[4].clone(), type_offsets_file: files[5].clone(), - blocks_file: files[6].clone(), - offsets_file: files[7].clone(), + offsets_file: files[6].clone(), + blocks_file: files[7].clone(), }, id_map_files: IdMapFiles { @@ -706,12 +706,12 @@ pub trait PersistentLayerStore: 'static + Send + Sync + Clone { ) -> io::Result> { // does layer exist? if self.directory_exists(layer).await? { - let blocks_file = self - .get_file(layer, FILENAMES.node_dictionary_blocks) - .await?; let offsets_file = self .get_file(layer, FILENAMES.node_dictionary_offsets) .await?; + let blocks_file = self + .get_file(layer, FILENAMES.node_dictionary_blocks) + .await?; Ok(DictionaryFiles { blocks_file, @@ -728,12 +728,12 @@ pub trait PersistentLayerStore: 'static + Send + Sync + Clone { ) -> io::Result> { // does layer exist? if self.directory_exists(layer).await? 
{ - let blocks_file = self - .get_file(layer, FILENAMES.predicate_dictionary_blocks) - .await?; let offsets_file = self .get_file(layer, FILENAMES.predicate_dictionary_offsets) .await?; + let blocks_file = self + .get_file(layer, FILENAMES.predicate_dictionary_blocks) + .await?; Ok(DictionaryFiles { blocks_file, @@ -756,12 +756,12 @@ pub trait PersistentLayerStore: 'static + Send + Sync + Clone { let type_offsets_file = self .get_file(layer, FILENAMES.value_dictionary_type_offsets) .await?; - let blocks_file = self - .get_file(layer, FILENAMES.value_dictionary_blocks) - .await?; let offsets_file = self .get_file(layer, FILENAMES.value_dictionary_offsets) .await?; + let blocks_file = self + .get_file(layer, FILENAMES.value_dictionary_blocks) + .await?; Ok(TypedDictionaryFiles { types_present_file, @@ -1569,8 +1569,8 @@ impl u8 { - parse_block_control_word(cw).1 + dbg!(parse_block_control_word(cw).1) } pub fn parse_block_control_word(cw: u8) -> (Option, u8) { diff --git a/src/structure/tfc/dict.rs b/src/structure/tfc/dict.rs index 542722e2..c5ab4e29 100644 --- a/src/structure/tfc/dict.rs +++ b/src/structure/tfc/dict.rs @@ -44,6 +44,7 @@ impl SizedDictBufBuilder { offsets: LateLogArrayBufBuilder, data_buf: B2, ) -> Self { + dbg!(block_offset); Self { record_size, block_offset, @@ -86,9 +87,13 @@ impl SizedDictBufBuilder { } pub fn finalize(mut self) -> (LateLogArrayBufBuilder, B2, u64, u64) { - if self.current_block.len() > 0 { + if dbg!(self.current_block.len()) > 0 { let current_block: Vec<&[u8]> = self.current_block.iter().map(|e| e.as_ref()).collect(); - let size = build_block_unchecked(self.record_size, &mut self.data_buf, ¤t_block); + let size = dbg!(build_block_unchecked( + self.record_size, + &mut self.data_buf, + ¤t_block + )); self.block_offset += size as u64; self.offsets.push(self.block_offset); } @@ -103,13 +108,14 @@ impl SizedDictBufBuilder { } pub fn build_offset_logarray(buf: &mut B, mut offsets: Vec) { + dbg!(&offsets); // the last offset doesn't matter as it's implied by the total size offsets.pop(); let largest_element = offsets.last().cloned().unwrap_or(0); let width = calculate_width(largest_element); let mut array_builder = LogArrayBufBuilder::new(buf, width); - + dbg!(&offsets); array_builder.push_vec(offsets); array_builder.finalize(); } @@ -123,7 +129,11 @@ pub struct SizedDict { impl SizedDict { pub fn parse(offsets: Bytes, data: Bytes, dict_offset: u64) -> Self { + dbg!(&offsets); + dbg!(&data); + dbg!(dict_offset); let offsets = MonotonicLogArray::parse(offsets).unwrap(); + dbg!(&offsets); Self::from_parts(offsets, data, dict_offset) } @@ -136,6 +146,7 @@ impl SizedDict { } fn block_offset(&self, block_index: usize) -> usize { + dbg!(block_index); let offset: usize; if block_index == 0 { offset = 0; @@ -165,12 +176,15 @@ impl SizedDict { } pub fn block_num_elements(&self, block_index: usize) -> u8 { + eprintln!("offset: {block_index}"); let offset = self.block_offset(block_index); - + eprintln!("offset: {offset}"); parse_block_control_records(self.data[offset]) } pub fn num_blocks(&self) -> usize { + dbg!(&self.offsets); + dbg!(&self.data); self.offsets.len() + 1 } @@ -252,7 +266,7 @@ impl SizedDict { } pub fn num_entries(&self) -> usize { - let num_blocks = self.num_blocks(); + let num_blocks = dbg!(self.num_blocks()); let last_block_size = self.block_num_elements(num_blocks - 1); (num_blocks - 1) * BLOCK_SIZE + last_block_size as usize diff --git a/src/structure/tfc/file.rs b/src/structure/tfc/file.rs index 9b2e5528..ed28fce4 100644 --- 
a/src/structure/tfc/file.rs +++ b/src/structure/tfc/file.rs @@ -38,13 +38,14 @@ pub async fn merge_string_dictionaries< build_dict_unchecked(None, 0, &mut offsets, &mut data_buf, sorted_iterator); build_offset_logarray(&mut offsets_buf, offsets); - blocks_file_writer.write_all(data_buf.as_ref()).await?; - blocks_file_writer.flush().await?; - blocks_file_writer.sync_all().await?; offsets_file_writer.write_all(offsets_buf.as_ref()).await?; offsets_file_writer.flush().await?; offsets_file_writer.sync_all().await?; + blocks_file_writer.write_all(data_buf.as_ref()).await?; + blocks_file_writer.flush().await?; + blocks_file_writer.sync_all().await?; + Ok(()) } @@ -97,13 +98,13 @@ pub async fn merge_typed_dictionaries< type_offsets_file_writer.flush().await?; type_offsets_file_writer.sync_all().await?; - blocks_file_writer.write_all(data_buf.as_ref()).await?; - blocks_file_writer.flush().await?; - blocks_file_writer.sync_all().await?; - offsets_file_writer.write_all(offsets_buf.as_ref()).await?; offsets_file_writer.flush().await?; offsets_file_writer.sync_all().await?; + blocks_file_writer.write_all(data_buf.as_ref()).await?; + blocks_file_writer.flush().await?; + blocks_file_writer.sync_all().await?; + Ok(()) } diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index 40c3bfae..758d32aa 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -35,10 +35,20 @@ impl TypedDict { block_offsets: Bytes, data: Bytes, ) -> Self { + let types_present2 = types_present.clone(); + dbg!(types_present2); + let type_offsets2 = type_offsets.clone(); + dbg!(type_offsets2); + let block_offsets2 = block_offsets.clone(); + dbg!(block_offsets2); + let data2 = data.clone(); + dbg!(data2); let types_present = MonotonicLogArray::parse(types_present).unwrap(); let type_offsets = MonotonicLogArray::parse(type_offsets).unwrap(); let block_offsets = MonotonicLogArray::parse(block_offsets).unwrap(); - + dbg!(&types_present); + dbg!(&type_offsets); + dbg!(&block_offsets); let mut tally: u64 = 0; let mut type_id_offsets = Vec::with_capacity(types_present.len() - 1); for type_offset in type_offsets.iter() { @@ -61,20 +71,26 @@ impl TypedDict { 1 } else { BLOCK_SIZE - - parse_block_control_records( + - dbg!(parse_block_control_records( data[block_offsets.entry(block_offsets.len() - 1) as usize], - ) as usize + ) as usize) }; - let num_entries = (block_offsets.len() + 1) * BLOCK_SIZE - tally as usize - last_gap; - - Self { + dbg!(last_gap); + dbg!((block_offsets.len() + 1) * BLOCK_SIZE - tally as usize); + let num_entries = if block_offsets.len() == 0 { + parse_block_control_records(data[0]) as usize + } else { + (block_offsets.len() + 1) * BLOCK_SIZE - tally as usize - last_gap + }; + dbg!(num_entries); + dbg!(Self { types_present, type_offsets, block_offsets, type_id_offsets, num_entries, data, - } + }) } pub fn id>(&self, v: &Q) -> IdLookupResult { @@ -92,7 +108,7 @@ impl TypedDict { let type_offset; let block_offset; let id_offset; - + dbg!(i); if i == 0 { type_offset = 0; block_offset = 0; @@ -102,7 +118,7 @@ impl TypedDict { id_offset = self.type_id_offsets[i - 1]; block_offset = self.block_offsets.entry(type_offset as usize) as usize; } - + dbg!(block_offset); let len; if i == self.types_present.len() - 1 { if i == 0 { @@ -263,7 +279,9 @@ impl TypedDictSegment { let data2 = data.clone(); dbg!(offsets2); dbg!(data2); + dbg!(dict_offset); let dict = SizedDict::parse(offsets, data, dict_offset); + dbg!(&dict); Self { dict, _x: Default::default(), @@ -689,11 +707,10 @@ impl 
TypedDictBufBuilder Date: Sun, 4 Dec 2022 19:05:16 +0100 Subject: [PATCH 55/99] only 90 failing --- src/layer/internal/mod.rs | 19 +++++++++++-------- src/layer/layer.rs | 1 + src/structure/tfc/block.rs | 12 +++++++----- src/structure/tfc/dict.rs | 17 +++++++++++++---- src/structure/tfc/typed.rs | 18 +++++++++++++++--- 5 files changed, 47 insertions(+), 20 deletions(-) diff --git a/src/layer/internal/mod.rs b/src/layer/internal/mod.rs index 650de6a2..510f4f03 100644 --- a/src/layer/internal/mod.rs +++ b/src/layer/internal/mod.rs @@ -551,6 +551,7 @@ impl Layer for InternalLayer { } fn subject_id<'a>(&'a self, subject: &str) -> Option { + eprintln!("In subject_id"); let to_result = |layer: &'a InternalLayer| { ( layer @@ -565,10 +566,12 @@ impl Layer for InternalLayer { result = to_result(layer); } let (id_option, parent_option) = result; - id_option.map(|id| 1 + id + parent_option.map_or(0, |p| p.node_and_value_count() as u64)) + eprintln!("id_option: {id_option:?}"); + id_option.map(|id| id + parent_option.map_or(0, |p| p.node_and_value_count() as u64)) } fn predicate_id<'a>(&'a self, predicate: &str) -> Option { + eprintln!("In predicate id"); let to_result = |layer: &'a InternalLayer| { ( layer @@ -583,7 +586,7 @@ impl Layer for InternalLayer { result = to_result(layer); } let (id_option, parent_option) = result; - id_option.map(|id| 1 + id + parent_option.map_or(0, |p| p.predicate_count() as u64)) + id_option.map(|id| id + parent_option.map_or(0, |p| p.predicate_count() as u64)) } fn object_node_id<'a>(&'a self, object: &str) -> Option { @@ -601,7 +604,7 @@ impl Layer for InternalLayer { result = to_result(layer); } let (id_option, parent_option) = result; - id_option.map(|id| 1 + id + parent_option.map_or(0, |p| p.node_and_value_count() as u64)) + id_option.map(|id| id + parent_option.map_or(0, |p| p.node_and_value_count() as u64)) } fn object_value_id<'a>(&'a self, object: &str) -> Option { @@ -620,14 +623,14 @@ impl Layer for InternalLayer { result = to_result(layer); } let (id_option, parent_option) = result; - id_option.map(|id| 1 + id + parent_option.map_or(0, |p| p.node_and_value_count() as u64)) + id_option.map(|id| id + parent_option.map_or(0, |p| p.node_and_value_count() as u64)) } fn id_subject(&self, id: u64) -> Option { if id == 0 { return None; } - let mut corrected_id = id - 1; + let mut corrected_id = id; let mut current_option: Option<&InternalLayer> = Some(self); let mut parent_count = self.node_and_value_count() as u64; while let Some(current_layer) = current_option { @@ -663,7 +666,7 @@ impl Layer for InternalLayer { let mut current_option: Option<&InternalLayer> = Some(self); let mut parent_count = self.predicate_count() as u64; while let Some(current_layer) = current_option { - let mut corrected_id = id - 1; + let mut corrected_id = id; if let Some(parent) = current_layer.immediate_parent() { parent_count -= current_layer.predicate_dict_len() as u64; if corrected_id >= parent_count as u64 { @@ -691,7 +694,7 @@ impl Layer for InternalLayer { if id == 0 { return None; } - let mut corrected_id = id - 1; + let mut corrected_id = id; let mut current_option: Option<&InternalLayer> = Some(self); let mut parent_count = self.node_and_value_count() as u64; while let Some(current_layer) = current_option { @@ -734,7 +737,7 @@ impl Layer for InternalLayer { return None; } - let mut corrected_id = id - 1; + let mut corrected_id = id; let mut current_option: Option<&InternalLayer> = Some(self); let mut parent_count = self.node_and_value_count() as u64; while let 
Some(current_layer) = current_option { diff --git a/src/layer/layer.rs b/src/layer/layer.rs index 27680d31..ca88a338 100644 --- a/src/layer/layer.rs +++ b/src/layer/layer.rs @@ -79,6 +79,7 @@ pub trait Layer: Send + Sync { /// Returns true if the given triple exists, and false otherwise. fn string_triple_exists(&self, triple: &StringTriple) -> bool { + eprintln!("I am here"); self.string_triple_to_id(triple) .map(|t| self.id_triple_exists(t)) .unwrap_or(false) diff --git a/src/structure/tfc/block.rs b/src/structure/tfc/block.rs index 02bee893..2569ae51 100644 --- a/src/structure/tfc/block.rs +++ b/src/structure/tfc/block.rs @@ -38,6 +38,7 @@ impl From for SizedDictError { impl SizedBlockHeader { fn parse(buf: &mut Bytes) -> Result { let cw = buf.get_u8(); + dbg!(&buf); let (record_size, num_entries) = parse_block_control_word(cw); let mut sizes = [0_usize; BLOCK_SIZE - 1]; let mut shareds = [0_usize; BLOCK_SIZE - 1]; @@ -59,13 +60,13 @@ impl SizedBlockHeader { let buffer_length = sizes.iter().sum(); - Ok(Self { + Ok(dbg!(Self { head, num_entries, buffer_length, sizes, shareds, - }) + })) } } @@ -373,7 +374,7 @@ impl SizedDictBlock { let data = bytes.split_to(header.buffer_length); - Ok(Self { header, data }) + Ok(dbg!(Self { header, data })) } pub fn num_entries(&self) -> u8 { @@ -385,6 +386,7 @@ impl SizedDictBlock { } pub fn entry(&self, index: usize) -> SizedDictEntry { + dbg!(index); if index == 0 { return SizedDictEntry::new(vec![self.header.head.clone()]); } @@ -448,7 +450,7 @@ impl SizedDictBlock { let suffix_size = self.header.sizes[index - 1]; slices.push(self.data.slice(offset..offset + suffix_size)); - SizedDictEntry::new_optimized(slices) + dbg!(SizedDictEntry::new_optimized(slices)) } fn suffixes<'a>(&'a self) -> impl Iterator + 'a { @@ -489,7 +491,7 @@ impl SizedDictBlock { let (new_common_prefix, ordering) = find_common_prefix_ord(&slice[common_prefix..], &suffix[..]); match ordering { - Ordering::Equal => return IdLookupResult::Found(ix as u64 + 1), + Ordering::Equal => return dbg!(IdLookupResult::Found(ix as u64 + 1)), Ordering::Less => return IdLookupResult::Closest(ix as u64), Ordering::Greater => { common_prefix += new_common_prefix; diff --git a/src/structure/tfc/dict.rs b/src/structure/tfc/dict.rs index c5ab4e29..6c682358 100644 --- a/src/structure/tfc/dict.rs +++ b/src/structure/tfc/dict.rs @@ -160,7 +160,7 @@ impl SizedDict { pub fn block_bytes(&self, block_index: usize) -> Bytes { let offset = self.block_offset(block_index); let block_bytes; - block_bytes = self.data.slice(offset..); + block_bytes = dbg!(self.data.slice(offset..)); block_bytes } @@ -176,10 +176,17 @@ impl SizedDict { } pub fn block_num_elements(&self, block_index: usize) -> u8 { - eprintln!("offset: {block_index}"); - let offset = self.block_offset(block_index); + eprintln!("block_index: {block_index}"); + let offset = dbg!(self.block_offset(block_index)); eprintln!("offset: {offset}"); - parse_block_control_records(self.data[offset]) + + dbg!(&self.data); + if dbg!(self.data.len()) == 0 { + eprintln!("size is zero"); + 0 + } else { + dbg!(parse_block_control_records(dbg!(self.data[offset]))) + } } pub fn num_blocks(&self) -> usize { @@ -189,6 +196,7 @@ impl SizedDict { } pub fn entry(&self, index: usize) -> Option { + dbg!(index); if index > self.num_entries() { return None; } @@ -197,6 +205,7 @@ impl SizedDict { } pub fn id(&self, slice: &[u8]) -> IdLookupResult { + dbg!(slice); // let's binary search let mut min = 0; let mut max = self.offsets.len(); diff --git a/src/structure/tfc/typed.rs 
b/src/structure/tfc/typed.rs index 758d32aa..50fa3600 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -49,17 +49,28 @@ impl TypedDict { dbg!(&types_present); dbg!(&type_offsets); dbg!(&block_offsets); + if types_present.len() == 0 { + return Self { + types_present, + type_offsets, + block_offsets, + type_id_offsets: Vec::new(), + num_entries: 0, + data, + }; + } let mut tally: u64 = 0; let mut type_id_offsets = Vec::with_capacity(types_present.len() - 1); for type_offset in type_offsets.iter() { let last_block_len; if type_offset == 0 { - last_block_len = parse_block_control_records(data[0]); + last_block_len = dbg!(parse_block_control_records(data[0])); } else { let last_block_offset_of_previous_type = block_offsets.entry(type_offset as usize - 1); - last_block_len = - parse_block_control_records(data[last_block_offset_of_previous_type as usize]); + last_block_len = dbg!(parse_block_control_records( + data[last_block_offset_of_previous_type as usize] + )); } let gap = BLOCK_SIZE as u8 - last_block_len; @@ -295,6 +306,7 @@ impl TypedDictSegment { pub fn id>(&self, val: &Q) -> IdLookupResult { let slice = T::to_lexical(val); + dbg!(&slice); self.dict.id(&slice[..]) } From a51feede40b5518b3e1549fcd63acdf9bd17b23d Mon Sep 17 00:00:00 2001 From: Gavin Mendel-Gleason Date: Mon, 5 Dec 2022 00:00:14 +0100 Subject: [PATCH 56/99] Fewer debug prints --- src/storage/layer.rs | 2 +- src/structure/logarray.rs | 2 -- src/structure/tfc/block.rs | 18 +++++++----------- src/structure/tfc/dict.rs | 36 ++++++++---------------------------- src/structure/tfc/typed.rs | 37 ++++++++++++------------------------- 5 files changed, 28 insertions(+), 67 deletions(-) diff --git a/src/storage/layer.rs b/src/storage/layer.rs index 44b290e7..d628d39e 100644 --- a/src/storage/layer.rs +++ b/src/storage/layer.rs @@ -2286,7 +2286,7 @@ mod tests { HashMap, )> { let mut builder = store.create_base_layer().await?; - let name = dbg!(builder.name()); + let name = builder.name(); for t in BASE_TRIPLES.iter() { builder.add_string_triple(t.clone()); } diff --git a/src/structure/logarray.rs b/src/structure/logarray.rs index 6429e459..2f980868 100644 --- a/src/structure/logarray.rs +++ b/src/structure/logarray.rs @@ -296,8 +296,6 @@ impl LogArray { /// /// Panics if `index` + `length` is >= the length of the log array. 
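// Aside: the documented bound in numbers. With `arr` a hypothetical
// 10-element LogArray, slice(2, 5) yields a view of elements 2..7 and is
// safely in bounds, while slice(8, 4) panics because 8 + 4 runs past the end.
fn middle_window(arr: &LogArray) -> LogArray {
    arr.slice(2, 5)
}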
pub fn slice(&self, offset: usize, len: usize) -> LogArray { - dbg!(len); - dbg!(offset); let offset = u32::try_from(offset) .unwrap_or_else(|_| panic!("expected 32-bit slice offset ({})", offset)); let len = diff --git a/src/structure/tfc/block.rs b/src/structure/tfc/block.rs index 2569ae51..f4332f2d 100644 --- a/src/structure/tfc/block.rs +++ b/src/structure/tfc/block.rs @@ -38,7 +38,7 @@ impl From for SizedDictError { impl SizedBlockHeader { fn parse(buf: &mut Bytes) -> Result { let cw = buf.get_u8(); - dbg!(&buf); + let (record_size, num_entries) = parse_block_control_word(cw); let mut sizes = [0_usize; BLOCK_SIZE - 1]; let mut shareds = [0_usize; BLOCK_SIZE - 1]; @@ -60,13 +60,13 @@ impl SizedBlockHeader { let buffer_length = sizes.iter().sum(); - Ok(dbg!(Self { + Ok(Self { head, num_entries, buffer_length, sizes, shareds, - })) + }) } } @@ -374,7 +374,7 @@ impl SizedDictBlock { let data = bytes.split_to(header.buffer_length); - Ok(dbg!(Self { header, data })) + Ok(Self { header, data }) } pub fn num_entries(&self) -> u8 { @@ -386,7 +386,6 @@ impl SizedDictBlock { } pub fn entry(&self, index: usize) -> SizedDictEntry { - dbg!(index); if index == 0 { return SizedDictEntry::new(vec![self.header.head.clone()]); } @@ -450,7 +449,7 @@ impl SizedDictBlock { let suffix_size = self.header.sizes[index - 1]; slices.push(self.data.slice(offset..offset + suffix_size)); - dbg!(SizedDictEntry::new_optimized(slices)) + SizedDictEntry::new_optimized(slices) } fn suffixes<'a>(&'a self) -> impl Iterator + 'a { @@ -491,7 +490,7 @@ impl SizedDictBlock { let (new_common_prefix, ordering) = find_common_prefix_ord(&slice[common_prefix..], &suffix[..]); match ordering { - Ordering::Equal => return dbg!(IdLookupResult::Found(ix as u64 + 1)), + Ordering::Equal => return IdLookupResult::Found(ix as u64 + 1), Ordering::Less => return IdLookupResult::Closest(ix as u64), Ordering::Greater => { common_prefix += new_common_prefix; @@ -607,7 +606,7 @@ impl IdLookupResult { } pub fn parse_block_control_records(cw: u8) -> u8 { - dbg!(parse_block_control_word(cw).1) + parse_block_control_word(cw).1 } pub fn parse_block_control_word(cw: u8) -> (Option, u8) { @@ -631,7 +630,6 @@ fn record_size_encoding(record_size: Option) -> u8 { Some(4) => 3 << 3, Some(8) => 4 << 3, _ => { - dbg!(record_size); panic!("This is really bad!") } } @@ -676,8 +674,6 @@ pub(crate) fn build_block_unchecked( let (vbyte, vbyte_len) = encode_array(suffix_len as u64); buf.put_slice(&vbyte[..vbyte_len]); size += vbyte_len; - } else { - eprintln!("Fixed width: {record_size:?}"); } suffixes.push(&cur[common_prefix..]); last = cur; diff --git a/src/structure/tfc/dict.rs b/src/structure/tfc/dict.rs index 6c682358..e49af940 100644 --- a/src/structure/tfc/dict.rs +++ b/src/structure/tfc/dict.rs @@ -44,7 +44,6 @@ impl SizedDictBufBuilder { offsets: LateLogArrayBufBuilder, data_buf: B2, ) -> Self { - dbg!(block_offset); Self { record_size, block_offset, @@ -87,13 +86,9 @@ impl SizedDictBufBuilder { } pub fn finalize(mut self) -> (LateLogArrayBufBuilder, B2, u64, u64) { - if dbg!(self.current_block.len()) > 0 { + if self.current_block.len() > 0 { let current_block: Vec<&[u8]> = self.current_block.iter().map(|e| e.as_ref()).collect(); - let size = dbg!(build_block_unchecked( - self.record_size, - &mut self.data_buf, - ¤t_block - )); + let size = build_block_unchecked(self.record_size, &mut self.data_buf, ¤t_block); self.block_offset += size as u64; self.offsets.push(self.block_offset); } @@ -108,14 +103,13 @@ impl SizedDictBufBuilder { } pub fn 
build_offset_logarray(buf: &mut B, mut offsets: Vec) { - dbg!(&offsets); // the last offset doesn't matter as it's implied by the total size offsets.pop(); let largest_element = offsets.last().cloned().unwrap_or(0); let width = calculate_width(largest_element); let mut array_builder = LogArrayBufBuilder::new(buf, width); - dbg!(&offsets); + array_builder.push_vec(offsets); array_builder.finalize(); } @@ -129,11 +123,7 @@ pub struct SizedDict { impl SizedDict { pub fn parse(offsets: Bytes, data: Bytes, dict_offset: u64) -> Self { - dbg!(&offsets); - dbg!(&data); - dbg!(dict_offset); let offsets = MonotonicLogArray::parse(offsets).unwrap(); - dbg!(&offsets); Self::from_parts(offsets, data, dict_offset) } @@ -146,7 +136,6 @@ impl SizedDict { } fn block_offset(&self, block_index: usize) -> usize { - dbg!(block_index); let offset: usize; if block_index == 0 { offset = 0; @@ -160,7 +149,7 @@ impl SizedDict { pub fn block_bytes(&self, block_index: usize) -> Bytes { let offset = self.block_offset(block_index); let block_bytes; - block_bytes = dbg!(self.data.slice(offset..)); + block_bytes = self.data.slice(offset..); block_bytes } @@ -176,27 +165,19 @@ impl SizedDict { } pub fn block_num_elements(&self, block_index: usize) -> u8 { - eprintln!("block_index: {block_index}"); - let offset = dbg!(self.block_offset(block_index)); - eprintln!("offset: {offset}"); - - dbg!(&self.data); - if dbg!(self.data.len()) == 0 { - eprintln!("size is zero"); + let offset = self.block_offset(block_index); + if self.data.len() == 0 { 0 } else { - dbg!(parse_block_control_records(dbg!(self.data[offset]))) + parse_block_control_records(self.data[offset]) } } pub fn num_blocks(&self) -> usize { - dbg!(&self.offsets); - dbg!(&self.data); self.offsets.len() + 1 } pub fn entry(&self, index: usize) -> Option { - dbg!(index); if index > self.num_entries() { return None; } @@ -205,7 +186,6 @@ impl SizedDict { } pub fn id(&self, slice: &[u8]) -> IdLookupResult { - dbg!(slice); // let's binary search let mut min = 0; let mut max = self.offsets.len(); @@ -275,7 +255,7 @@ impl SizedDict { } pub fn num_entries(&self) -> usize { - let num_blocks = dbg!(self.num_blocks()); + let num_blocks = self.num_blocks(); let last_block_size = self.block_num_elements(num_blocks - 1); (num_blocks - 1) * BLOCK_SIZE + last_block_size as usize diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index 50fa3600..e5aa46a7 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -36,19 +36,13 @@ impl TypedDict { data: Bytes, ) -> Self { let types_present2 = types_present.clone(); - dbg!(types_present2); let type_offsets2 = type_offsets.clone(); - dbg!(type_offsets2); let block_offsets2 = block_offsets.clone(); - dbg!(block_offsets2); let data2 = data.clone(); - dbg!(data2); + let types_present = MonotonicLogArray::parse(types_present).unwrap(); let type_offsets = MonotonicLogArray::parse(type_offsets).unwrap(); let block_offsets = MonotonicLogArray::parse(block_offsets).unwrap(); - dbg!(&types_present); - dbg!(&type_offsets); - dbg!(&block_offsets); if types_present.len() == 0 { return Self { types_present, @@ -64,13 +58,12 @@ impl TypedDict { for type_offset in type_offsets.iter() { let last_block_len; if type_offset == 0 { - last_block_len = dbg!(parse_block_control_records(data[0])); + last_block_len = parse_block_control_records(data[0]); } else { let last_block_offset_of_previous_type = block_offsets.entry(type_offset as usize - 1); - last_block_len = dbg!(parse_block_control_records( - 
data[last_block_offset_of_previous_type as usize] - )); + last_block_len = + parse_block_control_records(data[last_block_offset_of_previous_type as usize]); } let gap = BLOCK_SIZE as u8 - last_block_len; @@ -82,26 +75,24 @@ impl TypedDict { 1 } else { BLOCK_SIZE - - dbg!(parse_block_control_records( + - parse_block_control_records( data[block_offsets.entry(block_offsets.len() - 1) as usize], - ) as usize) + ) as usize }; - dbg!(last_gap); - dbg!((block_offsets.len() + 1) * BLOCK_SIZE - tally as usize); let num_entries = if block_offsets.len() == 0 { parse_block_control_records(data[0]) as usize } else { (block_offsets.len() + 1) * BLOCK_SIZE - tally as usize - last_gap }; - dbg!(num_entries); - dbg!(Self { + + Self { types_present, type_offsets, block_offsets, type_id_offsets, num_entries, data, - }) + } } pub fn id>(&self, v: &Q) -> IdLookupResult { @@ -119,7 +110,7 @@ impl TypedDict { let type_offset; let block_offset; let id_offset; - dbg!(i); + if i == 0 { type_offset = 0; block_offset = 0; @@ -129,7 +120,7 @@ impl TypedDict { id_offset = self.type_id_offsets[i - 1]; block_offset = self.block_offsets.entry(type_offset as usize) as usize; } - dbg!(block_offset); + let len; if i == self.types_present.len() - 1 { if i == 0 { @@ -288,11 +279,8 @@ impl TypedDictSegment { pub fn parse(offsets: Bytes, data: Bytes, dict_offset: u64) -> Self { let offsets2 = offsets.clone(); let data2 = data.clone(); - dbg!(offsets2); - dbg!(data2); - dbg!(dict_offset); let dict = SizedDict::parse(offsets, data, dict_offset); - dbg!(&dict); + Self { dict, _x: Default::default(), @@ -306,7 +294,6 @@ impl TypedDictSegment { pub fn id>(&self, val: &Q) -> IdLookupResult { let slice = T::to_lexical(val); - dbg!(&slice); self.dict.id(&slice[..]) } From 48750c25972798c2da8d14754e21de258da24ae1 Mon Sep 17 00:00:00 2001 From: Gavin Mendel-Gleason Date: Tue, 6 Dec 2022 10:02:06 +0100 Subject: [PATCH 57/99] Ready for comparison to refactor branch --- src/layer/internal/base.rs | 4 ++-- src/layer/internal/child.rs | 2 +- src/layer/internal/mod.rs | 11 ++++------- src/layer/simple_builder.rs | 1 - src/storage/file.rs | 4 ++-- src/storage/memory.rs | 8 +++----- src/structure/tfc/dict.rs | 28 ++++++++++++++++------------ src/structure/tfc/typed.rs | 15 ++++++++------- 8 files changed, 36 insertions(+), 37 deletions(-) diff --git a/src/layer/internal/base.rs b/src/layer/internal/base.rs index 4d6cbe69..43f63206 100644 --- a/src/layer/internal/base.rs +++ b/src/layer/internal/base.rs @@ -56,7 +56,7 @@ impl BaseLayer { ); let predicate_dictionary = StringDict::parse( maps.predicate_dictionary_maps.offsets_map, - maps.predicate_dictionary_maps.blocks_map, + dbg!(maps.predicate_dictionary_maps.blocks_map), 0, ); let value_dictionary = TypedDict::from_parts( @@ -617,7 +617,7 @@ pub mod tests { let builder = builder.into_phase2().await.unwrap(); builder.finalize().await.unwrap(); - eprintln!("Here"); + let layer = BaseLayer::load_from_files([1, 2, 3, 4, 5], &base_layer_files) .await .unwrap(); diff --git a/src/layer/internal/child.rs b/src/layer/internal/child.rs index 09744582..edc5e521 100644 --- a/src/layer/internal/child.rs +++ b/src/layer/internal/child.rs @@ -345,7 +345,7 @@ impl ChildLayerFileBuil } = self; builder.finalize().await?; - eprintln!("Into phase2"); + let node_dict_offsets_map = files.node_dictionary_files.offsets_file.map().await?; let node_dict_blocks_map = files.node_dictionary_files.blocks_file.map().await?; let predicate_dict_offsets_map = diff --git a/src/layer/internal/mod.rs b/src/layer/internal/mod.rs 
index 510f4f03..9b6f3061 100644 --- a/src/layer/internal/mod.rs +++ b/src/layer/internal/mod.rs @@ -234,7 +234,7 @@ impl InternalLayer { } pub fn node_dict_get(&self, id: usize) -> Option { - self.node_dictionary().get(id) + dbg!(self.node_dictionary().get(id)) } pub fn node_dict_len(&self) -> usize { @@ -551,7 +551,6 @@ impl Layer for InternalLayer { } fn subject_id<'a>(&'a self, subject: &str) -> Option { - eprintln!("In subject_id"); let to_result = |layer: &'a InternalLayer| { ( layer @@ -566,12 +565,10 @@ impl Layer for InternalLayer { result = to_result(layer); } let (id_option, parent_option) = result; - eprintln!("id_option: {id_option:?}"); id_option.map(|id| id + parent_option.map_or(0, |p| p.node_and_value_count() as u64)) } fn predicate_id<'a>(&'a self, predicate: &str) -> Option { - eprintln!("In predicate id"); let to_result = |layer: &'a InternalLayer| { ( layer @@ -712,11 +709,11 @@ impl Layer for InternalLayer { } } - corrected_id = current_layer + corrected_id = dbg!(current_layer .node_value_id_map() - .outer_to_inner(corrected_id); + .outer_to_inner(corrected_id)); - if corrected_id >= current_layer.node_dict_len() as u64 { + if corrected_id > dbg!(current_layer.node_dict_len()) as u64 { // object, if it exists, must be a value corrected_id -= current_layer.node_dict_len() as u64; return current_layer diff --git a/src/layer/simple_builder.rs b/src/layer/simple_builder.rs index 8ae41b05..7f47c03d 100644 --- a/src/layer/simple_builder.rs +++ b/src/layer/simple_builder.rs @@ -62,7 +62,6 @@ pub struct SimpleLayerBuilder { impl SimpleLayerBuilder { /// Construct a layer builder for a base layer pub fn new(name: [u32; 5], files: BaseLayerFiles) -> Self { - eprintln!("Trying to make a new layer file"); Self { name, parent: None, diff --git a/src/storage/file.rs b/src/storage/file.rs index 89556ee0..78e85dee 100644 --- a/src/storage/file.rs +++ b/src/storage/file.rs @@ -341,8 +341,8 @@ pub struct DictionaryFiles { impl DictionaryFiles { pub async fn map_all(&self) -> io::Result { - let blocks_map = self.blocks_file.map().await?; let offsets_map = self.offsets_file.map().await?; + let blocks_map = self.blocks_file.map().await?; Ok(DictionaryMaps { offsets_map, @@ -358,8 +358,8 @@ impl DictionaryFiles { let mut offsets_writer = self.offsets_file.open_write().await?; let mut blocks_writer = self.blocks_file.open_write().await?; - blocks_writer.write_all_buf(blocks_buf).await?; offsets_writer.write_all_buf(offsets_buf).await?; + blocks_writer.write_all_buf(blocks_buf).await?; offsets_writer.flush().await?; offsets_writer.sync_all().await?; diff --git a/src/storage/memory.rs b/src/storage/memory.rs index 4febd9ec..f277314f 100644 --- a/src/storage/memory.rs +++ b/src/storage/memory.rs @@ -525,17 +525,15 @@ mod tests { builder.add_string_triple(StringTriple::new_value("duck", "says", "quack")); builder.commit_boxed().await.unwrap(); - eprintln!("Here1"); builder = store.create_child_layer(base_name).await.unwrap(); let child_name = builder.name(); - eprintln!("Here2"); + builder.remove_string_triple(StringTriple::new_value("duck", "says", "quack")); builder.add_string_triple(StringTriple::new_node("cow", "likes", "pig")); - eprintln!("Here3"); + builder.commit_boxed().await.unwrap(); - eprintln!("Here4"); let layer = store.get_layer(child_name).await.unwrap().unwrap(); - eprintln!("Here5"); + assert!(layer.string_triple_exists(&StringTriple::new_value("cow", "says", "moo"))); assert!(layer.string_triple_exists(&StringTriple::new_value("pig", "says", "oink"))); 
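// Aside: the id arithmetic this series converges on, worked through once.
// External ids are 1-based: with parent_count = 10 nodes and values
// inherited from the parent chain, ids 1..=10 resolve in the parent and
// id 11 is the first one minted locally, hence `corrected_id > parent_count`
// rather than the earlier `>=` (which would misfile the parent's last id
// as local now that ids start at 1).
fn minted_locally(corrected_id: u64, parent_count: u64) -> bool {
    corrected_id > parent_count
}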
assert!(layer.string_triple_exists(&StringTriple::new_node("cow", "likes", "pig"))); diff --git a/src/structure/tfc/dict.rs b/src/structure/tfc/dict.rs index e49af940..42fea0a0 100644 --- a/src/structure/tfc/dict.rs +++ b/src/structure/tfc/dict.rs @@ -128,11 +128,12 @@ impl SizedDict { } pub fn from_parts(offsets: MonotonicLogArray, data: Bytes, dict_offset: u64) -> Self { - Self { + dbg!(&data); + dbg!(Self { offsets, data, dict_offset, - } + }) } fn block_offset(&self, block_index: usize) -> usize { @@ -147,9 +148,9 @@ impl SizedDict { } pub fn block_bytes(&self, block_index: usize) -> Bytes { - let offset = self.block_offset(block_index); + let offset = dbg!(self.block_offset(block_index)); let block_bytes; - block_bytes = self.data.slice(offset..); + block_bytes = dbg!(self.data.slice(offset..)); block_bytes } @@ -178,6 +179,7 @@ impl SizedDict { } pub fn entry(&self, index: usize) -> Option { + dbg!(index); if index > self.num_entries() { return None; } @@ -190,11 +192,11 @@ impl SizedDict { let mut min = 0; let mut max = self.offsets.len(); let mut mid: usize; - + dbg!(&self); while min <= max { mid = (min + max) / 2; - - let head_slice = self.block_head(mid); + dbg!(mid); + let head_slice = dbg!(self.block_head(mid)); match slice.cmp(&head_slice[..]) { Ordering::Less => { @@ -203,20 +205,22 @@ impl SizedDict { // but since this is the first block, the string doesn't exist. return IdLookupResult::NotFound; } - max = mid - 1; + max = dbg!(mid - 1); } - Ordering::Greater => min = mid + 1, - Ordering::Equal => return IdLookupResult::Found((mid * BLOCK_SIZE + 1) as u64), // what luck! turns out the string we were looking for was the block head + Ordering::Greater => min = dbg!(mid + 1), + Ordering::Equal => { + return IdLookupResult::Found(dbg!((mid * BLOCK_SIZE + 1)) as u64) + } // what luck! turns out the string we were looking for was the block head } } let found = max; // we found the block the string should be part of. 
- let block = self.block(found); + let block = dbg!(self.block(found)); let block_id = block.id(slice); let offset = (found * BLOCK_SIZE) as u64 + 1; - let result = block_id.offset(offset).default(offset - 1); + let result = block_id.offset(offset).default(dbg!(offset - 1)); /* if found != 0 { // the default value will fill in the last index of the diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index e5aa46a7..dc34579d 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -35,6 +35,7 @@ impl TypedDict { block_offsets: Bytes, data: Bytes, ) -> Self { + dbg!(&data); let types_present2 = types_present.clone(); let type_offsets2 = type_offsets.clone(); let block_offsets2 = block_offsets.clone(); @@ -85,14 +86,14 @@ impl TypedDict { (block_offsets.len() + 1) * BLOCK_SIZE - tally as usize - last_gap }; - Self { + dbg!(Self { types_present, type_offsets, block_offsets, type_id_offsets, num_entries, data, - } + }) } pub fn id>(&self, v: &Q) -> IdLookupResult { @@ -102,7 +103,7 @@ impl TypedDict { } pub fn get(&self, id: usize) -> Option { - let result = self.entry(id); + let result = self.entry(dbg!(id)); result.map(|(datatype, slice)| datatype.cast(slice.into_buf())) } @@ -277,8 +278,7 @@ pub struct TypedDictSegment { impl TypedDictSegment { pub fn parse(offsets: Bytes, data: Bytes, dict_offset: u64) -> Self { - let offsets2 = offsets.clone(); - let data2 = data.clone(); + dbg!(&data); let dict = SizedDict::parse(offsets, data, dict_offset); Self { @@ -288,12 +288,13 @@ impl TypedDictSegment { } pub fn get(&self, index: usize) -> Option { - let entry = self.dict.entry(index); + let entry = self.dict.entry(dbg!(index)); entry.map(|e| T::from_lexical(e.into_buf())) } pub fn id>(&self, val: &Q) -> IdLookupResult { - let slice = T::to_lexical(val); + dbg!(&self.dict); + let slice = dbg!(T::to_lexical(val)); self.dict.id(&slice[..]) } From 49b6c97484f51ecba839f823bd09e0110665822d Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Tue, 6 Dec 2022 11:12:17 +0100 Subject: [PATCH 58/99] make lower level empty dicts work --- src/structure/tfc/dict.rs | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/src/structure/tfc/dict.rs b/src/structure/tfc/dict.rs index 42fea0a0..8d79e7f3 100644 --- a/src/structure/tfc/dict.rs +++ b/src/structure/tfc/dict.rs @@ -148,6 +148,9 @@ impl SizedDict { } pub fn block_bytes(&self, block_index: usize) -> Bytes { + if self.data.is_empty() { + panic!("empty dictionary has no block"); + } let offset = dbg!(self.block_offset(block_index)); let block_bytes; block_bytes = dbg!(self.data.slice(offset..)); @@ -166,16 +169,21 @@ impl SizedDict { } pub fn block_num_elements(&self, block_index: usize) -> u8 { - let offset = self.block_offset(block_index); - if self.data.len() == 0 { + if self.data.is_empty() { 0 } else { + let offset = self.block_offset(block_index); parse_block_control_records(self.data[offset]) } } pub fn num_blocks(&self) -> usize { - self.offsets.len() + 1 + if self.data.is_empty() { + 0 + } + else { + self.offsets.len() + 1 + } } pub fn entry(&self, index: usize) -> Option { @@ -193,6 +201,9 @@ impl SizedDict { let mut max = self.offsets.len(); let mut mid: usize; dbg!(&self); + if self.is_empty() { + return IdLookupResult::NotFound; + } while min <= max { mid = (min + max) / 2; dbg!(mid); @@ -260,9 +271,18 @@ impl SizedDict { pub fn num_entries(&self) -> usize { let num_blocks = self.num_blocks(); - let last_block_size = self.block_num_elements(num_blocks - 1); + if 
num_blocks == 0 { + 0 + } + else { + let last_block_size = self.block_num_elements(num_blocks - 1); + + (num_blocks - 1) * BLOCK_SIZE + last_block_size as usize + } + } - (num_blocks - 1) * BLOCK_SIZE + last_block_size as usize + pub fn is_empty(&self) -> bool { + self.data.is_empty() } } From af9abb4c0f363612c5c38b7934620f5954b19663 Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Tue, 6 Dec 2022 11:25:27 +0100 Subject: [PATCH 59/99] fix id correction when looking up in parent layers --- src/layer/internal/mod.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/layer/internal/mod.rs b/src/layer/internal/mod.rs index 9b6f3061..df16010d 100644 --- a/src/layer/internal/mod.rs +++ b/src/layer/internal/mod.rs @@ -635,7 +635,7 @@ impl Layer for InternalLayer { parent_count = parent_count - current_layer.node_dict_len() as u64 - current_layer.value_dict_len() as u64; - if corrected_id >= parent_count as u64 { + if corrected_id > parent_count as u64 { // subject, if it exists, is in this layer corrected_id -= parent_count; } else { @@ -666,7 +666,7 @@ impl Layer for InternalLayer { let mut corrected_id = id; if let Some(parent) = current_layer.immediate_parent() { parent_count -= current_layer.predicate_dict_len() as u64; - if corrected_id >= parent_count as u64 { + if corrected_id > parent_count as u64 { // subject, if it exists, is in this layer corrected_id -= parent_count; } else { @@ -700,7 +700,7 @@ impl Layer for InternalLayer { - current_layer.node_dict_len() as u64 - current_layer.value_dict_len() as u64; - if corrected_id >= parent_count { + if corrected_id > parent_count { // object, if it exists, is in this layer corrected_id -= parent_count; } else { From 3380e78ddc11a8ceab9dddebcd860da9267d93b0 Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Tue, 6 Dec 2022 14:48:14 +0100 Subject: [PATCH 60/99] fix id mapping for new id offset --- src/layer/id_map.rs | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/layer/id_map.rs b/src/layer/id_map.rs index 345e5f19..e01f3331 100644 --- a/src/layer/id_map.rs +++ b/src/layer/id_map.rs @@ -32,10 +32,10 @@ impl IdMap { self.id_wtree .as_ref() .and_then(|wtree| { - if id >= wtree.len() as u64 { + if id > wtree.len() as u64 { None } else { - Some(wtree.lookup_one(id).unwrap()) + Some(wtree.lookup_one(id-1).unwrap() + 1) } }) .unwrap_or(id) @@ -45,10 +45,11 @@ impl IdMap { self.id_wtree .as_ref() .and_then(|wtree| { - if id >= wtree.len() as u64 { + if id > wtree.len() as u64 { None } else { - Some(wtree.decode_one(id.try_into().unwrap())) + let id:usize = id.try_into().unwrap(); + Some(wtree.decode_one(id - 1) + 1) } }) .unwrap_or(id) @@ -97,7 +98,7 @@ pub async fn construct_idmaps_from_structures node_iters.push( dict.into_iter() .enumerate() - .map(move |(i, e)| (idmap.inner_to_outer(i as u64) + node_offset as u64, e)), + .map(move |(i, e)| (idmap.inner_to_outer(i as u64 + 1) + node_offset as u64, e)), ); node_offset += num_entries + value_dicts[ix].num_entries(); @@ -111,7 +112,7 @@ pub async fn construct_idmaps_from_structures let num_entries = dict.num_entries(); value_iters.push(dict.into_iter().enumerate().map(move |(i, e)| { ( - idmap.inner_to_outer(i as u64 + node_count as u64) + value_offset as u64, + idmap.inner_to_outer(i as u64 + node_count as u64 + 1) + value_offset as u64, e, ) })); @@ -127,7 +128,7 @@ pub async fn construct_idmaps_from_structures predicate_iters.push( dict.into_iter() .enumerate() - .map(move |(i, e)| (idmap.inner_to_outer(i as 
u64) + predicate_offset as u64, e)), + .map(move |(i, e)| (idmap.inner_to_outer(i as u64 + 1) + predicate_offset as u64, e)), ); predicate_offset += num_entries; @@ -151,9 +152,9 @@ pub async fn construct_idmaps_from_structures let sorted_node_iter = sorted_iterator(node_iters, entry_comparator).map(|(i,s)|(i, (Datatype::String, s))); let sorted_value_iter = sorted_iterator(value_iters, typed_entry_comparator); - let sorted_node_value_iter = sorted_node_iter.chain(sorted_value_iter).map(|(id, _)| id); + let sorted_node_value_iter = sorted_node_iter.chain(sorted_value_iter).map(|(id, _)| id - 1); let sorted_predicate_iter = - sorted_iterator(predicate_iters, entry_comparator).map(|(id, _)| id); + sorted_iterator(predicate_iters, entry_comparator).map(|(id, _)| id - 1); let node_value_width = util::calculate_width(node_offset as u64); let node_value_build_task = tokio::spawn(build_wavelet_tree_from_iter( From 14b71a0bf3f12c953bb714d6349c1459da0b90bf Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Tue, 6 Dec 2022 14:48:23 +0100 Subject: [PATCH 61/99] remove some debug expressions --- src/structure/tfc/dict.rs | 5 ++--- src/structure/tfc/typed.rs | 6 ++---- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/src/structure/tfc/dict.rs b/src/structure/tfc/dict.rs index 8d79e7f3..0766de56 100644 --- a/src/structure/tfc/dict.rs +++ b/src/structure/tfc/dict.rs @@ -128,12 +128,11 @@ impl SizedDict { } pub fn from_parts(offsets: MonotonicLogArray, data: Bytes, dict_offset: u64) -> Self { - dbg!(&data); - dbg!(Self { + Self { offsets, data, dict_offset, - }) + } } fn block_offset(&self, block_index: usize) -> usize { diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index dc34579d..465c10a9 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -35,7 +35,6 @@ impl TypedDict { block_offsets: Bytes, data: Bytes, ) -> Self { - dbg!(&data); let types_present2 = types_present.clone(); let type_offsets2 = type_offsets.clone(); let block_offsets2 = block_offsets.clone(); @@ -86,14 +85,14 @@ impl TypedDict { (block_offsets.len() + 1) * BLOCK_SIZE - tally as usize - last_gap }; - dbg!(Self { + Self { types_present, type_offsets, block_offsets, type_id_offsets, num_entries, data, - }) + } } pub fn id>(&self, v: &Q) -> IdLookupResult { @@ -278,7 +277,6 @@ pub struct TypedDictSegment { impl TypedDictSegment { pub fn parse(offsets: Bytes, data: Bytes, dict_offset: u64) -> Self { - dbg!(&data); let dict = SizedDict::parse(offsets, data, dict_offset); Self { From 7c3b34031cc10a6a60be1b555ea2035bc62322db Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Tue, 6 Dec 2022 16:03:02 +0100 Subject: [PATCH 62/99] removed loads of dbg! 
invocations, and started string dict logic --- src/layer/builder.rs | 29 ++++---------- src/layer/internal/base.rs | 8 ++-- src/layer/internal/child.rs | 6 +-- src/layer/internal/mod.rs | 8 ++-- src/storage/layer.rs | 13 ++++--- src/structure/mod.rs | 3 +- src/structure/tfc/dict.rs | 19 ++++----- src/structure/tfc/typed.rs | 77 ++++++++++++++++++++++++++++++++++--- 8 files changed, 105 insertions(+), 58 deletions(-) diff --git a/src/layer/builder.rs b/src/layer/builder.rs index 55df9d00..d8eeb111 100644 --- a/src/layer/builder.rs +++ b/src/layer/builder.rs @@ -3,7 +3,6 @@ use std::io; use bytes::{Bytes, BytesMut}; use futures::stream::TryStreamExt; use rayon::prelude::*; -use tfc::dict::SizedDictBufBuilder; use super::layer::*; use crate::storage::*; @@ -14,8 +13,8 @@ pub struct DictionarySetFileBuilder { node_files: DictionaryFiles, predicate_files: DictionaryFiles, value_files: TypedDictionaryFiles, - node_dictionary_builder: SizedDictBufBuilder, - predicate_dictionary_builder: SizedDictBufBuilder, + node_dictionary_builder: StringDictBufBuilder, + predicate_dictionary_builder: StringDictBufBuilder, value_dictionary_builder: TypedDictBufBuilder, } @@ -25,18 +24,12 @@ impl DictionarySetFileBuilder { predicate_files: DictionaryFiles, value_files: TypedDictionaryFiles, ) -> io::Result { - let node_dictionary_builder = SizedDictBufBuilder::new( - None, - 0, - 0, - LateLogArrayBufBuilder::new(BytesMut::new()), + let node_dictionary_builder = StringDictBufBuilder::new( + BytesMut::new(), BytesMut::new(), ); - let predicate_dictionary_builder = SizedDictBufBuilder::new( - None, - 0, - 0, - LateLogArrayBufBuilder::new(BytesMut::new()), + let predicate_dictionary_builder = StringDictBufBuilder::new( + BytesMut::new(), BytesMut::new(), ); let value_dictionary_builder = TypedDictBufBuilder::new( @@ -147,16 +140,10 @@ impl DictionarySetFileBuilder { } pub async fn finalize(self) -> io::Result<()> { - let (mut node_offsets_builder, mut node_data_buf, _, _) = + let (mut node_offsets_buf, mut node_data_buf) = self.node_dictionary_builder.finalize(); - // last offset is useless - node_offsets_builder.pop(); - let mut node_offsets_buf = node_offsets_builder.finalize(); - let (mut predicate_offsets_builder, mut predicate_data_buf, _, _) = + let (mut predicate_offsets_buf, mut predicate_data_buf) = self.predicate_dictionary_builder.finalize(); - // last offset is useless - predicate_offsets_builder.pop(); - let mut predicate_offsets_buf = predicate_offsets_builder.finalize(); let ( mut value_types_present_buf, mut value_type_offsets_buf, diff --git a/src/layer/internal/base.rs b/src/layer/internal/base.rs index 43f63206..4b11fd06 100644 --- a/src/layer/internal/base.rs +++ b/src/layer/internal/base.rs @@ -52,12 +52,10 @@ impl BaseLayer { let node_dictionary = StringDict::parse( maps.node_dictionary_maps.offsets_map, maps.node_dictionary_maps.blocks_map, - 0, ); let predicate_dictionary = StringDict::parse( maps.predicate_dictionary_maps.offsets_map, - dbg!(maps.predicate_dictionary_maps.blocks_map), - 0, + maps.predicate_dictionary_maps.blocks_map, ); let value_dictionary = TypedDict::from_parts( maps.value_dictionary_maps.types_present_map, @@ -266,8 +264,8 @@ impl BaseLayerFileBuilder { let value_dict_blocks_map = files.value_dictionary_files.blocks_file.map().await?; let value_dict_offsets_map = files.value_dictionary_files.offsets_file.map().await?; - let node_dict = StringDict::parse(node_dict_offsets_map, node_dict_blocks_map, 0); - let pred_dict = StringDict::parse(predicate_dict_offsets_map, 
predicate_dict_blocks_map, 0); + let node_dict = StringDict::parse(node_dict_offsets_map, node_dict_blocks_map); + let pred_dict = StringDict::parse(predicate_dict_offsets_map, predicate_dict_blocks_map); let val_dict = TypedDict::from_parts( value_dict_types_present_map, value_dict_type_offsets_map, diff --git a/src/layer/internal/child.rs b/src/layer/internal/child.rs index edc5e521..9690a297 100644 --- a/src/layer/internal/child.rs +++ b/src/layer/internal/child.rs @@ -65,12 +65,10 @@ impl ChildLayer { let node_dictionary = StringDict::parse( maps.node_dictionary_maps.offsets_map, maps.node_dictionary_maps.blocks_map, - 0, ); let predicate_dictionary = StringDict::parse( maps.predicate_dictionary_maps.offsets_map, maps.predicate_dictionary_maps.blocks_map, - 0, ); let value_dictionary = TypedDict::from_parts( maps.value_dictionary_maps.types_present_map, @@ -361,8 +359,8 @@ impl ChildLayerFileBuil let value_dict_offsets_map = files.value_dictionary_files.offsets_file.map().await?; let value_dict_blocks_map = files.value_dictionary_files.blocks_file.map().await?; - let node_dict = StringDict::parse(node_dict_offsets_map, node_dict_blocks_map, 0); - let pred_dict = StringDict::parse(predicate_dict_offsets_map, predicate_dict_blocks_map, 0); + let node_dict = StringDict::parse(node_dict_offsets_map, node_dict_blocks_map); + let pred_dict = StringDict::parse(predicate_dict_offsets_map, predicate_dict_blocks_map); let val_dict = TypedDict::from_parts( value_dict_types_present_map, value_dict_type_offsets_map, diff --git a/src/layer/internal/mod.rs b/src/layer/internal/mod.rs index df16010d..a3ed3905 100644 --- a/src/layer/internal/mod.rs +++ b/src/layer/internal/mod.rs @@ -234,7 +234,7 @@ impl InternalLayer { } pub fn node_dict_get(&self, id: usize) -> Option { - dbg!(self.node_dictionary().get(id)) + self.node_dictionary().get(id) } pub fn node_dict_len(&self) -> usize { @@ -709,11 +709,11 @@ impl Layer for InternalLayer { } } - corrected_id = dbg!(current_layer + corrected_id = current_layer .node_value_id_map() - .outer_to_inner(corrected_id)); + .outer_to_inner(corrected_id); - if corrected_id > dbg!(current_layer.node_dict_len()) as u64 { + if corrected_id > current_layer.node_dict_len() as u64 { // object, if it exists, must be a value corrected_id -= current_layer.node_dict_len() as u64; return current_layer diff --git a/src/storage/layer.rs b/src/storage/layer.rs index d628d39e..30575ec7 100644 --- a/src/storage/layer.rs +++ b/src/storage/layer.rs @@ -15,7 +15,7 @@ use crate::structure::logarray::logarray_file_get_length_and_width; use crate::structure::StringDict; use crate::structure::TypedDict; use crate::structure::{ - dict_file_get_count, util, AdjacencyList, BitIndex, LogArray, MonotonicLogArray, WaveletTree, + util, AdjacencyList, BitIndex, LogArray, MonotonicLogArray, WaveletTree, }; use std::convert::TryInto; @@ -1571,7 +1571,6 @@ impl io::Result> { if self.directory_exists(name).await? { let file = self.node_dictionary_files(name).await?.blocks_file; - Ok(Some(dict_file_get_count(file).await?)) + panic!(); + //Ok(Some(dict_file_get_count(file).await?)) } else { Ok(None) } @@ -1621,7 +1620,8 @@ impl io::Result> { if self.directory_exists(name).await? { let file = self.predicate_dictionary_files(name).await?.blocks_file; - Ok(Some(dict_file_get_count(file).await?)) + panic!(); + //Ok(Some(dict_file_get_count(file).await?)) } else { Ok(None) } @@ -1630,7 +1630,8 @@ impl io::Result> { if self.directory_exists(name).await? 
{ let file = self.value_dictionary_files(name).await?.blocks_file; - Ok(Some(dict_file_get_count(file).await?)) + panic!(); + //Ok(Some(dict_file_get_count(file).await?)) } else { Ok(None) } diff --git a/src/structure/mod.rs b/src/structure/mod.rs index b4ada408..e5147270 100644 --- a/src/structure/mod.rs +++ b/src/structure/mod.rs @@ -8,7 +8,7 @@ pub mod bitindex; pub mod bititer; pub mod logarray; //pub mod mapped_dict; -pub mod pfc; +//pub mod pfc; pub mod tfc; pub mod util; pub mod vbyte; @@ -18,6 +18,5 @@ pub use adjacencylist::*; pub use bitarray::*; pub use bitindex::*; pub use logarray::*; -pub use pfc::*; pub use tfc::*; pub use wavelettree::*; diff --git a/src/structure/tfc/dict.rs b/src/structure/tfc/dict.rs index 0766de56..cac4dca7 100644 --- a/src/structure/tfc/dict.rs +++ b/src/structure/tfc/dict.rs @@ -150,9 +150,9 @@ impl SizedDict { if self.data.is_empty() { panic!("empty dictionary has no block"); } - let offset = dbg!(self.block_offset(block_index)); + let offset = self.block_offset(block_index); let block_bytes; - block_bytes = dbg!(self.data.slice(offset..)); + block_bytes = self.data.slice(offset..); block_bytes } @@ -186,7 +186,6 @@ impl SizedDict { } pub fn entry(&self, index: usize) -> Option { - dbg!(index); if index > self.num_entries() { return None; } @@ -199,14 +198,12 @@ impl SizedDict { let mut min = 0; let mut max = self.offsets.len(); let mut mid: usize; - dbg!(&self); if self.is_empty() { return IdLookupResult::NotFound; } while min <= max { mid = (min + max) / 2; - dbg!(mid); - let head_slice = dbg!(self.block_head(mid)); + let head_slice = self.block_head(mid); match slice.cmp(&head_slice[..]) { Ordering::Less => { @@ -215,11 +212,11 @@ impl SizedDict { // but since this is the first block, the string doesn't exist. return IdLookupResult::NotFound; } - max = dbg!(mid - 1); + max = mid - 1; } - Ordering::Greater => min = dbg!(mid + 1), + Ordering::Greater => min = mid + 1, Ordering::Equal => { - return IdLookupResult::Found(dbg!((mid * BLOCK_SIZE + 1)) as u64) + return IdLookupResult::Found((mid * BLOCK_SIZE + 1) as u64) } // what luck! turns out the string we were looking for was the block head } } @@ -227,10 +224,10 @@ impl SizedDict { let found = max; // we found the block the string should be part of. 
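// The two-phase lookup being cleaned up here: a binary search over block
// heads picks block `found`, whose first entry has global id
// found * BLOCK_SIZE + 1; a hit inside the block is shifted by that offset,
// and a miss falls back to the last id of the previous block. A model of
// the result combinators (our own enum, assuming IdLookupResult behaves as
// its call sites here suggest):
enum Lookup {
    Found(u64),
    Closest(u64),
    NotFound,
}
impl Lookup {
    fn offset(self, o: u64) -> Lookup {
        match self {
            Lookup::Found(i) => Lookup::Found(i + o),
            Lookup::Closest(i) => Lookup::Closest(i + o),
            Lookup::NotFound => Lookup::NotFound,
        }
    }
    fn default(self, d: u64) -> Lookup {
        match self {
            Lookup::NotFound => Lookup::Closest(d),
            x => x,
        }
    }
}
// For BLOCK_SIZE = 8 and found = 2: ids 17..=24 live in this block, and a
// miss becomes Closest(16), the last entry of the preceding block.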
- let block = dbg!(self.block(found)); + let block = self.block(found); let block_id = block.id(slice); let offset = (found * BLOCK_SIZE) as u64 + 1; - let result = block_id.offset(offset).default(dbg!(offset - 1)); + let result = block_id.offset(offset).default(offset - 1); /* if found != 0 { // the default value will fill in the last index of the diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index 465c10a9..b07dab59 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -102,7 +102,7 @@ impl TypedDict { } pub fn get(&self, id: usize) -> Option { - let result = self.entry(dbg!(id)); + let result = self.entry(id); result.map(|(datatype, slice)| datatype.cast(slice.into_buf())) } @@ -286,13 +286,12 @@ impl TypedDictSegment { } pub fn get(&self, index: usize) -> Option { - let entry = self.dict.entry(dbg!(index)); + let entry = self.dict.entry(index); entry.map(|e| T::from_lexical(e.into_buf())) } pub fn id>(&self, val: &Q) -> IdLookupResult { - dbg!(&self.dict); - let slice = dbg!(T::to_lexical(val)); + let slice = T::to_lexical(val); self.dict.id(&slice[..]) } @@ -309,7 +308,75 @@ impl TypedDictSegment { } } -pub type StringDict = TypedDictSegment; +#[derive(Clone)] +pub struct StringDict(TypedDictSegment); + +impl StringDict { + pub fn parse(offsets: Bytes, data: Bytes) -> Self { + Self(TypedDictSegment::parse(offsets, data.slice(..data.len()), 0)) + } + + pub fn get(&self, index: usize) -> Option { + self.0.get(index) + } + + pub fn id>(&self, val: &Q) -> IdLookupResult { + self.0.id(val) + } + + pub fn num_entries(&self) -> usize { + self.0.num_entries() + } + + pub fn iter<'a>(&'a self) -> impl Iterator + 'a + Clone { + self.0.iter() + } + + pub fn into_iter(self) -> impl Iterator + Clone { + self.0.into_iter() + } +} + +pub struct StringDictBufBuilder(SizedDictBufBuilder); + +impl StringDictBufBuilder { + pub fn new( + offsets_buf: B1, + data_buf: B2, + ) -> Self { + let offsets = LateLogArrayBufBuilder::new(offsets_buf); + Self(SizedDictBufBuilder::new(None, 0, 0, offsets, data_buf)) + } + + pub fn id_offset(&self) -> u64 { + self.0.id_offset() + } + + pub fn block_offset(&self) -> u64 { + self.0.block_offset() + } + + pub fn add(&mut self, value: Bytes) -> u64 { + self.0.add(value) + } + + pub fn add_entry(&mut self, e: &SizedDictEntry) -> u64 { + self.0.add_entry(e) + } + + pub fn add_all>(&mut self, it: I) -> Vec { + self.0.add_all(it) + } + + pub fn finalize(self) -> (B1, B2) { + let (mut offsets_array, mut data_buf, _block_offset, id_offset) = self.0.finalize(); + offsets_array.pop(); + let offsets_buf = offsets_array.finalize(); + //data_buf.put_u64(id_offset); + + (offsets_buf, data_buf) + } +} #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, FromPrimitive)] pub enum Datatype { From 737742359f0a93f0ade79383ffe2a9b4d6e904ab Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Tue, 6 Dec 2022 16:51:36 +0100 Subject: [PATCH 63/99] fixed all tests --- src/storage/layer.rs | 10 ++- src/structure/tfc/dict.rs | 45 +++++--------- src/structure/tfc/file.rs | 39 ++++++------ src/structure/tfc/mod.rs | 1 + src/structure/tfc/typed.rs | 122 +++++++++++-------------------------- 5 files changed, 75 insertions(+), 142 deletions(-) diff --git a/src/storage/layer.rs b/src/storage/layer.rs index 30575ec7..2f63fc6d 100644 --- a/src/storage/layer.rs +++ b/src/storage/layer.rs @@ -11,6 +11,7 @@ use crate::layer::{ SimpleLayerBuilder, }; use crate::structure::bitarray::bitarray_len_from_file; +use crate::structure::dict_file_get_count; 
use crate::structure::logarray::logarray_file_get_length_and_width; use crate::structure::StringDict; use crate::structure::TypedDict; @@ -1610,8 +1611,7 @@ impl io::Result> { if self.directory_exists(name).await? { let file = self.node_dictionary_files(name).await?.blocks_file; - panic!(); - //Ok(Some(dict_file_get_count(file).await?)) + Ok(Some(dict_file_get_count(file).await?)) } else { Ok(None) } @@ -1620,8 +1620,7 @@ impl io::Result> { if self.directory_exists(name).await? { let file = self.predicate_dictionary_files(name).await?.blocks_file; - panic!(); - //Ok(Some(dict_file_get_count(file).await?)) + Ok(Some(dict_file_get_count(file).await?)) } else { Ok(None) } @@ -1630,8 +1629,7 @@ impl io::Result> { if self.directory_exists(name).await? { let file = self.value_dictionary_files(name).await?.blocks_file; - panic!(); - //Ok(Some(dict_file_get_count(file).await?)) + Ok(Some(dict_file_get_count(file).await?)) } else { Ok(None) } diff --git a/src/structure/tfc/dict.rs b/src/structure/tfc/dict.rs index cac4dca7..16118164 100644 --- a/src/structure/tfc/dict.rs +++ b/src/structure/tfc/dict.rs @@ -4,29 +4,9 @@ use crate::structure::{ util::calculate_width, LateLogArrayBufBuilder, LogArrayBufBuilder, MonotonicLogArray, }; use bytes::{BufMut, Bytes}; -use itertools::Itertools; use super::block::*; -pub fn build_dict_unchecked, I: Iterator>( - record_size: Option, - start_offset: u64, - offsets: &mut Vec, - data_buf: &mut B, - iter: I, -) { - let chunk_iter = iter.chunks(BLOCK_SIZE); - - let mut offset = start_offset; - for chunk in &chunk_iter { - let slices: Vec = chunk.collect(); - let borrows: Vec<&[u8]> = slices.iter().map(|s| s.as_ref()).collect(); - let size = build_block_unchecked(record_size, data_buf, &borrows); - offset += size as u64; - offsets.push(offset); - } -} - pub struct SizedDictBufBuilder { pub(crate) record_size: Option, block_offset: u64, @@ -341,14 +321,19 @@ mod tests { use super::*; use bytes::BytesMut; - fn build_dict_and_offsets, I: Iterator>( - array_buf: &mut B1, - data_buf: &mut B2, + fn build_dict_and_offsets>( + array_buf: B1, + data_buf: B2, vals: I, - ) { - let mut offsets = Vec::new(); - build_dict_unchecked(None, 0, &mut offsets, data_buf, vals); - build_offset_logarray(array_buf, offsets); + ) -> (B1, B2) { + let offsets = LateLogArrayBufBuilder::new(array_buf); + let mut builder = SizedDictBufBuilder::new(None, 0, 0, offsets, data_buf); + builder.add_all(vals); + let (mut array, data_buf, _, _) = builder.finalize(); + array.pop(); + let array_buf = array.finalize(); + + (array_buf, data_buf) } #[test] @@ -372,7 +357,7 @@ mod tests { let mut array_buf = BytesMut::new(); let mut data_buf = BytesMut::new(); - build_dict_and_offsets(&mut array_buf, &mut data_buf, strings.clone().into_iter()); + build_dict_and_offsets(&mut array_buf, &mut data_buf, strings.clone().into_iter().map(|s|Bytes::from(s))); let array_bytes = array_buf.freeze(); let data_bytes = data_buf.freeze(); @@ -463,7 +448,7 @@ mod tests { let mut array_buf = BytesMut::new(); let mut data_buf = BytesMut::new(); - build_dict_and_offsets(&mut array_buf, &mut data_buf, strings.clone().into_iter()); + build_dict_and_offsets(&mut array_buf, &mut data_buf, strings.clone().into_iter().map(Bytes::from)); let array_bytes = array_buf.freeze(); let data_bytes = data_buf.freeze(); @@ -495,7 +480,7 @@ mod tests { let mut array_buf = BytesMut::new(); let mut data_buf = BytesMut::new(); - build_dict_and_offsets(&mut array_buf, &mut data_buf, strings.clone().into_iter()); + build_dict_and_offsets(&mut 
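// Why the helper above pops before finalizing: SizedDictBufBuilder records
// an end offset after every block, including the last one, and that final
// offset is never the start of another block ("last offset is useless", as
// an earlier patch put it). Sketch of the bookkeeping, with made-up numbers:
fn kept_offsets(mut recorded: Vec<u64>) -> Vec<u64> {
    recorded.pop(); // the offset after the final block is useless for lookup
    recorded
}
// Two blocks with encoded sizes 20 and 13 record [20, 33]; only [20] is
// kept, since block 0 implicitly starts at 0 and block 1 at 20.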
array_buf, &mut data_buf, strings.clone().into_iter().map(Bytes::from)); let array_bytes = array_buf.freeze(); let data_bytes = data_buf.freeze(); diff --git a/src/structure/tfc/file.rs b/src/structure/tfc/file.rs index ed28fce4..645c85b6 100644 --- a/src/structure/tfc/file.rs +++ b/src/structure/tfc/file.rs @@ -1,13 +1,11 @@ +use byteorder::{BigEndian, ByteOrder}; use bytes::BytesMut; use std::io; -use tokio::io::AsyncWriteExt; +use tokio::io::{AsyncWriteExt, AsyncReadExt}; use crate::{storage::*, structure::util::sorted_iterator}; -use super::{ - dict::{build_dict_unchecked, build_offset_logarray}, - *, -}; +use super::*; pub async fn merge_string_dictionaries< 'a, @@ -32,11 +30,9 @@ pub async fn merge_string_dictionaries< let mut blocks_file_writer = dict_files.blocks_file.open_write().await?; let mut offsets_file_writer = dict_files.offsets_file.open_write().await?; - let mut offsets = Vec::new(); - let mut offsets_buf = BytesMut::new(); - let mut data_buf = BytesMut::new(); - build_dict_unchecked(None, 0, &mut offsets, &mut data_buf, sorted_iterator); - build_offset_logarray(&mut offsets_buf, offsets); + let mut builder = StringDictBufBuilder::new(BytesMut::new(), BytesMut::new()); + builder.add_all(sorted_iterator); + let (offsets_buf, data_buf) = builder.finalize(); offsets_file_writer.write_all(offsets_buf.as_ref()).await?; offsets_file_writer.flush().await?; @@ -74,17 +70,9 @@ pub async fn merge_typed_dictionaries< let mut blocks_file_writer = dict_files.blocks_file.open_write().await?; let mut offsets_file_writer = dict_files.offsets_file.open_write().await?; - let mut types_present_buf = BytesMut::new(); - let mut type_offsets_buf = BytesMut::new(); - let mut offsets_buf = BytesMut::new(); - let mut data_buf = BytesMut::new(); - build_multiple_segments( - &mut types_present_buf, - &mut type_offsets_buf, - &mut offsets_buf, - &mut data_buf, - sorted_iterator, - ); + let mut builder = TypedDictBufBuilder::new(BytesMut::new(), BytesMut::new(), BytesMut::new(), BytesMut::new()); + builder.add_all(sorted_iterator); + let (types_present_buf, type_offsets_buf, offsets_buf, data_buf) = builder.finalize(); types_present_file_writer .write_all(types_present_buf.as_ref()) @@ -108,3 +96,12 @@ pub async fn merge_typed_dictionaries< Ok(()) } + +pub async fn dict_file_get_count(file: F) -> io::Result { + let mut result = vec![0; 8]; + file.open_read_from(file.size().await? - 8) + .await? 
+ .read_exact(&mut result) + .await?; + Ok(BigEndian::read_u64(&result)) +} diff --git a/src/structure/tfc/mod.rs b/src/structure/tfc/mod.rs index d6297508..665e65f0 100644 --- a/src/structure/tfc/mod.rs +++ b/src/structure/tfc/mod.rs @@ -7,3 +7,4 @@ pub mod file; pub use typed::*; pub use block::{SizedDictEntry, SizedDictEntryBuf, OwnedSizedDictEntryBuf}; +pub use file::*; diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index b07dab59..5ee0a0d6 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -1,11 +1,9 @@ use crate::structure::{ tfc::block::{parse_block_control_records, BLOCK_SIZE}, - util::calculate_width, - LateLogArrayBufBuilder, LogArrayBufBuilder, MonotonicLogArray, + LateLogArrayBufBuilder, MonotonicLogArray, }; use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; -use itertools::*; use num_derive::FromPrimitive; use num_traits::FromPrimitive; use rug::Integer; @@ -14,7 +12,7 @@ use std::{borrow::Cow, marker::PhantomData}; use super::{ block::{IdLookupResult, SizedDictBlock, SizedDictEntry}, decimal::{decimal_to_storage, storage_to_decimal}, - dict::{build_dict_unchecked, build_offset_logarray, SizedDict, SizedDictBufBuilder}, + dict::{SizedDict, SizedDictBufBuilder}, integer::{bigint_to_storage, storage_to_bigint}, }; @@ -35,11 +33,6 @@ impl TypedDict { block_offsets: Bytes, data: Bytes, ) -> Self { - let types_present2 = types_present.clone(); - let type_offsets2 = type_offsets.clone(); - let block_offsets2 = block_offsets.clone(); - let data2 = data.clone(); - let types_present = MonotonicLogArray::parse(types_present).unwrap(); let type_offsets = MonotonicLogArray::parse(type_offsets).unwrap(); let block_offsets = MonotonicLogArray::parse(block_offsets).unwrap(); @@ -50,7 +43,7 @@ impl TypedDict { block_offsets, type_id_offsets: Vec::new(), num_entries: 0, - data, + data: data.slice(..data.len()-8), }; } let mut tally: u64 = 0; @@ -91,7 +84,7 @@ impl TypedDict { block_offsets, type_id_offsets, num_entries, - data, + data: data.slice(..data.len()-8), } } @@ -313,7 +306,7 @@ pub struct StringDict(TypedDictSegment); impl StringDict { pub fn parse(offsets: Bytes, data: Bytes) -> Self { - Self(TypedDictSegment::parse(offsets, data.slice(..data.len()), 0)) + Self(TypedDictSegment::parse(offsets, data.slice(..data.len()-8), 0)) } pub fn get(&self, index: usize) -> Option { @@ -372,7 +365,7 @@ impl StringDictBufBuilder { let (mut offsets_array, mut data_buf, _block_offset, id_offset) = self.0.finalize(); offsets_array.pop(); let offsets_buf = offsets_array.finalize(); - //data_buf.put_u64(id_offset); + data_buf.put_u64(id_offset); (offsets_buf, data_buf) } @@ -635,69 +628,6 @@ impl ToLexical for Decimal { } } -pub fn build_segment, I: Iterator>( - record_size: Option, - offsets: &mut Vec, - data_buf: &mut B, - iter: I, -) { - let slices = iter.map(|val| val.to_lexical()); - build_dict_unchecked(record_size, 0, offsets, data_buf, slices); -} - -pub fn build_multiple_segments< - B1: BufMut, - B2: BufMut, - B3: BufMut, - B4: BufMut, - R: AsRef<[u8]>, - I: Iterator, ->( - used_types_buf: &mut B1, - type_offsets_buf: &mut B2, - block_offsets_buf: &mut B3, - data_buf: &mut B4, - iter: I, -) { - let mut types: Vec = Vec::new(); - let mut type_offsets: Vec = Vec::new(); - let mut offsets = Vec::with_capacity(iter.size_hint().0); - for (key, group) in iter.group_by(|v| v.0).into_iter() { - let start_offset = offsets.last().map(|t| *t).unwrap_or(0_u64); - let start_type_offset = offsets.len(); - 
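// The convention wired through these patches: the builder appends its final
// id offset (the entry count) to the data file as a big-endian u64, parsers
// slice those 8 bytes off (data.slice(..data.len() - 8)), and
// dict_file_get_count above reads them back. A minimal round-trip of just
// that trailer (a sketch using the bytes and byteorder crates; the helper
// names are ours):
use byteorder::{BigEndian, ByteOrder};
use bytes::{BufMut, Bytes, BytesMut};

fn append_count(mut data: BytesMut, count: u64) -> Bytes {
    data.put_u64(count); // BufMut::put_u64 writes big-endian
    data.freeze()
}

fn split_count(data: &Bytes) -> (Bytes, u64) {
    let payload = data.slice(..data.len() - 8);
    let count = BigEndian::read_u64(&data[data.len() - 8..]);
    (payload, count)
}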
types.push(key); - type_offsets.push(start_type_offset as u64); - build_dict_unchecked( - key.record_size(), - start_offset, - &mut offsets, - data_buf, - group.map(|v| v.1), - ); - } - - build_offset_logarray(block_offsets_buf, offsets); - let largest_type = types.last().unwrap(); - let largest_type_offset = type_offsets.last().unwrap(); - - let types_width = calculate_width(*largest_type as u64); - let type_offsets_width = calculate_width(*largest_type_offset); - - let mut types_builder = LogArrayBufBuilder::new(used_types_buf, types_width); - let mut type_offsets_builder = LogArrayBufBuilder::new(type_offsets_buf, type_offsets_width); - - for t in types { - types_builder.push(t as u64); - } - - for o in type_offsets.into_iter().skip(1) { - type_offsets_builder.push(o - 1); - } - - types_builder.finalize(); - type_offsets_builder.finalize(); -} - pub struct TypedDictBufBuilder { types_present_builder: LateLogArrayBufBuilder, type_offsets_builder: LateLogArrayBufBuilder, @@ -769,13 +699,14 @@ impl TypedDictBufBuilder TypedDictBufBuilder, + >( + used_types_buf: &mut B1, + type_offsets_buf: &mut B2, + block_offsets_buf: &mut B3, + data_buf: &mut B4, + iter: I, + ) { + let mut builder = TypedDictBufBuilder::new(used_types_buf, type_offsets_buf, block_offsets_buf, data_buf); + builder.add_all(iter); + builder.finalize(); + } + fn build_segment_and_offsets< B1: BufMut, @@ -799,13 +746,18 @@ mod tests { I: Iterator, >( dt: Datatype, - array_buf: &mut B1, - data_buf: &mut B2, + array_buf: B1, + data_buf: B2, iter: I, - ) { - let mut offsets = Vec::new(); - build_segment(dt.record_size(), &mut offsets, data_buf, iter); - build_offset_logarray(array_buf, offsets); + ) -> (B1, B2) { + let offsets = LateLogArrayBufBuilder::new(array_buf); + let mut builder = SizedDictBufBuilder::new(dt.record_size(), 0, 0, offsets, data_buf); + builder.add_all(iter.map(|v|v.to_lexical())); + let (mut offsets_array, data_buf, _, _) = builder.finalize(); + offsets_array.pop(); + let offsets_buf = offsets_array.finalize(); + + (offsets_buf, data_buf) } #[test] From b017c5e6b2c79f35c87907c50e9894d8e13ab835 Mon Sep 17 00:00:00 2001 From: Gavin Mendel-Gleason Date: Tue, 6 Dec 2022 17:34:41 +0100 Subject: [PATCH 64/99] Adding multiblock logic --- src/structure/tfc/typed.rs | 146 ++++++++++++++++++++++++++++++++++--- 1 file changed, 134 insertions(+), 12 deletions(-) diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index 5ee0a0d6..39b62a67 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -43,7 +43,7 @@ impl TypedDict { block_offsets, type_id_offsets: Vec::new(), num_entries: 0, - data: data.slice(..data.len()-8), + data: data.slice(..data.len() - 8), }; } let mut tally: u64 = 0; @@ -84,7 +84,7 @@ impl TypedDict { block_offsets, type_id_offsets, num_entries, - data: data.slice(..data.len()-8), + data: data.slice(..data.len() - 8), } } @@ -134,6 +134,8 @@ impl TypedDict { if len == 0 { // any slice will do logarray_slice = self.block_offsets.slice(0, 0); + } else if i == 0 { + logarray_slice = self.block_offsets.slice(type_offset, len); } else { logarray_slice = self.block_offsets.slice(type_offset + 1, len); } @@ -306,7 +308,11 @@ pub struct StringDict(TypedDictSegment); impl StringDict { pub fn parse(offsets: Bytes, data: Bytes) -> Self { - Self(TypedDictSegment::parse(offsets, data.slice(..data.len()-8), 0)) + Self(TypedDictSegment::parse( + offsets, + data.slice(..data.len() - 8), + 0, + )) } pub fn get(&self, index: usize) -> Option { @@ -333,10 +339,7 @@ impl StringDict { 
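// Usage sketch of the StringDict pair introduced here (signatures as in
// these patches; the strings are made up, and ids are assumed 1-based as
// in the tests below):
use bytes::{Bytes, BytesMut};

fn string_dict_roundtrip() {
    let mut builder = StringDictBufBuilder::new(BytesMut::new(), BytesMut::new());
    assert_eq!(1, builder.add(Bytes::from("aaaaaa")));
    assert_eq!(2, builder.add(Bytes::from("aabb")));
    let (offsets, data) = builder.finalize();

    let dict = StringDict::parse(offsets.freeze(), data.freeze());
    assert_eq!(2, dict.num_entries());
    assert_eq!(Some("aaaaaa".to_string()), dict.get(1));
}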
pub struct StringDictBufBuilder(SizedDictBufBuilder); impl StringDictBufBuilder { - pub fn new( - offsets_buf: B1, - data_buf: B2, - ) -> Self { + pub fn new(offsets_buf: B1, data_buf: B2) -> Self { let offsets = LateLogArrayBufBuilder::new(offsets_buf); Self(SizedDictBufBuilder::new(None, 0, 0, offsets, data_buf)) } @@ -720,24 +723,28 @@ impl TypedDictBufBuilder, - >( + >( used_types_buf: &mut B1, type_offsets_buf: &mut B2, block_offsets_buf: &mut B3, data_buf: &mut B4, iter: I, ) { - let mut builder = TypedDictBufBuilder::new(used_types_buf, type_offsets_buf, block_offsets_buf, data_buf); + let mut builder = TypedDictBufBuilder::new( + used_types_buf, + type_offsets_buf, + block_offsets_buf, + data_buf, + ); builder.add_all(iter); builder.finalize(); } - fn build_segment_and_offsets< B1: BufMut, B2: BufMut, @@ -752,7 +759,7 @@ mod tests { ) -> (B1, B2) { let offsets = LateLogArrayBufBuilder::new(array_buf); let mut builder = SizedDictBufBuilder::new(dt.record_size(), 0, 0, offsets, data_buf); - builder.add_all(iter.map(|v|v.to_lexical())); + builder.add_all(iter.map(|v| v.to_lexical())); let (mut offsets_array, data_buf, _, _) = builder.finalize(); offsets_array.pop(); let offsets_buf = offsets_array.finalize(); @@ -1278,4 +1285,119 @@ mod tests { assert_eq!(vec[i], convert_entry(dict.entry(i + 1).unwrap())) } } + + #[test] + fn test_two_blocks() { + let mut vec: Vec<(Datatype, Bytes)> = vec![ + String::make_entry(&"fdsa"), + String::make_entry(&"a"), + String::make_entry(&"bc"), + String::make_entry(&"bcd"), + String::make_entry(&"z"), + String::make_entry(&"Batty"), + String::make_entry(&"Batman"), + String::make_entry(&"apple"), + String::make_entry(&"donkey"), + ]; + vec.sort(); + + let mut typed_builder = TypedDictBufBuilder::new( + BytesMut::new(), + BytesMut::new(), + BytesMut::new(), + BytesMut::new(), + ); + + let _results: Vec = vec + .clone() + .into_iter() + .map(|(dt, entry)| typed_builder.add(dt, entry)) + .collect(); + + let (used_types, type_offsets, block_offsets, data) = typed_builder.finalize(); + + let dict = TypedDict::from_parts( + used_types.freeze(), + type_offsets.freeze(), + block_offsets.freeze(), + data.freeze(), + ); + + for i in 0..vec.len() { + assert_eq!(vec[i], convert_entry(dict.entry(i + 1).unwrap())) + } + } + + #[test] + fn test_three_blocks() { + let mut vec: Vec<(Datatype, Bytes)> = vec![ + String::make_entry(&"fdsa"), + String::make_entry(&"a"), + String::make_entry(&"bc"), + String::make_entry(&"bcd"), + String::make_entry(&"z"), + String::make_entry(&"Batty"), + String::make_entry(&"Batman"), + String::make_entry(&"apple"), + String::make_entry(&"donkey"), + String::make_entry(&"pickle"), + String::make_entry(&"Pacify"), + String::make_entry(&"Buckle"), + String::make_entry(&"possibilities"), + String::make_entry(&"suspicious"), + String::make_entry(&"babble"), + String::make_entry(&"reformat"), + String::make_entry(&"refactor"), + String::make_entry(&"prereserve"), + String::make_entry(&"full"), + String::make_entry(&"block"), + String::make_entry(&"precalculate"), + String::make_entry(&"make"), + String::make_entry(&"Fix"), + String::make_entry(&"Remove"), + String::make_entry(&"Two"), + String::make_entry(&"typed"), + String::make_entry(&"fix"), + String::make_entry(&"Working"), + String::make_entry(&"write"), + String::make_entry(&"refactor"), + String::make_entry(&"only"), + String::make_entry(&"Implementation"), + String::make_entry(&"Add"), + String::make_entry(&"typed"), + String::make_entry(&"renamed"), + String::make_entry(&"move"), + 
String::make_entry(&"look"), + String::make_entry(&"implement"), + String::make_entry(&"test"), + String::make_entry(&"lookup"), + ]; + vec.sort(); + + let mut typed_builder = TypedDictBufBuilder::new( + BytesMut::new(), + BytesMut::new(), + BytesMut::new(), + BytesMut::new(), + ); + + let _results: Vec = vec + .clone() + .into_iter() + .map(|(dt, entry)| typed_builder.add(dt, entry)) + .collect(); + + let (used_types, type_offsets, block_offsets, data) = typed_builder.finalize(); + + let dict = TypedDict::from_parts( + used_types.freeze(), + type_offsets.freeze(), + block_offsets.freeze(), + data.freeze(), + ); + + for i in 0..vec.len() { + assert_eq!(vec[i], convert_entry(dict.entry(i + 1).unwrap())) + } + } } From 29c0640994561994509b21809825ad9c65d954b7 Mon Sep 17 00:00:00 2001 From: Gavin Mendel-Gleason Date: Wed, 7 Dec 2022 09:39:26 +0100 Subject: [PATCH 65/99] Moving types into their own file --- src/structure/tfc/datatypes.rs | 264 ++++++++++++++++++++++++++++++++ src/structure/tfc/integer.rs | 4 +- src/structure/tfc/mod.rs | 8 +- src/structure/tfc/typed.rs | 265 +-------------------------------- 4 files changed, 273 insertions(+), 268 deletions(-) create mode 100644 src/structure/tfc/datatypes.rs diff --git a/src/structure/tfc/datatypes.rs b/src/structure/tfc/datatypes.rs new file mode 100644 index 00000000..32389c14 --- /dev/null +++ b/src/structure/tfc/datatypes.rs @@ -0,0 +1,264 @@ +use super::{ + decimal::{decimal_to_storage, storage_to_decimal}, + integer::{bigint_to_storage, storage_to_bigint}, +}; +use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; +use bytes::{Buf, BufMut, Bytes, BytesMut}; +use num_derive::FromPrimitive; +use rug::Integer; + +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, FromPrimitive)] +pub enum Datatype { + String = 0, + UInt32, + Int32, + Float32, + UInt64, + Int64, + Float64, + Decimal, + BigInt, +} + +impl Datatype { + pub fn cast(self, b: B) -> T { + if T::datatype() != self { + panic!("not the right datatype"); + } + + T::from_lexical(b) + } + + pub fn record_size(&self) -> Option { + match self { + Datatype::String => None, + Datatype::UInt32 => Some(4), + Datatype::Int32 => Some(4), + Datatype::UInt64 => Some(8), + Datatype::Int64 => Some(8), + Datatype::Float32 => Some(4), + Datatype::Float64 => Some(8), + Datatype::Decimal => None, + Datatype::BigInt => None, + } + } +} + +pub trait TdbDataType { + fn datatype() -> Datatype; + fn from_lexical(b: B) -> Self; + + fn to_lexical(val: &T) -> Bytes + where + T: ToLexical + ?Sized, + { + val.to_lexical() + } + + fn make_entry(val: &T) -> (Datatype, Bytes) + where + T: ToLexical + ?Sized, + { + (Self::datatype(), val.to_lexical()) + } +} + +pub trait ToLexical { + fn to_lexical(&self) -> Bytes; +} + +impl> ToLexical for T { + fn to_lexical(&self) -> Bytes { + Bytes::copy_from_slice(self.as_ref().as_bytes()) + } +} + +impl TdbDataType for String { + fn datatype() -> Datatype { + Datatype::String + } + + fn from_lexical(mut b: B) -> Self { + let mut vec = vec![0; b.remaining()]; + b.copy_to_slice(&mut vec); + String::from_utf8(vec).unwrap() + } +} + +impl TdbDataType for u32 { + fn datatype() -> Datatype { + Datatype::UInt32 + } + + fn from_lexical(b: B) -> Self { + b.reader().read_u32::().unwrap() + } +} + +impl ToLexical for u32 { + fn to_lexical(&self) -> Bytes { + let mut buf = BytesMut::new().writer(); + buf.write_u32::(*self).unwrap(); + + buf.into_inner().freeze() + } +} + +const I32_BYTE_MASK: u32 = 0b1000_0000 << (3 * 8); +impl TdbDataType for i32 { + fn datatype() -> 
Datatype { + Datatype::Int32 + } + + fn from_lexical(b: B) -> Self { + let i = b.reader().read_u32::().unwrap(); + (I32_BYTE_MASK ^ i) as i32 + } +} + +impl ToLexical for i32 { + fn to_lexical(&self) -> Bytes { + let sign_flip = I32_BYTE_MASK ^ (*self as u32); + let mut buf = BytesMut::new().writer(); + buf.write_u32::(sign_flip).unwrap(); + buf.into_inner().freeze() + } +} + +impl TdbDataType for u64 { + fn datatype() -> Datatype { + Datatype::UInt64 + } + + fn from_lexical(b: B) -> Self { + b.reader().read_u64::().unwrap() + } +} + +impl ToLexical for u64 { + fn to_lexical(&self) -> Bytes { + let mut buf = BytesMut::new().writer(); + buf.write_u64::(*self).unwrap(); + + buf.into_inner().freeze() + } +} + +const I64_BYTE_MASK: u64 = 0b1000_0000 << (7 * 8); +impl TdbDataType for i64 { + fn datatype() -> Datatype { + Datatype::Int64 + } + + fn from_lexical(b: B) -> Self { + let i = b.reader().read_u64::().unwrap(); + (I64_BYTE_MASK ^ i) as i64 + } +} + +impl ToLexical for i64 { + fn to_lexical(&self) -> Bytes { + let sign_flip = I64_BYTE_MASK ^ (*self as u64); + let mut buf = BytesMut::new().writer(); + buf.write_u64::(sign_flip).unwrap(); + buf.into_inner().freeze() + } +} + +const F32_SIGN_MASK: u32 = 0x8000_0000; +const F32_COMPLEMENT: u32 = 0xffff_ffff; +impl TdbDataType for f32 { + fn datatype() -> Datatype { + Datatype::Float32 + } + + fn from_lexical(b: B) -> Self { + let i = b.reader().read_u32::().unwrap(); + if i & F32_SIGN_MASK > 0 { + f32::from_bits(i ^ F32_SIGN_MASK) + } else { + f32::from_bits(i ^ F32_COMPLEMENT) + } + } +} + +impl ToLexical for f32 { + fn to_lexical(&self) -> Bytes { + let f = *self; + let g: u32 = if f.signum() == -1.0 { + f.to_bits() ^ F32_COMPLEMENT + } else { + f.to_bits() ^ F32_SIGN_MASK + }; + let mut buf = BytesMut::new().writer(); + buf.write_u32::(g).unwrap(); + buf.into_inner().freeze() + } +} + +const F64_SIGN_MASK: u64 = 0x8000_0000_0000_0000; +const F64_COMPLEMENT: u64 = 0xffff_ffff_ffff_ffff; +impl TdbDataType for f64 { + fn datatype() -> Datatype { + Datatype::Float64 + } + + fn from_lexical(b: B) -> Self { + let i = b.reader().read_u64::().unwrap(); + if i & F64_SIGN_MASK > 0 { + f64::from_bits(i ^ F64_SIGN_MASK) + } else { + f64::from_bits(i ^ F64_COMPLEMENT) + } + } +} + +impl ToLexical for f64 { + fn to_lexical(&self) -> Bytes { + let f = *self; + let g: u64; + if f.signum() == -1.0 { + g = f.to_bits() ^ F64_COMPLEMENT; + } else { + g = f.to_bits() ^ F64_SIGN_MASK; + }; + let mut buf = BytesMut::new().writer(); + buf.write_u64::(g).unwrap(); + buf.into_inner().freeze() + } +} + +impl TdbDataType for Integer { + fn datatype() -> Datatype { + Datatype::BigInt + } + + fn from_lexical(mut b: B) -> Self { + storage_to_bigint(&mut b) + } +} + +impl ToLexical for Integer { + fn to_lexical(&self) -> Bytes { + Bytes::from(bigint_to_storage(self.clone())) + } +} + +#[derive(PartialEq, Debug)] +pub struct Decimal(String); + +impl TdbDataType for Decimal { + fn datatype() -> Datatype { + Datatype::Decimal + } + + fn from_lexical(mut b: B) -> Self { + Decimal(storage_to_decimal(&mut b)) + } +} + +impl ToLexical for Decimal { + fn to_lexical(&self) -> Bytes { + Bytes::from(decimal_to_storage(&self.0)) + } +} diff --git a/src/structure/tfc/integer.rs b/src/structure/tfc/integer.rs index 25617add..f841b897 100644 --- a/src/structure/tfc/integer.rs +++ b/src/structure/tfc/integer.rs @@ -91,8 +91,8 @@ pub fn bigint_to_storage(bigint: Integer) -> Vec { } number_vec.extend(size_bytes); if is_neg { - for i in 0..number_vec.len() { - number_vec[i] = 
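// Context for the complement being rewritten here: for a negative bigint
// the encoder flips every byte of the stored form, presumably so that more
// negative values sort below less negative ones in plain byte order; the
// patch itself only swaps index-based access for iter_mut, the
// clippy-friendly form. The same idiom in isolation:
fn complement_in_place(bytes: &mut [u8]) {
    for b in bytes.iter_mut() {
        *b = !*b;
    }
}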
!number_vec[i] + for e in number_vec.iter_mut() { + *e = !*e; } } number_vec.reverse(); diff --git a/src/structure/tfc/mod.rs b/src/structure/tfc/mod.rs index 665e65f0..2c2f120c 100644 --- a/src/structure/tfc/mod.rs +++ b/src/structure/tfc/mod.rs @@ -1,10 +1,12 @@ pub mod block; +pub mod datatypes; pub mod decimal; pub mod dict; +pub mod file; pub mod integer; pub mod typed; -pub mod file; -pub use typed::*; -pub use block::{SizedDictEntry, SizedDictEntryBuf, OwnedSizedDictEntryBuf}; +pub use block::{OwnedSizedDictEntryBuf, SizedDictEntry, SizedDictEntryBuf}; +pub use datatypes::*; pub use file::*; +pub use typed::*; diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index 39b62a67..443ef34a 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -2,18 +2,14 @@ use crate::structure::{ tfc::block::{parse_block_control_records, BLOCK_SIZE}, LateLogArrayBufBuilder, MonotonicLogArray, }; -use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; -use bytes::{Buf, BufMut, Bytes, BytesMut}; -use num_derive::FromPrimitive; +use bytes::{BufMut, Bytes}; use num_traits::FromPrimitive; -use rug::Integer; use std::{borrow::Cow, marker::PhantomData}; use super::{ block::{IdLookupResult, SizedDictBlock, SizedDictEntry}, - decimal::{decimal_to_storage, storage_to_decimal}, dict::{SizedDict, SizedDictBufBuilder}, - integer::{bigint_to_storage, storage_to_bigint}, + Datatype, TdbDataType, ToLexical, }; #[derive(Clone, Debug)] @@ -374,263 +370,6 @@ impl StringDictBufBuilder { } } -#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, FromPrimitive)] -pub enum Datatype { - String = 0, - UInt32, - Int32, - Float32, - UInt64, - Int64, - Float64, - Decimal, - BigInt, -} - -impl Datatype { - pub fn cast(self, b: B) -> T { - if T::datatype() != self { - panic!("not the right datatype"); - } - - T::from_lexical(b) - } - - pub fn record_size(&self) -> Option { - match self { - Datatype::String => None, - Datatype::UInt32 => Some(4), - Datatype::Int32 => Some(4), - Datatype::UInt64 => Some(8), - Datatype::Int64 => Some(8), - Datatype::Float32 => Some(4), - Datatype::Float64 => Some(8), - Datatype::Decimal => None, - Datatype::BigInt => None, - } - } -} - -pub trait TdbDataType { - fn datatype() -> Datatype; - fn from_lexical(b: B) -> Self; - - fn to_lexical(val: &T) -> Bytes - where - T: ToLexical + ?Sized, - { - val.to_lexical() - } - - fn make_entry(val: &T) -> (Datatype, Bytes) - where - T: ToLexical + ?Sized, - { - (Self::datatype(), val.to_lexical()) - } -} - -pub trait ToLexical { - fn to_lexical(&self) -> Bytes; -} - -impl> ToLexical for T { - fn to_lexical(&self) -> Bytes { - Bytes::copy_from_slice(self.as_ref().as_bytes()) - } -} - -impl TdbDataType for String { - fn datatype() -> Datatype { - Datatype::String - } - - fn from_lexical(mut b: B) -> Self { - let mut vec = vec![0; b.remaining()]; - b.copy_to_slice(&mut vec); - String::from_utf8(vec).unwrap() - } -} - -impl TdbDataType for u32 { - fn datatype() -> Datatype { - Datatype::UInt32 - } - - fn from_lexical(b: B) -> Self { - b.reader().read_u32::().unwrap() - } -} - -impl ToLexical for u32 { - fn to_lexical(&self) -> Bytes { - let mut buf = BytesMut::new().writer(); - buf.write_u32::(*self).unwrap(); - - buf.into_inner().freeze() - } -} - -const I32_BYTE_MASK: u32 = 0b1000_0000 << (3 * 8); -impl TdbDataType for i32 { - fn datatype() -> Datatype { - Datatype::Int32 - } - - fn from_lexical(b: B) -> Self { - let i = b.reader().read_u32::().unwrap(); - (I32_BYTE_MASK ^ i) as i32 - } -} - -impl ToLexical for i32 { 
- fn to_lexical(&self) -> Bytes { - let sign_flip = I32_BYTE_MASK ^ (*self as u32); - let mut buf = BytesMut::new().writer(); - buf.write_u32::(sign_flip).unwrap(); - buf.into_inner().freeze() - } -} - -impl TdbDataType for u64 { - fn datatype() -> Datatype { - Datatype::UInt64 - } - - fn from_lexical(b: B) -> Self { - b.reader().read_u64::().unwrap() - } -} - -impl ToLexical for u64 { - fn to_lexical(&self) -> Bytes { - let mut buf = BytesMut::new().writer(); - buf.write_u64::(*self).unwrap(); - - buf.into_inner().freeze() - } -} - -const I64_BYTE_MASK: u64 = 0b1000_0000 << (7 * 8); -impl TdbDataType for i64 { - fn datatype() -> Datatype { - Datatype::Int64 - } - - fn from_lexical(b: B) -> Self { - let i = b.reader().read_u64::().unwrap(); - (I64_BYTE_MASK ^ i) as i64 - } -} - -impl ToLexical for i64 { - fn to_lexical(&self) -> Bytes { - let sign_flip = I64_BYTE_MASK ^ (*self as u64); - let mut buf = BytesMut::new().writer(); - buf.write_u64::(sign_flip).unwrap(); - buf.into_inner().freeze() - } -} - -const F32_SIGN_MASK: u32 = 0x8000_0000; -const F32_COMPLEMENT: u32 = 0xffff_ffff; -impl TdbDataType for f32 { - fn datatype() -> Datatype { - Datatype::Float32 - } - - fn from_lexical(b: B) -> Self { - let i = b.reader().read_u32::().unwrap(); - if i & F32_SIGN_MASK > 0 { - f32::from_bits(i ^ F32_SIGN_MASK) - } else { - f32::from_bits(i ^ F32_COMPLEMENT) - } - } -} - -impl ToLexical for f32 { - fn to_lexical(&self) -> Bytes { - let f = *self; - let g: u32; - if f.signum() == -1.0 { - g = f.to_bits() ^ F32_COMPLEMENT; - } else { - g = f.to_bits() ^ F32_SIGN_MASK; - }; - let mut buf = BytesMut::new().writer(); - buf.write_u32::(g).unwrap(); - buf.into_inner().freeze() - } -} - -const F64_SIGN_MASK: u64 = 0x8000_0000_0000_0000; -const F64_COMPLEMENT: u64 = 0xffff_ffff_ffff_ffff; -impl TdbDataType for f64 { - fn datatype() -> Datatype { - Datatype::Float64 - } - - fn from_lexical(b: B) -> Self { - let i = b.reader().read_u64::().unwrap(); - if i & F64_SIGN_MASK > 0 { - f64::from_bits(i ^ F64_SIGN_MASK) - } else { - f64::from_bits(i ^ F64_COMPLEMENT) - } - } -} - -impl ToLexical for f64 { - fn to_lexical(&self) -> Bytes { - let f = *self; - let g: u64; - if f.signum() == -1.0 { - g = f.to_bits() ^ F64_COMPLEMENT; - } else { - g = f.to_bits() ^ F64_SIGN_MASK; - }; - let mut buf = BytesMut::new().writer(); - buf.write_u64::(g).unwrap(); - buf.into_inner().freeze() - } -} - -impl TdbDataType for Integer { - fn datatype() -> Datatype { - Datatype::BigInt - } - - fn from_lexical(mut b: B) -> Self { - storage_to_bigint(&mut b) - } -} - -impl ToLexical for Integer { - fn to_lexical(&self) -> Bytes { - Bytes::from(bigint_to_storage(self.clone())) - } -} - -#[derive(PartialEq, Debug)] -pub struct Decimal(String); - -impl TdbDataType for Decimal { - fn datatype() -> Datatype { - Datatype::Decimal - } - - fn from_lexical(mut b: B) -> Self { - Decimal(storage_to_decimal(&mut b)) - } -} - -impl ToLexical for Decimal { - fn to_lexical(&self) -> Bytes { - Bytes::from(decimal_to_storage(&self.0)) - } -} - pub struct TypedDictBufBuilder { types_present_builder: LateLogArrayBufBuilder, type_offsets_builder: LateLogArrayBufBuilder, From f4e84072197e9321f7b45d28ffc3c28a2f7af730 Mon Sep 17 00:00:00 2001 From: Gavin Mendel-Gleason Date: Wed, 7 Dec 2022 09:46:22 +0100 Subject: [PATCH 66/99] Make tests pass (imports) --- src/structure/tfc/datatypes.rs | 2 +- src/structure/tfc/typed.rs | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/structure/tfc/datatypes.rs b/src/structure/tfc/datatypes.rs 
index 32389c14..a1ae6b7f 100644 --- a/src/structure/tfc/datatypes.rs +++ b/src/structure/tfc/datatypes.rs @@ -245,7 +245,7 @@ impl ToLexical for Integer { } #[derive(PartialEq, Debug)] -pub struct Decimal(String); +pub struct Decimal(pub String); impl TdbDataType for Decimal { fn datatype() -> Datatype { diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index 443ef34a..c33a4676 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -460,6 +460,11 @@ impl TypedDictBufBuilder Date: Wed, 7 Dec 2022 10:26:09 +0100 Subject: [PATCH 67/99] Satisfy linter --- src/structure/tfc/datatypes.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/structure/tfc/datatypes.rs b/src/structure/tfc/datatypes.rs index a1ae6b7f..1505030e 100644 --- a/src/structure/tfc/datatypes.rs +++ b/src/structure/tfc/datatypes.rs @@ -216,11 +216,10 @@ impl TdbDataType for f64 { impl ToLexical for f64 { fn to_lexical(&self) -> Bytes { let f = *self; - let g: u64; - if f.signum() == -1.0 { - g = f.to_bits() ^ F64_COMPLEMENT; + let g: u64 = if f.signum() == -1.0 { + f.to_bits() ^ F64_COMPLEMENT } else { - g = f.to_bits() ^ F64_SIGN_MASK; + f.to_bits() ^ F64_SIGN_MASK }; let mut buf = BytesMut::new().writer(); buf.write_u64::(g).unwrap(); From 641b55cc08b6616033466cce63cd1633136f0028 Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Wed, 7 Dec 2022 11:16:54 +0100 Subject: [PATCH 68/99] Made SizedDictEntry slightly more efficient for single byte structs --- src/structure/tfc/block.rs | 114 ++++++++++++++++++++++++++----------- src/structure/tfc/typed.rs | 4 +- 2 files changed, 84 insertions(+), 34 deletions(-) diff --git a/src/structure/tfc/block.rs b/src/structure/tfc/block.rs index f4332f2d..7dc8847c 100644 --- a/src/structure/tfc/block.rs +++ b/src/structure/tfc/block.rs @@ -3,6 +3,7 @@ use std::cmp::Ordering; use std::hash::{Hash, Hasher}; use bytes::{Buf, BufMut, Bytes, BytesMut}; +use itertools::Either; use crate::structure::{ util::{find_common_prefix, find_common_prefix_ord}, @@ -71,11 +72,26 @@ impl SizedBlockHeader { } #[derive(Clone, Debug)] -pub struct SizedDictEntry(pub Vec); +pub enum SizedDictEntry { + Single(Bytes), + Rope(Vec) +} + +impl From for SizedDictEntry { + fn from(val: Bytes) -> Self { + Self::Single(val) + } +} + +impl From> for SizedDictEntry { + fn from(val: Vec) -> Self { + Self::Rope(val) + } +} impl SizedDictEntry { pub fn new(parts: Vec) -> Self { - Self(parts) + Self::Rope(parts) } pub fn new_optimized(parts: Vec) -> Self { @@ -86,21 +102,41 @@ impl SizedDictEntry { } pub fn to_bytes(&self) -> Bytes { - if self.0.len() == 1 { - self.0[0].clone() - } else { - let mut buf = BytesMut::with_capacity(self.len()); - for slice in self.0.iter() { - buf.extend_from_slice(&slice[..]); + match self { + Self::Single(b) => b.clone(), + Self::Rope(v) => { + if v.len() == 1 { + v[0].clone() + } else { + let mut buf = BytesMut::with_capacity(self.len()); + for slice in v.iter() { + buf.extend_from_slice(&slice[..]); + } + + buf.freeze() + } } + } + } - buf.freeze() + pub fn chunks(&self) -> impl Iterator { + match self { + Self::Single(b) => Either::Left(std::iter::once(b)), + Self::Rope(v) => Either::Right(v.iter()) + } + } + + pub fn into_chunks(self) -> impl Iterator { + match self { + Self::Single(b) => Either::Left(std::iter::once(b)), + Self::Rope(v) => Either::Right(v.into_iter()) } } + pub fn to_vec(&self) -> Vec { let mut v = Vec::with_capacity(self.len()); - for slice in self.0.iter() { + for slice in self.chunks() { 
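// chunks() above leans on itertools::Either, which implements Iterator
// whenever both of its sides do, so a single method can return either a
// one-item iterator or a Vec iterator without boxing. The pattern in
// isolation (our own toy function, not part of the patch):
use itertools::Either;

fn one_or_many<'a>(single: Option<&'a u32>, many: &'a [u32]) -> impl Iterator<Item = &'a u32> {
    match single {
        Some(v) => Either::Left(std::iter::once(v)),
        None => Either::Right(many.iter()),
    }
}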
v.extend_from_slice(slice); } @@ -124,7 +160,14 @@ impl SizedDictEntry { } pub fn len(&self) -> usize { - self.0.iter().map(|s| s.len()).sum() + self.chunks().map(|s| s.len()).sum() + } + + fn rope_len(&self) -> usize { + match self { + Self::Single(_) => 1, + Self::Rope(v) => v.len() + } } /// optimize size @@ -133,15 +176,15 @@ impl SizedDictEntry { /// efficient than a copy of the string. This will copy the /// underlying string if that is the case. pub fn optimize(&mut self) { - let overhead_size = std::mem::size_of::() * self.0.len(); + let overhead_size = std::mem::size_of::() * self.rope_len(); if std::mem::size_of::() + self.len() < overhead_size { let mut bytes = BytesMut::with_capacity(self.len()); - for part in self.0.iter() { + for part in self.chunks() { bytes.extend(part); } - self.0 = vec![bytes.freeze()]; + *self = Self::Single(bytes.freeze()); } } @@ -151,7 +194,7 @@ impl SizedDictEntry { } else if self.len() == 0 { true } else { - let mut it = self.0.iter(); + let mut it = self.chunks(); let mut part = it.next().unwrap(); loop { let slice = b.chunk(); @@ -197,7 +240,7 @@ impl Eq for SizedDictEntry {} impl Hash for SizedDictEntry { fn hash(&self, state: &mut H) { - for part in self.0.iter() { + for part in self.chunks() { state.write(part); } } @@ -210,8 +253,8 @@ impl Ord for SizedDictEntry { return Ordering::Equal; } - let mut it1 = self.0.iter(); - let mut it2 = other.0.iter(); + let mut it1 = self.chunks(); + let mut it2 = other.chunks(); let mut part1 = it1.next().unwrap().clone(); let mut part2 = it2.next().unwrap().clone(); @@ -289,14 +332,22 @@ pub struct SizedDictEntryBuf<'a> { pos_in_slice: usize, } +impl<'a> SizedDictEntryBuf<'a> { + fn current_slice(&self) -> &Bytes { + match self.entry.as_ref() { + SizedDictEntry::Single(b) => &b, + SizedDictEntry::Rope(v) => &v[self.slice_ix] + } + } +} + impl<'a> Buf for SizedDictEntryBuf<'a> { fn remaining(&self) -> usize { { let pos_in_slice = self.pos_in_slice; let total: usize = self .entry - .0 - .iter() + .chunks() .skip(self.slice_ix) .map(|s| s.len()) .sum(); @@ -307,10 +358,10 @@ impl<'a> Buf for SizedDictEntryBuf<'a> { fn chunk(&self) -> &[u8] { { let pos_in_slice = self.pos_in_slice; - if self.slice_ix >= self.entry.0.len() { + if self.slice_ix >= self.entry.rope_len() { &[] } else { - let slice = &self.entry.0[self.slice_ix]; + let slice = self.current_slice(); &slice[pos_in_slice..] } } @@ -318,37 +369,36 @@ impl<'a> Buf for SizedDictEntryBuf<'a> { fn advance(&mut self, cnt: usize) { { - let pos_in_slice: &mut usize = &mut self.pos_in_slice; let mut cnt = cnt; - if self.slice_ix < self.entry.0.len() { - let slice = &self.entry.0[self.slice_ix]; - let remaining_in_slice = slice.len() - *pos_in_slice; + if self.slice_ix < self.entry.rope_len() { + let slice = self.current_slice(); + let remaining_in_slice = slice.len() - self.pos_in_slice; if remaining_in_slice > cnt { // we remain in the slice we're at. 
- *pos_in_slice += cnt; + self.pos_in_slice += cnt; } else { // we are starting at the next slice cnt -= remaining_in_slice; self.slice_ix += 1; loop { - if self.entry.0.len() >= self.slice_ix { + if self.entry.rope_len() >= self.slice_ix { // past the end - *pos_in_slice = 0; + self.pos_in_slice = 0; break; } - let slice_len = self.entry.0[self.slice_ix].len(); + let slice_len = self.current_slice().len(); if cnt < slice_len { // this is our slice - *pos_in_slice = cnt; + self.pos_in_slice = cnt; break; } // not our slice, so advance to next - cnt -= self.entry.0.len(); + cnt -= self.entry.rope_len(); self.slice_ix += 1; } } diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index c33a4676..c0b8a663 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -713,7 +713,7 @@ mod tests { for i in 1..vec.len() + 1 { let (t, s) = dict.entry(i).unwrap(); - assert_eq!(vec[i - 1], (t, s.0.into_iter().flatten().collect())); + assert_eq!(vec[i - 1], (t, s.into_chunks().flatten().collect())); } assert_eq!( @@ -781,7 +781,7 @@ mod tests { for i in 1..vec.len() + 1 { let (t, s) = dict.entry(i).unwrap(); - assert_eq!(vec[i - 1], (t, s.0.into_iter().flatten().collect())); + assert_eq!(vec[i - 1], (t, s.into_chunks().flatten().collect())); } assert_eq!("Batman".to_string(), dict.get::(1).unwrap()); From bcca7cfd4917c3e833731ccd6cb0a00d6d769234 Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Wed, 7 Dec 2022 12:27:13 +0100 Subject: [PATCH 69/99] implement TypedDictEntry --- src/layer/builder.rs | 2 +- src/layer/id_map.rs | 4 +- src/structure/tfc/datatypes.rs | 8 +-- src/structure/tfc/file.rs | 4 +- src/structure/tfc/typed.rs | 123 +++++++++++++++++++-------------- 5 files changed, 79 insertions(+), 62 deletions(-) diff --git a/src/layer/builder.rs b/src/layer/builder.rs index d8eeb111..a2e753c2 100644 --- a/src/layer/builder.rs +++ b/src/layer/builder.rs @@ -77,7 +77,7 @@ impl DictionarySetFileBuilder { pub fn add_value(&mut self, value: &str) -> u64 { let id = self .value_dictionary_builder - .add(Datatype::String, Bytes::copy_from_slice(value.as_bytes())); + .add(TypedDictEntry::new(Datatype::String, Bytes::copy_from_slice(value.as_bytes()).into())); id } diff --git a/src/layer/id_map.rs b/src/layer/id_map.rs index e01f3331..6857a11d 100644 --- a/src/layer/id_map.rs +++ b/src/layer/id_map.rs @@ -142,7 +142,7 @@ pub async fn construct_idmaps_from_structures .map(|x| x.0) }; - let typed_entry_comparator = |vals: &[Option<&(u64, (Datatype, SizedDictEntry))>]| { + let typed_entry_comparator = |vals: &[Option<&(u64, TypedDictEntry)>]| { vals.iter() .enumerate() .filter(|(_, x)| x.is_some()) @@ -150,7 +150,7 @@ pub async fn construct_idmaps_from_structures .map(|x| x.0) }; - let sorted_node_iter = sorted_iterator(node_iters, entry_comparator).map(|(i,s)|(i, (Datatype::String, s))); + let sorted_node_iter = sorted_iterator(node_iters, entry_comparator).map(|(i,s)|(i, TypedDictEntry::new(Datatype::String, s))); let sorted_value_iter = sorted_iterator(value_iters, typed_entry_comparator); let sorted_node_value_iter = sorted_node_iter.chain(sorted_value_iter).map(|(id, _)| id - 1); let sorted_predicate_iter = diff --git a/src/structure/tfc/datatypes.rs b/src/structure/tfc/datatypes.rs index 1505030e..63705feb 100644 --- a/src/structure/tfc/datatypes.rs +++ b/src/structure/tfc/datatypes.rs @@ -1,13 +1,13 @@ use super::{ decimal::{decimal_to_storage, storage_to_decimal}, - integer::{bigint_to_storage, storage_to_bigint}, + integer::{bigint_to_storage, 
storage_to_bigint}, TypedDictEntry, }; use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use num_derive::FromPrimitive; use rug::Integer; -#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, FromPrimitive)] +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, FromPrimitive, Hash)] pub enum Datatype { String = 0, UInt32, @@ -55,11 +55,11 @@ pub trait TdbDataType { val.to_lexical() } - fn make_entry(val: &T) -> (Datatype, Bytes) + fn make_entry(val: &T) -> TypedDictEntry where T: ToLexical + ?Sized, { - (Self::datatype(), val.to_lexical()) + TypedDictEntry::new(Self::datatype(), val.to_lexical().into()) } } diff --git a/src/structure/tfc/file.rs b/src/structure/tfc/file.rs index 645c85b6..316076cb 100644 --- a/src/structure/tfc/file.rs +++ b/src/structure/tfc/file.rs @@ -55,7 +55,7 @@ pub async fn merge_typed_dictionaries< ) -> io::Result<()> { let iterators: Vec<_> = dictionaries.map(|d| d.iter()).collect(); - let pick_fn = |vals: &[Option<&(Datatype, SizedDictEntry)>]| { + let pick_fn = |vals: &[Option<&TypedDictEntry>]| { vals.iter() .enumerate() .filter(|(_, v)| v.is_some()) @@ -63,7 +63,7 @@ pub async fn merge_typed_dictionaries< .map(|(ix, _)| ix) }; - let sorted_iterator = sorted_iterator(iterators, pick_fn).map(|(dt, elt)| (dt, elt.to_bytes())); + let sorted_iterator = sorted_iterator(iterators, pick_fn); let mut types_present_file_writer = dict_files.types_present_file.open_write().await?; let mut type_offsets_file_writer = dict_files.type_offsets_file.open_write().await?; diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index c0b8a663..142c7b53 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -9,9 +9,34 @@ use std::{borrow::Cow, marker::PhantomData}; use super::{ block::{IdLookupResult, SizedDictBlock, SizedDictEntry}, dict::{SizedDict, SizedDictBufBuilder}, - Datatype, TdbDataType, ToLexical, + Datatype, TdbDataType, ToLexical, SizedDictEntryBuf, OwnedSizedDictEntryBuf, }; +#[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub struct TypedDictEntry { + datatype: Datatype, + entry: SizedDictEntry +} + +impl TypedDictEntry { + pub fn new(datatype: Datatype, entry: SizedDictEntry) -> Self { + Self { + datatype, entry + } + } + pub fn to_bytes(&self) -> Bytes { + self.entry.to_bytes() + } + + pub fn as_buf(&self) -> SizedDictEntryBuf { + self.entry.as_buf() + } + + pub fn into_buf(self) -> OwnedSizedDictEntryBuf { + self.entry.into_buf() + } +} + #[derive(Clone, Debug)] pub struct TypedDict { types_present: MonotonicLogArray, @@ -85,14 +110,14 @@ impl TypedDict { } pub fn id>(&self, v: &Q) -> IdLookupResult { - let (datatype, bytes) = T::make_entry(v); + let entry = T::make_entry(v); - self.id_slice(datatype, bytes.as_ref()) + self.id_slice(entry.datatype, &entry.to_bytes()) } pub fn get(&self, id: usize) -> Option { let result = self.entry(id); - result.map(|(datatype, slice)| datatype.cast(slice.into_buf())) + result.map(|entry| entry.datatype.cast(entry.into_buf())) } fn inner_type_segment(&self, i: usize) -> (SizedDict, u64) { @@ -179,7 +204,7 @@ impl TypedDict { FromPrimitive::from_u64(self.types_present.entry(type_index)).unwrap() } - pub fn entry(&self, id: usize) -> Option<(Datatype, SizedDictEntry)> { + pub fn entry(&self, id: usize) -> Option { if id > self.num_entries() { return None; } @@ -187,7 +212,7 @@ impl TypedDict { let (dict, offset) = self.inner_type_segment(type_index); let dt = self.type_for_type_index(type_index); - dict.entry(id - 
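// TypedDictEntry, introduced in this patch, carries the Datatype tag along
// with the payload, so lookups like the one below can rebuild a typed value
// from one object. Constructing an entry by hand versus via the trait
// helper (both forms appear in this patch; the value is made up):
fn typed_entry_example() {
    let by_hand = TypedDictEntry::new(Datatype::UInt32, 42_u32.to_lexical().into());
    let by_trait = u32::make_entry(&42_u32);
    // both hold Datatype::UInt32 plus the 4-byte big-endian key for 42
    assert_eq!(by_hand, by_trait);
}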
offset as usize).map(|e| (dt, e)) + dict.entry(id - offset as usize).map(|e| TypedDictEntry::new(dt, e)) } pub fn num_entries(&self) -> usize { @@ -226,14 +251,14 @@ impl TypedDict { }) } - pub fn iter<'a>(&'a self) -> impl Iterator + 'a + Clone { + pub fn iter<'a>(&'a self) -> impl Iterator + 'a + Clone { self.block_iter() - .flat_map(|(datatype, segment)| segment.into_iter().map(move |entry| (datatype, entry))) + .flat_map(|(datatype, segment)| segment.into_iter().map(move |entry| TypedDictEntry::new(datatype, entry))) } - pub fn into_iter(self) -> impl Iterator + Clone { + pub fn into_iter(self) -> impl Iterator + Clone { self.into_block_iter() - .flat_map(|(datatype, segment)| segment.into_iter().map(move |entry| (datatype, entry))) + .flat_map(|(datatype, segment)| segment.into_iter().map(move |entry| TypedDictEntry::new(datatype, entry))) } } @@ -397,43 +422,39 @@ impl TypedDictBufBuilder u64 { + pub fn add(&mut self, value: TypedDictEntry) -> u64 { if self.current_datatype == None { - self.current_datatype = Some(dt); - self.types_present_builder.push(dt as u64); + self.current_datatype = Some(value.datatype); + self.types_present_builder.push(value.datatype as u64); self.sized_dict_buf_builder .as_mut() - .map(|b| b.record_size = dt.record_size()); + .map(|b| b.record_size = value.datatype.record_size()); } - if self.current_datatype != Some(dt) { + if self.current_datatype != Some(value.datatype) { let (block_offset_builder, data_buf, block_offset, id_offset) = self.sized_dict_buf_builder.take().unwrap().finalize(); - self.types_present_builder.push(dt as u64); + self.types_present_builder.push(value.datatype as u64); self.type_offsets_builder .push(block_offset_builder.count() as u64 - 1); self.sized_dict_buf_builder = Some(SizedDictBufBuilder::new( - dt.record_size(), + value.datatype.record_size(), block_offset, id_offset, block_offset_builder, data_buf, )); - self.current_datatype = Some(dt); + self.current_datatype = Some(value.datatype); } self.sized_dict_buf_builder .as_mut() - .map(|s| s.add(value)) + .map(|s| s.add(value.entry.to_bytes())) .unwrap() } - pub fn add_entry(&mut self, dt: Datatype, e: &SizedDictEntry) -> u64 { - self.add(dt, e.to_bytes()) - } - - pub fn add_all>(&mut self, it: I) -> Vec { - it.map(|(dt, val)| self.add(dt, val)).collect() + pub fn add_all>(&mut self, it: I) -> Vec { + it.map(|val| self.add(val)).collect() } pub fn finalize(self) -> (B1, B2, B3, B4) { @@ -471,7 +492,7 @@ mod tests { B2: BufMut, B3: BufMut, B4: BufMut, - I: Iterator, + I: Iterator, >( used_types_buf: &mut B1, type_offsets_buf: &mut B2, @@ -669,7 +690,7 @@ mod tests { #[test] fn test_multi_segment() { - let mut vec: Vec<(Datatype, Bytes)> = vec![ + let mut vec: Vec = vec![ Decimal::make_entry(&Decimal("-1".to_string())), String::make_entry(&"asdf"), Decimal::make_entry(&Decimal("-12342343.2348973".to_string())), @@ -712,8 +733,8 @@ mod tests { assert_eq!(IdLookupResult::Found(7), dict.id(&(-500_i32))); for i in 1..vec.len() + 1 { - let (t, s) = dict.entry(i).unwrap(); - assert_eq!(vec[i - 1], (t, s.into_chunks().flatten().collect())); + let entry = dict.entry(i).unwrap(); + assert_eq!(vec[i - 1], entry); } assert_eq!( @@ -724,7 +745,7 @@ mod tests { #[test] fn test_full_blocks() { - let mut vec: Vec<(Datatype, Bytes)> = vec![ + let mut vec: Vec = vec![ String::make_entry(&"fdsa"), String::make_entry(&"a"), String::make_entry(&"bc"), @@ -780,8 +801,8 @@ mod tests { assert_eq!(31, dict.num_entries()); for i in 1..vec.len() + 1 { - let (t, s) = dict.entry(i).unwrap(); - 
assert_eq!(vec[i - 1], (t, s.into_chunks().flatten().collect())); + let entry = dict.entry(i).unwrap(); + assert_eq!(vec[i - 1], entry); } assert_eq!("Batman".to_string(), dict.get::(1).unwrap()); @@ -822,7 +843,7 @@ mod tests { #[test] fn iterate_full_blocks() { - let mut vec: Vec<(Datatype, Bytes)> = vec![ + let mut vec: Vec = vec![ String::make_entry(&"fdsa"), String::make_entry(&"a"), String::make_entry(&"bc"), @@ -875,18 +896,14 @@ mod tests { data.freeze(), ); - let actual: Vec<_> = dict.iter().map(|(dt, e)| (dt, e.to_bytes())).collect(); + let actual: Vec<_> = dict.iter().collect(); assert_eq!(vec, actual); } - fn convert_entry(e: (Datatype, SizedDictEntry)) -> (Datatype, Bytes) { - (e.0, e.1.to_bytes()) - } - #[test] fn test_one_string() { - let vec: Vec<(Datatype, Bytes)> = vec![String::make_entry(&"fdsa")]; + let vec: Vec = vec![String::make_entry(&"fdsa")]; let used_types_buf = BytesMut::new(); let type_offsets_buf = BytesMut::new(); let block_offsets_buf = BytesMut::new(); @@ -902,7 +919,7 @@ mod tests { let _results: Vec = vec .clone() .into_iter() - .map(|(dt, entry)| typed_builder.add(dt, entry)) + .map(|entry| typed_builder.add(entry)) .collect(); let (used_types, type_offsets, block_offsets, data) = typed_builder.finalize(); @@ -913,12 +930,12 @@ mod tests { block_offsets.freeze(), data.freeze(), ); - assert_eq!(vec[0], convert_entry(dict.entry(1).unwrap())) + assert_eq!(vec[0], dict.entry(1).unwrap()) } #[test] fn test_incremental_builder() { - let mut vec: Vec<(Datatype, Bytes)> = vec![ + let mut vec: Vec = vec![ String::make_entry(&"fdsa"), String::make_entry(&"a"), String::make_entry(&"bc"), @@ -968,7 +985,7 @@ mod tests { let _results: Vec = vec .clone() .into_iter() - .map(|(dt, entry)| typed_builder.add(dt, entry)) + .map(|entry| typed_builder.add(entry)) .collect(); let (used_types, type_offsets, block_offsets, data) = typed_builder.finalize(); @@ -981,13 +998,13 @@ mod tests { ); for i in 0..vec.len() { - assert_eq!(vec[i], convert_entry(dict.entry(i + 1).unwrap())) + assert_eq!(vec[i], dict.entry(i + 1).unwrap()) } } #[test] fn test_incremental_builder_small_dicts() { - let mut vec: Vec<(Datatype, Bytes)> = vec![ + let mut vec: Vec = vec![ String::make_entry(&"fdsa"), i32::make_entry(&-500_i32), u32::make_entry(&20_u32), @@ -1013,7 +1030,7 @@ mod tests { let _results: Vec = vec .clone() .into_iter() - .map(|(dt, entry)| typed_builder.add(dt, entry)) + .map(|entry| typed_builder.add(entry)) .collect(); let (used_types, type_offsets, block_offsets, data) = typed_builder.finalize(); @@ -1026,13 +1043,13 @@ mod tests { ); for i in 0..vec.len() { - assert_eq!(vec[i], convert_entry(dict.entry(i + 1).unwrap())) + assert_eq!(vec[i], dict.entry(i + 1).unwrap()) } } #[test] fn test_two_blocks() { - let mut vec: Vec<(Datatype, Bytes)> = vec![ + let mut vec: Vec = vec![ String::make_entry(&"fdsa"), String::make_entry(&"a"), String::make_entry(&"bc"), @@ -1055,7 +1072,7 @@ mod tests { let _results: Vec = vec .clone() .into_iter() - .map(|(dt, entry)| typed_builder.add(dt, entry)) + .map(|entry| typed_builder.add(entry)) .collect(); let (used_types, type_offsets, block_offsets, data) = typed_builder.finalize(); @@ -1068,13 +1085,13 @@ mod tests { ); for i in 0..vec.len() { - assert_eq!(vec[i], convert_entry(dict.entry(i + 1).unwrap())) + assert_eq!(vec[i], dict.entry(i + 1).unwrap()) } } #[test] fn test_three_blocks() { - let mut vec: Vec<(Datatype, Bytes)> = vec![ + let mut vec: Vec = vec![ String::make_entry(&"fdsa"), String::make_entry(&"a"), String::make_entry(&"bc"), @@ 
-1128,7 +1145,7 @@ mod tests { let _results: Vec = vec .clone() .into_iter() - .map(|(dt, entry)| typed_builder.add(dt, entry)) + .map(|entry| typed_builder.add(entry)) .collect(); let (used_types, type_offsets, block_offsets, data) = typed_builder.finalize(); @@ -1141,7 +1158,7 @@ mod tests { ); for i in 0..vec.len() { - assert_eq!(vec[i], convert_entry(dict.entry(i + 1).unwrap())) + assert_eq!(vec[i], dict.entry(i + 1).unwrap()) } } } From bdd4bec9364a642a82e6a1ad7a1943ddffe07e66 Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Wed, 7 Dec 2022 12:27:33 +0100 Subject: [PATCH 70/99] reformat everything --- src/layer/builder.rs | 21 ++++++++------------- src/layer/id_map.rs | 24 ++++++++++++++---------- src/storage/layer.rs | 14 +++----------- src/structure/tfc/block.rs | 14 +++++++------- src/structure/tfc/datatypes.rs | 3 ++- src/structure/tfc/dict.rs | 28 ++++++++++++++++++---------- src/structure/tfc/file.rs | 9 +++++++-- src/structure/tfc/typed.rs | 25 +++++++++++++++---------- 8 files changed, 74 insertions(+), 64 deletions(-) diff --git a/src/layer/builder.rs b/src/layer/builder.rs index a2e753c2..68adcbe4 100644 --- a/src/layer/builder.rs +++ b/src/layer/builder.rs @@ -24,14 +24,9 @@ impl DictionarySetFileBuilder { predicate_files: DictionaryFiles, value_files: TypedDictionaryFiles, ) -> io::Result { - let node_dictionary_builder = StringDictBufBuilder::new( - BytesMut::new(), - BytesMut::new(), - ); - let predicate_dictionary_builder = StringDictBufBuilder::new( - BytesMut::new(), - BytesMut::new(), - ); + let node_dictionary_builder = StringDictBufBuilder::new(BytesMut::new(), BytesMut::new()); + let predicate_dictionary_builder = + StringDictBufBuilder::new(BytesMut::new(), BytesMut::new()); let value_dictionary_builder = TypedDictBufBuilder::new( BytesMut::new(), BytesMut::new(), @@ -75,9 +70,10 @@ impl DictionarySetFileBuilder { /// /// Panics if the given value string is not a lexical successor of the previous value string. 
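    // Sketch of the entry construction this method performs internally,
    // assuming the TypedDictEntry/Datatype API introduced in the previous
    // patch (String::make_entry(&value) builds the same entry):
    //
    //     let entry = TypedDictEntry::new(
    //         Datatype::String,
    //         Bytes::copy_from_slice(value.as_bytes()).into(),
    //     );
    //     let id = self.value_dictionary_builder.add(entry);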
pub fn add_value(&mut self, value: &str) -> u64 { - let id = self - .value_dictionary_builder - .add(TypedDictEntry::new(Datatype::String, Bytes::copy_from_slice(value.as_bytes()).into())); + let id = self.value_dictionary_builder.add(TypedDictEntry::new( + Datatype::String, + Bytes::copy_from_slice(value.as_bytes()).into(), + )); id } @@ -140,8 +136,7 @@ impl DictionarySetFileBuilder { } pub async fn finalize(self) -> io::Result<()> { - let (mut node_offsets_buf, mut node_data_buf) = - self.node_dictionary_builder.finalize(); + let (mut node_offsets_buf, mut node_data_buf) = self.node_dictionary_builder.finalize(); let (mut predicate_offsets_buf, mut predicate_data_buf) = self.predicate_dictionary_builder.finalize(); let ( diff --git a/src/layer/id_map.rs b/src/layer/id_map.rs index 6857a11d..832972df 100644 --- a/src/layer/id_map.rs +++ b/src/layer/id_map.rs @@ -35,7 +35,7 @@ impl IdMap { if id > wtree.len() as u64 { None } else { - Some(wtree.lookup_one(id-1).unwrap() + 1) + Some(wtree.lookup_one(id - 1).unwrap() + 1) } }) .unwrap_or(id) @@ -48,7 +48,7 @@ impl IdMap { if id > wtree.len() as u64 { None } else { - let id:usize = id.try_into().unwrap(); + let id: usize = id.try_into().unwrap(); Some(wtree.decode_one(id - 1) + 1) } }) @@ -91,7 +91,7 @@ pub async fn construct_idmaps_from_structures let mut node_iters = Vec::with_capacity(len); let mut node_offset = 0; - let node_entries_len: Vec<_> = node_dicts.iter().map(|d|d.num_entries()).collect(); + let node_entries_len: Vec<_> = node_dicts.iter().map(|d| d.num_entries()).collect(); for (ix, dict) in node_dicts.into_iter().enumerate() { let idmap = node_value_idmaps[ix].clone(); let num_entries = dict.num_entries(); @@ -125,11 +125,12 @@ pub async fn construct_idmaps_from_structures for (ix, dict) in predicate_dicts.into_iter().enumerate() { let idmap = predicate_idmaps[ix].clone(); let num_entries = dict.num_entries(); - predicate_iters.push( - dict.into_iter() - .enumerate() - .map(move |(i, e)| (idmap.inner_to_outer(i as u64 + 1) + predicate_offset as u64, e)), - ); + predicate_iters.push(dict.into_iter().enumerate().map(move |(i, e)| { + ( + idmap.inner_to_outer(i as u64 + 1) + predicate_offset as u64, + e, + ) + })); predicate_offset += num_entries; } @@ -150,9 +151,12 @@ pub async fn construct_idmaps_from_structures .map(|x| x.0) }; - let sorted_node_iter = sorted_iterator(node_iters, entry_comparator).map(|(i,s)|(i, TypedDictEntry::new(Datatype::String, s))); + let sorted_node_iter = sorted_iterator(node_iters, entry_comparator) + .map(|(i, s)| (i, TypedDictEntry::new(Datatype::String, s))); let sorted_value_iter = sorted_iterator(value_iters, typed_entry_comparator); - let sorted_node_value_iter = sorted_node_iter.chain(sorted_value_iter).map(|(id, _)| id - 1); + let sorted_node_value_iter = sorted_node_iter + .chain(sorted_value_iter) + .map(|(id, _)| id - 1); let sorted_predicate_iter = sorted_iterator(predicate_iters, entry_comparator).map(|(id, _)| id - 1); diff --git a/src/storage/layer.rs b/src/storage/layer.rs index 2f63fc6d..323e6732 100644 --- a/src/storage/layer.rs +++ b/src/storage/layer.rs @@ -15,9 +15,7 @@ use crate::structure::dict_file_get_count; use crate::structure::logarray::logarray_file_get_length_and_width; use crate::structure::StringDict; use crate::structure::TypedDict; -use crate::structure::{ - util, AdjacencyList, BitIndex, LogArray, MonotonicLogArray, WaveletTree, -}; +use crate::structure::{util, AdjacencyList, BitIndex, LogArray, MonotonicLogArray, WaveletTree}; use std::convert::TryInto; use 
std::io; @@ -1569,10 +1567,7 @@ impl) + Rope(Vec), } impl From for SizedDictEntry { @@ -119,17 +119,17 @@ impl SizedDictEntry { } } - pub fn chunks(&self) -> impl Iterator { + pub fn chunks(&self) -> impl Iterator { match self { Self::Single(b) => Either::Left(std::iter::once(b)), - Self::Rope(v) => Either::Right(v.iter()) + Self::Rope(v) => Either::Right(v.iter()), } } - pub fn into_chunks(self) -> impl Iterator { + pub fn into_chunks(self) -> impl Iterator { match self { Self::Single(b) => Either::Left(std::iter::once(b)), - Self::Rope(v) => Either::Right(v.into_iter()) + Self::Rope(v) => Either::Right(v.into_iter()), } } @@ -166,7 +166,7 @@ impl SizedDictEntry { fn rope_len(&self) -> usize { match self { Self::Single(_) => 1, - Self::Rope(v) => v.len() + Self::Rope(v) => v.len(), } } @@ -336,7 +336,7 @@ impl<'a> SizedDictEntryBuf<'a> { fn current_slice(&self) -> &Bytes { match self.entry.as_ref() { SizedDictEntry::Single(b) => &b, - SizedDictEntry::Rope(v) => &v[self.slice_ix] + SizedDictEntry::Rope(v) => &v[self.slice_ix], } } } diff --git a/src/structure/tfc/datatypes.rs b/src/structure/tfc/datatypes.rs index 63705feb..44f790c1 100644 --- a/src/structure/tfc/datatypes.rs +++ b/src/structure/tfc/datatypes.rs @@ -1,6 +1,7 @@ use super::{ decimal::{decimal_to_storage, storage_to_decimal}, - integer::{bigint_to_storage, storage_to_bigint}, TypedDictEntry, + integer::{bigint_to_storage, storage_to_bigint}, + TypedDictEntry, }; use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; diff --git a/src/structure/tfc/dict.rs b/src/structure/tfc/dict.rs index 16118164..ce4deee1 100644 --- a/src/structure/tfc/dict.rs +++ b/src/structure/tfc/dict.rs @@ -159,8 +159,7 @@ impl SizedDict { pub fn num_blocks(&self) -> usize { if self.data.is_empty() { 0 - } - else { + } else { self.offsets.len() + 1 } } @@ -195,9 +194,7 @@ impl SizedDict { max = mid - 1; } Ordering::Greater => min = mid + 1, - Ordering::Equal => { - return IdLookupResult::Found((mid * BLOCK_SIZE + 1) as u64) - } // what luck! turns out the string we were looking for was the block head + Ordering::Equal => return IdLookupResult::Found((mid * BLOCK_SIZE + 1) as u64), // what luck! 
turns out the string we were looking for was the block head } } @@ -249,8 +246,7 @@ impl SizedDict { let num_blocks = self.num_blocks(); if num_blocks == 0 { 0 - } - else { + } else { let last_block_size = self.block_num_elements(num_blocks - 1); (num_blocks - 1) * BLOCK_SIZE + last_block_size as usize @@ -357,7 +353,11 @@ mod tests { let mut array_buf = BytesMut::new(); let mut data_buf = BytesMut::new(); - build_dict_and_offsets(&mut array_buf, &mut data_buf, strings.clone().into_iter().map(|s|Bytes::from(s))); + build_dict_and_offsets( + &mut array_buf, + &mut data_buf, + strings.clone().into_iter().map(|s| Bytes::from(s)), + ); let array_bytes = array_buf.freeze(); let data_bytes = data_buf.freeze(); @@ -448,7 +448,11 @@ mod tests { let mut array_buf = BytesMut::new(); let mut data_buf = BytesMut::new(); - build_dict_and_offsets(&mut array_buf, &mut data_buf, strings.clone().into_iter().map(Bytes::from)); + build_dict_and_offsets( + &mut array_buf, + &mut data_buf, + strings.clone().into_iter().map(Bytes::from), + ); let array_bytes = array_buf.freeze(); let data_bytes = data_buf.freeze(); @@ -480,7 +484,11 @@ mod tests { let mut array_buf = BytesMut::new(); let mut data_buf = BytesMut::new(); - build_dict_and_offsets(&mut array_buf, &mut data_buf, strings.clone().into_iter().map(Bytes::from)); + build_dict_and_offsets( + &mut array_buf, + &mut data_buf, + strings.clone().into_iter().map(Bytes::from), + ); let array_bytes = array_buf.freeze(); let data_bytes = data_buf.freeze(); diff --git a/src/structure/tfc/file.rs b/src/structure/tfc/file.rs index 316076cb..12e9cad7 100644 --- a/src/structure/tfc/file.rs +++ b/src/structure/tfc/file.rs @@ -1,7 +1,7 @@ use byteorder::{BigEndian, ByteOrder}; use bytes::BytesMut; use std::io; -use tokio::io::{AsyncWriteExt, AsyncReadExt}; +use tokio::io::{AsyncReadExt, AsyncWriteExt}; use crate::{storage::*, structure::util::sorted_iterator}; @@ -70,7 +70,12 @@ pub async fn merge_typed_dictionaries< let mut blocks_file_writer = dict_files.blocks_file.open_write().await?; let mut offsets_file_writer = dict_files.offsets_file.open_write().await?; - let mut builder = TypedDictBufBuilder::new(BytesMut::new(), BytesMut::new(), BytesMut::new(), BytesMut::new()); + let mut builder = TypedDictBufBuilder::new( + BytesMut::new(), + BytesMut::new(), + BytesMut::new(), + BytesMut::new(), + ); builder.add_all(sorted_iterator); let (types_present_buf, type_offsets_buf, offsets_buf, data_buf) = builder.finalize(); diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index 142c7b53..cb999636 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -9,20 +9,18 @@ use std::{borrow::Cow, marker::PhantomData}; use super::{ block::{IdLookupResult, SizedDictBlock, SizedDictEntry}, dict::{SizedDict, SizedDictBufBuilder}, - Datatype, TdbDataType, ToLexical, SizedDictEntryBuf, OwnedSizedDictEntryBuf, + Datatype, OwnedSizedDictEntryBuf, SizedDictEntryBuf, TdbDataType, ToLexical, }; #[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] pub struct TypedDictEntry { datatype: Datatype, - entry: SizedDictEntry + entry: SizedDictEntry, } impl TypedDictEntry { pub fn new(datatype: Datatype, entry: SizedDictEntry) -> Self { - Self { - datatype, entry - } + Self { datatype, entry } } pub fn to_bytes(&self) -> Bytes { self.entry.to_bytes() @@ -212,7 +210,8 @@ impl TypedDict { let (dict, offset) = self.inner_type_segment(type_index); let dt = self.type_for_type_index(type_index); - dict.entry(id - offset as usize).map(|e| TypedDictEntry::new(dt, 
e)) + dict.entry(id - offset as usize) + .map(|e| TypedDictEntry::new(dt, e)) } pub fn num_entries(&self) -> usize { @@ -252,13 +251,19 @@ impl TypedDict { } pub fn iter<'a>(&'a self) -> impl Iterator + 'a + Clone { - self.block_iter() - .flat_map(|(datatype, segment)| segment.into_iter().map(move |entry| TypedDictEntry::new(datatype, entry))) + self.block_iter().flat_map(|(datatype, segment)| { + segment + .into_iter() + .map(move |entry| TypedDictEntry::new(datatype, entry)) + }) } pub fn into_iter(self) -> impl Iterator + Clone { - self.into_block_iter() - .flat_map(|(datatype, segment)| segment.into_iter().map(move |entry| TypedDictEntry::new(datatype, entry))) + self.into_block_iter().flat_map(|(datatype, segment)| { + segment + .into_iter() + .map(move |entry| TypedDictEntry::new(datatype, entry)) + }) } } From a0e9546a99edf786623f3a4cc0365be1cc592cb9 Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Wed, 7 Dec 2022 13:26:18 +0100 Subject: [PATCH 71/99] split from_lexical to its own trait --- src/structure/tfc/datatypes.rs | 35 +++++++++++++++++++++++++++------- src/structure/tfc/typed.rs | 1 + 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/src/structure/tfc/datatypes.rs b/src/structure/tfc/datatypes.rs index 44f790c1..90dff6a9 100644 --- a/src/structure/tfc/datatypes.rs +++ b/src/structure/tfc/datatypes.rs @@ -45,9 +45,8 @@ impl Datatype { } } -pub trait TdbDataType { +pub trait TdbDataType: FromLexical { fn datatype() -> Datatype; - fn from_lexical(b: B) -> Self; fn to_lexical(val: &T) -> Bytes where @@ -68,17 +67,17 @@ pub trait ToLexical { fn to_lexical(&self) -> Bytes; } +pub trait FromLexical { + fn from_lexical(b: B) -> Self; +} + impl> ToLexical for T { fn to_lexical(&self) -> Bytes { Bytes::copy_from_slice(self.as_ref().as_bytes()) } } -impl TdbDataType for String { - fn datatype() -> Datatype { - Datatype::String - } - +impl FromLexical for String { fn from_lexical(mut b: B) -> Self { let mut vec = vec![0; b.remaining()]; b.copy_to_slice(&mut vec); @@ -86,11 +85,19 @@ impl TdbDataType for String { } } +impl TdbDataType for String { + fn datatype() -> Datatype { + Datatype::String + } +} + impl TdbDataType for u32 { fn datatype() -> Datatype { Datatype::UInt32 } +} +impl FromLexical for u32 { fn from_lexical(b: B) -> Self { b.reader().read_u32::().unwrap() } @@ -110,7 +117,9 @@ impl TdbDataType for i32 { fn datatype() -> Datatype { Datatype::Int32 } +} +impl FromLexical for i32 { fn from_lexical(b: B) -> Self { let i = b.reader().read_u32::().unwrap(); (I32_BYTE_MASK ^ i) as i32 @@ -130,7 +139,9 @@ impl TdbDataType for u64 { fn datatype() -> Datatype { Datatype::UInt64 } +} +impl FromLexical for u64 { fn from_lexical(b: B) -> Self { b.reader().read_u64::().unwrap() } @@ -150,7 +161,9 @@ impl TdbDataType for i64 { fn datatype() -> Datatype { Datatype::Int64 } +} +impl FromLexical for i64 { fn from_lexical(b: B) -> Self { let i = b.reader().read_u64::().unwrap(); (I64_BYTE_MASK ^ i) as i64 @@ -172,7 +185,9 @@ impl TdbDataType for f32 { fn datatype() -> Datatype { Datatype::Float32 } +} +impl FromLexical for f32 { fn from_lexical(b: B) -> Self { let i = b.reader().read_u32::().unwrap(); if i & F32_SIGN_MASK > 0 { @@ -203,7 +218,9 @@ impl TdbDataType for f64 { fn datatype() -> Datatype { Datatype::Float64 } +} +impl FromLexical for f64 { fn from_lexical(b: B) -> Self { let i = b.reader().read_u64::().unwrap(); if i & F64_SIGN_MASK > 0 { @@ -232,7 +249,9 @@ impl TdbDataType for Integer { fn datatype() -> Datatype { Datatype::BigInt } +} +impl FromLexical 
for Integer { fn from_lexical(mut b: B) -> Self { storage_to_bigint(&mut b) } @@ -251,7 +270,9 @@ impl TdbDataType for Decimal { fn datatype() -> Datatype { Datatype::Decimal } +} +impl FromLexical for Decimal { fn from_lexical(mut b: B) -> Self { Decimal(storage_to_decimal(&mut b)) } diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index cb999636..d9e1097c 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -489,6 +489,7 @@ mod tests { use bytes::BytesMut; use rug::Integer; + use super::super::datatypes::FromLexical; use crate::structure::Decimal; use super::*; From 6147b2b35da30f651a13889c236b845ca21530ae Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Wed, 7 Dec 2022 13:32:51 +0100 Subject: [PATCH 72/99] put trait bound on TdbDataType ensuring FromLexical and ToLexical are implemented --- src/structure/tfc/datatypes.rs | 16 ++++++++-------- src/structure/tfc/typed.rs | 4 ++-- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/structure/tfc/datatypes.rs b/src/structure/tfc/datatypes.rs index 90dff6a9..d12e4e7b 100644 --- a/src/structure/tfc/datatypes.rs +++ b/src/structure/tfc/datatypes.rs @@ -45,16 +45,9 @@ impl Datatype { } } -pub trait TdbDataType: FromLexical { +pub trait TdbDataType: FromLexical + ToLexical { fn datatype() -> Datatype; - fn to_lexical(val: &T) -> Bytes - where - T: ToLexical + ?Sized, - { - val.to_lexical() - } - fn make_entry(val: &T) -> TypedDictEntry where T: ToLexical + ?Sized, @@ -257,6 +250,13 @@ impl FromLexical for Integer { } } +impl FromLexical for String { + fn from_lexical(mut b: B) -> Self { + // TODO make this better + storage_to_bigint(&mut b).to_string() + } +} + impl ToLexical for Integer { fn to_lexical(&self) -> Bytes { Bytes::from(bigint_to_storage(self.clone())) diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index d9e1097c..a3fa1aed 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -312,7 +312,7 @@ impl TypedDictSegment { } pub fn id>(&self, val: &Q) -> IdLookupResult { - let slice = T::to_lexical(val); + let slice = val.to_lexical(); self.dict.id(&slice[..]) } @@ -608,7 +608,7 @@ mod tests { where D: TdbDataType + PartialEq + Debug + ToLexical, { - let j = D::from_lexical(::to_lexical(&d)); + let j = D::from_lexical(d.to_lexical()); assert_eq!(d, j) } From 4e2b9e8b0788b218862d773b6906174ef4c0ffeb Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Wed, 7 Dec 2022 13:39:26 +0100 Subject: [PATCH 73/99] remove debug print --- src/layer/layer.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/layer/layer.rs b/src/layer/layer.rs index ca88a338..27680d31 100644 --- a/src/layer/layer.rs +++ b/src/layer/layer.rs @@ -79,7 +79,6 @@ pub trait Layer: Send + Sync { /// Returns true if the given triple exists, and false otherwise. 
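    // Round-trip sketch for the FromLexical/ToLexical split above, mirroring
    // the round-trip test in typed.rs (assumes a type implementing both
    // traits, such as i32):
    //
    //     let bytes = (-500_i32).to_lexical();
    //     assert_eq!(-500_i32, i32::from_lexical(bytes));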
fn string_triple_exists(&self, triple: &StringTriple) -> bool { - eprintln!("I am here"); self.string_triple_to_id(triple) .map(|t| self.id_triple_exists(t)) .unwrap_or(false) From 534068aa42c6f2b9f01b9a0e9cd8dee0d2abc954 Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Wed, 7 Dec 2022 14:45:55 +0100 Subject: [PATCH 74/99] change interface to allow adding of arbitrary values, not just strings --- benches/bench.rs | 4 +- benches/builder/data.rs | 8 +- benches/builder/main.rs | 8 +- examples/print_graph.rs | 5 +- examples/write_to_graph.rs | 12 +- src/layer/builder.rs | 11 +- src/layer/internal/base.rs | 15 +- src/layer/internal/child.rs | 30 ++-- src/layer/internal/mod.rs | 22 +-- src/layer/internal/object_iterator.rs | 36 ++--- src/layer/internal/predicate_iterator.rs | 38 ++--- src/layer/internal/subject_iterator.rs | 117 ++++++++-------- src/layer/layer.rs | 94 ++++++++----- src/layer/simple_builder.rs | 170 ++++++++++++++--------- src/lib.rs | 2 +- src/storage/cache.rs | 26 ++-- src/storage/delta.rs | 66 ++++----- src/storage/directory.rs | 67 +++++---- src/storage/layer.rs | 70 +++++----- src/storage/memory.rs | 18 +-- src/storage/pack.rs | 12 +- src/store/mod.rs | 90 ++++++------ src/store/sync.rs | 51 ++++--- src/structure/tfc/typed.rs | 5 + 24 files changed, 535 insertions(+), 442 deletions(-) diff --git a/benches/bench.rs b/benches/bench.rs index 274b2618..e5522071 100644 --- a/benches/bench.rs +++ b/benches/bench.rs @@ -3,7 +3,7 @@ extern crate test; use tempfile::tempdir; use terminus_store; -use terminus_store::layer::StringTriple; +use terminus_store::layer::ValueTriple; use test::Bencher; #[bench] @@ -14,7 +14,7 @@ fn bench_add_string_triple(b: &mut Bencher) { let mut count = 1; b.iter(|| { layer_builder - .add_string_triple(StringTriple::new_value( + .add_value_triple(ValueTriple::new_string_value( &count.to_string(), &count.to_string(), &count.to_string(), diff --git a/benches/builder/data.rs b/benches/builder/data.rs index fd3285e6..506209c7 100644 --- a/benches/builder/data.rs +++ b/benches/builder/data.rs @@ -1,7 +1,7 @@ use rand::distributions::Alphanumeric; use rand::prelude::*; use std::iter; -use terminus_store::layer::StringTriple; +use terminus_store::layer::ValueTriple; fn random_string(rand: &mut R, len_min: usize, len_max: usize) -> String { let len: usize = rand.gen_range(len_min..len_max); @@ -50,19 +50,19 @@ impl TestData { } } - pub fn random_triple(&mut self) -> StringTriple { + pub fn random_triple(&mut self) -> ValueTriple { let subject_ix = self.rand.gen_range(0..self.nodes.len()); let predicate_ix = self.rand.gen_range(0..self.predicates.len()); if self.rand.gen() { let object_ix = self.rand.gen_range(0..self.nodes.len()); - StringTriple::new_node( + ValueTriple::new_node( &self.nodes[subject_ix], &self.predicates[predicate_ix], &self.nodes[object_ix], ) } else { let object_ix = self.rand.gen_range(0..self.values.len()); - StringTriple::new_value( + ValueTriple::new_string_value( &self.nodes[subject_ix], &self.predicates[predicate_ix], &self.values[object_ix], diff --git a/benches/builder/main.rs b/benches/builder/main.rs index 83943370..9557c93b 100644 --- a/benches/builder/main.rs +++ b/benches/builder/main.rs @@ -38,7 +38,7 @@ fn build_base_layer_1000(b: &mut Bencher) { let builder = store.create_base_layer().unwrap(); for triple in triples.iter() { - builder.add_string_triple(triple.clone()).unwrap(); + builder.add_value_triple(triple.clone()).unwrap(); } let _base_layer = builder.commit().unwrap(); @@ -78,7 +78,7 @@ fn 
build_nonempty_child_layer_on_empty_base_layer(b: &mut Bencher) { let builder = base_layer.open_write().unwrap(); for triple in triples.iter() { - builder.add_string_triple(triple.clone()).unwrap(); + builder.add_value_triple(triple.clone()).unwrap(); } builder.commit().unwrap(); @@ -97,7 +97,7 @@ fn build_nonempty_child_layer_on_nonempty_base_layer(b: &mut Bencher) { let builder = store.create_base_layer().unwrap(); for _ in 0..1000 { - builder.add_string_triple(data.random_triple()).unwrap(); + builder.add_value_triple(data.random_triple()).unwrap(); } let base_layer = builder.commit().unwrap(); @@ -110,7 +110,7 @@ fn build_nonempty_child_layer_on_nonempty_base_layer(b: &mut Bencher) { let builder = base_layer.open_write().unwrap(); for triple in triples.iter() { - builder.add_string_triple(triple.clone()).unwrap(); + builder.add_value_triple(triple.clone()).unwrap(); } builder.commit().unwrap(); diff --git a/examples/print_graph.rs b/examples/print_graph.rs index 55776a1c..85513e91 100644 --- a/examples/print_graph.rs +++ b/examples/print_graph.rs @@ -1,6 +1,7 @@ use std::env; use std::io; +use terminus_store::structure::TdbDataType; use terminus_store::*; use tokio; @@ -21,7 +22,7 @@ async fn print_graph(store_path: &str, graph: &str) -> io::Result<()> { .expect("expected id triple to be mapable to string"); println!( - "{}, {}, {} {}", + "{}, {}, {} {:?}", triple.subject, triple.predicate, match triple.object { @@ -29,7 +30,7 @@ async fn print_graph(store_path: &str, graph: &str) -> io::Result<()> { ObjectType::Value(_) => "value", }, match triple.object { - ObjectType::Node(n) => n, + ObjectType::Node(n) => String::make_entry(&n), ObjectType::Value(v) => v, } ); diff --git a/examples/write_to_graph.rs b/examples/write_to_graph.rs index 90bcfde3..5a67ceb6 100644 --- a/examples/write_to_graph.rs +++ b/examples/write_to_graph.rs @@ -7,8 +7,8 @@ use tokio; use tokio::io::{self, AsyncBufReadExt}; enum Command { - Add(StringTriple), - Remove(StringTriple), + Add(ValueTriple), + Remove(ValueTriple), } async fn parse_command(s: &str) -> io::Result { @@ -25,8 +25,8 @@ async fn parse_command(s: &str) -> io::Result { let object = &matches[5]; let triple = match object_type_name { - "node" => StringTriple::new_node(subject, predicate, object), - "value" => StringTriple::new_value(subject, predicate, object), + "node" => ValueTriple::new_node(subject, predicate, object), + "value" => ValueTriple::new_string_value(subject, predicate, object), _ => { return Err(io::Error::new( io::ErrorKind::InvalidData, @@ -84,8 +84,8 @@ async fn process_commands(store_path: &str, graph: &str) -> io::Result<()> { // Since no io is happening, adding triples to the builder is // not a future. match command { - Command::Add(triple) => builder.add_string_triple(triple)?, - Command::Remove(triple) => builder.remove_string_triple(triple)?, + Command::Add(triple) => builder.add_value_triple(triple)?, + Command::Remove(triple) => builder.remove_value_triple(triple)?, } } diff --git a/src/layer/builder.rs b/src/layer/builder.rs index 68adcbe4..d8a18d73 100644 --- a/src/layer/builder.rs +++ b/src/layer/builder.rs @@ -69,11 +69,8 @@ impl DictionarySetFileBuilder { /// Add a value string. /// /// Panics if the given value string is not a lexical successor of the previous value string. 
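    // After this interface change, callers can store any TdbDataType, not
    // just strings; a sketch using the make_entry helper from datatypes.rs:
    //
    //     builder.add_value(String::make_entry(&"moo"));
    //     builder.add_value(u32::make_entry(&42_u32));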
- pub fn add_value(&mut self, value: &str) -> u64 { - let id = self.value_dictionary_builder.add(TypedDictEntry::new( - Datatype::String, - Bytes::copy_from_slice(value.as_bytes()).into(), - )); + pub fn add_value(&mut self, value: TypedDictEntry) -> u64 { + let id = self.value_dictionary_builder.add(value); id } @@ -119,7 +116,7 @@ impl DictionarySetFileBuilder { /// Add values from an iterable. /// /// Panics if the values are not in lexical order, or if previous added values are a lexical succesor of any of these values. - pub fn add_values + Unpin + Send + Sync>( + pub fn add_values + Unpin + Send + Sync>( &mut self, values: I, ) -> Vec @@ -128,7 +125,7 @@ impl DictionarySetFileBuilder { { let mut ids = Vec::new(); for value in values { - let id = self.add_value(&value); + let id = self.add_value(value); ids.push(id); } diff --git a/src/layer/internal/base.rs b/src/layer/internal/base.rs index 4b11fd06..eb7654cd 100644 --- a/src/layer/internal/base.rs +++ b/src/layer/internal/base.rs @@ -189,7 +189,7 @@ impl BaseLayerFileBuilder { /// Add a value string. /// /// Panics if the given value string is not a lexical successor of the previous value string. - pub fn add_value(&mut self, value: &str) -> u64 { + pub fn add_value(&mut self, value: TypedDictEntry) -> u64 { let id = self.builder.add_value(value); id @@ -230,7 +230,7 @@ impl BaseLayerFileBuilder { /// Add values from an iterable. /// /// Panics if the values are not in lexical order, or if previous added values are a lexical succesor of any of these values. - pub fn add_values + Send>( + pub fn add_values + Send>( &mut self, values: I, ) -> Vec @@ -466,7 +466,7 @@ pub mod tests { builder.add_nodes(nodes.into_iter().map(|s| s.to_string())); builder.add_predicates(predicates.into_iter().map(|s| s.to_string())); - builder.add_values(values.into_iter().map(|s| s.to_string())); + builder.add_values(values.into_iter().map(|s| String::make_entry(&s))); let mut builder = builder.into_phase2().await?; @@ -513,7 +513,12 @@ pub mod tests { assert_eq!(3, base_layer.subject_id("bbbbb").unwrap()); assert_eq!(2, base_layer.predicate_id("fghij").unwrap()); assert_eq!(1, base_layer.object_node_id("aaaaa").unwrap()); - assert_eq!(6, base_layer.object_value_id("chicken").unwrap()); + assert_eq!( + 6, + base_layer + .object_value_id(&String::make_entry(&"chicken")) + .unwrap() + ); assert_eq!("bbbbb", base_layer.id_subject(3).unwrap()); assert_eq!("fghij", base_layer.id_predicate(2).unwrap()); @@ -522,7 +527,7 @@ pub mod tests { base_layer.id_object(1).unwrap() ); assert_eq!( - ObjectType::Value("chicken".to_string()), + ObjectType::Value(String::make_entry(&"chicken")), base_layer.id_object(6).unwrap() ); } diff --git a/src/layer/internal/child.rs b/src/layer/internal/child.rs index 9690a297..37a368d7 100644 --- a/src/layer/internal/child.rs +++ b/src/layer/internal/child.rs @@ -258,8 +258,8 @@ impl ChildLayerFileBuil /// Does nothing if the value already exists in the paretn, and /// panics if the given value string is not a lexical successor of /// the previous value string. - pub fn add_value(&mut self, value: &str) -> u64 { - match self.parent.object_value_id(value) { + pub fn add_value(&mut self, value: TypedDictEntry) -> u64 { + match self.parent.object_value_id(&value) { None => self.builder.add_value(value), Some(id) => id, } @@ -317,7 +317,7 @@ impl ChildLayerFileBuil /// added values are a lexical succesor of any of these /// values. Skips any nodes that are already part of the base /// layer. 
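    // Sketch of the parent dedup behaviour of add_value above: re-adding a
    // value the parent layer already stores returns the parent's existing id
    // and writes nothing new into this child dictionary:
    //
    //     let id = child_builder.add_value(String::make_entry(&"baz"));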
- pub fn add_values + Send>( + pub fn add_values + Send>( &mut self, values: I, ) -> Vec @@ -327,7 +327,7 @@ impl ChildLayerFileBuil // TODO bulk check predicate existence let mut result = Vec::new(); for value in values { - let id = self.add_value(&value); + let id = self.add_value(value); result.push(id); } @@ -958,7 +958,7 @@ pub mod tests { .unwrap(); b.add_node("foo"); b.add_predicate("bar"); - b.add_value("baz"); + b.add_value(String::make_entry(&"baz")); let b = b.into_phase2().await.unwrap(); b.finalize().await.unwrap(); @@ -970,7 +970,12 @@ pub mod tests { assert_eq!(3, child_layer.subject_id("bbbbb").unwrap()); assert_eq!(2, child_layer.predicate_id("fghij").unwrap()); assert_eq!(1, child_layer.object_node_id("aaaaa").unwrap()); - assert_eq!(6, child_layer.object_value_id("chicken").unwrap()); + assert_eq!( + 6, + child_layer + .object_value_id(&String::make_entry(&"chicken")) + .unwrap() + ); assert_eq!("bbbbb", child_layer.id_subject(3).unwrap()); assert_eq!("fghij", child_layer.id_predicate(2).unwrap()); @@ -979,7 +984,7 @@ pub mod tests { child_layer.id_object(1).unwrap() ); assert_eq!( - ObjectType::Value("chicken".to_string()), + ObjectType::Value(String::make_entry(&"chicken")), child_layer.id_object(6).unwrap() ); } @@ -996,7 +1001,7 @@ pub mod tests { .unwrap(); b.add_node("foo"); b.add_predicate("bar"); - b.add_value("baz"); + b.add_value(String::make_entry(&"baz")); let b = b.into_phase2().await.unwrap(); b.finalize().await.unwrap(); @@ -1008,7 +1013,12 @@ pub mod tests { assert_eq!(11, child_layer.subject_id("foo").unwrap()); assert_eq!(5, child_layer.predicate_id("bar").unwrap()); assert_eq!(11, child_layer.object_node_id("foo").unwrap()); - assert_eq!(12, child_layer.object_value_id("baz").unwrap()); + assert_eq!( + 12, + child_layer + .object_value_id(&String::make_entry(&"baz")) + .unwrap() + ); assert_eq!("foo", child_layer.id_subject(11).unwrap()); assert_eq!("bar", child_layer.id_predicate(5).unwrap()); @@ -1017,7 +1027,7 @@ pub mod tests { child_layer.id_object(11).unwrap() ); assert_eq!( - ObjectType::Value("baz".to_string()), + ObjectType::Value(String::make_entry(&"baz")), child_layer.id_object(12).unwrap() ); } diff --git a/src/layer/internal/mod.rs b/src/layer/internal/mod.rs index a3ed3905..cd8d3455 100644 --- a/src/layer/internal/mod.rs +++ b/src/layer/internal/mod.rs @@ -241,16 +241,16 @@ impl InternalLayer { self.node_dictionary().num_entries() } - pub fn value_dict_id(&self, value: &str) -> IdLookupResult { - self.value_dictionary().id(&value) + pub fn value_dict_id(&self, value: &TypedDictEntry) -> IdLookupResult { + self.value_dictionary().id_entry(value) } pub fn value_dict_len(&self) -> usize { self.value_dictionary().num_entries() } - pub fn value_dict_get(&self, id: usize) -> Option { - self.value_dictionary().get(id) + pub fn value_dict_get(&self, id: usize) -> Option { + self.value_dictionary().entry(id) } pub fn internal_triple_addition_exists( @@ -604,7 +604,7 @@ impl Layer for InternalLayer { id_option.map(|id| id + parent_option.map_or(0, |p| p.node_and_value_count() as u64)) } - fn object_value_id<'a>(&'a self, object: &str) -> Option { + fn object_value_id<'a>(&'a self, object: &TypedDictEntry) -> Option { let to_result = |layer: &'a InternalLayer| { ( layer.value_dict_id(object).into_option().map(|i| { @@ -1057,13 +1057,13 @@ mod tests { let builder = store.create_base_layer().unwrap(); builder - .add_string_triple(StringTriple::new_value("cow", "says", "moo")) + .add_value_triple(ValueTriple::new_string_value("cow", "says", "moo")) 
.unwrap(); builder - .add_string_triple(StringTriple::new_node("cow", "likes", "duck")) + .add_value_triple(ValueTriple::new_node("cow", "likes", "duck")) .unwrap(); builder - .add_string_triple(StringTriple::new_value("duck", "says", "quack")) + .add_value_triple(ValueTriple::new_string_value("duck", "says", "quack")) .unwrap(); builder.commit().unwrap() @@ -1085,10 +1085,10 @@ mod tests { let builder = base_layer.open_write().unwrap(); builder - .remove_string_triple(StringTriple::new_value("cow", "says", "moo")) + .remove_value_triple(ValueTriple::new_string_value("cow", "says", "moo")) .unwrap(); builder - .add_string_triple(StringTriple::new_value("horse", "says", "neigh")) + .add_value_triple(ValueTriple::new_string_value("horse", "says", "neigh")) .unwrap(); let layer = builder.commit().unwrap(); @@ -1109,7 +1109,7 @@ mod tests { let mut builder = BaseLayerFileBuilder::from_files(&files).await.unwrap(); builder.add_nodes(nodes.into_iter().map(|s| s.to_string())); builder.add_predicates(predicates.into_iter().map(|s| s.to_string())); - builder.add_values(values.into_iter().map(|s| s.to_string())); + builder.add_values(values.into_iter().map(|s| String::make_entry(&s))); let mut builder = builder.into_phase2().await.unwrap(); builder.add_triple(3, 3, 3).await.unwrap(); builder.finalize().await.unwrap(); diff --git a/src/layer/internal/object_iterator.rs b/src/layer/internal/object_iterator.rs index 85a1fd98..aa2faf9e 100644 --- a/src/layer/internal/object_iterator.rs +++ b/src/layer/internal/object_iterator.rs @@ -244,7 +244,7 @@ mod tests { builder.add_nodes(nodes.into_iter().map(|s| s.to_string())); builder.add_predicates(predicates.into_iter().map(|s| s.to_string())); - builder.add_values(values.into_iter().map(|s| s.to_string())); + builder.add_values(values.into_iter().map(|s| String::make_entry(&s))); let mut builder = builder.into_phase2().await.unwrap(); builder.add_triple(1, 1, 2).await.unwrap(); @@ -409,40 +409,40 @@ mod tests { let mut builder = store.create_base_layer().await.unwrap(); let base_name = builder.name(); - builder.add_string_triple(StringTriple::new_value("cow", "says", "moo")); - builder.add_string_triple(StringTriple::new_value("duck", "says", "quack")); - builder.add_string_triple(StringTriple::new_node("cow", "likes", "duck")); - builder.add_string_triple(StringTriple::new_node("duck", "hates", "cow")); + builder.add_value_triple(ValueTriple::new_string_value("cow", "says", "moo")); + builder.add_value_triple(ValueTriple::new_string_value("duck", "says", "quack")); + builder.add_value_triple(ValueTriple::new_node("cow", "likes", "duck")); + builder.add_value_triple(ValueTriple::new_node("duck", "hates", "cow")); builder.commit_boxed().await.unwrap(); builder = store.create_child_layer(base_name).await.unwrap(); let child1_name = builder.name(); - builder.add_string_triple(StringTriple::new_value("horse", "says", "neigh")); - builder.add_string_triple(StringTriple::new_node("horse", "likes", "horse")); - builder.add_string_triple(StringTriple::new_node("horse", "likes", "cow")); + builder.add_value_triple(ValueTriple::new_string_value("horse", "says", "neigh")); + builder.add_value_triple(ValueTriple::new_node("horse", "likes", "horse")); + builder.add_value_triple(ValueTriple::new_node("horse", "likes", "cow")); builder.commit_boxed().await.unwrap(); builder = store.create_child_layer(child1_name).await.unwrap(); let child2_name = builder.name(); - builder.remove_string_triple(StringTriple::new_node("duck", "hates", "cow")); - 
builder.add_string_triple(StringTriple::new_node("duck", "likes", "cow")); + builder.remove_value_triple(ValueTriple::new_node("duck", "hates", "cow")); + builder.add_value_triple(ValueTriple::new_node("duck", "likes", "cow")); builder.commit_boxed().await.unwrap(); builder = store.create_child_layer(child2_name).await.unwrap(); let child3_name = builder.name(); - builder.remove_string_triple(StringTriple::new_node("duck", "likes", "cow")); - builder.add_string_triple(StringTriple::new_node("duck", "hates", "cow")); + builder.remove_value_triple(ValueTriple::new_node("duck", "likes", "cow")); + builder.add_value_triple(ValueTriple::new_node("duck", "hates", "cow")); builder.commit_boxed().await.unwrap(); builder = store.create_child_layer(child3_name).await.unwrap(); let child4_name = builder.name(); - builder.remove_string_triple(StringTriple::new_node("duck", "hates", "cow")); - builder.add_string_triple(StringTriple::new_node("duck", "likes", "cow")); - builder.add_string_triple(StringTriple::new_node("field", "contains", "cow")); + builder.remove_value_triple(ValueTriple::new_node("duck", "hates", "cow")); + builder.add_value_triple(ValueTriple::new_node("duck", "likes", "cow")); + builder.add_value_triple(ValueTriple::new_node("field", "contains", "cow")); builder.commit_boxed().await.unwrap(); let layer = store.get_layer(child4_name).await.unwrap().unwrap(); @@ -454,9 +454,9 @@ mod tests { .collect(); let expected = vec![ - StringTriple::new_node("duck", "likes", "cow"), - StringTriple::new_node("horse", "likes", "cow"), - StringTriple::new_node("field", "contains", "cow"), + ValueTriple::new_node("duck", "likes", "cow"), + ValueTriple::new_node("horse", "likes", "cow"), + ValueTriple::new_node("field", "contains", "cow"), ]; assert_eq!(expected, triples); diff --git a/src/layer/internal/predicate_iterator.rs b/src/layer/internal/predicate_iterator.rs index ca980e5a..35a54a8b 100644 --- a/src/layer/internal/predicate_iterator.rs +++ b/src/layer/internal/predicate_iterator.rs @@ -257,38 +257,38 @@ mod tests { let mut builder = store.create_base_layer().await.unwrap(); let base_name = builder.name(); - builder.add_string_triple(StringTriple::new_value("cow", "says", "moo")); - builder.add_string_triple(StringTriple::new_value("duck", "says", "quack")); - builder.add_string_triple(StringTriple::new_node("cow", "likes", "duck")); - builder.add_string_triple(StringTriple::new_node("duck", "hates", "cow")); + builder.add_value_triple(ValueTriple::new_string_value("cow", "says", "moo")); + builder.add_value_triple(ValueTriple::new_string_value("duck", "says", "quack")); + builder.add_value_triple(ValueTriple::new_node("cow", "likes", "duck")); + builder.add_value_triple(ValueTriple::new_node("duck", "hates", "cow")); builder.commit_boxed().await.unwrap(); builder = store.create_child_layer(base_name).await.unwrap(); let child1_name = builder.name(); - builder.add_string_triple(StringTriple::new_value("horse", "says", "neigh")); - builder.add_string_triple(StringTriple::new_node("horse", "likes", "horse")); + builder.add_value_triple(ValueTriple::new_string_value("horse", "says", "neigh")); + builder.add_value_triple(ValueTriple::new_node("horse", "likes", "horse")); builder.commit_boxed().await.unwrap(); builder = store.create_child_layer(child1_name).await.unwrap(); let child2_name = builder.name(); - builder.remove_string_triple(StringTriple::new_node("duck", "hates", "cow")); - builder.add_string_triple(StringTriple::new_node("duck", "likes", "cow")); + 
builder.remove_value_triple(ValueTriple::new_node("duck", "hates", "cow")); + builder.add_value_triple(ValueTriple::new_node("duck", "likes", "cow")); builder.commit_boxed().await.unwrap(); builder = store.create_child_layer(child2_name).await.unwrap(); let child3_name = builder.name(); - builder.remove_string_triple(StringTriple::new_node("duck", "likes", "cow")); - builder.add_string_triple(StringTriple::new_node("duck", "hates", "cow")); + builder.remove_value_triple(ValueTriple::new_node("duck", "likes", "cow")); + builder.add_value_triple(ValueTriple::new_node("duck", "hates", "cow")); builder.commit_boxed().await.unwrap(); builder = store.create_child_layer(child3_name).await.unwrap(); let child4_name = builder.name(); - builder.remove_string_triple(StringTriple::new_node("duck", "hates", "cow")); - builder.add_string_triple(StringTriple::new_node("duck", "likes", "cow")); + builder.remove_value_triple(ValueTriple::new_node("duck", "hates", "cow")); + builder.add_value_triple(ValueTriple::new_node("duck", "likes", "cow")); builder.commit_boxed().await.unwrap(); let layer = store.get_layer(child4_name).await.unwrap().unwrap(); @@ -300,9 +300,9 @@ mod tests { .collect(); let expected = vec![ - StringTriple::new_node("cow", "likes", "duck"), - StringTriple::new_node("duck", "likes", "cow"), - StringTriple::new_node("horse", "likes", "horse"), + ValueTriple::new_node("cow", "likes", "duck"), + ValueTriple::new_node("duck", "likes", "cow"), + ValueTriple::new_node("horse", "likes", "horse"), ]; assert_eq!(expected, triples); @@ -314,8 +314,8 @@ mod tests { let mut builder = store.create_base_layer().await.unwrap(); let base_name = builder.name(); - builder.add_string_triple(StringTriple::new_node("cow", "says", "moo")); - builder.add_string_triple(StringTriple::new_node("cow", "says", "quack")); + builder.add_value_triple(ValueTriple::new_node("cow", "says", "moo")); + builder.add_value_triple(ValueTriple::new_node("cow", "says", "quack")); builder.commit_boxed().await.unwrap(); let layer = store.get_layer(base_name).await.unwrap().unwrap(); @@ -326,8 +326,8 @@ mod tests { .collect(); let expected = vec![ - StringTriple::new_node("cow", "says", "moo"), - StringTriple::new_node("cow", "says", "quack"), + ValueTriple::new_node("cow", "says", "moo"), + ValueTriple::new_node("cow", "says", "quack"), ]; assert_eq!(expected, triples); diff --git a/src/layer/internal/subject_iterator.rs b/src/layer/internal/subject_iterator.rs index db54274f..5c356b50 100644 --- a/src/layer/internal/subject_iterator.rs +++ b/src/layer/internal/subject_iterator.rs @@ -455,6 +455,7 @@ mod tests { use crate::layer::base::tests::*; use crate::layer::child::tests::*; use crate::layer::*; + use crate::structure::TdbDataType; use std::sync::Arc; @@ -496,7 +497,7 @@ mod tests { builder.add_nodes(nodes.into_iter().map(|s| s.to_string())); builder.add_predicates(predicates.into_iter().map(|s| s.to_string())); - builder.add_values(values.into_iter().map(|s| s.to_string())); + builder.add_values(values.into_iter().map(|s| String::make_entry(&s))); let mut builder = builder.into_phase2().await.unwrap(); builder.add_triple(1, 1, 1).await.unwrap(); builder.add_triple(3, 2, 5).await.unwrap(); @@ -529,7 +530,7 @@ mod tests { builder.add_nodes(nodes.into_iter().map(|s| s.to_string())); builder.add_predicates(predicates.into_iter().map(|s| s.to_string())); - builder.add_values(values.into_iter().map(|s| s.to_string())); + builder.add_values(values.into_iter().map(|s| String::make_entry(&s))); let mut builder = 
builder.into_phase2().await.unwrap(); builder.add_triple(1, 1, 1).await.unwrap(); builder.add_triple(3, 2, 5).await.unwrap(); @@ -605,7 +606,7 @@ mod tests { builder.add_nodes(nodes.into_iter().map(|s| s.to_string())); builder.add_predicates(predicates.into_iter().map(|s| s.to_string())); - builder.add_values(values.into_iter().map(|s| s.to_string())); + builder.add_values(values.into_iter().map(|s| String::make_entry(&s))); let mut builder = builder.into_phase2().await.unwrap(); builder.add_triple(3, 2, 5).await.unwrap(); builder.add_triple(3, 3, 5).await.unwrap(); @@ -638,7 +639,7 @@ mod tests { builder.add_nodes(nodes.into_iter().map(|s| s.to_string())); builder.add_predicates(predicates.into_iter().map(|s| s.to_string())); - builder.add_values(values.into_iter().map(|s| s.to_string())); + builder.add_values(values.into_iter().map(|s| String::make_entry(&s))); let mut builder = builder.into_phase2().await.unwrap(); builder.add_triple(1, 1, 1).await.unwrap(); builder.add_triple(3, 2, 4).await.unwrap(); @@ -870,38 +871,38 @@ mod tests { let mut builder = store.create_base_layer().await.unwrap(); let base_name = builder.name(); - builder.add_string_triple(StringTriple::new_value("cow", "says", "moo")); - builder.add_string_triple(StringTriple::new_value("duck", "says", "quack")); - builder.add_string_triple(StringTriple::new_node("cow", "likes", "duck")); - builder.add_string_triple(StringTriple::new_node("duck", "hates", "cow")); + builder.add_value_triple(ValueTriple::new_string_value("cow", "says", "moo")); + builder.add_value_triple(ValueTriple::new_string_value("duck", "says", "quack")); + builder.add_value_triple(ValueTriple::new_node("cow", "likes", "duck")); + builder.add_value_triple(ValueTriple::new_node("duck", "hates", "cow")); builder.commit_boxed().await.unwrap(); builder = store.create_child_layer(base_name).await.unwrap(); let child1_name = builder.name(); - builder.add_string_triple(StringTriple::new_value("horse", "says", "neigh")); - builder.add_string_triple(StringTriple::new_node("horse", "likes", "horse")); + builder.add_value_triple(ValueTriple::new_string_value("horse", "says", "neigh")); + builder.add_value_triple(ValueTriple::new_node("horse", "likes", "horse")); builder.commit_boxed().await.unwrap(); builder = store.create_child_layer(child1_name).await.unwrap(); let child2_name = builder.name(); - builder.remove_string_triple(StringTriple::new_node("duck", "hates", "cow")); - builder.add_string_triple(StringTriple::new_node("duck", "likes", "cow")); + builder.remove_value_triple(ValueTriple::new_node("duck", "hates", "cow")); + builder.add_value_triple(ValueTriple::new_node("duck", "likes", "cow")); builder.commit_boxed().await.unwrap(); builder = store.create_child_layer(child2_name).await.unwrap(); let child3_name = builder.name(); - builder.remove_string_triple(StringTriple::new_node("duck", "likes", "cow")); - builder.add_string_triple(StringTriple::new_node("duck", "hates", "cow")); + builder.remove_value_triple(ValueTriple::new_node("duck", "likes", "cow")); + builder.add_value_triple(ValueTriple::new_node("duck", "hates", "cow")); builder.commit_boxed().await.unwrap(); builder = store.create_child_layer(child3_name).await.unwrap(); let child4_name = builder.name(); - builder.remove_string_triple(StringTriple::new_node("duck", "hates", "cow")); - builder.add_string_triple(StringTriple::new_node("duck", "likes", "cow")); + builder.remove_value_triple(ValueTriple::new_node("duck", "hates", "cow")); + builder.add_value_triple(ValueTriple::new_node("duck", 
"likes", "cow")); builder.commit_boxed().await.unwrap(); let layer = store.get_layer(child4_name).await.unwrap().unwrap(); @@ -913,8 +914,8 @@ mod tests { .collect(); let expected = vec![ - StringTriple::new_node("duck", "likes", "cow"), - StringTriple::new_value("duck", "says", "quack"), + ValueTriple::new_node("duck", "likes", "cow"), + ValueTriple::new_string_value("duck", "says", "quack"), ]; assert_eq!(expected, triples); @@ -926,42 +927,42 @@ mod tests { let mut builder = store.create_base_layer().await.unwrap(); let base_name = builder.name(); - builder.add_string_triple(StringTriple::new_value("cow", "says", "moo")); - builder.add_string_triple(StringTriple::new_value("duck", "says", "quack")); - builder.add_string_triple(StringTriple::new_node("cow", "likes", "duck")); - builder.add_string_triple(StringTriple::new_node("duck", "hates", "cow")); + builder.add_value_triple(ValueTriple::new_string_value("cow", "says", "moo")); + builder.add_value_triple(ValueTriple::new_string_value("duck", "says", "quack")); + builder.add_value_triple(ValueTriple::new_node("cow", "likes", "duck")); + builder.add_value_triple(ValueTriple::new_node("duck", "hates", "cow")); builder.commit_boxed().await.unwrap(); builder = store.create_child_layer(base_name).await.unwrap(); let child1_name = builder.name(); - builder.add_string_triple(StringTriple::new_value("horse", "says", "neigh")); - builder.add_string_triple(StringTriple::new_node("horse", "likes", "horse")); + builder.add_value_triple(ValueTriple::new_string_value("horse", "says", "neigh")); + builder.add_value_triple(ValueTriple::new_node("horse", "likes", "horse")); builder.commit_boxed().await.unwrap(); builder = store.create_child_layer(child1_name).await.unwrap(); let child2_name = builder.name(); - builder.remove_string_triple(StringTriple::new_node("duck", "hates", "cow")); - builder.add_string_triple(StringTriple::new_node("duck", "likes", "cow")); - builder.add_string_triple(StringTriple::new_node("duck", "likes", "horse")); + builder.remove_value_triple(ValueTriple::new_node("duck", "hates", "cow")); + builder.add_value_triple(ValueTriple::new_node("duck", "likes", "cow")); + builder.add_value_triple(ValueTriple::new_node("duck", "likes", "horse")); builder.commit_boxed().await.unwrap(); builder = store.create_child_layer(child2_name).await.unwrap(); let child3_name = builder.name(); - builder.remove_string_triple(StringTriple::new_node("duck", "likes", "cow")); - builder.add_string_triple(StringTriple::new_node("duck", "hates", "cow")); - builder.add_string_triple(StringTriple::new_node("duck", "likes", "pig")); + builder.remove_value_triple(ValueTriple::new_node("duck", "likes", "cow")); + builder.add_value_triple(ValueTriple::new_node("duck", "hates", "cow")); + builder.add_value_triple(ValueTriple::new_node("duck", "likes", "pig")); builder.commit_boxed().await.unwrap(); builder = store.create_child_layer(child3_name).await.unwrap(); let child4_name = builder.name(); - builder.remove_string_triple(StringTriple::new_node("duck", "hates", "cow")); - builder.remove_string_triple(StringTriple::new_node("duck", "likes", "horse")); - builder.add_string_triple(StringTriple::new_node("duck", "likes", "cow")); - builder.add_string_triple(StringTriple::new_node("duck", "likes", "rabbit")); + builder.remove_value_triple(ValueTriple::new_node("duck", "hates", "cow")); + builder.remove_value_triple(ValueTriple::new_node("duck", "likes", "horse")); + builder.add_value_triple(ValueTriple::new_node("duck", "likes", "cow")); + 
builder.add_value_triple(ValueTriple::new_node("duck", "likes", "rabbit")); builder.commit_boxed().await.unwrap(); let layer = store.get_layer(child4_name).await.unwrap().unwrap(); @@ -974,9 +975,9 @@ mod tests { .collect(); let expected = vec![ - StringTriple::new_node("duck", "likes", "cow"), - StringTriple::new_node("duck", "likes", "pig"), - StringTriple::new_node("duck", "likes", "rabbit"), + ValueTriple::new_node("duck", "likes", "cow"), + ValueTriple::new_node("duck", "likes", "pig"), + ValueTriple::new_node("duck", "likes", "rabbit"), ]; assert_eq!(expected, triples); @@ -987,38 +988,38 @@ mod tests { let mut builder = store.create_base_layer().await.unwrap(); let base_name = builder.name(); - builder.add_string_triple(StringTriple::new_value("cow", "says", "moo")); - builder.add_string_triple(StringTriple::new_value("sheep", "says", "baa")); - builder.add_string_triple(StringTriple::new_value("duck", "says", "quack")); - builder.add_string_triple(StringTriple::new_node("cow", "likes", "duck")); - builder.add_string_triple(StringTriple::new_node("duck", "hates", "cow")); + builder.add_value_triple(ValueTriple::new_string_value("cow", "says", "moo")); + builder.add_value_triple(ValueTriple::new_string_value("sheep", "says", "baa")); + builder.add_value_triple(ValueTriple::new_string_value("duck", "says", "quack")); + builder.add_value_triple(ValueTriple::new_node("cow", "likes", "duck")); + builder.add_value_triple(ValueTriple::new_node("duck", "hates", "cow")); builder.commit_boxed().await.unwrap(); builder = store.create_child_layer(base_name).await.unwrap(); let child1_name = builder.name(); - builder.add_string_triple(StringTriple::new_value("horse", "says", "woof")); - builder.add_string_triple(StringTriple::new_node("horse", "likes", "horse")); + builder.add_value_triple(ValueTriple::new_string_value("horse", "says", "woof")); + builder.add_value_triple(ValueTriple::new_node("horse", "likes", "horse")); builder.commit_boxed().await.unwrap(); builder = store.create_child_layer(child1_name).await.unwrap(); let child2_name = builder.name(); - builder.remove_string_triple(StringTriple::new_value("horse", "says", "woof")); - builder.remove_string_triple(StringTriple::new_value("sheep", "says", "baa")); + builder.remove_value_triple(ValueTriple::new_string_value("horse", "says", "woof")); + builder.remove_value_triple(ValueTriple::new_string_value("sheep", "says", "baa")); - builder.add_string_triple(StringTriple::new_value("horse", "says", "quack")); - builder.add_string_triple(StringTriple::new_value("rabbit", "says", "sniff")); + builder.add_value_triple(ValueTriple::new_string_value("horse", "says", "quack")); + builder.add_value_triple(ValueTriple::new_string_value("rabbit", "says", "sniff")); builder.commit_boxed().await.unwrap(); builder = store.create_child_layer(child2_name).await.unwrap(); let child3_name = builder.name(); - builder.remove_string_triple(StringTriple::new_node("duck", "hates", "cow")); - builder.remove_string_triple(StringTriple::new_value("horse", "says", "quack")); + builder.remove_value_triple(ValueTriple::new_node("duck", "hates", "cow")); + builder.remove_value_triple(ValueTriple::new_string_value("horse", "says", "quack")); - builder.add_string_triple(StringTriple::new_node("duck", "likes", "cow")); - builder.add_string_triple(StringTriple::new_value("horse", "says", "neigh")); + builder.add_value_triple(ValueTriple::new_node("duck", "likes", "cow")); + builder.add_value_triple(ValueTriple::new_string_value("horse", "says", "neigh")); 
builder.commit_boxed().await.unwrap(); ( @@ -1050,15 +1051,15 @@ mod tests { .collect(); let expected_additions = vec![ - StringTriple::new_node("duck", "likes", "cow"), - StringTriple::new_value("horse", "says", "neigh"), - StringTriple::new_value("rabbit", "says", "sniff"), + ValueTriple::new_node("duck", "likes", "cow"), + ValueTriple::new_string_value("horse", "says", "neigh"), + ValueTriple::new_string_value("rabbit", "says", "sniff"), ]; let expected_removals = vec![ - StringTriple::new_node("duck", "hates", "cow"), - StringTriple::new_value("sheep", "says", "baa"), - StringTriple::new_value("horse", "says", "woof"), + ValueTriple::new_node("duck", "hates", "cow"), + ValueTriple::new_string_value("sheep", "says", "baa"), + ValueTriple::new_string_value("horse", "says", "woof"), ]; assert_eq!(expected_additions, additions); diff --git a/src/layer/layer.rs b/src/layer/layer.rs index 27680d31..fa6fe21b 100644 --- a/src/layer/layer.rs +++ b/src/layer/layer.rs @@ -2,6 +2,8 @@ use std::collections::HashMap; use std::hash::Hash; +use crate::structure::{TdbDataType, TypedDictEntry}; + /// A layer containing dictionary entries and triples. /// /// A layer can be queried. To answer queries, layers will check their @@ -26,7 +28,7 @@ pub trait Layer: Send + Sync { /// The numerical id of a node object, or None if the node object cannot be found. fn object_node_id(&self, object: &str) -> Option; /// The numerical id of a value object, or None if the value object cannot be found. - fn object_value_id(&self, object: &str) -> Option; + fn object_value_id(&self, object: &TypedDictEntry) -> Option; /// The subject corresponding to a numerical id, or None if it cannot be found. fn id_subject(&self, id: u64) -> Option; @@ -44,7 +46,7 @@ pub trait Layer: Send + Sync { } /// The object value corresponding to a numerical id, or None if it cannot be found. Panics if the object is actually a node. - fn id_object_value(&self, id: u64) -> Option { + fn id_object_value(&self, id: u64) -> Option { self.id_object(id).map(|o| { o.value() .expect("Expected ObjectType to be value but got a node") @@ -78,8 +80,8 @@ pub trait Layer: Send + Sync { } /// Returns true if the given triple exists, and false otherwise. - fn string_triple_exists(&self, triple: &StringTriple) -> bool { - self.string_triple_to_id(triple) + fn value_triple_exists(&self, triple: &ValueTriple) -> bool { + self.value_triple_to_id(triple) .map(|t| self.id_triple_exists(t)) .unwrap_or(false) } @@ -91,8 +93,8 @@ pub trait Layer: Send + Sync { fn triples_sp(&self, subject: u64, predicate: u64) -> Box + Send>; - /// Convert a `StringTriple` to an `IdTriple`, returning None if any of the strings in the triple could not be resolved. - fn string_triple_to_id(&self, triple: &StringTriple) -> Option { + /// Convert a `ValueTriple` to an `IdTriple`, returning None if any of the strings in the triple could not be resolved. + fn value_triple_to_id(&self, triple: &ValueTriple) -> Option { self.subject_id(&triple.subject).and_then(|subject| { self.predicate_id(&triple.predicate).and_then(|predicate| { match &triple.object { @@ -113,7 +115,7 @@ pub trait Layer: Send + Sync { fn triples_o(&self, object: u64) -> Box + Send>; /// Convert all known strings in the given string triple to ids. 
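Both directions of value lookup on the Layer trait now go through typed dictionary entries. A sketch of the round trip, assuming (as the imports in this file and the test code below suggest) that make_entry comes from the TdbDataType trait; layer is any Layer implementation obtained from a store:

    use crate::structure::{TdbDataType, TypedDictEntry};

    // Forward: build the typed entry first, then resolve it to an id.
    let entry = String::make_entry(&"moo");
    let id = layer.object_value_id(&entry);

    // Backward: an id now resolves to a TypedDictEntry, not a String.
    let back: Option<TypedDictEntry> = id.and_then(|id| layer.id_object_value(id));

    // Existence checks use the renamed helper together with the string
    // convenience constructor.
    assert!(layer.value_triple_exists(&ValueTriple::new_string_value("cow", "says", "moo")));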
- fn string_triple_to_partially_resolved(&self, triple: StringTriple) -> PartiallyResolvedTriple { + fn value_triple_to_partially_resolved(&self, triple: ValueTriple) -> PartiallyResolvedTriple { PartiallyResolvedTriple { subject: self .subject_id(&triple.subject) @@ -137,10 +139,10 @@ pub trait Layer: Send + Sync { } /// Convert an id triple to the corresponding string version, returning None if any of those ids could not be converted. - fn id_triple_to_string(&self, triple: &IdTriple) -> Option { + fn id_triple_to_string(&self, triple: &IdTriple) -> Option { self.id_subject(triple.subject).and_then(|subject| { self.id_predicate(triple.predicate).and_then(|predicate| { - self.id_object(triple.object).map(|object| StringTriple { + self.id_object(triple.object).map(|object| ValueTriple { subject, predicate, object, @@ -199,18 +201,18 @@ impl IdTriple { /// A triple stored as strings. #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub struct StringTriple { +pub struct ValueTriple { pub subject: String, pub predicate: String, pub object: ObjectType, } -impl StringTriple { +impl ValueTriple { /// Construct a triple with a node object. /// /// Nodes may appear in both the subject and object position. - pub fn new_node(subject: &str, predicate: &str, object: &str) -> StringTriple { - StringTriple { + pub fn new_node(subject: &str, predicate: &str, object: &str) -> ValueTriple { + ValueTriple { subject: subject.to_owned(), predicate: predicate.to_owned(), object: ObjectType::Node(object.to_owned()), @@ -220,11 +222,22 @@ impl StringTriple { /// Construct a triple with a value object. /// /// Values may only appear in the object position. - pub fn new_value(subject: &str, predicate: &str, object: &str) -> StringTriple { - StringTriple { + pub fn new_value(subject: &str, predicate: &str, object: TypedDictEntry) -> ValueTriple { + ValueTriple { + subject: subject.to_owned(), + predicate: predicate.to_owned(), + object: ObjectType::Value(object), + } + } + + /// Construct a triple with a string value object. + /// + /// Values may only appear in the object position. 
+ pub fn new_string_value(subject: &str, predicate: &str, object: &str) -> ValueTriple { + ValueTriple { subject: subject.to_owned(), predicate: predicate.to_owned(), - object: ObjectType::Value(object.to_owned()), + object: ObjectType::Value(String::make_entry(&object)), } } @@ -293,7 +306,7 @@ impl PartiallyResolvedTriple { &self, node_map: &HashMap, predicate_map: &HashMap, - value_map: &HashMap, + value_map: &HashMap, ) -> Option { let subject = match self.subject.as_ref() { PossiblyResolved::Unresolved(s) => *node_map.get(s)?, @@ -356,7 +369,7 @@ impl PartiallyResolvedTriple { #[derive(Debug, Clone, PartialOrd, PartialEq, Eq, Ord, Hash)] pub enum ObjectType { Node(String), - Value(String), + Value(TypedDictEntry), } impl ObjectType { @@ -374,17 +387,17 @@ impl ObjectType { } } - pub fn value(self) -> Option { + pub fn value(self) -> Option { match self { ObjectType::Node(_) => None, - ObjectType::Value(s) => Some(s), + ObjectType::Value(v) => Some(v), } } - pub fn value_ref(&self) -> Option<&str> { + pub fn value_ref(&self) -> Option<&TypedDictEntry> { match self { ObjectType::Node(_) => None, - ObjectType::Value(s) => Some(s), + ObjectType::Value(v) => Some(v), } } } @@ -405,8 +418,8 @@ mod tests { let files = base_layer_files(); let mut builder = SimpleLayerBuilder::new([1, 2, 3, 4, 5], files.clone()); - builder.add_string_triple(StringTriple::new_value("cow", "says", "moo")); - builder.add_string_triple(StringTriple::new_value("cow", "says", "sniff")); + builder.add_value_triple(ValueTriple::new_string_value("cow", "says", "moo")); + builder.add_value_triple(ValueTriple::new_string_value("cow", "says", "sniff")); builder.commit().await.unwrap(); @@ -420,7 +433,7 @@ mod tests { let files = child_layer_files(); let mut builder = SimpleLayerBuilder::from_parent([5, 4, 3, 2, 1], base.clone(), files.clone()); - builder.remove_string_triple(StringTriple::new_value("cow", "says", "moo")); + builder.remove_value_triple(ValueTriple::new_string_value("cow", "says", "moo")); builder.commit().await.unwrap(); let child: Arc = Arc::new( @@ -452,7 +465,7 @@ mod tests { .collect(); assert_eq!( - vec![StringTriple::new_value("cow", "says", "sniff")], + vec![ValueTriple::new_string_value("cow", "says", "sniff")], triples ); } @@ -462,7 +475,7 @@ mod tests { let files = base_layer_files(); let mut builder = SimpleLayerBuilder::new([1, 2, 3, 4, 5], files.clone()); - builder.add_string_triple(StringTriple::new_value("cow", "says", "moo")); + builder.add_value_triple(ValueTriple::new_string_value("cow", "says", "moo")); builder.commit().await.unwrap(); @@ -476,7 +489,7 @@ mod tests { let files = child_layer_files(); let mut builder = SimpleLayerBuilder::from_parent([5, 4, 3, 2, 1], base.clone(), files.clone()); - builder.remove_string_triple(StringTriple::new_value("cow", "says", "moo")); + builder.remove_value_triple(ValueTriple::new_string_value("cow", "says", "moo")); builder.commit().await.unwrap(); let child: Arc = Arc::new( @@ -489,7 +502,7 @@ mod tests { let files = child_layer_files(); let mut builder = SimpleLayerBuilder::from_parent([5, 4, 3, 2, 2], child.clone(), files.clone()); - builder.add_string_triple(StringTriple::new_value("cow", "says", "moo")); + builder.add_value_triple(ValueTriple::new_string_value("cow", "says", "moo")); builder.commit().await.unwrap(); let child: Arc = Arc::new( @@ -504,7 +517,10 @@ mod tests { .map(|t| child.id_triple_to_string(&t).unwrap()) .collect(); - assert_eq!(vec![StringTriple::new_value("cow", "says", "moo")], triples); + assert_eq!( + 
vec![ValueTriple::new_string_value("cow", "says", "moo")], + triples + ); } #[tokio::test] @@ -512,8 +528,8 @@ mod tests { let files = base_layer_files(); let mut builder = SimpleLayerBuilder::new([1, 2, 3, 4, 5], files.clone()); - builder.add_string_triple(StringTriple::new_value("duck", "says", "quack")); - builder.add_string_triple(StringTriple::new_value("duck", "says", "neigh")); + builder.add_value_triple(ValueTriple::new_string_value("duck", "says", "quack")); + builder.add_value_triple(ValueTriple::new_string_value("duck", "says", "neigh")); builder.commit().await.unwrap(); @@ -527,7 +543,7 @@ mod tests { let files = child_layer_files(); let mut builder = SimpleLayerBuilder::from_parent([5, 4, 3, 2, 1], base.clone(), files.clone()); - builder.remove_string_triple(StringTriple::new_value("duck", "says", "neigh")); + builder.remove_value_triple(ValueTriple::new_string_value("duck", "says", "neigh")); builder.commit().await.unwrap(); let child: Arc = Arc::new( @@ -540,7 +556,7 @@ mod tests { let files = child_layer_files(); let mut builder = SimpleLayerBuilder::from_parent([5, 4, 3, 2, 2], child.clone(), files.clone()); - builder.add_string_triple(StringTriple::new_value("cow", "says", "moo")); + builder.add_value_triple(ValueTriple::new_string_value("cow", "says", "moo")); builder.commit().await.unwrap(); let child: Arc = Arc::new( @@ -566,7 +582,13 @@ mod tests { .unwrap(); let triple_2 = child.id_triple_to_string(&id_triple_2).unwrap(); - assert_eq!(StringTriple::new_value("cow", "says", "moo"), triple_1); - assert_eq!(StringTriple::new_value("duck", "says", "quack"), triple_2); + assert_eq!( + ValueTriple::new_string_value("cow", "says", "moo"), + triple_1 + ); + assert_eq!( + ValueTriple::new_string_value("duck", "says", "quack"), + triple_2 + ); } } diff --git a/src/layer/simple_builder.rs b/src/layer/simple_builder.rs index 7f47c03d..90c4a23e 100644 --- a/src/layer/simple_builder.rs +++ b/src/layer/simple_builder.rs @@ -12,6 +12,7 @@ use super::internal::*; use super::layer::*; use crate::storage::*; +use crate::structure::TypedDictEntry; use std::collections::{HashMap, HashSet}; use std::io; use std::pin::Pin; @@ -31,11 +32,11 @@ pub trait LayerBuilder: Send + Sync { /// Return the parent if it exists fn parent(&self) -> Option>; /// Add a string triple - fn add_string_triple(&mut self, triple: StringTriple); + fn add_value_triple(&mut self, triple: ValueTriple); /// Add an id triple fn add_id_triple(&mut self, triple: IdTriple); /// Remove a string triple - fn remove_string_triple(&mut self, triple: StringTriple); + fn remove_value_triple(&mut self, triple: ValueTriple); /// Remove an id triple fn remove_id_triple(&mut self, triple: IdTriple); /// Commit the layer to storage @@ -53,9 +54,9 @@ pub struct SimpleLayerBuilder { name: [u32; 5], parent: Option>, files: LayerFiles, - additions: Vec, + additions: Vec, id_additions: Vec, - removals: Vec, + removals: Vec, id_removals: Vec, } @@ -96,7 +97,7 @@ impl LayerBuilder for SimpleLayerBuil self.parent.clone() } - fn add_string_triple(&mut self, triple: StringTriple) { + fn add_value_triple(&mut self, triple: ValueTriple) { self.additions.push(triple); } @@ -104,7 +105,7 @@ impl LayerBuilder for SimpleLayerBuil self.id_additions.push(triple); } - fn remove_string_triple(&mut self, triple: StringTriple) { + fn remove_value_triple(&mut self, triple: ValueTriple) { self.removals.push(triple); } @@ -132,7 +133,7 @@ impl LayerBuilder for SimpleLayerBuil .collect(), Some(parent) => additions .into_par_iter() - .map(move |triple| 
parent.string_triple_to_partially_resolved(triple)) + .map(move |triple| parent.value_triple_to_partially_resolved(triple)) .collect(), }; @@ -150,7 +151,7 @@ impl LayerBuilder for SimpleLayerBuil .collect(), Some(parent) => removals .into_par_iter() - .map(move |triple| parent.string_triple_to_partially_resolved(triple)) + .map(move |triple| parent.value_triple_to_partially_resolved(triple)) .collect(), }; @@ -318,7 +319,7 @@ fn zero_equivalents( fn collect_unresolved_strings( triples: &[PartiallyResolvedTriple], -) -> (Vec, Vec, Vec) { +) -> (Vec, Vec, Vec) { let (unresolved_nodes, (unresolved_predicates, unresolved_values)) = rayon::join( || { let unresolved_nodes_set: HashSet<_> = triples @@ -394,6 +395,7 @@ mod tests { use super::*; use crate::layer::internal::InternalLayer; use crate::storage::memory::*; + use crate::structure::TdbDataType; fn new_base_files() -> BaseLayerFiles { // TODO inline @@ -410,9 +412,9 @@ mod tests { let files = new_base_files(); let mut builder = SimpleLayerBuilder::new(name, files.clone()); - builder.add_string_triple(StringTriple::new_value("cow", "says", "moo")); - builder.add_string_triple(StringTriple::new_value("pig", "says", "oink")); - builder.add_string_triple(StringTriple::new_value("duck", "says", "quack")); + builder.add_value_triple(ValueTriple::new_string_value("cow", "says", "moo")); + builder.add_value_triple(ValueTriple::new_string_value("pig", "says", "oink")); + builder.add_value_triple(ValueTriple::new_string_value("duck", "says", "quack")); builder.commit().await.unwrap(); @@ -424,9 +426,9 @@ mod tests { async fn simple_base_layer_construction() { let layer = example_base_layer().await; - assert!(layer.string_triple_exists(&StringTriple::new_value("cow", "says", "moo"))); - assert!(layer.string_triple_exists(&StringTriple::new_value("pig", "says", "oink"))); - assert!(layer.string_triple_exists(&StringTriple::new_value("duck", "says", "quack"))); + assert!(layer.value_triple_exists(&ValueTriple::new_string_value("cow", "says", "moo"))); + assert!(layer.value_triple_exists(&ValueTriple::new_string_value("pig", "says", "oink"))); + assert!(layer.value_triple_exists(&ValueTriple::new_string_value("duck", "says", "quack"))); } #[tokio::test] @@ -436,9 +438,9 @@ mod tests { let name = [0, 0, 0, 0, 0]; let mut builder = SimpleLayerBuilder::from_parent(name, base_layer.clone(), files.clone()); - builder.add_string_triple(StringTriple::new_value("horse", "says", "neigh")); - builder.add_string_triple(StringTriple::new_node("horse", "likes", "cow")); - builder.remove_string_triple(StringTriple::new_value("duck", "says", "quack")); + builder.add_value_triple(ValueTriple::new_string_value("horse", "says", "neigh")); + builder.add_value_triple(ValueTriple::new_node("horse", "likes", "cow")); + builder.remove_value_triple(ValueTriple::new_string_value("duck", "says", "quack")); let child_layer = Arc::new( async { @@ -450,15 +452,17 @@ mod tests { .unwrap(), ); + assert!(child_layer + .value_triple_exists(&ValueTriple::new_string_value("horse", "says", "neigh"))); + assert!(child_layer.value_triple_exists(&ValueTriple::new_node("horse", "likes", "cow"))); assert!( - child_layer.string_triple_exists(&StringTriple::new_value("horse", "says", "neigh")) + child_layer.value_triple_exists(&ValueTriple::new_string_value("cow", "says", "moo")) ); - assert!(child_layer.string_triple_exists(&StringTriple::new_node("horse", "likes", "cow"))); - assert!(child_layer.string_triple_exists(&StringTriple::new_value("cow", "says", "moo"))); - 
assert!(child_layer.string_triple_exists(&StringTriple::new_value("pig", "says", "oink"))); assert!( - !child_layer.string_triple_exists(&StringTriple::new_value("duck", "says", "quack")) + child_layer.value_triple_exists(&ValueTriple::new_string_value("pig", "says", "oink")) ); + assert!(!child_layer + .value_triple_exists(&ValueTriple::new_string_value("duck", "says", "quack"))); } #[tokio::test] @@ -469,9 +473,9 @@ mod tests { let mut builder = SimpleLayerBuilder::from_parent(name2, base_layer.clone(), files2.clone()); - builder.add_string_triple(StringTriple::new_value("horse", "says", "neigh")); - builder.add_string_triple(StringTriple::new_node("horse", "likes", "cow")); - builder.remove_string_triple(StringTriple::new_value("duck", "says", "quack")); + builder.add_value_triple(ValueTriple::new_string_value("horse", "says", "neigh")); + builder.add_value_triple(ValueTriple::new_node("horse", "likes", "cow")); + builder.remove_value_triple(ValueTriple::new_string_value("duck", "says", "quack")); builder.commit().await.unwrap(); let layer2: Arc = Arc::new( @@ -484,9 +488,9 @@ mod tests { let name3 = [0, 0, 0, 0, 1]; let files3 = new_child_files(); builder = SimpleLayerBuilder::from_parent(name3, layer2.clone(), files3.clone()); - builder.remove_string_triple(StringTriple::new_node("horse", "likes", "cow")); - builder.add_string_triple(StringTriple::new_node("horse", "likes", "pig")); - builder.add_string_triple(StringTriple::new_value("duck", "says", "quack")); + builder.remove_value_triple(ValueTriple::new_node("horse", "likes", "cow")); + builder.add_value_triple(ValueTriple::new_node("horse", "likes", "pig")); + builder.add_value_triple(ValueTriple::new_string_value("duck", "says", "quack")); builder.commit().await.unwrap(); let layer3: Arc = Arc::new( @@ -499,8 +503,8 @@ mod tests { let name4 = [0, 0, 0, 0, 1]; let files4 = new_child_files(); builder = SimpleLayerBuilder::from_parent(name4, layer3.clone(), files4.clone()); - builder.remove_string_triple(StringTriple::new_value("pig", "says", "oink")); - builder.add_string_triple(StringTriple::new_node("cow", "likes", "horse")); + builder.remove_value_triple(ValueTriple::new_string_value("pig", "says", "oink")); + builder.add_value_triple(ValueTriple::new_node("cow", "likes", "horse")); builder.commit().await.unwrap(); let layer4: Arc = Arc::new( ChildLayer::load_from_files(name4, layer3, &files4) @@ -509,14 +513,16 @@ mod tests { .into(), ); - assert!(layer4.string_triple_exists(&StringTriple::new_value("cow", "says", "moo"))); - assert!(layer4.string_triple_exists(&StringTriple::new_value("duck", "says", "quack"))); - assert!(layer4.string_triple_exists(&StringTriple::new_value("horse", "says", "neigh"))); - assert!(layer4.string_triple_exists(&StringTriple::new_node("horse", "likes", "pig"))); - assert!(layer4.string_triple_exists(&StringTriple::new_node("cow", "likes", "horse"))); + assert!(layer4.value_triple_exists(&ValueTriple::new_string_value("cow", "says", "moo"))); + assert!(layer4.value_triple_exists(&ValueTriple::new_string_value("duck", "says", "quack"))); + assert!( + layer4.value_triple_exists(&ValueTriple::new_string_value("horse", "says", "neigh")) + ); + assert!(layer4.value_triple_exists(&ValueTriple::new_node("horse", "likes", "pig"))); + assert!(layer4.value_triple_exists(&ValueTriple::new_node("cow", "likes", "horse"))); - assert!(!layer4.string_triple_exists(&StringTriple::new_value("pig", "says", "oink"))); - assert!(!layer4.string_triple_exists(&StringTriple::new_node("horse", "likes", "cow"))); + 
assert!(!layer4.value_triple_exists(&ValueTriple::new_string_value("pig", "says", "oink"))); + assert!(!layer4.value_triple_exists(&ValueTriple::new_node("horse", "likes", "cow"))); } #[tokio::test] @@ -525,8 +531,8 @@ mod tests { let name = [0, 0, 0, 0, 0]; let mut builder = SimpleLayerBuilder::new(name, files.clone()); - builder.remove_string_triple(StringTriple::new_value("crow", "says", "caw")); - builder.add_string_triple(StringTriple::new_value("crow", "says", "caw")); + builder.remove_value_triple(ValueTriple::new_string_value("crow", "says", "caw")); + builder.add_value_triple(ValueTriple::new_string_value("crow", "says", "caw")); builder.commit().await.unwrap(); let base_layer: Arc = Arc::new( @@ -536,7 +542,9 @@ mod tests { .into(), ); - assert!(!base_layer.string_triple_exists(&StringTriple::new_value("crow", "says", "caw"))); + assert!( + !base_layer.value_triple_exists(&ValueTriple::new_string_value("crow", "says", "caw")) + ); } #[tokio::test] @@ -545,8 +553,8 @@ mod tests { let name = [0, 0, 0, 0, 0]; let mut builder = SimpleLayerBuilder::new(name, files.clone()); - builder.add_string_triple(StringTriple::new_value("crow", "says", "caw")); - builder.remove_string_triple(StringTriple::new_value("crow", "says", "caw")); + builder.add_value_triple(ValueTriple::new_string_value("crow", "says", "caw")); + builder.remove_value_triple(ValueTriple::new_string_value("crow", "says", "caw")); builder.commit().await.unwrap(); let base_layer: Arc = Arc::new( @@ -556,7 +564,9 @@ mod tests { .into(), ); - assert!(!base_layer.string_triple_exists(&StringTriple::new_value("crow", "says", "caw"))); + assert!( + !base_layer.value_triple_exists(&ValueTriple::new_string_value("crow", "says", "caw")) + ); } #[tokio::test] @@ -566,8 +576,8 @@ mod tests { let name = [0, 0, 0, 0, 0]; let mut builder = SimpleLayerBuilder::from_parent(name, base_layer.clone(), files.clone()); - builder.remove_string_triple(StringTriple::new_value("cow", "says", "moo")); - builder.add_string_triple(StringTriple::new_value("cow", "says", "moo")); + builder.remove_value_triple(ValueTriple::new_string_value("cow", "says", "moo")); + builder.add_value_triple(ValueTriple::new_string_value("cow", "says", "moo")); builder.commit().await.unwrap(); let child_layer: Arc = Arc::new( @@ -577,7 +587,9 @@ mod tests { .into(), ); - assert!(child_layer.string_triple_exists(&StringTriple::new_value("cow", "says", "moo"))); + assert!( + child_layer.value_triple_exists(&ValueTriple::new_string_value("cow", "says", "moo")) + ); } #[tokio::test] @@ -587,8 +599,8 @@ mod tests { let name = [0, 0, 0, 0, 0]; let mut builder = SimpleLayerBuilder::from_parent(name, base_layer.clone(), files.clone()); - builder.add_string_triple(StringTriple::new_value("cow", "says", "moo")); - builder.remove_string_triple(StringTriple::new_value("cow", "says", "moo")); + builder.add_value_triple(ValueTriple::new_string_value("cow", "says", "moo")); + builder.remove_value_triple(ValueTriple::new_string_value("cow", "says", "moo")); builder.commit().await.unwrap(); let child_layer: Arc = Arc::new( @@ -598,7 +610,9 @@ mod tests { .into(), ); - assert!(child_layer.string_triple_exists(&StringTriple::new_value("cow", "says", "moo"))); + assert!( + child_layer.value_triple_exists(&ValueTriple::new_string_value("cow", "says", "moo")) + ); } #[tokio::test] @@ -608,8 +622,8 @@ mod tests { let name = [0, 0, 0, 0, 0]; let mut builder = SimpleLayerBuilder::from_parent(name, base_layer.clone(), files.clone()); - builder.remove_string_triple(StringTriple::new_value("crow", 
"says", "caw")); - builder.add_string_triple(StringTriple::new_value("crow", "says", "caw")); + builder.remove_value_triple(ValueTriple::new_string_value("crow", "says", "caw")); + builder.add_value_triple(ValueTriple::new_string_value("crow", "says", "caw")); builder.commit().await.unwrap(); let child_layer: Arc = Arc::new( @@ -619,7 +633,9 @@ mod tests { .into(), ); - assert!(!child_layer.string_triple_exists(&StringTriple::new_value("crow", "says", "caw"))); + assert!( + !child_layer.value_triple_exists(&ValueTriple::new_string_value("crow", "says", "caw")) + ); } #[tokio::test] @@ -629,8 +645,8 @@ mod tests { let name = [0, 0, 0, 0, 0]; let mut builder = SimpleLayerBuilder::from_parent(name, base_layer.clone(), files.clone()); - builder.add_string_triple(StringTriple::new_value("crow", "says", "caw")); - builder.remove_string_triple(StringTriple::new_value("crow", "says", "caw")); + builder.add_value_triple(ValueTriple::new_string_value("crow", "says", "caw")); + builder.remove_value_triple(ValueTriple::new_string_value("crow", "says", "caw")); builder.commit().await.unwrap(); let child_layer: Arc = Arc::new( @@ -640,7 +656,9 @@ mod tests { .into(), ); - assert!(!child_layer.string_triple_exists(&StringTriple::new_value("crow", "says", "caw"))); + assert!( + !child_layer.value_triple_exists(&ValueTriple::new_string_value("crow", "says", "caw")) + ); } #[tokio::test] @@ -650,10 +668,12 @@ mod tests { let name = [0, 0, 0, 0, 0]; let node_id = base_layer.subject_id("cow").unwrap(); let predicate_id = base_layer.predicate_id("says").unwrap(); - let value_id = base_layer.object_value_id("moo").unwrap(); + let value_id = base_layer + .object_value_id(&String::make_entry(&"moo")) + .unwrap(); let mut builder = SimpleLayerBuilder::from_parent(name, base_layer.clone(), files.clone()); - builder.remove_string_triple(StringTriple::new_value("cow", "says", "moo")); + builder.remove_value_triple(ValueTriple::new_string_value("cow", "says", "moo")); builder.add_id_triple(IdTriple::new(node_id, predicate_id, value_id)); builder.commit().await.unwrap(); @@ -664,7 +684,9 @@ mod tests { .into(), ); - assert!(child_layer.string_triple_exists(&StringTriple::new_value("cow", "says", "moo"))); + assert!( + child_layer.value_triple_exists(&ValueTriple::new_string_value("cow", "says", "moo")) + ); } #[tokio::test] @@ -674,11 +696,13 @@ mod tests { let name = [0, 0, 0, 0, 0]; let node_id = base_layer.subject_id("cow").unwrap(); let predicate_id = base_layer.predicate_id("says").unwrap(); - let value_id = base_layer.object_value_id("moo").unwrap(); + let value_id = base_layer + .object_value_id(&String::make_entry(&"moo")) + .unwrap(); let mut builder = SimpleLayerBuilder::from_parent(name, base_layer.clone(), files.clone()); builder.remove_id_triple(IdTriple::new(node_id, predicate_id, value_id)); - builder.add_string_triple(StringTriple::new_value("cow", "says", "moo")); + builder.add_value_triple(ValueTriple::new_string_value("cow", "says", "moo")); builder.commit().await.unwrap(); let child_layer: Arc = Arc::new( @@ -688,7 +712,9 @@ mod tests { .into(), ); - assert!(child_layer.string_triple_exists(&StringTriple::new_value("cow", "says", "moo"))); + assert!( + child_layer.value_triple_exists(&ValueTriple::new_string_value("cow", "says", "moo")) + ); } #[tokio::test] @@ -698,10 +724,12 @@ mod tests { let name = [0, 0, 0, 0, 0]; let node_id = base_layer.subject_id("cow").unwrap(); let predicate_id = base_layer.predicate_id("says").unwrap(); - let value_id = base_layer.object_value_id("moo").unwrap(); + let 
value_id = base_layer + .object_value_id(&String::make_entry(&"moo")) + .unwrap(); let mut builder = SimpleLayerBuilder::from_parent(name, base_layer.clone(), files.clone()); - builder.add_string_triple(StringTriple::new_value("cow", "says", "moo")); + builder.add_value_triple(ValueTriple::new_string_value("cow", "says", "moo")); builder.remove_id_triple(IdTriple::new(node_id, predicate_id, value_id)); builder.commit().await.unwrap(); @@ -712,7 +740,9 @@ mod tests { .into(), ); - assert!(child_layer.string_triple_exists(&StringTriple::new_value("cow", "says", "moo"))); + assert!( + child_layer.value_triple_exists(&ValueTriple::new_string_value("cow", "says", "moo")) + ); } #[tokio::test] @@ -722,11 +752,13 @@ mod tests { let name = [0, 0, 0, 0, 0]; let node_id = base_layer.subject_id("cow").unwrap(); let predicate_id = base_layer.predicate_id("says").unwrap(); - let value_id = base_layer.object_value_id("moo").unwrap(); + let value_id = base_layer + .object_value_id(&String::make_entry(&"moo")) + .unwrap(); let mut builder = SimpleLayerBuilder::from_parent(name, base_layer.clone(), files.clone()); builder.add_id_triple(IdTriple::new(node_id, predicate_id, value_id)); - builder.remove_string_triple(StringTriple::new_value("cow", "says", "moo")); + builder.remove_value_triple(ValueTriple::new_string_value("cow", "says", "moo")); builder.commit().await.unwrap(); let child_layer: Arc = Arc::new( @@ -736,6 +768,8 @@ mod tests { .into(), ); - assert!(child_layer.string_triple_exists(&StringTriple::new_value("cow", "says", "moo"))); + assert!( + child_layer.value_triple_exists(&ValueTriple::new_string_value("cow", "says", "moo")) + ); } } diff --git a/src/lib.rs b/src/lib.rs index ec2f0544..3e434f3c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -37,6 +37,6 @@ pub mod storage; pub mod store; pub mod structure; -pub use layer::{IdTriple, Layer, ObjectType, StringTriple}; +pub use layer::{IdTriple, Layer, ObjectType, ValueTriple}; pub use store::sync::{open_sync_directory_store, open_sync_memory_store}; pub use store::{open_directory_store, open_memory_store}; diff --git a/src/storage/cache.rs b/src/storage/cache.rs index abb5bca0..0a1762f1 100644 --- a/src/storage/cache.rs +++ b/src/storage/cache.rs @@ -557,17 +557,17 @@ pub mod tests { let mut builder = store.create_base_layer().await.unwrap(); let base_name = builder.name(); - builder.add_string_triple(StringTriple::new_value("cow", "says", "moo")); - builder.add_string_triple(StringTriple::new_value("pig", "says", "oink")); - builder.add_string_triple(StringTriple::new_value("duck", "says", "quack")); + builder.add_value_triple(ValueTriple::new_string_value("cow", "says", "moo")); + builder.add_value_triple(ValueTriple::new_string_value("pig", "says", "oink")); + builder.add_value_triple(ValueTriple::new_string_value("duck", "says", "quack")); builder.commit_boxed().await.unwrap(); builder = store.create_child_layer(base_name).await.unwrap(); let child_name = builder.name(); - builder.remove_string_triple(StringTriple::new_value("duck", "says", "quack")); - builder.add_string_triple(StringTriple::new_node("cow", "likes", "pig")); + builder.remove_value_triple(ValueTriple::new_string_value("duck", "says", "quack")); + builder.add_value_triple(ValueTriple::new_node("cow", "likes", "pig")); builder.commit_boxed().await.unwrap(); @@ -591,17 +591,17 @@ pub mod tests { let mut builder = store.create_base_layer().await.unwrap(); let base_name = builder.name(); - builder.add_string_triple(StringTriple::new_value("cow", "says", "moo")); - 
builder.add_string_triple(StringTriple::new_value("pig", "says", "oink")); - builder.add_string_triple(StringTriple::new_value("duck", "says", "quack")); + builder.add_value_triple(ValueTriple::new_string_value("cow", "says", "moo")); + builder.add_value_triple(ValueTriple::new_string_value("pig", "says", "oink")); + builder.add_value_triple(ValueTriple::new_string_value("duck", "says", "quack")); builder.commit_boxed().await.unwrap(); builder = store.create_child_layer(base_name).await.unwrap(); let child_name = builder.name(); - builder.remove_string_triple(StringTriple::new_value("duck", "says", "quack")); - builder.add_string_triple(StringTriple::new_node("cow", "likes", "pig")); + builder.remove_value_triple(ValueTriple::new_string_value("duck", "says", "quack")); + builder.add_value_triple(ValueTriple::new_node("cow", "likes", "pig")); builder.commit_boxed().await.unwrap(); @@ -621,9 +621,9 @@ pub mod tests { let mut builder = store.create_base_layer().await.unwrap(); let base_name = builder.name(); - builder.add_string_triple(StringTriple::new_value("cow", "says", "moo")); - builder.add_string_triple(StringTriple::new_value("pig", "says", "oink")); - builder.add_string_triple(StringTriple::new_value("duck", "says", "quack")); + builder.add_value_triple(ValueTriple::new_string_value("cow", "says", "moo")); + builder.add_value_triple(ValueTriple::new_string_value("pig", "says", "oink")); + builder.add_value_triple(ValueTriple::new_string_value("duck", "says", "quack")); builder.commit_boxed().await.unwrap(); diff --git a/src/storage/delta.rs b/src/storage/delta.rs index 16fe6353..fb291899 100644 --- a/src/storage/delta.rs +++ b/src/storage/delta.rs @@ -445,30 +445,30 @@ mod tests { ) -> io::Result<(Arc, Arc, Arc)> { let mut builder = store.create_base_layer().await?; let base_name = builder.name(); - builder.add_string_triple(StringTriple::new_value("cow", "says", "moo")); - builder.add_string_triple(StringTriple::new_value("duck", "says", "quack")); - builder.add_string_triple(StringTriple::new_node("cow", "likes", "duck")); - builder.add_string_triple(StringTriple::new_node("duck", "hates", "cow")); + builder.add_value_triple(ValueTriple::new_string_value("cow", "says", "moo")); + builder.add_value_triple(ValueTriple::new_string_value("duck", "says", "quack")); + builder.add_value_triple(ValueTriple::new_node("cow", "likes", "duck")); + builder.add_value_triple(ValueTriple::new_node("duck", "hates", "cow")); builder.commit_boxed().await?; let base_layer = store.get_layer(base_name).await?.unwrap(); builder = store.create_child_layer(base_name).await?; let child1_name = builder.name(); - builder.remove_string_triple(StringTriple::new_node("duck", "hates", "cow")); - builder.add_string_triple(StringTriple::new_node("duck", "likes", "cow")); - builder.add_string_triple(StringTriple::new_value("horse", "says", "neigh")); - builder.add_string_triple(StringTriple::new_node("pig", "likes", "pig")); + builder.remove_value_triple(ValueTriple::new_node("duck", "hates", "cow")); + builder.add_value_triple(ValueTriple::new_node("duck", "likes", "cow")); + builder.add_value_triple(ValueTriple::new_string_value("horse", "says", "neigh")); + builder.add_value_triple(ValueTriple::new_node("pig", "likes", "pig")); builder.commit_boxed().await?; let child1_layer = store.get_layer(child1_name).await?.unwrap(); builder = store.create_child_layer(child1_name).await?; let child2_name = builder.name(); - builder.remove_string_triple(StringTriple::new_node("pig", "likes", "pig")); - 
builder.add_string_triple(StringTriple::new_value("pig", "says", "oink")); - builder.add_string_triple(StringTriple::new_value("sheep", "says", "baah")); - builder.add_string_triple(StringTriple::new_value("pig", "likes", "sheep")); + builder.remove_value_triple(ValueTriple::new_node("pig", "likes", "pig")); + builder.add_value_triple(ValueTriple::new_string_value("pig", "says", "oink")); + builder.add_value_triple(ValueTriple::new_string_value("sheep", "says", "baah")); + builder.add_value_triple(ValueTriple::new_string_value("pig", "likes", "sheep")); builder.commit_boxed().await?; let child2_layer = store.get_layer(child2_name).await?.unwrap(); @@ -505,7 +505,7 @@ mod tests { ); for t in expected { - assert!(delta_layer.string_triple_exists(&t)); + assert!(delta_layer.value_triple_exists(&t)); } } @@ -541,7 +541,7 @@ mod tests { ); for t in expected { - assert!(delta_layer.string_triple_exists(&t)); + assert!(delta_layer.value_triple_exists(&t)); } let change_expected: Vec<_> = @@ -604,50 +604,50 @@ mod tests { ); for t in expected { - assert!(delta_layer2.string_triple_exists(&t)); + assert!(delta_layer2.value_triple_exists(&t)); } } async fn create_layer_stack(store: &S) -> Vec<[u32; 5]> { let mut builder = store.create_base_layer().await.unwrap(); let base_name = builder.name(); - builder.add_string_triple(StringTriple::new_value("a", "a", "a")); - builder.add_string_triple(StringTriple::new_value("a", "b", "c")); + builder.add_value_triple(ValueTriple::new_string_value("a", "a", "a")); + builder.add_value_triple(ValueTriple::new_string_value("a", "b", "c")); builder.commit_boxed().await.unwrap(); let mut builder = store.create_child_layer(base_name).await.unwrap(); let child1_name = builder.name(); - builder.add_string_triple(StringTriple::new_value("a", "a", "b")); - builder.add_string_triple(StringTriple::new_value("a", "b", "a")); - builder.add_string_triple(StringTriple::new_value("b", "a", "a")); - builder.add_string_triple(StringTriple::new_value("d", "d", "d")); - builder.remove_string_triple(StringTriple::new_value("a", "a", "a")); + builder.add_value_triple(ValueTriple::new_string_value("a", "a", "b")); + builder.add_value_triple(ValueTriple::new_string_value("a", "b", "a")); + builder.add_value_triple(ValueTriple::new_string_value("b", "a", "a")); + builder.add_value_triple(ValueTriple::new_string_value("d", "d", "d")); + builder.remove_value_triple(ValueTriple::new_string_value("a", "a", "a")); builder.commit_boxed().await.unwrap(); let mut builder = store.create_child_layer(child1_name).await.unwrap(); - builder.add_string_triple(StringTriple::new_value("a", "b", "b")); - builder.add_string_triple(StringTriple::new_value("b", "a", "b")); - builder.add_string_triple(StringTriple::new_value("e", "e", "e")); - builder.remove_string_triple(StringTriple::new_value("a", "a", "b")); + builder.add_value_triple(ValueTriple::new_string_value("a", "b", "b")); + builder.add_value_triple(ValueTriple::new_string_value("b", "a", "b")); + builder.add_value_triple(ValueTriple::new_string_value("e", "e", "e")); + builder.remove_value_triple(ValueTriple::new_string_value("a", "a", "b")); let child2_name = builder.name(); builder.commit_boxed().await.unwrap(); let mut builder = store.create_child_layer(child2_name).await.unwrap(); - builder.add_string_triple(StringTriple::new_value("a", "a", "b")); - builder.add_string_triple(StringTriple::new_value("b", "b", "a")); - builder.add_string_triple(StringTriple::new_value("f", "f", "f")); + 
builder.add_value_triple(ValueTriple::new_string_value("a", "a", "b")); + builder.add_value_triple(ValueTriple::new_string_value("b", "b", "a")); + builder.add_value_triple(ValueTriple::new_string_value("f", "f", "f")); let child3_name = builder.name(); builder.commit_boxed().await.unwrap(); let mut builder = store.create_child_layer(child3_name).await.unwrap(); - builder.add_string_triple(StringTriple::new_value("b", "b", "c")); - builder.add_string_triple(StringTriple::new_value("g", "g", "g")); + builder.add_value_triple(ValueTriple::new_string_value("b", "b", "c")); + builder.add_value_triple(ValueTriple::new_string_value("g", "g", "g")); let child4_name = builder.name(); builder.commit_boxed().await.unwrap(); let mut builder = store.create_child_layer(child4_name).await.unwrap(); - builder.add_string_triple(StringTriple::new_value("c", "a", "b")); - builder.add_string_triple(StringTriple::new_value("h", "h", "h")); + builder.add_value_triple(ValueTriple::new_string_value("c", "a", "b")); + builder.add_value_triple(ValueTriple::new_string_value("h", "h", "h")); let child5_name = builder.name(); builder.commit_boxed().await.unwrap(); diff --git a/src/storage/directory.rs b/src/storage/directory.rs index a5f4f27e..2466d691 100644 --- a/src/storage/directory.rs +++ b/src/storage/directory.rs @@ -441,17 +441,17 @@ mod tests { let mut builder = store.create_base_layer().await?; let base_name = builder.name(); - builder.add_string_triple(StringTriple::new_value("cow", "says", "moo")); - builder.add_string_triple(StringTriple::new_value("pig", "says", "oink")); - builder.add_string_triple(StringTriple::new_value("duck", "says", "quack")); + builder.add_value_triple(ValueTriple::new_string_value("cow", "says", "moo")); + builder.add_value_triple(ValueTriple::new_string_value("pig", "says", "oink")); + builder.add_value_triple(ValueTriple::new_string_value("duck", "says", "quack")); builder.commit_boxed().await?; let mut builder = store.create_child_layer(base_name).await?; let child_name = builder.name(); - builder.remove_string_triple(StringTriple::new_value("duck", "says", "quack")); - builder.add_string_triple(StringTriple::new_node("cow", "likes", "pig")); + builder.remove_value_triple(ValueTriple::new_string_value("duck", "says", "quack")); + builder.add_value_triple(ValueTriple::new_node("cow", "likes", "pig")); builder.commit_boxed().await?; @@ -461,10 +461,10 @@ mod tests { .unwrap() .unwrap(); - assert!(layer.string_triple_exists(&StringTriple::new_value("cow", "says", "moo"))); - assert!(layer.string_triple_exists(&StringTriple::new_value("pig", "says", "oink"))); - assert!(layer.string_triple_exists(&StringTriple::new_node("cow", "likes", "pig"))); - assert!(!layer.string_triple_exists(&StringTriple::new_value("duck", "says", "quack"))); + assert!(layer.value_triple_exists(&ValueTriple::new_string_value("cow", "says", "moo"))); + assert!(layer.value_triple_exists(&ValueTriple::new_string_value("pig", "says", "oink"))); + assert!(layer.value_triple_exists(&ValueTriple::new_node("cow", "likes", "pig"))); + assert!(!layer.value_triple_exists(&ValueTriple::new_string_value("duck", "says", "quack"))); } #[tokio::test] @@ -551,17 +551,17 @@ mod tests { let mut builder = store.create_base_layer().await.unwrap(); let base_name = builder.name(); - builder.add_string_triple(StringTriple::new_value("cow", "says", "moo")); - builder.add_string_triple(StringTriple::new_value("pig", "says", "oink")); - builder.add_string_triple(StringTriple::new_value("duck", "says", "quack")); + 
builder.add_value_triple(ValueTriple::new_string_value("cow", "says", "moo")); + builder.add_value_triple(ValueTriple::new_string_value("pig", "says", "oink")); + builder.add_value_triple(ValueTriple::new_string_value("duck", "says", "quack")); builder.commit_boxed().await.unwrap(); let mut builder = store.create_child_layer(base_name).await.unwrap(); let child_name = builder.name(); - builder.remove_string_triple(StringTriple::new_value("duck", "says", "quack")); - builder.add_string_triple(StringTriple::new_node("cow", "likes", "pig")); + builder.remove_value_triple(ValueTriple::new_string_value("duck", "says", "quack")); + builder.add_value_triple(ValueTriple::new_node("cow", "likes", "pig")); builder.commit_boxed().await.unwrap(); @@ -575,12 +575,15 @@ mod tests { _ => panic!("not a rollup"), } - assert!(rolled_layer.string_triple_exists(&StringTriple::new_value("cow", "says", "moo"))); - assert!(rolled_layer.string_triple_exists(&StringTriple::new_value("pig", "says", "oink"))); - assert!(rolled_layer.string_triple_exists(&StringTriple::new_node("cow", "likes", "pig"))); assert!( - !rolled_layer.string_triple_exists(&StringTriple::new_value("duck", "says", "quack")) + rolled_layer.value_triple_exists(&ValueTriple::new_string_value("cow", "says", "moo")) ); + assert!( + rolled_layer.value_triple_exists(&ValueTriple::new_string_value("pig", "says", "oink")) + ); + assert!(rolled_layer.value_triple_exists(&ValueTriple::new_node("cow", "likes", "pig"))); + assert!(!rolled_layer + .value_triple_exists(&ValueTriple::new_string_value("duck", "says", "quack"))); } #[tokio::test] @@ -591,25 +594,25 @@ mod tests { let mut builder = store.create_base_layer().await.unwrap(); let base_name = builder.name(); - builder.add_string_triple(StringTriple::new_value("cow", "says", "moo")); - builder.add_string_triple(StringTriple::new_value("pig", "says", "oink")); - builder.add_string_triple(StringTriple::new_value("duck", "says", "quack")); + builder.add_value_triple(ValueTriple::new_string_value("cow", "says", "moo")); + builder.add_value_triple(ValueTriple::new_string_value("pig", "says", "oink")); + builder.add_value_triple(ValueTriple::new_string_value("duck", "says", "quack")); builder.commit_boxed().await.unwrap(); let mut builder = store.create_child_layer(base_name).await.unwrap(); let child_name = builder.name(); - builder.remove_string_triple(StringTriple::new_value("duck", "says", "quack")); - builder.add_string_triple(StringTriple::new_node("cow", "likes", "pig")); + builder.remove_value_triple(ValueTriple::new_string_value("duck", "says", "quack")); + builder.add_value_triple(ValueTriple::new_node("cow", "likes", "pig")); builder.commit_boxed().await.unwrap(); let mut builder = store.create_child_layer(child_name).await.unwrap(); let child_name = builder.name(); - builder.remove_string_triple(StringTriple::new_value("cow", "likes", "pig")); - builder.add_string_triple(StringTriple::new_node("cow", "hates", "pig")); + builder.remove_value_triple(ValueTriple::new_string_value("cow", "likes", "pig")); + builder.add_value_triple(ValueTriple::new_node("cow", "hates", "pig")); builder.commit_boxed().await.unwrap(); @@ -627,13 +630,17 @@ mod tests { _ => panic!("not a rollup"), } - assert!(rolled_layer.string_triple_exists(&StringTriple::new_value("cow", "says", "moo"))); - assert!(rolled_layer.string_triple_exists(&StringTriple::new_value("pig", "says", "oink"))); - assert!(rolled_layer.string_triple_exists(&StringTriple::new_node("cow", "hates", "pig"))); - 
assert!(!rolled_layer.string_triple_exists(&StringTriple::new_value("cow", "likes", "pig"))); assert!( - !rolled_layer.string_triple_exists(&StringTriple::new_value("duck", "says", "quack")) + rolled_layer.value_triple_exists(&ValueTriple::new_string_value("cow", "says", "moo")) + ); + assert!( + rolled_layer.value_triple_exists(&ValueTriple::new_string_value("pig", "says", "oink")) ); + assert!(rolled_layer.value_triple_exists(&ValueTriple::new_node("cow", "hates", "pig"))); + assert!(!rolled_layer + .value_triple_exists(&ValueTriple::new_string_value("cow", "likes", "pig"))); + assert!(!rolled_layer + .value_triple_exists(&ValueTriple::new_string_value("duck", "says", "quack"))); } #[tokio::test] diff --git a/src/storage/layer.rs b/src/storage/layer.rs index 323e6732..53337ea8 100644 --- a/src/storage/layer.rs +++ b/src/storage/layer.rs @@ -2228,7 +2228,7 @@ pub(crate) async fn file_triple_layer_count( #[cfg(test)] mod tests { use super::*; - use crate::layer::{Layer, ObjectType, StringTriple}; + use crate::layer::{Layer, ObjectType, ValueTriple}; use crate::storage::directory::DirectoryLayerStore; use crate::storage::memory::MemoryLayerStore; use std::collections::HashMap; @@ -2238,33 +2238,33 @@ mod tests { // They test functionality that should really work for both lazy_static! { - static ref BASE_TRIPLES: Vec = vec![ - StringTriple::new_value("cow", "says", "moo"), - StringTriple::new_value("cow", "says", "mooo"), - StringTriple::new_node("cow", "likes", "duck"), - StringTriple::new_node("cow", "likes", "pig"), - StringTriple::new_value("cow", "name", "clarabelle"), - StringTriple::new_value("pig", "says", "oink"), - StringTriple::new_node("pig", "hates", "cow"), - StringTriple::new_value("duck", "says", "quack"), - StringTriple::new_node("duck", "hates", "cow"), - StringTriple::new_node("duck", "hates", "pig"), - StringTriple::new_value("duck", "name", "donald"), + static ref BASE_TRIPLES: Vec = vec![ + ValueTriple::new_string_value("cow", "says", "moo"), + ValueTriple::new_string_value("cow", "says", "mooo"), + ValueTriple::new_node("cow", "likes", "duck"), + ValueTriple::new_node("cow", "likes", "pig"), + ValueTriple::new_string_value("cow", "name", "clarabelle"), + ValueTriple::new_string_value("pig", "says", "oink"), + ValueTriple::new_node("pig", "hates", "cow"), + ValueTriple::new_string_value("duck", "says", "quack"), + ValueTriple::new_node("duck", "hates", "cow"), + ValueTriple::new_node("duck", "hates", "pig"), + ValueTriple::new_string_value("duck", "name", "donald"), ]; - static ref CHILD_ADDITION_TRIPLES: Vec = vec![ - StringTriple::new_value("cow", "says", "moooo"), - StringTriple::new_value("cow", "says", "mooooo"), - StringTriple::new_node("cow", "likes", "horse"), - StringTriple::new_node("pig", "likes", "platypus"), - StringTriple::new_node("duck", "hates", "platypus"), + static ref CHILD_ADDITION_TRIPLES: Vec = vec![ + ValueTriple::new_string_value("cow", "says", "moooo"), + ValueTriple::new_string_value("cow", "says", "mooooo"), + ValueTriple::new_node("cow", "likes", "horse"), + ValueTriple::new_node("pig", "likes", "platypus"), + ValueTriple::new_node("duck", "hates", "platypus"), ]; - static ref CHILD_REMOVAL_TRIPLES: Vec = vec![ - StringTriple::new_value("cow", "says", "mooo"), - StringTriple::new_value("cow", "name", "clarabelle"), - StringTriple::new_node("pig", "hates", "cow"), - StringTriple::new_node("duck", "hates", "cow"), - StringTriple::new_node("duck", "hates", "pig"), - StringTriple::new_value("duck", "name", "donald"), + static ref 
CHILD_REMOVAL_TRIPLES: Vec = vec![ + ValueTriple::new_string_value("cow", "says", "mooo"), + ValueTriple::new_string_value("cow", "name", "clarabelle"), + ValueTriple::new_node("pig", "hates", "cow"), + ValueTriple::new_node("duck", "hates", "cow"), + ValueTriple::new_node("duck", "hates", "pig"), + ValueTriple::new_string_value("duck", "name", "donald"), ]; } @@ -2274,19 +2274,19 @@ mod tests { ) -> io::Result<( [u32; 5], Option>, - HashMap, + HashMap, )> { let mut builder = store.create_base_layer().await?; let name = builder.name(); for t in BASE_TRIPLES.iter() { - builder.add_string_triple(t.clone()); + builder.add_value_triple(t.clone()); } builder.commit_boxed().await?; let layer = store.get_layer(name).await?.unwrap(); let mut contents = HashMap::with_capacity(BASE_TRIPLES.len()); for t in BASE_TRIPLES.iter() { - let t_id = layer.string_triple_to_id(t).unwrap(); + let t_id = layer.value_triple_to_id(t).unwrap(); contents.insert(t.clone(), t_id); } @@ -2304,30 +2304,30 @@ mod tests { ) -> io::Result<( [u32; 5], Option>, - HashMap, - HashMap, + HashMap, + HashMap, )> { let (base_name, _base_layer, _) = example_base_layer(store, false).await?; let mut builder = store.create_child_layer(base_name).await?; let name = builder.name(); for t in CHILD_ADDITION_TRIPLES.iter() { - builder.add_string_triple(t.clone()); + builder.add_value_triple(t.clone()); } for t in CHILD_REMOVAL_TRIPLES.iter() { - builder.remove_string_triple(t.clone()); + builder.remove_value_triple(t.clone()); } builder.commit_boxed().await?; let layer = store.get_layer(name).await?.unwrap(); let mut add_contents = HashMap::with_capacity(BASE_TRIPLES.len()); for t in CHILD_ADDITION_TRIPLES.iter() { - let t_id = layer.string_triple_to_id(t).unwrap(); + let t_id = layer.value_triple_to_id(t).unwrap(); add_contents.insert(t.clone(), t_id); } let mut remove_contents = HashMap::with_capacity(BASE_TRIPLES.len()); for t in CHILD_REMOVAL_TRIPLES.iter() { - let t_id = layer.string_triple_to_id(t).unwrap(); + let t_id = layer.value_triple_to_id(t).unwrap(); remove_contents.insert(t.clone(), t_id); } diff --git a/src/storage/memory.rs b/src/storage/memory.rs index f277314f..26150a5a 100644 --- a/src/storage/memory.rs +++ b/src/storage/memory.rs @@ -520,24 +520,24 @@ mod tests { let mut builder = store.create_base_layer().await.unwrap(); let base_name = builder.name(); - builder.add_string_triple(StringTriple::new_value("cow", "says", "moo")); - builder.add_string_triple(StringTriple::new_value("pig", "says", "oink")); - builder.add_string_triple(StringTriple::new_value("duck", "says", "quack")); + builder.add_value_triple(ValueTriple::new_string_value("cow", "says", "moo")); + builder.add_value_triple(ValueTriple::new_string_value("pig", "says", "oink")); + builder.add_value_triple(ValueTriple::new_string_value("duck", "says", "quack")); builder.commit_boxed().await.unwrap(); builder = store.create_child_layer(base_name).await.unwrap(); let child_name = builder.name(); - builder.remove_string_triple(StringTriple::new_value("duck", "says", "quack")); - builder.add_string_triple(StringTriple::new_node("cow", "likes", "pig")); + builder.remove_value_triple(ValueTriple::new_string_value("duck", "says", "quack")); + builder.add_value_triple(ValueTriple::new_node("cow", "likes", "pig")); builder.commit_boxed().await.unwrap(); let layer = store.get_layer(child_name).await.unwrap().unwrap(); - assert!(layer.string_triple_exists(&StringTriple::new_value("cow", "says", "moo"))); - 
assert!(layer.string_triple_exists(&StringTriple::new_value("pig", "says", "oink"))); - assert!(layer.string_triple_exists(&StringTriple::new_node("cow", "likes", "pig"))); - assert!(!layer.string_triple_exists(&StringTriple::new_value("duck", "says", "quack"))); + assert!(layer.value_triple_exists(&ValueTriple::new_string_value("cow", "says", "moo"))); + assert!(layer.value_triple_exists(&ValueTriple::new_string_value("pig", "says", "oink"))); + assert!(layer.value_triple_exists(&ValueTriple::new_node("cow", "likes", "pig"))); + assert!(!layer.value_triple_exists(&ValueTriple::new_string_value("duck", "says", "quack"))); } #[tokio::test] diff --git a/src/storage/pack.rs b/src/storage/pack.rs index c598a88b..44a634f1 100644 --- a/src/storage/pack.rs +++ b/src/storage/pack.rs @@ -335,16 +335,16 @@ mod tests { let mut builder = store1.create_base_layer().await.unwrap(); let base_name = builder.name(); - builder.add_string_triple(StringTriple::new_node("cow", "likes", "duck")); - builder.add_string_triple(StringTriple::new_node("duck", "hates", "cow")); + builder.add_value_triple(ValueTriple::new_node("cow", "likes", "duck")); + builder.add_value_triple(ValueTriple::new_node("duck", "hates", "cow")); builder.commit_boxed().await.unwrap(); let mut builder = store1.create_child_layer(base_name).await.unwrap(); let child_name = builder.name(); - builder.remove_string_triple(StringTriple::new_node("duck", "hates", "cow")); - builder.add_string_triple(StringTriple::new_node("duck", "likes", "cow")); + builder.remove_value_triple(ValueTriple::new_node("duck", "hates", "cow")); + builder.add_value_triple(ValueTriple::new_node("duck", "likes", "cow")); builder.commit_boxed().await.unwrap(); @@ -369,8 +369,8 @@ mod tests { .collect(); assert_eq!( vec![ - StringTriple::new_node("cow", "likes", "duck"), - StringTriple::new_node("duck", "likes", "cow") + ValueTriple::new_node("cow", "likes", "duck"), + ValueTriple::new_node("duck", "likes", "cow") ], triples ); diff --git a/src/store/mod.rs b/src/store/mod.rs index 0a587b1c..2e28ca82 100644 --- a/src/store/mod.rs +++ b/src/store/mod.rs @@ -6,10 +6,11 @@ pub mod sync; use std::path::PathBuf; use std::sync::{Arc, RwLock}; -use crate::layer::{IdTriple, Layer, LayerBuilder, LayerCounts, ObjectType, StringTriple}; +use crate::layer::{IdTriple, Layer, LayerBuilder, LayerCounts, ObjectType, ValueTriple}; use crate::storage::directory::{DirectoryLabelStore, DirectoryLayerStore}; use crate::storage::memory::{MemoryLabelStore, MemoryLayerStore}; use crate::storage::{CachedLayerStore, LabelStore, LayerStore, LockingHashMapLayerCache}; +use crate::structure::TypedDictEntry; use std::io; @@ -90,8 +91,8 @@ impl StoreLayerBuilder { } /// Add a string triple. - pub fn add_string_triple(&self, triple: StringTriple) -> Result<(), io::Error> { - self.with_builder(move |b| b.add_string_triple(triple)) + pub fn add_value_triple(&self, triple: ValueTriple) -> Result<(), io::Error> { + self.with_builder(move |b| b.add_value_triple(triple)) } /// Add an id triple. @@ -100,8 +101,8 @@ impl StoreLayerBuilder { } /// Remove a string triple. - pub fn remove_string_triple(&self, triple: StringTriple) -> Result<(), io::Error> { - self.with_builder(move |b| b.remove_string_triple(triple)) + pub fn remove_value_triple(&self, triple: ValueTriple) -> Result<(), io::Error> { + self.with_builder(move |b| b.remove_value_triple(triple)) } /// Remove an id triple. 
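At the store level the builder keeps the same commit flow, only with the renamed entry points. A sketch of the typical round trip, mirroring the tests further down:

    // Inside an async context, as in the tests below.
    let store = open_memory_store();

    let builder = store.create_base_layer().await.unwrap();
    builder
        .add_value_triple(ValueTriple::new_string_value("cow", "says", "moo"))
        .unwrap();
    let layer = builder.commit().await.unwrap();

    assert!(layer.value_triple_exists(&ValueTriple::new_string_value("cow", "says", "moo")));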
@@ -165,14 +166,14 @@ impl StoreLayerBuilder { triple_additions.par_bridge().for_each(|t| { delta .id_triple_to_string(&t) - .map(|st| self.add_string_triple(st)); + .map(|st| self.add_value_triple(st)); }); }, move || { triple_removals.par_bridge().for_each(|t| { delta .id_triple_to_string(&t) - .map(|st| self.remove_string_triple(st)); + .map(|st| self.remove_value_triple(st)); }) }, ); @@ -189,8 +190,8 @@ impl StoreLayerBuilder { if let Some(this) = self.parent() { this.triples().par_bridge().for_each(|t| { if let Some(st) = this.id_triple_to_string(&t) { - if !other.string_triple_exists(&st) { - self.remove_string_triple(st).unwrap() + if !other.value_triple_exists(&st) { + self.remove_value_triple(st).unwrap() } } }) @@ -200,11 +201,11 @@ impl StoreLayerBuilder { other.triples().par_bridge().for_each(|t| { if let Some(st) = other.id_triple_to_string(&t) { if let Some(this) = self.parent() { - if !this.string_triple_exists(&st) { - self.add_string_triple(st).unwrap() + if !this.value_triple_exists(&st) { + self.add_value_triple(st).unwrap() } } else { - self.add_string_triple(st).unwrap() + self.add_value_triple(st).unwrap() }; } }) @@ -278,7 +279,7 @@ impl StoreLayer { let new_builder = self.store.create_base_layer().await?; self.triples().par_bridge().for_each(|t| { let st = self.id_triple_to_string(&t).unwrap(); - new_builder.add_string_triple(st).unwrap() + new_builder.add_value_triple(st).unwrap() }); new_builder.commit().await @@ -577,7 +578,7 @@ impl Layer for StoreLayer { self.layer.object_node_id(object) } - fn object_value_id(&self, object: &str) -> Option { + fn object_value_id(&self, object: &TypedDictEntry) -> Option { self.layer.object_value_id(object) } @@ -892,7 +893,7 @@ mod tests { let mut builder = store.create_base_layer().await.unwrap(); builder - .add_string_triple(StringTriple::new_value("cow", "says", "moo")) + .add_value_triple(ValueTriple::new_string_value("cow", "says", "moo")) .unwrap(); let layer = builder.commit().await.unwrap(); @@ -900,7 +901,7 @@ mod tests { builder = layer.open_write().await.unwrap(); builder - .add_string_triple(StringTriple::new_value("pig", "says", "oink")) + .add_value_triple(ValueTriple::new_string_value("pig", "says", "oink")) .unwrap(); let layer2 = builder.commit().await.unwrap(); @@ -910,8 +911,8 @@ mod tests { let layer = database.head().await.unwrap().unwrap(); assert_eq!(layer2_name, layer.name()); - assert!(layer.string_triple_exists(&StringTriple::new_value("cow", "says", "moo"))); - assert!(layer.string_triple_exists(&StringTriple::new_value("pig", "says", "oink"))); + assert!(layer.value_triple_exists(&ValueTriple::new_string_value("cow", "says", "moo"))); + assert!(layer.value_triple_exists(&ValueTriple::new_string_value("pig", "says", "oink"))); } #[tokio::test] @@ -934,7 +935,7 @@ mod tests { let store = open_memory_store(); let builder = store.create_base_layer().await.unwrap(); builder - .add_string_triple(StringTriple::new_value("cow", "says", "moo")) + .add_value_triple(ValueTriple::new_string_value("cow", "says", "moo")) .unwrap(); let layer = builder.commit().await.unwrap(); @@ -943,7 +944,7 @@ mod tests { let layer2 = store.get_layer_from_id(id).await.unwrap().unwrap(); - assert!(layer2.string_triple_exists(&StringTriple::new_value("cow", "says", "moo"))); + assert!(layer2.value_triple_exists(&ValueTriple::new_string_value("cow", "says", "moo"))); } #[tokio::test] @@ -952,7 +953,7 @@ mod tests { let builder = store.create_base_layer().await.unwrap(); builder - .add_string_triple(StringTriple::new_value("cow", 
"says", "moo")) + .add_value_triple(ValueTriple::new_string_value("cow", "says", "moo")) .unwrap(); assert!(!builder.committed()); @@ -969,7 +970,7 @@ mod tests { let builder1 = store.create_base_layer().await.unwrap(); builder1 - .add_string_triple(StringTriple::new_value("cow", "says", "moo")) + .add_value_triple(ValueTriple::new_string_value("cow", "says", "moo")) .unwrap(); let layer1 = builder1.commit().await.unwrap(); @@ -978,7 +979,7 @@ mod tests { let builder2 = store.create_base_layer().await.unwrap(); builder2 - .add_string_triple(StringTriple::new_value("duck", "says", "quack")) + .add_value_triple(ValueTriple::new_string_value("duck", "says", "quack")) .unwrap(); let layer2 = builder2.commit().await.unwrap(); @@ -987,8 +988,12 @@ mod tests { let new_layer = database.head().await.unwrap().unwrap(); - assert!(new_layer.string_triple_exists(&StringTriple::new_value("duck", "says", "quack"))); - assert!(!new_layer.string_triple_exists(&StringTriple::new_value("cow", "says", "moo"))); + assert!( + new_layer.value_triple_exists(&ValueTriple::new_string_value("duck", "says", "quack")) + ); + assert!( + !new_layer.value_triple_exists(&ValueTriple::new_string_value("cow", "says", "moo")) + ); } #[tokio::test] @@ -996,7 +1001,7 @@ mod tests { let store = open_memory_store(); let builder = store.create_base_layer().await.unwrap(); builder - .add_string_triple(StringTriple::new_value("cow", "says", "moo")) + .add_value_triple(ValueTriple::new_string_value("cow", "says", "moo")) .unwrap(); let layer = builder.commit().await.unwrap(); @@ -1004,15 +1009,15 @@ mod tests { let builder2 = layer.open_write().await.unwrap(); builder2 - .add_string_triple(StringTriple::new_value("dog", "says", "woof")) + .add_value_triple(ValueTriple::new_string_value("dog", "says", "woof")) .unwrap(); let layer2 = builder2.commit().await.unwrap(); let new = layer2.squash().await.unwrap(); - assert!(new.string_triple_exists(&StringTriple::new_value("cow", "says", "moo"))); - assert!(new.string_triple_exists(&StringTriple::new_value("dog", "says", "woof"))); + assert!(new.value_triple_exists(&ValueTriple::new_string_value("cow", "says", "moo"))); + assert!(new.value_triple_exists(&ValueTriple::new_string_value("dog", "says", "woof"))); assert!(new.parent().await.unwrap().is_none()); } @@ -1022,7 +1027,7 @@ mod tests { let builder = store.create_base_layer().await.unwrap(); builder - .add_string_triple(StringTriple::new_value("cow", "says", "moo")) + .add_value_triple(ValueTriple::new_string_value("cow", "says", "moo")) .unwrap(); let layer = builder.commit().await.unwrap(); @@ -1030,7 +1035,7 @@ mod tests { let builder2 = layer.open_write().await.unwrap(); builder2 - .add_string_triple(StringTriple::new_value("dog", "says", "woof")) + .add_value_triple(ValueTriple::new_string_value("dog", "says", "woof")) .unwrap(); let layer2 = builder2.commit().await.unwrap(); @@ -1038,10 +1043,10 @@ mod tests { let delta_builder_1 = store.create_base_layer().await.unwrap(); delta_builder_1 - .add_string_triple(StringTriple::new_value("dog", "says", "woof")) + .add_value_triple(ValueTriple::new_string_value("dog", "says", "woof")) .unwrap(); delta_builder_1 - .add_string_triple(StringTriple::new_value("cat", "says", "meow")) + .add_value_triple(ValueTriple::new_string_value("cat", "says", "meow")) .unwrap(); let delta_1 = delta_builder_1.commit().await.unwrap(); @@ -1049,10 +1054,10 @@ mod tests { let delta_builder_2 = delta_1.open_write().await.unwrap(); delta_builder_2 - .add_string_triple(StringTriple::new_value("crow", "says", 
"caw")) + .add_value_triple(ValueTriple::new_string_value("crow", "says", "caw")) .unwrap(); delta_builder_2 - .remove_string_triple(StringTriple::new_value("cat", "says", "meow")) + .remove_value_triple(ValueTriple::new_string_value("cat", "says", "meow")) .unwrap(); let delta = delta_builder_2.commit().await.unwrap(); @@ -1063,10 +1068,17 @@ mod tests { let rebase_layer = rebase_builder.commit().await.unwrap(); - assert!(rebase_layer.string_triple_exists(&StringTriple::new_value("cow", "says", "moo"))); - assert!(rebase_layer.string_triple_exists(&StringTriple::new_value("crow", "says", "caw"))); - assert!(rebase_layer.string_triple_exists(&StringTriple::new_value("dog", "says", "woof"))); - assert!(!rebase_layer.string_triple_exists(&StringTriple::new_value("cat", "says", "meow"))); + assert!( + rebase_layer.value_triple_exists(&ValueTriple::new_string_value("cow", "says", "moo")) + ); + assert!( + rebase_layer.value_triple_exists(&ValueTriple::new_string_value("crow", "says", "caw")) + ); + assert!( + rebase_layer.value_triple_exists(&ValueTriple::new_string_value("dog", "says", "woof")) + ); + assert!(!rebase_layer + .value_triple_exists(&ValueTriple::new_string_value("cat", "says", "meow"))); } async fn cached_layer_name_does_not_change_after_rollup(store: Store) { diff --git a/src/store/sync.rs b/src/store/sync.rs index ca1dc04f..b304299d 100644 --- a/src/store/sync.rs +++ b/src/store/sync.rs @@ -11,10 +11,11 @@ use tokio::runtime::Runtime; use std::io; use std::path::PathBuf; -use crate::layer::{IdTriple, Layer, LayerCounts, ObjectType, StringTriple}; +use crate::layer::{IdTriple, Layer, LayerCounts, ObjectType, ValueTriple}; use crate::store::{ open_directory_store, open_memory_store, NamedGraph, Store, StoreLayer, StoreLayerBuilder, }; +use crate::structure::TypedDictEntry; lazy_static! { static ref RUNTIME: Runtime = Runtime::new().unwrap(); @@ -54,8 +55,8 @@ impl SyncStoreLayerBuilder { } /// Add a string triple. - pub fn add_string_triple(&self, triple: StringTriple) -> Result<(), io::Error> { - self.inner.add_string_triple(triple) + pub fn add_value_triple(&self, triple: ValueTriple) -> Result<(), io::Error> { + self.inner.add_value_triple(triple) } /// Add an id triple. @@ -64,8 +65,8 @@ impl SyncStoreLayerBuilder { } /// Remove a string triple. - pub fn remove_string_triple(&self, triple: StringTriple) -> Result<(), io::Error> { - self.inner.remove_string_triple(triple) + pub fn remove_value_triple(&self, triple: ValueTriple) -> Result<(), io::Error> { + self.inner.remove_value_triple(triple) } /// Remove an id triple. 
@@ -376,7 +377,7 @@ impl Layer for SyncStoreLayer { self.inner.object_node_id(object) } - fn object_value_id(&self, object: &str) -> Option { + fn object_value_id(&self, object: &TypedDictEntry) -> Option { self.inner.object_value_id(object) } @@ -603,7 +604,7 @@ mod tests { let mut builder = store.create_base_layer().unwrap(); builder - .add_string_triple(StringTriple::new_value("cow", "says", "moo")) + .add_value_triple(ValueTriple::new_string_value("cow", "says", "moo")) .unwrap(); let layer = builder.commit().unwrap(); @@ -611,7 +612,7 @@ mod tests { builder = layer.open_write().unwrap(); builder - .add_string_triple(StringTriple::new_value("pig", "says", "oink")) + .add_value_triple(ValueTriple::new_string_value("pig", "says", "oink")) .unwrap(); let layer2 = builder.commit().unwrap(); @@ -621,8 +622,8 @@ mod tests { let layer = database.head().unwrap().unwrap(); assert_eq!(layer2_name, layer.name()); - assert!(layer.string_triple_exists(&StringTriple::new_value("cow", "says", "moo"))); - assert!(layer.string_triple_exists(&StringTriple::new_value("pig", "says", "oink"))); + assert!(layer.value_triple_exists(&ValueTriple::new_string_value("cow", "says", "moo"))); + assert!(layer.value_triple_exists(&ValueTriple::new_string_value("pig", "says", "oink"))); } #[test] @@ -636,7 +637,7 @@ mod tests { let mut builder = store.create_base_layer().unwrap(); builder - .add_string_triple(StringTriple::new_value("cow", "says", "moo")) + .add_value_triple(ValueTriple::new_string_value("cow", "says", "moo")) .unwrap(); let layer = builder.commit().unwrap(); @@ -644,7 +645,7 @@ mod tests { builder = layer.open_write().unwrap(); builder - .add_string_triple(StringTriple::new_value("pig", "says", "oink")) + .add_value_triple(ValueTriple::new_string_value("pig", "says", "oink")) .unwrap(); let layer2 = builder.commit().unwrap(); @@ -654,8 +655,8 @@ mod tests { let layer = database.head().unwrap().unwrap(); assert_eq!(layer2_name, layer.name()); - assert!(layer.string_triple_exists(&StringTriple::new_value("cow", "says", "moo"))); - assert!(layer.string_triple_exists(&StringTriple::new_value("pig", "says", "oink"))); + assert!(layer.value_triple_exists(&ValueTriple::new_string_value("cow", "says", "moo"))); + assert!(layer.value_triple_exists(&ValueTriple::new_string_value("pig", "says", "oink"))); } #[test] @@ -663,7 +664,7 @@ mod tests { let store = open_sync_memory_store(); let builder = store.create_base_layer().unwrap(); builder - .add_string_triple(StringTriple::new_value("cow", "says", "moo")) + .add_value_triple(ValueTriple::new_string_value("cow", "says", "moo")) .unwrap(); let layer = builder.commit().unwrap(); @@ -671,7 +672,7 @@ mod tests { let id = layer.name(); let layer2 = store.get_layer_from_id(id).unwrap().unwrap(); - assert!(layer2.string_triple_exists(&StringTriple::new_value("cow", "says", "moo"))); + assert!(layer2.value_triple_exists(&ValueTriple::new_string_value("cow", "says", "moo"))); } #[test] @@ -679,7 +680,7 @@ mod tests { let store = open_sync_memory_store(); let builder = store.create_base_layer().unwrap(); builder - .add_string_triple(StringTriple::new_value("cow", "says", "moo")) + .add_value_triple(ValueTriple::new_string_value("cow", "says", "moo")) .unwrap(); assert!(!builder.committed()); @@ -700,19 +701,19 @@ mod tests { let builder1 = store1.create_base_layer().unwrap(); builder1 - .add_string_triple(StringTriple::new_value("cow", "says", "moo")) + .add_value_triple(ValueTriple::new_string_value("cow", "says", "moo")) .unwrap(); let layer1 = 
builder1.commit().unwrap(); let builder2 = store1.create_base_layer().unwrap(); builder2 - .add_string_triple(StringTriple::new_value("duck", "says", "quack")) + .add_value_triple(ValueTriple::new_string_value("duck", "says", "quack")) .unwrap(); let layer2 = builder2.commit().unwrap(); let builder3 = layer2.open_write().unwrap(); builder3 - .add_string_triple(StringTriple::new_value("horse", "says", "neigh")) + .add_value_triple(ValueTriple::new_string_value("horse", "says", "neigh")) .unwrap(); let layer3 = builder3.commit().unwrap(); @@ -733,11 +734,9 @@ mod tests { .unwrap(); let result_layer = store2.get_layer_from_id(layer3.name()).unwrap().unwrap(); - assert!( - result_layer.string_triple_exists(&StringTriple::new_value("duck", "says", "quack")) - ); - assert!( - result_layer.string_triple_exists(&StringTriple::new_value("horse", "says", "neigh")) - ); + assert!(result_layer + .value_triple_exists(&ValueTriple::new_string_value("duck", "says", "quack"))); + assert!(result_layer + .value_triple_exists(&ValueTriple::new_string_value("horse", "says", "neigh"))); } } diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index a3fa1aed..a344177e 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -110,6 +110,10 @@ impl TypedDict { pub fn id>(&self, v: &Q) -> IdLookupResult { let entry = T::make_entry(v); + self.id_entry(&entry) + } + + pub fn id_entry(&self, entry: &TypedDictEntry) -> IdLookupResult { self.id_slice(entry.datatype, &entry.to_bytes()) } @@ -174,6 +178,7 @@ impl TypedDict { } } + // TOOD: would be nice if this worked on a buf instead of a slice pub fn id_slice(&self, dt: Datatype, slice: &[u8]) -> IdLookupResult { if let Some((dict, offset)) = self.type_segment(dt) { let result = dict.id(slice).offset(offset); From f741afef329b80c27707d168004ca04dbbeaa2f6 Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Wed, 7 Dec 2022 15:06:35 +0100 Subject: [PATCH 75/99] value extraction from typed dict + test --- src/layer/layer.rs | 65 ++++++++++++++++++++++++++++++++++++++ src/structure/tfc/typed.rs | 8 +++-- 2 files changed, 71 insertions(+), 2 deletions(-) diff --git a/src/layer/layer.rs b/src/layer/layer.rs index fa6fe21b..51652384 100644 --- a/src/layer/layer.rs +++ b/src/layer/layer.rs @@ -591,4 +591,69 @@ mod tests { triple_2 ); } + + #[tokio::test] + async fn find_nonstring_triples() { + let files = base_layer_files(); + let mut builder = SimpleLayerBuilder::new([1, 2, 3, 4, 5], files.clone()); + + builder.add_value_triple(ValueTriple::new_value( + "duck", + "num_feet", + u32::make_entry(&2), + )); + builder.add_value_triple(ValueTriple::new_value( + "cow", + "num_feet", + u32::make_entry(&4), + )); + builder.add_value_triple(ValueTriple::new_value( + "disabled_cow", + "num_feet", + u32::make_entry(&3), + )); + builder.add_value_triple(ValueTriple::new_value( + "duck", + "swims", + String::make_entry(&"true"), + )); + builder.add_value_triple(ValueTriple::new_value( + "cow", + "swims", + String::make_entry(&"false"), + )); + builder.add_value_triple(ValueTriple::new_value( + "disabled_cow", + "swims", + String::make_entry(&"false"), + )); + + builder.commit().await.unwrap(); + + let base: Arc = Arc::new( + BaseLayer::load_from_files([1, 2, 3, 4, 5], &files) + .await + .unwrap() + .into(), + ); + + let mut results: Vec<_> = base + .triples_p(base.predicate_id("num_feet").unwrap()) + .map(|t| { + ( + base.id_subject(t.subject).unwrap(), + base.id_object_value(t.object).unwrap().as_val::(), + ) + }) + .collect(); + results.sort(); + 
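+        // as_val checks that the entry's stored datatype matches the requested
+        // one and then decodes the lexical bytes back into a native u32.
+        // Sorting by subject keeps the assertion below independent of the
+        // order in which triples_p yields matches.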
+ let expected = vec![ + ("cow".to_owned(), 4), + ("disabled_cow".to_owned(), 3), + ("duck".to_owned(), 2), + ]; + + assert_eq!(expected, results); + } } diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index a344177e..101978d4 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -9,7 +9,7 @@ use std::{borrow::Cow, marker::PhantomData}; use super::{ block::{IdLookupResult, SizedDictBlock, SizedDictEntry}, dict::{SizedDict, SizedDictBufBuilder}, - Datatype, OwnedSizedDictEntryBuf, SizedDictEntryBuf, TdbDataType, ToLexical, + Datatype, FromLexical, OwnedSizedDictEntryBuf, SizedDictEntryBuf, TdbDataType, ToLexical, }; #[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] @@ -33,6 +33,11 @@ impl TypedDictEntry { pub fn into_buf(self) -> OwnedSizedDictEntryBuf { self.entry.into_buf() } + + pub fn as_val>(&self) -> T { + assert_eq!(Q::datatype(), self.datatype); + T::from_lexical(self.entry.as_buf()) + } } #[derive(Clone, Debug)] @@ -494,7 +499,6 @@ mod tests { use bytes::BytesMut; use rug::Integer; - use super::super::datatypes::FromLexical; use crate::structure::Decimal; use super::*; From 9a310036e405faa212fbe3ffddb334101ea71c5b Mon Sep 17 00:00:00 2001 From: Gavin Mendel-Gleason Date: Wed, 7 Dec 2022 16:14:06 +0100 Subject: [PATCH 76/99] Adding datatype accessor function --- src/structure/tfc/typed.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index 101978d4..778c8976 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -38,6 +38,10 @@ impl TypedDictEntry { assert_eq!(Q::datatype(), self.datatype); T::from_lexical(self.entry.as_buf()) } + + pub fn datatype(&self) -> Datatype { + self.datatype + } } #[derive(Clone, Debug)] From 8d0b943697e33a5b8531b5871d4e9f6b8aa05bf2 Mon Sep 17 00:00:00 2001 From: Gavin Mendel-Gleason Date: Wed, 7 Dec 2022 22:46:30 +0100 Subject: [PATCH 77/99] Some more data types --- src/structure/tfc/datatypes.rs | 68 ++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/src/structure/tfc/datatypes.rs b/src/structure/tfc/datatypes.rs index d12e4e7b..da76f494 100644 --- a/src/structure/tfc/datatypes.rs +++ b/src/structure/tfc/datatypes.rs @@ -19,6 +19,8 @@ pub enum Datatype { Float64, Decimal, BigInt, + Boolean, + Token, } impl Datatype { @@ -32,6 +34,7 @@ impl Datatype { pub fn record_size(&self) -> Option { match self { + Datatype::Boolean => Some(4), // this is huge Datatype::String => None, Datatype::UInt32 => Some(4), Datatype::Int32 => Some(4), @@ -278,8 +281,73 @@ impl FromLexical for Decimal { } } +impl FromLexical for String { + fn from_lexical(b: B) -> Self { + // TODO make this better + Decimal::from_lexical(b).0 + } +} + impl ToLexical for Decimal { fn to_lexical(&self) -> Bytes { Bytes::from(decimal_to_storage(&self.0)) } } + +impl TdbDataType for bool { + fn datatype() -> Datatype { + Datatype::Boolean + } +} + +impl FromLexical for bool { + fn from_lexical(mut b: B) -> Self { + let num = b.get_u8(); + if num == 0 { + false + } else { + true + } + } +} + +impl ToLexical for bool { + fn to_lexical(&self) -> Bytes { + if *self { + vec![1].into() + } else { + vec![0].into() + } + } +} + +#[derive(PartialEq, Debug)] +pub struct Token(pub String); + +impl TdbDataType for Token { + fn datatype() -> Datatype { + Datatype::Token + } +} + +impl ToLexical for Token { + fn to_lexical(&self) -> Bytes { + Bytes::copy_from_slice(self.0.as_ref().as_bytes()) + } +} + +impl FromLexical for Token { + fn 
from_lexical(mut b: B) -> Self { + let mut vec = vec![0; b.remaining()]; + b.copy_to_slice(&mut vec); + Token(String::from_utf8(vec).unwrap()) + } +} + +impl FromLexical for String { + fn from_lexical(mut b: B) -> Self { + let mut vec = vec![0; b.remaining()]; + b.copy_to_slice(&mut vec); + String::from_utf8(vec).unwrap() + } +} From 81d2333c077b224dc718860c521dc8355609bb87 Mon Sep 17 00:00:00 2001 From: Gavin Mendel-Gleason Date: Wed, 7 Dec 2022 23:14:02 +0100 Subject: [PATCH 78/99] Accidentally broke build briefly --- src/structure/tfc/datatypes.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/structure/tfc/datatypes.rs b/src/structure/tfc/datatypes.rs index da76f494..44fbde60 100644 --- a/src/structure/tfc/datatypes.rs +++ b/src/structure/tfc/datatypes.rs @@ -44,6 +44,7 @@ impl Datatype { Datatype::Float64 => Some(8), Datatype::Decimal => None, Datatype::BigInt => None, + Datatype::Token => None, } } } @@ -324,6 +325,12 @@ impl ToLexical for bool { #[derive(PartialEq, Debug)] pub struct Token(pub String); +impl AsRef for Token { + fn as_ref(&self) -> &str { + &self.0 + } +} + impl TdbDataType for Token { fn datatype() -> Datatype { Datatype::Token @@ -332,7 +339,7 @@ impl TdbDataType for Token { impl ToLexical for Token { fn to_lexical(&self) -> Bytes { - Bytes::copy_from_slice(self.0.as_ref().as_bytes()) + Bytes::copy_from_slice(self.as_ref().as_bytes()) } } From d27ad4ed0eeaa96efe6424b6cab1dfa4109d4188 Mon Sep 17 00:00:00 2001 From: Gavin Mendel-Gleason Date: Thu, 8 Dec 2022 08:45:22 +0100 Subject: [PATCH 79/99] Remove length test --- src/structure/tfc/typed.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index 778c8976..d334d64a 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -64,7 +64,7 @@ impl TypedDict { let types_present = MonotonicLogArray::parse(types_present).unwrap(); let type_offsets = MonotonicLogArray::parse(type_offsets).unwrap(); let block_offsets = MonotonicLogArray::parse(block_offsets).unwrap(); - if types_present.len() == 0 { + if types_present.is_empty() { return Self { types_present, type_offsets, From c860a0a658391d5d690d5c88dc793cc01fe0010f Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Thu, 8 Dec 2022 12:25:10 +0100 Subject: [PATCH 80/99] add langstring support --- src/structure/tfc/datatypes.rs | 72 ++++++++++++++++++++-------------- 1 file changed, 42 insertions(+), 30 deletions(-) diff --git a/src/structure/tfc/datatypes.rs b/src/structure/tfc/datatypes.rs index 44fbde60..55f7c2c2 100644 --- a/src/structure/tfc/datatypes.rs +++ b/src/structure/tfc/datatypes.rs @@ -21,6 +21,7 @@ pub enum Datatype { BigInt, Boolean, Token, + LangString, } impl Datatype { @@ -45,6 +46,7 @@ impl Datatype { Datatype::Decimal => None, Datatype::BigInt => None, Datatype::Token => None, + Datatype::LangString => None, } } } @@ -322,39 +324,49 @@ impl ToLexical for bool { } } -#[derive(PartialEq, Debug)] -pub struct Token(pub String); +macro_rules! 
stringy_type { + ($ty:ident) => { + stringy_type!($ty, $ty); + }; + ($ty:ident, $datatype:ident) => { + #[derive(PartialEq, Debug)] + pub struct $ty(String); -impl AsRef for Token { - fn as_ref(&self) -> &str { - &self.0 - } -} + impl AsRef for $ty { + fn as_ref(&self) -> &str { + &self.0 + } + } -impl TdbDataType for Token { - fn datatype() -> Datatype { - Datatype::Token - } -} + impl TdbDataType for $ty { + fn datatype() -> Datatype { + Datatype::$datatype + } + } -impl ToLexical for Token { - fn to_lexical(&self) -> Bytes { - Bytes::copy_from_slice(self.as_ref().as_bytes()) - } -} + impl> ToLexical<$ty> for T { + fn to_lexical(&self) -> Bytes { + Bytes::copy_from_slice(self.as_ref().as_bytes()) + } + } -impl FromLexical for Token { - fn from_lexical(mut b: B) -> Self { - let mut vec = vec![0; b.remaining()]; - b.copy_to_slice(&mut vec); - Token(String::from_utf8(vec).unwrap()) - } -} + impl FromLexical<$ty> for $ty { + fn from_lexical(mut b: B) -> Self { + let mut vec = vec![0; b.remaining()]; + b.copy_to_slice(&mut vec); + $ty(String::from_utf8(vec).unwrap()) + } + } -impl FromLexical for String { - fn from_lexical(mut b: B) -> Self { - let mut vec = vec![0; b.remaining()]; - b.copy_to_slice(&mut vec); - String::from_utf8(vec).unwrap() - } + impl FromLexical<$ty> for String { + fn from_lexical(mut b: B) -> Self { + let mut vec = vec![0; b.remaining()]; + b.copy_to_slice(&mut vec); + String::from_utf8(vec).unwrap() + } + } + }; } + +stringy_type!(Token); +stringy_type!(LangString); From d34778661af614a6f2a246085f7aa63f7155040d Mon Sep 17 00:00:00 2001 From: Gavin Mendel-Gleason Date: Thu, 8 Dec 2022 13:54:06 +0100 Subject: [PATCH 81/99] Extending types --- src/structure/tfc/datatypes.rs | 72 +++++++++++++++++++++++++++++++++- src/structure/tfc/typed.rs | 39 +++++++++++------- 2 files changed, 95 insertions(+), 16 deletions(-) diff --git a/src/structure/tfc/datatypes.rs b/src/structure/tfc/datatypes.rs index 55f7c2c2..0f87af13 100644 --- a/src/structure/tfc/datatypes.rs +++ b/src/structure/tfc/datatypes.rs @@ -20,8 +20,16 @@ pub enum Datatype { Decimal, BigInt, Boolean, - Token, LangString, + DateTime, + Date, + AnyURI, + Language, + NormalizedString, + Token, + NMToken, + Name, + NCName, } impl Datatype { @@ -47,6 +55,7 @@ impl Datatype { Datatype::BigInt => None, Datatype::Token => None, Datatype::LangString => None, + _ => None, } } } @@ -291,6 +300,16 @@ impl FromLexical for String { } } +/* +impl FromLexical for f64 { + fn from_lexical(b: B) -> Self { + let s = Decimal::from_lexical(b).0; + s.parse::() + .expect("Too much precision for cast from decimal to f64") + } +} +*/ + impl ToLexical for Decimal { fn to_lexical(&self) -> Bytes { Bytes::from(decimal_to_storage(&self.0)) @@ -368,5 +387,54 @@ macro_rules! stringy_type { }; } -stringy_type!(Token); +/* +macro_rules! 
biginty_type { + ($ty:ident) => { + biginty_type!($ty, $ty); + }; + ($ty:ident, $datatype:ident) => { + #[derive(PartialEq, Debug)] + pub struct $ty(Integer); + + impl TdbDataType for $ty { + fn datatype() -> Datatype { + Datatype::$datatype + } + } + + impl FromLexical<$ty> for $ty { + fn from_lexical(mut b: B) -> Self { + $ty(storage_to_bigint(&mut b).to_string()) + } + } + + impl FromLexical<$ty> for String { + fn from_lexical(mut b: B) -> Self { + $ty(storage_to_bigint(&mut b).to_string()) + } + } + + impl ToLexical<$ty> for $ty { + fn to_lexical(&self) -> Bytes { + Bytes::from(bigint_to_storage(self.0.clone())) + } + } + }; +} +*/ + stringy_type!(LangString); +stringy_type!(NCName); +stringy_type!(Name); +stringy_type!(Token); +stringy_type!(NMToken); +stringy_type!(NormalizedString); +stringy_type!(Language); +stringy_type!(AnyURI); + +/* +biginty_type!(PositiveInteger); +biginty_type!(NonNegativeInteger); +biginty_type!(NegativeInteger); +biginty_type!(NonPositiveInteger); +*/ diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index d334d64a..5a5a27fd 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -77,22 +77,20 @@ impl TypedDict { let mut tally: u64 = 0; let mut type_id_offsets = Vec::with_capacity(types_present.len() - 1); for type_offset in type_offsets.iter() { - let last_block_len; - if type_offset == 0 { - last_block_len = parse_block_control_records(data[0]); + let last_block_len = if type_offset == 0 { + parse_block_control_records(data[0]) } else { let last_block_offset_of_previous_type = block_offsets.entry(type_offset as usize - 1); - last_block_len = - parse_block_control_records(data[last_block_offset_of_previous_type as usize]); - } + parse_block_control_records(data[last_block_offset_of_previous_type as usize]) + }; let gap = BLOCK_SIZE as u8 - last_block_len; tally += gap as u64; type_id_offsets.push((type_offset + 1) * 8 - tally); } - let last_gap = if block_offsets.len() == 0 { + let last_gap = if block_offsets.is_empty() { 1 } else { BLOCK_SIZE @@ -100,7 +98,7 @@ impl TypedDict { data[block_offsets.entry(block_offsets.len() - 1) as usize], ) as usize }; - let num_entries = if block_offsets.len() == 0 { + let num_entries = if block_offsets.is_empty() { parse_block_control_records(data[0]) as usize } else { (block_offsets.len() + 1) * BLOCK_SIZE - tally as usize - last_gap @@ -442,7 +440,7 @@ impl TypedDictBufBuilder u64 { - if self.current_datatype == None { + if self.current_datatype.is_none() { self.current_datatype = Some(value.datatype); self.types_present_builder.push(value.datatype as u64); self.sized_dict_buf_builder @@ -576,14 +574,21 @@ mod tests { let mut offsets = BytesMut::new(); let mut data = BytesMut::new(); - build_segment_and_offsets( + build_segment_and_offsets::< + &mut bytes::BytesMut, + &mut bytes::BytesMut, + String, + String, + std::vec::IntoIter, + >( Datatype::String, &mut offsets, &mut data, strings.clone().into_iter(), ); - let segment = TypedDictSegment::parse(offsets.freeze(), data.freeze(), 0); + let segment: TypedDictSegment = + TypedDictSegment::parse(offsets.freeze(), data.freeze(), 0); for (ix, s) in strings.into_iter().enumerate() { assert_eq!(IdLookupResult::Found((ix + 1) as u64), segment.id(&s)); @@ -746,7 +751,7 @@ mod tests { assert_eq!(13, dict.num_entries()); - let id = dict.id(&"Batty".to_string()); + let id = dict.id::(&"Batty".to_string()); assert_eq!(IdLookupResult::Found(2), id); assert_eq!(IdLookupResult::Found(6), dict.id(&20_u32)); assert_eq!(IdLookupResult::Found(7), 
dict.id(&(-500_i32))); @@ -829,8 +834,14 @@ mod tests { assert_eq!(26_u32, dict.get::(14).unwrap()); assert_eq!(Decimal("234.8973".to_string()), dict.get(29).unwrap()); - assert_eq!(IdLookupResult::NotFound, dict.id(&"AAAA".to_string())); - assert_eq!(IdLookupResult::Closest(2), dict.id(&"Baz".to_string())); + assert_eq!( + IdLookupResult::NotFound, + dict.id::(&"AAAA".to_string()) + ); + assert_eq!( + IdLookupResult::Closest(2), + dict.id::(&"Baz".to_string()) + ); assert_eq!(IdLookupResult::Found(17), dict.id(&3000_u32)); From 5e6bb66e1c29db7919cbc14afce193709bce08ec Mon Sep 17 00:00:00 2001 From: Gavin Mendel-Gleason Date: Thu, 8 Dec 2022 16:19:45 +0100 Subject: [PATCH 82/99] Adding date times. --- Cargo.toml | 1 + src/structure/tfc/datatypes.rs | 21 ++++++++++++++ src/structure/tfc/datetime.rs | 53 ++++++++++++++++++++++++++++++++++ src/structure/tfc/decimal.rs | 36 ++++++++++++++--------- src/structure/tfc/mod.rs | 1 + 5 files changed, 98 insertions(+), 14 deletions(-) create mode 100644 src/structure/tfc/datetime.rs diff --git a/Cargo.toml b/Cargo.toml index 51034a67..86f79137 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -29,6 +29,7 @@ itertools = "0.10" rug = {version="1.16", default-features=false, features=["integer","rational"]} num-derive = "0.3" num-traits = "0.2" +chrono = "0.4" [dev-dependencies] tempfile = "3.1" diff --git a/src/structure/tfc/datatypes.rs b/src/structure/tfc/datatypes.rs index 0f87af13..4d1a6137 100644 --- a/src/structure/tfc/datatypes.rs +++ b/src/structure/tfc/datatypes.rs @@ -5,6 +5,7 @@ use super::{ }; use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; +use chrono::{DateTime, Utc}; use num_derive::FromPrimitive; use rug::Integer; @@ -438,3 +439,23 @@ biginty_type!(NonNegativeInteger); biginty_type!(NegativeInteger); biginty_type!(NonPositiveInteger); */ + +/* +impl TdbDataType for DateTime { + fn datatype() -> Datatype { + Datatype::DateTime + } +} + +impl ToLexical> for DateTime { + fn to_lexical(&self) -> Bytes { + Bytes::from(datetime_to_storage(&self)) + } +} + +impl FromLexical> for DateTime { + fn from_lexical(mut b: B) -> Self { + todo!() + } +} +*/ diff --git a/src/structure/tfc/datetime.rs b/src/structure/tfc/datetime.rs new file mode 100644 index 00000000..1ca6459c --- /dev/null +++ b/src/structure/tfc/datetime.rs @@ -0,0 +1,53 @@ +use chrono::{DateTime, Utc}; +use rug::Integer; + +use super::decimal::integer_and_fraction_to_storage; + +fn datetime_to_parts(datetime: &DateTime) -> (bool, Integer, u32) { + let mut seconds = Integer::from(datetime.timestamp()); + let is_neg = seconds < 0; + let mut nanos = datetime.timestamp_subsec_nanos(); + if is_neg { + if nanos != 0 { + seconds += 1; + } + nanos = 1_000_000_000 - nanos; + } + (is_neg, seconds, nanos) +} + +fn datetime_to_storage(datetime: &DateTime) -> Vec { + let (is_neg, seconds, nanos) = datetime_to_parts(datetime); + let fraction = if nanos == 0 { + None + } else if nanos % 1_000_000 == 0 { + Some(format!("{:02}", nanos / 1_000_000)) + } else if nanos % 1_000 == 0 { + Some(format!("{:05}", nanos / 1_000)) + } else { + Some(format!("{:08}", nanos)) + }; + integer_and_fraction_to_storage(is_neg, seconds, fraction.as_ref().map(|b| b.as_ref())) +} + +/* +fn storage_to_datetime(bytes: &mut B) -> DateTime { + let (int, is_pos) = storage_to_bigint_and_sign(bytes); + let fraction = u32::parse(decode_fraction(bytes, is_pos)); + Utc.timestamp(int) + .opt_with_nanoseconds(fraction) +} +*/ + +#[cfg(test)] +mod tests { + use chrono::TimeZone; + + use 
super::*; + + #[test] + fn a_few_nanos_before_epoch() { + let dt = Utc.timestamp_opt(-1, 234).unwrap(); + let result = dbg!(datetime_to_parts(&dt)); + assert_eq!((true, Integer::from(0), 999999766_u32), result) + } +} diff --git a/src/structure/tfc/decimal.rs b/src/structure/tfc/decimal.rs index b7acaf6c..a973577b 100644 --- a/src/structure/tfc/decimal.rs +++ b/src/structure/tfc/decimal.rs @@ -85,8 +85,28 @@ pub fn decimal_to_storage(decimal: &str) -> Vec { let fraction = parts.next(); let integer_part = bigint.parse::().unwrap(); let is_neg = decimal.starts_with('-'); - let prefix = bigint_to_storage(integer_part.clone()); - let mut prefix = if integer_part == 0 && is_neg { + integer_and_fraction_to_storage(is_neg, integer_part, fraction) +} + +pub fn storage_to_decimal(bytes: &mut B) -> String { + let (int, is_pos) = storage_to_bigint_and_sign(bytes); + let fraction = decode_fraction(bytes, is_pos); + let decimal = if fraction.is_empty() { + format!("{int:}") + } else { + let sign = if int == 0 && !is_pos { "-" } else { "" }; + format!("{sign:}{int:}.{fraction:}") + }; + decimal +} + +pub fn integer_and_fraction_to_storage( + is_neg: bool, + integer: Integer, + fraction: Option<&str>, +) -> Vec { + let prefix = bigint_to_storage(integer.clone()); + let mut prefix = if integer == 0 && is_neg { vec![NEGATIVE_ZERO] // negative zero } else { prefix @@ -103,15 +123,3 @@ pub fn decimal_to_storage(decimal: &str) -> Vec { prefix.extend(suffix); prefix } - -pub fn storage_to_decimal(bytes: &mut B) -> String { - let (int, is_pos) = storage_to_bigint_and_sign(bytes); - let fraction = decode_fraction(bytes, is_pos); - let decimal = if fraction.is_empty() { - format!("{int:}") - } else { - let sign = if int == 0 && !is_pos { "-" } else { "" }; - format!("{sign:}{int:}.{fraction:}") - }; - decimal -} diff --git a/src/structure/tfc/mod.rs b/src/structure/tfc/mod.rs index 2c2f120c..c3e07550 100644 --- a/src/structure/tfc/mod.rs +++ b/src/structure/tfc/mod.rs @@ -1,5 +1,6 @@ pub mod block; pub mod datatypes; +pub mod datetime; pub mod decimal; pub mod dict; pub mod file; From d4cef311c93629e439c82ddcfb31074b3c47e0fa Mon Sep 17 00:00:00 2001 From: Gavin Mendel-Gleason Date: Fri, 9 Dec 2022 10:21:17 +0100 Subject: [PATCH 83/99] DateTimes (with very little testing) --- src/structure/tfc/datatypes.rs | 15 +++++----- src/structure/tfc/datetime.rs | 52 +++++++++++++++++++++++++++------- src/structure/tfc/decimal.rs | 2 +- 3 files changed, 49 insertions(+), 20 deletions(-) diff --git a/src/structure/tfc/datatypes.rs b/src/structure/tfc/datatypes.rs index 4d1a6137..3935a86c 100644 --- a/src/structure/tfc/datatypes.rs +++ b/src/structure/tfc/datatypes.rs @@ -1,11 +1,12 @@ use super::{ + datetime::{datetime_to_storage, storage_to_datetime}, decimal::{decimal_to_storage, storage_to_decimal}, integer::{bigint_to_storage, storage_to_bigint}, TypedDictEntry, }; use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; -use chrono::{DateTime, Utc}; +use chrono::NaiveDateTime; use num_derive::FromPrimitive; use rug::Integer; @@ -440,22 +441,20 @@ biginty_type!(NegativeInteger); biginty_type!(NonPositiveInteger); */ -/* -impl TdbDataType for DateTime { +impl TdbDataType for NaiveDateTime { fn datatype() -> Datatype { Datatype::DateTime } } -impl ToLexical> for DateTime { +impl ToLexical for NaiveDateTime { fn to_lexical(&self) -> Bytes { - Bytes::from(datetime_to_storage(&self)) + Bytes::from(datetime_to_storage(self)) } } -impl FromLexical> for DateTime { +impl FromLexical for 
NaiveDateTime { fn from_lexical(mut b: B) -> Self { - todo!() + storage_to_datetime(&mut b) } } -*/ diff --git a/src/structure/tfc/datetime.rs b/src/structure/tfc/datetime.rs index 1ca6459c..99dd1463 100644 --- a/src/structure/tfc/datetime.rs +++ b/src/structure/tfc/datetime.rs @@ -1,9 +1,13 @@ -use chrono::{DateTime, Utc}; +use bytes::Buf; +use chrono::NaiveDateTime; use rug::Integer; -use super::decimal::integer_and_fraction_to_storage; +use super::{ + decimal::{decode_fraction, integer_and_fraction_to_storage}, + integer::storage_to_bigint_and_sign, +}; -fn datetime_to_parts(datetime: &DateTime) -> (bool, Integer, u32) { +pub fn datetime_to_parts(datetime: &NaiveDateTime) -> (bool, Integer, u32) { let mut seconds = Integer::from(datetime.timestamp()); let is_neg = seconds < 0; let mut nanos = datetime.timestamp_subsec_nanos(); @@ -16,7 +20,7 @@ fn datetime_to_parts(datetime: &DateTime) -> (bool, Integer, u32) { (is_neg, seconds, nanos) } -fn datetime_to_storage(datetime: &DateTime) -> Vec { +pub fn datetime_to_storage(datetime: &NaiveDateTime) -> Vec { let (is_neg, seconds, nanos) = datetime_to_parts(datetime); let fraction = if nanos == 0 { None @@ -25,18 +29,44 @@ fn datetime_to_storage(datetime: &DateTime) -> Vec { } else if nanos % 1_000 == 0 { Some(format!("{:05}", nanos / 1_000)) } else { - Some(format!("{:08}", nanos)) + Some(format!("{nanos:08}")) }; integer_and_fraction_to_storage(is_neg, seconds, fraction.as_ref().map(|b| b.as_ref())) } -/* -fn storage_to_datetime(bytes: &mut B) -> DateTime { +fn count_leading_zeros(string: &str) -> usize { + string + .chars() + .take_while(|ch| *ch == '0') + .map(|ch| ch.len_utf8()) + .sum() +} + +pub fn storage_to_datetime(bytes: &mut B) -> NaiveDateTime { let (int, is_pos) = storage_to_bigint_and_sign(bytes); - let fraction = u32::parse(decode_fraction(bytes, is_pos)); - Utc.timestamp(int) + .opt_with_nanoseconds(fraction) + let fraction = decode_fraction(bytes, is_pos); + let seconds = int + .to_i64() + .expect("This is a surprisingly large number of seconds!"); + if fraction.is_empty() { + if is_pos { + NaiveDateTime::from_timestamp_opt(seconds, 0).unwrap() + } else { + NaiveDateTime::from_timestamp_opt(-seconds, 0).unwrap() + } + } else { + let leading_zeros = count_leading_zeros(&fraction); + let nanos = fraction + .parse::() + .expect("Nano seconds should actually fit in u32") + * u32::pow(10, leading_zeros as u32); + if is_pos { + NaiveDateTime::from_timestamp_opt(seconds, nanos).unwrap() + } else { + NaiveDateTime::from_timestamp_opt(seconds - 1, 1_000_000_000 - nanos).unwrap() + } + } } -*/ #[cfg(test)] mod tests { @@ -46,7 +76,7 @@ mod tests { #[test] fn a_few_nanos_before_epoch() { - let dt = Utc.timestamp_opt(-1, 234).unwrap(); + let dt = NaiveDateTime::from_timestamp_opt(-1, 234).unwrap(); let result = dbg!(datetime_to_parts(&dt)); assert_eq!((true, Integer::from(0), 999999766_u32), result) } diff --git a/src/structure/tfc/decimal.rs b/src/structure/tfc/decimal.rs index a973577b..9b26074e 100644 --- a/src/structure/tfc/decimal.rs +++ b/src/structure/tfc/decimal.rs @@ -54,7 +54,7 @@ fn centary_decimal_decode(i: u8) -> String { } } -fn decode_fraction(fraction_buf: &mut B, is_pos: bool) -> String { +pub fn decode_fraction(fraction_buf: &mut B, is_pos: bool) -> String { let mut first_byte = fraction_buf.chunk()[0]; if !is_pos { first_byte = !first_byte; From 56d33101987267b8afa19b3972c49d6ebfccc9b9 Mon Sep 17 00:00:00 2001 From: Gavin Mendel-Gleason Date: Fri, 9 Dec 2022 10:21:31 +0100 Subject: [PATCH 84/99] Remove warning --- 
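A note on the date-time layout these datetime patches settle on: datetime_to_parts normalizes instants before the epoch by carrying one second into the integer part and complementing the nanoseconds, which is why a_few_nanos_before_epoch expects (true, Integer::from(0), 999999766) for one second before the epoch plus 234ns. datetime_to_storage then reuses the sign-aware bigint plus decimal-fraction machinery from decimal.rs, so encoded instants should compare bytewise in chronological order. A rough sketch of the intended roundtrip, offered as an illustration rather than a test from this series:

    use chrono::NaiveDateTime;

    let dt = NaiveDateTime::from_timestamp_opt(-1, 234).unwrap();
    let stored = datetime_to_storage(&dt);
    let mut buf: &[u8] = &stored;
    assert_eq!(dt, storage_to_datetime(&mut buf));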
src/structure/tfc/datetime.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/structure/tfc/datetime.rs b/src/structure/tfc/datetime.rs index 99dd1463..c6374d99 100644 --- a/src/structure/tfc/datetime.rs +++ b/src/structure/tfc/datetime.rs @@ -70,8 +70,6 @@ pub fn storage_to_datetime(bytes: &mut B) -> NaiveDateTime { #[cfg(test)] mod tests { - use chrono::TimeZone; - use super::*; #[test] From 5b7ebed18a40268f4b424284147780e9b9ffd4df Mon Sep 17 00:00:00 2001 From: Gavin Mendel-Gleason Date: Fri, 9 Dec 2022 15:01:07 +0100 Subject: [PATCH 85/99] More types --- src/structure/tfc/datatypes.rs | 291 ++++++++++++++++++++++++++++----- 1 file changed, 249 insertions(+), 42 deletions(-) diff --git a/src/structure/tfc/datatypes.rs b/src/structure/tfc/datatypes.rs index 3935a86c..cacf65d2 100644 --- a/src/structure/tfc/datatypes.rs +++ b/src/structure/tfc/datatypes.rs @@ -6,7 +6,7 @@ use super::{ }; use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; -use chrono::NaiveDateTime; +use chrono::{NaiveDateTime, NaiveTime}; use num_derive::FromPrimitive; use rug::Integer; @@ -23,8 +23,6 @@ pub enum Datatype { BigInt, Boolean, LangString, - DateTime, - Date, AnyURI, Language, NormalizedString, @@ -32,6 +30,33 @@ pub enum Datatype { NMToken, Name, NCName, + Notation, + QName, + ID, + IDRef, + Entity, + PositiveInteger, + NonNegativeInteger, + NonPositiveInteger, + NegativeInteger, + Date, + DateTime, + DateTimeStamp, + Time, + GYear, + GMonth, + GDay, + GYearMonth, + GMonthDay, + Duration, + YearMonthDuration, + DayTimeDuration, + UInt8, + Int8, + UInt16, + Int16, + Base64Binary, + HexBinary, } impl Datatype { @@ -101,6 +126,48 @@ impl TdbDataType for String { } } +impl TdbDataType for u8 { + fn datatype() -> Datatype { + Datatype::UInt32 + } +} + +impl FromLexical for u8 { + fn from_lexical(b: B) -> Self { + b.reader().read_u8().unwrap() + } +} + +impl ToLexical for u8 { + fn to_lexical(&self) -> Bytes { + let mut buf = BytesMut::new().writer(); + buf.write_u8(*self).unwrap(); + + buf.into_inner().freeze() + } +} + +impl TdbDataType for u16 { + fn datatype() -> Datatype { + Datatype::UInt16 + } +} + +impl FromLexical for u16 { + fn from_lexical(b: B) -> Self { + b.reader().read_u16::().unwrap() + } +} + +impl ToLexical for u16 { + fn to_lexical(&self) -> Bytes { + let mut buf = BytesMut::new().writer(); + buf.write_u16::(*self).unwrap(); + + buf.into_inner().freeze() + } +} + impl TdbDataType for u32 { fn datatype() -> Datatype { Datatype::UInt32 @@ -122,6 +189,52 @@ impl ToLexical for u32 { } } +const I8_BYTE_MASK: u8 = 0b1000_0000; +impl TdbDataType for i8 { + fn datatype() -> Datatype { + Datatype::Int8 + } +} + +impl FromLexical for i8 { + fn from_lexical(b: B) -> Self { + let i = b.reader().read_u8().unwrap(); + (I8_BYTE_MASK ^ i) as i8 + } +} + +impl ToLexical for i8 { + fn to_lexical(&self) -> Bytes { + let sign_flip = I8_BYTE_MASK ^ (*self as u8); + let mut buf = BytesMut::new().writer(); + buf.write_u8(sign_flip).unwrap(); + buf.into_inner().freeze() + } +} + +const I16_BYTE_MASK: u16 = 0b1000_0000 << 8; +impl TdbDataType for i16 { + fn datatype() -> Datatype { + Datatype::Int16 + } +} + +impl FromLexical for i16 { + fn from_lexical(b: B) -> Self { + let i = b.reader().read_u16::().unwrap(); + (I16_BYTE_MASK ^ i) as i16 + } +} + +impl ToLexical for i16 { + fn to_lexical(&self) -> Bytes { + let sign_flip = I16_BYTE_MASK ^ (*self as u16); + let mut buf = BytesMut::new().writer(); + buf.write_u16::(sign_flip).unwrap(); + buf.into_inner().freeze() 
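+        // I16_BYTE_MASK flips the sign bit, mapping i16's two's-complement
+        // range onto u16 so the big-endian bytes of the encoding sort
+        // negatives below positives, preserving numeric order.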
+ } +} + const I32_BYTE_MASK: u32 = 0b1000_0000 << (3 * 8); impl TdbDataType for i32 { fn datatype() -> Datatype { @@ -302,16 +415,6 @@ impl FromLexical for String { } } -/* -impl FromLexical for f64 { - fn from_lexical(b: B) -> Self { - let s = Decimal::from_lexical(b).0; - s.parse::() - .expect("Too much precision for cast from decimal to f64") - } -} -*/ - impl ToLexical for Decimal { fn to_lexical(&self) -> Bytes { Bytes::from(decimal_to_storage(&self.0)) @@ -327,11 +430,7 @@ impl TdbDataType for bool { impl FromLexical for bool { fn from_lexical(mut b: B) -> Self { let num = b.get_u8(); - if num == 0 { - false - } else { - true - } + num != 0 } } @@ -345,6 +444,127 @@ impl ToLexical for bool { } } +impl TdbDataType for NaiveDateTime { + fn datatype() -> Datatype { + Datatype::DateTime + } +} + +impl ToLexical for NaiveDateTime { + fn to_lexical(&self) -> Bytes { + Bytes::from(datetime_to_storage(self)) + } +} + +impl FromLexical for NaiveDateTime { + fn from_lexical(mut b: B) -> Self { + storage_to_datetime(&mut b) + } +} + +pub struct DateTimeStamp(NaiveDateTime); + +impl TdbDataType for DateTimeStamp { + fn datatype() -> Datatype { + Datatype::DateTimeStamp + } +} + +impl ToLexical for DateTimeStamp { + fn to_lexical(&self) -> Bytes { + Bytes::from(datetime_to_storage(&self.0)) + } +} + +impl FromLexical for DateTimeStamp { + fn from_lexical(mut b: B) -> Self { + DateTimeStamp(storage_to_datetime(&mut b)) + } +} + +impl TdbDataType for NaiveTime { + fn datatype() -> Datatype { + Datatype::Time + } +} + +impl ToLexical for NaiveTime { + fn to_lexical(&self) -> Bytes { + self.to_string().into() + } +} + +impl FromLexical for NaiveTime { + fn from_lexical(mut b: B) -> Self { + let mut vec = vec![0; b.remaining()]; + b.copy_to_slice(&mut vec); + String::from_utf8(vec) + .unwrap() + .parse::() + .unwrap() + } +} + +struct GYear(i64); + +impl TdbDataType for GYear { + fn datatype() -> Datatype { + Datatype::GYear + } +} + +impl ToLexical for GYear { + fn to_lexical(&self) -> Bytes { + self.0.to_lexical() + } +} + +impl FromLexical for GYear { + fn from_lexical(b: B) -> Self { + GYear(i64::from_lexical(b)) + } +} + +struct GMonth(u8); + +impl TdbDataType for GMonth { + fn datatype() -> Datatype { + Datatype::GMonth + } +} + +impl ToLexical for GMonth { + fn to_lexical(&self) -> Bytes { + self.0.to_lexical() + } +} + +impl FromLexical for GMonth { + fn from_lexical(b: B) -> Self { + GMonth(u8::from_lexical(b)) + } +} + +struct GDay(u8); + +impl TdbDataType for GDay { + fn datatype() -> Datatype { + Datatype::GDay + } +} + +impl ToLexical for GDay { + fn to_lexical(&self) -> Bytes { + self.0.to_lexical() + } +} + +impl FromLexical for GDay { + fn from_lexical(b: B) -> Self { + GDay(u8::from_lexical(b)) + } +} + macro_rules! stringy_type { ($ty:ident) => { stringy_type!($ty, $ty); @@ -389,7 +609,6 @@ macro_rules! stringy_type { }; } -/* macro_rules! biginty_type { ($ty:ident) => { biginty_type!($ty, $ty); @@ -406,13 +625,13 @@ macro_rules! biginty_type { impl FromLexical<$ty> for $ty { fn from_lexical(mut b: B) -> Self { - $ty(storage_to_bigint(&mut b).to_string()) + $ty(storage_to_bigint(&mut b)) } } impl FromLexical<$ty> for String { fn from_lexical(mut b: B) -> Self { - $ty(storage_to_bigint(&mut b).to_string()) + storage_to_bigint(&mut b).to_string() } } @@ -423,7 +642,6 @@ macro_rules! 
biginty_type { } }; } -*/ stringy_type!(LangString); stringy_type!(NCName); @@ -433,28 +651,17 @@ stringy_type!(NMToken); stringy_type!(NormalizedString); stringy_type!(Language); stringy_type!(AnyURI); +stringy_type!(Notation); +stringy_type!(QName); +stringy_type!(ID); +stringy_type!(IDRef); +stringy_type!(Entity); + +stringy_type!(Duration); +stringy_type!(YearMonthDuration); +stringy_type!(DayTimeDuration); -/* biginty_type!(PositiveInteger); biginty_type!(NonNegativeInteger); biginty_type!(NegativeInteger); biginty_type!(NonPositiveInteger); -*/ - -impl TdbDataType for NaiveDateTime { - fn datatype() -> Datatype { - Datatype::DateTime - } -} - -impl ToLexical for NaiveDateTime { - fn to_lexical(&self) -> Bytes { - Bytes::from(datetime_to_storage(self)) - } -} - -impl FromLexical for NaiveDateTime { - fn from_lexical(mut b: B) -> Self { - storage_to_datetime(&mut b) - } -} From e3a1424ee51b6e27fcb70c574671852d6206f8a9 Mon Sep 17 00:00:00 2001 From: Gavin Mendel-Gleason Date: Fri, 9 Dec 2022 15:55:04 +0100 Subject: [PATCH 86/99] Adding more types to store --- src/structure/tfc/datatypes.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/structure/tfc/datatypes.rs b/src/structure/tfc/datatypes.rs index cacf65d2..0ccef4d6 100644 --- a/src/structure/tfc/datatypes.rs +++ b/src/structure/tfc/datatypes.rs @@ -70,7 +70,7 @@ impl Datatype { pub fn record_size(&self) -> Option { match self { - Datatype::Boolean => Some(4), // this is huge + Datatype::Boolean => None, Datatype::String => None, Datatype::UInt32 => Some(4), Datatype::Int32 => Some(4), @@ -505,7 +505,7 @@ impl FromLexical for NaiveTime { } } -struct GYear(i64); +pub struct GYear(pub i64); impl TdbDataType for GYear { fn datatype() -> Datatype { @@ -525,7 +525,7 @@ impl FromLexical for GYear { } } -struct GMonth(u8); +pub struct GMonth(pub u8); impl TdbDataType for GMonth { fn datatype() -> Datatype { @@ -615,7 +615,7 @@ macro_rules! biginty_type { }; ($ty:ident, $datatype:ident) => { #[derive(PartialEq, Debug)] - pub struct $ty(Integer); + pub struct $ty(pub Integer); impl TdbDataType for $ty { fn datatype() -> Datatype { From 59763072be3401c55e6fae51431dc1161500ad3b Mon Sep 17 00:00:00 2001 From: Gavin Mendel-Gleason Date: Fri, 9 Dec 2022 19:51:04 +0100 Subject: [PATCH 87/99] Adding gyear, days, etc. 
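The Gregorian fragment types in this patch each carry their optional timezone offset as signed minutes, appended after the main field with to_lexical so byte order still tracks the natural order. One caveat on the offset_string helper introduced below: for a negative offset, both hours and minutes come out of / and % negative, so format!("-{hours}:{minutes}") renders an offset of -330 as "--5:-30" rather than "-05:30". A corrected sketch, where the names mirror the diff but the zero padding is an assumption rather than something this patch specifies:

    fn offset_string(offset: i16) -> String {
        if offset == 0 {
            return String::new();
        }
        let sign = if offset < 0 { '-' } else { '+' };
        // unsigned_abs sidesteps the doubled minus sign on hours and minutes
        let magnitude = offset.unsigned_abs();
        format!("{sign}{:02}:{:02}", magnitude / 60, magnitude % 60)
    }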
--- src/structure/tfc/datatypes.rs | 164 +++++++++++++++++++++++++++++++-- 1 file changed, 155 insertions(+), 9 deletions(-) diff --git a/src/structure/tfc/datatypes.rs b/src/structure/tfc/datatypes.rs index 0ccef4d6..3d2a7ef4 100644 --- a/src/structure/tfc/datatypes.rs +++ b/src/structure/tfc/datatypes.rs @@ -505,7 +505,10 @@ impl FromLexical for NaiveTime { } } -pub struct GYear(pub i64); +pub struct GYear { + pub year: i64, + pub offset: i16, +} impl TdbDataType for GYear { fn datatype() -> Datatype { @@ -515,17 +518,47 @@ impl TdbDataType for GYear { impl ToLexical for GYear { fn to_lexical(&self) -> Bytes { - self.0.to_lexical() + let year = self.year.to_lexical(); + let offset = self.offset.to_lexical(); + [year, offset].concat().into() } } impl FromLexical for GYear { + fn from_lexical(mut b: B) -> Self { + let year = i64::from_lexical(&mut b); + let offset = i16::from_lexical(b); + GYear { year, offset } + } +} + +fn offset_string(offset: i16) -> String { + if offset == 0 { + "".to_string() + } else { + let hours = offset / 60; + let minutes = offset % 60; + if hours < 0 { + format!("-{hours}:{minutes}") + } else { + format!("+{hours}:{minutes}") + } + } +} + +impl FromLexical for String { fn from_lexical(b: B) -> Self { - GYear(i64::from_lexical(b)) + let gyear = GYear::from_lexical(b); + let year = gyear.year; + let offset = offset_string(gyear.offset); + format!("{year:04}{offset:}") } } -pub struct GMonth(pub u8); +pub struct GMonth { + month: u8, + offset: i16, +} impl TdbDataType for GMonth { fn datatype() -> Datatype { @@ -535,17 +568,33 @@ impl TdbDataType for GMonth { impl ToLexical for GMonth { fn to_lexical(&self) -> Bytes { - self.0.to_lexical() + let month = self.month.to_lexical(); + let offset = self.offset.to_lexical(); + [month, offset].concat().into() } } impl FromLexical for GMonth { + fn from_lexical(mut b: B) -> Self { + let month = u8::from_lexical(&mut b); + let offset = i16::from_lexical(b); + GMonth { month, offset } + } +} + +impl FromLexical for String { fn from_lexical(b: B) -> Self { - GMonth(u8::from_lexical(b)) + let gmonth = GMonth::from_lexical(b); + let month = gmonth.month; + let offset = offset_string(gmonth.offset); + format!("-{month:02}{offset:}") } } -struct GDay(u8); +struct GDay { + day: u8, + offset: i16, +} impl TdbDataType for GDay { fn datatype() -> Datatype { @@ -555,13 +604,110 @@ impl TdbDataType for GDay { impl ToLexical for GDay { fn to_lexical(&self) -> Bytes { - self.0.to_lexical() + let day = self.day.to_lexical(); + let offset = self.offset.to_lexical(); + [day, offset].concat().into() } } impl FromLexical for GDay { + fn from_lexical(mut b: B) -> Self { + let day = u8::from_lexical(&mut b); + let offset = i16::from_lexical(b); + GDay { day, offset } + } +} + +impl FromLexical for String { + fn from_lexical(b: B) -> Self { + let gday = GDay::from_lexical(b); + let day = gday.day; + let offset = offset_string(gday.offset); + format!("--{day:02}{offset:}") + } +} + +struct GYearMonth { + year: i64, + month: u8, + offset: i16, +} + +impl TdbDataType for GYearMonth { + fn datatype() -> Datatype { + Datatype::GYearMonth + } +} + +impl ToLexical for GYearMonth { + fn to_lexical(&self) -> Bytes { + let year = self.year.to_lexical(); + let month = self.month.to_lexical(); + let offset = self.offset.to_lexical(); + [year, month, offset].concat().into() + } +} + +impl FromLexical for GYearMonth { + fn from_lexical(mut b: B) -> Self { + let year = i64::from_lexical(&mut b); + let month = u8::from_lexical(&mut b); + let offset = 
i16::from_lexical(b); + GYearMonth { + year, + month, + offset, + } + } +} + +impl FromLexical for String { + fn from_lexical(b: B) -> Self { + let gyearmonth = GYearMonth::from_lexical(b); + let year = gyearmonth.year; + let month = gyearmonth.month; + let offset = offset_string(gyearmonth.offset); + format!("{year:04}-{month:02}{offset:}") + } +} + +struct GMonthDay { + month: u8, + day: u8, + offset: i16, +} + +impl TdbDataType for GMonthDay { + fn datatype() -> Datatype { + Datatype::GMonthDay + } +} + +impl ToLexical for GMonthDay { + fn to_lexical(&self) -> Bytes { + let month = self.month.to_lexical(); + let day = self.day.to_lexical(); + let offset = self.offset.to_lexical(); + [month, day, offset].concat().into() + } +} + +impl FromLexical for GMonthDay { + fn from_lexical(mut b: B) -> Self { + let month = u8::from_lexical(&mut b); + let day = u8::from_lexical(&mut b); + let offset = i16::from_lexical(b); + GMonthDay { month, day, offset } + } +} + +impl FromLexical for String { fn from_lexical(b: B) -> Self { - GDay(u8::from_lexical(b)) + let gmonthday = GMonthDay::from_lexical(b); + let month = gmonthday.month; + let day = gmonthday.day; + let offset = offset_string(gmonthday.offset); + format!("-{month:02}-{day:02}{offset:}") } } From 5d2b3566216db76861d3e41efb961f485aff1c51 Mon Sep 17 00:00:00 2001 From: Gavin Mendel-Gleason Date: Fri, 9 Dec 2022 20:05:52 +0100 Subject: [PATCH 88/99] Make acccessors public --- src/structure/tfc/datatypes.rs | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/structure/tfc/datatypes.rs b/src/structure/tfc/datatypes.rs index 3d2a7ef4..e51c604b 100644 --- a/src/structure/tfc/datatypes.rs +++ b/src/structure/tfc/datatypes.rs @@ -556,8 +556,8 @@ impl FromLexical for String { } pub struct GMonth { - month: u8, - offset: i16, + pub month: u8, + pub offset: i16, } impl TdbDataType for GMonth { @@ -591,9 +591,9 @@ impl FromLexical for String { } } -struct GDay { - day: u8, - offset: i16, +pub struct GDay { + pub day: u8, + pub offset: i16, } impl TdbDataType for GDay { @@ -627,10 +627,10 @@ impl FromLexical for String { } } -struct GYearMonth { - year: i64, - month: u8, - offset: i16, +pub struct GYearMonth { + pub year: i64, + pub month: u8, + pub offset: i16, } impl TdbDataType for GYearMonth { @@ -671,10 +671,10 @@ impl FromLexical for String { } } -struct GMonthDay { - month: u8, - day: u8, - offset: i16, +pub struct GMonthDay { + pub month: u8, + pub day: u8, + pub offset: i16, } impl TdbDataType for GMonthDay { From 9dd5f8e69df6eda244624d36c5874176bfee229b Mon Sep 17 00:00:00 2001 From: Gavin Mendel-Gleason Date: Sat, 10 Dec 2022 01:57:30 +0100 Subject: [PATCH 89/99] Add some more datatypes --- Cargo.toml | 2 + src/structure/tfc/datatypes.rs | 125 ++++++++++++++++++++++++++++++++- 2 files changed, 126 insertions(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 86f79137..b77f75be 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -30,6 +30,8 @@ rug = {version="1.16", default-features=false, features=["integer","rational"]} num-derive = "0.3" num-traits = "0.2" chrono = "0.4" +base64 = "0.13" +hex = "0.4" [dev-dependencies] tempfile = "3.1" diff --git a/src/structure/tfc/datatypes.rs b/src/structure/tfc/datatypes.rs index e51c604b..d84ddf8f 100644 --- a/src/structure/tfc/datatypes.rs +++ b/src/structure/tfc/datatypes.rs @@ -4,6 +4,7 @@ use super::{ integer::{bigint_to_storage, storage_to_bigint}, TypedDictEntry, }; +use base64::display::Base64Display; use byteorder::{BigEndian, ReadBytesExt, 
WriteBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use chrono::{NaiveDateTime, NaiveTime}; @@ -462,7 +463,7 @@ impl FromLexical for NaiveDateTime { } } -pub struct DateTimeStamp(NaiveDateTime); +pub struct DateTimeStamp(pub NaiveDateTime); impl TdbDataType for DateTimeStamp { fn datatype() -> Datatype { @@ -482,6 +483,12 @@ impl FromLexical for DateTimeStamp { } } +impl FromLexical for NaiveDateTime { + fn from_lexical(mut b: B) -> Self { + storage_to_datetime(&mut b) + } +} + impl TdbDataType for NaiveTime { fn datatype() -> Datatype { Datatype::Time @@ -505,6 +512,55 @@ impl FromLexical for NaiveTime { } } +pub struct Date { + pub year: i64, + pub month: u8, + pub day: u8, + pub offset: i16, +} + +impl TdbDataType for Date { + fn datatype() -> Datatype { + Datatype::Date + } +} + +impl ToLexical for Date { + fn to_lexical(&self) -> Bytes { + let year = self.year.to_lexical(); + let month = self.month.to_lexical(); + let day = self.month.to_lexical(); + let offset = self.offset.to_lexical(); + [year, month, day, offset].concat().into() + } +} + +impl FromLexical for Date { + fn from_lexical(mut b: B) -> Self { + let year = i64::from_lexical(&mut b); + let month = u8::from_lexical(&mut b); + let day = u8::from_lexical(&mut b); + let offset = i16::from_lexical(b); + Date { + year, + month, + day, + offset, + } + } +} + +impl FromLexical for String { + fn from_lexical(b: B) -> Self { + let date = Date::from_lexical(b); + let year = date.year; + let month = date.month; + let day = date.day; + let offset = offset_string(date.offset); + format!("{year:04}-{month:02}-{day:02}{offset:}") + } +} + pub struct GYear { pub year: i64, pub offset: i16, @@ -711,6 +767,67 @@ impl FromLexical for String { } } +pub struct Base64Binary(pub Vec); + +impl ToLexical for Base64Binary { + fn to_lexical(&self) -> Bytes { + Bytes::copy_from_slice(&self.0[..]) + } +} + +impl FromLexical for Base64Binary { + fn from_lexical(mut b: B) -> Self { + let mut vec = vec![0; b.remaining()]; + b.copy_to_slice(&mut vec); + Base64Binary(vec) + } +} + +impl FromLexical for String { + fn from_lexical(mut b: B) -> Self { + let mut vec = vec![0; b.remaining()]; + b.copy_to_slice(&mut vec); + let wrapper = Base64Display::with_config(&vec, base64::STANDARD); + format!("{wrapper}") + } +} + +impl TdbDataType for Base64Binary { + fn datatype() -> Datatype { + Datatype::Base64Binary + } +} + +pub struct HexBinary(pub Vec); + +impl ToLexical for HexBinary { + fn to_lexical(&self) -> Bytes { + Bytes::copy_from_slice(&self.0[..]) + } +} + +impl FromLexical for HexBinary { + fn from_lexical(mut b: B) -> Self { + let mut vec = vec![0; b.remaining()]; + b.copy_to_slice(&mut vec); + HexBinary(vec) + } +} + +impl FromLexical for String { + fn from_lexical(mut b: B) -> Self { + let mut vec = vec![0; b.remaining()]; + b.copy_to_slice(&mut vec); + hex::encode(vec) + } +} + +impl TdbDataType for HexBinary { + fn datatype() -> Datatype { + Datatype::HexBinary + } +} + macro_rules! stringy_type { ($ty:ident) => { stringy_type!($ty, $ty); @@ -781,6 +898,12 @@ macro_rules! 
biginty_type { } } + impl FromLexical<$ty> for Integer { + fn from_lexical(mut b: B) -> Self { + storage_to_bigint(&mut b) + } + } + impl ToLexical<$ty> for $ty { fn to_lexical(&self) -> Bytes { Bytes::from(bigint_to_storage(self.0.clone())) From 9ceb73e899a17ed3e4a23f72a34724ea2e1ee655 Mon Sep 17 00:00:00 2001 From: Gavin Mendel-Gleason Date: Sat, 10 Dec 2022 10:24:12 +0100 Subject: [PATCH 90/99] Typo bug --- src/structure/tfc/datatypes.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/structure/tfc/datatypes.rs b/src/structure/tfc/datatypes.rs index d84ddf8f..469ac7cb 100644 --- a/src/structure/tfc/datatypes.rs +++ b/src/structure/tfc/datatypes.rs @@ -529,7 +529,7 @@ impl ToLexical for Date { fn to_lexical(&self) -> Bytes { let year = self.year.to_lexical(); let month = self.month.to_lexical(); - let day = self.month.to_lexical(); + let day = self.day.to_lexical(); let offset = self.offset.to_lexical(); [year, month, day, offset].concat().into() } From dc6cec6ad9697fd7561f2969cd180e2ee5a39b6e Mon Sep 17 00:00:00 2001 From: Gavin Mendel-Gleason Date: Sat, 10 Dec 2022 18:51:54 +0100 Subject: [PATCH 91/99] Typo! --- src/structure/tfc/datatypes.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/structure/tfc/datatypes.rs b/src/structure/tfc/datatypes.rs index 469ac7cb..dc53fc6b 100644 --- a/src/structure/tfc/datatypes.rs +++ b/src/structure/tfc/datatypes.rs @@ -129,7 +129,7 @@ impl TdbDataType for String { impl TdbDataType for u8 { fn datatype() -> Datatype { - Datatype::UInt32 + Datatype::UInt8 } } From f1557d85c187b2aee4adc788a1c8787dbc9359d8 Mon Sep 17 00:00:00 2001 From: Gavin Mendel-Gleason Date: Sun, 11 Dec 2022 00:01:15 +0100 Subject: [PATCH 92/99] Add any simple type --- src/structure/tfc/datatypes.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/structure/tfc/datatypes.rs b/src/structure/tfc/datatypes.rs index dc53fc6b..78c0b6a0 100644 --- a/src/structure/tfc/datatypes.rs +++ b/src/structure/tfc/datatypes.rs @@ -58,6 +58,7 @@ pub enum Datatype { Int16, Base64Binary, HexBinary, + AnySimpleType, } impl Datatype { @@ -930,7 +931,10 @@ stringy_type!(Duration); stringy_type!(YearMonthDuration); stringy_type!(DayTimeDuration); +stringy_type!(AnySimpleType); + biginty_type!(PositiveInteger); biginty_type!(NonNegativeInteger); biginty_type!(NegativeInteger); biginty_type!(NonPositiveInteger); + From ce4d31e1b4fade4623b954418012f0a8894d0ee0 Mon Sep 17 00:00:00 2001 From: Gavin Mendel-Gleason Date: Sun, 11 Dec 2022 00:05:28 +0100 Subject: [PATCH 93/99] Fix tests --- src/structure/tfc/typed.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index 5a5a27fd..6bea16ac 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -799,7 +799,7 @@ mod tests { Decimal::make_entry(&Decimal("2".to_string())), Decimal::make_entry(&Decimal("0".to_string())), f32::make_entry(&4.389832_f32), - f32::make_entry(&23434.389832_f32), + f32::make_entry(&23434.389_f32), Integer::make_entry(&int("239487329872343987")), ]; vec.sort(); @@ -995,7 +995,7 @@ mod tests { Decimal::make_entry(&Decimal("2".to_string())), Decimal::make_entry(&Decimal("0".to_string())), f32::make_entry(&4.389832_f32), - f32::make_entry(&23434.389832_f32), + f32::make_entry(&23434.389_f32), Integer::make_entry(&int("239487329872343987")), ]; vec.sort(); @@ -1040,7 +1040,7 @@ mod tests { u32::make_entry(&20_u32), i64::make_entry(&-3_i64), 
From ce4d31e1b4fade4623b954418012f0a8894d0ee0 Mon Sep 17 00:00:00 2001
From: Gavin Mendel-Gleason
Date: Sun, 11 Dec 2022 00:05:28 +0100
Subject: [PATCH 93/99] Fix tests

---
 src/structure/tfc/typed.rs | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs
index 5a5a27fd..6bea16ac 100644
--- a/src/structure/tfc/typed.rs
+++ b/src/structure/tfc/typed.rs
@@ -799,7 +799,7 @@ mod tests {
         Decimal::make_entry(&Decimal("2".to_string())),
         Decimal::make_entry(&Decimal("0".to_string())),
         f32::make_entry(&4.389832_f32),
-        f32::make_entry(&23434.389832_f32),
+        f32::make_entry(&23434.389_f32),
         Integer::make_entry(&int("239487329872343987")),
     ];
     vec.sort();
@@ -995,7 +995,7 @@ mod tests {
         Decimal::make_entry(&Decimal("2".to_string())),
         Decimal::make_entry(&Decimal("0".to_string())),
         f32::make_entry(&4.389832_f32),
-        f32::make_entry(&23434.389832_f32),
+        f32::make_entry(&23434.389_f32),
         Integer::make_entry(&int("239487329872343987")),
     ];
     vec.sort();
@@ -1040,7 +1040,7 @@ mod tests {
         u32::make_entry(&20_u32),
         i64::make_entry(&-3_i64),
         Decimal::make_entry(&Decimal("-12342343.2348973".to_string())),
-        f32::make_entry(&23434.389832_f32),
+        f32::make_entry(&23434.389_f32),
         Integer::make_entry(&int("239487329872343987")),
     ];
     vec.sort();
@@ -1187,8 +1187,8 @@ mod tests {
             data.freeze(),
         );
 
-        for i in 0..vec.len() {
-            assert_eq!(vec[i], dict.entry(i + 1).unwrap())
+        for (i, e) in vec.into_iter().enumerate() {
+            assert_eq!(e, dict.entry(i + 1).unwrap())
         }
     }
 }

From 1e582626895856db4e7d9bf3d8e71846465f1abc Mon Sep 17 00:00:00 2001
From: Gavin Mendel-Gleason
Date: Sun, 11 Dec 2022 11:35:29 +0100
Subject: [PATCH 94/99] Create fake f32

---
 src/structure/tfc/datatypes.rs | 30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/src/structure/tfc/datatypes.rs b/src/structure/tfc/datatypes.rs
index 78c0b6a0..c43522ee 100644
--- a/src/structure/tfc/datatypes.rs
+++ b/src/structure/tfc/datatypes.rs
@@ -370,6 +370,35 @@ impl ToLexical for f64 {
     }
 }
 
+// Fake f32s to avoid rounding errors
+#[derive(PartialEq, Debug)]
+pub struct Float32(pub f64);
+
+impl TdbDataType for Float32 {
+    fn datatype() -> Datatype {
+        Datatype::Float32
+    }
+}
+
+impl FromLexical<Float32> for Float32 {
+    fn from_lexical<B: Buf>(b: B) -> Self {
+        Float32(FromLexical::<f64>::from_lexical(b))
+    }
+}
+
+impl FromLexical<Float32> for f64 {
+    fn from_lexical<B: Buf>(b: B) -> Self {
+        // TODO make this better
+        Float32::from_lexical(b).0
+    }
+}
+
+impl ToLexical<Float32> for Float32 {
+    fn to_lexical(&self) -> Bytes {
+        ToLexical::<f64>::to_lexical(&self.0)
+    }
+}
+
 impl TdbDataType for Integer {
     fn datatype() -> Datatype {
         Datatype::BigInt

From f6a107ee7cd41e88701671530d1f1781d501cbfc Mon Sep 17 00:00:00 2001
From: Gavin Mendel-Gleason
Date: Mon, 12 Dec 2022 11:39:51 +0100
Subject: [PATCH 95/99] Add back f32s with a cast to f64

---
 src/structure/tfc/datatypes.rs | 35 ++++++-----------------------------
 1 file changed, 6 insertions(+), 29 deletions(-)

diff --git a/src/structure/tfc/datatypes.rs b/src/structure/tfc/datatypes.rs
index c43522ee..37568c54 100644
--- a/src/structure/tfc/datatypes.rs
+++ b/src/structure/tfc/datatypes.rs
@@ -323,6 +323,12 @@ impl FromLexical for f32 {
     }
 }
 
+impl FromLexical<f32> for f64 {
+    fn from_lexical<B: Buf>(b: B) -> Self {
+        f32::from_lexical(b) as f64
+    }
+}
+
 impl ToLexical<f32> for f32 {
     fn to_lexical(&self) -> Bytes {
         let f = *self;
@@ -370,35 +376,6 @@ impl ToLexical for f64 {
     }
 }
 
-// Fake f32s to avoid rounding errors
-#[derive(PartialEq, Debug)]
-pub struct Float32(pub f64);
-
-impl TdbDataType for Float32 {
-    fn datatype() -> Datatype {
-        Datatype::Float32
-    }
-}
-
-impl FromLexical<Float32> for Float32 {
-    fn from_lexical<B: Buf>(b: B) -> Self {
-        Float32(FromLexical::<f64>::from_lexical(b))
-    }
-}
-
-impl FromLexical<Float32> for f64 {
-    fn from_lexical<B: Buf>(b: B) -> Self {
-        // TODO make this better
-        Float32::from_lexical(b).0
-    }
-}
-
-impl ToLexical<Float32> for Float32 {
-    fn to_lexical(&self) -> Bytes {
-        ToLexical::<f64>::to_lexical(&self.0)
-    }
-}
-
 impl TdbDataType for Integer {
     fn datatype() -> Datatype {
         Datatype::BigInt
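[Annotation, not part of the patch series: the detour through a `Float32(f64)` wrapper in PATCH 94 and back out in PATCH 95 hinges on a numeric fact worth stating. Widening an f32 to f64 with `as` is exact, since every f32 value is representable as an f64, so `f32::from_lexical(b) as f64` loses nothing; only the reverse narrowing rounds. A standalone check, my illustration rather than project code:

    fn main() {
        for &f in &[0.1_f32, 23434.389_f32, f32::MIN_POSITIVE, f32::MAX] {
            let widened = f as f64;        // exact: no rounding occurs
            assert_eq!(widened as f32, f); // round-trips to the same f32
        }
        // Narrowing is the lossy direction the wrapper tried to dodge:
        // a typical f64 does not survive a trip through f32.
        assert_ne!((0.1_f64 as f32) as f64, 0.1_f64);
        println!("f32 -> f64 widening is lossless");
    }
]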
From caedc17e81f101905b17d961ceae2b7d59a8d4c9 Mon Sep 17 00:00:00 2001
From: Gavin Mendel-Gleason
Date: Mon, 12 Dec 2022 12:57:51 +0100
Subject: [PATCH 96/99] Adding durations

---
 src/structure/tfc/datatypes.rs | 155 ++++++++++++++++++++++++++++++++-
 1 file changed, 151 insertions(+), 4 deletions(-)

diff --git a/src/structure/tfc/datatypes.rs b/src/structure/tfc/datatypes.rs
index 37568c54..54e36259 100644
--- a/src/structure/tfc/datatypes.rs
+++ b/src/structure/tfc/datatypes.rs
@@ -774,6 +774,157 @@ impl FromLexical for String {
     }
 }
 
+pub struct Duration {
+    pub sign: i8,
+    pub year: i64,
+    pub month: u8,
+    pub day: u8,
+    pub hour: u8,
+    pub minute: u8,
+    pub second: u8,
+}
+
+impl TdbDataType for Duration {
+    fn datatype() -> Datatype {
+        Datatype::Duration
+    }
+}
+
+impl ToLexical<Duration> for Duration {
+    fn to_lexical(&self) -> Bytes {
+        let sign = self.sign.to_lexical();
+        let year = self.year.to_lexical();
+        let month = self.month.to_lexical();
+        let day = self.day.to_lexical();
+        let hour = self.hour.to_lexical();
+        let minute = self.minute.to_lexical();
+        let second = self.second.to_lexical();
+        [sign, year, month, day, hour, minute, second]
+            .concat()
+            .into()
+    }
+}
+
+impl FromLexical<Duration> for Duration {
+    fn from_lexical<B: Buf>(mut b: B) -> Self {
+        let sign = i8::from_lexical(&mut b);
+        let year = i64::from_lexical(&mut b);
+        let month = u8::from_lexical(&mut b);
+        let day = u8::from_lexical(&mut b);
+        let hour = u8::from_lexical(&mut b);
+        let minute = u8::from_lexical(&mut b);
+        let second = u8::from_lexical(b);
+        Duration {
+            sign,
+            year,
+            month,
+            day,
+            hour,
+            minute,
+            second,
+        }
+    }
+}
+
+fn duration_string(duration: &Duration) -> String {
+    let year = if duration.year == 0 {
+        format!("{:04}Y", duration.year)
+    } else {
+        "".to_string()
+    };
+    let month = if duration.month == 0 {
+        format!("{:02}M", duration.month)
+    } else {
+        "".to_string()
+    };
+    let day = if duration.day == 0 {
+        format!("{:04}D", duration.year)
+    } else {
+        "".to_string()
+    };
+    if duration.hour == 0 && duration.minute == 0 && duration.second == 0 {
+        format!("P{year}{month}{day}")
+    } else {
+        let hour = if duration.hour == 0 {
+            format!("{:02}H", duration.hour)
+        } else {
+            "".to_string()
+        };
+        let minute = if duration.minute == 0 {
+            format!("{:02}M", duration.minute)
+        } else {
+            "".to_string()
+        };
+        let second = if duration.second == 0 {
+            format!("{:02}S", duration.second)
+        } else {
+            "".to_string()
+        };
+        format!("{year}{month}{day}T{hour}{minute}{second}")
+    }
+}
+
+impl FromLexical<Duration> for String {
+    fn from_lexical<B: Buf>(b: B) -> Self {
+        let duration = Duration::from_lexical(b);
+        duration_string(&duration)
+    }
+}
+
+pub struct YearMonthDuration(pub Duration);
+
+impl TdbDataType for YearMonthDuration {
+    fn datatype() -> Datatype {
+        Datatype::YearMonthDuration
+    }
+}
+
+impl ToLexical<YearMonthDuration> for YearMonthDuration {
+    fn to_lexical(&self) -> Bytes {
+        Duration::to_lexical(&self.0)
+    }
+}
+
+impl FromLexical<YearMonthDuration> for YearMonthDuration {
+    fn from_lexical<B: Buf>(b: B) -> Self {
+        YearMonthDuration(Duration::from_lexical(b))
+    }
+}
+
+impl FromLexical<YearMonthDuration> for String {
+    fn from_lexical<B: Buf>(b: B) -> Self {
+        let duration = Duration::from_lexical(b);
+        duration_string(&duration)
+    }
+}
+
+pub struct DayTimeDuration(pub Duration);
+
+impl TdbDataType for DayTimeDuration {
+    fn datatype() -> Datatype {
+        Datatype::DayTimeDuration
+    }
+}
+
+impl ToLexical<DayTimeDuration> for DayTimeDuration {
+    fn to_lexical(&self) -> Bytes {
+        Duration::to_lexical(&self.0)
+    }
+}
+
+impl FromLexical<DayTimeDuration> for DayTimeDuration {
+    fn from_lexical<B: Buf>(b: B) -> Self {
+        DayTimeDuration(Duration::from_lexical(b))
+    }
+}
+
+impl FromLexical<DayTimeDuration> for String {
+    fn from_lexical<B: Buf>(b: B) -> Self {
+        let duration = Duration::from_lexical(b);
+        duration_string(&duration)
+    }
+}
+
 pub struct Base64Binary(pub Vec<u8>);
 
 impl ToLexical<Base64Binary> for Base64Binary {
     fn to_lexical(&self) -> Bytes {
         Bytes::copy_from_slice(&self.0[..])
@@ -933,10 +1084,6 @@ stringy_type!(ID);
 stringy_type!(IDRef);
 stringy_type!(Entity);
 
-stringy_type!(Duration);
-stringy_type!(YearMonthDuration);
-stringy_type!(DayTimeDuration);
-
 stringy_type!(AnySimpleType);
 
 biginty_type!(PositiveInteger);
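[Annotation, not part of the patch series: PATCH 96 serializes a duration as sign, then year, month, day, hour, minute, second, each through its field type's `to_lexical`. Assuming those field encodings are the order-preserving fixed-width ones used elsewhere in the series, putting the sign byte first would make all negative durations sort before all positive ones, with larger units dominating after that; this is my reading of the field order, illustrated by a standalone miniature over just (sign, year), with an `encode` helper of my own naming:

    fn encode(sign: i8, year: i64) -> Vec<u8> {
        let mut out = Vec::new();
        out.push((sign as u8) ^ 0x80); // sign-flipped i8 comes first
        out.extend_from_slice(&((year as u64) ^ (1u64 << 63)).to_be_bytes());
        out
    }

    fn main() {
        let a = encode(-1, 5); // roughly "-P5Y"
        let b = encode(1, 2);  // roughly "P2Y"
        let c = encode(1, 10); // roughly "P10Y"
        assert!(a < b && b < c); // byte order tracks duration order
        println!("sign-first layout keeps negatives before positives");
    }
]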
From 5aa62c83bae40d1ce389b67ff0a74dee0e2ba3e2 Mon Sep 17 00:00:00 2001
From: Gavin Mendel-Gleason
Date: Tue, 13 Dec 2022 00:35:04 +0100
Subject: [PATCH 97/99] Adding string casts

---
 src/structure/tfc/datatypes.rs | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/src/structure/tfc/datatypes.rs b/src/structure/tfc/datatypes.rs
index 54e36259..7a4cc9ff 100644
--- a/src/structure/tfc/datatypes.rs
+++ b/src/structure/tfc/datatypes.rs
@@ -470,6 +470,13 @@ impl FromLexical for NaiveDateTime {
     }
 }
 
+impl FromLexical<NaiveDateTime> for String {
+    fn from_lexical<B: Buf>(mut b: B) -> Self {
+        let ndt = storage_to_datetime(&mut b);
+        ndt.format("%Y-%m-%dT%H:%M:%S%.fZ").to_string()
+    }
+}
+
 pub struct DateTimeStamp(pub NaiveDateTime);
 
 impl TdbDataType for DateTimeStamp {
     fn datatype() -> Datatype {
@@ -496,6 +503,13 @@ impl FromLexical for DateTimeStamp {
     }
 }
 
+impl FromLexical<DateTimeStamp> for String {
+    fn from_lexical<B: Buf>(mut b: B) -> Self {
+        let ndt = storage_to_datetime(&mut b);
+        ndt.format("%Y-%m-%dT%H:%M:%S%.fZ").to_string()
+    }
+}
+
 impl TdbDataType for NaiveTime {
     fn datatype() -> Datatype {
         Datatype::Time
@@ -519,6 +533,13 @@ impl FromLexical for NaiveTime {
     }
 }
 
+impl FromLexical<NaiveTime> for String {
+    fn from_lexical<B: Buf>(mut b: B) -> Self {
+        let ndt = NaiveTime::from_lexical(&mut b);
+        ndt.format("%H:%M:%S%.fZ").to_string()
+    }
+}
+
 pub struct Date {
     pub year: i64,
     pub month: u8,

From 07d4f1292b59a9753e9d961689a624c009a7736a Mon Sep 17 00:00:00 2001
From: Matthijs van Otterdijk
Date: Tue, 13 Dec 2022 11:01:20 +0100
Subject: [PATCH 98/99] fix test

---
 src/structure/tfc/typed.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs
index 6bea16ac..651b9a2e 100644
--- a/src/structure/tfc/typed.rs
+++ b/src/structure/tfc/typed.rs
@@ -681,7 +681,7 @@ mod tests {
         cycle(f64::NEG_INFINITY);
         cycle(f64::INFINITY);
 
-        let j = f64::from_lexical(f64::NAN.to_lexical());
+        let j = <f64 as FromLexical<f64>>::from_lexical(f64::NAN.to_lexical());
         assert!(j.is_nan())
     }
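[Annotation, not part of the patch series: the one-line fix in PATCH 98 is a consequence of PATCH 95. Once f64 has two impls, `FromLexical<f64>` and `FromLexical<f32>`, a bare `f64::from_lexical(...)` matches both and no longer compiles, so the test has to name the impl with fully qualified syntax. A standalone reproduction with a simplified trait; dropping `Buf` for plain byte slices is my simplification:

    trait FromLexical<T> {
        fn from_lexical(bytes: &[u8]) -> Self;
    }

    impl FromLexical<f64> for f64 {
        fn from_lexical(bytes: &[u8]) -> Self {
            f64::from_be_bytes(bytes[..8].try_into().unwrap())
        }
    }

    impl FromLexical<f32> for f64 {
        fn from_lexical(bytes: &[u8]) -> Self {
            f32::from_be_bytes(bytes[..4].try_into().unwrap()) as f64
        }
    }

    fn main() {
        let bytes = 1.5_f64.to_be_bytes();
        // `f64::from_lexical(&bytes)` errors here: multiple applicable items.
        let x = <f64 as FromLexical<f64>>::from_lexical(&bytes);
        assert_eq!(x, 1.5);
        println!("disambiguated with fully qualified syntax");
    }
]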
From eb31c89c1089261836e17004e0e3473877e14954 Mon Sep 17 00:00:00 2001
From: Gavin Mendel-Gleason
Date: Tue, 13 Dec 2022 11:13:34 +0100
Subject: [PATCH 99/99] Fix date types

---
 src/structure/tfc/datatypes.rs | 36 +++++++++++++++---------------------
 1 file changed, 15 insertions(+), 21 deletions(-)

diff --git a/src/structure/tfc/datatypes.rs b/src/structure/tfc/datatypes.rs
index 7a4cc9ff..da9d2c81 100644
--- a/src/structure/tfc/datatypes.rs
+++ b/src/structure/tfc/datatypes.rs
@@ -497,12 +497,6 @@ impl FromLexical for DateTimeStamp {
     }
 }
 
-impl FromLexical<DateTimeStamp> for NaiveDateTime {
-    fn from_lexical<B: Buf>(mut b: B) -> Self {
-        storage_to_datetime(&mut b)
-    }
-}
-
 impl FromLexical<DateTimeStamp> for String {
     fn from_lexical<B: Buf>(mut b: B) -> Self {
         let ndt = storage_to_datetime(&mut b);
@@ -671,7 +665,7 @@ impl FromLexical for String {
         let gmonth = GMonth::from_lexical(b);
         let month = gmonth.month;
         let offset = offset_string(gmonth.offset);
-        format!("-{month:02}{offset:}")
+        format!("--{month:02}{offset:}")
     }
 }
 
@@ -707,7 +701,7 @@ impl FromLexical for String {
         let gday = GDay::from_lexical(b);
         let day = gday.day;
         let offset = offset_string(gday.offset);
-        format!("--{day:02}{offset:}")
+        format!("---{day:02}{offset:}")
     }
 }
 
@@ -848,40 +842,40 @@ impl FromLexical for Duration {
     }
 }
 
 fn duration_string(duration: &Duration) -> String {
-    let year = if duration.year == 0 {
-        format!("{:04}Y", duration.year)
+    let year = if duration.year != 0 {
+        format!("{}Y", duration.year)
     } else {
         "".to_string()
     };
-    let month = if duration.month == 0 {
-        format!("{:02}M", duration.month)
+    let month = if duration.month != 0 {
+        format!("{}M", duration.month)
     } else {
         "".to_string()
     };
-    let day = if duration.day == 0 {
-        format!("{:04}D", duration.year)
+    let day = if duration.day != 0 {
+        format!("{}D", duration.day)
     } else {
         "".to_string()
     };
     if duration.hour == 0 && duration.minute == 0 && duration.second == 0 {
         format!("P{year}{month}{day}")
     } else {
-        let hour = if duration.hour == 0 {
-            format!("{:02}H", duration.hour)
+        let hour = if duration.hour != 0 {
+            format!("{}H", duration.hour)
         } else {
             "".to_string()
         };
-        let minute = if duration.minute == 0 {
-            format!("{:02}M", duration.minute)
+        let minute = if duration.minute != 0 {
+            format!("{}M", duration.minute)
         } else {
             "".to_string()
         };
-        let second = if duration.second == 0 {
-            format!("{:02}S", duration.second)
+        let second = if duration.second != 0 {
+            format!("{}S", duration.second)
        } else {
             "".to_string()
         };
-        format!("{year}{month}{day}T{hour}{minute}{second}")
+        format!("P{year}{month}{day}T{hour}{minute}{second}")
     }
 }
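[Annotation, not part of the patch series: PATCH 99 inverts the `== 0` guards (which emitted a component exactly when it was zero and dropped it otherwise), removes the zero-padding, fixes the `day`-formatted-from-`year` slip, and restores the leading "P" on the time-bearing form, bringing the output in line with xsd:duration shapes like "P1Y2MT30M". A standalone usage sketch of the corrected logic, condensed by me; `sign` is not rendered by the patch, so it is omitted here as well:

    struct Duration { year: i64, month: u8, day: u8, hour: u8, minute: u8, second: u8 }

    fn duration_string(d: &Duration) -> String {
        // Emit "<n><unit>" only for nonzero components, as the fixed code does.
        let part = |n: i64, unit: char| if n != 0 { format!("{n}{unit}") } else { String::new() };
        let date = format!("{}{}{}", part(d.year, 'Y'), part(d.month as i64, 'M'), part(d.day as i64, 'D'));
        if d.hour == 0 && d.minute == 0 && d.second == 0 {
            format!("P{date}")
        } else {
            let time = format!("{}{}{}", part(d.hour as i64, 'H'), part(d.minute as i64, 'M'), part(d.second as i64, 'S'));
            format!("P{date}T{time}")
        }
    }

    fn main() {
        let d = Duration { year: 1, month: 2, day: 0, hour: 0, minute: 30, second: 0 };
        assert_eq!(duration_string(&d), "P1Y2MT30M");
        println!("ok");
    }
]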