Skip to content

Commit

Permalink
Compute space usage of a Searcher / SegmentReader / CompositeFile
Browse files Browse the repository at this point in the history
  • Loading branch information
jason-wolfe committed May 2, 2018
1 parent 5637657 commit b078306
Show file tree
Hide file tree
Showing 7 changed files with 250 additions and 0 deletions.
13 changes: 13 additions & 0 deletions src/common/composite_file.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ use common::VInt;
use directory::ReadOnlySource;
use directory::WritePtr;
use schema::Field;
use space_usage::PerFieldSpaceUsage;
use space_usage::ByteCount;
use space_usage::FieldUsage;
use std::collections::HashMap;
use std::io::Write;
use std::io::{self, Read};
Expand Down Expand Up @@ -166,6 +169,16 @@ impl CompositeFile {
.get(&FileAddr { field, idx })
.map(|&(from, to)| self.data.slice(from, to))
}

/// Summarize the space used by each field stored in this composite file.
///
/// Every `(field, idx)` entry of `offsets_index` contributes `end - start`
/// bytes to the `FieldUsage` of its field.
pub fn space_usage(&self) -> PerFieldSpaceUsage {
    let mut usage_by_field = HashMap::new();
    for (addr, &(from, to)) in &self.offsets_index {
        let field = addr.field;
        let usage = usage_by_field
            .entry(field)
            .or_insert_with(|| FieldUsage::empty(field));
        usage.add_field_idx(addr.idx, ByteCount(to - from));
    }
    PerFieldSpaceUsage::new(usage_by_field)
}
}

#[cfg(test)]
Expand Down
10 changes: 10 additions & 0 deletions src/core/searcher.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ use query::Query;
use schema::Document;
use schema::Schema;
use schema::{Field, Term};
use space_usage::SearcherSpaceUsage;
use std::fmt;
use std::sync::Arc;
use termdict::TermMerger;
Expand Down Expand Up @@ -84,6 +85,15 @@ impl Searcher {
.collect::<Vec<_>>();
FieldSearcher::new(inv_index_readers)
}

/// Summarize total space usage of this searcher.
///
/// The result gathers the space usage reported by each segment reader,
/// added in the order of `self.segment_readers`.
pub fn space_usage(&self) -> SearcherSpaceUsage {
    self.segment_readers
        .iter()
        .fold(SearcherSpaceUsage::new(), |mut usage, segment_reader| {
            usage.add_segment(segment_reader.space_usage());
            usage
        })
}
}

pub struct FieldSearcher {
Expand Down
16 changes: 16 additions & 0 deletions src/core/segment_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ use schema::Document;
use schema::Field;
use schema::FieldType;
use schema::Schema;
use space_usage::SegmentSpaceUsage;
use space_usage::ByteCount;
use std::collections::HashMap;
use std::fmt;
use std::sync::Arc;
Expand Down Expand Up @@ -328,6 +330,20 @@ impl SegmentReader {
.map(|delete_set| delete_set.is_deleted(doc))
.unwrap_or(false)
}

/// Summarize total space usage of this segment.
///
/// Gathers the usage of the term dictionary, postings, positions, fast
/// fields and fieldnorms composite files, the store reader, and the delete
/// bitset. A segment without a delete bitset reports zero bytes of deletes.
pub fn space_usage(&self) -> SegmentSpaceUsage {
    let num_docs = self.num_docs();
    let termdict = self.termdict_composite.space_usage();
    let postings = self.postings_composite.space_usage();
    let positions = self.positions_composite.space_usage();
    let fast_fields = self.fast_fields_composite.space_usage();
    let fieldnorms = self.fieldnorms_composite.space_usage();
    let store = self.store_reader.space_usage();
    let deletes = match self.delete_bitset_opt.as_ref() {
        Some(delete_bitset) => delete_bitset.space_usage(),
        None => ByteCount(0),
    };
    SegmentSpaceUsage::new(
        num_docs,
        termdict,
        postings,
        positions,
        fast_fields,
        fieldnorms,
        store,
        deletes,
    )
}
}

impl fmt::Debug for SegmentReader {
Expand Down
6 changes: 6 additions & 0 deletions src/fastfield/delete.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ use bit_set::BitSet;
use common::HasLen;
use directory::ReadOnlySource;
use directory::WritePtr;
use space_usage::ByteCount;
use std::io;
use std::io::Write;
use DocId;
Expand Down Expand Up @@ -62,6 +63,11 @@ impl DeleteBitSet {
b & (1u8 << shift) != 0
}
}

/// Summarize total space usage of this bitset.
///
/// Reports the length of the bitset's underlying data as a `ByteCount`.
pub fn space_usage(&self) -> ByteCount {
    let num_bytes = self.data.len();
    ByteCount(num_bytes)
}
}

impl HasLen for DeleteBitSet {
Expand Down
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,7 @@ pub(crate) mod fieldnorm;
pub mod postings;
pub mod query;
pub mod schema;
pub mod space_usage;
pub mod store;
pub mod termdict;

Expand Down
197 changes: 197 additions & 0 deletions src/space_usage/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
/*!
Representations for the space usage of various parts of a Tantivy index.
This can be used programmatically, and will also be exposed in a human-readable fashion in
tantivy-cli.
One important caveat for all of this functionality is that none of it currently takes storage-level
details into consideration. For example, if your file system block size is 4096 bytes, we can
under-count actual resultant space usage by up to 4095 bytes per file.
*/

use schema::Field;
use std::collections::HashMap;
use std::ops::{Add, AddAssign};

/// A quantity of space, expressed in bytes.
#[derive(Clone, Copy, Debug)]
pub struct ByteCount(pub usize);

impl Add for ByteCount {
    type Output = ByteCount;

    /// Adds the two underlying byte counts.
    fn add(self, rhs: ByteCount) -> ByteCount {
        let ByteCount(left) = self;
        let ByteCount(right) = rhs;
        ByteCount(left + right)
    }
}

impl AddAssign for ByteCount {
    /// Increases `self` by the number of bytes in `rhs`.
    fn add_assign(&mut self, rhs: ByteCount) {
        let ByteCount(extra) = rhs;
        self.0 += extra;
    }
}

/// Space usage of a whole searcher: the usage of each of its segments,
/// together with the running sum of their totals.
#[derive(Clone, Debug)]
pub struct SearcherSpaceUsage {
    segments: Vec<SegmentSpaceUsage>,
    total: ByteCount,
}

impl SearcherSpaceUsage {
    /// Creates a summary with no segments and a total of zero bytes.
    pub(crate) fn new() -> SearcherSpaceUsage {
        let segments = Vec::new();
        let total = ByteCount(0);
        SearcherSpaceUsage { segments, total }
    }

    /// Records `segment` and adds its total to the searcher total.
    /// Segments are taken as given: nothing is deduplicated.
    pub(crate) fn add_segment(&mut self, segment: SegmentSpaceUsage) {
        let segment_total = segment.total();
        self.total += segment_total;
        self.segments.push(segment);
    }

    /// Returns total byte usage of this searcher, including all large subcomponents.
    /// Does not account for smaller things like `meta.json`.
    pub fn total(&self) -> ByteCount {
        self.total
    }
}

/// Represents combined space usage for all of the large components comprising a segment.
#[derive(Clone, Debug)]
pub struct SegmentSpaceUsage {
num_docs: u32,

termdict: PerFieldSpaceUsage,
postings: PerFieldSpaceUsage,
positions: PerFieldSpaceUsage,
fast_fields: PerFieldSpaceUsage,
fieldnorms: PerFieldSpaceUsage,

store: StoreSpaceUsage,

deletes: ByteCount,

total: ByteCount,
}

impl SegmentSpaceUsage {
pub(crate) fn new(
num_docs: u32,
termdict: PerFieldSpaceUsage,
postings: PerFieldSpaceUsage,
positions: PerFieldSpaceUsage,
fast_fields: PerFieldSpaceUsage,
fieldnorms: PerFieldSpaceUsage,
store: StoreSpaceUsage,
deletes: ByteCount,
) -> SegmentSpaceUsage {
let total = termdict.total()
+ postings.total()
+ positions.total()
+ fast_fields.total()
+ fieldnorms.total()
+ store.total()
+ deletes;
SegmentSpaceUsage {
num_docs,
termdict,
postings,
positions,
fast_fields,
fieldnorms,
store,
deletes,
total,
}
}

/// Total space usage in bytes for this segment.
pub fn total(&self) -> ByteCount {
self.total
}
}

/// Space usage of the Store of a segment.
///
/// Two parts are tracked separately:
/// `data` is the compressed data itself, and
/// `offsets` is the lookup used to find the start of a block.
#[derive(Clone, Debug)]
pub struct StoreSpaceUsage {
    data: ByteCount,
    offsets: ByteCount,
}

impl StoreSpaceUsage {
    /// Builds the summary from the sizes of the two parts of the store.
    pub(crate) fn new(data: ByteCount, offsets: ByteCount) -> StoreSpaceUsage {
        StoreSpaceUsage { data, offsets }
    }

    /// Total space usage in bytes for this Store: `data` plus `offsets`.
    pub fn total(&self) -> ByteCount {
        let data_bytes = self.data;
        let offsets_bytes = self.offsets;
        data_bytes + offsets_bytes
    }
}

/// Represents space usage for all of the (field, index) pairs that appear in a CompositeFile.
///
/// A field can appear with a single index (typically 0) or with multiple indexes.
/// Multiple indexes are used to handle variable length things, where
#[derive(Clone, Debug)]
pub struct PerFieldSpaceUsage {
fields: HashMap<Field, FieldUsage>,
total: ByteCount
}

impl PerFieldSpaceUsage {
pub(crate) fn new(fields: HashMap<Field, FieldUsage>) -> PerFieldSpaceUsage {
let total = fields.values().map(|x| x.total()).fold(ByteCount(0), Add::add);
PerFieldSpaceUsage { fields, total }
}

/// Bytes used by the represented file
pub fn total(&self) -> ByteCount {
self.total
}
}

/// Represents space usage of a given field, breaking it down into the (field, index) pairs that
/// comprise it.
///
/// See documentation for PerFieldSpaceUsage for slightly more information.
#[derive(Clone, Debug)]
pub struct FieldUsage {
    field: Field,
    weight: ByteCount,
    /// A field can be composed of more than one piece.
    /// These pieces are indexed by arbitrary numbers starting at zero.
    /// `self.weight` includes all of `self.sub_weights`.
    /// Indexes that have not been recorded hold `None`.
    sub_weights: Vec<Option<ByteCount>>,
}

impl FieldUsage {
    /// Creates the usage of `field` with no recorded piece and a weight of zero.
    pub(crate) fn empty(field: Field) -> FieldUsage {
        FieldUsage {
            field,
            weight: ByteCount(0),
            sub_weights: Vec::new(),
        }
    }

    /// Records that the piece at index `idx` of this field takes `size` bytes,
    /// and adds `size` to the total weight of the field.
    ///
    /// Indexes may be recorded in any order; gaps are filled with `None`.
    ///
    /// # Panics
    ///
    /// Panics if a size was already recorded for `idx`.
    pub(crate) fn add_field_idx(&mut self, idx: usize, size: ByteCount) {
        // The vector must hold `idx + 1` slots for `sub_weights[idx]` to exist.
        // Growing to only `idx` slots leaves the index out of bounds.
        if self.sub_weights.len() <= idx {
            self.sub_weights.resize(idx + 1, None);
        }
        assert!(self.sub_weights[idx].is_none());
        self.sub_weights[idx] = Some(size);
        self.weight += size
    }

    /// Total bytes used for this field in this context
    pub fn total(&self) -> ByteCount {
        self.weight
    }
}
7 changes: 7 additions & 0 deletions src/store/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ use datastruct::SkipList;
use directory::ReadOnlySource;
use lz4;
use schema::Document;
use space_usage::StoreSpaceUsage;
use space_usage::ByteCount;
use std::cell::RefCell;
use std::io::{self, Read};
use std::mem::size_of;
Expand Down Expand Up @@ -89,6 +91,11 @@ impl StoreReader {
cursor = &cursor[..doc_length];
Ok(Document::deserialize(&mut cursor)?)
}

/// Summarize total space usage of this store reader.
///
/// Reports the length of the store data and the length of the offset index
/// source as the two parts of a `StoreSpaceUsage`.
pub fn space_usage(&self) -> StoreSpaceUsage {
    let data_len = ByteCount(self.data.len());
    let offsets_len = ByteCount(self.offset_index_source.len());
    StoreSpaceUsage::new(data_len, offsets_len)
}
}

#[allow(needless_pass_by_value)]
Expand Down

0 comments on commit b078306

Please sign in to comment.