Skip to content

Commit

Permalink
Issue/1070 (#1071)
Browse files Browse the repository at this point in the history
Add a boolean flag in the Query::query_terms informing on whether
position information is required.

Closes #1070
  • Loading branch information
fulmicoton committed Jun 3, 2021
1 parent 2aad0ce commit 6e4b611
Show file tree
Hide file tree
Showing 10 changed files with 83 additions and 81 deletions.
12 changes: 7 additions & 5 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,14 @@ Tantivy 0.15.0
- Moved bitpacking to bitpacker subcrate and add BlockedBitpacker, which bitpacks blocks of 128 elements (@PSeitz) #1030
- Added support for more-like-this query in tantivy (@evanxg852000) #1011
- Added support for sorting an index, e.g. presorting documents in an index by a timestamp field. This can heavily improve performance for certain scenarios, by utilizing the sorted data (Top-n optimizations)(@PSeitz). #1026
- Add iterator over documents in doc store (@PSeitz). #1044
- Fix log merge policy (@PSeitz). #1043
- Add detection to avoid small doc store blocks on merge (@PSeitz). #1054
- Make doc store compression dynamic (@PSeitz). #1060
- Switch to json for footer version handling (@PSeitz). #1060
- Add iterator over documents in doc store (@PSeitz). #1044
- Fix log merge policy (@PSeitz). #1043
- Add detection to avoid small doc store blocks on merge (@PSeitz). #1054
- Make doc store compression dynamic (@PSeitz). #1060
- Switch to json for footer version handling (@PSeitz). #1060
- Updated TermMerger implementation to rely on the union feature of the FST (@scampi) #469
- Add boolean marking whether position is required in the query_terms API call (@fulmicoton). #1070


Tantivy 0.14.0
=========================
Expand Down
42 changes: 21 additions & 21 deletions src/indexer/merger.rs
Original file line number Diff line number Diff line change
Expand Up @@ -412,9 +412,9 @@ impl IndexMerger {
Ok(everything_is_in_order)
}

pub(crate) fn get_sort_field_accessor<'b>(
pub(crate) fn get_sort_field_accessor(
reader: &SegmentReader,
sort_by_field: &'b IndexSortByField,
sort_by_field: &IndexSortByField,
) -> crate::Result<impl FastFieldReader<u64>> {
let field_id = expect_field_id_for_sort_field(&reader.schema(), &sort_by_field)?; // for now expect fastfield, but not strictly required
let value_accessor = reader.fast_fields().u64_lenient(field_id)?;
Expand Down Expand Up @@ -456,22 +456,22 @@ impl IndexMerger {
// Loading the field accessor on demand causes a 15x regression

// create iterators over segment/sort_accessor/doc_id tuple
let doc_id_reader_pair = reader_and_field_accessors
.iter()
.map(|reader_and_field_accessor| {
reader_and_field_accessor
.0
.reader
.doc_ids_alive()
.map(move |doc_id| {
(
doc_id,
reader_and_field_accessor.0,
&reader_and_field_accessor.1,
)
})
})
.collect::<Vec<_>>();
let doc_id_reader_pair =
reader_and_field_accessors
.iter()
.map(|reader_and_field_accessor| {
reader_and_field_accessor
.0
.reader
.doc_ids_alive()
.map(move |doc_id| {
(
doc_id,
reader_and_field_accessor.0,
&reader_and_field_accessor.1,
)
})
});

// create iterator tuple of (old doc_id, reader) in order of the new doc_ids
let sorted_doc_ids: Vec<(DocId, SegmentReaderWithOrdinal)> = doc_id_reader_pair
Expand Down Expand Up @@ -1017,12 +1017,12 @@ impl IndexMerger {
if reader.num_deleted_docs() > 0
// If there is not enough data in the store, we avoid stacking in order to
// avoid creating many small blocks in the doc store. Once we have 5 full blocks,
// we start stacking. In the worst case 2/7 of the blocks would be very small.
// [segment 1 - {1 doc}][segment 2 - {fullblock * 5}{1doc}]
// we start stacking. In the worst case 2/7 of the blocks would be very small.
// [segment 1 - {1 doc}][segment 2 - {fullblock * 5}{1doc}]
// => 5 * full blocks, 2 * 1 document blocks
//
// In a more realistic scenario the segments are of the same size, so 1/6 of
// the doc stores would be on average half full, given total randomness (which
// the doc stores would be on average half full, given total randomness (which
// is not the case here, but not sure how it behaves exactly).
//
// https://github.com/tantivy-search/tantivy/issues/1053
Expand Down
6 changes: 3 additions & 3 deletions src/query/boolean_query/boolean_query.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use crate::query::Weight;
use crate::schema::IndexRecordOption;
use crate::schema::Term;
use crate::Searcher;
use std::collections::BTreeSet;
use std::collections::BTreeMap;

/// The boolean query returns a set of documents
/// that matches the Boolean combination of constituent subqueries.
Expand Down Expand Up @@ -159,9 +159,9 @@ impl Query for BooleanQuery {
Ok(Box::new(BooleanWeight::new(sub_weights, scoring_enabled)))
}

fn query_terms(&self, term_set: &mut BTreeSet<Term>) {
fn query_terms(&self, terms: &mut BTreeMap<Term, bool>) {
for (_occur, subquery) in &self.subqueries {
subquery.query_terms(term_set);
subquery.query_terms(terms);
}
}
}
Expand Down
6 changes: 3 additions & 3 deletions src/query/boost_query.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use crate::fastfield::DeleteBitSet;
use crate::query::explanation::does_not_match;
use crate::query::{Explanation, Query, Scorer, Weight};
use crate::{DocId, DocSet, Score, Searcher, SegmentReader, Term};
use std::collections::BTreeSet;
use std::collections::BTreeMap;
use std::fmt;

/// `BoostQuery` is a wrapper over a query used to boost its score.
Expand Down Expand Up @@ -48,8 +48,8 @@ impl Query for BoostQuery {
Ok(boosted_weight)
}

fn query_terms(&self, term_set: &mut BTreeSet<Term>) {
self.query.query_terms(term_set)
fn query_terms(&self, terms: &mut BTreeMap<Term, bool>) {
self.query.query_terms(terms)
}
}

Expand Down
42 changes: 21 additions & 21 deletions src/query/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ mod tests {
use crate::schema::{Schema, TEXT};
use crate::Index;
use crate::Term;
use std::collections::BTreeSet;
use std::collections::BTreeMap;

#[test]
fn test_query_terms() {
Expand All @@ -78,49 +78,49 @@ mod tests {
let term_a = Term::from_field_text(text_field, "a");
let term_b = Term::from_field_text(text_field, "b");
{
let mut terms_set: BTreeSet<Term> = BTreeSet::new();
let mut terms: BTreeMap<Term, bool> = Default::default();
query_parser
.parse_query("a")
.unwrap()
.query_terms(&mut terms_set);
let terms: Vec<&Term> = terms_set.iter().collect();
assert_eq!(vec![&term_a], terms);
.query_terms(&mut terms);
let terms: Vec<(&Term, &bool)> = terms.iter().collect();
assert_eq!(vec![(&term_a, &false)], terms);
}
{
let mut terms_set: BTreeSet<Term> = BTreeSet::new();
let mut terms: BTreeMap<Term, bool> = Default::default();
query_parser
.parse_query("a b")
.unwrap()
.query_terms(&mut terms_set);
let terms: Vec<&Term> = terms_set.iter().collect();
assert_eq!(vec![&term_a, &term_b], terms);
.query_terms(&mut terms);
let terms: Vec<(&Term, &bool)> = terms.iter().collect();
assert_eq!(vec![(&term_a, &false), (&term_b, &false)], terms);
}
{
let mut terms_set: BTreeSet<Term> = BTreeSet::new();
let mut terms: BTreeMap<Term, bool> = Default::default();
query_parser
.parse_query("\"a b\"")
.unwrap()
.query_terms(&mut terms_set);
let terms: Vec<&Term> = terms_set.iter().collect();
assert_eq!(vec![&term_a, &term_b], terms);
.query_terms(&mut terms);
let terms: Vec<(&Term, &bool)> = terms.iter().collect();
assert_eq!(vec![(&term_a, &true), (&term_b, &true)], terms);
}
{
let mut terms_set: BTreeSet<Term> = BTreeSet::new();
let mut terms: BTreeMap<Term, bool> = Default::default();
query_parser
.parse_query("a a a a a")
.unwrap()
.query_terms(&mut terms_set);
let terms: Vec<&Term> = terms_set.iter().collect();
assert_eq!(vec![&term_a], terms);
.query_terms(&mut terms);
let terms: Vec<(&Term, &bool)> = terms.iter().collect();
assert_eq!(vec![(&term_a, &false)], terms);
}
{
let mut terms_set: BTreeSet<Term> = BTreeSet::new();
let mut terms: BTreeMap<Term, bool> = Default::default();
query_parser
.parse_query("a -b")
.unwrap()
.query_terms(&mut terms_set);
let terms: Vec<&Term> = terms_set.iter().collect();
assert_eq!(vec![&term_a, &term_b], terms);
.query_terms(&mut terms);
let terms: Vec<(&Term, &bool)> = terms.iter().collect();
assert_eq!(vec![(&term_a, &false), (&term_b, &false)], terms);
}
}
}
25 changes: 11 additions & 14 deletions src/query/more_like_this/more_like_this.rs
Original file line number Diff line number Diff line change
Expand Up @@ -233,10 +233,9 @@ impl MoreLikeThis {
}
FieldType::U64(_) => {
for field_value in field_values {
let val = field_value
.value()
.u64_value()
.ok_or(TantivyError::InvalidArgument("invalid value".to_string()))?;
let val = field_value.value().u64_value().ok_or_else(|| {
TantivyError::InvalidArgument("invalid value".to_string())
})?;
if !self.is_noise_word(val.to_string()) {
let term = Term::from_field_u64(field, val);
*term_frequencies.entry(term).or_insert(0) += 1;
Expand All @@ -249,7 +248,7 @@ impl MoreLikeThis {
let val = field_value
.value()
.date_value()
.ok_or(TantivyError::InvalidArgument("invalid value".to_string()))?
.ok_or_else(|| TantivyError::InvalidArgument("invalid value".to_string()))?
.timestamp();
if !self.is_noise_word(val.to_string()) {
let term = Term::from_field_i64(field, val);
Expand All @@ -259,10 +258,9 @@ impl MoreLikeThis {
}
FieldType::I64(_) => {
for field_value in field_values {
let val = field_value
.value()
.i64_value()
.ok_or(TantivyError::InvalidArgument("invalid value".to_string()))?;
let val = field_value.value().i64_value().ok_or_else(|| {
TantivyError::InvalidArgument("invalid value".to_string())
})?;
if !self.is_noise_word(val.to_string()) {
let term = Term::from_field_i64(field, val);
*term_frequencies.entry(term).or_insert(0) += 1;
Expand All @@ -271,10 +269,9 @@ impl MoreLikeThis {
}
FieldType::F64(_) => {
for field_value in field_values {
let val = field_value
.value()
.f64_value()
.ok_or(TantivyError::InvalidArgument("invalid value".to_string()))?;
let val = field_value.value().f64_value().ok_or_else(|| {
TantivyError::InvalidArgument("invalid value".to_string())
})?;
if !self.is_noise_word(val.to_string()) {
let term = Term::from_field_f64(field, val);
*term_frequencies.entry(term).or_insert(0) += 1;
Expand Down Expand Up @@ -306,7 +303,7 @@ impl MoreLikeThis {
{
return true;
}
return self.stop_words.contains(&word);
self.stop_words.contains(&word)
}

/// Computes the score for each term while ignoring terms that are not useful
Expand Down
9 changes: 5 additions & 4 deletions src/query/phrase_query/phrase_query.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
use std::collections::BTreeMap;

use super::PhraseWeight;
use crate::core::searcher::Searcher;
use crate::query::bm25::Bm25Weight;
use crate::query::Query;
use crate::query::Weight;
use crate::schema::IndexRecordOption;
use crate::schema::{Field, Term};
use std::collections::BTreeSet;

/// `PhraseQuery` matches a specific sequence of words.
///
Expand Down Expand Up @@ -113,9 +114,9 @@ impl Query for PhraseQuery {
Ok(Box::new(phrase_weight))
}

fn query_terms(&self, term_set: &mut BTreeSet<Term>) {
for (_, query_term) in &self.phrase_terms {
term_set.insert(query_term.clone());
fn query_terms(&self, terms: &mut BTreeMap<Term, bool>) {
for (_, term) in &self.phrase_terms {
terms.insert(term.clone(), true);
}
}
}
11 changes: 7 additions & 4 deletions src/query/query.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use crate::query::Explanation;
use crate::DocAddress;
use crate::Term;
use downcast_rs::impl_downcast;
use std::collections::BTreeSet;
use std::collections::BTreeMap;
use std::fmt;

/// The `Query` trait defines a set of documents and a scoring method
Expand Down Expand Up @@ -68,7 +68,10 @@ pub trait Query: QueryClone + Send + Sync + downcast_rs::Downcast + fmt::Debug {

/// Extract all of the terms associated to the query and insert them in the
/// term set given in arguments.
fn query_terms(&self, _term_set: &mut BTreeSet<Term>) {}
///
/// Each term is associated with a boolean indicating whether
/// positions are required or not.
fn query_terms(&self, _term_set: &mut BTreeMap<Term, bool>) {}
}

/// Implements `box_clone`.
Expand All @@ -95,8 +98,8 @@ impl Query for Box<dyn Query> {
self.as_ref().count(searcher)
}

fn query_terms(&self, term_set: &mut BTreeSet<Term<Vec<u8>>>) {
self.as_ref().query_terms(term_set);
fn query_terms(&self, terms: &mut BTreeMap<Term, bool>) {
self.as_ref().query_terms(terms);
}
}

Expand Down
6 changes: 3 additions & 3 deletions src/query/term_query/term_query.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use crate::query::{Explanation, Query};
use crate::schema::IndexRecordOption;
use crate::Searcher;
use crate::Term;
use std::collections::BTreeSet;
use std::collections::BTreeMap;
use std::fmt;

/// A Term query matches all of the documents
Expand Down Expand Up @@ -127,7 +127,7 @@ impl Query for TermQuery {
self.specialized_weight(searcher, scoring_enabled)?,
))
}
fn query_terms(&self, term_set: &mut BTreeSet<Term>) {
term_set.insert(self.term.clone());
fn query_terms(&self, terms: &mut BTreeMap<Term, bool>) {
terms.insert(self.term.clone(), false);
}
}
5 changes: 2 additions & 3 deletions src/snippet/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ use crate::{Document, Score};
use htmlescape::encode_minimal;
use std::cmp::Ordering;
use std::collections::BTreeMap;
use std::collections::BTreeSet;
use std::ops::Range;

const DEFAULT_MAX_NUM_CHARS: usize = 150;
Expand Down Expand Up @@ -239,10 +238,10 @@ impl SnippetGenerator {
query: &dyn Query,
field: Field,
) -> crate::Result<SnippetGenerator> {
let mut terms = BTreeSet::new();
let mut terms = BTreeMap::new();
query.query_terms(&mut terms);
let mut terms_text: BTreeMap<String, Score> = Default::default();
for term in terms {
for (term, _) in terms {
if term.field() != field {
continue;
}
Expand Down

0 comments on commit 6e4b611

Please sign in to comment.