enhance: skip using the array index in some situations (milvus-io#33947)

milvus-io#32900

Signed-off-by: luzhang <luzhang@zilliz.com>
Co-authored-by: luzhang <luzhang@zilliz.com>
2 people authored and yellow-shine committed Jul 2, 2024
1 parent b6869b9 commit fd83390
Showing 13 changed files with 209 additions and 91 deletions.
23 changes: 16 additions & 7 deletions internal/core/src/exec/expression/Expr.h
@@ -119,7 +119,9 @@ class SegmentExpr : public Expr {
is_index_mode_ = segment_->HasIndex(field_id_);
if (is_index_mode_) {
num_index_chunk_ = segment_->num_chunk_index(field_id_);
} else {
}
// if the index does not include the raw data, the data also needs to be loaded
if (segment_->HasFieldData(field_id_)) {
num_data_chunk_ = upper_div(active_count_, size_per_chunk_);
}
}
@@ -166,17 +168,21 @@
MoveCursor() override {
if (is_index_mode_) {
MoveCursorForIndex();
if (segment_->HasFieldData(field_id_)) {
MoveCursorForData();
}
} else {
MoveCursorForData();
}
}

int64_t
GetNextBatchSize() {
auto current_chunk =
is_index_mode_ ? current_index_chunk_ : current_data_chunk_;
auto current_chunk_pos =
is_index_mode_ ? current_index_chunk_pos_ : current_data_chunk_pos_;
auto current_chunk = is_index_mode_ && use_index_ ? current_index_chunk_
: current_data_chunk_;
auto current_chunk_pos = is_index_mode_ && use_index_
? current_index_chunk_pos_
: current_data_chunk_pos_;
auto current_rows = current_chunk * size_per_chunk_ + current_chunk_pos;
return current_rows + batch_size_ >= active_count_
? active_count_ - current_rows
@@ -330,14 +336,17 @@ class SegmentExpr : public Expr {
DataType pk_type_;
int64_t batch_size_;

// State indicating the position the expr computation has reached,
// because the expr may be called for every batch.
bool is_index_mode_{false};
bool is_data_mode_{false};
// sometimes we need to skip the index and use the raw data instead
// defaults to true, meaning the index is used whenever possible
bool use_index_{true};

int64_t active_count_{0};
int64_t num_data_chunk_{0};
int64_t num_index_chunk_{0};
// State indicating the position the expr computation has reached,
// because the expr may be called for every batch.
int64_t current_data_chunk_{0};
int64_t current_data_chunk_pos_{0};
int64_t current_index_chunk_{0};
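The Expr.h change above keeps two cursors alive at once: the index cursor and the raw-data cursor, gated by the new `use_index_` flag. As a reading aid only, here is a minimal, self-contained sketch of the cursor selection that `GetNextBatchSize` performs; the struct and its field names are simplified stand-ins for the real SegmentExpr members, not the actual Milvus types.

#include <algorithm>
#include <cstdint>

// Simplified stand-in for the cursor state kept by SegmentExpr (hypothetical names).
struct CursorState {
    bool is_index_mode = false;     // segment has a scalar index for this field
    bool use_index = true;          // may be cleared to force a raw-data scan
    int64_t size_per_chunk = 1024;
    int64_t active_count = 0;       // total rows this expr has to evaluate
    int64_t batch_size = 256;
    int64_t current_index_chunk = 0, current_index_chunk_pos = 0;
    int64_t current_data_chunk = 0, current_data_chunk_pos = 0;
};

// Mirrors the logic of GetNextBatchSize(): track progress against the index cursor
// only when the index is both present and actually in use, otherwise against the
// raw-data cursor, and clamp the batch to the rows that remain.
int64_t NextBatchSize(const CursorState& s) {
    const bool on_index = s.is_index_mode && s.use_index;
    const int64_t chunk = on_index ? s.current_index_chunk : s.current_data_chunk;
    const int64_t pos = on_index ? s.current_index_chunk_pos : s.current_data_chunk_pos;
    const int64_t current_rows = chunk * s.size_per_chunk + pos;
    return std::min(s.batch_size, s.active_count - current_rows);
}

int main() {
    CursorState s;
    s.is_index_mode = true;       // an index exists...
    s.use_index = false;          // ...but this expr decided to scan raw data
    s.active_count = 100;
    s.current_data_chunk_pos = 90;
    return NextBatchSize(s) == 10 ? 0 : 1;  // 10 rows left on the data cursor
}

The reason the constructor now also counts raw-data chunks in index mode is that an expression can decide, per call, that the index cannot serve it (see CanUseIndexForArray below) and fall back to scanning the field data, provided the segment still holds it.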
75 changes: 68 additions & 7 deletions internal/core/src/exec/expression/UnaryExpr.cpp
@@ -20,6 +20,68 @@
namespace milvus {
namespace exec {

template <typename T>
bool
PhyUnaryRangeFilterExpr::CanUseIndexForArray() {
typedef std::conditional_t<std::is_same_v<T, std::string_view>, std::string, T> IndexInnerType;
using Index = index::ScalarIndex<IndexInnerType>;

for (size_t i = current_index_chunk_; i < num_index_chunk_; i++) {
const Index& index =
segment_->chunk_scalar_index<IndexInnerType>(field_id_, i);

if (index.GetIndexType() == milvus::index::ScalarIndexType::HYBRID) {
return false;
}
}
return true;
}

template <>
bool
PhyUnaryRangeFilterExpr::CanUseIndexForArray<milvus::Array>() {
bool res;
if (!is_index_mode_) {
use_index_ = res = false;
return res;
}
switch (expr_->column_.element_type_) {
case DataType::BOOL:
res = CanUseIndexForArray<bool>();
break;
case DataType::INT8:
res = CanUseIndexForArray<int8_t>();
break;
case DataType::INT16:
res = CanUseIndexForArray<int16_t>();
break;
case DataType::INT32:
res = CanUseIndexForArray<int32_t>();
break;
case DataType::INT64:
res = CanUseIndexForArray<int64_t>();
break;
case DataType::FLOAT:
case DataType::DOUBLE:
// not accurate on floating point numbers, fall back to brute force.
res = false;
break;
case DataType::VARCHAR:
case DataType::STRING:
res = CanUseIndexForArray<std::string_view>();
break;
default:
PanicInfo(DataTypeInvalid,
"unsupported element type when execute array "
"equal for index: {}",
expr_->column_.element_type_);
}
use_index_ = res;
return res;
}

template <typename T>
VectorPtr
PhyUnaryRangeFilterExpr::ExecRangeVisitorImplArrayForIndex() {
@@ -162,7 +224,7 @@ PhyUnaryRangeFilterExpr::Eval(EvalCtx& context, VectorPtr& result) {
result = ExecRangeVisitorImplArray<std::string>();
break;
case proto::plan::GenericValue::ValCase::kArrayVal:
if (is_index_mode_) {
if (CanUseIndexForArray<milvus::Array>()) {
result = ExecRangeVisitorImplArrayForIndex<
proto::plan::Array>();
} else {
@@ -297,7 +359,7 @@ PhyUnaryRangeFilterExpr::ExecArrayEqualForIndex(bool reverse) {

// filtering by index, get candidates.
auto size_per_chunk = segment_->size_per_chunk();
auto retrieve = [ size_per_chunk, this ](int64_t offset) -> auto{
auto retrieve = [size_per_chunk, this](int64_t offset) -> auto {
auto chunk_idx = offset / size_per_chunk;
auto chunk_offset = offset % size_per_chunk;
const auto& chunk =
@@ -784,11 +846,10 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplForData() {

template <typename T>
bool
PhyUnaryRangeFilterExpr::CanUseIndex() const {
if (!is_index_mode_) {
return false;
}
return SegmentExpr::CanUseIndex<T>(expr_->op_type_);
PhyUnaryRangeFilterExpr::CanUseIndex() {
bool res = is_index_mode_ && SegmentExpr::CanUseIndex<T>(expr_->op_type_);
use_index_ = res;
return res;
}

} // namespace exec
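To make the new dispatch easier to follow, here is a hedged, self-contained sketch of the decision that CanUseIndexForArray encodes: floating-point element types and HYBRID scalar indexes fall back to the brute-force path, everything else may keep using the index. The enums and the single index_type parameter are simplifications; the real method is a template that inspects every index chunk of the field.

// Hypothetical, trimmed-down enums standing in for milvus::DataType and
// milvus::index::ScalarIndexType; only the members needed here are listed.
enum class ElementType { BOOL, INT8, INT16, INT32, INT64, FLOAT, DOUBLE, VARCHAR, STRING };
enum class ScalarIndexType { NONE, BITMAP, STLSORT, MARISA, INVERTED, HYBRID };

// Sketch of PhyUnaryRangeFilterExpr::CanUseIndexForArray<milvus::Array>():
// no index -> raw data; float/double elements -> raw data (index results would
// not be exact); a HYBRID index on any chunk -> raw data; otherwise use the index.
bool CanUseIndexForArray(bool is_index_mode,
                         ElementType elem,
                         ScalarIndexType index_type) {
    if (!is_index_mode) {
        return false;
    }
    if (elem == ElementType::FLOAT || elem == ElementType::DOUBLE) {
        return false;
    }
    return index_type != ScalarIndexType::HYBRID;
}

int main() {
    // An equality filter on a double[] field never uses the index in this sketch.
    return CanUseIndexForArray(true, ElementType::DOUBLE, ScalarIndexType::BITMAP) ? 1 : 0;
}

In the real method the result is also stored in `use_index_`, which is what lets `GetNextBatchSize` in Expr.h track progress against the raw-data cursor during the fallback scan.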
7 changes: 6 additions & 1 deletion internal/core/src/exec/expression/UnaryExpr.h
@@ -25,6 +25,7 @@
#include "common/Vector.h"
#include "exec/expression/Expr.h"
#include "index/Meta.h"
#include "index/ScalarIndex.h"
#include "segcore/SegmentInterface.h"
#include "query/Utils.h"
#include "common/RegexQuery.h"
@@ -325,7 +326,11 @@ class PhyUnaryRangeFilterExpr : public SegmentExpr {

template <typename T>
bool
CanUseIndex() const;
CanUseIndex();

template <typename T>
bool
CanUseIndexForArray();

private:
std::shared_ptr<const milvus::expr::UnaryRangeFilterExpr> expr_;
5 changes: 5 additions & 0 deletions internal/core/src/index/BitmapIndex.h
@@ -69,6 +69,11 @@ class BitmapIndex : public ScalarIndex<T> {
return total_num_rows_;
}

ScalarIndexType
GetIndexType() const override {
return ScalarIndexType::BITMAP;
}

void
Build(size_t n, const T* values) override;

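The GetIndexType() override added here is what lets callers such as CanUseIndexForArray ask a loaded scalar index what it actually is at runtime. Only a few of the 13 changed files are shown in this excerpt; presumably the other ScalarIndex<T> implementations gain matching overrides (for instance the hybrid index reporting HYBRID). A minimal sketch of that virtual-dispatch pattern, with simplified, non-templated class names:

#include <iostream>
#include <memory>

enum class ScalarIndexType { NONE, BITMAP, STLSORT, MARISA, INVERTED, HYBRID };

// Simplified, non-templated stand-ins for the ScalarIndex<T> hierarchy.
struct ScalarIndexBase {
    virtual ~ScalarIndexBase() = default;
    virtual ScalarIndexType GetIndexType() const = 0;
};

struct BitmapIndexSketch : ScalarIndexBase {
    ScalarIndexType GetIndexType() const override { return ScalarIndexType::BITMAP; }
};

// Assumption: the hybrid index reports HYBRID in one of the files not shown in this
// excerpt; that is the value CanUseIndexForArray checks for before skipping the index.
struct HybridIndexSketch : ScalarIndexBase {
    ScalarIndexType GetIndexType() const override { return ScalarIndexType::HYBRID; }
};

int main() {
    std::unique_ptr<ScalarIndexBase> index = std::make_unique<HybridIndexSketch>();
    const bool skip_index = (index->GetIndexType() == ScalarIndexType::HYBRID);
    std::cout << std::boolalpha << skip_index << '\n';  // prints: true
}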
50 changes: 25 additions & 25 deletions internal/core/src/index/HybridScalarIndex.cpp
@@ -40,7 +40,7 @@ HybridScalarIndex<T>::HybridScalarIndex(
AssertInfo(mem_file_manager_ != nullptr, "create file manager failed!");
}
field_type_ = file_manager_context.fieldDataMeta.field_schema.data_type();
internal_index_type_ = InternalIndexType::NONE;
internal_index_type_ = ScalarIndexType::NONE;
}

template <typename T>
@@ -57,11 +57,11 @@ HybridScalarIndex<T>::HybridScalarIndex(
AssertInfo(mem_file_manager_ != nullptr, "create file manager failed!");
}
field_type_ = file_manager_context.fieldDataMeta.field_schema.data_type();
internal_index_type_ = InternalIndexType::NONE;
internal_index_type_ = ScalarIndexType::NONE;
}

template <typename T>
InternalIndexType
ScalarIndexType
HybridScalarIndex<T>::SelectIndexBuildType(size_t n, const T* values) {
std::set<T> distinct_vals;
for (size_t i = 0; i < n; i++) {
@@ -70,15 +70,15 @@ HybridScalarIndex<T>::SelectIndexBuildType(size_t n, const T* values) {

// Decide whether to select bitmap index or stl sort
if (distinct_vals.size() >= bitmap_index_cardinality_limit_) {
internal_index_type_ = InternalIndexType::STLSORT;
internal_index_type_ = ScalarIndexType::STLSORT;
} else {
internal_index_type_ = InternalIndexType::BITMAP;
internal_index_type_ = ScalarIndexType::BITMAP;
}
return internal_index_type_;
}

template <>
InternalIndexType
ScalarIndexType
HybridScalarIndex<std::string>::SelectIndexBuildType(
size_t n, const std::string* values) {
std::set<std::string> distinct_vals;
@@ -91,15 +91,15 @@ HybridScalarIndex<std::string>::SelectIndexBuildType(

// Decide whether to select bitmap index or marisa index
if (distinct_vals.size() >= bitmap_index_cardinality_limit_) {
internal_index_type_ = InternalIndexType::MARISA;
internal_index_type_ = ScalarIndexType::MARISA;
} else {
internal_index_type_ = InternalIndexType::BITMAP;
internal_index_type_ = ScalarIndexType::BITMAP;
}
return internal_index_type_;
}

template <typename T>
InternalIndexType
ScalarIndexType
HybridScalarIndex<T>::SelectBuildTypeForPrimitiveType(
const std::vector<FieldDataPtr>& field_datas) {
std::set<T> distinct_vals;
@@ -116,15 +116,15 @@ HybridScalarIndex<T>::SelectBuildTypeForPrimitiveType(

// Decide whether to select bitmap index or stl sort
if (distinct_vals.size() >= bitmap_index_cardinality_limit_) {
internal_index_type_ = InternalIndexType::STLSORT;
internal_index_type_ = ScalarIndexType::STLSORT;
} else {
internal_index_type_ = InternalIndexType::BITMAP;
internal_index_type_ = ScalarIndexType::BITMAP;
}
return internal_index_type_;
}

template <>
InternalIndexType
ScalarIndexType
HybridScalarIndex<std::string>::SelectBuildTypeForPrimitiveType(
const std::vector<FieldDataPtr>& field_datas) {
std::set<std::string> distinct_vals;
@@ -141,15 +141,15 @@ HybridScalarIndex<std::string>::SelectBuildTypeForPrimitiveType(

// Decide whether to select bitmap index or marisa sort
if (distinct_vals.size() >= bitmap_index_cardinality_limit_) {
internal_index_type_ = InternalIndexType::MARISA;
internal_index_type_ = ScalarIndexType::MARISA;
} else {
internal_index_type_ = InternalIndexType::BITMAP;
internal_index_type_ = ScalarIndexType::BITMAP;
}
return internal_index_type_;
}

template <typename T>
InternalIndexType
ScalarIndexType
HybridScalarIndex<T>::SelectBuildTypeForArrayType(
const std::vector<FieldDataPtr>& field_datas) {
std::set<T> distinct_vals;
@@ -171,15 +171,15 @@ HybridScalarIndex<T>::SelectBuildTypeForArrayType(
}
// Decide whether to select bitmap index or inverted index
if (distinct_vals.size() >= bitmap_index_cardinality_limit_) {
internal_index_type_ = InternalIndexType::INVERTED;
internal_index_type_ = ScalarIndexType::INVERTED;
} else {
internal_index_type_ = InternalIndexType::BITMAP;
internal_index_type_ = ScalarIndexType::BITMAP;
}
return internal_index_type_;
}

template <typename T>
InternalIndexType
ScalarIndexType
HybridScalarIndex<T>::SelectIndexBuildType(
const std::vector<FieldDataPtr>& field_datas) {
std::set<T> distinct_vals;
@@ -200,13 +200,13 @@ HybridScalarIndex<T>::GetInternalIndex() {
if (internal_index_ != nullptr) {
return internal_index_;
}
if (internal_index_type_ == InternalIndexType::BITMAP) {
if (internal_index_type_ == ScalarIndexType::BITMAP) {
internal_index_ =
std::make_shared<BitmapIndex<T>>(file_manager_context_);
} else if (internal_index_type_ == InternalIndexType::STLSORT) {
} else if (internal_index_type_ == ScalarIndexType::STLSORT) {
internal_index_ =
std::make_shared<ScalarIndexSort<T>>(file_manager_context_);
} else if (internal_index_type_ == InternalIndexType::INVERTED) {
} else if (internal_index_type_ == ScalarIndexType::INVERTED) {
internal_index_ =
std::make_shared<InvertedIndexTantivy<T>>(file_manager_context_);
} else {
@@ -223,13 +223,13 @@ HybridScalarIndex<std::string>::GetInternalIndex() {
return internal_index_;
}

if (internal_index_type_ == InternalIndexType::BITMAP) {
if (internal_index_type_ == ScalarIndexType::BITMAP) {
internal_index_ =
std::make_shared<BitmapIndex<std::string>>(file_manager_context_);
} else if (internal_index_type_ == InternalIndexType::MARISA) {
} else if (internal_index_type_ == ScalarIndexType::MARISA) {
internal_index_ =
std::make_shared<StringIndexMarisa>(file_manager_context_);
} else if (internal_index_type_ == InternalIndexType::INVERTED) {
} else if (internal_index_type_ == ScalarIndexType::INVERTED) {
internal_index_ = std::make_shared<InvertedIndexTantivy<std::string>>(
file_manager_context_);
} else {
@@ -374,7 +374,7 @@ HybridScalarIndex<T>::DeserializeIndexType(const BinarySet& binary_set) {
uint8_t index_type;
auto index_type_buffer = binary_set.GetByName(INDEX_TYPE);
memcpy(&index_type, index_type_buffer->data.get(), index_type_buffer->size);
internal_index_type_ = static_cast<InternalIndexType>(index_type);
internal_index_type_ = static_cast<ScalarIndexType>(index_type);
}

template <typename T>
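Most of the HybridScalarIndex.cpp hunk is the mechanical rename of InternalIndexType to the now-shared ScalarIndexType; the selection rule itself is unchanged. As a compact, hedged summary of that rule (names simplified; the real code specializes templates on the value type instead of taking a Kind argument):

#include <cstddef>

enum class ScalarIndexType { NONE, BITMAP, STLSORT, MARISA, INVERTED, HYBRID };
enum class Kind { Numeric, String, ArrayElement };

// Cardinality rule used by HybridScalarIndex::SelectIndexBuildType and friends:
// few distinct values -> BITMAP; otherwise pick the high-cardinality index that
// matches the value kind (STL-sort for numerics, marisa for strings, inverted
// for array elements).
ScalarIndexType SelectIndexBuildType(size_t distinct_vals,
                                     size_t bitmap_index_cardinality_limit,
                                     Kind kind) {
    if (distinct_vals < bitmap_index_cardinality_limit) {
        return ScalarIndexType::BITMAP;
    }
    switch (kind) {
        case Kind::Numeric:
            return ScalarIndexType::STLSORT;
        case Kind::String:
            return ScalarIndexType::MARISA;
        case Kind::ArrayElement:
            return ScalarIndexType::INVERTED;
    }
    return ScalarIndexType::NONE;  // unreachable; silences missing-return warnings
}

int main() {
    // 10 distinct strings under a limit of 100 -> BITMAP; 1000 distinct -> MARISA.
    bool ok = SelectIndexBuildType(10, 100, Kind::String) == ScalarIndexType::BITMAP &&
              SelectIndexBuildType(1000, 100, Kind::String) == ScalarIndexType::MARISA;
    return ok ? 0 : 1;
}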
