Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[feat] Add sparse index support to knowhere #341

Merged
merged 1 commit into from
Jan 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions include/knowhere/comp/brute_force.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,15 @@ class BruteForce {
static expected<DataSetPtr>
RangeSearch(const DataSetPtr base_dataset, const DataSetPtr query_dataset, const Json& config,
const BitsetView& bitset);

// Perform row oriented sparse vector brute force search.
static expected<DataSetPtr>
SearchSparse(const DataSetPtr base_dataset, const DataSetPtr query_dataset, const Json& config,
const BitsetView& bitset);

static Status
SearchSparseWithBuf(const DataSetPtr base_dataset, const DataSetPtr query_dataset, sparse::label_t* ids, float* dis,
const Json& config, const BitsetView& bitset);
};

} // namespace knowhere
Expand Down
6 changes: 6 additions & 0 deletions include/knowhere/comp/index_param.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ constexpr const char* INDEX_RAFT_CAGRA = "GPU_RAFT_CAGRA";
constexpr const char* INDEX_HNSW = "HNSW";
constexpr const char* INDEX_DISKANN = "DISKANN";

constexpr const char* INDEX_SPARSE_INVERTED_INDEX = "SPARSE_INVERTED_INDEX";
constexpr const char* INDEX_SPARSE_WAND = "SPARSE_WAND";
} // namespace IndexEnum

namespace meta {
Expand Down Expand Up @@ -123,6 +125,10 @@ constexpr const char* HNSW_M = "M";
constexpr const char* EF = "ef";
constexpr const char* SEED_EF = "seed_ef";
constexpr const char* OVERVIEW_LEVELS = "overview_levels";

// Sparse Params
constexpr const char* DROP_RATIO_BUILD = "drop_ratio_build";
constexpr const char* DROP_RATIO_SEARCH = "drop_ratio_search";
} // namespace indexparam

using MetricType = std::string;
Expand Down
19 changes: 18 additions & 1 deletion include/knowhere/dataset.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include <variant>

#include "comp/index_param.h"
#include "knowhere/sparse_utils.h"

namespace knowhere {

Expand Down Expand Up @@ -54,7 +55,11 @@ class DataSet : public std::enable_shared_from_this<const DataSet> {
{
auto ptr = std::get_if<3>(&x.second);
if (ptr != nullptr) {
delete[](char*)(*ptr);
if (is_sparse) {
delete[](sparse::SparseRow<float>*)(*ptr);
} else {
delete[](char*)(*ptr);
}
}
}
}
Expand All @@ -78,6 +83,11 @@ class DataSet : public std::enable_shared_from_this<const DataSet> {
this->data_[meta::IDS] = Var(std::in_place_index<2>, ids);
}

/**
* For dense float vector, tensor is a rows * dim float array
* For sparse float vector, tensor is a sparse::SparseRow<float>* (an array of rows),
* and values in each row should be sorted by column id.
*/
void
SetTensor(const void* tensor) {
std::unique_lock lock(mutex_);
Expand Down Expand Up @@ -202,6 +212,12 @@ class DataSet : public std::enable_shared_from_this<const DataSet> {
this->is_owner = is_owner;
}

// Records whether this dataset is flagged as sparse. The flag is written
// while holding an exclusive lock on mutex_.
void
SetIsSparse(bool is_sparse) {
    std::unique_lock<std::shared_mutex> guard(mutex_);
    this->is_sparse = is_sparse;
}

// deprecated API
template <typename T>
void
Expand All @@ -225,6 +241,7 @@ class DataSet : public std::enable_shared_from_this<const DataSet> {
mutable std::shared_mutex mutex_;
std::map<std::string, Var> data_;
bool is_owner = true;
bool is_sparse = false;
};
using DataSetPtr = std::shared_ptr<DataSet>;
inline DataSetPtr
Expand Down
2 changes: 2 additions & 0 deletions include/knowhere/operands.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
#define OPERANDS_H
#include <math.h>

#include <cstring>

namespace {
union fp32_bits {
uint32_t as_bits;
Expand Down
261 changes: 261 additions & 0 deletions include/knowhere/sparse_utils.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,261 @@
// Copyright (C) 2019-2023 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy of
// the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations under
// the License.

#pragma once

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <type_traits>
#include <vector>

#include "knowhere/operands.h"

namespace knowhere::sparse {

// integer type in SparseRow
using table_t = uint32_t;
// type used to represent the id of a vector in the index interface.
// this is same as other index types.
using label_t = int64_t;

// An (id, val) pair. Ordering compares val first and uses id as the
// tie-breaker; equality requires both fields to match.
template <typename T>
struct IdVal {
    table_t id;
    T val;

    IdVal() = default;
    IdVal(table_t id, T val) : id(id), val(val) {
    }

    // Strictly less: smaller val, or equal val and smaller id.
    inline friend bool
    operator<(const IdVal& lhs, const IdVal& rhs) {
        return lhs.val < rhs.val || (lhs.val == rhs.val && lhs.id < rhs.id);
    }

    // Strictly greater, defined as the mirror of operator<. Negating operator<
    // would yield ">=" (x > x would be true), which is not a valid strict
    // ordering for comparators such as std::greater in heap/sort algorithms.
    inline friend bool
    operator>(const IdVal& lhs, const IdVal& rhs) {
        return rhs < lhs;
    }

    inline friend bool
    operator==(const IdVal& lhs, const IdVal& rhs) {
        return lhs.id == rhs.id && lhs.val == rhs.val;
    }
};

template <typename T>
class SparseRow {
static_assert(std::is_same_v<T, fp32>, "SparseRow supports float only");

public:
// Constructs a SparseRow that owns newly allocated storage sized for `count`
// elements. The storage is not initialized. When count is 0 nothing is
// allocated and the data pointer is null.
SparseRow(size_t count = 0)
    : data_(count == 0 ? nullptr : new uint8_t[count * element_size()]), count_(count), own_data_(true) {
}

// Wraps an existing buffer holding `count` elements. When `own_data` is true
// the destructor releases `data` with delete[]; otherwise the buffer is left
// untouched on destruction.
SparseRow(size_t count, uint8_t* data, bool own_data) : data_(data), count_(count), own_data_(own_data) {
}

// Copy constructor: performs a deep copy. Allocates owned storage for
// other.count_ elements and copies the raw element bytes into it (copy
// assignment is built on top of this).
SparseRow(const SparseRow<T>& other) : SparseRow(other.count_) {
    const size_t byte_count = count_ * element_size();
    std::copy(other.data_, other.data_ + byte_count, data_);
}

// Move constructor: starts as an empty owning row (null data, zero count) and
// then exchanges state with `other`, which is left holding that empty state.
SparseRow(SparseRow<T>&& other) noexcept : data_(nullptr), count_(0), own_data_(true) {
    swap(*this, other);
}

// Copy assignment: deep-copies `other` into a temporary and swaps it in. The
// previous contents end up in the temporary and are released (if owned) when
// it goes out of scope. Self-assignment does nothing.
SparseRow&
operator=(const SparseRow<T>& other) {
    if (this == &other) {
        return *this;
    }
    SparseRow<T> replacement(other);
    swap(*this, replacement);
    return *this;
}

// Move assignment: exchanges state with `other`. Whatever this row held
// before is now held by `other`.
SparseRow&
operator=(SparseRow<T>&& other) noexcept {
    swap(*this, other);
    return *this;
}

// Releases the element buffer, but only when this row owns it and the
// pointer is non-null; borrowed buffers are left alone.
~SparseRow() {
    if (!own_data_ || data_ == nullptr) {
        return;
    }
    delete[] data_;
    data_ = nullptr;
}

size_t
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

move constructor and move assignment operator please :)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

size() const {
return count_;
}

// Bytes attributed to this row: the element payload (count_ elements of
// element_size() bytes each) plus the size of the SparseRow object itself.
size_t
memory_usage() const {
    const size_t payload_bytes = count_ * element_size();
    return payload_bytes + sizeof(*this);
}

// Returns the raw element buffer (null when the row was constructed with
// zero elements). No ownership is transferred.
void*
data() {
return data_;
}

// Const overload: read-only view of the same raw element buffer.
const void*
data() const {
return data_;
}

// dim of a sparse vector is the max index + 1, or 0 for an empty vector.
// Only the last element is inspected, which relies on the elements being
// stored sorted by index.
int64_t
dim() const {
    if (count_ == 0) {
        return 0;
    }
    const auto* last = reinterpret_cast<const ElementProxy*>(data_) + (count_ - 1);
    // Widen before adding 1: index is a 32-bit unsigned table_t, so computing
    // `index + 1` first would wrap to 0 when index == UINT32_MAX.
    return static_cast<int64_t>(last->index) + 1;
}

// Returns a copy of the i-th element as an (index, value) pair. No bounds
// checking is performed on `i`.
IdVal<T>
operator[](size_t i) const {
    const ElementProxy& entry = reinterpret_cast<const ElementProxy*>(data_)[i];
    return IdVal<T>(entry.index, entry.value);
}

void
set_at(size_t i, table_t index, T value) {
auto* elem = reinterpret_cast<ElementProxy*>(data_) + i;
elem->index = index;
elem->value = value;
}

// Inner product of this row with `other`. Walks both element arrays in
// lockstep, always advancing the side with the smaller index, and multiplies
// values only where the indices match. This merge assumes both rows are
// stored in ascending index order.
float
dot(const SparseRow<T>& other) const {
    const auto* lhs = reinterpret_cast<const ElementProxy*>(data_);
    const auto* rhs = reinterpret_cast<const ElementProxy*>(other.data_);
    const auto* const lhs_end = lhs + count_;
    const auto* const rhs_end = rhs + other.count_;

    float acc = 0.0f;
    while (lhs != lhs_end && rhs != rhs_end) {
        if (lhs->index == rhs->index) {
            acc += lhs->value * rhs->value;
            ++lhs;
            ++rhs;
        } else if (lhs->index < rhs->index) {
            ++lhs;
        } else {
            ++rhs;
        }
    }
    return acc;
}

// Exchanges the complete state (element count, buffer pointer, ownership
// flag) of two rows. Declared noexcept because it only swaps scalar members
// and is called from the noexcept move constructor and move assignment.
friend void
swap(SparseRow<T>& left, SparseRow<T>& right) noexcept {
    using std::swap;
    swap(left.count_, right.count_);
    swap(left.data_, right.data_);
    swap(left.own_data_, right.own_data_);
}

// Size in bytes of one stored element: a table_t index plus a T value.
static inline size_t
element_size() {
    return sizeof(T) + sizeof(table_t);
}

private:
// ElementProxy is used to access elements in the data_ array and should
// never be actually constructed.
struct __attribute__((packed)) ElementProxy {
table_t index;
T value;
ElementProxy() = delete;
ElementProxy(const ElementProxy&) = delete;
Copy link
Collaborator

@alexanderguzhva alexanderguzhva Jan 18, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would add ElementProxy& operator=(const ElementProxy&) = delete; as well
On the other hand, I don't understand why it is needed to delete the constructors

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

deleting the default and the copy constructor(so no constructor will be generated) should be sufficient. if no instance can ever be made there is no way to call copy assignment operator.

the reason is this element is used solely to help access what's in the raw data_ pointer, we don't actually have the need to create such objects.

};
// data_ must be sorted by column id. use raw pointer for easy mmap and zero
// copy.
uint8_t* data_;
size_t count_;
bool own_data_;
};

// When pushing new elements into a MaxMinHeap, only `capacity` elements with the
// largest val are kept. pop()/top() returns the smallest element out of them.
template <typename T>
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't we have any similar structures in our code already? I'm just curious, the implementation is fine

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see one that matches exactly the use case.

class MaxMinHeap {
public:
explicit MaxMinHeap(int capacity) : capacity_(capacity), pool_(capacity) {
}
void
push(table_t id, T val) {
if (size_ < capacity_) {
pool_[size_] = {id, val};
size_ += 1;
std::push_heap(pool_.begin(), pool_.begin() + size_, std::greater<IdVal<T>>());
} else if (val > pool_[0].val) {
sift_down(id, val);
}
}
table_t
pop() {
std::pop_heap(pool_.begin(), pool_.begin() + size_, std::greater<IdVal<T>>());
size_ -= 1;
return pool_[size_].id;
}
[[nodiscard]] size_t
size() const {
return size_;
}
[[nodiscard]] bool
empty() const {
return size() == 0;
}
IdVal<T>
top() const {
return pool_[0];
}
[[nodiscard]] bool
full() const {
return size_ == capacity_;
}

private:
void
sift_down(table_t id, T val) {
size_t i = 0;
for (; 2 * i + 1 < size_;) {
size_t j = i;
size_t l = 2 * i + 1, r = 2 * i + 2;
if (pool_[l].val < val) {
j = l;
}
if (r < size_ && pool_[r].val < std::min(pool_[l].val, val)) {
j = r;
}
if (i == j) {
break;
}
pool_[i] = pool_[j];
i = j;
}
pool_[i] = {id, val};
}

size_t size_ = 0, capacity_;
std::vector<IdVal<T>> pool_;
}; // class MaxMinHeap

} // namespace knowhere::sparse
Loading
Loading