Add: Stateful "add many, clear once" allocator
This optimization can drastically reduce memory consumption when
dealing with large collections of low-dimensional embeddings.
In some cases it will double the effective number of vectors
that fit into RAM.
ashvardanian committed Jun 11, 2023
1 parent 3a695e9 commit fb07b53
Showing 4 changed files with 139 additions and 17 deletions.
41 changes: 32 additions & 9 deletions docs/benchmarks.md
@@ -170,15 +170,6 @@ mkdir -p datasets/deep_1B/ && \

## Profiling

Enabling Huge Pages:

```sh
sudo cat /proc/sys/vm/nr_hugepages
sudo sysctl -w vm.nr_hugepages=2048
sudo reboot
sudo cat /proc/sys/vm/nr_hugepages
```

With `perf`:

```sh
@@ -189,3 +180,35 @@ sudo -E perf mem -d ./build_release/bench ...
sudo -E perf record -F 1000 ./build_release/bench ...
perf record -d -e arm_spe// -- ./build_release/bench ..
```

### Caches

```sh
sudo perf stat -e 'faults,dTLB-loads,dTLB-load-misses,cache-misses,cache-references' ./build_release/bench ...
```

Typical output on a 1M-vector dataset:

```txt
255426 faults
305988813388 dTLB-loads
8845723783 dTLB-load-misses # 2.89% of all dTLB cache accesses
20094264206 cache-misses # 6.567 % of all cache refs
305988812745 cache-references
8.285148010 seconds time elapsed
500.705967000 seconds user
1.371118000 seconds sys
```

If you notice performance problems and the miss rates are closer to 90%, it is a good reason to consider enabling Huge Pages and tuning allocation alignment.
To enable Huge Pages:

```sh
sudo cat /proc/sys/vm/nr_hugepages
sudo sysctl -w vm.nr_hugepages=2048
sudo reboot
sudo cat /proc/sys/vm/nr_hugepages
```
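
Once pages are reserved, an application can also request them explicitly for its own mappings. The sketch below is an illustration only, not something the benchmark or the library does, assuming Linux and the default 2 MiB huge-page size:

```cpp
#include <sys/mman.h>

#include <cstddef>
#include <cstdio>

int main() {
    std::size_t length = 2u * 1024u * 1024u; // one 2 MiB huge page
    void* region = mmap(nullptr, length, PROT_READ | PROT_WRITE,
                        MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
    if (region == MAP_FAILED) { // fails while `vm.nr_hugepages` is still zero
        std::perror("mmap(MAP_HUGETLB)");
        return 1;
    }
    // ... place the hottest allocations into `region` ...
    munmap(region, length);
    return 0;
}
```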

22 changes: 15 additions & 7 deletions include/usearch/index.hpp
@@ -1190,14 +1190,16 @@ template <typename metric_at = ip_gt<float>, //
typename label_at = std::size_t, //
typename id_at = std::uint32_t, //
typename scalar_at = float, //
typename allocator_at = std::allocator<char>> //
typename allocator_at = std::allocator<char>, //
typename point_allocator_at = allocator_at> //
class index_gt {
public:
using metric_t = metric_at;
using scalar_t = scalar_at;
using label_t = label_at;
using id_t = id_at;
using allocator_t = allocator_at;
using point_allocator_t = point_allocator_at;

using vector_view_t = span_gt<scalar_t const>;
using distance_t = return_type_gt<metric_t, vector_view_t, vector_view_t>;
@@ -1285,6 +1287,10 @@ class index_gt {
using byte_t = typename allocator_t::value_type;
static_assert(sizeof(byte_t) == 1, "Allocator must allocate separate addressable bytes");

using point_allocator_traits_t = std::allocator_traits<point_allocator_t>;
static_assert(sizeof(typename point_allocator_traits_t::value_type) == 1,
"Allocator must allocate separate addressable bytes");

/**
* @brief How much larger (number of neighbors per node) will
* the base level be compared to other levels.
@@ -1403,6 +1409,7 @@ class index_gt {
index_limits_t limits_{};
metric_t metric_{};
allocator_t allocator_{};
point_allocator_t point_allocator_{};
precomputed_constants_t pre_{};
int viewed_file_descriptor_{};

@@ -1435,10 +1442,11 @@ class index_gt {
* @section Exceptions
* Doesn't throw, unless the ::metric's and ::allocators' copy-constructors throw.
*/
explicit index_gt(index_config_t config = {}, metric_t metric = {}, allocator_t allocator = {}) noexcept
: config_(config), limits_(0, 0), metric_(metric), allocator_(allocator), pre_(precompute_(config)),
viewed_file_descriptor_(0), size_(0u), max_level_(-1), entry_id_(0u), nodes_(nullptr), nodes_mutexes_(),
contexts_(nullptr) {}
explicit index_gt(index_config_t config = {}, metric_t metric = {}, allocator_t allocator = {},
point_allocator_t point_allocator = {}) noexcept
: config_(config), limits_(0, 0), metric_(metric), allocator_(std::move(allocator)),
point_allocator_(std::move(point_allocator)), pre_(precompute_(config)), viewed_file_descriptor_(0),
size_(0u), max_level_(-1), entry_id_(0u), nodes_(nullptr), nodes_mutexes_(), contexts_(nullptr) {}

/**
* @brief Clones the structure with the same hyper-parameters, but without contents.
@@ -2153,7 +2161,7 @@ class index_gt {

node_t& node = nodes_[id];
std::size_t node_bytes = node_bytes_(node) - node_vector_bytes_(node) * !node_stored_(node);
allocator_t{}.deallocate(node.tape(), node_bytes);
point_allocator_.deallocate(node.tape(), node_bytes);
node = node_t{};
}

@@ -2163,7 +2171,7 @@ class index_gt {
std::size_t stored_vector_bytes = node_vector_bytes_(dim) * store_vector;
std::size_t node_bytes = node_bytes_(dim, level) - node_vector_bytes_(dim) * !store_vector;

byte_t* data = (byte_t*)allocator_t{}.allocate(node_bytes);
byte_t* data = (byte_t*)point_allocator_.allocate(node_bytes);
if (!data)
return {};

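For orientation, a minimal sketch of how the extra `point_allocator_at` parameter could be plugged in. It is not part of the diff; the `unum::usearch` namespace, the header paths, and the construction pattern are assumptions based on the surrounding context:

```cpp
#include <usearch/index.hpp>
#include <usearch/index_punned_helpers.hpp> // assumed location of memory_mapping_allocator_t

using namespace unum::usearch; // assumption: usearch's top-level namespace

// The general-purpose allocator_at keeps serving the rest of the index,
// while every node tape is carved from the arena-style point allocator.
using arena_index_t = index_gt<  //
    ip_gt<float>,                // metric_at
    std::size_t,                 // label_at
    std::uint32_t,               // id_at
    float,                       // scalar_at
    std::allocator<char>,        // allocator_at
    memory_mapping_allocator_t>; // point_allocator_at, new in this commit

int main() {
    arena_index_t index(index_config_t{}, ip_gt<float>{}, std::allocator<char>{},
                        memory_mapping_allocator_t{});
    (void)index; // nodes added later will be carved out of mmap-ed arenas
    return 0;
}
```
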
2 changes: 1 addition & 1 deletion include/usearch/index_punned_dense.hpp
@@ -68,7 +68,7 @@ class index_punned_dense_gt {
/// @brief Schema: input buffer, bytes in input buffer, output buffer.
using cast_t = std::function<bool(byte_t const*, std::size_t, byte_t*)>;
/// @brief Punned index.
using index_t = index_gt<metric_t, label_t, id_t, byte_t, aligned_allocator_t>;
using index_t = index_gt<metric_t, label_t, id_t, byte_t, aligned_allocator_t, memory_mapping_allocator_t>;
using index_allocator_t = aligned_allocator_gt<index_t, 64>;

/// @brief A type-punned metric and metadata about present isa support.
91 changes: 91 additions & 0 deletions include/usearch/index_punned_helpers.hpp
@@ -368,6 +368,97 @@ class aligned_allocator_gt {

using aligned_allocator_t = aligned_allocator_gt<>;

#if !defined(USEARCH_IS_WINDOWS)

/**
* @brief Memory-mapping allocator designed for "alloc many, free at once" usage patterns.
* Thread-safe.
*
* Using this memory allocator won't affect your overall speed much, as that is not the bottleneck.
* However, it can drastically improve memory usage, especially for huge indexes of small vectors.
*/
template <std::size_t alignment_ak = 1> class memory_mapping_allocator_gt {

static constexpr std::size_t min_size() { return 1024 * 1024 * 4; }
static constexpr std::size_t head_size() {
/// Pointer to the previous arena and the size of the current one.
return divide_round_up<alignment_ak>(sizeof(byte_t*) + sizeof(std::size_t)) * alignment_ak;
}

std::mutex mutex_;
byte_t* last_arena_ = nullptr;
std::size_t last_usage_ = head_size();
std::size_t last_capacity_ = min_size();

void reset() noexcept {
byte_t* last_arena = last_arena_;
while (last_arena) {
byte_t* previous_arena;
std::memcpy(&previous_arena, last_arena, sizeof(byte_t*));
std::size_t current_size;
std::memcpy(&current_size, last_arena + sizeof(byte_t*), sizeof(std::size_t));
munmap(last_arena, current_size);
last_arena = previous_arena;
}

// Clear the references:
last_arena_ = nullptr;
last_usage_ = head_size();
last_capacity_ = min_size();
}

public:
using value_type = byte_t;
using size_type = std::size_t;
using pointer = byte_t*;
using const_pointer = byte_t const*;

memory_mapping_allocator_gt() = default;
memory_mapping_allocator_gt(memory_mapping_allocator_gt&& other) noexcept
: last_arena_(other.last_arena_), last_usage_(other.last_usage_), last_capacity_(other.last_capacity_) {}
memory_mapping_allocator_gt& operator=(memory_mapping_allocator_gt&& other) noexcept {
std::swap(last_arena_, other.last_arena_);
std::swap(last_usage_, other.last_usage_);
std::swap(last_capacity_, other.last_capacity_);
return *this;
}

~memory_mapping_allocator_gt() noexcept { reset(); }

inline byte_t* allocate(std::size_t count_bytes) noexcept {
count_bytes = divide_round_up<alignment_ak>(count_bytes) * alignment_ak;

std::unique_lock<std::mutex> lock(mutex_);
if (!last_arena_ || (last_usage_ + count_bytes > last_capacity_)) {
std::size_t new_capacity = last_capacity_ * 2;
int prot = PROT_WRITE | PROT_READ;
int flags = MAP_PRIVATE | MAP_ANONYMOUS;
byte_t* new_arena = (byte_t*)mmap(NULL, new_capacity, prot, flags, 0, 0);
std::memcpy(new_arena, &last_arena_, sizeof(byte_t*));
std::memcpy(new_arena + sizeof(byte_t*), &new_capacity, sizeof(std::size_t));

last_arena_ = new_arena;
last_capacity_ = new_capacity;
last_usage_ = head_size();
}

return last_arena_ + exchange(last_usage_, last_usage_ + count_bytes);
}

/**
* @warning The very first memory de-allocation discards all the arenas!
*/
void deallocate(byte_t* = nullptr, std::size_t = 0) noexcept { reset(); }
};

using memory_mapping_allocator_t = memory_mapping_allocator_gt<>;

#else

using memory_mapping_allocator_t = aligned_allocator_t;

#endif

template <typename from_scalar_at, typename to_scalar_at> struct cast_gt {
inline bool operator()(byte_t const* input, std::size_t bytes_in_input, byte_t* output) const {
from_scalar_at const* typed_input = reinterpret_cast<from_scalar_at const*>(input);
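
To make the new allocator's "alloc many, free at once" contract concrete, here is a small usage sketch. It is not part of the commit; the include path and the `unum::usearch` namespace are assumptions, and real node sizes come from `node_bytes_` rather than the arbitrary constant used here:

```cpp
#include <usearch/index_punned_helpers.hpp>

#include <cstddef>

using namespace unum::usearch; // assumption: usearch's top-level namespace

int main() {
    {
        memory_mapping_allocator_t arena;

        // Thousands of small node tapes are carved out of a handful of large
        // mmap-ed arenas, avoiding per-allocation headers and fragmentation.
        for (std::size_t i = 0; i != 10000; ++i) {
            auto* tape = arena.allocate(192); // arbitrary small node size
            if (!tape)
                return 1;
            tape[0] = 0; // ... fill the node in-place ...
        }

    } // leaving the scope (like the first deallocate() call) unmaps every arena at once
    return 0;
}
```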
