Add: Stateful "add many, clear once" allocator
This optimization can drastically reduce memory consumption when
dealing with large collections of low-dimensional embeddings.
In some cases it will double the effective number of vectors
that fit into RAM.
ashvardanian committed Jun 11, 2023
1 parent 3a695e9 commit fb07b53
Showing 4 changed files with 139 additions and 17 deletions.
41 changes: 32 additions & 9 deletions docs/benchmarks.md
@@ -170,15 +170,6 @@ mkdir -p datasets/deep_1B/ && \

## Profiling

Enabling Huge Pages:

```sh
sudo cat /proc/sys/vm/nr_hugepages
sudo sysctl -w vm.nr_hugepages=2048
sudo reboot
sudo cat /proc/sys/vm/nr_hugepages
```

With `perf`:

```sh
@@ -189,3 +180,35 @@ sudo -E perf mem -d ./build_release/bench ...
sudo -E perf record -F 1000 ./build_release/bench ...
perf record -d -e arm_spe// -- ./build_release/bench ..
```

### Caches

```sh
sudo perf stat -e 'faults,dTLB-loads,dTLB-load-misses,cache-misses,cache-references' ./build_release/bench ...
```

Typical output on a 1M-vector dataset:

```txt
255426 faults
305988813388 dTLB-loads
8845723783 dTLB-load-misses # 2.89% of all dTLB cache accesses
20094264206 cache-misses # 6.567 % of all cache refs
305988812745 cache-references
8.285148010 seconds time elapsed
500.705967000 seconds user
1.371118000 seconds sys
```

If you notice performance problems and the miss rates are closer to 90%, it is a good reason to consider enabling Huge Pages and tuning allocation alignment.
To enable Huge Pages:

```sh
sudo cat /proc/sys/vm/nr_hugepages
sudo sysctl -w vm.nr_hugepages=2048
sudo reboot
sudo cat /proc/sys/vm/nr_hugepages
```
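
Once pages are reserved, an application can also request them explicitly for its own mappings. The sketch below is an illustration only, not something the benchmark or the library does, assuming Linux and the default 2 MiB huge-page size:

```cpp
#include <sys/mman.h>

#include <cstddef>
#include <cstdio>

int main() {
    std::size_t length = 2u * 1024u * 1024u; // one 2 MiB huge page
    void* region = mmap(nullptr, length, PROT_READ | PROT_WRITE,
                        MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
    if (region == MAP_FAILED) { // fails while `vm.nr_hugepages` is still zero
        std::perror("mmap(MAP_HUGETLB)");
        return 1;
    }
    // ... place the hottest allocations into `region` ...
    munmap(region, length);
    return 0;
}
```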

22 changes: 15 additions & 7 deletions include/usearch/index.hpp
@@ -1190,14 +1190,16 @@ template <typename metric_at = ip_gt<float>, //
typename label_at = std::size_t, //
typename id_at = std::uint32_t, //
typename scalar_at = float, //
typename allocator_at = std::allocator<char>> //
typename allocator_at = std::allocator<char>, //
typename point_allocator_at = allocator_at> //
class index_gt {
public:
using metric_t = metric_at;
using scalar_t = scalar_at;
using label_t = label_at;
using id_t = id_at;
using allocator_t = allocator_at;
using point_allocator_t = point_allocator_at;

using vector_view_t = span_gt<scalar_t const>;
using distance_t = return_type_gt<metric_t, vector_view_t, vector_view_t>;
@@ -1285,6 +1287,10 @@ class index_gt {
using byte_t = typename allocator_t::value_type;
static_assert(sizeof(byte_t) == 1, "Allocator must allocate separate addressable bytes");

using point_allocator_traits_t = std::allocator_traits<point_allocator_t>;
static_assert(sizeof(typename point_allocator_traits_t::value_type) == 1,
"Allocator must allocate separate addressable bytes");

/**
* @brief How much larger (number of neighbors per node) will
* the base level be compared to other levels.
@@ -1403,6 +1409,7 @@ class index_gt {
index_limits_t limits_{};
metric_t metric_{};
allocator_t allocator_{};
point_allocator_t point_allocator_{};
precomputed_constants_t pre_{};
int viewed_file_descriptor_{};

@@ -1435,10 +1442,11 @@ class index_gt {
* @section Exceptions
* Doesn't throw, unless the ::metric's and ::allocators' copy-constructors throw.
*/
explicit index_gt(index_config_t config = {}, metric_t metric = {}, allocator_t allocator = {}) noexcept
: config_(config), limits_(0, 0), metric_(metric), allocator_(allocator), pre_(precompute_(config)),
viewed_file_descriptor_(0), size_(0u), max_level_(-1), entry_id_(0u), nodes_(nullptr), nodes_mutexes_(),
contexts_(nullptr) {}
explicit index_gt(index_config_t config = {}, metric_t metric = {}, allocator_t allocator = {},
point_allocator_t point_allocator = {}) noexcept
: config_(config), limits_(0, 0), metric_(metric), allocator_(std::move(allocator)),
point_allocator_(std::move(point_allocator)), pre_(precompute_(config)), viewed_file_descriptor_(0),
size_(0u), max_level_(-1), entry_id_(0u), nodes_(nullptr), nodes_mutexes_(), contexts_(nullptr) {}

/**
* @brief Clones the structure with the same hyper-parameters, but without contents.
@@ -2153,7 +2161,7 @@ class index_gt {

node_t& node = nodes_[id];
std::size_t node_bytes = node_bytes_(node) - node_vector_bytes_(node) * !node_stored_(node);
allocator_t{}.deallocate(node.tape(), node_bytes);
point_allocator_.deallocate(node.tape(), node_bytes);
node = node_t{};
}

@@ -2163,7 +2171,7 @@ class index_gt {
std::size_t stored_vector_bytes = node_vector_bytes_(dim) * store_vector;
std::size_t node_bytes = node_bytes_(dim, level) - node_vector_bytes_(dim) * !store_vector;

byte_t* data = (byte_t*)allocator_t{}.allocate(node_bytes);
byte_t* data = (byte_t*)point_allocator_.allocate(node_bytes);
if (!data)
return {};

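For orientation, a minimal sketch of how the extra `point_allocator_at` parameter could be plugged in. It is not part of the diff; the `unum::usearch` namespace, the header paths, and the construction pattern are assumptions based on the surrounding context:

```cpp
#include <usearch/index.hpp>
#include <usearch/index_punned_helpers.hpp> // assumed location of memory_mapping_allocator_t

using namespace unum::usearch; // assumption: usearch's top-level namespace

// The general-purpose allocator_at keeps serving the rest of the index,
// while every node tape is carved from the arena-style point allocator.
using arena_index_t = index_gt<  //
    ip_gt<float>,                // metric_at
    std::size_t,                 // label_at
    std::uint32_t,               // id_at
    float,                       // scalar_at
    std::allocator<char>,        // allocator_at
    memory_mapping_allocator_t>; // point_allocator_at, new in this commit

int main() {
    arena_index_t index(index_config_t{}, ip_gt<float>{}, std::allocator<char>{},
                        memory_mapping_allocator_t{});
    (void)index; // nodes added later will be carved out of mmap-ed arenas
    return 0;
}
```
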
2 changes: 1 addition & 1 deletion include/usearch/index_punned_dense.hpp
@@ -68,7 +68,7 @@ class index_punned_dense_gt {
/// @brief Schema: input buffer, bytes in input buffer, output buffer.
using cast_t = std::function<bool(byte_t const*, std::size_t, byte_t*)>;
/// @brief Punned index.
using index_t = index_gt<metric_t, label_t, id_t, byte_t, aligned_allocator_t>;
using index_t = index_gt<metric_t, label_t, id_t, byte_t, aligned_allocator_t, memory_mapping_allocator_t>;
using index_allocator_t = aligned_allocator_gt<index_t, 64>;

/// @brief A type-punned metric and metadata about present isa support.
91 changes: 91 additions & 0 deletions include/usearch/index_punned_helpers.hpp
@@ -368,6 +368,97 @@ class aligned_allocator_gt {

using aligned_allocator_t = aligned_allocator_gt<>;

#if !defined(USEARCH_IS_WINDOWS)

/**
* @brief Memory-mapping allocator designed for "alloc many, free at once" usage patterns.
* Thread-safe.
*
* Using this memory allocator won't affect your overall speed much, as that is not the bottleneck.
* However, it can drastically improve memory usage, especially for huge indexes of small vectors.
*/
template <std::size_t alignment_ak = 1> class memory_mapping_allocator_gt {

static constexpr std::size_t min_size() { return 1024 * 1024 * 4; }
static constexpr std::size_t head_size() {
/// Pointer to the previous arena and the size of the current one.
return divide_round_up<alignment_ak>(sizeof(byte_t*) + sizeof(std::size_t)) * alignment_ak;
}

std::mutex mutex_;
byte_t* last_arena_ = nullptr;
std::size_t last_usage_ = head_size();
std::size_t last_capacity_ = min_size();

void reset() noexcept {
byte_t* last_arena = last_arena_;
while (last_arena) {
byte_t* previous_arena;
std::memcpy(&previous_arena, last_arena, sizeof(byte_t*));
std::size_t current_size;
std::memcpy(&current_size, last_arena + sizeof(byte_t*), sizeof(std::size_t));
munmap(last_arena, current_size);
last_arena = previous_arena;
}

// Clear the references:
last_arena_ = nullptr;
last_usage_ = head_size();
last_capacity_ = min_size();
}

public:
using value_type = byte_t;
using size_type = std::size_t;
using pointer = byte_t*;
using const_pointer = byte_t const*;

memory_mapping_allocator_gt() = default;
memory_mapping_allocator_gt(memory_mapping_allocator_gt&& other) noexcept
: last_arena_(other.last_arena_), last_usage_(other.last_usage_), last_capacity_(other.last_capacity_) {}
memory_mapping_allocator_gt& operator=(memory_mapping_allocator_gt&& other) noexcept {
std::swap(last_arena_, other.last_arena_);
std::swap(last_usage_, other.last_usage_);
std::swap(last_capacity_, other.last_capacity_);
return *this;
}

~memory_mapping_allocator_gt() noexcept { reset(); }

inline byte_t* allocate(std::size_t count_bytes) noexcept {
count_bytes = divide_round_up<alignment_ak>(count_bytes) * alignment_ak;

std::unique_lock<std::mutex> lock(mutex_);
if (!last_arena_ || (last_usage_ + count_bytes > last_capacity_)) {
std::size_t new_capacity = last_capacity_ * 2;
int prot = PROT_WRITE | PROT_READ;
int flags = MAP_PRIVATE | MAP_ANONYMOUS;
byte_t* new_arena = (byte_t*)mmap(NULL, new_capacity, prot, flags, 0, 0);
std::memcpy(new_arena, &last_arena_, sizeof(byte_t*));
std::memcpy(new_arena + sizeof(byte_t*), &new_capacity, sizeof(std::size_t));

last_arena_ = new_arena;
last_capacity_ = new_capacity;
last_usage_ = head_size();
}

return last_arena_ + exchange(last_usage_, last_usage_ + count_bytes);
}

/**
* @warning The very first memory de-allocation discards all the arenas!
*/
void deallocate(byte_t* = nullptr, std::size_t = 0) noexcept { reset(); }
};

using memory_mapping_allocator_t = memory_mapping_allocator_gt<>;

#else

using memory_mapping_allocator_t = aligned_allocator_t;

#endif

template <typename from_scalar_at, typename to_scalar_at> struct cast_gt {
inline bool operator()(byte_t const* input, std::size_t bytes_in_input, byte_t* output) const {
from_scalar_at const* typed_input = reinterpret_cast<from_scalar_at const*>(input);
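
To make the new allocator's "alloc many, free at once" contract concrete, here is a small usage sketch. It is not part of the commit; the include path and the `unum::usearch` namespace are assumptions, and real node sizes come from `node_bytes_` rather than the arbitrary constant used here:

```cpp
#include <usearch/index_punned_helpers.hpp>

#include <cstddef>

using namespace unum::usearch; // assumption: usearch's top-level namespace

int main() {
    {
        memory_mapping_allocator_t arena;

        // Thousands of small node tapes are carved out of a handful of large
        // mmap-ed arenas, avoiding per-allocation headers and fragmentation.
        for (std::size_t i = 0; i != 10000; ++i) {
            auto* tape = arena.allocate(192); // arbitrary small node size
            if (!tape)
                return 1;
            tape[0] = 0; // ... fill the node in-place ...
        }

    } // leaving the scope (like the first deallocate() call) unmaps every arena at once
    return 0;
}
```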
