Break: Index -> DenseIndex for dense vectors

unum-cloud · Sep 25, 2023 · 9c8ddb6
1 parent 49c4c54
commit 9c8ddb6
Show file tree

Hide file tree

Showing 50 changed files with 561 additions and 551 deletions.
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -107,6 +107,7 @@
     "cSpell.words": [
         "allclose",
         "arange",
+        "Arxiv",
         "astype",
         "Availible",
         "bidict",
@@ -140,16 +141,20 @@
         "Println",
         "pytest",
         "Quickstart",
+        "rdkit",
         "rtype",
+        "SIMD",
         "simsimd",
         "SLOC",
         "sorensen",
+        "Streamlit",
         "tanimoto",
         "tqdm",
         "uninitialize",
         "unumusearch",
         "usearch",
         "usecases",
+        "Vardanian",
         "Xunit"
     ],
     "autoDocstring.docstringFormat": "sphinx",

diff --git a/Package.swift b/Package.swift
@@ -27,13 +27,13 @@ let package = Package(
             dependencies: ["USearchObjective"],
             path: "swift",
             exclude: ["README.md", "Test.swift"],
-            sources: ["USearch.swift", "Index+Sugar.swift"]
+            sources: ["USearch.swift", "DenseIndex+Sugar.swift"]
         ),
         .testTarget(
             name: "USearchTests",
             dependencies: ["USearch"],
             path: "swift",
-            exclude: ["USearch.swift", "Index+Sugar.swift", "README.md"],
+            exclude: ["USearch.swift", "DenseIndex+Sugar.swift", "README.md"],
             sources: ["Test.swift"]
         )
     ],

diff --git a/README.md b/README.md
@@ -86,9 +86,9 @@ Base functionality is identical to FAISS, and the interface must be familiar if
 $ pip install usearch
 
 import numpy as np
-from usearch.index import Index
+from usearch.index import DenseIndex
 
-index = Index(
+index = DenseIndex(
     ndim=3, # Define the number of dimensions in input vectors
     metric='cos', # Choose 'l2sq', 'haversine' or other metric, default = 'ip'
     dtype='f32', # Quantize to 'f16' or 'i8' if needed, default = 'f32'
@@ -155,7 +155,7 @@ Instead, we have focused on high-precision arithmetic over low-precision downcas
 The same index, and `add` and `search` operations will automatically down-cast or up-cast between `f32_t`, `f16_t`, `f64_t`, and `i8_t` representations, even if the hardware doesn't natively support it.
 Continuing the topic of memory efficiency, we provide a `uint40_t` to allow collection with over 4B+ vectors without allocating 8 bytes for every neighbor reference in the proximity graph.
 
-## Serialization & Serving `Index` from Disk
+## Serialization & Serving `DenseIndex` from Disk
 
 USearch supports multiple forms of serialization:
 
@@ -170,9 +170,9 @@ This can result in __20x cost reduction__ on AWS and other public clouds.
 index.save("index.usearch")
 
 loaded_copy = index.load("index.usearch")
-view = Index.restore("index.usearch", view=True)
+view = DenseIndex.restore("index.usearch", view=True)
 
-other_view = Index(ndim=..., metric=CompiledMetric(...))
+other_view = DenseIndex(ndim=..., metric=CompiledMetric(...))
 other_view.view("index.usearch")
 ```
 
@@ -200,7 +200,7 @@ When compared to FAISS's `IndexFlatL2` in Google Colab, __[USearch may offer up
 - `faiss.IndexFlatL2`: __55.3 ms__.
 - `usearch.index.search`: __2.54 ms__.
 
-## `Indexes` for Multi-Index Lookups
+## `Indexes` for Multi-DenseIndex Lookups
 
 For larger workloads targeting billions or even trillions of vectors, parallel multi-index lookups become invaluable.
 These lookups prevent the need to construct a single, massive index, allowing users to query multiple smaller ones instead.
@@ -209,7 +209,7 @@ These lookups prevent the need to construct a single, massive index, allowing us
 from usearch.index import Indexes
 
 multi_index = Indexes(
-    indexes: Iterable[usearch.index.Index] = [...],
+    indexes: Iterable[usearch.index.DenseIndex] = [...],
     paths: Iterable[os.PathLike] = [...],
     view: bool = False,
     threads: int = 0,
@@ -220,7 +220,7 @@ multi_index.search(...)
 ## Clustering
 
 Once the index is constructed, it can be used to cluster entries much faster.
-In essense, the `Index` itself can be seen as a clustering, and it allows iterative deepening.
+In essence, the `DenseIndex` itself can be seen as a clustering, and it allows iterative deepening.
 
 ```py
 clustering = index.cluster(
@@ -261,8 +261,8 @@ Using USearch one can implement sub-quadratic complexity approximate, fuzzy, and
 This can come in handy in any fuzzy-matching tasks, common to Database Management Software.
 
 ```py
-men = Index(...)
-women = Index(...)
+men = DenseIndex(...)
+women = DenseIndex(...)
 pairs: dict = men.join(women, max_proposals=0, exact=False)
 ```
 
@@ -301,7 +301,7 @@ import PIL as pil
 
 server = ucall.Server()
 model = uform.get_model('unum-cloud/uform-vl-multilingual')
-index = usearch.index.Index(ndim=256)
+index = usearch.index.DenseIndex(ndim=256)
 
 @server
 def add(key: int, photo: pil.Image.Image):
@@ -346,7 +346,7 @@ The latter are searchable with bitwise similarity metrics, like the Tanimoto coe
 Below is an example using the RDKit package.
 
 ```python
-from usearch.index import Index, MetricKind
+from usearch.index import DenseIndex, MetricKind
 from rdkit import Chem
 from rdkit.Chem import AllChem
 
@@ -358,7 +358,7 @@ encoder = AllChem.GetRDKitFPGenerator()
 fingerprints = np.vstack([encoder.GetFingerprint(x) for x in molecules])
 fingerprints = np.packbits(fingerprints, axis=1)
 
-index = Index(ndim=2048, metric=MetricKind.Tanimoto)
+index = DenseIndex(ndim=2048, metric=MetricKind.Tanimoto)
 keys = np.arange(len(molecules))
 
 index.add(keys, fingerprints)

diff --git a/c/test.c b/c/test.c
@@ -1,18 +1,16 @@
-#include <stdio.h>
 #include <errno.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <sys/stat.h>
 
 #include "usearch.h"
 
-#define ASSERT(must_be_true, message)           \
-    if (!(must_be_true)) {                      \
-        printf("Assert: %s\n", message);        \
-        exit(-1);                               \
+#define ASSERT(must_be_true, message)                                                                                  \
+    if (!(must_be_true)) {                                                                                             \
+        printf("Assert: %s\n", message);                                                                               \
+        exit(-1);                                                                                                      \
     }
 
-
 float* create_vectors(size_t count, size_t dimensions) {
     float* data = (float*)malloc(count * dimensions * sizeof(float));
     ASSERT(data, "Failed to allocate memory");
@@ -34,7 +32,7 @@ usearch_init_options_t create_options(size_t dimensions) {
 }
 
 void test_init(size_t collection_size, size_t dimensions) {
-    printf("Test: Index Initialization...\n");
+    printf("Test: DenseIndex Initialization...\n");
 
     // Init index
     usearch_error_t error = NULL;
@@ -64,7 +62,7 @@ void test_init(size_t collection_size, size_t dimensions) {
     usearch_free(idx, &error);
     ASSERT(!error, error);
 
-    printf("Test: Index Initialization - PASSED\n");
+    printf("Test: DenseIndex Initialization - PASSED\n");
 }
 
 void test_add_vector(size_t collection_size, size_t dimensions) {
@@ -122,8 +120,9 @@ void test_find_vector(size_t collection_size, size_t dimensions) {
 
     // Find the vectors
     for (size_t i = 0; i < collection_size; i++) {
-        const void *query_vector = data + i * dimensions;
-        size_t found_count = usearch_search(idx, query_vector, usearch_scalar_f32_k, results_count, keys, distances, &error);
+        const void* query_vector = data + i * dimensions;
+        size_t found_count =
+            usearch_search(idx, query_vector, usearch_scalar_f32_k, results_count, keys, distances, &error);
         ASSERT(!error, error);
         ASSERT(found_count = results_count, "Vector is missing");
     }

diff --git a/cpp/bench.cpp b/cpp/bench.cpp
@@ -547,7 +547,7 @@ int main(int argc, char** argv) {
         (option("-o", "--output") & value("path", args.path_output)).doc(".usearch output file path"),
         (option("-b", "--big").set(args.big)).doc("Will switch to uint40_t for neighbors lists with over 4B entries"),
         (option("-j", "--threads") & value("integer", args.threads)).doc("Uses all available cores by default"),
-        (option("-c", "--connectivity") & value("integer", args.connectivity)).doc("Index granularity"),
+        (option("-c", "--connectivity") & value("integer", args.connectivity)).doc("DenseIndex granularity"),
         (option("--expansion-add") & value("integer", args.expansion_add)).doc("Affects indexing depth"),
         (option("--expansion-search") & value("integer", args.expansion_search)).doc("Affects search depth"),
         (option("--rows-skip") & value("integer", args.vectors_to_skip)).doc("Number of vectors to skip"),
@@ -609,7 +609,7 @@ int main(int argc, char** argv) {
     limits.threads_add = limits.threads_search = args.threads;
     limits.members = dataset.vectors_count();
 
-    std::printf("- Index: \n");
+    std::printf("- DenseIndex: \n");
     std::printf("-- Connectivity: %zu\n", config.connectivity);
     std::printf("-- Expansion @ Add: %zu\n", config.expansion_add);
     std::printf("-- Expansion @ Search: %zu\n", config.expansion_search);

diff --git a/csharp/README.md b/csharp/README.md
@@ -12,7 +12,7 @@ dotnet add package Cloud.Unum.USearch
 using System.Diagnostics;
 using Cloud.Unum.USearch;
 
-using var index = new USearchIndex(
+using var index = new USearchDenseIndex(
     metricKind: MetricKind.Cos, // Choose cosine metric
     quantization: ScalarKind.Float32, // Only quantization to Float32, Float64 is currently supported
     dimensions: 3,  // Define the number of dimensions in input vectors
@@ -37,10 +37,10 @@ Trace.Assert(distances[0] <= 0.001f);
 index.Save("index.usearch")
 
 // Copy the whole index into memory
-using var indexLoaded = new USearchIndex("index.usearch");
+using var indexLoaded = new USearchDenseIndex("index.usearch");
 
 // Or view from disk without loading in memory
-// using var indexLoaded = new USearchIndex("index.usearch", view: true);
+// using var indexLoaded = new USearchDenseIndex("index.usearch", view: true);
 
 Trace.Assert(indexLoaded.Size() == 1);
 Trace.Assert(indexLoaded.Dimensions() == 3);
@@ -53,7 +53,7 @@ Trace.Assert(indexLoaded.Contains(42));
 Adding a batch of entries is identical to adding a single vector.
 
 ```csharp
-using var index = new USearchIndex(MetricKind.Cos, ScalarKind.Float32, dimensions: 3);
+using var index = new USearchDenseIndex(MetricKind.Cos, ScalarKind.Float32, dimensions: 3);
 
 // Generate keys and random vectors
 int n = 100;