/
index.hpp
3193 lines (2728 loc) 路 122 KB
/
index.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/**
* @file index.hpp
* @author Ashot Vardanian
* @brief Single-header Vector Search.
* @date 2023-04-26
*
* @copyright Copyright (c) 2023
*/
#ifndef UNUM_USEARCH_HPP
#define UNUM_USEARCH_HPP
#define USEARCH_VERSION_MAJOR 0
#define USEARCH_VERSION_MINOR 0
#define USEARCH_VERSION_PATCH 0
// Inferring C++ version
#if ((defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) || __cplusplus >= 201703L)
#define USEARCH_DEFINED_CPP17
#endif
// Inferring target OS
#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
#define USEARCH_DEFINED_WINDOWS
#elif defined(__APPLE__) && defined(__MACH__)
#define USEARCH_DEFINED_APPLE
#elif defined(__linux__)
#define USEARCH_DEFINED_LINUX
#endif
// Inferring the compiler
#if defined(__clang__)
#define USEARCH_DEFINED_CLANG
#elif defined(__GNUC__)
#define USEARCH_DEFINED_GCC
#endif
// Inferring hardware architecture: x86 vs Arm
#if defined(__x86_64__)
#define USEARCH_DEFINED_X86
#elif defined(__aarch64__)
#define USEARCH_DEFINED_ARM
#endif
#if !defined(USEARCH_USE_OPENMP)
#define USEARCH_USE_OPENMP 0
#endif
// OS-specific includes
#if defined(USEARCH_DEFINED_WINDOWS)
#define _USE_MATH_DEFINES
#define NOMINMAX
#include <Windows.h>
#include <sys/stat.h> // `fstat` for file size
#undef NOMINMAX
#undef _USE_MATH_DEFINES
#else
#include <fcntl.h> // `fallocate`
#include <stdlib.h> // `posix_memalign`
#include <sys/mman.h> // `mmap`
#include <sys/stat.h> // `fstat` for file size
#include <unistd.h> // `open`, `close`
#endif
// STL includes
#include <algorithm> // `std::sort_heap`
#include <atomic> // `std::atomic`
#include <bitset> // `std::bitset`
#include <climits> // `CHAR_BIT`
#include <cmath> // `std::sqrt`
#include <cstring> // `std::memset`
#include <iterator> // `std::reverse_iterator`
#include <mutex> // `std::unique_lock` - replacement candidate
#include <random> // `std::default_random_engine` - replacement candidate
#include <stdexcept> // `std::runtime_exception`
#include <thread> // `std::thread`
#include <utility> // `std::pair`
// Prefetching
#if defined(USEARCH_DEFINED_GCC)
// https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html
// Zero means we are only going to read from that memory.
// Three means high temporal locality and suggests to keep
// the data in all layers of cache.
#define prefetch_m(ptr) __builtin_prefetch((void*)(ptr), 0, 3)
#elif defined(USEARCH_DEFINED_X86)
#define prefetch_m(ptr) _mm_prefetch((void*)(ptr), _MM_HINT_T0)
#else
#define prefetch_m(ptr)
#endif
// Alignment
#if defined(USEARCH_DEFINED_WINDOWS)
#define usearch_pack_m
#define usearch_align_m __declspec(align(64))
#else
#define usearch_pack_m __attribute__((packed))
#define usearch_align_m __attribute__((aligned(64)))
#endif
// Debugging
#if defined(NDEBUG)
#define usearch_assert_m(must_be_true, message)
#define usearch_noexcept_m noexcept
#else
#define usearch_assert_m(must_be_true, message) \
if (!(must_be_true)) { \
throw std::runtime_error(message); \
}
#define usearch_noexcept_m
#endif
namespace unum {
namespace usearch {
using f64_t = double;
using f32_t = float;
using byte_t = char;
enum b1x8_t : unsigned char {};
enum class metric_kind_t : std::uint8_t {
unknown_k = 0,
// Classics:
ip_k = 'i',
cos_k = 'c',
l2sq_k = 'e',
// Custom:
pearson_k = 'p',
haversine_k = 'h',
// Sets:
jaccard_k = 'j',
hamming_k = 'b',
tanimoto_k = 't',
sorensen_k = 's',
};
enum class scalar_kind_t : std::uint8_t {
unknown_k = 0,
f64_k,
f32_k,
f16_k,
f8_k,
b1x8_k,
};
template <typename scalar_at> scalar_kind_t common_scalar_kind() noexcept {
if (std::is_same<scalar_at, f32_t>())
return scalar_kind_t::f32_k;
if (std::is_same<scalar_at, f64_t>())
return scalar_kind_t::f64_k;
if (std::is_same<scalar_at, b1x8_t>())
return scalar_kind_t::b1x8_k;
return scalar_kind_t::unknown_k;
}
template <typename at> at angle_to_radians(at angle) noexcept { return angle * at(3.14159265358979323846) / at(180); }
template <typename at> at square(at value) noexcept { return value * value; }
template <std::size_t multiple_ak> std::size_t divide_round_up(std::size_t num) noexcept {
return (num + multiple_ak - 1) / multiple_ak;
}
inline std::size_t divide_round_up(std::size_t num, std::size_t denominator) noexcept {
return (num + denominator - 1) / denominator;
}
inline std::size_t ceil2(std::size_t v) noexcept {
v--;
v |= v >> 1;
v |= v >> 2;
v |= v >> 4;
v |= v >> 8;
v |= v >> 16;
v |= v >> 32;
v++;
return v;
}
template <typename at> void misaligned_store(void* ptr, at v) noexcept {
static_assert(!std::is_reference<at>::value);
std::memcpy(ptr, &v, sizeof(at));
}
/// @brief Simply dereferencing misaligned pointers can be dangerous.
template <typename at> at misaligned_load(void* ptr) noexcept {
static_assert(!std::is_reference<at>::value);
at v;
std::memcpy(&v, ptr, sizeof(at));
return v;
}
/// @brief The `std::exchange` alternative for C++11.
template <typename at, typename other_at = at> at exchange(at& obj, other_at&& new_value) {
at old_value = std::move(obj);
obj = std::forward<other_at>(new_value);
return old_value;
}
template <typename at> class misaligned_ref_gt {
byte_t* ptr_;
public:
misaligned_ref_gt(byte_t* ptr) noexcept : ptr_(ptr) {}
operator at() const noexcept { return misaligned_load<at>(ptr_); }
misaligned_ref_gt& operator=(at const& v) noexcept {
misaligned_store<at>(ptr_, v);
return *this;
}
void reset(byte_t* ptr) noexcept { ptr_ = ptr; }
};
template <typename at> class misaligned_ptr_gt {
byte_t* ptr_;
public:
using iterator_category = std::random_access_iterator_tag;
using value_type = at;
using difference_type = std::ptrdiff_t;
using pointer = misaligned_ptr_gt<at>;
using reference = misaligned_ref_gt<at>;
reference operator*() const noexcept { return {ptr_}; }
misaligned_ptr_gt(byte_t* ptr) noexcept : ptr_(ptr) {}
misaligned_ptr_gt operator++(int) noexcept { return misaligned_ptr_gt(ptr_ + sizeof(at)); }
misaligned_ptr_gt operator--(int) noexcept { return misaligned_ptr_gt(ptr_ - sizeof(at)); }
misaligned_ptr_gt operator+(difference_type d) noexcept { return misaligned_ptr_gt(ptr_ + d * sizeof(at)); }
misaligned_ptr_gt operator-(difference_type d) noexcept { return misaligned_ptr_gt(ptr_ - d * sizeof(at)); }
misaligned_ptr_gt& operator++() noexcept {
ptr_ += sizeof(at);
return *this;
}
misaligned_ptr_gt& operator--() noexcept {
ptr_ -= sizeof(at);
return *this;
}
misaligned_ptr_gt& operator+=(difference_type d) noexcept {
ptr_ += d * sizeof(at);
return *this;
}
misaligned_ptr_gt& operator-=(difference_type d) noexcept {
ptr_ -= d * sizeof(at);
return *this;
}
bool operator==(misaligned_ptr_gt const& other) noexcept { return ptr_ == other.ptr_; }
bool operator!=(misaligned_ptr_gt const& other) noexcept { return ptr_ != other.ptr_; }
};
/**
* @brief Non-owning memory range view, similar to `std::span`, but for C++11.
*/
template <typename scalar_at> class span_gt {
scalar_at* data_;
std::size_t size_;
public:
span_gt() noexcept : data_(nullptr), size_(0u) {}
span_gt(scalar_at* begin, scalar_at* end) noexcept : data_(begin), size_(end - begin) {}
span_gt(scalar_at* begin, std::size_t size) noexcept : data_(begin), size_(size) {}
scalar_at* data() const noexcept { return data_; }
std::size_t size() const noexcept { return size_; }
scalar_at* begin() const noexcept { return data_; }
scalar_at* end() const noexcept { return data_ + size_; }
operator scalar_at*() const noexcept { return data(); }
};
/**
* @brief Inner (Dot) Product distance.
*/
template <typename scalar_at = float, typename result_at = scalar_at> struct ip_gt {
using scalar_t = scalar_at;
using view_t = span_gt<scalar_t const>;
using result_t = result_at;
using result_type = result_t;
inline metric_kind_t kind() const noexcept { return metric_kind_t::ip_k; }
inline scalar_kind_t scalar_kind() const noexcept { return common_scalar_kind<scalar_t>(); }
inline result_t operator()(view_t a, view_t b) const noexcept { return operator()(a.data(), b.data(), a.size()); }
inline result_t operator()(scalar_t const* a, scalar_t const* b, std::size_t dim) const noexcept {
result_type ab{};
#if USEARCH_USE_OPENMP
#pragma omp simd reduction(+ : ab)
#elif defined(USEARCH_DEFINED_CLANG)
#pragma clang loop vectorize(enable)
#elif defined(USEARCH_DEFINED_GCC)
#pragma GCC ivdep
#endif
for (std::size_t i = 0; i != dim; ++i)
ab += result_t(a[i]) * result_t(b[i]);
return 1 - ab;
}
};
/**
* @brief Cosine (Angular) distance.
* Identical to the Inner Product of normalized vectors.
* Unless you are running on an tiny embedded platform, this metric
* is recommended over `::ip_gt` for low-precision scalars.
*/
template <typename scalar_at = float, typename result_at = scalar_at> struct cos_gt {
using scalar_t = scalar_at;
using view_t = span_gt<scalar_t const>;
using result_t = result_at;
using result_type = result_t;
inline metric_kind_t kind() const noexcept { return metric_kind_t::cos_k; }
inline scalar_kind_t scalar_kind() const noexcept { return common_scalar_kind<scalar_t>(); }
inline result_t operator()(view_t a, view_t b) const noexcept { return operator()(a.data(), b.data(), a.size()); }
inline result_t operator()(scalar_t const* a, scalar_t const* b, std::size_t dim) const noexcept {
result_t ab{}, a2{}, b2{};
#if USEARCH_USE_OPENMP
#pragma omp simd reduction(+ : ab, a2, b2)
#elif defined(USEARCH_DEFINED_CLANG)
#pragma clang loop vectorize(enable)
#elif defined(USEARCH_DEFINED_GCC)
#pragma GCC ivdep
#endif
for (std::size_t i = 0; i != dim; ++i)
ab += result_t(a[i]) * result_t(b[i]), //
a2 += square<result_t>(a[i]), //
b2 += square<result_t>(b[i]);
return (ab != 0) ? (1 - ab / (std::sqrt(a2) * std::sqrt(b2))) : 1;
}
};
/**
* @brief Squared Euclidean (L2) distance.
* Square root is avoided at the end, as it won't affect the ordering.
*/
template <typename scalar_at = float, typename result_at = scalar_at> struct l2sq_gt {
using scalar_t = scalar_at;
using view_t = span_gt<scalar_t const>;
using result_t = result_at;
using result_type = result_t;
inline metric_kind_t kind() const noexcept { return metric_kind_t::l2sq_k; }
inline scalar_kind_t scalar_kind() const noexcept { return common_scalar_kind<scalar_t>(); }
inline result_t operator()(view_t a, view_t b) const noexcept { return operator()(a.data(), b.data(), a.size()); }
inline result_t operator()(scalar_t const* a, scalar_t const* b, std::size_t dim) const noexcept {
result_t ab_deltas_sq{};
#if USEARCH_USE_OPENMP
#pragma omp simd reduction(+ : ab_deltas_sq)
#elif defined(USEARCH_DEFINED_CLANG)
#pragma clang loop vectorize(enable)
#elif defined(USEARCH_DEFINED_GCC)
#pragma GCC ivdep
#endif
for (std::size_t i = 0; i != dim; ++i)
ab_deltas_sq += square(result_t(a[i]) - result_t(b[i]));
return ab_deltas_sq;
}
};
/**
* @brief Hamming distance computes the number of differing bits in
* two arrays of integers. An example would be a textual document,
* tokenized and hashed into a fixed-capacity bitset.
*/
template <typename scalar_at = std::uint64_t, typename result_at = std::size_t> struct hamming_gt {
using scalar_t = scalar_at;
using view_t = span_gt<scalar_t const>;
using result_t = result_at;
using result_type = result_t;
static_assert( //
std::is_unsigned<scalar_t>::value ||
(std::is_enum<scalar_t>::value && std::is_unsigned<typename std::underlying_type<scalar_t>::type>::value),
"Hamming distance requires unsigned integral words");
inline metric_kind_t kind() const noexcept { return metric_kind_t::hamming_k; }
inline scalar_kind_t scalar_kind() const noexcept { return common_scalar_kind<scalar_t>(); }
inline result_t operator()(view_t a, view_t b) const noexcept { return operator()(a.data(), b.data(), a.size()); }
inline result_t operator()(scalar_t const* a, scalar_t const* b, std::size_t words) const noexcept {
constexpr std::size_t bits_per_word_k = sizeof(scalar_t) * CHAR_BIT;
result_t matches{};
#if USEARCH_USE_OPENMP
#pragma omp simd reduction(+ : matches)
#elif defined(USEARCH_DEFINED_CLANG)
#pragma clang loop vectorize(enable)
#elif defined(USEARCH_DEFINED_GCC)
#pragma GCC ivdep
#endif
for (std::size_t i = 0; i != words; ++i)
matches += std::bitset<bits_per_word_k>(a[i] ^ b[i]).count();
return matches;
}
};
/**
* @brief Tanimoto distance is the intersection over bitwise union.
* Often used in chemistry and biology to compare molecular fingerprints.
*/
template <typename scalar_at = std::uint64_t, typename result_at = float> struct tanimoto_gt {
using scalar_t = scalar_at;
using view_t = span_gt<scalar_t const>;
using result_t = result_at;
using result_type = result_t;
static_assert( //
std::is_unsigned<scalar_t>::value ||
(std::is_enum<scalar_t>::value && std::is_unsigned<typename std::underlying_type<scalar_t>::type>::value),
"Tanimoto distance requires unsigned integral words");
static_assert(std::is_floating_point<result_t>::value, "Tanimoto distance will be a fraction");
inline metric_kind_t kind() const noexcept { return metric_kind_t::tanimoto_k; }
inline scalar_kind_t scalar_kind() const noexcept { return common_scalar_kind<scalar_t>(); }
inline result_t operator()(view_t a, view_t b) const noexcept { return operator()(a.data(), b.data(), a.size()); }
inline result_t operator()(scalar_t const* a, scalar_t const* b, std::size_t words) const noexcept {
constexpr std::size_t bits_per_word_k = sizeof(scalar_t) * CHAR_BIT;
result_t and_count{};
result_t or_count{};
#if USEARCH_USE_OPENMP
#pragma omp simd reduction(+ : and_count, or_count)
#elif defined(USEARCH_DEFINED_CLANG)
#pragma clang loop vectorize(enable)
#elif defined(USEARCH_DEFINED_GCC)
#pragma GCC ivdep
#endif
for (std::size_t i = 0; i != words; ++i)
and_count += std::bitset<bits_per_word_k>(a[i] & b[i]).count(),
or_count += std::bitset<bits_per_word_k>(a[i] | b[i]).count();
return 1 - result_t(and_count) / or_count;
}
};
/**
* @brief Sorensen-Dice or F1 distance is the intersection over bitwise union.
* Often used in chemistry and biology to compare molecular fingerprints.
*/
template <typename scalar_at = std::uint64_t, typename result_at = float> struct sorensen_gt {
using scalar_t = scalar_at;
using view_t = span_gt<scalar_t const>;
using result_t = result_at;
using result_type = result_t;
static_assert( //
std::is_unsigned<scalar_t>::value ||
(std::is_enum<scalar_t>::value && std::is_unsigned<typename std::underlying_type<scalar_t>::type>::value),
"Sorensen-Dice distance requires unsigned integral words");
static_assert(std::is_floating_point<result_t>::value, "Sorensen-Dice distance will be a fraction");
inline metric_kind_t kind() const noexcept { return metric_kind_t::sorensen_k; }
inline scalar_kind_t scalar_kind() const noexcept { return common_scalar_kind<scalar_t>(); }
inline result_t operator()(view_t a, view_t b) const noexcept { return operator()(a.data(), b.data(), a.size()); }
inline result_t operator()(scalar_t const* a, scalar_t const* b, std::size_t words) const noexcept {
constexpr std::size_t bits_per_word_k = sizeof(scalar_t) * CHAR_BIT;
result_t and_count{};
result_t any_count{};
#if USEARCH_USE_OPENMP
#pragma omp simd reduction(+ : and_count, any_count)
#elif defined(USEARCH_DEFINED_CLANG)
#pragma clang loop vectorize(enable)
#elif defined(USEARCH_DEFINED_GCC)
#pragma GCC ivdep
#endif
for (std::size_t i = 0; i != words; ++i)
and_count += std::bitset<bits_per_word_k>(a[i] & b[i]).count(),
any_count += std::bitset<bits_per_word_k>(a[i]).count() + std::bitset<bits_per_word_k>(b[i]).count();
return 1 - 2 * result_t(and_count) / any_count;
}
};
/**
* @brief Counts the number of matching elements in two unique sorted sets.
* Can be used to compute the similarity between two textual documents
* using the IDs of tokens present in them.
* Similar to `tanimoto_gt` for dense representations.
*/
template <typename scalar_at = std::int32_t, typename result_at = float> struct jaccard_gt {
using scalar_t = scalar_at;
using view_t = span_gt<scalar_t const>;
using result_t = result_at;
using result_type = result_t;
static_assert(!std::is_floating_point<scalar_t>::value, "Jaccard distance requires integral scalars");
inline metric_kind_t kind() const noexcept { return metric_kind_t::jaccard_k; }
inline scalar_kind_t scalar_kind() const noexcept { return common_scalar_kind<scalar_t>(); }
inline result_t operator()(view_t a, view_t b) const noexcept {
return operator()(a.data(), b.data(), a.size(), b.size());
}
inline result_t operator()( //
scalar_t const* a, scalar_t const* b, std::size_t a_length, std::size_t b_length) const noexcept {
result_t intersection{};
std::size_t i{};
std::size_t j{};
while (i != a_length && j != b_length) {
intersection += a[i] == b[j];
i += a[i] < b[j];
j += a[i] >= b[j];
}
return 1 - intersection / (a_length + b_length - intersection);
}
};
/**
* @brief Counts the number of matching elements in two unique sorted sets.
* Can be used to compute the similarity between two textual documents
* using the IDs of tokens present in them.
*/
template <typename scalar_at = float, typename result_at = float> struct pearson_correlation_gt {
using scalar_t = scalar_at;
using view_t = span_gt<scalar_t const>;
using result_t = result_at;
using result_type = result_t;
inline metric_kind_t kind() const noexcept { return metric_kind_t::pearson_k; }
inline scalar_kind_t scalar_kind() const noexcept { return common_scalar_kind<scalar_t>(); }
inline result_t operator()(view_t a, view_t b) const noexcept { return operator()(a.data(), b.data(), a.size()); }
inline result_t operator()(scalar_t const* a, scalar_t const* b, std::size_t dim) const noexcept {
result_t a_sum{}, b_sum{}, ab_sum{};
result_t a_sq_sum{}, b_sq_sum{};
#if USEARCH_USE_OPENMP
#pragma omp simd reduction(+ : a_sum, b_sum, ab_sum, a_sq_sum, b_sq_sum)
#elif defined(USEARCH_DEFINED_CLANG)
#pragma clang loop vectorize(enable)
#elif defined(USEARCH_DEFINED_GCC)
#pragma GCC ivdep
#endif
for (std::size_t i = 0; i != dim; ++i) {
a_sum += result_t(a[i]);
b_sum += result_t(b[i]);
ab_sum += result_t(a[i]) * result_t(b[i]);
a_sq_sum += result_t(a[i]) * result_t(a[i]);
b_sq_sum += result_t(b[i]) * result_t(b[i]);
}
result_t denom = std::sqrt((dim * a_sq_sum - a_sum * a_sum) * (dim * b_sq_sum - b_sum * b_sum));
result_t corr = (dim * ab_sum - a_sum * b_sum) / denom;
return -corr;
}
};
/**
* @brief Haversine distance for the shortest distance between two nodes on
* the surface of a 3D sphere, defined with latitude and longitude.
*/
template <typename scalar_at = float, typename result_at = scalar_at> struct haversine_gt {
using scalar_t = scalar_at;
using view_t = span_gt<scalar_t const>;
using result_t = result_at;
using result_type = result_t;
static_assert(!std::is_integral<scalar_t>::value, "Latitude and longitude must be floating-node");
inline metric_kind_t kind() const noexcept { return metric_kind_t::haversine_k; }
inline scalar_kind_t scalar_kind() const noexcept { return common_scalar_kind<scalar_t>(); }
inline result_t operator()(scalar_t const* a, scalar_t const* b) const noexcept {
result_t lat_a = a[0], lon_a = a[1];
result_t lat_b = b[0], lon_b = b[1];
result_t lat_delta = angle_to_radians<result_t>(lat_b - lat_a);
result_t lon_delta = angle_to_radians<result_t>(lon_b - lon_a);
result_t converted_lat_a = angle_to_radians<result_t>(lat_a);
result_t converted_lat_b = angle_to_radians<result_t>(lat_b);
result_t x = //
square(std::sin(lat_delta / 2.f)) +
std::cos(converted_lat_a) * std::cos(converted_lat_b) * square(std::sin(lon_delta / 2.f));
return std::atan2(std::sqrt(x), std::sqrt(1.f - x));
}
};
class error_t {
char const* message_{};
public:
error_t(char const* message = nullptr) noexcept : message_(message) {}
error_t& operator=(char const* message) noexcept {
message_ = message;
return *this;
}
error_t(error_t&& other) noexcept : message_(exchange(other.message_, nullptr)) {}
error_t& operator=(error_t&& other) noexcept {
std::swap(message_, other.message_);
return *this;
}
explicit operator bool() const noexcept { return message_ != nullptr; }
char const* what() const noexcept { return message_; }
#if defined(__cpp_exceptions) && (1 == __cpp_exceptions)
~error_t() noexcept(false) {
if (message_)
if (!std::uncaught_exception())
raise();
}
void raise() noexcept(false) {
if (message_)
throw std::runtime_error(exchange(message_, nullptr));
}
#else
~error_t() noexcept { raise(); }
void raise() noexcept {
if (message_)
std::terminate();
}
#endif
};
template <typename result_at> struct expected_gt {
result_at result;
error_t error;
operator result_at&() & {
error.raise();
return result;
}
operator result_at&&() && {
error.raise();
return std::move(result);
}
result_at const& operator*() const noexcept { return result; }
explicit operator bool() const noexcept { return !error; }
expected_gt failed(error_t message) noexcept {
error = std::move(message);
return std::move(*this);
}
};
/**
* @brief Light-weight bitset implementation to track visited nodes during graph traversal.
* Extends basic functionality with atomic operations.
*/
template <typename allocator_at = std::allocator<char>> class visits_bitset_gt {
using allocator_t = allocator_at;
using byte_t = typename allocator_t::value_type;
static_assert(sizeof(byte_t) == 1, "Allocator must allocate separate addressable bytes");
using slot_t = unsigned long;
static constexpr std::size_t bits_per_slot() { return sizeof(slot_t) * CHAR_BIT; }
static constexpr slot_t bits_mask() { return sizeof(slot_t) * CHAR_BIT - 1; }
slot_t* slots_{};
/// @brief Number of slots.
std::size_t count_{};
public:
visits_bitset_gt() noexcept {}
~visits_bitset_gt() noexcept { reset(); }
void clear() noexcept { std::memset(slots_, 0, count_ * sizeof(slot_t)); }
void reset() noexcept {
if (slots_)
allocator_t{}.deallocate((byte_t*)slots_, count_ * sizeof(slot_t));
slots_ = nullptr;
count_ = 0;
}
bool resize(std::size_t capacity) noexcept {
std::size_t count = divide_round_up<bits_per_slot()>(capacity);
if (count <= count_)
return true;
slot_t* slots = (slot_t*)allocator_t{}.allocate(count * sizeof(slot_t));
if (!slots)
return false;
reset();
count_ = count;
slots_ = slots;
clear();
return true;
}
visits_bitset_gt(visits_bitset_gt&& other) noexcept {
slots_ = exchange(other.slots_, nullptr);
count_ = exchange(other.count_, 0);
}
visits_bitset_gt& operator=(visits_bitset_gt&& other) noexcept {
std::swap(slots_, other.slots_);
std::swap(count_, other.count_);
return *this;
}
visits_bitset_gt(visits_bitset_gt const&) = delete;
visits_bitset_gt& operator=(visits_bitset_gt const&) = delete;
inline bool test(std::size_t i) const noexcept { return slots_[i / bits_per_slot()] & (1ul << (i & bits_mask())); }
inline void set(std::size_t i) noexcept { slots_[i / bits_per_slot()] |= (1ul << (i & bits_mask())); }
#if defined(USEARCH_DEFINED_WINDOWS)
inline bool atomic_set(std::size_t i) noexcept {
slot_t mask{1ul << (i & bits_mask())};
return InterlockedOr((long volatile*)&slots_[i / bits_per_slot()], mask) & mask;
}
inline void atomic_reset(std::size_t i) noexcept {
slot_t mask{1ul << (i & bits_mask())};
InterlockedAnd((long volatile*)&slots_[i / bits_per_slot()], ~mask);
}
#else
inline bool atomic_set(std::size_t i) noexcept {
slot_t mask{1ul << (i & bits_mask())};
return __atomic_fetch_or(&slots_[i / bits_per_slot()], mask, __ATOMIC_ACQUIRE) & mask;
}
inline void atomic_reset(std::size_t i) noexcept {
slot_t mask{1ul << (i & bits_mask())};
__atomic_fetch_and(&slots_[i / bits_per_slot()], ~mask, __ATOMIC_RELEASE);
}
#endif
};
using visits_bitset_t = visits_bitset_gt<>;
/**
* @brief Similar to `std::priority_queue`, but allows raw access to underlying
* memory, in case you want to shuffle it or sort. Good for collections
* from 100s to 10'000s elements.
*/
template <typename element_at, //
typename comparator_at = std::less<void>, // <void> is needed before C++14.
typename allocator_at = std::allocator<element_at>> //
class max_heap_gt {
public:
using element_t = element_at;
using comparator_t = comparator_at;
using allocator_t = allocator_at;
using value_type = element_t;
static_assert(std::is_trivially_destructible<element_t>(), "This heap is designed for trivial structs");
static_assert(std::is_trivially_copy_constructible<element_t>(), "This heap is designed for trivial structs");
private:
element_t* elements_;
std::size_t size_;
std::size_t capacity_;
public:
max_heap_gt() noexcept : elements_(nullptr), size_(0), capacity_(0) {}
max_heap_gt(max_heap_gt&& other) noexcept
: elements_(exchange(other.elements_, nullptr)), size_(exchange(other.size_, 0)),
capacity_(exchange(other.capacity_, 0)) {}
max_heap_gt& operator=(max_heap_gt&& other) noexcept {
std::swap(elements_, other.elements_);
std::swap(size_, other.size_);
std::swap(capacity_, other.capacity_);
return *this;
}
max_heap_gt(max_heap_gt const&) = delete;
max_heap_gt& operator=(max_heap_gt const&) = delete;
~max_heap_gt() noexcept { reset(); }
void reset() noexcept {
if (elements_)
allocator_t{}.deallocate(elements_, capacity_);
elements_ = nullptr;
capacity_ = 0;
size_ = 0;
}
inline bool empty() const noexcept { return !size_; }
inline std::size_t size() const noexcept { return size_; }
inline std::size_t capacity() const noexcept { return capacity_; }
/// @brief Selects the largest element in the heap.
/// @return Reference to the stored element.
inline element_t const& top() const noexcept { return elements_[0]; }
inline void clear() noexcept { size_ = 0; }
bool reserve(std::size_t new_capacity) noexcept {
if (new_capacity < capacity_)
return true;
new_capacity = ceil2(new_capacity);
new_capacity = (std::max<std::size_t>)(new_capacity, (std::max<std::size_t>)(capacity_ * 2u, 16u));
auto allocator = allocator_t{};
auto new_elements = allocator.allocate(new_capacity);
if (!new_elements)
return false;
if (elements_) {
std::memcpy(new_elements, elements_, size_ * sizeof(element_t));
allocator.deallocate(elements_, capacity_);
}
elements_ = new_elements;
capacity_ = new_capacity;
return new_elements;
}
bool insert(element_t&& element) noexcept {
if (!reserve(size_ + 1))
return false;
insert_reserved(std::move(element));
return true;
}
inline void insert_reserved(element_t&& element) noexcept {
new (&elements_[size_]) element_t(element);
size_++;
shift_up(size_ - 1);
}
inline element_t pop() noexcept {
element_t result = top();
std::swap(elements_[0], elements_[size_ - 1]);
size_--;
elements_[size_].~element_t();
shift_down(0);
return result;
}
/** @brief Invalidates the "max-heap" property, transforming into ascending range. */
inline void sort_ascending() noexcept { std::sort_heap(elements_, elements_ + size_, &less); }
inline void shrink(std::size_t n) noexcept { size_ = (std::min<std::size_t>)(n, size_); }
inline element_t* data() noexcept { return elements_; }
inline element_t const* data() const noexcept { return elements_; }
private:
inline std::size_t parent_idx(std::size_t i) const noexcept { return (i - 1u) / 2u; }
inline std::size_t left_child_idx(std::size_t i) const noexcept { return (i * 2u) + 1u; }
inline std::size_t right_child_idx(std::size_t i) const noexcept { return (i * 2u) + 2u; }
static bool less(element_t const& a, element_t const& b) noexcept { return comparator_t{}(a, b); }
void shift_up(std::size_t i) noexcept {
for (; i && less(elements_[parent_idx(i)], elements_[i]); i = parent_idx(i))
std::swap(elements_[parent_idx(i)], elements_[i]);
}
void shift_down(std::size_t i) noexcept {
std::size_t max_idx = i;
std::size_t left = left_child_idx(i);
if (left < size_ && less(elements_[max_idx], elements_[left]))
max_idx = left;
std::size_t right = right_child_idx(i);
if (right < size_ && less(elements_[max_idx], elements_[right]))
max_idx = right;
if (i != max_idx) {
std::swap(elements_[i], elements_[max_idx]);
shift_down(max_idx);
}
}
};
/**
* @brief Similar to `std::priority_queue`, but allows raw access to underlying
* memory and always keeps the data sorted. Ideal for small collections
* under 128 elements.
*/
template <typename element_at, //
typename comparator_at = std::less<void>, // <void> is needed before C++14.
typename allocator_at = std::allocator<element_at>> //
class sorted_buffer_gt {
public:
using element_t = element_at;
using comparator_t = comparator_at;
using allocator_t = allocator_at;
static_assert(std::is_trivially_destructible<element_t>(), "This heap is designed for trivial structs");
static_assert(std::is_trivially_copy_constructible<element_t>(), "This heap is designed for trivial structs");
using value_type = element_t;
private:
element_t* elements_;
std::size_t size_;
std::size_t capacity_;
public:
sorted_buffer_gt() noexcept : elements_(nullptr), size_(0), capacity_(0) {}
sorted_buffer_gt(sorted_buffer_gt&& other) noexcept
: elements_(exchange(other.elements_, nullptr)), size_(exchange(other.size_, 0)),
capacity_(exchange(other.capacity_, 0)) {}
sorted_buffer_gt& operator=(sorted_buffer_gt&& other) noexcept {
std::swap(elements_, other.elements_);
std::swap(size_, other.size_);
std::swap(capacity_, other.capacity_);
return *this;
}
sorted_buffer_gt(sorted_buffer_gt const&) = delete;
sorted_buffer_gt& operator=(sorted_buffer_gt const&) = delete;
~sorted_buffer_gt() noexcept { reset(); }
void reset() noexcept {
if (elements_)
allocator_t{}.deallocate(elements_, capacity_);
elements_ = nullptr;
capacity_ = 0;
size_ = 0;
}
inline bool empty() const noexcept { return !size_; }
inline std::size_t size() const noexcept { return size_; }
inline std::size_t capacity() const noexcept { return capacity_; }
inline element_t const& top() const noexcept { return elements_[size_ - 1]; }
inline void clear() noexcept { size_ = 0; }
bool reserve(std::size_t new_capacity) noexcept {
if (new_capacity < capacity_)
return true;
new_capacity = ceil2(new_capacity);
new_capacity = (std::max<std::size_t>)(new_capacity, (std::max<std::size_t>)(capacity_ * 2u, 16u));
auto allocator = allocator_t{};
auto new_elements = allocator.allocate(new_capacity);
if (!new_elements)
return false;
if (size_)
std::memcpy(new_elements, elements_, size_ * sizeof(element_t));
if (elements_)
allocator.deallocate(elements_, capacity_);
elements_ = new_elements;
capacity_ = new_capacity;
return true;
}
inline void insert_reserved(element_t&& element) noexcept {
std::size_t slot = size_ ? std::lower_bound(elements_, elements_ + size_, element, &less) - elements_ : 0;
std::size_t to_move = size_ - slot;
element_t* source = elements_ + size_ - 1;
for (; to_move; --to_move, --source)
source[1] = source[0];
elements_[slot] = element;
size_++;
}
/**
* @return `true` if the entry was added, `false` if it wasn't relevant enough.
*/
inline bool insert(element_t&& element, std::size_t limit) noexcept {
std::size_t slot = size_ ? std::lower_bound(elements_, elements_ + size_, element, &less) - elements_ : 0;
if (slot == limit)
return false;
std::size_t to_move = size_ - slot - (size_ == limit);
element_t* source = elements_ + size_ - 1 - (size_ == limit);
for (; to_move; --to_move, --source)
source[1] = source[0];
elements_[slot] = element;
size_ += size_ != limit;
return true;
}
inline element_t pop() noexcept {
size_--;
element_t result = elements_[size_];
elements_[size_].~element_t();
return result;
}
void sort_ascending() noexcept {}
inline void shrink(std::size_t n) noexcept { size_ = (std::min<std::size_t>)(n, size_); }
inline element_t* data() noexcept { return elements_; }
inline element_t const* data() const noexcept { return elements_; }
private:
static bool less(element_t const& a, element_t const& b) noexcept { return comparator_t{}(a, b); }
};
#if defined(USEARCH_DEFINED_WINDOWS)
#pragma pack(push, 1) // Pack struct elements on 1-byte alignment
#endif
/**
* @brief Five-byte integer type to address node clouds with over 4B entries.
*/
class usearch_pack_m uint40_t {
unsigned char octets[5];
public:
inline uint40_t() noexcept { std::memset(octets, 0, 5); }
inline uint40_t(std::uint32_t n) noexcept { std::memcpy(octets, (char*)&n, 4); }
inline uint40_t(std::uint64_t n) noexcept { std::memcpy(octets, (char*)&n, 5); }
#if defined(USEARCH_DEFINED_CLANG) && defined(USEARCH_DEFINED_APPLE)
inline uint40_t(std::size_t n) noexcept { std::memcpy(octets, (char*)&n, 5); }
#endif
uint40_t(uint40_t&&) = default;
uint40_t(uint40_t const&) = default;