diff --git a/cpp/Mrpt.h b/cpp/Mrpt.h
index 0765922..7b8e08e 100644
--- a/cpp/Mrpt.h
+++ b/cpp/Mrpt.h
@@ -164,10 +164,11 @@ class Mrpt {
    * @param depth_min_ minimum depth of trees considered when searching for
    * optimal parameters; in the set
    * \f$\{1,2, \dots ,\lfloor \log_2 (n) \rfloor \}\f$; a default value -1
-   * sets this to 5
+   * sets this to \f$ \mathrm{max}(\lfloor \log_2 (n) \rfloor - 11, 5)\f$
    * @param votes_max_ maximum number of votes considered when searching for
    * optimal parameters; a default value -1 sets this to
-   * \f$ \mathrm{max}(\lfloor \mathrm{trees\_max} / 10 \rfloor, 10) \f$
+   * \f$ \mathrm{max}(\lfloor \mathrm{trees\_max} / 10 \rfloor,
+   * \mathrm{min}(10, \mathrm{trees\_max})) \f$
    * @param density expected proportion of non-zero components in the random vectors;
    * default value -1.0 sets this to \f$ 1 / \sqrt{d} \f$, where \f$ d\f$ is
    * the dimension of data
@@ -201,10 +202,11 @@ class Mrpt {
    * @param depth_min_ minimum depth of trees considered when searching for
    * optimal parameters; in the set
    * \f$\{1,2, \dots ,\lfloor \log_2 (n) \rfloor \}\f$; a default value -1
-   * sets this to 5
+   * sets this to \f$ \mathrm{max}(\lfloor \log_2 (n) \rfloor - 11, 5)\f$
    * @param votes_max_ maximum number of votes considered when searching for
    * optimal parameters; a default value -1 sets this to
-   * \f$ \mathrm{max}(\lfloor \mathrm{trees\_max} / 10 \rfloor, 10) \f$
+   * \f$ \mathrm{max}(\lfloor \mathrm{trees\_max} / 10 \rfloor,
+   * \mathrm{min}(10, \mathrm{trees\_max})) \f$
    * @param density expected proportion of non-zero components in the random vectors;
    * default value -1.0 sets this to \f$ 1 / \sqrt{d} \f$, where \f$ d\f$ is
    * the dimension of data
@@ -239,10 +241,11 @@ class Mrpt {
    * @param depth_min_ minimum depth of trees considered when searching for
    * optimal parameters; in the set
    * \f$\{1,2, \dots ,\lfloor \log_2 (n) \rfloor \}\f$; a default value -1
-   * sets this to 5
+   * sets this to \f$ \mathrm{max}(\lfloor \log_2 (n) \rfloor - 11, 5)\f$
    * @param votes_max_ maximum number of votes considered when searching for
    * optimal parameters; a default value -1 sets this to
-   * \f$ \mathrm{max}(\lfloor \mathrm{trees\_max} / 10 \rfloor, 10) \f$
+   * \f$ \mathrm{max}(\lfloor \mathrm{trees\_max} / 10 \rfloor,
+   * \mathrm{min}(10, \mathrm{trees\_max})) \f$
    * @param density_ expected proportion of non-zero components in the random vectors;
    * default value -1.0 sets this to \f$ 1 / \sqrt{d} \f$, where \f$ d\f$ is
    * the dimension of data
@@ -325,10 +328,11 @@ class Mrpt {
    * @param depth_min_ minimum depth of trees considered when searching for
    * optimal parameters; in the set
    * \f$\{1,2, \dots ,\lfloor \log_2 (n) \rfloor \}\f$; a default value -1
-   * sets this to 5
+   * sets this to \f$ \mathrm{max}(\lfloor \log_2 (n) \rfloor - 11, 5)\f$
    * @param votes_max_ maximum number of votes considered when searching for
    * optimal parameters; a default value -1 sets this to
-   * \f$ \mathrm{max}(\lfloor \mathrm{trees\_max} / 10 \rfloor, 10) \f$
+   * \f$ \mathrm{max}(\lfloor \mathrm{trees\_max} / 10 \rfloor,
+   * \mathrm{min}(10, \mathrm{trees\_max})) \f$
    * @param density_ expected proportion of non-zero components in the random vectors;
    * default value -1.0 sets this to \f$ 1 / \sqrt{d} \f$, where \f$ d\f$ is
    * the dimension of data
@@ -341,6 +345,26 @@ class Mrpt {
             int depth_min_ = -1, int votes_max_ = -1, float density_ = -1.0, int seed = 0,
             const std::vector<int> &indices_test = {}) {
+    if (trees_max == - 1) {
+      trees_max = std::min(std::sqrt(n_samples), 1000.0);
+    }
+
+    if (depth_min_ == -1) {
+      depth_min_ = std::max(static_cast<int>(std::log2(n_samples) - 11), 5);
+    }
+
+    if (depth_max == -1) {
+      depth_max = std::max(static_cast<int>(std::log2(n_samples) - 4), depth_min_);
+    }
+
+    if (votes_max_ == -1) {
+      votes_max_ = std::max(trees_max / 10, std::min(trees_max, 10));
+    }
+
+    if (density_ > -1.0001 && density_ < -0.9999) {
+      density_ = 1.0 / std::sqrt(dim);
+    }
+
     if (!empty()) {
       throw std::logic_error("The index has already been grown.");
     }
@@ -349,23 +373,23 @@ class Mrpt {
       throw std::out_of_range("k_ must belong to the set {1, ..., n}.");
     }
 
-    if (trees_max < -1 || trees_max == 0) {
+    if (trees_max <= 0) {
       throw std::out_of_range("trees_max must be positive.");
     }
 
-    if (depth_max < -1 || depth_max == 0 || depth_max > std::log2(n_samples)) {
+    if (depth_max <= 0 || depth_max > std::log2(n_samples)) {
       throw std::out_of_range("depth_max must belong to the set {1, ... , log2(n)}.");
     }
 
-    if (depth_min_ < -1 || depth_min_ == 0 || depth_min_ > depth_max) {
+    if (depth_min_ <= 0 || depth_min_ > depth_max) {
       throw std::out_of_range("depth_min_ must belong to the set {1, ... , depth_max}");
     }
 
-    if (votes_max_ < -1 || votes_max_ == 0 || votes_max_ > trees_max) {
+    if (votes_max_ <= 0 || votes_max_ > trees_max) {
       throw std::out_of_range("votes_max_ must belong to the set {1, ... , trees_max}.");
     }
 
-    if (density_ < -1.0001 || density_ > 1.0001 || (density_ > -0.9999 && density_ < -0.0001)) {
+    if (density_ < 0.0 || density_ > 1.0001) {
       throw std::out_of_range("The density must be on the interval (0,1].");
     }
 
@@ -373,36 +397,13 @@ class Mrpt {
       throw std::out_of_range("Sample size must be at least 101 to autotune an index.");
     }
 
-    if (trees_max == - 1) {
-      trees_max = std::min(std::sqrt(n_samples), 1000.0);
-    }
-
-    if (depth_min_ == -1) {
-      depth_min = std::max(static_cast<int>(std::log2(n_samples) - 11), 5);
-    } else {
-      depth_min = depth_min_;
-    }
-
-    if (depth_max == -1) {
-      depth_max = std::max(static_cast<int>(std::log2(n_samples) - 4), depth_min);
-    }
-
-    if (votes_max_ == -1) {
-      votes_max = std::max(trees_max / 10, std::min(trees_max, 10));
-    } else {
-      votes_max = votes_max_;
-    }
-
-    if (density_ < 0) {
-      density = 1.0 / std::sqrt(dim);
-    } else {
-      density = density_;
-    }
-
+    depth_min = depth_min_;
+    votes_max = votes_max_;
     k = k_;
+
     const Eigen::Map<const Eigen::MatrixXf> Q(data, dim, n_test);
 
-    grow(trees_max, depth_max, density, seed);
+    grow(trees_max, depth_max, density_, seed);
     Eigen::MatrixXi exact(k, n_test);
     compute_exact(Q, exact, indices_test);
 
@@ -453,10 +454,11 @@ class Mrpt {
    * @param depth_min_ minimum depth of trees considered when searching for
    * optimal parameters on the set
    * \f$\{1,2, \dots ,\lfloor \log_2 (n) \rfloor \}\f$; a default value -1
-   * sets this to 5
+   * sets this to \f$ \mathrm{max}(\lfloor \log_2 (n) \rfloor - 11, 5)\f$
    * @param votes_max_ maximum number of votes considered when searching for
    * optimal parameters; a default value -1 sets this to
-   * \f$ \mathrm{max}(\lfloor \mathrm{trees\_max} / 10 \rfloor, 10) \f$
+   * \f$ \mathrm{max}(\lfloor \mathrm{trees\_max} / 10 \rfloor,
+   * \mathrm{min}(10, \mathrm{trees\_max})) \f$
    * @param density_ expected proportion of non-zero components of random vectors;
    * default value -1.0 sets this to \f$ 1 / \sqrt{d} \f$, where \f$ d\f$ is
    * the dimension of data
@@ -486,10 +488,11 @@ class Mrpt {
    * @param depth_min_ minimum depth of trees considered when searching for
    * optimal parameters on the set
    * \f$\{1,2, \dots ,\lfloor \log_2 (n) \rfloor \}\f$; a default value -1
-   * sets this to 5
+   * sets this to \f$ \mathrm{max}(\lfloor \log_2 (n) \rfloor - 11, 5)\f$
    * @param votes_max_ maximum number of votes considered when searching for
    * optimal parameters; a default value -1 sets this to
-   * \f$ \mathrm{max}(\lfloor \mathrm{trees\_max} / 10 \rfloor, 10) \f$
+   * \f$ \mathrm{max}(\lfloor \mathrm{trees\_max} / 10 \rfloor,
+   * \mathrm{min}(10, \mathrm{trees\_max})) \f$
    * @param density_ expected proportion of non-zero components of random vectors;
    * default value -1.0 sets this to \f$ 1 / \sqrt{d} \f$, where \f$ d\f$ is
    * the dimension of data
diff --git a/docs/html/_mrpt_8h_source.html b/docs/html/_mrpt_8h_source.html
index 30b6dab..6ed1d2e 100644
--- a/docs/html/_mrpt_8h_source.html
+++ b/docs/html/_mrpt_8h_source.html
@@ -23,7 +23,7 @@
 Logo
 Mrpt
-  1.0.0
+  1.1.1
@@ -71,39 +71,39 @@
Mrpt.h
1 #ifndef CPP_MRPT_H_
2 #define CPP_MRPT_H_
3 
4 #include <algorithm>
5 #include <cmath>
6 #include <functional>
7 #include <map>
8 #include <numeric>
9 #include <random>
10 #include <set>
11 #include <stdexcept>
12 #include <string>
13 #include <utility>
14 #include <vector>
15 
16 #include <Eigen/Dense>
17 #include <Eigen/SparseCore>
18 
19 struct Mrpt_Parameters {
 20  int n_trees = 0;
21  int depth = 0;
22  int k = 0;
23  int votes = 0;
24  double estimated_qtime = 0.0;
25  double estimated_recall = 0.0;
26 };
27 
28 class Mrpt {
29  public:
49  Mrpt(const Eigen::Ref<const Eigen::MatrixXf> &X_) :
50  X(Eigen::Map<const Eigen::MatrixXf>(X_.data(), X_.rows(), X_.cols())),
51  n_samples(X_.cols()),
52  dim(X_.rows()) {}
53 
60  Mrpt(const float *X_, int dim_, int n_samples_) :
61  X(Eigen::Map<const Eigen::MatrixXf>(X_, dim_, n_samples_)),
62  n_samples(n_samples_),
63  dim(dim_) {}
64 
84  void grow(int n_trees_, int depth_, float density_ = -1.0, int seed = 0) {
85 
86  if (!empty()) {
87  throw std::logic_error("The index has already been grown.");
88  }
89 
90  if (n_trees_ <= 0) {
91  throw std::out_of_range("The number of trees must be positive.");
92  }
93 
94  if (depth_ <= 0 || depth_ > std::log2(n_samples)) {
95  throw std::out_of_range("The depth must belong to the set {1, ... , log2(n)}.");
96  }
97 
98  if (density_ < -1.0001 || density_ > 1.0001 || (density_ > -0.9999 && density_ < -0.0001)) {
99  throw std::out_of_range("The density must be on the interval (0,1].");
100  }
101 
102  n_trees = n_trees_;
103  depth = depth_;
104  n_pool = n_trees_ * depth_;
105  n_array = 1 << (depth_ + 1);
106 
107  if (density_ < 0) {
108  density = 1.0 / std::sqrt(dim);
109  } else {
110  density = density_;
111  }
112 
113  density < 1 ? build_sparse_random_matrix(sparse_random_matrix, n_pool, dim, density, seed) :
114  build_dense_random_matrix(dense_random_matrix, n_pool, dim, seed);
115 
116  split_points = Eigen::MatrixXf(n_array, n_trees);
117  tree_leaves = std::vector<std::vector<int>>(n_trees);
118 
119  count_first_leaf_indices_all(leaf_first_indices_all, n_samples, depth);
120  leaf_first_indices = leaf_first_indices_all[depth];
121 
122  #pragma omp parallel for
123  for (int n_tree = 0; n_tree < n_trees; ++n_tree) {
124  Eigen::MatrixXf tree_projections;
125 
126  if (density < 1)
127  tree_projections.noalias() = sparse_random_matrix.middleRows(n_tree * depth, depth) * X;
128  else
129  tree_projections.noalias() = dense_random_matrix.middleRows(n_tree * depth, depth) * X;
130 
131  tree_leaves[n_tree] = std::vector<int>(n_samples);
132  std::vector<int> &indices = tree_leaves[n_tree];
133  std::iota(indices.begin(), indices.end(), 0);
134 
135  grow_subtree(indices.begin(), indices.end(), 0, 0, n_tree, tree_projections);
136  }
137  }
138 
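// --------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of the header): building an index
// with manually chosen parameters. The data matrix is dim x n with one data
// point per column, as expected by the constructors above; the values of
// n_trees_ and depth_ below are arbitrary example choices.
//
//   #include <Eigen/Dense>
//   #include "Mrpt.h"
//
//   Eigen::MatrixXf X = Eigen::MatrixXf::Random(200, 10000); // dim = 200, n = 10000
//   Mrpt index(X);
//   index.grow(/* n_trees_ = */ 32, /* depth_ = */ 7);       // depth_ <= log2(n) ~ 13.3
// --------------------------------------------------------------------------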
177  void grow(double target_recall, const Eigen::Ref<const Eigen::MatrixXf> &Q, int k_, int trees_max = -1,
178  int depth_max = -1, int depth_min_ = -1, int votes_max_ = -1,
179  float density = -1.0, int seed = 0) {
180  if (target_recall < 0.0 - epsilon || target_recall > 1.0 + epsilon) {
181  throw std::out_of_range("Target recall must be on the interval [0,1].");
182  }
183 
184  grow(Q, k_, trees_max, depth_max, depth_min_, votes_max_, density, seed);
185  prune(target_recall);
186  }
187 
216  void grow(double target_recall, const float *Q, int n_test, int k_, int trees_max = -1,
217  int depth_max = -1, int depth_min_ = -1, int votes_max_ = -1,
218  float density = -1.0, int seed = 0, const std::vector<int> &indices_test = {}) {
219  if (target_recall < 0.0 - epsilon || target_recall > 1.0 + epsilon) {
220  throw std::out_of_range("Target recall must be on the interval [0,1].");
221  }
222 
223  grow(Q, n_test, k_, trees_max, depth_max, depth_min_, votes_max_, density, seed, indices_test);
224  prune(target_recall);
225  }
226 
253  void grow_autotune(double target_recall, int k_, int trees_max = -1, int depth_max = -1, int depth_min_ = -1,
254  int votes_max_ = -1, float density_ = -1.0, int seed = 0, int n_test = 100) {
255  if (n_test < 1) {
256  throw std::out_of_range("Test set size must be > 0.");
257  }
258 
259  n_test = n_test > n_samples ? n_samples : n_test;
260  std::vector<int> indices_test(sample_indices(n_test, seed));
261  const Eigen::MatrixXf Q(subset(indices_test));
262 
263  grow(target_recall, Q.data(), Q.cols(), k_, trees_max,
264  depth_max, depth_min_, votes_max_, density_, seed, indices_test);
265  }
266 
278  Mrpt_Parameters parameters() const {
 279  if (index_type == normal || index_type == autotuned_unpruned) {
280  Mrpt_Parameters p;
281  p.n_trees = n_trees;
282  p.depth = depth;
283  p.k = par.k;
284  return p;
285  }
286 
287  return par;
288  }
289 
295  bool is_autotuned() const {
296  return index_type == autotuned;
297  }
298 
340  void grow(const float *data, int n_test, int k_, int trees_max = -1, int depth_max = -1,
341  int depth_min_ = -1, int votes_max_ = -1, float density_ = -1.0, int seed = 0,
342  const std::vector<int> &indices_test = {}) {
343 
344  if (!empty()) {
345  throw std::logic_error("The index has already been grown.");
346  }
347 
348  if (k_ <= 0 || k_ > n_samples) {
349  throw std::out_of_range("k_ must belong to the set {1, ..., n}.");
350  }
351 
352  if (trees_max < -1 || trees_max == 0) {
353  throw std::out_of_range("trees_max must be positive.");
354  }
355 
356  if (depth_max < -1 || depth_max == 0 || depth_max > std::log2(n_samples)) {
357  throw std::out_of_range("depth_max must belong to the set {1, ... , log2(n)}.");
358  }
359 
360  if (depth_min_ < -1 || depth_min_ == 0 || depth_min_ > depth_max) {
361  throw std::out_of_range("depth_min_ must belong to the set {1, ... , depth_max}");
362  }
363 
364  if (votes_max_ < -1 || votes_max_ == 0 || votes_max_ > trees_max) {
365  throw std::out_of_range("votes_max_ must belong to the set {1, ... , trees_max}.");
366  }
367 
368  if (density_ < -1.0001 || density_ > 1.0001 || (density_ > -0.9999 && density_ < -0.0001)) {
369  throw std::out_of_range("The density must be on the interval (0,1].");
370  }
371 
372  if (trees_max == - 1) {
373  trees_max = std::min(std::sqrt(n_samples), 1000.0);
374  }
375 
376  if (depth_min_ == -1) {
377  depth_min = std::min(static_cast<int>(std::log2(n_samples)), 5);
378  } else {
379  depth_min = depth_min_;
380  }
381 
382  if (depth_max == -1) {
383  depth_max = std::max(static_cast<int>(std::log2(n_samples) - 4), depth_min);
384  }
385 
386  if (votes_max_ == -1) {
387  votes_max = std::max(trees_max / 10, std::min(trees_max, 10));
388  } else {
389  votes_max = votes_max_;
390  }
391 
392  if (density_ < 0) {
393  density = 1.0 / std::sqrt(dim);
394  } else {
395  density = density_;
396  }
397 
398  k = k_;
399  const Eigen::Map<const Eigen::MatrixXf> Q(data, dim, n_test);
400 
401  grow(trees_max, depth_max, density, seed);
402  Eigen::MatrixXi exact(k, n_test);
403  compute_exact(Q, exact, indices_test);
404 
405  std::vector<Eigen::MatrixXd> recalls(depth_max - depth_min + 1);
406  cs_sizes = std::vector<Eigen::MatrixXd>(depth_max - depth_min + 1);
407 
408  for (int d = depth_min; d <= depth_max; ++d) {
409  recalls[d - depth_min] = Eigen::MatrixXd::Zero(votes_max, trees_max);
410  cs_sizes[d - depth_min] = Eigen::MatrixXd::Zero(votes_max, trees_max);
411  }
412 
413  for (int i = 0; i < n_test; ++i) {
414  std::vector<Eigen::MatrixXd> recall_tmp(depth_max - depth_min + 1);
415  std::vector<Eigen::MatrixXd> cs_size_tmp(depth_max - depth_min + 1);
416 
417  count_elected(Q.col(i), Eigen::Map<Eigen::VectorXi>(exact.data() + i * k, k),
418  votes_max, recall_tmp, cs_size_tmp);
419 
420  for (int d = depth_min; d <= depth_max; ++d) {
421  recalls[d - depth_min] += recall_tmp[d - depth_min];
422  cs_sizes[d - depth_min] += cs_size_tmp[d - depth_min];
423  }
424  }
425 
426  for (int d = depth_min; d <= depth_max; ++d) {
427  recalls[d - depth_min] /= (k * n_test);
428  cs_sizes[d - depth_min] /= n_test;
429  }
430 
431  fit_times(Q);
432  std::set<Mrpt_Parameters,decltype(is_faster)*> pars = list_parameters(recalls);
433  opt_pars = pareto_frontier(pars);
434 
435  index_type = autotuned_unpruned;
436  par.k = k_;
437  }
438 
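// --------------------------------------------------------------------------
// Illustrative note (not part of the header): with the default-parameter
// changes in the cpp/Mrpt.h patch above, passing -1 for the autotuning
// parameters resolves them as follows. Assuming, for example,
// n_samples = 1'000'000 and dim = 100:
//
//   trees_max = min(sqrt(n), 1000)                      = 1000
//   depth_min = max(floor(log2(n)) - 11, 5)             = max(19 - 11, 5) = 8
//   depth_max = max(floor(log2(n)) - 4, depth_min)      = max(19 - 4, 8)  = 15
//   votes_max = max(trees_max / 10, min(10, trees_max)) = max(100, 10)    = 100
//   density   = 1 / sqrt(dim)                           = 0.1
// --------------------------------------------------------------------------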
462  void grow(const Eigen::Ref<const Eigen::MatrixXf> &Q, int k_, int trees_max = -1, int depth_max = -1,
463  int depth_min_ = -1, int votes_max_ = -1, float density_ = -1.0, int seed = 0) {
464  if (Q.rows() != dim) {
465  throw std::invalid_argument("Dimensions of the data and the validation set do not match.");
466  }
467 
468  grow(Q.data(), Q.cols(), k_, trees_max,
469  depth_max, depth_min_, votes_max_, density_, seed);
470  }
471 
496  void grow_autotune(int k_, int trees_max = -1, int depth_max = -1, int depth_min_ = -1,
497  int votes_max_ = -1, float density_ = -1.0, int seed = 0, int n_test = 100) {
498  if (n_test < 1) {
499  throw std::out_of_range("Test set size must be > 0.");
500  }
501 
502  n_test = n_test > n_samples ? n_samples : n_test;
503  std::vector<int> indices_test(sample_indices(n_test, seed));
504  const Eigen::MatrixXf Q(subset(indices_test));
505 
506  grow(Q.data(), Q.cols(), k_, trees_max,
507  depth_max, depth_min_, votes_max_, density_, seed, indices_test);
508  }
509 
520  Mrpt subset(double target_recall) const {
521  if (target_recall < 0.0 - epsilon || target_recall > 1.0 + epsilon) {
522  throw std::out_of_range("Target recall must be on the interval [0,1].");
523  }
524 
525  Mrpt index2(X);
526  index2.par = parameters(target_recall);
527 
528  int depth_max = depth;
529 
530  index2.n_trees = index2.par.n_trees;
531  index2.depth = index2.par.depth;
532  index2.votes = index2.par.votes;
533  index2.n_pool = index2.depth * index2.n_trees;
534  index2.n_array = 1 << (index2.depth + 1);
535  index2.tree_leaves.assign(tree_leaves.begin(), tree_leaves.begin() + index2.n_trees);
536  index2.leaf_first_indices_all = leaf_first_indices_all;
537  index2.density = density;
538  index2.k = k;
539 
540  index2.split_points = split_points.topLeftCorner(index2.n_array, index2.n_trees);
541  index2.leaf_first_indices = leaf_first_indices_all[index2.depth];
542  if (index2.density < 1) {
543  index2.sparse_random_matrix = Eigen::SparseMatrix<float, Eigen::RowMajor>(index2.n_pool, index2.dim);
544  for (int n_tree = 0; n_tree < index2.n_trees; ++n_tree)
545  index2.sparse_random_matrix.middleRows(n_tree * index2.depth, index2.depth) =
546  sparse_random_matrix.middleRows(n_tree * depth_max, index2.depth);
547  } else {
548  index2.dense_random_matrix = Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>(index2.n_pool, index2.dim);
549  for (int n_tree = 0; n_tree < index2.n_trees; ++n_tree)
550  index2.dense_random_matrix.middleRows(n_tree * index2.depth, index2.depth) =
551  dense_random_matrix.middleRows(n_tree * depth_max, index2.depth);
552  }
553  index2.index_type = autotuned;
554 
555  return index2;
556  }
557 
558 
570  Mrpt *subset_pointer(double target_recall) const {
571  if (target_recall < 0.0 - epsilon || target_recall > 1.0 + epsilon) {
572  throw std::out_of_range("Target recall must be on the interval [0,1].");
573  }
574 
575  Mrpt *index2 = new Mrpt(X);
576  index2->par = parameters(target_recall);
577 
578  int depth_max = depth;
579 
580  index2->n_trees = index2->par.n_trees;
581  index2->depth = index2->par.depth;
582  index2->votes = index2->par.votes;
583  index2->n_pool = index2->depth * index2->n_trees;
584  index2->n_array = 1 << (index2->depth + 1);
585  index2->tree_leaves.assign(tree_leaves.begin(), tree_leaves.begin() + index2->n_trees);
586  index2->leaf_first_indices_all = leaf_first_indices_all;
587  index2->density = density;
588  index2->k = k;
589 
590  index2->split_points = split_points.topLeftCorner(index2->n_array, index2->n_trees);
591  index2->leaf_first_indices = leaf_first_indices_all[index2->depth];
592  if (index2->density < 1) {
593  index2->sparse_random_matrix = Eigen::SparseMatrix<float, Eigen::RowMajor>(index2->n_pool, index2->dim);
594  for (int n_tree = 0; n_tree < index2->n_trees; ++n_tree)
595  index2->sparse_random_matrix.middleRows(n_tree * index2->depth, index2->depth) =
596  sparse_random_matrix.middleRows(n_tree * depth_max, index2->depth);
597  } else {
598  index2->dense_random_matrix = Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>(index2->n_pool, index2->dim);
599  for (int n_tree = 0; n_tree < index2->n_trees; ++n_tree)
600  index2->dense_random_matrix.middleRows(n_tree * index2->depth, index2->depth) =
601  dense_random_matrix.middleRows(n_tree * depth_max, index2->depth);
602  }
603  index2->index_type = autotuned;
604 
605  return index2;
606  }
607 
608 
618  std::vector<Mrpt_Parameters> optimal_parameters() const {
619  if (index_type == normal) {
620  throw std::logic_error("The list of optimal parameters cannot be retrieved for the non-autotuned index.");
621  }
622  if (index_type == autotuned) {
623  throw std::logic_error("The list of optimal parameters cannot be retrieved for the index which has already been subsetted or deleted to the target recall level.");
624  }
625 
626  std::vector<Mrpt_Parameters> new_pars;
627  std::copy(opt_pars.begin(), opt_pars.end(), std::back_inserter(new_pars));
628  return new_pars;
629  }
630 
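// --------------------------------------------------------------------------
// Usage sketch (illustrative only): autotuning without a fixed recall level,
// then subsetting to a target recall. Here X is assumed to be a dim x n data
// matrix and Q a dim x n_test validation matrix; the sizes are arbitrary.
//
//   Mrpt index(X);
//   index.grow(Q, /* k_ = */ 10);        // autotuned, but not yet pruned
//
//   for (const Mrpt_Parameters &p : index.optimal_parameters())
//     std::printf("trees=%d depth=%d votes=%d est. recall=%.3f\n",
//                 p.n_trees, p.depth, p.votes, p.estimated_recall);
//
//   Mrpt index90 = index.subset(0.9);    // pruned copy targeting ~90% recall
// --------------------------------------------------------------------------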
654  void query(const float *data, int k, int vote_threshold, int *out,
655  float *out_distances = nullptr, int *out_n_elected = nullptr) const {
656 
657  if (k <= 0 || k > n_samples) {
658  throw std::out_of_range("k must belong to the set {1, ..., n}.");
659  }
660 
661  if (vote_threshold <= 0 || vote_threshold > n_trees) {
662  throw std::out_of_range("vote_threshold must belong to the set {1, ... , n_trees}.");
663  }
664 
665  if (empty()) {
666  throw std::logic_error("The index must be built before making queries.");
667  }
668 
669  const Eigen::Map<const Eigen::VectorXf> q(data, dim);
670 
671  Eigen::VectorXf projected_query(n_pool);
672  if (density < 1)
673  projected_query.noalias() = sparse_random_matrix * q;
674  else
675  projected_query.noalias() = dense_random_matrix * q;
676 
677  std::vector<int> found_leaves(n_trees);
678 
679  /*
680  * The following loops over all trees, and routes the query to exactly one
681  * leaf in each.
682  */
683  #pragma omp parallel for
684  for (int n_tree = 0; n_tree < n_trees; ++n_tree) {
685  int idx_tree = 0;
686  for (int d = 0; d < depth; ++d) {
687  const int j = n_tree * depth + d;
688  const int idx_left = 2 * idx_tree + 1;
689  const int idx_right = idx_left + 1;
690  const float split_point = split_points(idx_tree, n_tree);
691  if (projected_query(j) <= split_point) {
692  idx_tree = idx_left;
693  } else {
694  idx_tree = idx_right;
695  }
696  }
697  found_leaves[n_tree] = idx_tree - (1 << depth) + 1;
698  }
699 
700  int n_elected = 0, max_leaf_size = n_samples / (1 << depth) + 1;
701  Eigen::VectorXi elected(n_trees * max_leaf_size);
702  Eigen::VectorXi votes = Eigen::VectorXi::Zero(n_samples);
703 
704  // count votes
705  for (int n_tree = 0; n_tree < n_trees; ++n_tree) {
706  int leaf_begin = leaf_first_indices[found_leaves[n_tree]];
707  int leaf_end = leaf_first_indices[found_leaves[n_tree] + 1];
708  const std::vector<int> &indices = tree_leaves[n_tree];
709  for (int i = leaf_begin; i < leaf_end; ++i) {
710  int idx = indices[i];
711  if (++votes(idx) == vote_threshold)
712  elected(n_elected++) = idx;
713  }
714  }
715 
716  if (out_n_elected) {
717  *out_n_elected = n_elected;
718  }
719 
720  exact_knn(q, k, elected, n_elected, out, out_distances);
721  }
722 
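// --------------------------------------------------------------------------
// Usage sketch (illustrative only): querying a manually grown index, assuming
// an index built as in the earlier sketch (X is 200 x 10000). k and
// vote_threshold are example values; out receives the column indices of the
// approximate nearest neighbors, padded with -1 if fewer than k points get
// enough votes.
//
//   Eigen::VectorXf q = Eigen::VectorXf::Random(200);   // same dim as X
//   std::vector<int> out(10);
//   index.query(q, /* k = */ 10, /* vote_threshold = */ 2, out.data());
// --------------------------------------------------------------------------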
733  void query(const Eigen::Ref<const Eigen::VectorXf> &q, int k, int vote_threshold, int *out,
734  float *out_distances = nullptr, int *out_n_elected = nullptr) const {
735  query(q.data(), k, vote_threshold, out, out_distances, out_n_elected);
736  }
737 
760  void query(const float *q, int *out, float *out_distances = nullptr,
761  int *out_n_elected = nullptr) const {
762  if (index_type == normal) {
763  throw std::logic_error("The index is not autotuned: k and vote threshold has to be specified.");
764  }
765 
766  if (index_type == autotuned_unpruned) {
767  throw std::logic_error("The target recall level has to be set before making queries.");
768  }
769 
770  query(q, k, votes, out, out_distances, out_n_elected);
771  }
772 
781  void query(const Eigen::Ref<const Eigen::VectorXf> &q, int *out, float *out_distances = nullptr,
782  int *out_n_elected = nullptr) const {
783  query(q.data(), out, out_distances, out_n_elected);
784  }
785 
807  static void exact_knn(const float *q_data, const float *X_data, int dim, int n_samples,
808  int k, int *out, float *out_distances = nullptr) {
809 
810  const Eigen::Map<const Eigen::MatrixXf> X(X_data, dim, n_samples);
811  const Eigen::Map<const Eigen::VectorXf> q(q_data, dim);
812 
813  if (k < 1 || k > n_samples) {
814  throw std::out_of_range("k must be positive and no greater than the sample size of data X.");
815  }
816 
817  Eigen::VectorXf distances(n_samples);
818 
819  #pragma omp parallel for
820  for (int i = 0; i < n_samples; ++i)
821  distances(i) = (X.col(i) - q).squaredNorm();
822 
823  if (k == 1) {
824  Eigen::MatrixXf::Index index;
825  distances.minCoeff(&index);
826  out[0] = index;
827 
828  if (out_distances)
829  out_distances[0] = std::sqrt(distances(index));
830 
831  return;
832  }
833 
834  Eigen::VectorXi idx(n_samples);
835  std::iota(idx.data(), idx.data() + n_samples, 0);
836  std::partial_sort(idx.data(), idx.data() + k, idx.data() + n_samples,
837  [&distances](int i1, int i2) { return distances(i1) < distances(i2); });
838 
839  for (int i = 0; i < k; ++i)
840  out[i] = idx(i);
841 
842  if (out_distances) {
843  for (int i = 0; i < k; ++i)
844  out_distances[i] = std::sqrt(distances(idx(i)));
845  }
846  }
847 
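// --------------------------------------------------------------------------
// Usage sketch (illustrative only): the static exact_knn() computes the true
// k nearest neighbors, e.g. as ground truth when measuring the recall of
// approximate queries.
//
//   std::vector<int> exact(10);
//   Mrpt::exact_knn(q, X, /* k = */ 10, exact.data());
// --------------------------------------------------------------------------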
855  static void exact_knn(const Eigen::Ref<const Eigen::VectorXf> &q,
856  const Eigen::Ref<const Eigen::MatrixXf> &X,
857  int k, int *out, float *out_distances = nullptr) {
858  Mrpt::exact_knn(q.data(), X.data(), X.rows(), X.cols(), k, out, out_distances);
859  }
860 
867  void exact_knn(const float *q, int k, int *out, float *out_distances = nullptr) const {
868  Mrpt::exact_knn(q, X.data(), dim, n_samples, k, out, out_distances);
869  }
870 
877  void exact_knn(const Eigen::Ref<const Eigen::VectorXf> &q, int k, int *out,
878  float *out_distances = nullptr) const {
879  Mrpt::exact_knn(q.data(), X.data(), dim, n_samples, k, out, out_distances);
880  }
881 
898  bool save(const char *path) const {
899  FILE *fd;
900  if ((fd = fopen(path, "wb")) == NULL)
901  return false;
902 
903  int i = index_type;
904  fwrite(&i, sizeof(int), 1, fd);
905 
906  if (index_type == 2) {
907  write_parameter_list(opt_pars, fd);
908  }
909 
910  write_parameters(&par, fd);
911  fwrite(&n_trees, sizeof(int), 1, fd);
912  fwrite(&depth, sizeof(int), 1, fd);
913  fwrite(&density, sizeof(float), 1, fd);
914 
915  fwrite(split_points.data(), sizeof(float), n_array * n_trees, fd);
916 
917  // save tree leaves
918  for (int i = 0; i < n_trees; ++i) {
919  int sz = tree_leaves[i].size();
920  fwrite(&sz, sizeof(int), 1, fd);
921  fwrite(&tree_leaves[i][0], sizeof(int), sz, fd);
922  }
923 
924  // save random matrix
925  if (density < 1) {
926  int non_zeros = sparse_random_matrix.nonZeros();
927  fwrite(&non_zeros, sizeof(int), 1, fd);
928  for (int k = 0; k < sparse_random_matrix.outerSize(); ++k) {
929  for (Eigen::SparseMatrix<float, Eigen::RowMajor>::InnerIterator it(sparse_random_matrix, k); it; ++it) {
930  float val = it.value();
931  int row = it.row(), col = it.col();
932  fwrite(&row, sizeof(int), 1, fd);
933  fwrite(&col, sizeof(int), 1, fd);
934  fwrite(&val, sizeof(float), 1, fd);
935  }
936  }
937  } else {
938  fwrite(dense_random_matrix.data(), sizeof(float), n_pool * dim, fd);
939  }
940 
941  fclose(fd);
942  return true;
943  }
944 
951  bool load(const char *path) {
952  FILE *fd;
953  if ((fd = fopen(path, "rb")) == NULL)
954  return false;
955 
956  int i;
957  fread(&i, sizeof(int), 1, fd);
958  index_type = static_cast<itype>(i);
959  if (index_type == autotuned_unpruned) {
960  read_parameter_list(fd);
961  }
962 
963  read_parameters(&par, fd);
964  fread(&n_trees, sizeof(int), 1, fd);
965  fread(&depth, sizeof(int), 1, fd);
966  fread(&density, sizeof(float), 1, fd);
967 
968  n_pool = n_trees * depth;
969  n_array = 1 << (depth + 1);
970 
971  count_first_leaf_indices_all(leaf_first_indices_all, n_samples, depth);
972  leaf_first_indices = leaf_first_indices_all[depth];
973 
974  split_points = Eigen::MatrixXf(n_array, n_trees);
975  fread(split_points.data(), sizeof(float), n_array * n_trees, fd);
976 
977  // load tree leaves
978  tree_leaves = std::vector<std::vector<int>>(n_trees);
979  for (int i = 0; i < n_trees; ++i) {
980  int sz;
981  fread(&sz, sizeof(int), 1, fd);
982  std::vector<int> leaves(sz);
983  fread(&leaves[0], sizeof(int), sz, fd);
984  tree_leaves[i] = leaves;
985  }
986 
987  // load random matrix
988  if (density < 1) {
989  int non_zeros;
990  fread(&non_zeros, sizeof(int), 1, fd);
991 
992  sparse_random_matrix = Eigen::SparseMatrix<float>(n_pool, dim);
993  std::vector<Eigen::Triplet<float>> triplets;
994  for (int k = 0; k < non_zeros; ++k) {
995  int row, col;
996  float val;
997  fread(&row, sizeof(int), 1, fd);
998  fread(&col, sizeof(int), 1, fd);
999  fread(&val, sizeof(float), 1, fd);
1000  triplets.push_back(Eigen::Triplet<float>(row, col, val));
1001  }
1002 
1003  sparse_random_matrix.setFromTriplets(triplets.begin(), triplets.end());
1004  sparse_random_matrix.makeCompressed();
1005  } else {
1006  dense_random_matrix = Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>(n_pool, dim);
1007  fread(dense_random_matrix.data(), sizeof(float), n_pool * dim, fd);
1008  }
1009 
1010  fclose(fd);
1011 
1012  k = par.k;
1013  votes = par.votes;
1014  return true;
1015  }
1016 
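// --------------------------------------------------------------------------
// Usage sketch (illustrative only): persisting an index to disk and loading
// it back into another Mrpt object wrapping the same data matrix. The file
// name is an arbitrary example; both calls return false on I/O failure.
//
//   index.save("index.bin");
//
//   Mrpt index2(X);           // must wrap the same data
//   index2.load("index.bin");
// --------------------------------------------------------------------------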
1022  bool empty() const {
1023  return n_trees == 0;
1024  }
1025 
1032  friend class MrptTest;
1033  friend class UtilityTest;
1034 
1038  private:
1039 
1044  void grow_subtree(std::vector<int>::iterator begin, std::vector<int>::iterator end,
1045  int tree_level, int i, int n_tree, const Eigen::MatrixXf &tree_projections) {
1046  int n = end - begin;
1047  int idx_left = 2 * i + 1;
1048  int idx_right = idx_left + 1;
1049 
1050  if (tree_level == depth) return;
1051 
1052  std::nth_element(begin, begin + n / 2, end,
1053  [&tree_projections, tree_level] (int i1, int i2) {
1054  return tree_projections(tree_level, i1) < tree_projections(tree_level, i2);
1055  });
1056  auto mid = end - n / 2;
1057 
1058  if (n % 2) {
1059  split_points(i, n_tree) = tree_projections(tree_level, *(mid - 1));
1060  } else {
1061  auto left_it = std::max_element(begin, mid,
1062  [&tree_projections, tree_level] (int i1, int i2) {
1063  return tree_projections(tree_level, i1) < tree_projections(tree_level, i2);
1064  });
1065  split_points(i, n_tree) = (tree_projections(tree_level, *mid) +
1066  tree_projections(tree_level, *left_it)) / 2.0;
1067  }
1068 
1069  grow_subtree(begin, mid, tree_level + 1, idx_left, n_tree, tree_projections);
1070  grow_subtree(mid, end, tree_level + 1, idx_right, n_tree, tree_projections);
1071  }
1072 
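// --------------------------------------------------------------------------
// Illustrative note (not part of the header): grow_subtree() splits each node
// at the median of the projected values. For example, with projections
// {0.3, -1.2, 0.7, 0.1} (n = 4), the left child gets {-1.2, 0.1}, the right
// child gets {0.3, 0.7}, and the stored split point is the midpoint
// (0.1 + 0.3) / 2 = 0.2; for odd n the median value itself is stored.
// --------------------------------------------------------------------------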
1076  void exact_knn(const Eigen::Map<const Eigen::VectorXf> &q, int k, const Eigen::VectorXi &indices,
1077  int n_elected, int *out, float *out_distances = nullptr) const {
1078 
1079  if (!n_elected) {
1080  for (int i = 0; i < k; ++i)
1081  out[i] = -1;
1082 
1083  if (out_distances) {
1084  for (int i = 0; i < k; ++i)
1085  out_distances[i] = -1;
1086  }
1087 
1088  return;
1089  }
1090 
1091  Eigen::VectorXf distances(n_elected);
1092 
1093  #pragma omp parallel for
1094  for (int i = 0; i < n_elected; ++i)
1095  distances(i) = (X.col(indices(i)) - q).squaredNorm();
1096 
1097  if (k == 1) {
1098  Eigen::MatrixXf::Index index;
1099  distances.minCoeff(&index);
1100  out[0] = n_elected ? indices(index) : -1;
1101 
1102  if (out_distances)
1103  out_distances[0] = n_elected ? std::sqrt(distances(index)) : -1;
1104 
1105  return;
1106  }
1107 
1108  int n_to_sort = n_elected > k ? k : n_elected;
1109  Eigen::VectorXi idx(n_elected);
1110  std::iota(idx.data(), idx.data() + n_elected, 0);
1111  std::partial_sort(idx.data(), idx.data() + n_to_sort, idx.data() + n_elected,
1112  [&distances](int i1, int i2) { return distances(i1) < distances(i2); });
1113 
1114  for (int i = 0; i < k; ++i)
1115  out[i] = i < n_elected ? indices(idx(i)) : -1;
1116 
1117  if (out_distances) {
1118  for (int i = 0; i < k; ++i)
1119  out_distances[i] = i < n_elected ? std::sqrt(distances(idx(i))) : -1;
1120  }
1121  }
1122 
1123  void prune(double target_recall) {
1124  if (target_recall < 0.0 - epsilon || target_recall > 1.0 + epsilon) {
1125  throw std::out_of_range("Target recall must be on the interval [0,1].");
1126  }
1127 
1128  par = parameters(target_recall);
1129  if (!par.n_trees) {
1130  return;
1131  }
1132 
1133  int depth_max = depth;
1134 
1135  n_trees = par.n_trees;
1136  depth = par.depth;
1137  votes = par.votes;
1138  n_pool = depth * n_trees;
1139  n_array = 1 << (depth + 1);
1140 
1141  tree_leaves.resize(n_trees);
1142  tree_leaves.shrink_to_fit();
1143  split_points.conservativeResize(n_array, n_trees);
1144  leaf_first_indices = leaf_first_indices_all[depth];
1145 
1146  if (density < 1) {
1147  Eigen::SparseMatrix<float, Eigen::RowMajor> srm_new(n_pool, dim);
1148  for (int n_tree = 0; n_tree < n_trees; ++n_tree)
1149  srm_new.middleRows(n_tree * depth, depth) = sparse_random_matrix.middleRows(n_tree * depth_max, depth);
1150  sparse_random_matrix = srm_new;
1151  } else {
1152  Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> drm_new(n_pool, dim);
1153  for (int n_tree = 0; n_tree < n_trees; ++n_tree)
1154  drm_new.middleRows(n_tree * depth, depth) = dense_random_matrix.middleRows(n_tree * depth_max, depth);
1155  dense_random_matrix = drm_new;
1156  }
1157 
1158  index_type = autotuned;
1159  }
1160 
1161  void count_elected(const Eigen::VectorXf &q, const Eigen::Map<Eigen::VectorXi> &exact, int votes_max,
1162  std::vector<Eigen::MatrixXd> &recalls, std::vector<Eigen::MatrixXd> &cs_sizes) const {
1163  Eigen::VectorXf projected_query(n_pool);
1164  if (density < 1)
1165  projected_query.noalias() = sparse_random_matrix * q;
1166  else
1167  projected_query.noalias() = dense_random_matrix * q;
1168 
1169  int depth_min = depth - recalls.size() + 1;
1170  std::vector<std::vector<int>> start_indices(n_trees);
1171 
1172  #pragma omp parallel for
1173  for (int n_tree = 0; n_tree < n_trees; ++n_tree) {
1174  start_indices[n_tree] = std::vector<int>(depth - depth_min + 1);
1175  int idx_tree = 0;
1176  for (int d = 0; d < depth; ++d) {
1177  const int j = n_tree * depth + d;
1178  const int idx_left = 2 * idx_tree + 1;
1179  const int idx_right = idx_left + 1;
1180  const float split_point = split_points(idx_tree, n_tree);
1181  if (projected_query(j) <= split_point) {
1182  idx_tree = idx_left;
1183  } else {
1184  idx_tree = idx_right;
1185  }
1186  if (d >= depth_min - 1)
1187  start_indices[n_tree][d - depth_min + 1] = idx_tree - (1 << (d + 1)) + 1;
1188  }
1189  }
1190 
1191  const int *exact_begin = exact.data();
1192  const int *exact_end = exact.data() + exact.size();
1193 
1194  for (int depth_crnt = depth_min; depth_crnt <= depth; ++depth_crnt) {
1195  Eigen::VectorXi votes = Eigen::VectorXi::Zero(n_samples);
1196  const std::vector<int> &leaf_first_indices = leaf_first_indices_all[depth_crnt];
1197 
1198  Eigen::MatrixXd recall(votes_max, n_trees);
1199  Eigen::MatrixXd candidate_set_size(votes_max, n_trees);
1200  recall.col(0) = Eigen::VectorXd::Zero(votes_max);
1201  candidate_set_size.col(0) = Eigen::VectorXd::Zero(votes_max);
1202 
1203  // count votes
1204  for (int n_tree = 0; n_tree < n_trees; ++n_tree) {
1205  std::vector<int> &found_leaves = start_indices[n_tree];
1206 
1207  if (n_tree) {
1208  recall.col(n_tree) = recall.col(n_tree - 1);
1209  candidate_set_size.col(n_tree) = candidate_set_size.col(n_tree - 1);
1210  }
1211 
1212  int leaf_begin = leaf_first_indices[found_leaves[depth_crnt - depth_min]];
1213  int leaf_end = leaf_first_indices[found_leaves[depth_crnt - depth_min] + 1];
1214 
1215  const std::vector<int> &indices = tree_leaves[n_tree];
1216  for (int i = leaf_begin; i < leaf_end; ++i) {
1217  int idx = indices[i];
1218  int v = ++votes(idx);
1219  if (v <= votes_max) {
1220  candidate_set_size(v - 1, n_tree)++;
1221  if (std::find(exact_begin, exact_end, idx) != exact_end)
1222  recall(v - 1, n_tree)++;
1223  }
1224  }
1225  }
1226 
1227  recalls[depth_crnt - depth_min] = recall;
1228  cs_sizes[depth_crnt - depth_min] = candidate_set_size;
1229  }
1230  }
1231 
1241  static void build_sparse_random_matrix(Eigen::SparseMatrix<float, Eigen::RowMajor> &sparse_random_matrix,
1242  int n_row, int n_col, float density, int seed = 0) {
1243  sparse_random_matrix = Eigen::SparseMatrix<float, Eigen::RowMajor>(n_row, n_col);
1244 
1245  std::random_device rd;
1246  int s = seed ? seed : rd();
1247  std::mt19937 gen(s);
1248  std::uniform_real_distribution<float> uni_dist(0, 1);
1249  std::normal_distribution<float> norm_dist(0, 1);
1250 
1251  std::vector<Eigen::Triplet<float>> triplets;
1252  for (int j = 0; j < n_row; ++j) {
1253  for (int i = 0; i < n_col; ++i) {
1254  if (uni_dist(gen) > density) continue;
1255  triplets.push_back(Eigen::Triplet<float>(j, i, norm_dist(gen)));
1256  }
1257  }
1258 
1259  sparse_random_matrix.setFromTriplets(triplets.begin(), triplets.end());
1260  sparse_random_matrix.makeCompressed();
1261  }
1262 
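// --------------------------------------------------------------------------
// Illustrative note (not part of the header): build_sparse_random_matrix()
// keeps each entry with probability `density` and draws the kept entries from
// N(0,1). With the default density 1 / sqrt(dim), e.g. dim = 100 gives
// density = 0.1, so each of the n_row projection vectors has roughly
// 0.1 * n_col non-zero components.
// --------------------------------------------------------------------------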
1263  /*
1264  * Builds a random dense matrix for use in random projection. The components of
1265  * the matrix are drawn from the standard normal distribution.
1266  */
1267  static void build_dense_random_matrix(Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> &dense_random_matrix,
1268  int n_row, int n_col, int seed = 0) {
1269  dense_random_matrix = Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>(n_row, n_col);
1270 
1271  std::random_device rd;
1272  int s = seed ? seed : rd();
1273  std::mt19937 gen(s);
1274  std::normal_distribution<float> normal_dist(0, 1);
1275 
1276  std::generate(dense_random_matrix.data(), dense_random_matrix.data() + n_row * n_col,
1277  [&normal_dist, &gen] { return normal_dist(gen); });
1278  }
1279 
1280  void compute_exact(const Eigen::Map<const Eigen::MatrixXf> &Q, Eigen::MatrixXi &out_exact,
1281  const std::vector<int> &indices_test = {}) const {
1282  int n_test = Q.cols();
1283 
1284  Eigen::VectorXi idx(n_samples);
1285  std::iota(idx.data(), idx.data() + n_samples, 0);
1286 
1287  for (int i = 0; i < n_test; ++i) {
1288  if(!indices_test.empty()) {
1289  std::remove(idx.data(), idx.data() + n_samples, indices_test[i]);
1290  }
1291  exact_knn(Eigen::Map<const Eigen::VectorXf>(Q.data() + i * dim, dim), k, idx,
1292  (indices_test.empty() ? n_samples : n_samples - 1), out_exact.data() + i * k);
1293  std::sort(out_exact.data() + i * k, out_exact.data() + i * k + k);
1294  if(!indices_test.empty()) {
1295  idx[n_samples - 1] = indices_test[i];
1296  }
1297  }
1298  }
1299 
1300  static bool is_faster(const Mrpt_Parameters &par1, const Mrpt_Parameters &par2) {
1301  return par1.estimated_qtime < par2.estimated_qtime;
1302  }
1303 
1304  void vote(const Eigen::VectorXf &projected_query, int vote_threshold, Eigen::VectorXi &elected,
1305  int &n_elected, int n_trees, int depth_crnt) {
1306  std::vector<int> found_leaves(n_trees);
1307  const std::vector<int> &leaf_first_indices = leaf_first_indices_all[depth_crnt];
1308 
1309  #pragma omp parallel for
1310  for (int n_tree = 0; n_tree < n_trees; ++n_tree) {
1311  int idx_tree = 0;
1312  for (int d = 0; d < depth_crnt; ++d) {
1313  const int j = n_tree * depth + d;
1314  const int idx_left = 2 * idx_tree + 1;
1315  const int idx_right = idx_left + 1;
1316  const float split_point = split_points(idx_tree, n_tree);
1317  if (projected_query(j) <= split_point) {
1318  idx_tree = idx_left;
1319  } else {
1320  idx_tree = idx_right;
1321  }
1322  }
1323  found_leaves[n_tree] = idx_tree - (1 << depth_crnt) + 1;
1324  }
1325 
1326  int max_leaf_size = n_samples / (1 << depth_crnt) + 1;
1327  elected = Eigen::VectorXi(n_trees * max_leaf_size);
1328  Eigen::VectorXi votes = Eigen::VectorXi::Zero(n_samples);
1329 
1330  // count votes
1331  for (int n_tree = 0; n_tree < n_trees; ++n_tree) {
1332  int leaf_begin = leaf_first_indices[found_leaves[n_tree]];
1333  int leaf_end = leaf_first_indices[found_leaves[n_tree] + 1];
1334  const std::vector<int> &indices = tree_leaves[n_tree];
1335  for (int i = leaf_begin; i < leaf_end; ++i) {
1336  int idx = indices[i];
1337  if (++votes(idx) == vote_threshold)
1338  elected(n_elected++) = idx;
1339  }
1340  }
1341  }
1342 
1343  std::pair<double,double> fit_projection_times(const Eigen::Map<const Eigen::MatrixXf> &Q,
1344  std::vector<int> &exact_x) {
1345  std::vector<double> projection_times, projection_x;
1346  long double idx_sum = 0;
1347 
1348  std::vector<int> tested_trees {1,2,3,4,5,7,10,15,20,25,30,40,50};
1349  generate_x(tested_trees, n_trees, 10, n_trees);
1350 
1351  for (int d = depth_min; d <= depth; ++d) {
1352  for (int i = 0; i < (int) tested_trees.size(); ++i) {
1353  int t = tested_trees[i];
1354  int n_random_vectors = t * d;
1355  projection_x.push_back(n_random_vectors);
1356  Eigen::SparseMatrix<float, Eigen::RowMajor> sparse_mat;
1357  Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> dense_mat;
1358 
1359  if (density < 1) {
1360  build_sparse_random_matrix(sparse_mat, n_random_vectors, dim, density);
1361  } else {
1362  build_dense_random_matrix(dense_mat, n_random_vectors, dim);
1363  }
1364 
1365  double start_proj = omp_get_wtime();
1366  Eigen::VectorXf projected_query(n_random_vectors);
1367 
1368  if (density < 1) {
1369  projected_query.noalias() = sparse_mat * Q.col(0);
1370  } else {
1371  projected_query.noalias() = dense_mat * Q.col(0);
1372  }
1373 
1374  double end_proj = omp_get_wtime();
1375  projection_times.push_back(end_proj - start_proj);
1376  idx_sum += projected_query.norm();
1377 
1378  int votes_index = votes_max < t ? votes_max : t;
1379  for (int v = 1; v <= votes_index; ++v) {
1380  int cs_size = get_candidate_set_size(t, d, v);
1381  if (cs_size > 0) exact_x.push_back(cs_size);
1382  }
1383  }
1384  }
1385 
1386  // use results to ensure that the compiler does not optimize away the timed code.
1387  projection_x[0] += idx_sum > 1.0 ? 0.0000 : 0.0001;
1388  return fit_theil_sen(projection_x, projection_times);
1389  }
1390 
1391  std::vector<std::map<int,std::pair<double,double>>> fit_voting_times(const Eigen::Map<const Eigen::MatrixXf> &Q) {
1392  int n_test = Q.cols();
1393 
1394  std::random_device rd;
1395  std::mt19937 rng(rd());
1396  std::uniform_int_distribution<int> uni(0, n_test - 1);
1397 
1398  std::vector<int> tested_trees {1,2,3,4,5,7,10,15,20,25,30,40,50};
1399  generate_x(tested_trees, n_trees, 10, n_trees);
1400  std::vector<int> vote_thresholds_x {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
1401  generate_x(vote_thresholds_x, votes_max, 10, votes_max);
1402 
1403  beta_voting = std::vector<std::map<int,std::pair<double,double>>>();
1404 
1405  for (int d = depth_min; d <= depth; ++d) {
1406  std::map<int,std::pair<double,double>> beta;
1407  for (const auto &v : vote_thresholds_x) {
1408  long double idx_sum = 0;
1409  std::vector<double> voting_times, voting_x;
1410 
1411  for (int i = 0; i < (int) tested_trees.size(); ++i) {
1412  int t = tested_trees[i];
1413  int n_el = 0;
1414  Eigen::VectorXi elected;
1415  auto ri = uni(rng);
1416 
1417  Eigen::VectorXf projected_query(n_trees * depth);
1418  if (density < 1) {
1419  projected_query.noalias() = sparse_random_matrix * Q.col(ri);
1420  } else {
1421  projected_query.noalias() = dense_random_matrix * Q.col(ri);
1422  }
1423 
1424  double start_voting = omp_get_wtime();
1425  vote(projected_query, v, elected, n_el, t, d);
1426  double end_voting = omp_get_wtime();
1427 
1428  voting_times.push_back(end_voting - start_voting);
1429  voting_x.push_back(t);
1430  for (int i = 0; i < n_el; ++i)
1431  idx_sum += elected(i);
1432  }
1433  voting_x[0] += idx_sum > 1.0 ? 0.0 : 0.00001;
1434  beta[v] = fit_theil_sen(voting_x, voting_times);
1435  }
1436  beta_voting.push_back(beta);
1437  }
1438 
1439  return beta_voting;
1440  }
1441 
1442  static void generate_x(std::vector<int> &x, int max_generated, int n_tested, int max_val) {
1443  n_tested = max_generated > n_tested ? n_tested : max_val;
1444  int increment = max_generated / n_tested;
1445  for (int i = 1; i <= n_tested; ++i) {
1446  if (std::find(x.begin(), x.end(), i * increment) == x.end() && i * increment <= max_generated) {
1447  x.push_back(i * increment);
1448  }
1449  }
1450 
1451  auto end = std::remove_if(x.begin(), x.end(), [max_val](int t) { return t > max_val; });
1452  x.erase(end, x.end());
1453  }
1454 
1455  std::pair<double,double> fit_exact_times(const Eigen::Map<const Eigen::MatrixXf> &Q) {
1456  std::vector<int> s_tested {1,2,5,10,20,35,50,75,100,150,200,300,400,500};
1457  generate_x(s_tested, n_samples / 20, 20, n_samples);
1458 
1459  int n_test = Q.cols();
1460  std::vector<double> exact_times;
1461  long double idx_sum = 0;
1462 
1463  std::random_device rd;
1464  std::mt19937 rng(rd());
1465  std::uniform_int_distribution<int> uni(0, n_test - 1);
1466  std::uniform_int_distribution<int> uni2(0, n_samples - 1);
1467 
1468  std::vector<double> ex;
1469  int n_sim = 100;
1470  for (int i = 0; i < (int) s_tested.size(); ++i) {
1471  double mean_exact_time = 0;
1472  int s_size = s_tested[i];
1473  ex.push_back(s_size);
1474 
1475  for (int m = 0; m < n_sim; ++m) {
1476  auto ri = uni(rng);
1477  Eigen::VectorXi elected(s_size);
1478  for (int j = 0; j < elected.size(); ++j)
1479  elected(j) = uni2(rng);
1480 
1481  double start_exact = omp_get_wtime();
1482  std::vector<int> res(k);
1483  exact_knn(Eigen::Map<const Eigen::VectorXf>(Q.data() + ri * dim, dim), k, elected, s_size, &res[0]);
1484  double end_exact = omp_get_wtime();
1485  mean_exact_time += (end_exact - start_exact);
1486 
1487  for (int l = 0; l < k; ++l)
1488  idx_sum += res[l];
1489  }
1490  mean_exact_time /= n_sim;
1491  exact_times.push_back(mean_exact_time);
1492  }
1493 
1494  ex[0] += idx_sum > 1.0 ? 0.0 : 0.00001;
1495  return fit_theil_sen(ex, exact_times);
1496  }
1497 
1498  std::set<Mrpt_Parameters,decltype(is_faster)*> list_parameters(const std::vector<Eigen::MatrixXd> &recalls) {
1499  std::set<Mrpt_Parameters,decltype(is_faster)*> pars(is_faster);
1500  std::vector<Eigen::MatrixXd> query_times(depth - depth_min + 1);
1501  for (int d = depth_min; d <= depth; ++d) {
1502  Eigen::MatrixXd query_time = Eigen::MatrixXd::Zero(votes_max, n_trees);
1503 
1504  for (int t = 1; t <= n_trees; ++t) {
1505  int votes_index = votes_max < t ? votes_max : t;
1506  for (int v = 1; v <= votes_index; ++v) {
1507  double qt = get_query_time(t, d, v);
1508  query_time(v - 1, t - 1) = qt;
1509  Mrpt_Parameters p;
1510  p.n_trees = t;
1511  p.depth = d;
1512  p.votes = v;
1513  p.k = k;
1514  p.estimated_qtime = qt;
1515  p.estimated_recall = recalls[d - depth_min](v - 1, t - 1);
1516  pars.insert(p);
1517  }
1518  }
1519 
1520  query_times[d - depth_min] = query_time;
1521  }
1522 
1523  return pars;
1524  }
1525 
1526  std::set<Mrpt_Parameters,decltype(is_faster)*> pareto_frontier(const std::set<Mrpt_Parameters,decltype(is_faster)*> &pars) {
1527  opt_pars = std::set<Mrpt_Parameters,decltype(is_faster)*>(is_faster);
1528  double best_recall = -1.0;
1529  for (const auto &p : pars) { // compute pareto frontier for query times and recalls
1530  if (p.estimated_recall > best_recall) {
1531  opt_pars.insert(p);
1532  best_recall = p.estimated_recall;
1533  }
1534  }
1535 
1536  return opt_pars;
1537  }
1538 
1539  void fit_times(const Eigen::Map<const Eigen::MatrixXf> &Q) {
1540  std::vector<int> exact_x;
1541  beta_projection = fit_projection_times(Q, exact_x);
1542  beta_voting = fit_voting_times(Q);
1543  beta_exact = fit_exact_times(Q);
1544  }
1545 
1546  static std::pair<double,double> fit_theil_sen(const std::vector<double> &x,
1547  const std::vector<double> &y) {
1548  int n = x.size();
1549  std::vector<double> slopes;
1550  for (int i = 0; i < n; ++i) {
1551  for (int j = 0; j < n; ++j) {
1552  if (i != j)
1553  slopes.push_back((y[j] - y[i]) / (x[j] - x[i]));
1554  }
1555  }
1556 
1557  int n_slopes = slopes.size();
1558  std::nth_element(slopes.begin(), slopes.begin() + n_slopes / 2, slopes.end());
1559  double slope = *(slopes.begin() + n_slopes / 2);
1560 
1561  std::vector<double> residuals(n);
1562  for (int i = 0; i < n; ++i)
1563  residuals[i] = y[i] - slope * x[i];
1564 
1565  std::nth_element(residuals.begin(), residuals.begin() + n / 2, residuals.end());
1566  double intercept = *(residuals.begin() + n / 2);
1567 
1568  return std::make_pair(intercept, slope);
1569  }
1570 
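// --------------------------------------------------------------------------
// Illustrative note (not part of the header): fit_theil_sen() returns
// (intercept, slope), where the slope is the median of all pairwise slopes
// and the intercept is the median residual. For example, for the points
// (1,1), (2,2), (3,10) the pairwise slopes are {1, 4.5, 8}, so the fitted
// slope is 4.5; the residuals y - 4.5 x are {-3.5, -7, -3.5}, so the fitted
// intercept is -3.5.
// --------------------------------------------------------------------------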
1571  void write_parameters(const Mrpt_Parameters *p, FILE *fd) const {
1572  if (!fd) {
1573  return;
1574  }
1575 
1576  fwrite(&p->n_trees, sizeof(int), 1, fd);
1577  fwrite(&p->depth, sizeof(int), 1, fd);
1578  fwrite(&p->votes, sizeof(int), 1, fd);
1579  fwrite(&p->k, sizeof(int), 1, fd);
1580  fwrite(&p->estimated_qtime, sizeof(double), 1, fd);
1581  fwrite(&p->estimated_recall, sizeof(double), 1, fd);
1582  }
1583 
1584  void read_parameters(Mrpt_Parameters *p, FILE *fd) {
1585  fread(&p->n_trees, sizeof(int), 1, fd);
1586  fread(&p->depth, sizeof(int), 1, fd);
1587  fread(&p->votes, sizeof(int), 1, fd);
1588  fread(&p->k, sizeof(int), 1, fd);
1589  fread(&p->estimated_qtime, sizeof(double), 1, fd);
1590  fread(&p->estimated_recall, sizeof(double), 1, fd);
1591  }
1592 
1593  void write_parameter_list(const std::set<Mrpt_Parameters,decltype(is_faster)*> &pars, FILE *fd) const {
1594  if (!fd) {
1595  return;
1596  }
1597 
1598  int par_sz = pars.size();
1599  fwrite(&par_sz, sizeof(int), 1, fd);
1600 
1601  for (const auto p : pars)
1602  write_parameters(&p, fd);
1603  }
1604 
1605  void read_parameter_list(FILE *fd) {
1606  if (!fd) {
1607  return;
1608  }
1609 
1610  opt_pars = std::set<Mrpt_Parameters,decltype(is_faster)*>(is_faster);
1611  int par_sz = 0;
1612  fread(&par_sz, sizeof(int), 1, fd);
1613 
1614  for (int i = 0; i < par_sz; ++i) {
1615  Mrpt_Parameters p;
1616  read_parameters(&p, fd);
1617  opt_pars.insert(p);
1618  }
1619  }
1620 
1621  Mrpt_Parameters parameters(double target_recall) const {
1622  double tr = target_recall - epsilon;
1623  for (const auto &p : opt_pars) {
1624  if (p.estimated_recall > tr) {
1625  return p;
1626  }
1627  }
1628 
1629  if (!opt_pars.empty()) {
1630  return *(opt_pars.rbegin());
1631  }
1632 
1633  return Mrpt_Parameters();
1634  }
1635 
1641  static void count_leaf_sizes(int n, int level, int tree_depth, std::vector<int> &out_leaf_sizes) {
1642  if (level == tree_depth) {
1643  out_leaf_sizes.push_back(n);
1644  return;
1645  }
1646 
1647  count_leaf_sizes(n - n / 2, level + 1, tree_depth, out_leaf_sizes);
1648  count_leaf_sizes(n / 2, level + 1, tree_depth, out_leaf_sizes);
1649  }
1650 
1657  static void count_first_leaf_indices(std::vector<int> &indices, int n, int depth) {
1658  std::vector<int> leaf_sizes;
1659  count_leaf_sizes(n, 0, depth, leaf_sizes);
1660 
1661  indices = std::vector<int>(leaf_sizes.size() + 1);
1662  indices[0] = 0;
1663  for (int i = 0; i < (int) leaf_sizes.size(); ++i)
1664  indices[i + 1] = indices[i] + leaf_sizes[i];
1665  }
1666 
1667  static void count_first_leaf_indices_all(std::vector<std::vector<int>> &indices, int n, int depth_max) {
1668  for (int d = 0; d <= depth_max; ++d) {
1669  std::vector<int> idx;
1670  count_first_leaf_indices(idx, n, d);
1671  indices.push_back(idx);
1672  }
1673  }
1674 
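// --------------------------------------------------------------------------
// Illustrative note (not part of the header): count_leaf_sizes() splits n
// points as evenly as possible, putting the extra point of an odd split into
// the left child. For n = 10 and depth = 2 the leaf sizes are {3, 2, 3, 2},
// so count_first_leaf_indices() produces {0, 3, 5, 8, 10}: leaf i of a tree
// occupies positions [indices[i], indices[i+1]) of that tree's entry in
// tree_leaves.
// --------------------------------------------------------------------------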
1675  static double predict_theil_sen(double x, std::pair<double,double> beta) {
1676  return beta.first + beta.second * x;
1677  }
1678 
1679  double get_candidate_set_size(int tree, int depth, int v) const {
1680  return cs_sizes[depth - depth_min](v - 1, tree - 1);
1681  }
1682 
1683  double get_projection_time(int n_trees, int depth, int v) const {
1684  return predict_theil_sen(n_trees * depth, beta_projection);
1685  }
1686 
1687  double get_voting_time(int n_trees, int depth, int v) const {
1688  const std::map<int,std::pair<double,double>> &beta = beta_voting[depth - depth_min];
1689 
1690  if (v <= 0 || beta.empty()) {
1691  return 0.0;
1692  }
1693 
1694  for (const auto &b : beta) {
1695  if (v <= b.first) {
1696  return predict_theil_sen(n_trees, b.second);
1697  }
1698  }
1699 
1700  return predict_theil_sen(n_trees, beta.rbegin()->second);
1701  }
1702 
1703  double get_exact_time(int n_trees, int depth, int v) const {
1704  return predict_theil_sen(get_candidate_set_size(n_trees, depth, v), beta_exact);
1705  }
1706 
1707  double get_query_time(int tree, int depth, int v) const {
1708  return get_projection_time(tree, depth, v)
1709  + get_voting_time(tree, depth, v)
1710  + get_exact_time(tree, depth, v);
1711  }
1712 
1713  std::vector<int> sample_indices(int n_test, int seed = 0) const {
1714  std::random_device rd;
1715  int s = seed ? seed : rd();
1716  std::mt19937 gen(s);
1717 
1718  std::vector<int> indices_data(n_samples);
1719  std::iota(indices_data.begin(), indices_data.end(), 0);
1720  std::shuffle(indices_data.begin(), indices_data.end(), gen);
1721  return std::vector<int>(indices_data.begin(), indices_data.begin() + n_test);
1722  }
1723 
1724  Eigen::MatrixXf subset(const std::vector<int> &indices) const {
1725  int n_test = indices.size();
1726  Eigen::MatrixXf Q = Eigen::MatrixXf(dim, n_test);
1727  for(int i = 0; i < n_test; ++i)
1728  Q.col(i) = X.col(indices[i]);
1729 
1730  return Q;
1731  }
1732 
1733 
1734  const Eigen::Map<const Eigen::MatrixXf> X; // the data matrix
1735  Eigen::MatrixXf split_points; // all split points in all trees
1736  std::vector<std::vector<int>> tree_leaves; // contains all leaves of all trees
1737  Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> dense_random_matrix; // random vectors needed for all the RP-trees
1738  Eigen::SparseMatrix<float, Eigen::RowMajor> sparse_random_matrix; // random vectors needed for all the RP-trees
1739  std::vector<std::vector<int>> leaf_first_indices_all; // first indices for each level
1740  std::vector<int> leaf_first_indices; // first indices of each leaf of tree in tree_leaves
1741 
1742  const int n_samples; // sample size of data
1743  const int dim; // dimension of data
1744  Mrpt_Parameters par;
1745  int n_trees = 0; // number of RP-trees
1746  int depth = 0; // depth of an RP-tree with median split
1747  float density = -1.0; // expected ratio of non-zero components in a projection matrix
1748  int n_pool = 0; // amount of random vectors needed for all the RP-trees
1749  int n_array = 0; // length of the one RP-tree as array
1750  int votes = 0; // optimal number of votes to use
1751  int k = 0;
1752  enum itype {normal, autotuned, autotuned_unpruned};
1753  itype index_type = normal;
1754 
1755  // Member variables used in autotuning:
1756  int depth_min = 0;
1757  int votes_max = 0;
1758  const double epsilon = 0.0001; // error bound for comparisons of recall levels
1759  std::vector<Eigen::MatrixXd> cs_sizes;
1760  std::pair<double,double> beta_projection, beta_exact;
1761  std::vector<std::map<int,std::pair<double,double>>> beta_voting;
1762  std::set<Mrpt_Parameters,decltype(is_faster)*> opt_pars;
1763 };
1764 
1765 #endif // CPP_MRPT_H_
1 #ifndef CPP_MRPT_H_
2 #define CPP_MRPT_H_
3 
4 #include <algorithm>
5 #include <cmath>
6 #include <functional>
7 #include <map>
8 #include <numeric>
9 #include <random>
10 #include <set>
11 #include <stdexcept>
12 #include <string>
13 #include <utility>
14 #include <vector>
15 
16 #include <Eigen/Dense>
17 #include <Eigen/SparseCore>
18 
20  int n_trees = 0;
21  int depth = 0;
22  int k = 0;
23  int votes = 0;
24  double estimated_qtime = 0.0;
25  double estimated_recall = 0.0;
26 };
27 
28 class Mrpt {
29  public:
49  Mrpt(const Eigen::Ref<const Eigen::MatrixXf> &X_) :
50  X(Eigen::Map<const Eigen::MatrixXf>(X_.data(), X_.rows(), X_.cols())),
51  n_samples(X_.cols()),
52  dim(X_.rows()) {}
53 
60  Mrpt(const float *X_, int dim_, int n_samples_) :
61  X(Eigen::Map<const Eigen::MatrixXf>(X_, dim_, n_samples_)),
62  n_samples(n_samples_),
63  dim(dim_) {}
64 
84  void grow(int n_trees_, int depth_, float density_ = -1.0, int seed = 0) {
85 
86  if (!empty()) {
87  throw std::logic_error("The index has already been grown.");
88  }
89 
90  if (n_trees_ <= 0) {
91  throw std::out_of_range("The number of trees must be positive.");
92  }
93 
94  if (depth_ <= 0 || depth_ > std::log2(n_samples)) {
95  throw std::out_of_range("The depth must belong to the set {1, ... , log2(n)}.");
96  }
97 
98  if (density_ < -1.0001 || density_ > 1.0001 || (density_ > -0.9999 && density_ < -0.0001)) {
99  throw std::out_of_range("The density must be on the interval (0,1].");
100  }
101 
102  n_trees = n_trees_;
103  depth = depth_;
104  n_pool = n_trees_ * depth_;
105  n_array = 1 << (depth_ + 1);
106 
107  if (density_ < 0) {
108  density = 1.0 / std::sqrt(dim);
109  } else {
110  density = density_;
111  }
112 
113  density < 1 ? build_sparse_random_matrix(sparse_random_matrix, n_pool, dim, density, seed) :
114  build_dense_random_matrix(dense_random_matrix, n_pool, dim, seed);
115 
116  split_points = Eigen::MatrixXf(n_array, n_trees);
117  tree_leaves = std::vector<std::vector<int>>(n_trees);
118 
119  count_first_leaf_indices_all(leaf_first_indices_all, n_samples, depth);
120  leaf_first_indices = leaf_first_indices_all[depth];
121 
122  #pragma omp parallel for
123  for (int n_tree = 0; n_tree < n_trees; ++n_tree) {
124  Eigen::MatrixXf tree_projections;
125 
126  if (density < 1)
127  tree_projections.noalias() = sparse_random_matrix.middleRows(n_tree * depth, depth) * X;
128  else
129  tree_projections.noalias() = dense_random_matrix.middleRows(n_tree * depth, depth) * X;
130 
131  tree_leaves[n_tree] = std::vector<int>(n_samples);
132  std::vector<int> &indices = tree_leaves[n_tree];
133  std::iota(indices.begin(), indices.end(), 0);
134 
135  grow_subtree(indices.begin(), indices.end(), 0, 0, n_tree, tree_projections);
136  }
137  }
138 
178  void grow(double target_recall, const Eigen::Ref<const Eigen::MatrixXf> &Q, int k_, int trees_max = -1,
179  int depth_max = -1, int depth_min_ = -1, int votes_max_ = -1,
180  float density = -1.0, int seed = 0) {
181  if (target_recall < 0.0 - epsilon || target_recall > 1.0 + epsilon) {
182  throw std::out_of_range("Target recall must be on the interval [0,1].");
183  }
184 
185  grow(Q, k_, trees_max, depth_max, depth_min_, votes_max_, density, seed);
186  prune(target_recall);
187  }
188 
218  void grow(double target_recall, const float *Q, int n_test, int k_, int trees_max = -1,
219  int depth_max = -1, int depth_min_ = -1, int votes_max_ = -1,
220  float density = -1.0, int seed = 0, const std::vector<int> &indices_test = {}) {
221  if (target_recall < 0.0 - epsilon || target_recall > 1.0 + epsilon) {
222  throw std::out_of_range("Target recall must be on the interval [0,1].");
223  }
224 
225  grow(Q, n_test, k_, trees_max, depth_max, depth_min_, votes_max_, density, seed, indices_test);
226  prune(target_recall);
227  }
228 
256  void grow_autotune(double target_recall, int k_, int trees_max = -1, int depth_max = -1, int depth_min_ = -1,
257  int votes_max_ = -1, float density_ = -1.0, int seed = 0, int n_test = 100) {
258  if (n_test < 1) {
259  throw std::out_of_range("Test set size must be > 0.");
260  }
261 
262  n_test = n_test > n_samples ? n_samples : n_test;
263  std::vector<int> indices_test(sample_indices(n_test, seed));
264  const Eigen::MatrixXf Q(subset(indices_test));
265 
266  grow(target_recall, Q.data(), Q.cols(), k_, trees_max,
267  depth_max, depth_min_, votes_max_, density_, seed, indices_test);
268  }
269 
281  Mrpt_Parameters parameters() const {
282  if (index_type == normal || index_type == autotuned_unpruned) {
283  Mrpt_Parameters p;
284  p.n_trees = n_trees;
285  p.depth = depth;
286  p.k = par.k;
287  return p;
288  }
289 
290  return par;
291  }
292 
298  bool is_autotuned() const {
299  return index_type == autotuned;
300  }
301 
344  void grow(const float *data, int n_test, int k_, int trees_max = -1, int depth_max = -1,
345  int depth_min_ = -1, int votes_max_ = -1, float density_ = -1.0, int seed = 0,
346  const std::vector<int> &indices_test = {}) {
347 
348  if (trees_max == - 1) {
349  trees_max = std::min(std::sqrt(n_samples), 1000.0);
350  }
351 
352  if (depth_min_ == -1) {
353  depth_min_ = std::max(static_cast<int>(std::log2(n_samples) - 11), 5);
354  }
355 
356  if (depth_max == -1) {
357  depth_max = std::max(static_cast<int>(std::log2(n_samples) - 4), depth_min_);
358  }
359 
360  if (votes_max_ == -1) {
361  votes_max_ = std::max(trees_max / 10, std::min(trees_max, 10));
362  }
363 
364  if (density_ > -1.0001 && density_ < -0.9999) {
365  density_ = 1.0 / std::sqrt(dim);
366  }
367 
368  if (!empty()) {
369  throw std::logic_error("The index has already been grown.");
370  }
371 
372  if (k_ <= 0 || k_ > n_samples) {
373  throw std::out_of_range("k_ must belong to the set {1, ..., n}.");
374  }
375 
376  if (trees_max <= 0) {
377  throw std::out_of_range("trees_max must be positive.");
378  }
379 
380  if (depth_max <= 0 || depth_max > std::log2(n_samples)) {
381  throw std::out_of_range("depth_max must belong to the set {1, ... , log2(n)}.");
382  }
383 
384  if (depth_min_ <= 0 || depth_min_ > depth_max) {
385  throw std::out_of_range("depth_min_ must belong to the set {1, ... , depth_max}");
386  }
387 
388  if (votes_max_ <= 0 || votes_max_ > trees_max) {
389  throw std::out_of_range("votes_max_ must belong to the set {1, ... , trees_max}.");
390  }
391 
392  if (density_ < 0.0 || density_ > 1.0001) {
393  throw std::out_of_range("The density must be on the interval (0,1].");
394  }
395 
396  if(n_samples < 101) {
397  throw std::out_of_range("Sample size must be at least 101 to autotune an index.");
398  }
399 
400  depth_min = depth_min_;
401  votes_max = votes_max_;
402  k = k_;
403 
404  const Eigen::Map<const Eigen::MatrixXf> Q(data, dim, n_test);
405 
406  grow(trees_max, depth_max, density_, seed);
407  Eigen::MatrixXi exact(k, n_test);
408  compute_exact(Q, exact, indices_test);
409 
410  std::vector<Eigen::MatrixXd> recalls(depth_max - depth_min + 1);
411  cs_sizes = std::vector<Eigen::MatrixXd>(depth_max - depth_min + 1);
412 
413  for (int d = depth_min; d <= depth_max; ++d) {
414  recalls[d - depth_min] = Eigen::MatrixXd::Zero(votes_max, trees_max);
415  cs_sizes[d - depth_min] = Eigen::MatrixXd::Zero(votes_max, trees_max);
416  }
417 
418  for (int i = 0; i < n_test; ++i) {
419  std::vector<Eigen::MatrixXd> recall_tmp(depth_max - depth_min + 1);
420  std::vector<Eigen::MatrixXd> cs_size_tmp(depth_max - depth_min + 1);
421 
422  count_elected(Q.col(i), Eigen::Map<Eigen::VectorXi>(exact.data() + i * k, k),
423  votes_max, recall_tmp, cs_size_tmp);
424 
425  for (int d = depth_min; d <= depth_max; ++d) {
426  recalls[d - depth_min] += recall_tmp[d - depth_min];
427  cs_sizes[d - depth_min] += cs_size_tmp[d - depth_min];
428  }
429  }
430 
431  for (int d = depth_min; d <= depth_max; ++d) {
432  recalls[d - depth_min] /= (k * n_test);
433  cs_sizes[d - depth_min] /= n_test;
434  }
435 
436  fit_times(Q);
437  std::set<Mrpt_Parameters,decltype(is_faster)*> pars = list_parameters(recalls);
438  opt_pars = pareto_frontier(pars);
439 
440  index_type = autotuned_unpruned;
441  par.k = k_;
442  }
443 
468  void grow(const Eigen::Ref<const Eigen::MatrixXf> &Q, int k_, int trees_max = -1, int depth_max = -1,
469  int depth_min_ = -1, int votes_max_ = -1, float density_ = -1.0, int seed = 0) {
470  if (Q.rows() != dim) {
471  throw std::invalid_argument("Dimensions of the data and the validation set do not match.");
472  }
473 
474  grow(Q.data(), Q.cols(), k_, trees_max,
475  depth_max, depth_min_, votes_max_, density_, seed);
476  }
477 
503  void grow_autotune(int k_, int trees_max = -1, int depth_max = -1, int depth_min_ = -1,
504  int votes_max_ = -1, float density_ = -1.0, int seed = 0, int n_test = 100) {
505  if (n_test < 1) {
506  throw std::out_of_range("Test set size must be > 0.");
507  }
508 
509  n_test = n_test > n_samples ? n_samples : n_test;
510  std::vector<int> indices_test(sample_indices(n_test, seed));
511  const Eigen::MatrixXf Q(subset(indices_test));
512 
513  grow(Q.data(), Q.cols(), k_, trees_max,
514  depth_max, depth_min_, votes_max_, density_, seed, indices_test);
515  }
516 
527  Mrpt subset(double target_recall) const {
528  if (target_recall < 0.0 - epsilon || target_recall > 1.0 + epsilon) {
529  throw std::out_of_range("Target recall must be on the interval [0,1].");
530  }
531 
532  Mrpt index2(X);
533  index2.par = parameters(target_recall);
534 
535  int depth_max = depth;
536 
537  index2.n_trees = index2.par.n_trees;
538  index2.depth = index2.par.depth;
539  index2.votes = index2.par.votes;
540  index2.n_pool = index2.depth * index2.n_trees;
541  index2.n_array = 1 << (index2.depth + 1);
542  index2.tree_leaves.assign(tree_leaves.begin(), tree_leaves.begin() + index2.n_trees);
543  index2.leaf_first_indices_all = leaf_first_indices_all;
544  index2.density = density;
545  index2.k = k;
546 
547  index2.split_points = split_points.topLeftCorner(index2.n_array, index2.n_trees);
548  index2.leaf_first_indices = leaf_first_indices_all[index2.depth];
549  if (index2.density < 1) {
550  index2.sparse_random_matrix = Eigen::SparseMatrix<float, Eigen::RowMajor>(index2.n_pool, index2.dim);
551  for (int n_tree = 0; n_tree < index2.n_trees; ++n_tree)
552  index2.sparse_random_matrix.middleRows(n_tree * index2.depth, index2.depth) =
553  sparse_random_matrix.middleRows(n_tree * depth_max, index2.depth);
554  } else {
555  index2.dense_random_matrix = Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>(index2.n_pool, index2.dim);
556  for (int n_tree = 0; n_tree < index2.n_trees; ++n_tree)
557  index2.dense_random_matrix.middleRows(n_tree * index2.depth, index2.depth) =
558  dense_random_matrix.middleRows(n_tree * depth_max, index2.depth);
559  }
560  index2.index_type = autotuned;
561 
562  return index2;
563  }
564 
565 
577  Mrpt *subset_pointer(double target_recall) const {
578  if (target_recall < 0.0 - epsilon || target_recall > 1.0 + epsilon) {
579  throw std::out_of_range("Target recall must be on the interval [0,1].");
580  }
581 
582  Mrpt *index2 = new Mrpt(X);
583  index2->par = parameters(target_recall);
584 
585  int depth_max = depth;
586 
587  index2->n_trees = index2->par.n_trees;
588  index2->depth = index2->par.depth;
589  index2->votes = index2->par.votes;
590  index2->n_pool = index2->depth * index2->n_trees;
591  index2->n_array = 1 << (index2->depth + 1);
592  index2->tree_leaves.assign(tree_leaves.begin(), tree_leaves.begin() + index2->n_trees);
593  index2->leaf_first_indices_all = leaf_first_indices_all;
594  index2->density = density;
595  index2->k = k;
596 
597  index2->split_points = split_points.topLeftCorner(index2->n_array, index2->n_trees);
598  index2->leaf_first_indices = leaf_first_indices_all[index2->depth];
599  if (index2->density < 1) {
600  index2->sparse_random_matrix = Eigen::SparseMatrix<float, Eigen::RowMajor>(index2->n_pool, index2->dim);
601  for (int n_tree = 0; n_tree < index2->n_trees; ++n_tree)
602  index2->sparse_random_matrix.middleRows(n_tree * index2->depth, index2->depth) =
603  sparse_random_matrix.middleRows(n_tree * depth_max, index2->depth);
604  } else {
605  index2->dense_random_matrix = Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>(index2->n_pool, index2->dim);
606  for (int n_tree = 0; n_tree < index2->n_trees; ++n_tree)
607  index2->dense_random_matrix.middleRows(n_tree * index2->depth, index2->depth) =
608  dense_random_matrix.middleRows(n_tree * depth_max, index2->depth);
609  }
610  index2->index_type = autotuned;
611 
612  return index2;
613  }
614 
615 
625  std::vector<Mrpt_Parameters> optimal_parameters() const {
626  if (index_type == normal) {
627  throw std::logic_error("The list of optimal parameters cannot be retrieved for the non-autotuned index.");
628  }
629  if (index_type == autotuned) {
630  throw std::logic_error("The list of optimal parameters cannot be retrieved for the index which has already been subsetted or deleted to the target recall level.");
631  }
632 
633  std::vector<Mrpt_Parameters> new_pars;
634  std::copy(opt_pars.begin(), opt_pars.end(), std::back_inserter(new_pars));
635  return new_pars;
636  }
637 
661  void query(const float *data, int k, int vote_threshold, int *out,
662  float *out_distances = nullptr, int *out_n_elected = nullptr) const {
663 
664  if (k <= 0 || k > n_samples) {
665  throw std::out_of_range("k must belong to the set {1, ..., n}.");
666  }
667 
668  if (vote_threshold <= 0 || vote_threshold > n_trees) {
669  throw std::out_of_range("vote_threshold must belong to the set {1, ... , n_trees}.");
670  }
671 
672  if (empty()) {
673  throw std::logic_error("The index must be built before making queries.");
674  }
675 
676  const Eigen::Map<const Eigen::VectorXf> q(data, dim);
677 
678  Eigen::VectorXf projected_query(n_pool);
679  if (density < 1)
680  projected_query.noalias() = sparse_random_matrix * q;
681  else
682  projected_query.noalias() = dense_random_matrix * q;
683 
684  std::vector<int> found_leaves(n_trees);
685 
686  /*
687  * The following loops over all trees, and routes the query to exactly one
688  * leaf in each.
689  */
690  #pragma omp parallel for
691  for (int n_tree = 0; n_tree < n_trees; ++n_tree) {
692  int idx_tree = 0;
693  for (int d = 0; d < depth; ++d) {
694  const int j = n_tree * depth + d;
695  const int idx_left = 2 * idx_tree + 1;
696  const int idx_right = idx_left + 1;
697  const float split_point = split_points(idx_tree, n_tree);
698  if (projected_query(j) <= split_point) {
699  idx_tree = idx_left;
700  } else {
701  idx_tree = idx_right;
702  }
703  }
704  found_leaves[n_tree] = idx_tree - (1 << depth) + 1;
705  }
706 
707  int n_elected = 0, max_leaf_size = n_samples / (1 << depth) + 1;
708  Eigen::VectorXi elected(n_trees * max_leaf_size);
709  Eigen::VectorXi votes = Eigen::VectorXi::Zero(n_samples);
710 
711  // count votes
712  for (int n_tree = 0; n_tree < n_trees; ++n_tree) {
713  int leaf_begin = leaf_first_indices[found_leaves[n_tree]];
714  int leaf_end = leaf_first_indices[found_leaves[n_tree] + 1];
715  const std::vector<int> &indices = tree_leaves[n_tree];
716  for (int i = leaf_begin; i < leaf_end; ++i) {
717  int idx = indices[i];
718  if (++votes(idx) == vote_threshold)
719  elected(n_elected++) = idx;
720  }
721  }
722 
723  if (out_n_elected) {
724  *out_n_elected = n_elected;
725  }
726 
727  exact_knn(q, k, elected, n_elected, out, out_distances);
728  }
729 
740  void query(const Eigen::Ref<const Eigen::VectorXf> &q, int k, int vote_threshold, int *out,
741  float *out_distances = nullptr, int *out_n_elected = nullptr) const {
742  query(q.data(), k, vote_threshold, out, out_distances, out_n_elected);
743  }
744 
767  void query(const float *q, int *out, float *out_distances = nullptr,
768  int *out_n_elected = nullptr) const {
769  if (index_type == normal) {
770  throw std::logic_error("The index is not autotuned: k and vote threshold have to be specified.");
771  }
772 
773  if (index_type == autotuned_unpruned) {
774  throw std::logic_error("The target recall level has to be set before making queries.");
775  }
776 
777  query(q, k, votes, out, out_distances, out_n_elected);
778  }
779 
788  void query(const Eigen::Ref<const Eigen::VectorXf> &q, int *out, float *out_distances = nullptr,
789  int *out_n_elected = nullptr) const {
790  query(q.data(), out, out_distances, out_n_elected);
791  }
792 
814  static void exact_knn(const float *q_data, const float *X_data, int dim, int n_samples,
815  int k, int *out, float *out_distances = nullptr) {
816 
817  const Eigen::Map<const Eigen::MatrixXf> X(X_data, dim, n_samples);
818  const Eigen::Map<const Eigen::VectorXf> q(q_data, dim);
819 
820  if (k < 1 || k > n_samples) {
821  throw std::out_of_range("k must be positive and no greater than the sample size of data X.");
822  }
823 
824  Eigen::VectorXf distances(n_samples);
825 
826  #pragma omp parallel for
827  for (int i = 0; i < n_samples; ++i)
828  distances(i) = (X.col(i) - q).squaredNorm();
829 
830  if (k == 1) {
831  Eigen::MatrixXf::Index index;
832  distances.minCoeff(&index);
833  out[0] = index;
834 
835  if (out_distances)
836  out_distances[0] = std::sqrt(distances(index));
837 
838  return;
839  }
840 
841  Eigen::VectorXi idx(n_samples);
842  std::iota(idx.data(), idx.data() + n_samples, 0);
843  std::partial_sort(idx.data(), idx.data() + k, idx.data() + n_samples,
844  [&distances](int i1, int i2) { return distances(i1) < distances(i2); });
845 
846  for (int i = 0; i < k; ++i)
847  out[i] = idx(i);
848 
849  if (out_distances) {
850  for (int i = 0; i < k; ++i)
851  out_distances[i] = std::sqrt(distances(idx(i)));
852  }
853  }
854 
862  static void exact_knn(const Eigen::Ref<const Eigen::VectorXf> &q,
863  const Eigen::Ref<const Eigen::MatrixXf> &X,
864  int k, int *out, float *out_distances = nullptr) {
865  Mrpt::exact_knn(q.data(), X.data(), X.rows(), X.cols(), k, out, out_distances);
866  }
867 
874  void exact_knn(const float *q, int k, int *out, float *out_distances = nullptr) const {
875  Mrpt::exact_knn(q, X.data(), dim, n_samples, k, out, out_distances);
876  }
877 
884  void exact_knn(const Eigen::Ref<const Eigen::VectorXf> &q, int k, int *out,
885  float *out_distances = nullptr) const {
886  Mrpt::exact_knn(q.data(), X.data(), dim, n_samples, k, out, out_distances);
887  }
888 
905  bool save(const char *path) const {
906  FILE *fd;
907  if ((fd = fopen(path, "wb")) == NULL)
908  return false;
909 
910  int i = index_type;
911  fwrite(&i, sizeof(int), 1, fd);
912 
913  if (index_type == 2) {
914  write_parameter_list(opt_pars, fd);
915  }
916 
917  write_parameters(&par, fd);
918  fwrite(&n_trees, sizeof(int), 1, fd);
919  fwrite(&depth, sizeof(int), 1, fd);
920  fwrite(&density, sizeof(float), 1, fd);
921 
922  fwrite(split_points.data(), sizeof(float), n_array * n_trees, fd);
923 
924  // save tree leaves
925  for (int i = 0; i < n_trees; ++i) {
926  int sz = tree_leaves[i].size();
927  fwrite(&sz, sizeof(int), 1, fd);
928  fwrite(&tree_leaves[i][0], sizeof(int), sz, fd);
929  }
930 
931  // save random matrix
932  if (density < 1) {
933  int non_zeros = sparse_random_matrix.nonZeros();
934  fwrite(&non_zeros, sizeof(int), 1, fd);
935  for (int k = 0; k < sparse_random_matrix.outerSize(); ++k) {
936  for (Eigen::SparseMatrix<float, Eigen::RowMajor>::InnerIterator it(sparse_random_matrix, k); it; ++it) {
937  float val = it.value();
938  int row = it.row(), col = it.col();
939  fwrite(&row, sizeof(int), 1, fd);
940  fwrite(&col, sizeof(int), 1, fd);
941  fwrite(&val, sizeof(float), 1, fd);
942  }
943  }
944  } else {
945  fwrite(dense_random_matrix.data(), sizeof(float), n_pool * dim, fd);
946  }
947 
948  fclose(fd);
949  return true;
950  }
951 
958  bool load(const char *path) {
959  FILE *fd;
960  if ((fd = fopen(path, "rb")) == NULL)
961  return false;
962 
963  int i;
964  fread(&i, sizeof(int), 1, fd);
965  index_type = static_cast<itype>(i);
966  if (index_type == autotuned_unpruned) {
967  read_parameter_list(fd);
968  }
969 
970  read_parameters(&par, fd);
971  fread(&n_trees, sizeof(int), 1, fd);
972  fread(&depth, sizeof(int), 1, fd);
973  fread(&density, sizeof(float), 1, fd);
974 
975  n_pool = n_trees * depth;
976  n_array = 1 << (depth + 1);
977 
978  count_first_leaf_indices_all(leaf_first_indices_all, n_samples, depth);
979  leaf_first_indices = leaf_first_indices_all[depth];
980 
981  split_points = Eigen::MatrixXf(n_array, n_trees);
982  fread(split_points.data(), sizeof(float), n_array * n_trees, fd);
983 
984  // load tree leaves
985  tree_leaves = std::vector<std::vector<int>>(n_trees);
986  for (int i = 0; i < n_trees; ++i) {
987  int sz;
988  fread(&sz, sizeof(int), 1, fd);
989  std::vector<int> leaves(sz);
990  fread(&leaves[0], sizeof(int), sz, fd);
991  tree_leaves[i] = leaves;
992  }
993 
994  // load random matrix
995  if (density < 1) {
996  int non_zeros;
997  fread(&non_zeros, sizeof(int), 1, fd);
998 
999  sparse_random_matrix = Eigen::SparseMatrix<float>(n_pool, dim);
1000  std::vector<Eigen::Triplet<float>> triplets;
1001  for (int k = 0; k < non_zeros; ++k) {
1002  int row, col;
1003  float val;
1004  fread(&row, sizeof(int), 1, fd);
1005  fread(&col, sizeof(int), 1, fd);
1006  fread(&val, sizeof(float), 1, fd);
1007  triplets.push_back(Eigen::Triplet<float>(row, col, val));
1008  }
1009 
1010  sparse_random_matrix.setFromTriplets(triplets.begin(), triplets.end());
1011  sparse_random_matrix.makeCompressed();
1012  } else {
1013  dense_random_matrix = Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>(n_pool, dim);
1014  fread(dense_random_matrix.data(), sizeof(float), n_pool * dim, fd);
1015  }
1016 
1017  fclose(fd);
1018 
1019  k = par.k;
1020  votes = par.votes;
1021  return true;
1022  }
1023 
1029  bool empty() const {
1030  return n_trees == 0;
1031  }
1032 
1039  friend class MrptTest;
1040  friend class UtilityTest;
1041 
1045  private:
1046 
1051  void grow_subtree(std::vector<int>::iterator begin, std::vector<int>::iterator end,
1052  int tree_level, int i, int n_tree, const Eigen::MatrixXf &tree_projections) {
1053  int n = end - begin;
1054  int idx_left = 2 * i + 1;
1055  int idx_right = idx_left + 1;
1056 
1057  if (tree_level == depth) return;
1058 
1059  std::nth_element(begin, begin + n / 2, end,
1060  [&tree_projections, tree_level] (int i1, int i2) {
1061  return tree_projections(tree_level, i1) < tree_projections(tree_level, i2);
1062  });
1063  auto mid = end - n / 2;
1064 
1065  if (n % 2) {
1066  split_points(i, n_tree) = tree_projections(tree_level, *(mid - 1));
1067  } else {
1068  auto left_it = std::max_element(begin, mid,
1069  [&tree_projections, tree_level] (int i1, int i2) {
1070  return tree_projections(tree_level, i1) < tree_projections(tree_level, i2);
1071  });
1072  split_points(i, n_tree) = (tree_projections(tree_level, *mid) +
1073  tree_projections(tree_level, *left_it)) / 2.0;
1074  }
1075 
1076  grow_subtree(begin, mid, tree_level + 1, idx_left, n_tree, tree_projections);
1077  grow_subtree(mid, end, tree_level + 1, idx_right, n_tree, tree_projections);
1078  }
1079 
1083  void exact_knn(const Eigen::Map<const Eigen::VectorXf> &q, int k, const Eigen::VectorXi &indices,
1084  int n_elected, int *out, float *out_distances = nullptr) const {
1085 
1086  if (!n_elected) {
1087  for (int i = 0; i < k; ++i)
1088  out[i] = -1;
1089 
1090  if (out_distances) {
1091  for (int i = 0; i < k; ++i)
1092  out_distances[i] = -1;
1093  }
1094 
1095  return;
1096  }
1097 
1098  Eigen::VectorXf distances(n_elected);
1099 
1100  #pragma omp parallel for
1101  for (int i = 0; i < n_elected; ++i)
1102  distances(i) = (X.col(indices(i)) - q).squaredNorm();
1103 
1104  if (k == 1) {
1105  Eigen::MatrixXf::Index index;
1106  distances.minCoeff(&index);
1107  out[0] = n_elected ? indices(index) : -1;
1108 
1109  if (out_distances)
1110  out_distances[0] = n_elected ? std::sqrt(distances(index)) : -1;
1111 
1112  return;
1113  }
1114 
1115  int n_to_sort = n_elected > k ? k : n_elected;
1116  Eigen::VectorXi idx(n_elected);
1117  std::iota(idx.data(), idx.data() + n_elected, 0);
1118  std::partial_sort(idx.data(), idx.data() + n_to_sort, idx.data() + n_elected,
1119  [&distances](int i1, int i2) { return distances(i1) < distances(i2); });
1120 
1121  for (int i = 0; i < k; ++i)
1122  out[i] = i < n_elected ? indices(idx(i)) : -1;
1123 
1124  if (out_distances) {
1125  for (int i = 0; i < k; ++i)
1126  out_distances[i] = i < n_elected ? std::sqrt(distances(idx(i))) : -1;
1127  }
1128  }
1129 
1130  void prune(double target_recall) {
1131  if (target_recall < 0.0 - epsilon || target_recall > 1.0 + epsilon) {
1132  throw std::out_of_range("Target recall must be on the interval [0,1].");
1133  }
1134 
1135  par = parameters(target_recall);
1136  if (!par.n_trees) {
1137  return;
1138  }
1139 
1140  int depth_max = depth;
1141 
1142  n_trees = par.n_trees;
1143  depth = par.depth;
1144  votes = par.votes;
1145  n_pool = depth * n_trees;
1146  n_array = 1 << (depth + 1);
1147 
1148  tree_leaves.resize(n_trees);
1149  tree_leaves.shrink_to_fit();
1150  split_points.conservativeResize(n_array, n_trees);
1151  leaf_first_indices = leaf_first_indices_all[depth];
1152 
1153  if (density < 1) {
1154  Eigen::SparseMatrix<float, Eigen::RowMajor> srm_new(n_pool, dim);
1155  for (int n_tree = 0; n_tree < n_trees; ++n_tree)
1156  srm_new.middleRows(n_tree * depth, depth) = sparse_random_matrix.middleRows(n_tree * depth_max, depth);
1157  sparse_random_matrix = srm_new;
1158  } else {
1159  Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> drm_new(n_pool, dim);
1160  for (int n_tree = 0; n_tree < n_trees; ++n_tree)
1161  drm_new.middleRows(n_tree * depth, depth) = dense_random_matrix.middleRows(n_tree * depth_max, depth);
1162  dense_random_matrix = drm_new;
1163  }
1164 
1165  index_type = autotuned;
1166  }
1167 
1168  void count_elected(const Eigen::VectorXf &q, const Eigen::Map<Eigen::VectorXi> &exact, int votes_max,
1169  std::vector<Eigen::MatrixXd> &recalls, std::vector<Eigen::MatrixXd> &cs_sizes) const {
1170  Eigen::VectorXf projected_query(n_pool);
1171  if (density < 1)
1172  projected_query.noalias() = sparse_random_matrix * q;
1173  else
1174  projected_query.noalias() = dense_random_matrix * q;
1175 
1176  int depth_min = depth - recalls.size() + 1;
1177  std::vector<std::vector<int>> start_indices(n_trees);
1178 
1179  #pragma omp parallel for
1180  for (int n_tree = 0; n_tree < n_trees; ++n_tree) {
1181  start_indices[n_tree] = std::vector<int>(depth - depth_min + 1);
1182  int idx_tree = 0;
1183  for (int d = 0; d < depth; ++d) {
1184  const int j = n_tree * depth + d;
1185  const int idx_left = 2 * idx_tree + 1;
1186  const int idx_right = idx_left + 1;
1187  const float split_point = split_points(idx_tree, n_tree);
1188  if (projected_query(j) <= split_point) {
1189  idx_tree = idx_left;
1190  } else {
1191  idx_tree = idx_right;
1192  }
1193  if (d >= depth_min - 1)
1194  start_indices[n_tree][d - depth_min + 1] = idx_tree - (1 << (d + 1)) + 1;
1195  }
1196  }
1197 
1198  const int *exact_begin = exact.data();
1199  const int *exact_end = exact.data() + exact.size();
1200 
1201  for (int depth_crnt = depth_min; depth_crnt <= depth; ++depth_crnt) {
1202  Eigen::VectorXi votes = Eigen::VectorXi::Zero(n_samples);
1203  const std::vector<int> &leaf_first_indices = leaf_first_indices_all[depth_crnt];
1204 
1205  Eigen::MatrixXd recall(votes_max, n_trees);
1206  Eigen::MatrixXd candidate_set_size(votes_max, n_trees);
1207  recall.col(0) = Eigen::VectorXd::Zero(votes_max);
1208  candidate_set_size.col(0) = Eigen::VectorXd::Zero(votes_max);
1209 
1210  // count votes
1211  for (int n_tree = 0; n_tree < n_trees; ++n_tree) {
1212  std::vector<int> &found_leaves = start_indices[n_tree];
1213 
1214  if (n_tree) {
1215  recall.col(n_tree) = recall.col(n_tree - 1);
1216  candidate_set_size.col(n_tree) = candidate_set_size.col(n_tree - 1);
1217  }
1218 
1219  int leaf_begin = leaf_first_indices[found_leaves[depth_crnt - depth_min]];
1220  int leaf_end = leaf_first_indices[found_leaves[depth_crnt - depth_min] + 1];
1221 
1222  const std::vector<int> &indices = tree_leaves[n_tree];
1223  for (int i = leaf_begin; i < leaf_end; ++i) {
1224  int idx = indices[i];
1225  int v = ++votes(idx);
1226  if (v <= votes_max) {
1227  candidate_set_size(v - 1, n_tree)++;
1228  if (std::find(exact_begin, exact_end, idx) != exact_end)
1229  recall(v - 1, n_tree)++;
1230  }
1231  }
1232  }
1233 
1234  recalls[depth_crnt - depth_min] = recall;
1235  cs_sizes[depth_crnt - depth_min] = candidate_set_size;
1236  }
1237  }
1238 
1248  static void build_sparse_random_matrix(Eigen::SparseMatrix<float, Eigen::RowMajor> &sparse_random_matrix,
1249  int n_row, int n_col, float density, int seed = 0) {
1250  sparse_random_matrix = Eigen::SparseMatrix<float, Eigen::RowMajor>(n_row, n_col);
1251 
1252  std::random_device rd;
1253  int s = seed ? seed : rd();
1254  std::mt19937 gen(s);
1255  std::uniform_real_distribution<float> uni_dist(0, 1);
1256  std::normal_distribution<float> norm_dist(0, 1);
1257 
1258  std::vector<Eigen::Triplet<float>> triplets;
1259  for (int j = 0; j < n_row; ++j) {
1260  for (int i = 0; i < n_col; ++i) {
1261  if (uni_dist(gen) > density) continue;
1262  triplets.push_back(Eigen::Triplet<float>(j, i, norm_dist(gen)));
1263  }
1264  }
1265 
1266  sparse_random_matrix.setFromTriplets(triplets.begin(), triplets.end());
1267  sparse_random_matrix.makeCompressed();
1268  }
1269 
1270  /*
1271  * Builds a random dense matrix for use in random projection. The components of
1272  * the matrix are drawn from the standard normal distribution.
1273  */
1274  static void build_dense_random_matrix(Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> &dense_random_matrix,
1275  int n_row, int n_col, int seed = 0) {
1276  dense_random_matrix = Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>(n_row, n_col);
1277 
1278  std::random_device rd;
1279  int s = seed ? seed : rd();
1280  std::mt19937 gen(s);
1281  std::normal_distribution<float> normal_dist(0, 1);
1282 
1283  std::generate(dense_random_matrix.data(), dense_random_matrix.data() + n_row * n_col,
1284  [&normal_dist, &gen] { return normal_dist(gen); });
1285  }
1286 
1287  void compute_exact(const Eigen::Map<const Eigen::MatrixXf> &Q, Eigen::MatrixXi &out_exact,
1288  const std::vector<int> &indices_test = {}) const {
1289  int n_test = Q.cols();
1290 
1291  Eigen::VectorXi idx(n_samples);
1292  std::iota(idx.data(), idx.data() + n_samples, 0);
1293 
1294  for (int i = 0; i < n_test; ++i) {
1295  if(!indices_test.empty()) {
1296  std::remove(idx.data(), idx.data() + n_samples, indices_test[i]);
1297  }
1298  exact_knn(Eigen::Map<const Eigen::VectorXf>(Q.data() + i * dim, dim), k, idx,
1299  (indices_test.empty() ? n_samples : n_samples - 1), out_exact.data() + i * k);
1300  std::sort(out_exact.data() + i * k, out_exact.data() + i * k + k);
1301  if(!indices_test.empty()) {
1302  idx[n_samples - 1] = indices_test[i];
1303  }
1304  }
1305  }
1306 
1307  static bool is_faster(const Mrpt_Parameters &par1, const Mrpt_Parameters &par2) {
1308  return par1.estimated_qtime < par2.estimated_qtime;
1309  }
1310 
1311  void vote(const Eigen::VectorXf &projected_query, int vote_threshold, Eigen::VectorXi &elected,
1312  int &n_elected, int n_trees, int depth_crnt) {
1313  std::vector<int> found_leaves(n_trees);
1314  const std::vector<int> &leaf_first_indices = leaf_first_indices_all[depth_crnt];
1315 
1316  #pragma omp parallel for
1317  for (int n_tree = 0; n_tree < n_trees; ++n_tree) {
1318  int idx_tree = 0;
1319  for (int d = 0; d < depth_crnt; ++d) {
1320  const int j = n_tree * depth + d;
1321  const int idx_left = 2 * idx_tree + 1;
1322  const int idx_right = idx_left + 1;
1323  const float split_point = split_points(idx_tree, n_tree);
1324  if (projected_query(j) <= split_point) {
1325  idx_tree = idx_left;
1326  } else {
1327  idx_tree = idx_right;
1328  }
1329  }
1330  found_leaves[n_tree] = idx_tree - (1 << depth_crnt) + 1;
1331  }
1332 
1333  int max_leaf_size = n_samples / (1 << depth_crnt) + 1;
1334  elected = Eigen::VectorXi(n_trees * max_leaf_size);
1335  Eigen::VectorXi votes = Eigen::VectorXi::Zero(n_samples);
1336 
1337  // count votes
1338  for (int n_tree = 0; n_tree < n_trees; ++n_tree) {
1339  int leaf_begin = leaf_first_indices[found_leaves[n_tree]];
1340  int leaf_end = leaf_first_indices[found_leaves[n_tree] + 1];
1341  const std::vector<int> &indices = tree_leaves[n_tree];
1342  for (int i = leaf_begin; i < leaf_end; ++i) {
1343  int idx = indices[i];
1344  if (++votes(idx) == vote_threshold)
1345  elected(n_elected++) = idx;
1346  }
1347  }
1348  }
1349 
1350  std::pair<double,double> fit_projection_times(const Eigen::Map<const Eigen::MatrixXf> &Q,
1351  std::vector<int> &exact_x) {
1352  std::vector<double> projection_times, projection_x;
1353  long double idx_sum = 0;
1354 
1355  std::vector<int> tested_trees {1,2,3,4,5,7,10,15,20,25,30,40,50};
1356  generate_x(tested_trees, n_trees, 10, n_trees);
1357 
1358  for (int d = depth_min; d <= depth; ++d) {
1359  for (int i = 0; i < (int) tested_trees.size(); ++i) {
1360  int t = tested_trees[i];
1361  int n_random_vectors = t * d;
1362  projection_x.push_back(n_random_vectors);
1363  Eigen::SparseMatrix<float, Eigen::RowMajor> sparse_mat;
1364  Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> dense_mat;
1365 
1366  if (density < 1) {
1367  build_sparse_random_matrix(sparse_mat, n_random_vectors, dim, density);
1368  } else {
1369  build_dense_random_matrix(dense_mat, n_random_vectors, dim);
1370  }
1371 
1372  double start_proj = omp_get_wtime();
1373  Eigen::VectorXf projected_query(n_random_vectors);
1374 
1375  if (density < 1) {
1376  projected_query.noalias() = sparse_mat * Q.col(0);
1377  } else {
1378  projected_query.noalias() = dense_mat * Q.col(0);
1379  }
1380 
1381  double end_proj = omp_get_wtime();
1382  projection_times.push_back(end_proj - start_proj);
1383  idx_sum += projected_query.norm();
1384 
1385  int votes_index = votes_max < t ? votes_max : t;
1386  for (int v = 1; v <= votes_index; ++v) {
1387  int cs_size = get_candidate_set_size(t, d, v);
1388  if (cs_size > 0) exact_x.push_back(cs_size);
1389  }
1390  }
1391  }
1392 
1393  // use results to ensure that the compiler does not optimize away the timed code.
1394  projection_x[0] += idx_sum > 1.0 ? 0.0000 : 0.0001;
1395  return fit_theil_sen(projection_x, projection_times);
1396  }
1397 
1398  std::vector<std::map<int,std::pair<double,double>>> fit_voting_times(const Eigen::Map<const Eigen::MatrixXf> &Q) {
1399  int n_test = Q.cols();
1400 
1401  std::random_device rd;
1402  std::mt19937 rng(rd());
1403  std::uniform_int_distribution<int> uni(0, n_test - 1);
1404 
1405  std::vector<int> tested_trees {1,2,3,4,5,7,10,15,20,25,30,40,50};
1406  generate_x(tested_trees, n_trees, 10, n_trees);
1407  std::vector<int> vote_thresholds_x {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
1408  generate_x(vote_thresholds_x, votes_max, 10, votes_max);
1409 
1410  beta_voting = std::vector<std::map<int,std::pair<double,double>>>();
1411 
1412  for (int d = depth_min; d <= depth; ++d) {
1413  std::map<int,std::pair<double,double>> beta;
1414  for (const auto &v : vote_thresholds_x) {
1415  long double idx_sum = 0;
1416  std::vector<double> voting_times, voting_x;
1417 
1418  for (int i = 0; i < (int) tested_trees.size(); ++i) {
1419  int t = tested_trees[i];
1420  int n_el = 0;
1421  Eigen::VectorXi elected;
1422  auto ri = uni(rng);
1423 
1424  Eigen::VectorXf projected_query(n_trees * depth);
1425  if (density < 1) {
1426  projected_query.noalias() = sparse_random_matrix * Q.col(ri);
1427  } else {
1428  projected_query.noalias() = dense_random_matrix * Q.col(ri);
1429  }
1430 
1431  double start_voting = omp_get_wtime();
1432  vote(projected_query, v, elected, n_el, t, d);
1433  double end_voting = omp_get_wtime();
1434 
1435  voting_times.push_back(end_voting - start_voting);
1436  voting_x.push_back(t);
1437  for (int i = 0; i < n_el; ++i)
1438  idx_sum += elected(i);
1439  }
1440  voting_x[0] += idx_sum > 1.0 ? 0.0 : 0.00001;
1441  beta[v] = fit_theil_sen(voting_x, voting_times);
1442  }
1443  beta_voting.push_back(beta);
1444  }
1445 
1446  return beta_voting;
1447  }
1448 
1449  static void generate_x(std::vector<int> &x, int max_generated, int n_tested, int max_val) {
1450  n_tested = max_generated > n_tested ? n_tested : max_val;
1451  int increment = max_generated / n_tested;
1452  for (int i = 1; i <= n_tested; ++i) {
1453  if (std::find(x.begin(), x.end(), i * increment) == x.end() && i * increment <= max_generated) {
1454  x.push_back(i * increment);
1455  }
1456  }
1457 
1458  auto end = std::remove_if(x.begin(), x.end(), [max_val](int t) { return t > max_val; });
1459  x.erase(end, x.end());
1460  }
1461 
1462  std::pair<double,double> fit_exact_times(const Eigen::Map<const Eigen::MatrixXf> &Q) {
1463  std::vector<int> s_tested {1,2,5,10,20,35,50,75,100,150,200,300,400,500};
1464  generate_x(s_tested, n_samples / 20, 20, n_samples);
1465 
1466  int n_test = Q.cols();
1467  std::vector<double> exact_times;
1468  long double idx_sum = 0;
1469 
1470  std::random_device rd;
1471  std::mt19937 rng(rd());
1472  std::uniform_int_distribution<int> uni(0, n_test - 1);
1473  std::uniform_int_distribution<int> uni2(0, n_samples - 1);
1474 
1475  std::vector<double> ex;
1476  int n_sim = 20;
1477  for (int i = 0; i < (int) s_tested.size(); ++i) {
1478  double mean_exact_time = 0;
1479  int s_size = s_tested[i];
1480  ex.push_back(s_size);
1481 
1482  for (int m = 0; m < n_sim; ++m) {
1483  auto ri = uni(rng);
1484  Eigen::VectorXi elected(s_size);
1485  for (int j = 0; j < elected.size(); ++j)
1486  elected(j) = uni2(rng);
1487 
1488  double start_exact = omp_get_wtime();
1489  std::vector<int> res(k);
1490  exact_knn(Eigen::Map<const Eigen::VectorXf>(Q.data() + ri * dim, dim), k, elected, s_size, &res[0]);
1491  double end_exact = omp_get_wtime();
1492  mean_exact_time += (end_exact - start_exact);
1493 
1494  for (int l = 0; l < k; ++l)
1495  idx_sum += res[l];
1496  }
1497  mean_exact_time /= n_sim;
1498  exact_times.push_back(mean_exact_time);
1499  }
1500 
1501  ex[0] += idx_sum > 1.0 ? 0.0 : 0.00001;
1502  return fit_theil_sen(ex, exact_times);
1503  }
1504 
1505  std::set<Mrpt_Parameters,decltype(is_faster)*> list_parameters(const std::vector<Eigen::MatrixXd> &recalls) {
1506  std::set<Mrpt_Parameters,decltype(is_faster)*> pars(is_faster);
1507  std::vector<Eigen::MatrixXd> query_times(depth - depth_min + 1);
1508  for (int d = depth_min; d <= depth; ++d) {
1509  Eigen::MatrixXd query_time = Eigen::MatrixXd::Zero(votes_max, n_trees);
1510 
1511  for (int t = 1; t <= n_trees; ++t) {
1512  int votes_index = votes_max < t ? votes_max : t;
1513  for (int v = 1; v <= votes_index; ++v) {
1514  double qt = get_query_time(t, d, v);
1515  query_time(v - 1, t - 1) = qt;
1516  Mrpt_Parameters p;
1517  p.n_trees = t;
1518  p.depth = d;
1519  p.votes = v;
1520  p.k = k;
1521  p.estimated_qtime = qt;
1522  p.estimated_recall = recalls[d - depth_min](v - 1, t - 1);
1523  pars.insert(p);
1524  }
1525  }
1526 
1527  query_times[d - depth_min] = query_time;
1528  }
1529 
1530  return pars;
1531  }
1532 
1533  std::set<Mrpt_Parameters,decltype(is_faster)*> pareto_frontier(const std::set<Mrpt_Parameters,decltype(is_faster)*> &pars) {
1534  opt_pars = std::set<Mrpt_Parameters,decltype(is_faster)*>(is_faster);
1535  double best_recall = -1.0;
1536  for (const auto &p : pars) { // compute pareto frontier for query times and recalls
1537  if (p.estimated_recall > best_recall) {
1538  opt_pars.insert(p);
1539  best_recall = p.estimated_recall;
1540  }
1541  }
1542 
1543  return opt_pars;
1544  }
1545 
1546  void fit_times(const Eigen::Map<const Eigen::MatrixXf> &Q) {
1547  std::vector<int> exact_x;
1548  beta_projection = fit_projection_times(Q, exact_x);
1549  beta_voting = fit_voting_times(Q);
1550  beta_exact = fit_exact_times(Q);
1551  }
1552 
1553  static std::pair<double,double> fit_theil_sen(const std::vector<double> &x,
1554  const std::vector<double> &y) {
1555  int n = x.size();
1556  std::vector<double> slopes;
1557  for (int i = 0; i < n; ++i) {
1558  for (int j = 0; j < n; ++j) {
1559  if (i != j)
1560  slopes.push_back((y[j] - y[i]) / (x[j] - x[i]));
1561  }
1562  }
1563 
1564  int n_slopes = slopes.size();
1565  std::nth_element(slopes.begin(), slopes.begin() + n_slopes / 2, slopes.end());
1566  double slope = *(slopes.begin() + n_slopes / 2);
1567 
1568  std::vector<double> residuals(n);
1569  for (int i = 0; i < n; ++i)
1570  residuals[i] = y[i] - slope * x[i];
1571 
1572  std::nth_element(residuals.begin(), residuals.begin() + n / 2, residuals.end());
1573  double intercept = *(residuals.begin() + n / 2);
1574 
1575  return std::make_pair(intercept, slope);
1576  }
1577 
1578  void write_parameters(const Mrpt_Parameters *p, FILE *fd) const {
1579  if (!fd) {
1580  return;
1581  }
1582 
1583  fwrite(&p->n_trees, sizeof(int), 1, fd);
1584  fwrite(&p->depth, sizeof(int), 1, fd);
1585  fwrite(&p->votes, sizeof(int), 1, fd);
1586  fwrite(&p->k, sizeof(int), 1, fd);
1587  fwrite(&p->estimated_qtime, sizeof(double), 1, fd);
1588  fwrite(&p->estimated_recall, sizeof(double), 1, fd);
1589  }
1590 
1591  void read_parameters(Mrpt_Parameters *p, FILE *fd) {
1592  fread(&p->n_trees, sizeof(int), 1, fd);
1593  fread(&p->depth, sizeof(int), 1, fd);
1594  fread(&p->votes, sizeof(int), 1, fd);
1595  fread(&p->k, sizeof(int), 1, fd);
1596  fread(&p->estimated_qtime, sizeof(double), 1, fd);
1597  fread(&p->estimated_recall, sizeof(double), 1, fd);
1598  }
1599 
1600  void write_parameter_list(const std::set<Mrpt_Parameters,decltype(is_faster)*> &pars, FILE *fd) const {
1601  if (!fd) {
1602  return;
1603  }
1604 
1605  int par_sz = pars.size();
1606  fwrite(&par_sz, sizeof(int), 1, fd);
1607 
1608  for (const auto p : pars)
1609  write_parameters(&p, fd);
1610  }
1611 
1612  void read_parameter_list(FILE *fd) {
1613  if (!fd) {
1614  return;
1615  }
1616 
1617  opt_pars = std::set<Mrpt_Parameters,decltype(is_faster)*>(is_faster);
1618  int par_sz = 0;
1619  fread(&par_sz, sizeof(int), 1, fd);
1620 
1621  for (int i = 0; i < par_sz; ++i) {
1622  Mrpt_Parameters p;
1623  read_parameters(&p, fd);
1624  opt_pars.insert(p);
1625  }
1626  }
1627 
1628  Mrpt_Parameters parameters(double target_recall) const {
1629  double tr = target_recall - epsilon;
1630  for (const auto &p : opt_pars) {
1631  if (p.estimated_recall > tr) {
1632  return p;
1633  }
1634  }
1635 
1636  if (!opt_pars.empty()) {
1637  return *(opt_pars.rbegin());
1638  }
1639 
1640  return Mrpt_Parameters();
1641  }
1642 
1648  static void count_leaf_sizes(int n, int level, int tree_depth, std::vector<int> &out_leaf_sizes) {
1649  if (level == tree_depth) {
1650  out_leaf_sizes.push_back(n);
1651  return;
1652  }
1653 
1654  count_leaf_sizes(n - n / 2, level + 1, tree_depth, out_leaf_sizes);
1655  count_leaf_sizes(n / 2, level + 1, tree_depth, out_leaf_sizes);
1656  }
1657 
1664  static void count_first_leaf_indices(std::vector<int> &indices, int n, int depth) {
1665  std::vector<int> leaf_sizes;
1666  count_leaf_sizes(n, 0, depth, leaf_sizes);
1667 
1668  indices = std::vector<int>(leaf_sizes.size() + 1);
1669  indices[0] = 0;
1670  for (int i = 0; i < (int) leaf_sizes.size(); ++i)
1671  indices[i + 1] = indices[i] + leaf_sizes[i];
1672  }
1673 
1674  static void count_first_leaf_indices_all(std::vector<std::vector<int>> &indices, int n, int depth_max) {
1675  for (int d = 0; d <= depth_max; ++d) {
1676  std::vector<int> idx;
1677  count_first_leaf_indices(idx, n, d);
1678  indices.push_back(idx);
1679  }
1680  }
1681 
1682  static double predict_theil_sen(double x, std::pair<double,double> beta) {
1683  return beta.first + beta.second * x;
1684  }
1685 
1686  double get_candidate_set_size(int tree, int depth, int v) const {
1687  return cs_sizes[depth - depth_min](v - 1, tree - 1);
1688  }
1689 
1690  double get_projection_time(int n_trees, int depth, int v) const {
1691  return predict_theil_sen(n_trees * depth, beta_projection);
1692  }
1693 
1694  double get_voting_time(int n_trees, int depth, int v) const {
1695  const std::map<int,std::pair<double,double>> &beta = beta_voting[depth - depth_min];
1696 
1697  if (v <= 0 || beta.empty()) {
1698  return 0.0;
1699  }
1700 
1701  for (const auto &b : beta) {
1702  if (v <= b.first) {
1703  return predict_theil_sen(n_trees, b.second);
1704  }
1705  }
1706 
1707  return predict_theil_sen(n_trees, beta.rbegin()->second);
1708  }
1709 
1710  double get_exact_time(int n_trees, int depth, int v) const {
1711  return predict_theil_sen(get_candidate_set_size(n_trees, depth, v), beta_exact);
1712  }
1713 
1714  double get_query_time(int tree, int depth, int v) const {
1715  return get_projection_time(tree, depth, v)
1716  + get_voting_time(tree, depth, v)
1717  + get_exact_time(tree, depth, v);
1718  }
1719 
1720  std::vector<int> sample_indices(int n_test, int seed = 0) const {
1721  std::random_device rd;
1722  int s = seed ? seed : rd();
1723  std::mt19937 gen(s);
1724 
1725  std::vector<int> indices_data(n_samples);
1726  std::iota(indices_data.begin(), indices_data.end(), 0);
1727  std::shuffle(indices_data.begin(), indices_data.end(), gen);
1728  return std::vector<int>(indices_data.begin(), indices_data.begin() + n_test);
1729  }
1730 
1731  Eigen::MatrixXf subset(const std::vector<int> &indices) const {
1732  int n_test = indices.size();
1733  Eigen::MatrixXf Q = Eigen::MatrixXf(dim, n_test);
1734  for(int i = 0; i < n_test; ++i)
1735  Q.col(i) = X.col(indices[i]);
1736 
1737  return Q;
1738  }
1739 
1740 
1741  const Eigen::Map<const Eigen::MatrixXf> X; // the data matrix
1742  Eigen::MatrixXf split_points; // all split points in all trees
1743  std::vector<std::vector<int>> tree_leaves; // contains all leaves of all trees
1744  Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> dense_random_matrix; // random vectors needed for all the RP-trees
1745  Eigen::SparseMatrix<float, Eigen::RowMajor> sparse_random_matrix; // random vectors needed for all the RP-trees
1746  std::vector<std::vector<int>> leaf_first_indices_all; // first indices for each level
1747  std::vector<int> leaf_first_indices; // first indices of each leaf of tree in tree_leaves
1748 
1749  const int n_samples; // sample size of data
1750  const int dim; // dimension of data
1751  Mrpt_Parameters par;
1752  int n_trees = 0; // number of RP-trees
1753  int depth = 0; // depth of an RP-tree with median split
1754  float density = -1.0; // expected ratio of non-zero components in a projection matrix
1755  int n_pool = 0; // amount of random vectors needed for all the RP-trees
1756  int n_array = 0; // length of the one RP-tree as array
1757  int votes = 0; // optimal number of votes to use
1758  int k = 0;
1759  enum itype {normal, autotuned, autotuned_unpruned};
1760  itype index_type = normal;
1761 
1762  // Member variables used in autotuning:
1763  int depth_min = 0;
1764  int votes_max = 0;
1765  const double epsilon = 0.0001; // error bound for comparisons of recall levels
1766  std::vector<Eigen::MatrixXd> cs_sizes;
1767  std::pair<double,double> beta_projection, beta_exact;
1768  std::vector<std::map<int,std::pair<double,double>>> beta_voting;
1769  std::set<Mrpt_Parameters,decltype(is_faster)*> opt_pars;
1770 };
1771 
1772 #endif // CPP_MRPT_H_
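
The listing above is the complete header. As a quick orientation for the autotuned workflow it defines (grow_autotune to a target recall, then a parameter-free query), here is a minimal usage sketch. It is not part of the repository; the file name, data sizes, target recall, and compiler flags are illustrative assumptions, and only the Mrpt calls themselves come from the header above.

// example.cpp: minimal usage sketch for the header above (not part of Mrpt.h).
// Assumptions: Eigen on the include path and OpenMP enabled, e.g.
//   g++ -O3 -fopenmp example.cpp -I /path/to/eigen -o example
// The header calls omp_get_wtime() and uses OpenMP pragmas but does not
// include <omp.h> itself in this listing, so it is included here first.
#include <iostream>
#include <vector>

#include <Eigen/Dense>
#include <omp.h>

#include "Mrpt.h"

int main() {
  const int dim = 50, n = 1000, k = 10;  // autotuning requires n >= 101

  // One data point per column, as the Mrpt constructor expects.
  Eigen::MatrixXf X = Eigen::MatrixXf::Random(dim, n);
  Eigen::VectorXf q = Eigen::VectorXf::Random(dim);

  // Autotune the index to roughly 90% recall for 10-nn queries; the test
  // queries are sampled from X internally (n_test defaults to 100).
  Mrpt index(X);
  index.grow_autotune(0.9, k);

  std::vector<int> approximate(k), exact(k);
  index.query(q, &approximate[0]);      // autotuned query: trees, depth and votes are implicit
  Mrpt::exact_knn(q, X, k, &exact[0]);  // brute-force ground truth

  int hits = 0;
  for (int i = 0; i < k; ++i)
    for (int j = 0; j < k; ++j)
      if (approximate[i] == exact[j]) ++hits;

  std::cout << "recall for this query: " << static_cast<double>(hits) / k << '\n';
  return 0;
}

query() on an autotuned index uses the tuned number of trees and vote threshold; the explicit query(q, k, vote_threshold, out) overload shown earlier in the listing remains available when you want to override them.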