diff --git a/CMakeLists.txt b/CMakeLists.txt index bd487165..8e767f84 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -40,6 +40,7 @@ set(albatross_HEADERS albatross/tuning_metrics.h albatross/map_utils.h albatross/csv_utils.h + albatross/outlier.h albatross/core/keys.h albatross/core/dataset.h albatross/core/model.h diff --git a/albatross/core/indexing.h b/albatross/core/indexing.h index c81c5ca8..d4a8fded 100644 --- a/albatross/core/indexing.h +++ b/albatross/core/indexing.h @@ -15,15 +15,17 @@ #include "core/dataset.h" #include +#include #include #include +#include #include +#include #include namespace albatross { -using s32 = int32_t; -using FoldIndices = std::vector; +using FoldIndices = std::vector; using FoldName = std::string; using FoldIndexer = std::map; @@ -83,7 +85,7 @@ inline Eigen::MatrixXd subset(const std::vector &row_indices, auto ii = static_cast(i); auto jj = static_cast(j); auto row_index = static_cast(row_indices[i]); - auto col_index = static_cast(col_indices[i]); + auto col_index = static_cast(col_indices[j]); out(ii, jj) = v(row_index, col_index); } } @@ -141,20 +143,26 @@ template struct RegressionFold { test_indices(test_indices_){}; }; -inline FoldIndices get_train_indices(const FoldIndices &test_indices, - const int n) { - const s32 k = static_cast(test_indices.size()); - // The train indices are all the indices that are not test indices. - FoldIndices train_indices(n - k); - s32 train_cnt = 0; - for (s32 j = 0; j < n; j++) { - if (std::find(test_indices.begin(), test_indices.end(), j) == - test_indices.end()) { - train_indices[train_cnt] = j; - train_cnt++; - } - } - return train_indices; +template +inline std::vector vector_set_difference(const std::vector &x, + const std::vector &y) { + std::vector diff; + std::set_difference(x.begin(), x.end(), y.begin(), y.end(), + std::inserter(diff, diff.begin())); + return diff; +} + +/* + * Computes the indices between 0 and n - 1 which are NOT contained + * in `indices`. Here complement is the mathematical interpretation + * of the word meaning "the part required to make something whole". + * In other words, indices and indices_complement(indices) should + * contain all the numbers between 0 and n-1 + */ +inline FoldIndices indices_complement(const FoldIndices &indices, const int n) { + FoldIndices all_indices(n); + std::iota(all_indices.begin(), all_indices.end(), 0); + return vector_set_difference(all_indices, indices); } /* @@ -168,7 +176,7 @@ static inline std::vector> folds_from_fold_indexer(const RegressionDataset &dataset, const FoldIndexer &groups) { // For a dataset with n features, we'll have n folds. - const s32 n = static_cast(dataset.features.size()); + const std::size_t n = dataset.features.size(); std::vector> folds; // For each fold, partition into train and test sets. for (const auto &pair : groups) { @@ -177,7 +185,7 @@ folds_from_fold_indexer(const RegressionDataset &dataset, // from changing the input FoldIndexer we perform a copy here. const FoldName group_name(pair.first); const FoldIndices test_indices(pair.second); - const auto train_indices = get_train_indices(test_indices, n); + const auto train_indices = indices_complement(test_indices, n); std::vector train_features = subset(train_indices, dataset.features); @@ -205,7 +213,7 @@ template static inline FoldIndexer leave_one_out_indexer(const RegressionDataset &dataset) { FoldIndexer groups; - for (s32 i = 0; i < static_cast(dataset.features.size()); i++) { + for (std::size_t i = 0; i < dataset.features.size(); i++) { FoldName group_name = std::to_string(i); groups[group_name] = {i}; } @@ -221,9 +229,8 @@ static inline FoldIndexer leave_one_group_out_indexer( const RegressionDataset &dataset, const std::function &get_group_name) { FoldIndexer groups; - for (s32 i = 0; i < static_cast(dataset.features.size()); i++) { - const std::string k = - get_group_name(dataset.features[static_cast(i)]); + for (std::size_t i = 0; i < dataset.features.size(); i++) { + const std::string k = get_group_name(dataset.features[i]); // Get the existing indices if we've already encountered this group_name // otherwise initialize a new one. FoldIndices indices; diff --git a/albatross/core/model.h b/albatross/core/model.h index dc01a5be..e568c7c4 100644 --- a/albatross/core/model.h +++ b/albatross/core/model.h @@ -66,8 +66,7 @@ class RegressionModel : public ParameterHandlingMixin { void fit(const std::vector &features, const MarginalDistribution &targets) { assert(features.size() > 0); - assert(static_cast(features.size()) == - static_cast(targets.size())); + assert(features.size() == static_cast(targets.size())); fit_(features, targets); has_been_fit_ = true; } @@ -164,7 +163,6 @@ class RegressionModel : public ParameterHandlingMixin { protected: virtual void fit_(const std::vector &features, const MarginalDistribution &targets) = 0; - /* * Predict specializations */ @@ -174,8 +172,7 @@ class RegressionModel : public ParameterHandlingMixin { detail::PredictTypeIdentity &&identity) const { assert(has_been_fit()); JointDistribution preds = predict_(features); - assert(static_cast(preds.mean.size()) == - static_cast(features.size())); + assert(static_cast(preds.mean.size()) == features.size()); return preds; } @@ -184,8 +181,7 @@ class RegressionModel : public ParameterHandlingMixin { detail::PredictTypeIdentity &&identity) const { assert(has_been_fit()); MarginalDistribution preds = predict_marginal_(features); - assert(static_cast(preds.mean.size()) == - static_cast(features.size())); + assert(static_cast(preds.mean.size()) == features.size()); return preds; } @@ -194,7 +190,7 @@ class RegressionModel : public ParameterHandlingMixin { detail::PredictTypeIdentity &&identity) const { assert(has_been_fit()); Eigen::VectorXd preds = predict_mean_(features); - assert(static_cast(preds.size()) == static_cast(features.size())); + assert(static_cast(preds.size()) == features.size()); return preds; } diff --git a/albatross/models/gp.h b/albatross/models/gp.h index 64b53078..c951aaf4 100644 --- a/albatross/models/gp.h +++ b/albatross/models/gp.h @@ -10,8 +10,8 @@ * WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR PURPOSE. */ -#ifndef ALBATROSS_GP_GP_H -#define ALBATROSS_GP_GP_H +#ifndef ALBATROSS_MODELS_GP_H +#define ALBATROSS_MODELS_GP_H #include "evaluate.h" #include "stdio.h" @@ -32,6 +32,12 @@ template struct GaussianProcessFit { Eigen::SerializableLDLT train_ldlt; Eigen::VectorXd information; + GaussianProcessFit(){}; + + GaussianProcessFit(const std::vector &features, + const Eigen::MatrixXd &train_cov, + const MarginalDistribution &targets); + template // todo: enable if FeatureType is serializable void serialize(Archive &archive) { @@ -46,6 +52,32 @@ template struct GaussianProcessFit { } }; +template +GaussianProcessFit::GaussianProcessFit( + const std::vector &features, const Eigen::MatrixXd &train_cov, + const MarginalDistribution &targets) { + train_features = features; + Eigen::MatrixXd cov(train_cov); + if (targets.has_covariance()) { + cov += targets.covariance; + } + train_ldlt = Eigen::SerializableLDLT(cov.ldlt()); + // Precompute the information vector + information = train_ldlt.solve(targets.mean); +} + +template +inline JointDistribution predict_from_covariance_and_fit( + const Eigen::MatrixXd &cross_cov, const Eigen::MatrixXd &pred_cov, + const GaussianProcessFit &model_fit) { + const Eigen::VectorXd pred = cross_cov.transpose() * model_fit.information; + auto ldlt = model_fit.train_ldlt; + Eigen::MatrixXd posterior_cov = ldlt.solve(cross_cov); + posterior_cov = cross_cov.transpose() * posterior_cov; + posterior_cov = pred_cov - posterior_cov; + return JointDistribution(pred, posterior_cov); +} + template using SerializableGaussianProcess = SerializableRegressionModelmodel_fit_.train_ldlt; pred_cov -= cross_cov * ldlt.solve(cross_cov.transpose()); - assert(static_cast(pred.size()) == static_cast(features.size())); + assert(static_cast(pred.size()) == features.size()); return InspectionDistribution(pred, pred_cov); } @@ -212,27 +244,17 @@ class GaussianProcessRegression serializable_fit_(const std::vector &features, const MarginalDistribution &targets) const override { Eigen::MatrixXd cov = symmetric_covariance(covariance_function_, features); - FitType model_fit; - model_fit.train_features = features; - if (targets.has_covariance()) { - cov += targets.covariance; - } - model_fit.train_ldlt = Eigen::SerializableLDLT(cov.ldlt()); - // Precompute the information vector - model_fit.information = model_fit.train_ldlt.solve(targets.mean); - return model_fit; + return GaussianProcessFit(features, cov, targets); } JointDistribution predict_(const std::vector &features) const override { const auto cross_cov = asymmetric_covariance( - covariance_function_, features, this->model_fit_.train_features); - const Eigen::VectorXd pred = cross_cov * this->model_fit_.information; + covariance_function_, this->model_fit_.train_features, features); Eigen::MatrixXd pred_cov = symmetric_covariance(covariance_function_, features); - auto ldlt = this->model_fit_.train_ldlt; - pred_cov -= cross_cov * ldlt.solve(cross_cov.transpose()); - return JointDistribution(pred, pred_cov); + return predict_from_covariance_and_fit(cross_cov, pred_cov, + this->model_fit_); } virtual MarginalDistribution @@ -261,7 +283,6 @@ class GaussianProcessRegression return pred; } -private: CovarianceFunction covariance_function_; std::string model_name_; }; diff --git a/albatross/models/least_squares.h b/albatross/models/least_squares.h index 96cbe48e..5c54ee7c 100644 --- a/albatross/models/least_squares.h +++ b/albatross/models/least_squares.h @@ -70,11 +70,11 @@ class LeastSquaresRegression protected: Eigen::VectorXd predict_mean_(const std::vector &features) const override { - int n = static_cast(features.size()); + std::size_t n = features.size(); Eigen::VectorXd mean(n); - for (s32 i = 0; i < n; i++) { - mean(i) = - features[static_cast(i)].dot(this->model_fit_.coefs); + for (std::size_t i = 0; i < n; i++) { + mean(static_cast(i)) = + features[i].dot(this->model_fit_.coefs); } return mean; } diff --git a/albatross/models/ransac_gp.h b/albatross/models/ransac_gp.h new file mode 100644 index 00000000..a7686fa5 --- /dev/null +++ b/albatross/models/ransac_gp.h @@ -0,0 +1,182 @@ +/* + * Copyright (C) 2018 Swift Navigation Inc. + * Contact: Swift Navigation + * + * This source is subject to the license found in the file 'LICENSE' which must + * be distributed together with this source. All other rights reserved. + * + * THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND, + * EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR PURPOSE. + */ + +#ifndef ALBATROSS_MODELS_RANSAC_GP_H +#define ALBATROSS_MODELS_RANSAC_GP_H + +#include "crossvalidation.h" +#include "gp.h" +#include "outlier.h" +#include + +namespace albatross { + +template +class RansacGaussianProcessRegression + : public GaussianProcessRegression { +public: + using BaseModel = GaussianProcessRegression; + using FitType = GaussianProcessFit; + + RansacGaussianProcessRegression(CovarianceFunction &covariance_function, + double inlier_threshold_, + std::size_t min_inliers_, + std::size_t min_features_, + std::size_t max_iterations_) + : BaseModel(covariance_function), inlier_threshold(inlier_threshold_), + min_inliers(min_inliers_), min_features(min_features_), + max_iterations(max_iterations_){}; + + template void save(Archive &archive) const { + archive(cereal::base_class(this), + cereal::make_nvp("inlier_threshold", inlier_threshold), + cereal::make_nvp("min_inliers", min_inliers), + cereal::make_nvp("min_features", min_features), + cereal::make_nvp("max_iterations", max_iterations)); + } + + template void load(Archive &archive) { + archive(cereal::base_class(this), + cereal::make_nvp("inlier_threshold", inlier_threshold), + cereal::make_nvp("min_inliers", min_inliers), + cereal::make_nvp("min_features", min_features), + cereal::make_nvp("max_iterations", max_iterations)); + } + +protected: + virtual std::vector cross_validated_predictions_( + const RegressionDataset &dataset, + const FoldIndexer &fold_indexer, + const detail::PredictTypeIdentity &identity) override { + const auto folds = folds_from_fold_indexer(dataset, fold_indexer); + return this->template cross_validated_predictions(folds); + } + + virtual std::vector cross_validated_predictions_( + const RegressionDataset &dataset, + const FoldIndexer &fold_indexer, + const detail::PredictTypeIdentity &identity) + override { + const auto folds = folds_from_fold_indexer(dataset, fold_indexer); + return this->template cross_validated_predictions( + folds); + } + + virtual std::vector cross_validated_predictions_( + const RegressionDataset &dataset, + const FoldIndexer &fold_indexer, + const detail::PredictTypeIdentity &identity) override { + const auto folds = folds_from_fold_indexer(dataset, fold_indexer); + return this->template cross_validated_predictions(folds); + } + + FitType + serializable_fit_(const std::vector &features, + const MarginalDistribution &targets) const override { + /* + * This function is basically just setting up the call to `ransac` by + * defining `fitter`, `outlier_metric` and `model_metric` functions + * that can perform each operation efficiently for Gaussian processes. + */ + + Eigen::MatrixXd cov = + symmetric_covariance(this->covariance_function_, features); + + if (max_iterations < 1) { + return GaussianProcessFit(features, cov, targets); + } + + struct FitAndIndices { + FitType model_fit; + Indexer fit_indices; + }; + + std::function fitter = + [&](const Indexer &indexer) { + const auto train_features = subset(indexer, features); + const auto train_targets = subset(indexer, targets); + auto train_cov = symmetric_subset(indexer, cov); + const FitAndIndices fit_and_indices = { + GaussianProcessFit(train_features, train_cov, + train_targets), + indexer}; + return fit_and_indices; + }; + + std::function + outlier_metric = [&](const Indexer &test_indices, + const FitAndIndices &fit_and_indices) { + const auto cross_cov = + subset(fit_and_indices.fit_indices, test_indices, cov); + const auto pred_cov = symmetric_subset(test_indices, cov); + const auto pred = predict_from_covariance_and_fit( + cross_cov, pred_cov, fit_and_indices.model_fit); + const auto target = subset(test_indices, targets); + double metric_value = metric(pred, target); + return metric_value; + }; + + std::function model_metric = [&]( + const Indexer &inliers) { + RegressionDataset inlier_dataset(subset(inliers, features), + subset(inliers, targets)); + const FoldIndexer inlier_loo = leave_one_out_indexer(inlier_dataset); + + BaseModel model; + model.set_params(this->get_params()); + double mean_metric = cross_validated_scores( + metric, inlier_dataset, inlier_loo, &model) + .mean(); + return mean_metric; + }; + + // Now that the ransac functions are defined we can actually perform + // RANSAC, then return the subsequent outlier free fit. + const RegressionDataset dataset(features, targets); + const auto loo_indexer = leave_one_out_indexer(dataset); + + auto inliers = ransac( + fitter, outlier_metric, model_metric, map_values(loo_indexer), + inlier_threshold, min_features, min_inliers, max_iterations); + + const auto inlier_features = subset(inliers, features); + const auto inlier_targets = subset(inliers, targets); + const auto inlier_cov = symmetric_subset(inliers, cov); + + return GaussianProcessFit(inlier_features, inlier_cov, + inlier_targets); + } + + double inlier_threshold; + std::size_t min_inliers; + std::size_t min_features; + std::size_t max_iterations; + EvaluationMetric metric = + albatross::evaluation_metrics::negative_log_likelihood; +}; + +template +std::unique_ptr> +ransac_gp_pointer_from_covariance(CovFunc covariance_function, + double inlier_threshold = 1., + std::size_t min_inliers = 3, + std::size_t min_features = 3, + std::size_t max_iterations = 20) { + return std::make_unique< + RansacGaussianProcessRegression>( + covariance_function, inlier_threshold, min_inliers, min_features, + max_iterations); +}; + +} // namespace albatross + +#endif diff --git a/albatross/outlier.h b/albatross/outlier.h new file mode 100644 index 00000000..c5171d78 --- /dev/null +++ b/albatross/outlier.h @@ -0,0 +1,157 @@ +/* + * Copyright (C) 2018 Swift Navigation Inc. + * Contact: Swift Navigation + * + * This source is subject to the license found in the file 'LICENSE' which must + * be distributed together with this source. All other rights reserved. + * + * THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND, + * EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR PURPOSE. + */ + +#ifndef ALBATROSS_CORE_OUTLIER_H +#define ALBATROSS_CORE_OUTLIER_H + +#include "core/dataset.h" +#include "core/indexing.h" +#include "core/model.h" +#include "crossvalidation.h" +#include "random_utils.h" +#include +#include +#include + +namespace albatross { + +using Indexer = std::vector; +using GroupIndexer = std::vector>; + +Indexer concatenate_subset_of_groups(const Indexer &subset_indices, + const GroupIndexer &indexer) { + + Indexer output; + for (const auto &i : subset_indices) { + assert(i < static_cast(indexer.size())); + output.insert(output.end(), indexer[i].begin(), indexer[i].end()); + } + return output; +} + +/* + * This RANdom SAmple Consensus (RANSAC) algorithm works as follows. + * + * 1) Randomly sample a small number of data points and fit a + * reference model to that data. + * 2) Assemble all the data points that agree with the + * reference model into a set of inliers. + * 3) Evaluate the quality of the inliers. + * 4) Repeat N times keeping track of the best set of inliers. + * + * One of the large drawbacks of this approach is the computational + * load since it requires fitting and predicting repeatedly. + * The goal of this implementation is to provide a way + * for the user to optionally perform a lot of computation upfront, + * then use call backs which take indices as input to selectively + * update/downdate the model to produce the fits and evaluation + * metrics. + */ +template +Indexer +ransac(std::function &fitter, + std::function &outlier_metric, + std::function &model_metric, + const GroupIndexer &indexer, double inlier_threshold, + std::size_t min_features, std::size_t min_inliers, + std::size_t max_iterations) { + + std::default_random_engine gen; + + Indexer reference; + double best_metric = HUGE_VAL; + Indexer best_inds; + + for (std::size_t i = 0; i < max_iterations; i++) { + // Sample a random subset of the data and fit a model. + reference = + randint_without_replacement(min_features, 0, indexer.size() - 1, gen); + auto ref_inds = concatenate_subset_of_groups(reference, indexer); + const auto fit = fitter(ref_inds); + + // Find which of the other groups agree with the reference model + auto test_groups = indices_complement(reference, indexer.size()); + Indexer inliers; + for (const auto &test_ind : test_groups) { + double metric_value = outlier_metric(indexer[test_ind], fit); + if (metric_value < inlier_threshold) { + inliers.push_back(test_ind); + } + } + + // If there is enough agreement, consider this random set of inliers + // as a candidate model. + if (inliers.size() > min_inliers) { + const auto inlier_inds = concatenate_subset_of_groups(inliers, indexer); + ref_inds.insert(ref_inds.end(), inlier_inds.begin(), inlier_inds.end()); + std::sort(ref_inds.begin(), ref_inds.end()); + double model_metric_value = model_metric(ref_inds); + if (model_metric_value < best_metric) { + best_inds = ref_inds; + best_metric = model_metric_value; + } + } + } + assert(best_metric < HUGE_VAL); + return best_inds; +} + +template +RegressionDataset +ransac(const RegressionDataset &dataset, + const FoldIndexer &fold_indexer, RegressionModel *model, + EvaluationMetric &metric, double inlier_threshold, + std::size_t min_features, std::size_t min_inliers, int max_iterations) { + + using FitType = RegressionModel *; + + using FitFunc = std::function &)>; + using OutlierFunc = + std::function &, const FitType &)>; + using ModelMetricFunc = + std::function &)>; + + FitFunc fitter = [&](const std::vector &inds) { + RegressionDataset dataset_subset( + subset(inds, dataset.features), subset(inds, dataset.targets)); + model->fit(dataset_subset); + return model; + }; + + OutlierFunc outlier_metric = [&](const std::vector &inds, + const FitType &fit) { + const auto pred = fit->predict(subset(inds, dataset.features)); + const auto target = subset(inds, dataset.targets); + double metric_value = metric(pred, target); + return metric_value; + }; + + ModelMetricFunc model_metric = [&](const std::vector &inds) { + RegressionDataset inlier_dataset( + subset(inds, dataset.features), subset(inds, dataset.targets)); + const auto inlier_loo = leave_one_out_indexer(inlier_dataset); + return cross_validated_scores(metric, inlier_dataset, inlier_loo, model) + .mean(); + + }; + + const auto best_inds = ransac( + fitter, outlier_metric, model_metric, map_values(fold_indexer), + inlier_threshold, min_features, min_inliers, max_iterations); + RegressionDataset best_dataset( + subset(best_inds, dataset.features), subset(best_inds, dataset.targets)); + return best_dataset; +} + +} // namespace albatross + +#endif diff --git a/albatross/random_utils.h b/albatross/random_utils.h new file mode 100644 index 00000000..6355b9dc --- /dev/null +++ b/albatross/random_utils.h @@ -0,0 +1,37 @@ +/* + * Copyright (C) 2018 Swift Navigation Inc. + * Contact: Swift Navigation + * + * This source is subject to the license found in the file 'LICENSE' which must + * be distributed together with this source. All other rights reserved. + * + * THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND, + * EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR PURPOSE. + */ + +#ifndef ALBATROSS_RANDOM_UTILS_H +#define ALBATROSS_RANDOM_UTILS_H + +#include "core/indexing.h" +#include +#include + +/* + * Samples integers between low and high (inclusive) with replacement. + */ +inline std::vector +randint_without_replacement(std::size_t n, std::size_t low, std::size_t high, + std::default_random_engine &gen) { + assert(n >= 0); + assert(n <= (high - low)); + + std::uniform_int_distribution dist(low, high); + std::set samples; + while (samples.size() < n) { + samples.insert(dist(gen)); + } + return std::vector(samples.begin(), samples.end()); +} + +#endif diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 9ba465c6..94163e00 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -21,6 +21,8 @@ test_serializable_ldlt.cc test_serialize.cc test_tune.cc test_tuning_metrics.cc +test_random_utils.cc +test_outlier.cc ) add_dependencies(albatross_unit_tests diff --git a/tests/test_evaluate.cc b/tests/test_evaluate.cc index 05de9e76..d51ad17e 100644 --- a/tests/test_evaluate.cc +++ b/tests/test_evaluate.cc @@ -80,7 +80,7 @@ std::string group_by_interval(const double &x) { } bool is_monotonic_increasing(const Eigen::VectorXd &x) { - for (s32 i = 0; i < static_cast(x.size()) - 1; i++) { + for (Eigen::Index i = 0; i < x.size() - 1; i++) { if (x[i + 1] - x[i] <= 0.) { return false; } diff --git a/tests/test_outlier.cc b/tests/test_outlier.cc new file mode 100644 index 00000000..b80f0280 --- /dev/null +++ b/tests/test_outlier.cc @@ -0,0 +1,77 @@ +/* + * Copyright (C) 2018 Swift Navigation Inc. + * Contact: Swift Navigation + * + * This source is subject to the license found in the file 'LICENSE' which must + * be distributed together with this source. All other rights reserved. + * + * THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND, + * EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "models/ransac_gp.h" +#include "outlier.h" +#include "test_utils.h" +#include + +namespace albatross { + +TEST(test_outlier, test_ransac) { + auto dataset = make_toy_linear_data(); + const auto model_ptr = toy_gaussian_process(); + + EvaluationMetric nll = + albatross::evaluation_metrics::negative_log_likelihood; + + dataset.targets.mean[3] = 400.; + dataset.targets.mean[5] = -300.; + + const auto fold_indexer = leave_one_out_indexer(dataset); + const auto modified = + ransac(dataset, fold_indexer, model_ptr.get(), nll, 1., 3, 3, 20); + + EXPECT_EQ(modified.features.size(), dataset.features.size() - 2); + + // Make sure we threw out the correct features. + EXPECT_EQ(std::find(modified.features.begin(), modified.features.end(), + dataset.features[3]), + modified.features.end()); + EXPECT_EQ(std::find(modified.features.begin(), modified.features.end(), + dataset.features[5]), + modified.features.end()); +} + +static inline std::unique_ptr> +toy_ransac_gaussian_process() { + return ransac_gp_pointer_from_covariance(toy_covariance_function()); +} + +TEST(test_outlier, test_ransac_gp) { + auto dataset = make_toy_linear_data(); + + const auto fold_indexer = leave_one_out_indexer(dataset); + + const auto model_ptr = toy_ransac_gaussian_process(); + + EvaluationMetric nll = + albatross::evaluation_metrics::negative_log_likelihood; + + dataset.targets.mean[3] = 400.; + dataset.targets.mean[5] = -300.; + + const auto scores = + cross_validated_scores(nll, dataset, fold_indexer, model_ptr.get()); + + // Here we make sure the leave one out likelihoods for inliers are all + // reasonable, and for the known outliers we assert the likelihood is + // really really really small. + for (Eigen::Index i = 0; i < scores.size(); i++) { + if (i == 3 || i == 5) { + EXPECT_GE(scores[i], 1.e5); + } else { + EXPECT_LE(scores[i], 0.); + } + } +} + +} // namespace albatross diff --git a/tests/test_random_utils.cc b/tests/test_random_utils.cc new file mode 100644 index 00000000..5bcb08b8 --- /dev/null +++ b/tests/test_random_utils.cc @@ -0,0 +1,36 @@ +/* + * Copyright (C) 2018 Swift Navigation Inc. + * Contact: Swift Navigation + * + * This source is subject to the license found in the file 'LICENSE' which must + * be distributed together with this source. All other rights reserved. + * + * THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND, + * EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "assert.h" +#include "random_utils.h" +#include + +namespace albatross { + +TEST(test_random_utils, randint_without_replacement) { + + int iterations = 10; + int k = 6; + + std::default_random_engine gen; + + for (int i = 0; i < iterations; i++) { + for (int n = 0; n <= k; n++) { + const auto inds = randint_without_replacement(n, i, i + k, gen); + for (const auto &j : inds) { + EXPECT_LE(j, i + k); + EXPECT_GE(j, i); + } + } + } +} + +} // namespace albatross diff --git a/tests/test_scaling_function.cc b/tests/test_scaling_function.cc index e4a39f52..fa75f20e 100644 --- a/tests/test_scaling_function.cc +++ b/tests/test_scaling_function.cc @@ -74,11 +74,11 @@ auto make_attenuation_data(const double attenuation = 3.14159, gen.seed(3); std::normal_distribution<> d{0., sigma_noise}; - s32 n = 10; + std::size_t n = 10; std::vector features; Eigen::VectorXd targets(n); - for (s32 i = 0; i < n; i++) { + for (std::size_t i = 0; i < n; i++) { // x ranges from 0 to 2 double x = static_cast(i) * (2. / static_cast(n)); features.push_back(x); diff --git a/tests/test_utils.h b/tests/test_utils.h index f0c1c0dd..ccb5d357 100644 --- a/tests/test_utils.h +++ b/tests/test_utils.h @@ -149,7 +149,7 @@ expect_parameter_vector_equal(const std::vector &x, static inline auto make_toy_linear_data(const double a = 5., const double b = 1., const double sigma = 0.1, - const s32 n = 10) { + const std::size_t n = 10) { std::random_device rd{}; std::mt19937 gen{rd()}; gen.seed(3); @@ -157,7 +157,7 @@ static inline auto make_toy_linear_data(const double a = 5., std::vector features; Eigen::VectorXd targets(n); - for (s32 i = 0; i < n; i++) { + for (std::size_t i = 0; i < n; i++) { double x = static_cast(i); features.push_back(x); targets[i] = a + x * b + d(gen); @@ -259,7 +259,7 @@ class LinearRegressionTest : public ::testing::Test { RegressionDataset dataset_; }; -inline auto random_spherical_points(s32 n = 10, double radius = 1., +inline auto random_spherical_points(std::size_t n = 10, double radius = 1., int seed = 5) { std::random_device rd{}; std::mt19937 gen{rd()}; @@ -270,7 +270,7 @@ inline auto random_spherical_points(s32 n = 10, double radius = 1., std::vector points; - for (s32 i = 0; i < n; i++) { + for (std::size_t i = 0; i < n; i++) { const double lon = rand_lon(gen); const double lat = rand_lat(gen); Eigen::VectorXd x(3);