add weights argument (#29)

* remove unused variable (#26) * fix edf computation for deg = 2 * bump version + update docs * remove package cache on appveyor * weighted bandwidth selection * weighted fitting * adjust R interface * add unit tests * a few more sanity checks * update docs
tnagler · Oct 8, 2018 · e9a9a54 · e9a9a54
1 parent 10b08bd
commit e9a9a54
Show file tree

Hide file tree

Showing 14 changed files with 198 additions and 74 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -26,7 +26,7 @@ Imports:
     qrng,
     stats,
     utils
-RoxygenNote: 6.0.1
+RoxygenNote: 6.1.0
 Suggests: 
     testthat
 URL: https://github.com/tnagler/kde1d

diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -13,8 +13,8 @@
 #' @return `An Rcpp::List` containing the fitted density values on a grid and
 #'   additional information.
 #' @noRd
-fit_kde1d_cpp <- function(x, bw, xmin, xmax, deg) {
-    .Call('_kde1d_fit_kde1d_cpp', PACKAGE = 'kde1d', x, bw, xmin, xmax, deg)
+fit_kde1d_cpp <- function(x, bw, xmin, xmax, deg, weights) {
+    .Call('_kde1d_fit_kde1d_cpp', PACKAGE = 'kde1d', x, bw, xmin, xmax, deg, weights)
 }
 
 #' computes the pdf of a kernel density estimate by interpolation.
@@ -48,9 +48,10 @@ qkde1d_cpp <- function(x, R_object) {
 #' @param x vector of observations
 #' @param grid_size number of equally-spaced points over which binning is
 #' performed to obtain kernel functional approximation
+#' @param weights vector of weights for each observation (can be empty).
 #' @return the selected bandwidth
 #' @noRd
-select_bw_cpp <- function(x, bw, mult, discrete) {
-    .Call('_kde1d_select_bw_cpp', PACKAGE = 'kde1d', x, bw, mult, discrete)
+select_bw_cpp <- function(x, bw, mult, discrete, weights) {
+    .Call('_kde1d_select_bw_cpp', PACKAGE = 'kde1d', x, bw, mult, discrete, weights)
 }
 
diff --git a/R/kde1d.R b/R/kde1d.R
@@ -83,25 +83,30 @@
 #' @importFrom cctools cont_conv
 #' @importFrom stats na.omit
 #' @export
-kde1d <- function(x, xmin = NaN, xmax = NaN, mult = 1, bw = NA, deg = 2) {
+kde1d <- function(x, xmin = NaN, xmax = NaN, mult = 1, bw = NA, deg = 2,
+                  weights = numeric(0)) {
     x <- na.omit(x)
     # sanity checks
-    check_arguments(x, mult, xmin, xmax, bw, deg)
+    check_arguments(x, mult, xmin, xmax, bw, deg, weights)
 
     # jittering for discrete variables
     x <- cctools::cont_conv(x)
 
     # bandwidth selection
-    bw <- select_bw_cpp(boundary_transform(x, xmin, xmax), bw, mult,
-                        length(attr(x, "i_disc")) == 1)
+    bw <- select_bw_cpp(boundary_transform(x, xmin, xmax),
+                        bw,
+                        mult,
+                        length(attr(x, "i_disc")) == 1,
+                        weights)
 
     # fit model
-    fit <- fit_kde1d_cpp(x, bw, xmin, xmax, deg)
+    fit <- fit_kde1d_cpp(x, bw, xmin, xmax, deg, weights)
 
     # add info
     fit$jitter_info <- attributes(x)
     fit$var_name <- colnames(x)
     fit$nobs <- length(x)
+    fit$weights <- weights
 
     # return as kde1d object
     class(fit) <- "kde1d"

diff --git a/R/kde1d_methods.R b/R/kde1d_methods.R
@@ -72,10 +72,12 @@ pkde1d <- function(q, obj) {
         p <- pkde1d_cpp(q, obj)
     } else {
         if (!is.ordered(q))
+
             q <- ordered(q, obj$jitter_info$levels$x)
         x_all <- as.ordered(obj$jitter_info$levels$x)
         p_all <- dkde1d(x_all, obj)
-        p <- sapply(q, function(y) sum(p_all[x_all <= y]))
+        p_total <- sum(p_all)
+        p <- sapply(q, function(y) sum(p_all[x_all <= y] / p_total))
         p <- pmin(pmax(p, 0), 1)
     }
 

diff --git a/R/tools.R b/R/tools.R
@@ -15,8 +15,21 @@ check_boundary_violations <- function(x, xmin, xmax) {
 
 #' check and pre-process arguments passed to kde1d()
 #' @noRd
-check_arguments <- function(x, mult, xmin, xmax, bw, deg) {
+check_arguments <- function(x, mult, xmin, xmax, bw, deg, weights) {
     stopifnot(NCOL(x) == 1)
+    stopifnot(length(mult) == 1)
+    stopifnot(length(xmin) == 1)
+    stopifnot(length(xmax) == 1)
+    stopifnot(length(bw) == 1)
+    stopifnot(length(deg) == 1)
+
+    stopifnot(is.numeric(mult))
+    stopifnot(mult > 0)
+    stopifnot(is.numeric(xmin))
+    stopifnot(is.numeric(xmax))
+    stopifnot(is.numeric(xmax))
+    stopifnot(is.na(bw) | is.numeric(bw))
+    stopifnot(is.numeric(deg))
 
     if (!is.ordered(x) & is.factor(x))
         stop("Factors not allowed; use kdevine::kdevine() or cctools::cckde().")
@@ -34,6 +47,9 @@ check_arguments <- function(x, mult, xmin, xmax, bw, deg) {
 
     if (!(deg %in% 0:2))
         stop("deg must be either 0, 1, or 2.")
+
+    if ((length(weights) > 0) && (length(weights) != length(x)))
+        stop("x and weights must have same length.")
 }
 
 #' adjusts observations and evaluation points for boundary effects

diff --git a/inst/include/dpik.hpp b/inst/include/dpik.hpp
@@ -10,9 +10,11 @@
 //! @param a lower bound
 //! @param b upper bound
 //! @param m number of bins
+//' @param weights vector of weights for each observation.
 //! @return bin counts
 inline Eigen::VectorXd linbin(const Eigen::VectorXd& x,
-                              double a, double b, size_t m)
+                              double a, double b, size_t m,
+                              const Eigen::VectorXd& weights)
 {
     Eigen::VectorXd gcnts = Eigen::VectorXd::Zero(m);
     double rem, lxi, delta;
@@ -23,11 +25,12 @@ inline Eigen::VectorXd linbin(const Eigen::VectorXd& x,
         size_t li = static_cast<size_t>(lxi);
         rem = lxi - li;
         if (li >= 1 && li < m) {
-            gcnts(li - 1) += 1 - rem;
-            gcnts(li) += rem;
+            gcnts(li - 1) += (1 - rem) * weights(i);
+            gcnts(li) += rem * weights(i);
         }
     }
-    return(gcnts);
+
+    return gcnts;
 }
 
 // Binned Kernel Functional Estimate
@@ -83,41 +86,54 @@ double bkfe(const Eigen::VectorXd& x,
 
 // Bandwidth for Kernel Density Estimation
 //! @param x vector of observations
+//! @param weights vector of weights for each observation (can be empty).
 //! @param grid_size number of equally-spaced points over which binning is
 //! performed to obtain kernel functional approximation
 //! @return the selected bandwidth
-inline double dpik(const Eigen::VectorXd& x, size_t grid_size = 401) {
+inline double dpik(const Eigen::VectorXd& x, 
+                   Eigen::VectorXd weights = Eigen::VectorXd(),
+                   size_t grid_size = 401) 
+{
+    if (weights.size() > 0 && (weights.size() != x.size()))
+        throw std::runtime_error("x and weights must have the same size");
 
-    double n = static_cast<double>(x.size());
+    if (weights.size() == 0) {
+        weights = Eigen::VectorXd::Constant(x.size(), 1.0);
+    } else {
+        weights = weights * x.size() / weights.sum();
+    }
 
+    double n = static_cast<double>(x.size());
     double a = x.minCoeff();
     double b = x.maxCoeff();
 
-    double m_x = x.mean();
+    double m_x = x.cwiseProduct(weights).mean();
     Eigen::VectorXd sx = (x - Eigen::VectorXd::Constant(x.size(), m_x));
-    double sd_x = std::sqrt(sx.cwiseAbs2().sum()/(n - 1));
+    double sd_x = std::sqrt(sx.cwiseAbs2().cwiseProduct(weights).sum()/(n - 1));
     Eigen::VectorXd q_x(2);
     q_x(0) = 0.75;
     q_x(1) = 0.25;
-    q_x = stats::quantile(x, q_x);
+    q_x = stats::quantile(x, q_x, weights);
     double scale = std::min((q_x(0) - q_x(1))/1.349, sd_x);
 
+    // use effective sample size for pilot bandwidths
+    double effn = std::pow(weights.sum(), 2) / weights.cwiseAbs2().sum();
     double bw = 0.0;
     try {
         sx /= scale;
         double sa = (a - m_x) / scale;
         double sb = (b - m_x) / scale;
-        auto x2 = linbin(sx, sa, sb, grid_size);
+        auto x2 = linbin(sx, sa, sb, grid_size, weights);
 
-        double alpha = std::pow(std::pow(2.0, 11.0 / 2.0)/(7.0 * n), 1.0 / 9.0);
+        double alpha = std::pow(std::pow(2.0, 11.0 / 2.0)/(7.0 * effn), 1.0 / 9.0);
         double psi6hat = bkfe(x2, 6, alpha, sa, sb);
-        alpha = std::pow(-3.0 * std::sqrt(2.0 / M_PI)/(psi6hat * n), 1.0 / 7.0);
+        alpha = std::pow(-3.0 * std::sqrt(2.0 / M_PI)/(psi6hat * effn), 1.0 / 7.0);
         double psi4hat = bkfe(x2, 4, alpha, sa, sb);
 
         double del0 = 1.0 / std::pow(4.0 * M_PI, 1.0 / 10.0);
-        bw = scale * del0 * std::pow(1.0 / (psi4hat * n), 1.0 / 5.0);
+        bw = scale * del0 * std::pow(1.0 / (psi4hat * effn), 1.0 / 5.0);
     } catch (...) {
-        bw = 4.0 * 1.06 *scale * std::pow(1.0 / n, 1.0 / 5.0);
+        bw = 4.0 * 1.06 * scale * std::pow(1.0 / effn, 1.0 / 5.0);
     }
 
     return(bw);

diff --git a/inst/include/lpdens.hpp b/inst/include/lpdens.hpp
@@ -10,7 +10,8 @@ class LPDens1d {
 public:
     // constructors
     LPDens1d() {}
-    LPDens1d(Eigen::VectorXd x, double bw, double xmin, double xmax, size_t p);
+    LPDens1d(Eigen::VectorXd x, double bw, double xmin, double xmax, size_t p,
+             const Eigen::VectorXd& weights = Eigen::VectorXd());
 
     // getters
     Eigen::VectorXd get_values() const {return grid_.get_values();}
@@ -35,12 +36,14 @@ class LPDens1d {
     // private methods
     Eigen::VectorXd kern_gauss(const Eigen::VectorXd& x);
     Eigen::MatrixXd fit_lp(const Eigen::VectorXd& x_ev,
-                           const Eigen::VectorXd& x);
+                           const Eigen::VectorXd& x,
+                           const Eigen::VectorXd& weights);
     double calculate_infl(const size_t& n,
                           const double& f0,
                           const double& b,
                           const double& bw,
-                          const double& s);
+                          const double& s,
+                          const double& weight);
     Eigen::VectorXd boundary_transform(const Eigen::VectorXd& x,
                                        bool inverse = false);
     Eigen::VectorXd boundary_correct(const Eigen::VectorXd& x,
@@ -58,16 +61,21 @@ class LPDens1d {
 //! @param xmax upper bound for the support of the density, `NaN` means no
 //!   boundary.
 //! @param p order of the local polynomial.
+//! @param weights vector of weights for each observation (can be empty).
 inline LPDens1d::LPDens1d(Eigen::VectorXd x,
                           double bw,
                           double xmin,
                           double xmax,
-                          size_t deg) :
+                          size_t deg,
+                          const Eigen::VectorXd& weights) :
     bw_(bw),
     xmin_(xmin),
     xmax_(xmax),
     deg_(deg)
 {
+    if (weights.size() > 0 && (weights.size() != x.size()))
+        throw std::runtime_error("x and weights must have the same size");
+
     // construct equally spaced grid on original domain
     Eigen::VectorXd grid_points = construct_grid_points(x);
 
@@ -76,7 +84,7 @@ inline LPDens1d::LPDens1d(Eigen::VectorXd x,
     x = boundary_transform(x);
 
     // fit model and evaluate in transformed domain
-    Eigen::MatrixXd fitted = fit_lp(grid_points, x);
+    Eigen::MatrixXd fitted = fit_lp(grid_points, x, weights);
 
     // back-transform grid to original domain
     grid_points = boundary_transform(grid_points, true);
@@ -123,10 +131,12 @@ inline Eigen::VectorXd LPDens1d::kern_gauss(const Eigen::VectorXd& x)
 //! function on a user-supplied grid.
 //! @param x_ev evaluation points.
 //! @param x observations.
+//! @param weights vector of weights for each observation (can be empty).
 //! @return a two-column matrix containing the density estimate in the first
 //!   and the influence function in the second column.
 inline Eigen::MatrixXd LPDens1d::fit_lp(const Eigen::VectorXd& x_ev,
-                                        const Eigen::VectorXd& x)
+                                        const Eigen::VectorXd& x,
+                                        const Eigen::VectorXd& weights)
 {
     Eigen::MatrixXd res(x_ev.size(), 2);
     size_t n = x.size();
@@ -140,6 +150,8 @@ inline Eigen::MatrixXd LPDens1d::fit_lp(const Eigen::VectorXd& x_ev,
         // classical (local constant) kernel density estimate
         xx = (x.array() - x_ev(k)) / bw_;
         kernels = kern_gauss(xx) / bw_;
+        if (weights.size() > 0)
+            kernels = kernels.cwiseProduct(weights);
         f0 = kernels.mean();
         res(k, 0) = f0;
 
@@ -168,7 +180,11 @@ inline Eigen::MatrixXd LPDens1d::fit_lp(const Eigen::VectorXd& x_ev,
         }
 
         // influence function estimate
-        res(k, 1) = calculate_infl(n, f0, b, bw_, s);
+        if (weights.size() > 0) {
+            res(k, 1) = calculate_infl(n, f0, b, bw_, s, weights(k));
+        } else {
+            res(k, 1) = calculate_infl(n, f0, b, bw_, s, 1.0);
+        }
     }
 
     return res;
@@ -180,7 +196,8 @@ inline double LPDens1d::calculate_infl(const size_t &n,
                                        const double& f0,
                                        const double& b,
                                        const double& bw,
-                                       const double& s)
+                                       const double& s,
+                                       const double& weight)
 {
     Eigen::MatrixXd M;
     double bw2 = std::pow(bw, 2);
@@ -208,7 +225,7 @@ inline double LPDens1d::calculate_infl(const size_t &n,
     }
 
     double infl = kern_gauss(Eigen::VectorXd::Zero(1))(0) / bw;
-    infl *= M.inverse()(0, 0) / static_cast<double>(n);
+    infl *= M.inverse()(0, 0) * weight / static_cast<double>(n);
     return infl;
 }
 

diff --git a/inst/include/stats.hpp b/inst/include/stats.hpp
@@ -46,7 +46,7 @@ inline Eigen::MatrixXd qnorm(const Eigen::MatrixXd& x)
 //! @param x data.
 //! @param q evaluation points.
 //! @return vector of quantiles.
-inline Eigen::VectorXd quantile(const Eigen::VectorXd &x,
+inline Eigen::VectorXd quantile(const Eigen::VectorXd& x,
                                 const Eigen::VectorXd& q)
 {
     double n = static_cast<double>(x.size());
@@ -66,4 +66,47 @@ inline Eigen::VectorXd quantile(const Eigen::VectorXd &x,
     return res;
 }
 
+//! empirical quantiles
+//! @param x data.
+//! @param q evaluation points.
+//! @param w vector of weights.
+//! @return vector of quantiles.
+inline Eigen::VectorXd quantile(const Eigen::VectorXd& x,
+                                const Eigen::VectorXd& q,
+                                const Eigen::VectorXd& w)
+{
+    if (w.size() != x.size())
+        throw std::runtime_error("x and w must have the same size");
+    double n = static_cast<double>(x.size());
+    size_t m = q.size();
+    Eigen::VectorXd res(m);
+
+    // map to std::vector and sort
+    std::vector<size_t> ind(n);
+    for (size_t i = 0; i < n; ++i)
+        ind[i] = i;
+    std::sort(ind.begin(), ind.end(),
+              [&x] (size_t i, size_t j) { return x(i) < x(j); });
+
+    auto x2 = x;
+    auto wcum = w;
+    auto wrank = Eigen::VectorXd::Constant(n, 0.0);
+    double wacc = 0.0;
+    for (size_t i = 0; i < n; ++i) {
+        x2(i) = x(ind[i]);
+        wcum(i) = wacc;
+        wacc += w(ind[i]);
+    }
+
+    double wsum = w.sum() - w(ind[n - 1]);;
+    for (size_t j = 0; j < m; ++j) {
+        size_t i = 1;
+        while (wcum(i) < q(j) * wsum)
+            i++;
+        res(j) = x2(i - 1) + (x2(i) - x2(i - 1)) * (q(j) * wsum - wcum(i - 1));
+    }
+
+    return res;
+}
+
 }
diff --git a/man/dkde1d.Rd b/man/dkde1d.Rd
diff --git a/man/kde1d.Rd b/man/kde1d.Rd