diff --git a/.gitignore b/.gitignore index 5b6a065..92db1c7 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ .Rhistory .RData .Ruserdata +revdep diff --git a/DESCRIPTION b/DESCRIPTION index 81cad07..eeb3acb 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: kde1d Type: Package Title: Univariate Kernel Density Estimation -Version: 0.1.2 +Version: 0.2.0 Authors@R: c( person("Thomas", "Nagler",, "mail@tnagler.com", role = c("aut", "cre")), person("Thibault", "Vatter",, "thibault.vatter@gmail.com", role = c("aut")) diff --git a/NEWS.md b/NEWS.md index e6d1acc..2bbd9f5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,20 @@ +# kde1d 0.2.0 + +NEW FEATURES + + * improved stability of density estimates near a boundary (#21). + +BUG FIXES + + * consistent behavior when `dkde1d()` and `pkde1d()` are called with + non-`ordered` input although data are discrete (#19). + + * fixed bug in computation of kernel density estimates (#20). + + * adapt minimum `bw` allowed for discrete data to truncated Gaussian kernel + (#20). + + # kde1d 0.1.2 NEW FEATURES diff --git a/R/kde1d.R b/R/kde1d.R index 42e70e1..dc12c4e 100644 --- a/R/kde1d.R +++ b/R/kde1d.R @@ -50,22 +50,20 @@ #' Statistical Society, Series B, 53, 683–690. #' #' @examples -#' ## For reproducibility -#' set.seed(0) #' #' ## unbounded data -#' x <- rnorm(100) # simulate data +#' x <- rnorm(500) # simulate data #' fit <- kde1d(x) # estimate density -#' dkde1d(1000, fit) # evaluate density estimate +#' dkde1d(0, fit) # evaluate density estimate #' summary(fit) # information about the estimate #' plot(fit) # plot the density estimate #' curve(dnorm(x), add = TRUE, # add true density #' col = "red") #' #' ## bounded data, log-linear -#' x <- rgamma(100, shape = 1) # simulate data +#' x <- rgamma(500, shape = 1) # simulate data #' fit <- kde1d(x, xmin = 0, deg = 1) # estimate density -#' dkde1d(1000, fit) # evaluate density estimate +#' dkde1d(seq(0, 5, by = 1), fit) # evaluate density estimate #' summary(fit) # information about the estimate #' plot(fit) # plot the density estimate #' curve(dgamma(x, shape = 1), # add true density @@ -73,10 +71,10 @@ #' from = 1e-3) #' #' ## discrete data -#' x <- rbinom(100, size = 5, prob = 0.5) # simulate data +#' x <- rbinom(500, size = 5, prob = 0.5) # simulate data #' x <- ordered(x, levels = 0:5) # declare as ordered #' fit <- kde1d(x) # estimate density -#' dkde1d(2, fit) # evaluate density estimate +#' dkde1d(sort(unique(x)), fit) # evaluate density estimate #' summary(fit) # information about the estimate #' plot(fit) # plot the density estimate #' points(ordered(0:5, 0:5), # add true density @@ -109,49 +107,3 @@ kde1d <- function(x, xmin = NaN, xmax = NaN, mult = 1, bw = NA, deg = 2) { class(fit) <- "kde1d" fit } - -#' check and pre-process arguments passed to kde1d() -#' @noRd -check_arguments <- function(x, mult, xmin, xmax, bw, deg) { - stopifnot(NCOL(x) == 1) - - if (!is.ordered(x) & is.factor(x)) - stop("Factors not allowed; use kdevine::kdevine() or cctools::cckde().") - - stopifnot(mult > 0) - - if (is.ordered(x) & (!is.nan(xmin) | !is.nan(xmax))) - stop("xmin and xmax are not meaningful for x of type ordered.") - - if (!is.nan(xmax) & !is.nan(xmin)) { - if (xmin > xmax) - stop("xmin is larger than xmax.") - if (any(x < xmin) || any(x > xmax)) - stop("Not all data are contained in the interval [xmin, xmax].") - } else if (!is.nan(xmin)) { - if (any(x < xmin)) - stop("Not all data are larger than xmin.") - } else if (!is.nan(xmax)) { - if (any(x > xmax)) - stop("Not all data are samller than xmax.") - } - - if (!(deg %in% 0:2)) - stop("deg must be either 0, 1, or 2.") -} - -#' adjusts observations and evaluation points for boundary effects -#' @importFrom stats qnorm -#' @noRd -boundary_transform <- function(x, xmin, xmax) { - if (!is.nan(xmin) & !is.nan(xmax)) { # two boundaries - x <- qnorm((x - xmin) / (xmax - xmin + 1e-1)) - } else if (!is.nan(xmin)) { # left boundary - x <- log(x - xmin + 1e-3) - } else if (!is.nan(xmax)) { # right boundary - x <- log(xmax - x + 1e-3) - } - - x -} - diff --git a/R/kde1d_methods.R b/R/kde1d_methods.R index ef3cd5f..51744d0 100644 --- a/R/kde1d_methods.R +++ b/R/kde1d_methods.R @@ -38,13 +38,21 @@ dkde1d <- function(x, obj) { if (!is.ordered(x)) stopifnot(!is.factor(x)) - x <- expand_as_numeric(x) - fhat <- dkde1d_cpp(x, obj) + # adjust grid to stabilize estimates + rng <- diff(range(obj$grid_points)) + if (!is.nan(obj$xmin)) + obj$grid_points[1] <- obj$xmin - 0.1 * rng + if (!is.nan(obj$xmax)) + obj$grid_points[length(obj$grid_points)] <- obj$xmax + 0.1 * rng + + if (length(obj$jitter_info$i_disc) == 1 & !is.ordered(x)) + x <- ordered(x, obj$jitter_info$levels$x) + + fhat <- dkde1d_cpp(expand_as_numeric(x), obj) if (length(obj$jitter_info$i_disc) == 1) { # for discrete variables we can normalize - x_all_num <- expand_as_numeric(as.ordered(obj$jitter_info$levels$x)) - f_all <- dkde1d_cpp(x_all_num, obj) + f_all <- dkde1d_cpp(seq_along(obj$jitter_info$levels$x) - 1, obj) fhat <- fhat / sum(f_all) } @@ -60,15 +68,14 @@ pkde1d <- function(q, obj) { if (!is.ordered(q)) stopifnot(!is.factor(q)) - q <- expand_as_numeric(q) - if (length(obj$jitter_info$i_disc) != 1) { p <- pkde1d_cpp(q, obj) } else { - # for discrete variables we have to add the missing probability mass - x_all_num <- expand_as_numeric(as.ordered(obj$jitter_info$levels$x)) - f_all <- dkde1d(x_all_num, obj) - p <- sapply(q, function(y) sum(f_all[x_all_num <= y])) + if (!is.ordered(q)) + q <- ordered(q, obj$jitter_info$levels$x) + x_all <- as.ordered(obj$jitter_info$levels$x) + p_all <- dkde1d(x_all, obj) + p <- sapply(q, function(y) sum(p_all[x_all <= y])) p <- pmin(pmax(p, 0), 1) } @@ -81,19 +88,17 @@ pkde1d <- function(q, obj) { qkde1d <- function(p, obj) { stopifnot(all(na.omit(p) > 0.0) & all(na.omit(p) < 1.0)) if (length(obj$jitter_info$i_disc) != 1) { + ## qkde1d_cpp for continuous variables q <- qkde1d_cpp(p, obj) } else { ## for discrete variables compute quantile from the density - x_all_num <- expand_as_numeric(as.ordered(obj$jitter_info$levels$x)) + x_all <- as.ordered(obj$jitter_info$levels$x) # pdf at all possible values of x - dd <- dkde1d(x_all_num, obj) - pp <- c(cumsum(dd)) / sum(dd) + pp <- pkde1d(x_all, obj) # generalized inverse - q <- x_all_num[vapply(p, function(y) which(y <= pp)[1], integer(1))] - q <- ordered(obj$jitter_info$levels$x[q + 1], - levels = obj$jitter_info$levels$x) + q <- x_all[vapply(p, function(y) which(y <= pp)[1], integer(1))] } q @@ -156,13 +161,19 @@ plot.kde1d <- function(x, ...) { ev <- ordered(x$jitter_info$levels$x, levels = x$jitter_info$levels$x) plot_type <- "h" # for discrete variables, use a histrogram - x$values <- dkde1d(ev, x) - x$grid_points <- ev + } else { + # adjust grid if necessary + ev <- x$grid_points + if (!is.nan(x$xmin)) + ev[1] <- x$xmin + if (!is.nan(x$xmax)) + ev[length(ev)] <- x$xmax } + vals <- dkde1d(ev, x) pars <- list( - x = x$grid_points, - y = x$values, + x = ev, + y = vals, type = plot_type, xlab = "x", ylab = "density", diff --git a/R/tools.R b/R/tools.R new file mode 100644 index 0000000..8582a74 --- /dev/null +++ b/R/tools.R @@ -0,0 +1,53 @@ +#' check if data violate boundaries +#' @noRd +check_boundary_violations <- function(x, xmin, xmax) { + if (!is.nan(xmax) & !is.nan(xmin)) { + if (any(x < xmin) || any(x > xmax)) + stop("Not all data are contained in the interval [xmin, xmax].") + } else if (!is.nan(xmin)) { + if (any(x < xmin)) + stop("Not all data are larger than xmin.") + } else if (!is.nan(xmax)) { + if (any(x > xmax)) + stop("Not all data are samller than xmax.") + } +} + +#' check and pre-process arguments passed to kde1d() +#' @noRd +check_arguments <- function(x, mult, xmin, xmax, bw, deg) { + stopifnot(NCOL(x) == 1) + + if (!is.ordered(x) & is.factor(x)) + stop("Factors not allowed; use kdevine::kdevine() or cctools::cckde().") + + stopifnot(mult > 0) + + if (is.ordered(x) & (!is.nan(xmin) | !is.nan(xmax))) + stop("xmin and xmax are not meaningful for x of type ordered.") + + if (!is.nan(xmax) & !is.nan(xmin)) { + if (xmin > xmax) + stop("xmin is larger than xmax.") + } + check_boundary_violations(x, xmin, xmax) + + if (!(deg %in% 0:2)) + stop("deg must be either 0, 1, or 2.") +} + +#' adjusts observations and evaluation points for boundary effects +#' @importFrom stats qnorm +#' @noRd +boundary_transform <- function(x, xmin, xmax) { + if (!is.nan(xmin) & !is.nan(xmax)) { # two boundaries + x <- qnorm((x - xmin) / (xmax - xmin + 1e-1)) + } else if (!is.nan(xmin)) { # left boundary + x <- log(x - xmin + 1e-3) + } else if (!is.nan(xmax)) { # right boundary + x <- log(xmax - x + 1e-3) + } + + x +} + diff --git a/README.md b/README.md index 6ab0a14..e574ba0 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ devtools::install_github("tnagler/kde1d") ``` r x <- rnorm(100) # simulate data fit <- kde1d(x) # estimate density -dkde1d(1000, fit) # evaluate density estimate +dkde1d(0, fit) # evaluate density estimate summary(fit) # information about the estimate plot(fit) # plot the density estimate curve(dnorm(x), add = TRUE, # add true density @@ -48,7 +48,7 @@ curve(dnorm(x), add = TRUE, # add true density ``` r x <- rgamma(100, shape = 1) # simulate data fit <- kde1d(x, xmin = 0, deg = 1) # estimate density -dkde1d(1000, fit) # evaluate density estimate +dkde1d(seq(0, 5, by = 1), fit) # evaluate density estimate summary(fit) # information about the estimate plot(fit) # plot the density estimate curve(dgamma(x, shape = 1), # add true density @@ -61,7 +61,7 @@ curve(dgamma(x, shape = 1), # add true density x <- rbinom(100, size = 5, prob = 0.5) # simulate data x <- ordered(x, levels = 0:5) # declare as ordered fit <- kde1d(x) # estimate density -dkde1d(2, fit) # evaluate density estimate +dkde1d(sort(unique(x)), fit) # evaluate density estimate summary(fit) # information about the estimate plot(fit) # plot the density estimate points(ordered(0:5, 0:5), # add true density diff --git a/cran-comments.md b/cran-comments.md index dbf5918..5ec039e 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,29 +1,8 @@ -## New submission after package has been archived - -Problem 1: ASAN failures -Solution: Fixed heap buffer overflows on clang-ASAN related to max-comparison -of a size_t and a negative number. - -Problem 2: Failed without long doubles -Solution: Fixed cdf calculations for discrete data which made /tests fail. - -Problem 3: Threw a C++ exception under valgrind (fatal to the R process) -Solution: This is a known issue of valgrind + boost; fixed by adding -'#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false' in /inst/include/stats.hpp -(as suggested in http://svn.boost.org/trac10/ticket/10005). - -We are sorry for the delay in fixing these issues, reproducing the bugs reported -by clang-ASAN was way harder than expected. - ## Test environments * Debian Linux with clang-4.0-ASAN enabled on rocker/r-devel-ubsan-clang (devel) -* Ubuntu 16.04 with clang-7.0.0-ASAN enabled (devel) -* Ubuntu 16.04 without long double support (devel) -* Ubuntu 16.04 with valgrind (devel) * local OS X install (release) -* ubuntu 12.04 on travis-ci (oldrel, release, devel) +* ubuntu 12.04 on travis-ci (release, devel) * win-builder (devel) -* Windows Server 2012 R2 x64 + x86 (release) ## R CMD check results @@ -31,4 +10,4 @@ by clang-ASAN was way harder than expected. ## Reverse dependencies -This is a new release, so there are no reverse dependencies. +There are no reverse dependencies. diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html index 2fd6130..ff960c3 100644 --- a/docs/LICENSE-text.html +++ b/docs/LICENSE-text.html @@ -54,7 +54,7 @@ kde1d - 0.1.2 + 0.2.0 diff --git a/docs/authors.html b/docs/authors.html index bd1562d..49f0a9b 100644 --- a/docs/authors.html +++ b/docs/authors.html @@ -54,7 +54,7 @@ kde1d - 0.1.2 + 0.2.0 diff --git a/docs/index.html b/docs/index.html index 60cb015..e307610 100644 --- a/docs/index.html +++ b/docs/index.html @@ -34,7 +34,7 @@ kde1d - 0.1.2 + 0.2.0 @@ -106,7 +106,7 @@
Unbounded data
x <- rnorm(100)                    # simulate data
 fit <- kde1d(x)                    # estimate density
-dkde1d(1000, fit)                  # evaluate density estimate
+dkde1d(0, fit)                     # evaluate density estimate
 summary(fit)                       # information about the estimate
 plot(fit)                          # plot the density estimate
 curve(dnorm(x), add = TRUE,        # add true density
@@ -117,7 +117,7 @@ 
Bounded data, log-linear
x <- rgamma(100, shape = 1)        # simulate data
 fit <- kde1d(x, xmin = 0, deg = 1) # estimate density
-dkde1d(1000, fit)                  # evaluate density estimate
+dkde1d(seq(0, 5, by = 1), fit)     # evaluate density estimate
 summary(fit)                       # information about the estimate
 plot(fit)                          # plot the density estimate
 curve(dgamma(x, shape = 1),        # add true density
@@ -130,7 +130,7 @@ 
x <- rbinom(100, size = 5, prob = 0.5)  # simulate data
 x <- ordered(x, levels = 0:5)           # declare as ordered
 fit <- kde1d(x)                         # estimate density
-dkde1d(2, fit)                          # evaluate density estimate
+dkde1d(sort(unique(x)), fit)            # evaluate density estimate
 summary(fit)                            # information about the estimate
 plot(fit)                               # plot the density estimate
 points(ordered(0:5, 0:5),               # add true density
diff --git a/docs/news/index.html b/docs/news/index.html
index 2d264db..14b9cd6 100644
--- a/docs/news/index.html
+++ b/docs/news/index.html
@@ -54,7 +54,7 @@
       
       
         kde1d
-        0.1.2
+        0.2.0
       
     
@@ -97,9 +97,24 @@

Changelog

Source: NEWS.md
+
+

+kde1d 0.2.0 Unreleased +

+

NEW FEATURES

+
    +
  • improved stability of density estimates near a boundary (#21).
  • +
+

BUG FIXES

+
    +
  • consistent behavior when dkde1d() and pkde1d() are called with non-ordered input although data are discrete (#19).

  • +
  • fixed bug in computation of kernel density estimates (#20).

  • +
  • adapt minimum bw allowed for discrete data to truncated Gaussian kernel (#20).

  • +
+

-kde1d 0.1.2 Unreleased +kde1d 0.1.2 2018-04-17

NEW FEATURES

    @@ -125,6 +140,7 @@

    Contents

    diff --git a/docs/reference/dkde1d-1.png b/docs/reference/dkde1d-1.png index 639c140..5b5d618 100644 Binary files a/docs/reference/dkde1d-1.png and b/docs/reference/dkde1d-1.png differ diff --git a/docs/reference/dkde1d.html b/docs/reference/dkde1d.html index c4d0793..7472e75 100644 --- a/docs/reference/dkde1d.html +++ b/docs/reference/dkde1d.html @@ -58,7 +58,7 @@ kde1d - 0.1.2 + 0.2.0
    @@ -169,7 +169,7 @@

    Examp
    set.seed(0) # for reproducibility x <- rnorm(100) # simulate some data fit <- kde1d(x) # estimate density -dkde1d(0, fit) # evaluate density estimate (close to dnorm(0))
    #> [1] 0.451555
    pkde1d(0, fit) # evaluate corresponding cdf (close to pnorm(0))
    #> [1] 0.4972006
    qkde1d(0.5, fit) # quantile function (close to qnorm(0))
    #> [1] 0.006201171
    hist(rkde1d(100, fit)) # simulate
    +dkde1d(0, fit) # evaluate density estimate (close to dnorm(0))
    #> [1] 0.4780549
    pkde1d(0, fit) # evaluate corresponding cdf (close to pnorm(0))
    #> [1] 0.5156368
    qkde1d(0.5, fit) # quantile function (close to qnorm(0))
    #> [1] -0.03249129
    hist(rkde1d(100, fit)) # simulate

diff --git a/docs/reference/kde1d-1.png b/docs/reference/kde1d-1.png index efe4ab1..1beafba 100644 Binary files a/docs/reference/kde1d-1.png and b/docs/reference/kde1d-1.png differ diff --git a/docs/reference/kde1d-2.png b/docs/reference/kde1d-2.png index ee68a73..a246cb6 100644 Binary files a/docs/reference/kde1d-2.png and b/docs/reference/kde1d-2.png differ diff --git a/docs/reference/kde1d-3.png b/docs/reference/kde1d-3.png index 81abf5e..80842fb 100644 Binary files a/docs/reference/kde1d-3.png and b/docs/reference/kde1d-3.png differ diff --git a/docs/reference/kde1d-package.html b/docs/reference/kde1d-package.html index eb143a1..a68054f 100644 --- a/docs/reference/kde1d-package.html +++ b/docs/reference/kde1d-package.html @@ -60,7 +60,7 @@ kde1d - 0.1.2 + 0.2.0
diff --git a/docs/reference/kde1d.html b/docs/reference/kde1d.html index faf7228..5f2832c 100644 --- a/docs/reference/kde1d.html +++ b/docs/reference/kde1d.html @@ -58,7 +58,7 @@ kde1d - 0.1.2 + 0.2.0 @@ -182,31 +182,29 @@

See a

Examples

-
## For reproducibility -set.seed(0) - +
## unbounded data -x <- rnorm(100) # simulate data +x <- rnorm(500) # simulate data fit <- kde1d(x) # estimate density -dkde1d(1000, fit) # evaluate density estimate
#> [1] 1e-12
summary(fit) # information about the estimate
#> kernel density estimate ('kde1d'), log-quadratic +dkde1d(0, fit) # evaluate density estimate
#> [1] 0.4123974
summary(fit) # information about the estimate
#> kernel density estimate ('kde1d'), log-quadratic #> ----------------------------------------------------------------- -#> nobs = 100, bw = 0.37, loglik = -128.39, d.f. = 2.24
plot(fit) # plot the density estimate
curve(dnorm(x), add = TRUE, # add true density +#> nobs = 500, bw = 0.29, loglik = -700.87, d.f. = 10.58
plot(fit) # plot the density estimate
curve(dnorm(x), add = TRUE, # add true density col = "red")
## bounded data, log-linear -x <- rgamma(100, shape = 1) # simulate data +x <- rgamma(500, shape = 1) # simulate data fit <- kde1d(x, xmin = 0, deg = 1) # estimate density -dkde1d(1000, fit) # evaluate density estimate
#> [1] 482652.4
summary(fit) # information about the estimate
#> kernel density estimate ('kde1d'), log-linear with bounded support (xmin = 0) +dkde1d(seq(0, 5, by = 1), fit) # evaluate density estimate
#> [1] 1.144082921 0.318018791 0.108508811 0.036081984 0.009148931 0.001913149
summary(fit) # information about the estimate
#> kernel density estimate ('kde1d'), log-linear with bounded support (xmin = 0) #> ----------------------------------------------------------------- -#> nobs = 100, bw = 0.46, loglik = -98.75, d.f. = 3.51
plot(fit) # plot the density estimate
curve(dgamma(x, shape = 1), # add true density +#> nobs = 500, bw = 0.35, loglik = -471.12, d.f. = 39.2
plot(fit) # plot the density estimate
curve(dgamma(x, shape = 1), # add true density add = TRUE, col = "red", from = 1e-3)
## discrete data -x <- rbinom(100, size = 5, prob = 0.5) # simulate data +x <- rbinom(500, size = 5, prob = 0.5) # simulate data x <- ordered(x, levels = 0:5) # declare as ordered fit <- kde1d(x) # estimate density -dkde1d(2, fit) # evaluate density estimate
#> [1] 0.332243
summary(fit) # information about the estimate
#> (jittered) kernel density estimate ('kde1d'), log-quadratic +dkde1d(sort(unique(x)), fit) # evaluate density estimate
#> [1] 0.04832389 0.15290500 0.28419148 0.30222901 0.18384615 0.02850447
summary(fit) # information about the estimate
#> (jittered) kernel density estimate ('kde1d'), log-quadratic #> ----------------------------------------------------------------- -#> nobs = 100, bw = 0.5, loglik = -154.49, d.f. = 2.94
plot(fit) # plot the density estimate
points(ordered(0:5, 0:5), # add true density +#> nobs = 500, bw = 0.36, loglik = -787.47, d.f. = 14.62
plot(fit) # plot the density estimate
points(ordered(0:5, 0:5), # add true density dbinom(0:5, 5, 0.5), col = "red")
diff --git a/docs/reference/plot.kde1d-1.png b/docs/reference/plot.kde1d-1.png index 966ed18..37bb3a2 100644 Binary files a/docs/reference/plot.kde1d-1.png and b/docs/reference/plot.kde1d-1.png differ diff --git a/docs/reference/plot.kde1d-2.png b/docs/reference/plot.kde1d-2.png index 5070e47..5804940 100644 Binary files a/docs/reference/plot.kde1d-2.png and b/docs/reference/plot.kde1d-2.png differ diff --git a/docs/reference/plot.kde1d.html b/docs/reference/plot.kde1d.html index f8b9d23..7bcf736 100644 --- a/docs/reference/plot.kde1d.html +++ b/docs/reference/plot.kde1d.html @@ -57,7 +57,7 @@ kde1d - 0.1.2 + 0.2.0 diff --git a/inst/include/lpdens.hpp b/inst/include/lpdens.hpp index 95b856e..11cfdb0 100644 --- a/inst/include/lpdens.hpp +++ b/inst/include/lpdens.hpp @@ -138,7 +138,7 @@ inline Eigen::MatrixXd LPDens1d::fit_lp(const Eigen::VectorXd& x_ev, Eigen::VectorXd kernels(x.size()); for (size_t k = 0; k < x_ev.size(); k++) { // classical (local constant) kernel density estimate - xx = x.array() - x_ev(k); + xx = (x.array() - x_ev(k)) / bw_; kernels = kern_gauss(xx) / bw_; f0 = kernels.mean(); res(k, 0) = f0; @@ -149,13 +149,12 @@ inline Eigen::MatrixXd LPDens1d::fit_lp(const Eigen::VectorXd& x_ev, f1 = xx.cwiseProduct(kernels).mean(); // first order derivative b = f1 / f0; - if (deg_ > 1) { + if (deg_ == 2) { // more calculations for local quadratic xx2 = xx.cwiseProduct(kernels) / (f0 * static_cast(n)); b *= std::pow(bw_, 2); - s = 1.0 / (std::pow(bw_, 4) * xx.cwiseProduct(xx2).sum() - - std::pow(b, 2)); - res(k, 0) *= std::sqrt(s) / bw_; + s = 1.0 / (std::pow(bw_, 4) * xx.transpose() * xx2 - std::pow(b, 2)); + res(k, 0) *= bw_ * std::sqrt(s); } // final estimate @@ -226,7 +225,7 @@ inline Eigen::VectorXd LPDens1d::boundary_transform(const Eigen::VectorXd& x, if (!inverse) { if (!std::isnan(xmin_) & !std::isnan(xmax_)) { // two boundaries -> probit transform - x_new = (x.array() - xmin_ + 5e-3) / (xmax_ - xmin_ + 1e-2); + x_new = (x.array() - xmin_ + 5e-5) / (xmax_ - xmin_ + 1e-4); x_new = stats::qnorm(x_new); } else if (!std::isnan(xmin_)) { // left boundary -> log transform @@ -240,8 +239,8 @@ inline Eigen::VectorXd LPDens1d::boundary_transform(const Eigen::VectorXd& x, } else { if (!std::isnan(xmin_) & !std::isnan(xmax_)) { // two boundaries -> probit transform - x_new = stats::pnorm(x).array() + xmin_ - 5e-3; - x_new *= (xmax_ - xmin_ + 1e-2); + x_new = stats::pnorm(x).array() + xmin_ - 5e-5; + x_new *= (xmax_ - xmin_ + 1e-4); } else if (!std::isnan(xmin_)) { // left boundary -> log transform x_new = x.array().exp() + xmin_ - 1e-3; @@ -267,10 +266,10 @@ inline Eigen::VectorXd LPDens1d::boundary_correct(const Eigen::VectorXd& x, Eigen::VectorXd corr_term(fhat.size()); if (!std::isnan(xmin_) & !std::isnan(xmax_)) { // two boundaries -> probit transform - corr_term = (x.array() - xmin_ + 5e-3) / (xmax_ - xmin_ + 1e-2); + corr_term = (x.array() - xmin_ + 5e-5) / (xmax_ - xmin_ + 1e-4); corr_term = stats::dnorm(stats::qnorm(corr_term)); - corr_term /= (xmax_ - xmin_ + 1e-2); - corr_term = 1.0 / corr_term.array().max(1e-4); + corr_term /= (xmax_ - xmin_ + 1e-4); + corr_term = 1.0 / corr_term.array().max(1e-6); } else if (!std::isnan(xmin_)) { // left boundary -> log transform corr_term = 1.0 / (1e-3 + x.array() - xmin_); @@ -330,6 +329,7 @@ inline Eigen::VectorXd LPDens1d::construct_grid_points(const Eigen::VectorXd& x) //! @param grid_points the grid points. inline Eigen::VectorXd LPDens1d::finalize_grid(Eigen::VectorXd& grid_points) { + double range = grid_points.maxCoeff() - grid_points.minCoeff(); if (!std::isnan(xmin_)) grid_points(0) = xmin_; if (!std::isnan(xmax_)) diff --git a/man/kde1d.Rd b/man/kde1d.Rd index 9cee9bc..5261b45 100644 --- a/man/kde1d.Rd +++ b/man/kde1d.Rd @@ -40,22 +40,20 @@ log-transform is used if there is only one boundary (see, Geenens and Wang, Discrete variables are handled via jittering (see, Nagler, 2018a, 2018b). } \examples{ -## For reproducibility -set.seed(0) ## unbounded data -x <- rnorm(100) # simulate data +x <- rnorm(500) # simulate data fit <- kde1d(x) # estimate density -dkde1d(1000, fit) # evaluate density estimate +dkde1d(0, fit) # evaluate density estimate summary(fit) # information about the estimate plot(fit) # plot the density estimate curve(dnorm(x), add = TRUE, # add true density col = "red") ## bounded data, log-linear -x <- rgamma(100, shape = 1) # simulate data +x <- rgamma(500, shape = 1) # simulate data fit <- kde1d(x, xmin = 0, deg = 1) # estimate density -dkde1d(1000, fit) # evaluate density estimate +dkde1d(seq(0, 5, by = 1), fit) # evaluate density estimate summary(fit) # information about the estimate plot(fit) # plot the density estimate curve(dgamma(x, shape = 1), # add true density @@ -63,10 +61,10 @@ curve(dgamma(x, shape = 1), # add true density from = 1e-3) ## discrete data -x <- rbinom(100, size = 5, prob = 0.5) # simulate data +x <- rbinom(500, size = 5, prob = 0.5) # simulate data x <- ordered(x, levels = 0:5) # declare as ordered fit <- kde1d(x) # estimate density -dkde1d(2, fit) # evaluate density estimate +dkde1d(sort(unique(x)), fit) # evaluate density estimate summary(fit) # information about the estimate plot(fit) # plot the density estimate points(ordered(0:5, 0:5), # add true density diff --git a/src/wrappers.cpp b/src/wrappers.cpp index b003103..91c8a19 100644 --- a/src/wrappers.cpp +++ b/src/wrappers.cpp @@ -54,7 +54,17 @@ InterpolationGrid1d wrap_to_cpp(const Rcpp::List& R_object) Eigen::VectorXd dkde1d_cpp(const Eigen::VectorXd& x, const Rcpp::List& R_object) { - return wrap_to_cpp(R_object).interpolate(x); + Eigen::VectorXd fhat = wrap_to_cpp(R_object).interpolate(x); + double xmin = R_object["xmin"]; + double xmax = R_object["xmax"]; + if (!std::isnan(xmin)) { + fhat = (x.array() < xmin).select(Eigen::VectorXd::Zero(x.size()), fhat); + } + if (!std::isnan(xmax)) { + fhat = (x.array() > xmax).select(Eigen::VectorXd::Zero(x.size()), fhat); + } + + return fhat; } //' computes the cdf of a kernel density estimate by numerical integration. @@ -117,7 +127,7 @@ double select_bw_cpp(const Eigen::VectorXd& x, bw *= mult; if (discrete) { - bw = std::max(bw, 0.5); + bw = std::max(bw, 0.5 / 5); } return(bw); diff --git a/tests/testthat/test_kde1d.R b/tests/testthat/test_kde1d.R index ae1a635..4451888 100644 --- a/tests/testthat/test_kde1d.R +++ b/tests/testthat/test_kde1d.R @@ -73,6 +73,15 @@ test_that("d/p/r/h functions work", { expect_lte(max(pkde1d(sim, fit), 1), 1) expect_that(all(qkde1d(u, fit) >= xmin), equals(TRUE)) expect_that(all(qkde1d(u, fit) <= xmax), equals(TRUE)) + if (!is.nan(fit$xmin)) { + expect_equal(dkde1d(xmin - 1, fit), 0) + expect_equal(pkde1d(xmin - 1, fit), 0) + } + + if (!is.nan(fit$xmax)) { + expect_equal(dkde1d(xmax + 1, fit), 0) + expect_equal(pkde1d(xmax + 1, fit), 1) + } } sims <- lapply(fits, function(x) rkde1d(n, x)) @@ -108,4 +117,22 @@ test_that("other generics work", { } lapply(fits, test_other_generics) +}) + +test_that("behavior for discrete data is consistent", { + n <- 1e3 + x <- ordered(sample(5, n, TRUE), 1:5) + fit <- kde1d(x) + xx <- ordered(1:5, 1:5) + expect_equal(dkde1d(1:5, fit), dkde1d(xx, fit)) + expect_equal(pkde1d(1:5, fit), pkde1d(xx, fit)) + expect_true(all(is.na(dkde1d(c(0, 6), fit)))) + expect_true(all(is.na(pkde1d(c(0, 6), fit)))) + expect_true(all(rkde1d(n, fit) %in% x)) +}) + +test_that("estimates for discrete data are reasonable", { + x <- ordered(sample(5, 1e5, TRUE), 1:5) + fit <- kde1d(x) + expect_true(all(abs(dkde1d(1:5, fit) - 0.2) < 0.1)) }) \ No newline at end of file