From d58b00a58691e1b3753525759535d8abe942e304 Mon Sep 17 00:00:00 2001 From: hadley Date: Fri, 12 Jun 2015 11:06:56 -0500 Subject: [PATCH] Use same density estimation code in stat_density and stat_ydensity. Fixes #972 --- R/position-collide.r | 9 +++-- R/stat-density.r | 61 ++++++++++++++++++++++++---------- R/stat-ydensity.r | 57 +++++++++---------------------- inst/tests/test-stat-density.R | 13 ++++++++ man/stat_density.Rd | 10 +++--- man/stat_ydensity.Rd | 8 +++-- 6 files changed, 88 insertions(+), 70 deletions(-) create mode 100644 inst/tests/test-stat-density.R diff --git a/R/position-collide.r b/R/position-collide.r index 489a888f0f..520df1a5a5 100644 --- a/R/position-collide.r +++ b/R/position-collide.r @@ -67,11 +67,10 @@ pos_stack <- function(df, width) { heights <- c(0, cumsum(y)) } - within(df, { - ymin <- heights[-n] - ymax <- heights[-1] - y <- ymax - }) + df$ymin <- heights[-n] + df$ymax <- heights[-1] + df$y <- df$ymax + df } # Stack overlapping intervals and set height to 1. diff --git a/R/stat-density.r b/R/stat-density.r index ff8e2fd3b5..2f2399f8e2 100644 --- a/R/stat-density.r +++ b/R/stat-density.r @@ -6,10 +6,12 @@ #' @param adjust see \code{\link{density}} for details #' @param kernel kernel used for density estimation, see #' \code{\link{density}} for details -#' @param trim if \code{TRUE}, the default, densities are trimmed to the -#' actual range of the data. If \code{FALSE}, they are extended by the -#' default 3 bandwidths (as specified by the \code{cut} parameter to -#' \code{\link{density}}) +#' @param trim This parameter only matters if you are displaying multiple +#' densities in one plot. If \code{FALSE}, the default, each density is +#' computed on the full range of the data. If \code{TRUE}, each density +#' is computed over the range of that group: this typically means the +#' estimated x values will not line-up, and hence you won't be able to +#' stack density values. #' @param na.rm If \code{FALSE} (the default), removes missing values with #' a warning. If \code{TRUE} silently removes missing values. #' @inheritParams stat_identity @@ -100,21 +102,14 @@ StatDensity <- proto(Stat, { data <- remove_missing(data, na.rm, "x", name = "stat_density", finite = TRUE) - n <- nrow(data) - if (n < 3) return(data.frame()) - if (is.null(data$weight)) data$weight <- rep(1, n) / n + if (trim) { + range <- range(data$x, na.rm = TRUE) + } else { + range <- scale_dimension(scales$x, c(0, 0)) + } - range <- scale_dimension(scales$x, c(0, 0)) - xgrid <- seq(range[1], range[2], length=200) - - dens <- stats::density(data$x, adjust=adjust, kernel=kernel, weight=data$weight, from=range[1], to=range[2]) - densdf <- as.data.frame(dens[c("x","y")]) - - densdf$scaled <- densdf$y / max(densdf$y, na.rm = TRUE) - if (trim) densdf <- subset(densdf, x > min(data$x, na.rm = TRUE) & x < max(data$x, na.rm = TRUE)) - - densdf$count <- densdf$y * n - rename(densdf, c(y = "density"), warn_missing = FALSE) + compute_density(data$x, data$w, from = range[1], to = range[2], + adjust = adjust, kernel = kernel) } default_geom <- function(.) GeomArea @@ -122,3 +117,33 @@ StatDensity <- proto(Stat, { required_aes <- c("x") }) + +compute_density <- function(x, w, from, to, bw = "nrd0", adjust = 1, + kernel = "gaussian") { + n <- length(x) + if (is.null(w)) { + w <- rep(1 / n, n) + } + + # if less than 3 points, spread density evenly over points + if (n < 3) { + return(data.frame( + x = x, + density = w / sum(w), + scaled = w / max(w), + count = 1, + n = n + )) + } + + dens <- stats::density(x, weight = w, bw = bw, adjust = adjust, + kernel = kernel, from = from, to = to) + + data.frame( + x = dens$x, + density = dens$y, + scaled = dens$y / max(dens$y, na.rm = TRUE), + count = dens$y * n, + n = n + ) +} diff --git a/R/stat-ydensity.r b/R/stat-ydensity.r index 272c01b9c4..69664062f7 100644 --- a/R/stat-ydensity.r +++ b/R/stat-ydensity.r @@ -5,8 +5,6 @@ #' #' @inheritParams stat_density #' @inheritParams stat_identity -#' @param trim If \code{TRUE} (default), trim the tails of the violins -#' to the range of the data. If \code{FALSE}, don't trim the tails. #' @param scale if "area" (default), all violins have the same area (before trimming #' the tails). If "count", areas are scaled proportionally to the number of #' observations. If "width", all violins have the same maximum width. @@ -60,52 +58,29 @@ StatYdensity <- proto(Stat, { } calculate <- function(., data, scales, width=NULL, adjust=1, kernel="gaussian", - trim=TRUE, na.rm = FALSE, ...) { + trim = FALSE, na.rm = FALSE, ...) { + data <- remove_missing(data, na.rm, "x", name = "stat_density", + finite = TRUE) - n <- nrow(data) - - # if less than 3 points, return a density of 1 everywhere - if (n < 3) { - return(data.frame(data, density = 1, scaled = 1, count = 1)) - } - - # initialize weights if they are not supplied by the user - if (is.null(data$weight)) { data$weight <- rep(1, n) / n } - - # compute the density - dens <- stats::density(data$y, adjust = adjust, kernel = kernel, - weight = data$weight, n = 200) - - # NB: stat_density restricts to the scale range, here we leave that - # free so violins can extend the y scale - densdf <- data.frame(y = dens$x, density = dens$y) - - # scale density to a maximum of 1 - densdf$scaled <- densdf$density / max(densdf$density, na.rm = TRUE) - - # trim density outside of the data range if (trim) { - densdf <- subset(densdf, y > min(data$y, na.rm = TRUE) & y < max(data$y, na.rm = TRUE)) + range <- range(data$y, na.rm = TRUE) + } else { + range <- scale_dimension(scales$y, c(0, 0)) } - # NB: equivalently, we could have used these bounds in the from and - # to arguments of density() - - # scale density by the number of observations - densdf$count <- densdf$density * n - # record the number of observations to be able to scale the density later - densdf$n <- n - - # coordinate on the x axis - densdf$x <- if (is.factor(data$x)) data$x[1] else mean(range(data$x)) + dens <- compute_density(data$y, data$w, from = range[1], to = range[2], + adjust = adjust, kernel = kernel) - # width of the bounding box of the violin plot on the x axis for continuous x - if (length(unique(data$x)) > 1) { width <- diff(range(data$x)) * 0.9 } - densdf$width <- width + dens$y <- dens$x + dens$x <- mean(range(data$x)) - densdf + # Compute width if x has multiple values + if (length(unique(data$x)) > 1) { + width <- diff(range(data$x)) * 0.9 + } + dens$width <- width + dens } default_geom <- function(.) GeomViolin required_aes <- c("x", "y") - }) diff --git a/inst/tests/test-stat-density.R b/inst/tests/test-stat-density.R new file mode 100644 index 0000000000..13847d2ca4 --- /dev/null +++ b/inst/tests/test-stat-density.R @@ -0,0 +1,13 @@ +context("stat_density") # and stat_ydensity + +test_that("compute_density succeeds when variance is zero", { + dens <- compute_density(rep(0, 10), NULL, from = 0.5, to = 0.5) + expect_equal(dens$n, rep(10, 512)) +}) + +test_that("compute_density returns useful df when <3 values", { + dens <- compute_density(c(1, 2), NULL, from = 0, to = 0) + + expect_equal(nrow(dens), 2) + expect_equal(names(dens), c("x", "density", "scaled", "count", "n")) +}) diff --git a/man/stat_density.Rd b/man/stat_density.Rd index dcdf9a57b6..20b41c7ab3 100644 --- a/man/stat_density.Rd +++ b/man/stat_density.Rd @@ -26,10 +26,12 @@ on this layer} \item{kernel}{kernel used for density estimation, see \code{\link{density}} for details} -\item{trim}{if \code{TRUE}, the default, densities are trimmed to the -actual range of the data. If \code{FALSE}, they are extended by the -default 3 bandwidths (as specified by the \code{cut} parameter to -\code{\link{density}})} +\item{trim}{This parameter only matters if you are displaying multiple +densities in one plot. If \code{FALSE}, the default, each density is +computed on the full range of the data. If \code{TRUE}, each density +is computed over the range of that group: this typically means the +estimated x values will not line-up, and hence you won't be able to +stack density values.} \item{na.rm}{If \code{FALSE} (the default), removes missing values with a warning. If \code{TRUE} silently removes missing values.} diff --git a/man/stat_ydensity.Rd b/man/stat_ydensity.Rd index 48e2064c7e..7b87cf51d5 100644 --- a/man/stat_ydensity.Rd +++ b/man/stat_ydensity.Rd @@ -26,8 +26,12 @@ on this layer} \item{kernel}{kernel used for density estimation, see \code{\link{density}} for details} -\item{trim}{If \code{TRUE} (default), trim the tails of the violins -to the range of the data. If \code{FALSE}, don't trim the tails.} +\item{trim}{This parameter only matters if you are displaying multiple +densities in one plot. If \code{FALSE}, the default, each density is +computed on the full range of the data. If \code{TRUE}, each density +is computed over the range of that group: this typically means the +estimated x values will not line-up, and hence you won't be able to +stack density values.} \item{scale}{if "area" (default), all violins have the same area (before trimming the tails). If "count", areas are scaled proportionally to the number of