Skip to content

Commit

Permalink
Use same density estimation code in stat_density and stat_ydensity.
Browse files Browse the repository at this point in the history
Fixes #972
  • Loading branch information
hadley committed Jun 12, 2015
1 parent 403b92f commit d58b00a
Show file tree
Hide file tree
Showing 6 changed files with 88 additions and 70 deletions.
9 changes: 4 additions & 5 deletions R/position-collide.r
Expand Up @@ -67,11 +67,10 @@ pos_stack <- function(df, width) {
heights <- c(0, cumsum(y))
}

within(df, {
ymin <- heights[-n]
ymax <- heights[-1]
y <- ymax
})
df$ymin <- heights[-n]
df$ymax <- heights[-1]
df$y <- df$ymax
df
}

# Stack overlapping intervals and set height to 1.
Expand Down
61 changes: 43 additions & 18 deletions R/stat-density.r
Expand Up @@ -6,10 +6,12 @@
#' @param adjust see \code{\link{density}} for details
#' @param kernel kernel used for density estimation, see
#' \code{\link{density}} for details
#' @param trim if \code{TRUE}, the default, densities are trimmed to the
#' actual range of the data. If \code{FALSE}, they are extended by the
#' default 3 bandwidths (as specified by the \code{cut} parameter to
#' \code{\link{density}})
#' @param trim This parameter only matters if you are displaying multiple
#' densities in one plot. If \code{FALSE}, the default, each density is
#' computed on the full range of the data. If \code{TRUE}, each density
#' is computed over the range of that group: this typically means the
#' estimated x values will not line-up, and hence you won't be able to
#' stack density values.
#' @param na.rm If \code{FALSE} (the default), removes missing values with
#' a warning. If \code{TRUE} silently removes missing values.
#' @inheritParams stat_identity
Expand Down Expand Up @@ -100,25 +102,48 @@ StatDensity <- proto(Stat, {
data <- remove_missing(data, na.rm, "x", name = "stat_density",
finite = TRUE)

n <- nrow(data)
if (n < 3) return(data.frame())
if (is.null(data$weight)) data$weight <- rep(1, n) / n
if (trim) {
range <- range(data$x, na.rm = TRUE)
} else {
range <- scale_dimension(scales$x, c(0, 0))
}

range <- scale_dimension(scales$x, c(0, 0))
xgrid <- seq(range[1], range[2], length=200)

dens <- stats::density(data$x, adjust=adjust, kernel=kernel, weight=data$weight, from=range[1], to=range[2])
densdf <- as.data.frame(dens[c("x","y")])

densdf$scaled <- densdf$y / max(densdf$y, na.rm = TRUE)
if (trim) densdf <- subset(densdf, x > min(data$x, na.rm = TRUE) & x < max(data$x, na.rm = TRUE))

densdf$count <- densdf$y * n
rename(densdf, c(y = "density"), warn_missing = FALSE)
compute_density(data$x, data$w, from = range[1], to = range[2],
adjust = adjust, kernel = kernel)
}

default_geom <- function(.) GeomArea
default_aes <- function(.) aes(y = ..density.., fill=NA)
required_aes <- c("x")

})

compute_density <- function(x, w, from, to, bw = "nrd0", adjust = 1,
kernel = "gaussian") {
n <- length(x)
if (is.null(w)) {
w <- rep(1 / n, n)
}

# if less than 3 points, spread density evenly over points
if (n < 3) {
return(data.frame(
x = x,
density = w / sum(w),
scaled = w / max(w),
count = 1,
n = n
))
}

dens <- stats::density(x, weight = w, bw = bw, adjust = adjust,
kernel = kernel, from = from, to = to)

data.frame(
x = dens$x,
density = dens$y,
scaled = dens$y / max(dens$y, na.rm = TRUE),
count = dens$y * n,
n = n
)
}
57 changes: 16 additions & 41 deletions R/stat-ydensity.r
Expand Up @@ -5,8 +5,6 @@
#'
#' @inheritParams stat_density
#' @inheritParams stat_identity
#' @param trim If \code{TRUE} (default), trim the tails of the violins
#' to the range of the data. If \code{FALSE}, don't trim the tails.
#' @param scale if "area" (default), all violins have the same area (before trimming
#' the tails). If "count", areas are scaled proportionally to the number of
#' observations. If "width", all violins have the same maximum width.
Expand Down Expand Up @@ -60,52 +58,29 @@ StatYdensity <- proto(Stat, {
}

calculate <- function(., data, scales, width=NULL, adjust=1, kernel="gaussian",
trim=TRUE, na.rm = FALSE, ...) {
trim = FALSE, na.rm = FALSE, ...) {
data <- remove_missing(data, na.rm, "x", name = "stat_density",
finite = TRUE)

n <- nrow(data)

# if less than 3 points, return a density of 1 everywhere
if (n < 3) {
return(data.frame(data, density = 1, scaled = 1, count = 1))
}

# initialize weights if they are not supplied by the user
if (is.null(data$weight)) { data$weight <- rep(1, n) / n }

# compute the density
dens <- stats::density(data$y, adjust = adjust, kernel = kernel,
weight = data$weight, n = 200)

# NB: stat_density restricts to the scale range, here we leave that
# free so violins can extend the y scale
densdf <- data.frame(y = dens$x, density = dens$y)

# scale density to a maximum of 1
densdf$scaled <- densdf$density / max(densdf$density, na.rm = TRUE)

# trim density outside of the data range
if (trim) {
densdf <- subset(densdf, y > min(data$y, na.rm = TRUE) & y < max(data$y, na.rm = TRUE))
range <- range(data$y, na.rm = TRUE)
} else {
range <- scale_dimension(scales$y, c(0, 0))
}
# NB: equivalently, we could have used these bounds in the from and
# to arguments of density()

# scale density by the number of observations
densdf$count <- densdf$density * n
# record the number of observations to be able to scale the density later
densdf$n <- n

# coordinate on the x axis
densdf$x <- if (is.factor(data$x)) data$x[1] else mean(range(data$x))
dens <- compute_density(data$y, data$w, from = range[1], to = range[2],
adjust = adjust, kernel = kernel)

# width of the bounding box of the violin plot on the x axis for continuous x
if (length(unique(data$x)) > 1) { width <- diff(range(data$x)) * 0.9 }
densdf$width <- width
dens$y <- dens$x
dens$x <- mean(range(data$x))

densdf
# Compute width if x has multiple values
if (length(unique(data$x)) > 1) {
width <- diff(range(data$x)) * 0.9
}
dens$width <- width
dens
}

default_geom <- function(.) GeomViolin
required_aes <- c("x", "y")

})
13 changes: 13 additions & 0 deletions inst/tests/test-stat-density.R
@@ -0,0 +1,13 @@
context("stat_density") # and stat_ydensity

test_that("compute_density succeeds when variance is zero", {
dens <- compute_density(rep(0, 10), NULL, from = 0.5, to = 0.5)
expect_equal(dens$n, rep(10, 512))
})

test_that("compute_density returns useful df when <3 values", {
dens <- compute_density(c(1, 2), NULL, from = 0, to = 0)

expect_equal(nrow(dens), 2)
expect_equal(names(dens), c("x", "density", "scaled", "count", "n"))
})
10 changes: 6 additions & 4 deletions man/stat_density.Rd
Expand Up @@ -26,10 +26,12 @@ on this layer}
\item{kernel}{kernel used for density estimation, see
\code{\link{density}} for details}

\item{trim}{if \code{TRUE}, the default, densities are trimmed to the
actual range of the data. If \code{FALSE}, they are extended by the
default 3 bandwidths (as specified by the \code{cut} parameter to
\code{\link{density}})}
\item{trim}{This parameter only matters if you are displaying multiple
densities in one plot. If \code{FALSE}, the default, each density is
computed on the full range of the data. If \code{TRUE}, each density
is computed over the range of that group: this typically means the
estimated x values will not line-up, and hence you won't be able to
stack density values.}
\item{na.rm}{If \code{FALSE} (the default), removes missing values with
a warning. If \code{TRUE} silently removes missing values.}
Expand Down
8 changes: 6 additions & 2 deletions man/stat_ydensity.Rd
Expand Up @@ -26,8 +26,12 @@ on this layer}
\item{kernel}{kernel used for density estimation, see
\code{\link{density}} for details}

\item{trim}{If \code{TRUE} (default), trim the tails of the violins
to the range of the data. If \code{FALSE}, don't trim the tails.}
\item{trim}{This parameter only matters if you are displaying multiple
densities in one plot. If \code{FALSE}, the default, each density is
computed on the full range of the data. If \code{TRUE}, each density
is computed over the range of that group: this typically means the
estimated x values will not line-up, and hence you won't be able to
stack density values.}
\item{scale}{if "area" (default), all violins have the same area (before trimming
the tails). If "count", areas are scaled proportionally to the number of
Expand Down

0 comments on commit d58b00a

Please sign in to comment.