From db2cbba46c6b7d166e80b7a6229f85a070a8c262 Mon Sep 17 00:00:00 2001 From: hadley Date: Thu, 6 Oct 2016 09:56:50 -0500 Subject: [PATCH] Elucidate & implement consistent principles for discrete NA Fixes #1584 --- NEWS.md | 28 +++++++++++++++++ R/geom-point.r | 4 +-- R/guide-legend.r | 11 ++----- R/range.r | 4 +-- R/scale-.r | 19 +++++++++--- R/scale-discrete-.r | 2 +- tests/testthat/test-scale-discrete.R | 46 ++++++++++++++++++++++++++++ 7 files changed, 96 insertions(+), 18 deletions(-) diff --git a/NEWS.md b/NEWS.md index 08ed0527b4..d447b0e5ed 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,33 @@ # ggplot2 2.1.0.9000 +* Overhaul of how discrete missing values are treated (#1584). The underlying + principle is that we can naturally represent missing values on discrete + variables (by treating just like another level), so by default + we should. + + This principle applies to: + + * character vectors + * factors with implicit NA + * factors with explicit NA + + And to all scales (both position and non-position.) + + Compared to the previous version of ggplot2, that means there are 3 main + changes: + + * `scale_x_discrete()` and `scale_y_discrete()` always show discrete NA, + regardless of their source + + * If present, `NA`s are shown in discete legends. + + * All discrete scales gain a `na.translate` argument that allows you to + control whether `NA`s are translated to something that can be visualised + or left as missing. Note that if you leave as is (i.e. + `na.translate = FALSE)` they will passed on to the layer, which + will create warnings about dropping missing values. To suppress those, + you'll also need to add `na.rm = TRUE` to the layer call. + * Added scales `scale_x_time()` and `scale_y_time()` which are applied automatically when you plot objects of type hms (#1752). diff --git a/R/geom-point.r b/R/geom-point.r index b7c0ebf41e..c7f65e73a5 100644 --- a/R/geom-point.r +++ b/R/geom-point.r @@ -28,7 +28,7 @@ #' #' @seealso \code{\link{scale_size}} to see scale area of points, instead of #' radius, \code{\link{geom_jitter}} to jitter points to reduce (mild) -#' overplotting, \code{\link{geom_count}} to count the number of points +#' overplotting, \code{\link{geom_count}} to count the number of points #' at unique locations on a scatterplot and map the size of the point to #' the count. #' @inheritParams layer @@ -119,7 +119,7 @@ geom_point <- function(mapping = NULL, data = NULL, #' @export GeomPoint <- ggproto("GeomPoint", Geom, required_aes = c("x", "y"), - non_missing_aes = c("size", "shape"), + non_missing_aes = c("size", "shape", "colour"), default_aes = aes( shape = 19, colour = "black", size = 1.5, fill = NA, alpha = NA, stroke = 0.5 diff --git a/R/guide-legend.r b/R/guide-legend.r index 7f01aaa8ca..695ae1c991 100644 --- a/R/guide-legend.r +++ b/R/guide-legend.r @@ -216,16 +216,9 @@ guide_train.legend <- function(guide, scale) { stringsAsFactors = FALSE) key$.label <- scale$get_labels(breaks) - # this is a quick fix for #118 - # some scales have NA as na.value (e.g., size) - # some scales have non NA as na.value (e.g., "grey50" for colour) - # drop rows if data (instead of the mapped value) is NA - # - # Also, drop out-of-range values for continuous scale + # Drop out-of-range values for continuous scale # (should use scale$oob?) - if (scale$is_discrete()) { - key <- key[!is.na(breaks), , drop = FALSE] - } else { + if (!scale$is_discrete()) { limits <- scale$get_limits() noob <- !is.na(breaks) & limits[1] <= breaks & breaks <= limits[2] key <- key[noob, , drop = FALSE] diff --git a/R/range.r b/R/range.r index 5d363ad6a0..f2559efb84 100644 --- a/R/range.r +++ b/R/range.r @@ -13,8 +13,8 @@ Range <- ggproto("Range", NULL, ) RangeDiscrete <- ggproto("RangeDiscrete", Range, - train = function(self, x, drop = FALSE) { - self$range <- scales::train_discrete(x, self$range, drop) + train = function(self, x, drop = FALSE, na.rm = FALSE) { + self$range <- scales::train_discrete(x, self$range, drop = drop, na.rm = na.rm) } ) diff --git a/R/scale-.r b/R/scale-.r index d12646b996..95b0f69071 100644 --- a/R/scale-.r +++ b/R/scale-.r @@ -368,7 +368,7 @@ ScaleDiscrete <- ggproto("ScaleDiscrete", Scale, train = function(self, x) { if (length(x) == 0) return() - self$range$train(x, drop = self$drop) + self$range$train(x, drop = self$drop, na.rm = !self$na.translate) }, transform = function(x) { @@ -393,7 +393,11 @@ ScaleDiscrete <- ggproto("ScaleDiscrete", Scale, pal_match <- unname(pal_match) } - ifelse(is.na(x) | is.na(pal_match), self$na.value, pal_match) + if (self$na.translate) { + ifelse(is.na(x) | is.na(pal_match), self$na.value, pal_match) + } else { + pal_match + } }, dimension = function(self, expand = c(0, 0)) { @@ -631,7 +635,12 @@ continuous_scale <- function(aesthetics, scale_name, palette, name = waiver(), #' additive constant used to expand the range of the scales so that there #' is a small gap between the data and the axes. The defaults are (0,0.6) #' for discrete scales and (0.05,0) for continuous scales. -#' @param na.value how should missing values be displayed? +#' @param na.translate Unlike continuous scales, discrete scales can easily show +#' missing values, and do so by default. If you want to remove missing values +#' from a discrete scale, specify \code{na.translate = FALSE}. +#' @param na.value If \code{na.translate = TRUE}, what value aesthetic +#' value should missing be displayed as? Does not apply to position scales +#' where \code{NA} is always placed at the far right. #' @param guide the name of, or actual function, used to create the #' guide. See \code{\link{guides}} for more info. #' @param position The position of the axis. "left" or "right" for vertical @@ -640,7 +649,8 @@ continuous_scale <- function(aesthetics, scale_name, palette, name = waiver(), #' @keywords internal discrete_scale <- function(aesthetics, scale_name, palette, name = waiver(), breaks = waiver(), labels = waiver(), limits = NULL, expand = waiver(), - na.value = NA, drop = TRUE, guide = "legend", position = "left", super = ScaleDiscrete) { + na.translate = TRUE, na.value = NA, drop = TRUE, + guide = "legend", position = "left", super = ScaleDiscrete) { check_breaks_labels(breaks, labels) @@ -660,6 +670,7 @@ discrete_scale <- function(aesthetics, scale_name, palette, name = waiver(), range = discrete_range(), limits = limits, na.value = na.value, + na.translate = na.translate, expand = expand, name = name, diff --git a/R/scale-discrete-.r b/R/scale-discrete-.r index 046b63bd0e..c75f9bad48 100644 --- a/R/scale-discrete-.r +++ b/R/scale-discrete-.r @@ -77,7 +77,7 @@ scale_y_discrete <- function(..., expand = waiver(), position = "left") { ScaleDiscretePosition <- ggproto("ScaleDiscretePosition", ScaleDiscrete, train = function(self, x) { if (is.discrete(x)) { - self$range$train(x, drop = self$drop) + self$range$train(x, drop = self$drop, na.rm = !self$na.translate) } else { self$range_c$train(x) } diff --git a/tests/testthat/test-scale-discrete.R b/tests/testthat/test-scale-discrete.R index a7f31356a1..3dd26c9b53 100644 --- a/tests/testthat/test-scale-discrete.R +++ b/tests/testthat/test-scale-discrete.R @@ -1,5 +1,51 @@ context("scale_discrete") +# Missing values ---------------------------------------------------------- + +df <- tibble::tibble( + x1 = c("a", "b", NA), + x2 = factor(x1), + x3 = addNA(x2), + + y = 1:3 +) + +test_that("NAs translated/preserved for position scales", { + p1a <- ggplot(df, aes(x1, y)) + geom_point() + p2a <- ggplot(df, aes(x2, y)) + geom_point() + p3a <- ggplot(df, aes(x3, y)) + geom_point() + + expect_equal(layer_data(p1a)$x, c(1, 2, 3)) + expect_equal(layer_data(p2a)$x, c(1, 2, 3)) + expect_equal(layer_data(p3a)$x, c(1, 2, 3)) + + rm_na_x <- scale_x_discrete(na.translate = FALSE) + p1b <- p1a + rm_na_x + p2b <- p2a + rm_na_x + p3b <- p3a + rm_na_x + + expect_equal(layer_data(p1b)$x, c(1, 2, NA)) + expect_equal(layer_data(p2b)$x, c(1, 2, NA)) + expect_equal(layer_data(p3b)$x, c(1, 2, NA)) +}) + +test_that("NAs translated/preserved for non-position scales", { + p1a <- ggplot(df, aes(y, y, colour = x1)) + geom_point() + p2a <- ggplot(df, aes(y, y, colour = x2)) + geom_point() + p3a <- ggplot(df, aes(y, y, colour = x3)) + geom_point() + expect_equal(layer_data(p1a)$colour, c("#F8766D", "#00BFC4", "grey50")) + expect_equal(layer_data(p2a)$colour, c("#F8766D", "#00BFC4", "grey50")) + expect_equal(layer_data(p3a)$colour, c("#F8766D", "#00BFC4", "grey50")) + + rm_na_colour <- scale_colour_discrete(na.translate = FALSE) + p1b <- p1a + rm_na_colour + p2b <- p2a + rm_na_colour + p3b <- p3a + rm_na_colour + expect_equal(layer_data(p1b)$colour, c("#F8766D", "#00BFC4", NA)) + expect_equal(layer_data(p2b)$colour, c("#F8766D", "#00BFC4", NA)) + expect_equal(layer_data(p3b)$colour, c("#F8766D", "#00BFC4", NA)) +}) + # Ranges ------------------------------------------------------------------ test_that("discrete ranges also encompas continuous values", {