From db2cbba46c6b7d166e80b7a6229f85a070a8c262 Mon Sep 17 00:00:00 2001
From: hadley <h.wickham@gmail.com>
Date: Thu, 6 Oct 2016 09:56:50 -0500
Subject: [PATCH] Elucidate & implement consistent principles for discrete NA

Fixes #1584
---
 NEWS.md                              | 28 +++++++++++++++++
 R/geom-point.r                       |  4 +--
 R/guide-legend.r                     | 11 ++-----
 R/range.r                            |  4 +--
 R/scale-.r                           | 19 +++++++++---
 R/scale-discrete-.r                  |  2 +-
 tests/testthat/test-scale-discrete.R | 46 ++++++++++++++++++++++++++++
 7 files changed, 96 insertions(+), 18 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 08ed0527b4..d447b0e5ed 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,33 @@
 # ggplot2 2.1.0.9000 
 
+*   Overhaul of how discrete missing values are treated (#1584). The underlying 
+    principle is that we can naturally represent missing values on discrete 
+    variables (by treating just like another level), so by default
+    we should. 
+
+    This principle applies to:
+   
+    * character vectors
+    * factors with implicit NA
+    * factors with explicit NA
+   
+    And to all scales (both position and non-position.)
+
+    Compared to the previous version of ggplot2, that means there are 3 main
+    changes:
+    
+    * `scale_x_discrete()` and `scale_y_discrete()` always show discrete NA,
+      regardless of their source
+    
+    * If present, `NA`s are shown in discete legends.
+    
+    * All discrete scales gain a `na.translate` argument that allows you to 
+      control whether `NA`s are translated to something that can be visualised
+      or left as missing. Note that if you leave as is (i.e. 
+      `na.translate = FALSE)` they will passed on to the layer, which
+      will create warnings about dropping missing values. To suppress those,
+      you'll also need to add `na.rm = TRUE` to the layer call. 
+
 * Added scales `scale_x_time()` and `scale_y_time()` which are applied
   automatically when you plot objects of type hms (#1752).
   
diff --git a/R/geom-point.r b/R/geom-point.r
index b7c0ebf41e..c7f65e73a5 100644
--- a/R/geom-point.r
+++ b/R/geom-point.r
@@ -28,7 +28,7 @@
 #'
 #' @seealso \code{\link{scale_size}} to see scale area of points, instead of
 #'   radius, \code{\link{geom_jitter}} to jitter points to reduce (mild)
-#'   overplotting, \code{\link{geom_count}} to count the number of points 
+#'   overplotting, \code{\link{geom_count}} to count the number of points
 #'   at unique locations on a scatterplot and map the size of the point to
 #'   the count.
 #' @inheritParams layer
@@ -119,7 +119,7 @@ geom_point <- function(mapping = NULL, data = NULL,
 #' @export
 GeomPoint <- ggproto("GeomPoint", Geom,
   required_aes = c("x", "y"),
-  non_missing_aes = c("size", "shape"),
+  non_missing_aes = c("size", "shape", "colour"),
   default_aes = aes(
     shape = 19, colour = "black", size = 1.5, fill = NA,
     alpha = NA, stroke = 0.5
diff --git a/R/guide-legend.r b/R/guide-legend.r
index 7f01aaa8ca..695ae1c991 100644
--- a/R/guide-legend.r
+++ b/R/guide-legend.r
@@ -216,16 +216,9 @@ guide_train.legend <- function(guide, scale) {
     stringsAsFactors = FALSE)
   key$.label <- scale$get_labels(breaks)
 
-  # this is a quick fix for #118
-  # some scales have NA as na.value (e.g., size)
-  # some scales have non NA as na.value (e.g., "grey50" for colour)
-  # drop rows if data (instead of the mapped value) is NA
-  #
-  # Also, drop out-of-range values for continuous scale
+  # Drop out-of-range values for continuous scale
   # (should use scale$oob?)
-  if (scale$is_discrete()) {
-    key <- key[!is.na(breaks), , drop = FALSE]
-  } else {
+  if (!scale$is_discrete()) {
     limits <- scale$get_limits()
     noob <- !is.na(breaks) & limits[1] <= breaks & breaks <= limits[2]
     key <- key[noob, , drop = FALSE]
diff --git a/R/range.r b/R/range.r
index 5d363ad6a0..f2559efb84 100644
--- a/R/range.r
+++ b/R/range.r
@@ -13,8 +13,8 @@ Range <- ggproto("Range", NULL,
 )
 
 RangeDiscrete <- ggproto("RangeDiscrete", Range,
-  train = function(self, x, drop = FALSE) {
-    self$range <- scales::train_discrete(x, self$range, drop)
+  train = function(self, x, drop = FALSE, na.rm = FALSE) {
+    self$range <- scales::train_discrete(x, self$range, drop = drop, na.rm = na.rm)
   }
 )
 
diff --git a/R/scale-.r b/R/scale-.r
index d12646b996..95b0f69071 100644
--- a/R/scale-.r
+++ b/R/scale-.r
@@ -368,7 +368,7 @@ ScaleDiscrete <- ggproto("ScaleDiscrete", Scale,
 
   train = function(self, x) {
     if (length(x) == 0) return()
-    self$range$train(x, drop = self$drop)
+    self$range$train(x, drop = self$drop, na.rm = !self$na.translate)
   },
 
   transform = function(x) {
@@ -393,7 +393,11 @@ ScaleDiscrete <- ggproto("ScaleDiscrete", Scale,
       pal_match <- unname(pal_match)
     }
 
-    ifelse(is.na(x) | is.na(pal_match), self$na.value, pal_match)
+    if (self$na.translate) {
+      ifelse(is.na(x) | is.na(pal_match), self$na.value, pal_match)
+    } else {
+      pal_match
+    }
   },
 
   dimension = function(self, expand = c(0, 0)) {
@@ -631,7 +635,12 @@ continuous_scale <- function(aesthetics, scale_name, palette, name = waiver(),
 #'   additive constant used to expand the range of the scales so that there
 #'   is a small gap between the data and the axes. The defaults are (0,0.6)
 #'   for discrete scales and (0.05,0) for continuous scales.
-#' @param na.value how should missing values be displayed?
+#' @param na.translate Unlike continuous scales, discrete scales can easily show
+#'   missing values, and do so by default. If you want to remove missing values
+#'   from a discrete scale, specify \code{na.translate = FALSE}.
+#' @param na.value If \code{na.translate = TRUE}, what value aesthetic
+#'   value should missing be displayed as? Does not apply to position scales
+#'   where \code{NA} is always placed at the far right.
 #' @param guide the name of, or actual function, used to create the
 #'   guide. See \code{\link{guides}} for more info.
 #' @param position The position of the axis. "left" or "right" for vertical
@@ -640,7 +649,8 @@ continuous_scale <- function(aesthetics, scale_name, palette, name = waiver(),
 #' @keywords internal
 discrete_scale <- function(aesthetics, scale_name, palette, name = waiver(),
   breaks = waiver(), labels = waiver(), limits = NULL, expand = waiver(),
-  na.value = NA, drop = TRUE, guide = "legend", position = "left", super = ScaleDiscrete) {
+  na.translate = TRUE, na.value = NA, drop = TRUE,
+  guide = "legend", position = "left", super = ScaleDiscrete) {
 
   check_breaks_labels(breaks, labels)
 
@@ -660,6 +670,7 @@ discrete_scale <- function(aesthetics, scale_name, palette, name = waiver(),
     range = discrete_range(),
     limits = limits,
     na.value = na.value,
+    na.translate = na.translate,
     expand = expand,
 
     name = name,
diff --git a/R/scale-discrete-.r b/R/scale-discrete-.r
index 046b63bd0e..c75f9bad48 100644
--- a/R/scale-discrete-.r
+++ b/R/scale-discrete-.r
@@ -77,7 +77,7 @@ scale_y_discrete <- function(..., expand = waiver(), position = "left") {
 ScaleDiscretePosition <- ggproto("ScaleDiscretePosition", ScaleDiscrete,
   train = function(self, x) {
     if (is.discrete(x)) {
-      self$range$train(x, drop = self$drop)
+      self$range$train(x, drop = self$drop, na.rm = !self$na.translate)
     } else {
       self$range_c$train(x)
     }
diff --git a/tests/testthat/test-scale-discrete.R b/tests/testthat/test-scale-discrete.R
index a7f31356a1..3dd26c9b53 100644
--- a/tests/testthat/test-scale-discrete.R
+++ b/tests/testthat/test-scale-discrete.R
@@ -1,5 +1,51 @@
 context("scale_discrete")
 
+# Missing values ----------------------------------------------------------
+
+df <- tibble::tibble(
+  x1 = c("a", "b", NA),
+  x2 = factor(x1),
+  x3 = addNA(x2),
+
+  y = 1:3
+)
+
+test_that("NAs translated/preserved for position scales", {
+  p1a <- ggplot(df, aes(x1, y)) + geom_point()
+  p2a <- ggplot(df, aes(x2, y)) + geom_point()
+  p3a <- ggplot(df, aes(x3, y)) + geom_point()
+
+  expect_equal(layer_data(p1a)$x, c(1, 2, 3))
+  expect_equal(layer_data(p2a)$x, c(1, 2, 3))
+  expect_equal(layer_data(p3a)$x, c(1, 2, 3))
+
+  rm_na_x <- scale_x_discrete(na.translate = FALSE)
+  p1b <- p1a + rm_na_x
+  p2b <- p2a + rm_na_x
+  p3b <- p3a + rm_na_x
+
+  expect_equal(layer_data(p1b)$x, c(1, 2, NA))
+  expect_equal(layer_data(p2b)$x, c(1, 2, NA))
+  expect_equal(layer_data(p3b)$x, c(1, 2, NA))
+})
+
+test_that("NAs translated/preserved for non-position scales", {
+  p1a <- ggplot(df, aes(y, y, colour = x1)) + geom_point()
+  p2a <- ggplot(df, aes(y, y, colour = x2)) + geom_point()
+  p3a <- ggplot(df, aes(y, y, colour = x3)) + geom_point()
+  expect_equal(layer_data(p1a)$colour, c("#F8766D", "#00BFC4", "grey50"))
+  expect_equal(layer_data(p2a)$colour, c("#F8766D", "#00BFC4", "grey50"))
+  expect_equal(layer_data(p3a)$colour, c("#F8766D", "#00BFC4", "grey50"))
+
+  rm_na_colour <- scale_colour_discrete(na.translate = FALSE)
+  p1b <- p1a + rm_na_colour
+  p2b <- p2a + rm_na_colour
+  p3b <- p3a + rm_na_colour
+  expect_equal(layer_data(p1b)$colour, c("#F8766D", "#00BFC4", NA))
+  expect_equal(layer_data(p2b)$colour, c("#F8766D", "#00BFC4", NA))
+  expect_equal(layer_data(p3b)$colour, c("#F8766D", "#00BFC4", NA))
+})
+
 # Ranges ------------------------------------------------------------------
 
 test_that("discrete ranges also encompas continuous values", {