Merge pull request #352 from tidymodels/step-other-fixes

changes for #338 and #290
tidymodels · Jun 29, 2019 · b13c47a · b13c47a
2 parents 4504d9b + ab9bf68
commit b13c47a
Show file tree

Hide file tree

Showing 6 changed files with 86 additions and 35 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,20 +1,29 @@
 
 # `recipes` 0.1.6
 
+## Breaking Changes
+
+ * Previously, if `step_other()` did _not_ collapse any levels, it would still add an "other" level to the factor. This would lump new factor levels into "other" when data were baked (as `step_novel()` does). This no longer occurs since it was inconsistent with `?step_other`, which said that 
+
+ > "If no pooling is done the data are unmodified".
+
+## New Operations:
+
+* `step_normalize` centers and scales the data (if you are, like Max, too lazy to use two separate steps). 
+
 ## Other Changes:
 
  * `step_knnimpute` can now pass two options to the underlying knn code, including the number of threads ([#323](https://github.com/tidymodels/recipes/issues/323)). 
 
 * Due to changes by CRAN, `step_nnmf` only works on versions of R >= 3.6.0 due to dependency issues. 
 
-* `step_dummy()` is now tolerant to cases where that step's selectors do not capture any columns. In this case, no dummy variables are created. ([#348](https://github.com/tidymodels/recipes/issues/348))
+* `step_dummy()` and `step_other()` are now tolerant to cases where that step's selectors do not capture any columns. In this case, no modifications to the data are made. ([#290](https://github.com/tidymodels/recipes/issues/290), [#348](https://github.com/tidymodels/recipes/issues/348))
 
 * `step_dummy()` can now retain the original columns that are used to make the dummy variables. ([#328](https://github.com/tidymodels/recipes/issues/328)) 
 
-## New Operations:
+* `step_other()`'s print method only reports the variables with collapsed levels (as opposed to any column that was _tested_ to see if it needed collapsing). ([#338](https://github.com/tidymodels/recipes/issues/338)) 
+
 
-* `step_normalize` centers and scales the data (if you are, like Max, too lazy to use two separate steps). 
-
 # `recipes` 0.1.5
 
 Small release driven by changes in `sample()` in the current r-devel. 

diff --git a/R/dummy.R b/R/dummy.R
@@ -18,7 +18,7 @@
 #'  variables will be used as predictors in a model.
 #' @param one_hot A logical. For C levels, should C dummy variables be created
 #' rather than C-1?
-#' @param preserve A sinlge logical; should the selected column(s) be retained
+#' @param preserve A single logical; should the selected column(s) be retained
 #'  (in addition to the new dummy variables).
 #' @param naming A function that defines the naming convention for
 #'  new dummy columns. See Details below.
@@ -155,7 +155,7 @@ step_dummy_new <-
     )
   }
 
-no_dummies <- function(cmd) {
+passover <- function(cmd) {
   # cat("`step_dummy()` was not able to select any columns. ",
   #     "No dummy variables will be created.\n")
 } # figure out how to return a warning without exiting
@@ -164,7 +164,7 @@ no_dummies <- function(cmd) {
 #' @importFrom dplyr bind_cols
 #' @export
 prep.step_dummy <- function(x, training, info = NULL, ...) {
-  col_names <- terms_select(x$terms, info = info, empty_fun = no_dummies)
+  col_names <- terms_select(x$terms, info = info, empty_fun = passover)
 
   if (length(col_names) > 0) {
     fac_check <- vapply(training[, col_names], is.factor, logical(1))

diff --git a/R/other.R b/R/other.R
@@ -43,6 +43,9 @@
 #'   thrown. If `other` is in the list of discarded levels, no error
 #'   occurs.
 #'
+#' If no pooling is done, novel factor levels are converted to missing. If
+#'  pooling is needed, they will be placed into the other category.
+#'
 #' When data to be processed contains novel levels (i.e., not
 #' contained in the training set), the other category is assigned.
 #' @seealso [step_factor2string()], [step_string2factor()],
@@ -120,11 +123,17 @@ step_other_new <-
 #' @importFrom stats sd
 #' @export
 prep.step_other <- function(x, training, info = NULL, ...) {
-  col_names <- terms_select(x$terms, info = info)
-  objects <- lapply(training[, col_names],
-                    keep_levels,
-                    prop = x$threshold,
-                    other = x$other)
+  col_names <- terms_select(x$terms, info = info, empty_fun = passover)
+
+  if (length(col_names) > 0) {
+    objects <- lapply(training[, col_names],
+                      keep_levels,
+                      prop = x$threshold,
+                      other = x$other)
+  } else {
+    objects <- NULL
+  }
+
   step_other_new(
     terms = x$terms,
     role = x$role,
@@ -140,25 +149,27 @@ prep.step_other <- function(x, training, info = NULL, ...) {
 #' @importFrom tibble as_tibble is_tibble
 #' @export
 bake.step_other <- function(object, new_data, ...) {
-  for (i in names(object$objects)) {
-    if (object$objects[[i]]$collapse) {
-      tmp <- if (!is.character(new_data[, i]))
-        as.character(getElement(new_data, i))
-      else
-        getElement(new_data, i)
-
-      tmp <- ifelse(
-        !(tmp %in% object$objects[[i]]$keep) & !is.na(tmp),
-        object$objects[[i]]$other,
-        tmp
-      )
+  if (!is.null(object$objects)) {
+    for (i in names(object$objects)) {
+      if (object$objects[[i]]$collapse) {
+        tmp <- if (!is.character(new_data[, i]))
+          as.character(getElement(new_data, i))
+        else
+          getElement(new_data, i)
+
+        tmp <- ifelse(
+          !(tmp %in% object$objects[[i]]$keep) & !is.na(tmp),
+          object$objects[[i]]$other,
+          tmp
+        )
 
-      # assign other factor levels other here too.
-      tmp <- factor(tmp,
-                    levels = c(object$objects[[i]]$keep,
-                               object$objects[[i]]$other))
+        # assign other factor levels other here too.
+        tmp <- factor(tmp,
+                      levels = c(object$objects[[i]]$keep,
+                                 object$objects[[i]]$other))
 
-      new_data[, i] <- tmp
+        new_data[, i] <- tmp
+      }
     }
   }
   if (!is_tibble(new_data))
@@ -168,8 +179,20 @@ bake.step_other <- function(object, new_data, ...) {
 
 print.step_other <-
   function(x, width = max(20, options()$width - 30), ...) {
-    cat("Collapsing factor levels for ", sep = "")
-    printer(names(x$objects), x$terms, x$trained, width = width)
+
+    if (x$trained) {
+      collapsed <- map_lgl(x$objects, ~ .x$collapse)
+      collapsed <- names(collapsed)[collapsed]
+      if (length(collapsed) > 0) {
+        cat("Collapsing factor levels for ", sep = "")
+        printer(collapsed, x$terms, x$trained, width = width)
+      } else {
+        cat("No factor levels were collapsed\n")
+      }
+    } else {
+      cat("Collapsing factor levels for ", sep = "")
+      printer(names(x$objects), x$terms, x$trained, width = width)
+    }
     invisible(x)
   }
 
@@ -198,7 +221,7 @@ keep_levels <- function(x, prop = .1, other = "other") {
     )
 
   list(keep = orig[orig %in% keepers],
-       collapse = TRUE, # not needed but kept for old versions
+       collapse = length(dropped) > 0,
        other = other)
 }
 

diff --git a/man/step_dummy.Rd b/man/step_dummy.Rd
diff --git a/man/step_other.Rd b/man/step_other.Rd
diff --git a/tests/testthat/test_other.R b/tests/testthat/test_other.R
@@ -155,7 +155,7 @@ test_that('novel levels', {
   df <- data.frame(
     y = c(1,0,1,1,0,0,0,1,1,1,0,0,1,0,1,0,0,0,1,0),
     x1 = c('A','B','B','B','B','A','A','A','B','A','A','B',
-           'A','C','C','B','A','B','C','A'),
+           'A','C','C','B','A','B','C','D'),
     stringsAsFactors = FALSE)
   training <- df[1:10,]
   testing <- df[11:20,]
@@ -170,8 +170,24 @@ test_that('novel levels', {
   novel_level <- prep(novel_level, training = training, retain = TRUE)
   new_results <- bake(novel_level, new_data = testing)
   orig_results <- bake(novel_level, new_data = training)
-  expect_true(all(new_results$x1[testing$x1 == "C"] == "other"))
+  expect_true(all(is.na(new_results$x1[testing$x1 == "C"])))
   expect_true(!any(orig_results$x1 == "other"))
+
+  training <- df[1:14,]
+  testing <- df[15:20,]
+  training$y <- as.factor(training$y)
+  training$x1 <- as.factor(training$x1)
+  testing$y <- as.factor(testing$y)
+  testing$x1 <- as.factor(testing$x1)
+
+  novel_level <- recipe(y ~ ., data = training) %>%
+    step_other(x1, threshold = .1)
+
+  novel_level <- prep(novel_level, training = training, retain = TRUE)
+  new_results <- bake(novel_level, new_data = testing)
+  orig_results <- bake(novel_level, new_data = training)
+  expect_true(all(new_results$x1[testing$x1 == "D"] == "other"))
+  expect_true(any(new_results$x1 == "other"))
 })
 
 test_that("'other' already in use", {