From ab9bf68a5dfb9603bc1719aaf4d67f37ce91a3fc Mon Sep 17 00:00:00 2001 From: topepo Date: Fri, 28 Jun 2019 19:21:17 -0400 Subject: [PATCH] changes for #338 and #290 --- NEWS.md | 17 +++++++-- R/dummy.R | 6 +-- R/other.R | 73 ++++++++++++++++++++++++------------- man/step_dummy.Rd | 2 +- man/step_other.Rd | 3 ++ tests/testthat/test_other.R | 20 +++++++++- 6 files changed, 86 insertions(+), 35 deletions(-) diff --git a/NEWS.md b/NEWS.md index 3049956bd..acb02ac67 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,20 +1,29 @@ # `recipes` 0.1.6 +## Breaking Changes + + * Previously, if `step_other()` did _not_ collapse any levels, it would still add an "other" level to the factor. This would lump new factor levels into "other" when data were baked (as `step_novel()` does). This no longer occurs since it was inconsistent with `?step_other`, which said that + + > "If no pooling is done the data are unmodified". + +## New Operations: + +* `step_normalize` centers and scales the data (if you are, like Max, too lazy to use two separate steps). + ## Other Changes: * `step_knnimpute` can now pass two options to the underlying knn code, including the number of threads ([#323](https://github.com/tidymodels/recipes/issues/323)). * Due to changes by CRAN, `step_nnmf` only works on versions of R >= 3.6.0 due to dependency issues. -* `step_dummy()` is now tolerant to cases where that step's selectors do not capture any columns. In this case, no dummy variables are created. ([#348](https://github.com/tidymodels/recipes/issues/348)) +* `step_dummy()` and `step_other()` are now tolerant to cases where that step's selectors do not capture any columns. In this case, no modifications to the data are made. ([#290](https://github.com/tidymodels/recipes/issues/290), [#348](https://github.com/tidymodels/recipes/issues/348)) * `step_dummy()` can now retain the original columns that are used to make the dummy variables. ([#328](https://github.com/tidymodels/recipes/issues/328)) -## New Operations: +* `step_other()`'s print method only reports the variables with collapsed levels (as opposed to any column that was _tested_ to see if it needed collapsing). ([#338](https://github.com/tidymodels/recipes/issues/338)) + -* `step_normalize` centers and scales the data (if you are, like Max, too lazy to use two separate steps). - # `recipes` 0.1.5 Small release driven by changes in `sample()` in the current r-devel. diff --git a/R/dummy.R b/R/dummy.R index e6dcbde62..dd916de3e 100644 --- a/R/dummy.R +++ b/R/dummy.R @@ -18,7 +18,7 @@ #' variables will be used as predictors in a model. #' @param one_hot A logical. For C levels, should C dummy variables be created #' rather than C-1? -#' @param preserve A sinlge logical; should the selected column(s) be retained +#' @param preserve A single logical; should the selected column(s) be retained #' (in addition to the new dummy variables). #' @param naming A function that defines the naming convention for #' new dummy columns. See Details below. @@ -155,7 +155,7 @@ step_dummy_new <- ) } -no_dummies <- function(cmd) { +passover <- function(cmd) { # cat("`step_dummy()` was not able to select any columns. ", # "No dummy variables will be created.\n") } # figure out how to return a warning without exiting @@ -164,7 +164,7 @@ no_dummies <- function(cmd) { #' @importFrom dplyr bind_cols #' @export prep.step_dummy <- function(x, training, info = NULL, ...) { - col_names <- terms_select(x$terms, info = info, empty_fun = no_dummies) + col_names <- terms_select(x$terms, info = info, empty_fun = passover) if (length(col_names) > 0) { fac_check <- vapply(training[, col_names], is.factor, logical(1)) diff --git a/R/other.R b/R/other.R index f2df49893..0acc8cb3c 100644 --- a/R/other.R +++ b/R/other.R @@ -43,6 +43,9 @@ #' thrown. If `other` is in the list of discarded levels, no error #' occurs. #' +#' If no pooling is done, novel factor levels are converted to missing. If +#' pooling is needed, they will be placed into the other category. +#' #' When data to be processed contains novel levels (i.e., not #' contained in the training set), the other category is assigned. #' @seealso [step_factor2string()], [step_string2factor()], @@ -120,11 +123,17 @@ step_other_new <- #' @importFrom stats sd #' @export prep.step_other <- function(x, training, info = NULL, ...) { - col_names <- terms_select(x$terms, info = info) - objects <- lapply(training[, col_names], - keep_levels, - prop = x$threshold, - other = x$other) + col_names <- terms_select(x$terms, info = info, empty_fun = passover) + + if (length(col_names) > 0) { + objects <- lapply(training[, col_names], + keep_levels, + prop = x$threshold, + other = x$other) + } else { + objects <- NULL + } + step_other_new( terms = x$terms, role = x$role, @@ -140,25 +149,27 @@ prep.step_other <- function(x, training, info = NULL, ...) { #' @importFrom tibble as_tibble is_tibble #' @export bake.step_other <- function(object, new_data, ...) { - for (i in names(object$objects)) { - if (object$objects[[i]]$collapse) { - tmp <- if (!is.character(new_data[, i])) - as.character(getElement(new_data, i)) - else - getElement(new_data, i) - - tmp <- ifelse( - !(tmp %in% object$objects[[i]]$keep) & !is.na(tmp), - object$objects[[i]]$other, - tmp - ) + if (!is.null(object$objects)) { + for (i in names(object$objects)) { + if (object$objects[[i]]$collapse) { + tmp <- if (!is.character(new_data[, i])) + as.character(getElement(new_data, i)) + else + getElement(new_data, i) + + tmp <- ifelse( + !(tmp %in% object$objects[[i]]$keep) & !is.na(tmp), + object$objects[[i]]$other, + tmp + ) - # assign other factor levels other here too. - tmp <- factor(tmp, - levels = c(object$objects[[i]]$keep, - object$objects[[i]]$other)) + # assign other factor levels other here too. + tmp <- factor(tmp, + levels = c(object$objects[[i]]$keep, + object$objects[[i]]$other)) - new_data[, i] <- tmp + new_data[, i] <- tmp + } } } if (!is_tibble(new_data)) @@ -168,8 +179,20 @@ bake.step_other <- function(object, new_data, ...) { print.step_other <- function(x, width = max(20, options()$width - 30), ...) { - cat("Collapsing factor levels for ", sep = "") - printer(names(x$objects), x$terms, x$trained, width = width) + + if (x$trained) { + collapsed <- map_lgl(x$objects, ~ .x$collapse) + collapsed <- names(collapsed)[collapsed] + if (length(collapsed) > 0) { + cat("Collapsing factor levels for ", sep = "") + printer(collapsed, x$terms, x$trained, width = width) + } else { + cat("No factor levels were collapsed\n") + } + } else { + cat("Collapsing factor levels for ", sep = "") + printer(names(x$objects), x$terms, x$trained, width = width) + } invisible(x) } @@ -198,7 +221,7 @@ keep_levels <- function(x, prop = .1, other = "other") { ) list(keep = orig[orig %in% keepers], - collapse = TRUE, # not needed but kept for old versions + collapse = length(dropped) > 0, other = other) } diff --git a/man/step_dummy.Rd b/man/step_dummy.Rd index 7170b29be..a1c8b5a7e 100644 --- a/man/step_dummy.Rd +++ b/man/step_dummy.Rd @@ -32,7 +32,7 @@ preprocessing have been estimated.} \item{one_hot}{A logical. For C levels, should C dummy variables be created rather than C-1?} -\item{preserve}{A sinlge logical; should the original column(s) be retained +\item{preserve}{A single logical; should the selected column(s) be retained (in addition to the new dummy variables).} \item{naming}{A function that defines the naming convention for diff --git a/man/step_other.Rd b/man/step_other.Rd index 5e4dbd337..ef32206fa 100644 --- a/man/step_other.Rd +++ b/man/step_other.Rd @@ -76,6 +76,9 @@ If the retained categories include the value of \code{other}, an error is thrown. If \code{other} is in the list of discarded levels, no error occurs. +If no pooling is done, novel factor levels are converted to missing. If +pooling is needed, they will be placed into the other category. + When data to be processed contains novel levels (i.e., not contained in the training set), the other category is assigned. } diff --git a/tests/testthat/test_other.R b/tests/testthat/test_other.R index 5deba789d..3217b0ebb 100644 --- a/tests/testthat/test_other.R +++ b/tests/testthat/test_other.R @@ -155,7 +155,7 @@ test_that('novel levels', { df <- data.frame( y = c(1,0,1,1,0,0,0,1,1,1,0,0,1,0,1,0,0,0,1,0), x1 = c('A','B','B','B','B','A','A','A','B','A','A','B', - 'A','C','C','B','A','B','C','A'), + 'A','C','C','B','A','B','C','D'), stringsAsFactors = FALSE) training <- df[1:10,] testing <- df[11:20,] @@ -170,8 +170,24 @@ test_that('novel levels', { novel_level <- prep(novel_level, training = training, retain = TRUE) new_results <- bake(novel_level, new_data = testing) orig_results <- bake(novel_level, new_data = training) - expect_true(all(new_results$x1[testing$x1 == "C"] == "other")) + expect_true(all(is.na(new_results$x1[testing$x1 == "C"]))) expect_true(!any(orig_results$x1 == "other")) + + training <- df[1:14,] + testing <- df[15:20,] + training$y <- as.factor(training$y) + training$x1 <- as.factor(training$x1) + testing$y <- as.factor(testing$y) + testing$x1 <- as.factor(testing$x1) + + novel_level <- recipe(y ~ ., data = training) %>% + step_other(x1, threshold = .1) + + novel_level <- prep(novel_level, training = training, retain = TRUE) + new_results <- bake(novel_level, new_data = testing) + orig_results <- bake(novel_level, new_data = training) + expect_true(all(new_results$x1[testing$x1 == "D"] == "other")) + expect_true(any(new_results$x1 == "other")) }) test_that("'other' already in use", {