tidymodels · topepo · Feb 3, 2021 · Dec 18, 2020 · Dec 24, 2020 · Dec 27, 2020
diff --git a/NAMESPACE b/NAMESPACE
@@ -23,6 +23,7 @@ S3method(bake,step_ica)
 S3method(bake,step_impute_bag)
 S3method(bake,step_impute_knn)
 S3method(bake,step_impute_linear)
+S3method(bake,step_indicate_na)
 S3method(bake,step_impute_lower)
 S3method(bake,step_impute_mean)
 S3method(bake,step_impute_median)
@@ -115,6 +116,7 @@ S3method(prep,step_ica)
 S3method(prep,step_impute_bag)
 S3method(prep,step_impute_knn)
 S3method(prep,step_impute_linear)
+S3method(prep,step_indicate_na)
 S3method(prep,step_impute_lower)
 S3method(prep,step_impute_mean)
 S3method(prep,step_impute_median)
@@ -244,6 +246,7 @@ S3method(tidy,step_ica)
 S3method(tidy,step_impute_bag)
 S3method(tidy,step_impute_knn)
 S3method(tidy,step_impute_linear)
+S3method(tidy,step_indicate_na)
 S3method(tidy,step_impute_lower)
 S3method(tidy,step_impute_mean)
 S3method(tidy,step_impute_median)
@@ -401,6 +404,7 @@ export(step_ica)
 export(step_impute_bag)
 export(step_impute_knn)
 export(step_impute_linear)
+export(step_indicate_na)
 export(step_impute_lower)
 export(step_impute_mean)
 export(step_impute_median)

diff --git a/NEWS.md b/NEWS.md
@@ -3,6 +3,7 @@
 * Two new selectors that combine role and data type were added: `all_numeric_predictors()` and `all_nominal_predictors()`. (#620)
 
 * The `threshold`argument of `step_pca()` is now `tunable()` (#534).
+* Added a new step called `step_indicate_na()`, which will create and append additional binary columns to the dataset to indicate which observations are missing (#623)
 
 * Integer variables used in `step_profile()` are now kept as integers (and not doubles). 
 

diff --git a/R/naindicate.R b/R/naindicate.R
@@ -0,0 +1,147 @@
+#' Create Missing Data Column Indicators
+#'
+#' `step_indicate_na` creates a *specification* of a recipe step that will
+#'  create and append additional binary columns to the dataset to indicate
+#'  which observations are missing.
+#'
+#' @param recipe A recipe object. The check will be added to the
+#'  sequence of operations for this recipe.
+#' @param ... One or more selector functions to choose which variables are
+#'  affected by the step. See [selections()] for more details. For the `tidy`
+#'  method, these are not currently used.
+#' @param role For model terms created by this step, what analysis
+#'  role should they be assigned?. By default, the function assumes
+#'  that the new na indicator columns created from the original
+#'  variables will be used as predictors in a model.
+#' @param trained A logical for whether the selectors in `...`
+#' have been resolved by [prep()].
+#' @param columns A character string of variable names that will
+#'  be populated (eventually) by the terms argument.
+#' @param prefix A character string that will be the prefix to the
+#'  resulting new variables. Defaults to "na_ind".
+#' @param skip A logical. Should the check be skipped when the
+#'  recipe is baked by [bake.recipe()]? While all operations are baked
+#'  when [prep.recipe()] is run, some operations may not be able to be
+#'  conducted on new data (e.g. processing the outcome variable(s)).
+#'  Care should be taken when using `skip = TRUE` as it may affect
+#'  the computations for subsequent operations.
+#' @param id A character string that is unique to this step to identify it.
+#' @return An updated version of `recipe` with the new step added to the
+#'  sequence of existing steps (if any). For the `tidy` method, a tibble with
+#'  columns `terms` (the selectors or variables selected) and `model` (the
+#'  median value).
+#' @keywords datagen
+#' @concept preprocessing
+#' @concept imputation
+#' @export
+#' @examples
+#' library(modeldata)
+#' data("credit_data")
+#'
+#' ## missing data per column
+#' purrr::map_dbl(credit_data, function(x) mean(is.na(x)))
+#'
+#' set.seed(342)
+#' in_training <- sample(1:nrow(credit_data), 2000)
+#'
+#' credit_tr <- credit_data[ in_training, ]
+#' credit_te <- credit_data[-in_training, ]
+#'
+#' rec <- recipe(Price ~ ., data = credit_tr)
+#'
+#' impute_rec <- rec %>%
+#'   step_indicate_na(Income, Assets, Debt)
+#'
+#' imp_models <- prep(impute_rec, training = credit_tr)
+#'
+#' imputed_te <- bake(imp_models, new_data = credit_te, everything())
+
+step_indicate_na <-
+  function(recipe,
+           ...,
+           role = "predictor",
+           trained = FALSE,
+           columns = NULL,
+           prefix = "na_ind",
+           skip = FALSE,
+           id = rand_id("indicate_na")) {
+
+    terms = ellipse_check(...)
+
+    add_step(
+      recipe,
+      step_indicate_na_new(
+        terms = terms,
+        role = role,
+        trained = trained,
+        columns = columns,
+        prefix = prefix,
+        skip = skip,
+        id = id
+      )
+    )
+  }
+
+step_indicate_na_new <-
+  function(terms, role, trained, columns, prefix, skip, id) {
+    step(
+      subclass = "indicate_na",
+      terms = terms,
+      role = role,
+      trained = trained,
+      columns = columns,
+      prefix = prefix,
+      skip = skip,
+      id = id
+    )
+  }
+
+#' @export
+prep.step_indicate_na <- function(x, training, info = NULL, ...) {
+  col_names <- terms_select(x$terms, info)
+
+  step_indicate_na_new(
+    terms = x$terms,
+    role = x$role,
+    trained = TRUE,
+    columns = col_names,
+    prefix = x$prefix,
+    skip = x$skip,
+    id = x$id
+  )
+}
+
+#' @export
+bake.step_indicate_na <- function(object, new_data, ...) {
+  col_names <- object$columns
+
+  df_ind_na <- purrr::map_dfc(
+      new_data[col_names],
+      ~ifelse(is.na(.x), 1L, 0L)
+    ) %>%
+    dplyr::rename_with(~paste0(object$prefix, "_", .x))
+  new_data <- dplyr::bind_cols(new_data, df_ind_na)
+
+  tibble::as_tibble(new_data)
+}
+
+print.step_indicate_na <-
+  function(x, width = max(20, options()$width - 30), ...) {
+    cat("Creating missing data variable indicators for ", sep = "")
+    printer(x$columns, x$terms, x$trained, width = width)
+    invisible(x)
+  }
+
+#' @rdname step_indicate_na
+#' @param x A `step_indicate_na` object.
+#' @export
+tidy.step_indicate_na <- function(x, ...) {
+  if (is_trained(x)) {
+    res <- tibble::tibble(terms = x$columns)
+  } else {
+    res <- tibble::tibble(terms = sel2char(x$terms))
+  }
+  res$id <- x$id
+  res
+}
+
diff --git a/man/step_indicate_na.Rd b/man/step_indicate_na.Rd
diff --git a/tests/testthat/test_naindicate.R b/tests/testthat/test_naindicate.R
@@ -0,0 +1,100 @@
+library(testthat)
+library(recipes)
+library(tidyr)
+
+context("step_indicate_na")
+
+train <-
+  tibble::tibble(
+    col1 = c("a", "b", "c"),
+    col2 = c(NA, "d", "e"),
+    col3 = c("k", NA, "g")
+  )
+
+test <-
+  tibble::tibble(
+    col1 = c(NA, NA, NA),
+    col2 = c("t", "d", "e"),
+    col3 = c("z", "f", NA)
+  )
+
+test_that("step_indicate_na populates binaries correctly", {
+
+  rec1 <- recipe(train) %>%
+    step_indicate_na(col1) %>%
+    prep(train, verbose = FALSE, retain = TRUE)
+
+  expect_equal(
+    bake(rec1, train)$na_ind_col1,
+    c(0, 0, 0)
+  )
+
+  expect_equal(
+    bake(rec1, test)$na_ind_col1,
+    c(1, 1, 1)
+  )
+
+  rec2 <- recipe(train) %>%
+    step_indicate_na(col2, col3) %>%
+    prep(train, verbose = FALSE, retain = TRUE)
+
+  expect_equal(bake(rec2, train)$na_ind_col2, c(1, 0, 0))
+  expect_equal(bake(rec2, train)$na_ind_col3, c(0, 1, 0))
+
+  expect_equal(bake(rec2, test)$na_ind_col2, c(0, 0, 0))
+  expect_equal(bake(rec2, test)$na_ind_col3, c(0, 0, 1))
+
+})
+
+test_that("step_indicate_na on all columns", {
+
+  baked <- recipe(Ozone ~ ., data = airquality) %>%
+    step_indicate_na(everything()) %>%
+    prep(airquality, verbose = FALSE, retain = TRUE) %>%
+    juice()
+
+  expect_named(
+    baked,
+    c("Solar.R", "Wind", "Temp", "Month", "Day", "Ozone",
+      "na_ind_Solar.R", "na_ind_Wind", "na_ind_Temp",
+      "na_ind_Month", "na_ind_Day", "na_ind_Ozone"
+      )
+    )
+})
+
+test_that("step_indicate_na on subset of columns", {
+
+  baked <- recipe(Ozone ~ ., data = airquality) %>%
+    step_indicate_na(Ozone, Solar.R) %>%
+    prep(airquality, verbose = FALSE, retain = TRUE) %>%
+    juice()
+
+  expect_named(
+    baked,
+    c("Solar.R", "Wind", "Temp", "Month", "Day",
+      "Ozone", "na_ind_Ozone", "na_ind_Solar.R"
+      )
+    )
+
+  baked2 <- recipe(Ozone ~ ., data = airquality) %>%
+    step_indicate_na(Solar.R) %>%
+    prep(airquality, verbose = FALSE, retain = TRUE) %>%
+    juice()
+
+  expect_named(
+    baked2,
+    c("Solar.R", "Wind", "Temp", "Month", "Day",
+      "Ozone", "na_ind_Solar.R"
+      )
+    )
+})
+
+test_that("something prints", {
+  rec <- recipe(Ozone ~ ., data = airquality) %>%
+    step_indicate_na(all_predictors())
+
+  expect_output(print(rec))
+  expect_output(prep(rec, training = airquality, verbose = TRUE))
+})
+
+