Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add new step_indicate_na() #623

Merged
merged 5 commits into from
Feb 3, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 4 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ S3method(bake,step_ica)
S3method(bake,step_impute_bag)
S3method(bake,step_impute_knn)
S3method(bake,step_impute_linear)
S3method(bake,step_indicate_na)
S3method(bake,step_impute_lower)
S3method(bake,step_impute_mean)
S3method(bake,step_impute_median)
Expand Down Expand Up @@ -115,6 +116,7 @@ S3method(prep,step_ica)
S3method(prep,step_impute_bag)
S3method(prep,step_impute_knn)
S3method(prep,step_impute_linear)
S3method(prep,step_indicate_na)
S3method(prep,step_impute_lower)
S3method(prep,step_impute_mean)
S3method(prep,step_impute_median)
Expand Down Expand Up @@ -244,6 +246,7 @@ S3method(tidy,step_ica)
S3method(tidy,step_impute_bag)
S3method(tidy,step_impute_knn)
S3method(tidy,step_impute_linear)
S3method(tidy,step_indicate_na)
S3method(tidy,step_impute_lower)
S3method(tidy,step_impute_mean)
S3method(tidy,step_impute_median)
Expand Down Expand Up @@ -401,6 +404,7 @@ export(step_ica)
export(step_impute_bag)
export(step_impute_knn)
export(step_impute_linear)
export(step_indicate_na)
export(step_impute_lower)
export(step_impute_mean)
export(step_impute_median)
Expand Down
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
* Two new selectors that combine role and data type were added: `all_numeric_predictors()` and `all_nominal_predictors()`. (#620)

* The `threshold`argument of `step_pca()` is now `tunable()` (#534).
* Added a new step called `step_indicate_na()`, which will create and append additional binary columns to the dataset to indicate which observations are missing (#623)

* Integer variables used in `step_profile()` are now kept as integers (and not doubles).

Expand Down
147 changes: 147 additions & 0 deletions R/naindicate.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
#' Create Missing Data Column Indicators
#'
#' `step_indicate_na` creates a *specification* of a recipe step that will
#' create and append additional binary columns to the dataset to indicate
#' which observations are missing.
#'
#' @param recipe A recipe object. The check will be added to the
#' sequence of operations for this recipe.
#' @param ... One or more selector functions to choose which variables are
#' affected by the step. See [selections()] for more details. For the `tidy`
#' method, these are not currently used.
#' @param role For model terms created by this step, what analysis
#' role should they be assigned?. By default, the function assumes
#' that the new na indicator columns created from the original
#' variables will be used as predictors in a model.
#' @param trained A logical for whether the selectors in `...`
#' have been resolved by [prep()].
#' @param columns A character string of variable names that will
#' be populated (eventually) by the terms argument.
#' @param prefix A character string that will be the prefix to the
#' resulting new variables. Defaults to "na_ind".
#' @param skip A logical. Should the check be skipped when the
#' recipe is baked by [bake.recipe()]? While all operations are baked
#' when [prep.recipe()] is run, some operations may not be able to be
#' conducted on new data (e.g. processing the outcome variable(s)).
#' Care should be taken when using `skip = TRUE` as it may affect
#' the computations for subsequent operations.
#' @param id A character string that is unique to this step to identify it.
#' @return An updated version of `recipe` with the new step added to the
#' sequence of existing steps (if any). For the `tidy` method, a tibble with
#' columns `terms` (the selectors or variables selected) and `model` (the
#' median value).
#' @keywords datagen
#' @concept preprocessing
#' @concept imputation
#' @export
#' @examples
#' library(modeldata)
#' data("credit_data")
#'
#' ## missing data per column
#' purrr::map_dbl(credit_data, function(x) mean(is.na(x)))
#'
#' set.seed(342)
#' in_training <- sample(1:nrow(credit_data), 2000)
#'
#' credit_tr <- credit_data[ in_training, ]
#' credit_te <- credit_data[-in_training, ]
#'
#' rec <- recipe(Price ~ ., data = credit_tr)
#'
#' impute_rec <- rec %>%
#' step_indicate_na(Income, Assets, Debt)
#'
#' imp_models <- prep(impute_rec, training = credit_tr)
#'
#' imputed_te <- bake(imp_models, new_data = credit_te, everything())

step_indicate_na <-
function(recipe,
...,
role = "predictor",
trained = FALSE,
columns = NULL,
prefix = "na_ind",
skip = FALSE,
id = rand_id("indicate_na")) {

terms = ellipse_check(...)

add_step(
recipe,
step_indicate_na_new(
terms = terms,
role = role,
trained = trained,
columns = columns,
prefix = prefix,
skip = skip,
id = id
)
)
}

step_indicate_na_new <-
function(terms, role, trained, columns, prefix, skip, id) {
step(
subclass = "indicate_na",
terms = terms,
role = role,
trained = trained,
columns = columns,
prefix = prefix,
skip = skip,
id = id
)
}

#' @export
prep.step_indicate_na <- function(x, training, info = NULL, ...) {
col_names <- terms_select(x$terms, info)

step_indicate_na_new(
terms = x$terms,
role = x$role,
trained = TRUE,
columns = col_names,
prefix = x$prefix,
skip = x$skip,
id = x$id
)
}

#' @export
bake.step_indicate_na <- function(object, new_data, ...) {
col_names <- object$columns

df_ind_na <- purrr::map_dfc(
new_data[col_names],
~ifelse(is.na(.x), 1L, 0L)
) %>%
dplyr::rename_with(~paste0(object$prefix, "_", .x))
new_data <- dplyr::bind_cols(new_data, df_ind_na)

tibble::as_tibble(new_data)
}

print.step_indicate_na <-
function(x, width = max(20, options()$width - 30), ...) {
cat("Creating missing data variable indicators for ", sep = "")
printer(x$columns, x$terms, x$trained, width = width)
invisible(x)
}

#' @rdname step_indicate_na
#' @param x A `step_indicate_na` object.
#' @export
tidy.step_indicate_na <- function(x, ...) {
if (is_trained(x)) {
res <- tibble::tibble(terms = x$columns)
} else {
res <- tibble::tibble(terms = sel2char(x$terms))
}
res$id <- x$id
res
}

89 changes: 89 additions & 0 deletions man/step_indicate_na.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

100 changes: 100 additions & 0 deletions tests/testthat/test_naindicate.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
library(testthat)
library(recipes)
library(tidyr)

context("step_indicate_na")

train <-
tibble::tibble(
col1 = c("a", "b", "c"),
col2 = c(NA, "d", "e"),
col3 = c("k", NA, "g")
)

test <-
tibble::tibble(
col1 = c(NA, NA, NA),
col2 = c("t", "d", "e"),
col3 = c("z", "f", NA)
)

test_that("step_indicate_na populates binaries correctly", {

rec1 <- recipe(train) %>%
step_indicate_na(col1) %>%
prep(train, verbose = FALSE, retain = TRUE)

expect_equal(
bake(rec1, train)$na_ind_col1,
c(0, 0, 0)
)

expect_equal(
bake(rec1, test)$na_ind_col1,
c(1, 1, 1)
)

rec2 <- recipe(train) %>%
step_indicate_na(col2, col3) %>%
prep(train, verbose = FALSE, retain = TRUE)

expect_equal(bake(rec2, train)$na_ind_col2, c(1, 0, 0))
expect_equal(bake(rec2, train)$na_ind_col3, c(0, 1, 0))

expect_equal(bake(rec2, test)$na_ind_col2, c(0, 0, 0))
expect_equal(bake(rec2, test)$na_ind_col3, c(0, 0, 1))

})

test_that("step_indicate_na on all columns", {

baked <- recipe(Ozone ~ ., data = airquality) %>%
step_indicate_na(everything()) %>%
prep(airquality, verbose = FALSE, retain = TRUE) %>%
juice()

expect_named(
baked,
c("Solar.R", "Wind", "Temp", "Month", "Day", "Ozone",
"na_ind_Solar.R", "na_ind_Wind", "na_ind_Temp",
"na_ind_Month", "na_ind_Day", "na_ind_Ozone"
)
)
})

test_that("step_indicate_na on subset of columns", {

baked <- recipe(Ozone ~ ., data = airquality) %>%
step_indicate_na(Ozone, Solar.R) %>%
prep(airquality, verbose = FALSE, retain = TRUE) %>%
juice()

expect_named(
baked,
c("Solar.R", "Wind", "Temp", "Month", "Day",
"Ozone", "na_ind_Ozone", "na_ind_Solar.R"
)
)

baked2 <- recipe(Ozone ~ ., data = airquality) %>%
step_indicate_na(Solar.R) %>%
prep(airquality, verbose = FALSE, retain = TRUE) %>%
juice()

expect_named(
baked2,
c("Solar.R", "Wind", "Temp", "Month", "Day",
"Ozone", "na_ind_Solar.R"
)
)
})

test_that("something prints", {
rec <- recipe(Ozone ~ ., data = airquality) %>%
step_indicate_na(all_predictors())

expect_output(print(rec))
expect_output(prep(rec, training = airquality, verbose = TRUE))
})