From 0c934d71c346ecf17cf0840119132f461894a5cf Mon Sep 17 00:00:00 2001 From: DavisVaughan Date: Mon, 27 Jun 2022 10:48:25 -0400 Subject: [PATCH 1/4] Remove `bake_dependent_roles` in favor of new recipes role requirements --- R/blueprint-recipe-default.R | 214 +++++++++--------- R/blueprint-recipe.R | 30 --- man/default_recipe_blueprint.Rd | 74 +++--- man/new-blueprint.Rd | 14 -- man/new-default-blueprint.Rd | 14 -- tests/testthat/_snaps/forge-recipe.md | 32 +++ tests/testthat/_snaps/mold-recipe.md | 23 -- tests/testthat/data-raw/hardhat-0.2.0.R | 31 ++- ....2.0-post-mold-recipe-nonstandard-role.rds | Bin 0 -> 4701 bytes tests/testthat/helper-recipes.R | 6 + tests/testthat/test-forge-recipe.R | 200 ++++++++++++---- tests/testthat/test-mold-recipe.R | 147 +++++++----- 12 files changed, 437 insertions(+), 348 deletions(-) create mode 100644 tests/testthat/_snaps/forge-recipe.md delete mode 100644 tests/testthat/_snaps/mold-recipe.md create mode 100644 tests/testthat/data/hardhat-0.2.0-post-mold-recipe-nonstandard-role.rds create mode 100644 tests/testthat/helper-recipes.R diff --git a/R/blueprint-recipe-default.R b/R/blueprint-recipe-default.R index 19b757c2..7dc841f3 100644 --- a/R/blueprint-recipe-default.R +++ b/R/blueprint-recipe-default.R @@ -44,6 +44,7 @@ #' #' - It adds an intercept column onto `new_data` if `intercept = TRUE`. #' +#' @export #' @examples #' library(recipes) #' @@ -92,15 +93,21 @@ #' mold(rec2, iris)$predictors #' #' # --------------------------------------------------------------------------- -#' # Non standard roles +#' # Matrix output for predictors #' -#' # If you have custom recipes roles, it is assumed that they are required -#' # in `prep()` and afterwards for modeling, but are not required at `bake()` -#' # time and for prediction. This means that they are processed and returned -#' # in the `$extras$roles` slot of the return value of `mold()`, but they -#' # are not required to be in `new_data` in `forge()` and are not returned -#' # in the result. +#' # You can change the `composition` of the predictor data set +#' bp <- default_recipe_blueprint(composition = "dgCMatrix") +#' processed <- mold(rec, train, blueprint = bp) +#' class(processed$predictors) #' +#' @examplesIf utils::packageVersion("recipes") >= "0.2.0.9002" +#' # --------------------------------------------------------------------------- +#' # Non standard roles +#' +#' # If you have custom recipes roles, they are assumed to be required at +#' # `bake()` time when passing in `new_data`. This is an assumption that both +#' # recipes and hardhat makes, meaning that those roles are required at +#' # `forge()` time as well. #' rec_roles <- recipe(train) %>% #' update_role(Sepal.Width, new_role = "predictor") %>% #' update_role(Species, new_role = "outcome") %>% @@ -109,53 +116,35 @@ #' #' processed_roles <- mold(rec_roles, train) #' -#' # The custom roles will still be in the `mold()` result in case you need +#' # The custom roles will be in the `mold()` result in case you need #' # them for modeling. #' processed_roles$extras #' -#' # Notice that the columns with custom roles exist in `test`, -#' # but they weren't passed to `bake()` and aren't in the output. +#' # And they are in the `forge()` result #' forge(test, processed_roles$blueprint)$extras #' -#' # They can even be removed from `test` entirely, and it still works. +#' # If you remove a column with a custom role from the test data, then you +#' # won't be able to `forge()` even though this recipe technically didn't +#' # use that column in any steps #' test2 <- test #' test2$Petal.Length <- NULL -#' forge(test2, processed_roles$blueprint)$extras -#' -#' # Occasionally, you might have a custom role that is required to be able -#' # to `bake()` on `new_data`. In those cases, you can specify it with -#' # `bake_dependent_roles` in `default_recipe_blueprint()`, which will ensure -#' # that it is a required column when calling `forge()`, that it will be -#' # passed on to `bake()`, and that it will be returned in the result. -#' bp <- default_recipe_blueprint(bake_dependent_roles = "important") -#' -#' processed_roles <- mold(rec_roles, train, blueprint = bp) -#' -#' # Now `"important"` is a required role when `forge()`-ing -#' forge(test, processed_roles$blueprint)$extras$roles -#' -#' # Which means that we can't `forge()` with the data frame that is missing -#' # the `Petal.Length` column #' try(forge(test2, processed_roles$blueprint)) #' -#' # --------------------------------------------------------------------------- -#' # Matrix output for predictors -#' -#' # You can change the `composition` of the predictor data set -#' bp <- default_recipe_blueprint(composition = "dgCMatrix") -#' processed <- mold(rec, train, blueprint = bp) -#' class(processed$predictors) -#' @export +#' # Most of the time, if you find yourself in the above scenario, then we +#' # suggest that you remove `Petal.Length` from the data that is supplied to +#' # the recipe. If that isn't an option, you can declare that that column +#' # isn't required at `bake()` time by using `update_role_requirements()` +#' rec_roles <- update_role_requirements(rec_roles, "important", bake = FALSE) +#' processed_roles <- mold(rec_roles, train) +#' forge(test2, processed_roles$blueprint) default_recipe_blueprint <- function(intercept = FALSE, allow_novel_levels = FALSE, fresh = TRUE, - bake_dependent_roles = character(), composition = "tibble") { new_default_recipe_blueprint( intercept = intercept, allow_novel_levels = allow_novel_levels, fresh = fresh, - bake_dependent_roles = bake_dependent_roles, composition = composition ) } @@ -170,7 +159,6 @@ default_recipe_blueprint <- function(intercept = FALSE, new_default_recipe_blueprint <- function(intercept = FALSE, allow_novel_levels = FALSE, fresh = TRUE, - bake_dependent_roles = character(), composition = "tibble", ptypes = NULL, recipe = NULL, @@ -181,7 +169,6 @@ new_default_recipe_blueprint <- function(intercept = FALSE, intercept = intercept, allow_novel_levels = allow_novel_levels, fresh = fresh, - bake_dependent_roles = bake_dependent_roles, composition = composition, ptypes = ptypes, recipe = recipe, @@ -203,8 +190,6 @@ refresh_blueprint.default_recipe_blueprint <- function(blueprint) { run_mold.default_recipe_blueprint <- function(blueprint, ..., data) { check_dots_empty0(...) - blueprint <- patch_recipe_default_blueprint(blueprint) - cleaned <- mold_recipe_default_clean(blueprint = blueprint, data = data) blueprint <- cleaned$blueprint @@ -299,28 +284,29 @@ mold_recipe_default_process_outcomes <- function(blueprint, data) { mold_recipe_default_process_extras <- function(blueprint, data) { - # Capture original non standard role columns that are required by - # `bake_dependent_roles`. These are also required in `new_data`. + # Capture original non standard role columns that exist in `data` and are also + # required by the `recipe$requirements$bake` requirement. These columns are + # also required in `new_data` at `bake()` time. original_extra_role_cols <- get_extra_role_columns_original( blueprint$recipe, - data, - blueprint$bake_dependent_roles + data ) if (!is.null(original_extra_role_cols)) { + original_extra_role_ptypes <- lapply(original_extra_role_cols, extract_ptype) + blueprint <- update_blueprint( blueprint, - extra_role_ptypes = lapply(original_extra_role_cols, extract_ptype) + extra_role_ptypes = original_extra_role_ptypes ) } # Return all of the processed non standard role columns. # These might be generated by `prep()` and could differ from the ones in the # original data. - # These are not required in `new_data`, and are not affected by - # `bake_dependent_roles`, but we return them assuming the developer may - # need them for model fitting purposes. - processed_extra_role_cols <- get_extra_role_columns_mold( + # These are not required in `new_data`, but we return them assuming the + # developer may need them for model fitting purposes. + processed_extra_role_cols <- get_extra_role_columns_processed( blueprint$recipe, recipes::juice(blueprint$recipe) ) @@ -341,8 +327,6 @@ run_forge.default_recipe_blueprint <- function(blueprint, outcomes = FALSE) { check_dots_empty0(...) - blueprint <- patch_recipe_default_blueprint(blueprint) - cleaned <- forge_recipe_default_clean( blueprint = blueprint, new_data = new_data, @@ -472,7 +456,6 @@ forge_recipe_default_process <- function(blueprint, predictors, outcomes, extras extras, rec, baked_data, - blueprint$bake_dependent_roles, predictors_extras, outcomes_extras ) @@ -511,18 +494,16 @@ forge_recipe_default_process_outcomes <- function(blueprint, outcomes) { forge_recipe_default_process_extras <- function(extras, rec, baked_data, - bake_dependent_roles, predictors_extras, outcomes_extras) { # Remove old roles slot extras$roles <- NULL - # Get the `bake_dependent_roles` after `bake()` has been called. - processed_extra_role_cols <- get_extra_role_columns_forge( + # Get the processed extra role columns after `bake()` has been called. + processed_extra_role_cols <- get_extra_role_columns_processed( rec, - baked_data, - bake_dependent_roles + baked_data ) extras <- c( @@ -536,21 +517,6 @@ forge_recipe_default_process_extras <- function(extras, # ------------------------------------------------------------------------------ -patch_recipe_default_blueprint <- function(blueprint) { - blueprint <- patch_recipe_default_blueprint_pre_1.0.0(blueprint) - blueprint -} - -patch_recipe_default_blueprint_pre_1.0.0 <- function(blueprint) { - if (is.null(blueprint$bake_dependent_roles)) { - blueprint$bake_dependent_roles <- character() - } - - blueprint -} - -# ------------------------------------------------------------------------------ - get_original_predictor_ptype <- function(rec, data) { roles <- rec$var_info$role original_names <- rec$var_info$variable[strict_equal(roles, "predictor")] @@ -570,41 +536,38 @@ get_original_outcome_ptype <- function(rec, data) { extract_ptype(original_data) } -get_extra_role_columns_original <- function(rec, data, bake_dependent_roles) { +get_extra_role_columns_original <- function(rec, data) { # Extra roles that existed before `prep()` has been called. - # Only the ptypes of `bake_dependent_roles` are retained and required in `new_data`. + # To get "extra" roles that are required at bake time: + # - Start with the roles that actually exist in `data` + # - Remove the ones that aren't required at bake time + # - Remove the `"predictor"` role (it is always required, but isn't "extra") info_type <- "var_info" - # Take the intersection in case the user specified a bake dependent role that - # isn't actually present in the original data, which is allowed. - roles <- rec[[info_type]][["role"]] - extra_roles <- intersect(bake_dependent_roles, roles) + data_roles <- rec[[info_type]][["role"]] + data_roles <- chr_explicit_na(data_roles) + data_roles <- unique(data_roles) - get_extra_role_columns(rec, data, extra_roles, info_type) -} -get_extra_role_columns_mold <- function(rec, data) { - # Extra roles that exist after `prep()` has been called on the training data. - # These might include more / less than the original ones. - # These are not required in `new_data`, but we return all of the results from `mold()`. - info_type <- "term_info" + required_at_bake <- get_bake_role_requirements(rec) + not_required_at_bake <- required_at_bake[!required_at_bake] + not_required_at_bake_roles <- names(not_required_at_bake) - # Any non-standard role is returned from `mold()` in case the developer needs them. - roles <- rec[[info_type]][["role"]] - extra_roles <- setdiff(roles, c("outcome", "predictor", NA_character_)) + required_roles <- setdiff(data_roles, not_required_at_bake_roles) + extra_roles <- setdiff(required_roles, "predictor") get_extra_role_columns(rec, data, extra_roles, info_type) } -get_extra_role_columns_forge <- function(rec, data, bake_dependent_roles) { - # Extra roles that exist after `bake()` has been called on the test data. - # `data` should hold the columns specified by `bake_dependent_roles`. +get_extra_role_columns_processed <- function(rec, data) { + # Extra roles that exist after baking either the training data or testing + # data (i.e. through `prep()` or `bake()`). This might include more or less + # roles than in the original data, because steps may have created or removed + # them along the way. info_type <- "term_info" - # Take the intersection in case columns with one of the `bake_dependent_roles` - # got dropped during the baking process, which is allowed. If users create - # an entirely new role at bake-time, it should be listed as a bake dependent - # role to be returned. - roles <- rec[[info_type]][["role"]] - extra_roles <- intersect(bake_dependent_roles, roles) + data_roles <- rec[[info_type]][["role"]] + data_roles <- chr_explicit_na(data_roles) + + extra_roles <- setdiff(data_roles, c("outcome", "predictor")) get_extra_role_columns(rec, data, extra_roles, info_type) } @@ -616,23 +579,54 @@ get_extra_role_columns <- function(rec, data, extra_roles, info_type) { return(NULL) } - roles <- rec[[info_type]][["role"]] - vars <- rec[[info_type]][["variable"]] + data_names <- colnames(data) + + recipe_names <- rec[[info_type]][["variable"]] + recipe_roles <- rec[[info_type]][["role"]] + recipe_roles <- chr_explicit_na(recipe_roles) + + out <- lapply(extra_roles, function(role) { + role_names <- recipe_names[recipe_roles == role] - extra_role_cols_list <- lapply(extra_roles, function(.role) { - .role_cols <- vars[strict_equal(roles, .role)] - data[, .role_cols, drop = FALSE] + # Must restrict to names that are actually in the `data` in case some + # roles were declared as "not required" and columns with those roles weren't + # passed to `forge()` through `new_data`. + role_names <- intersect(role_names, data_names) + + data[, role_names, drop = FALSE] }) - names(extra_role_cols_list) <- extra_roles + names(out) <- extra_roles - extra_role_cols_list + out } -validate_is_0_row_tibble_or_null <- function(.x, .x_nm) { - if (is.null(.x)) { - return(invisible(.x)) - } +# ------------------------------------------------------------------------------ + +get_role_requirements <- function(recipe) { + # recipes:::get_role_requirements() + recipe$requirements %||% set_names(list(), nm = character()) +} + +get_bake_role_requirements <- function(recipe) { + # recipes:::get_bake_role_requirements() + requirements <- get_role_requirements(recipe) + requirements$bake %||% default_bake_role_requirements() +} +default_bake_role_requirements <- function() { + # recipes:::default_bake_role_requirements() + c( + "outcome" = FALSE, + "predictor" = TRUE, + "case_weights" = FALSE, + "NA" = TRUE + ) +} - validate_is_0_row_tibble(.x, .x_nm) +chr_explicit_na <- function(x) { + # recipes:::chr_explicit_na() + # To turn `NA_character_` into `"NA"` because you can't match + # against `NA_character_` when assigning with `[<-` + x[is.na(x)] <- "NA" + x } diff --git a/R/blueprint-recipe.R b/R/blueprint-recipe.R index 940137f6..772c7911 100644 --- a/R/blueprint-recipe.R +++ b/R/blueprint-recipe.R @@ -4,25 +4,11 @@ #' @param fresh Should already trained operations be re-trained when `prep()` is #' called? #' -#' @param bake_dependent_roles A character vector of recipes column "roles" -#' specifying roles that are required to [recipes::bake()] new data. Can't be -#' `"predictor"` or `"outcome"`, as predictors are always required and -#' outcomes are handled by the `outcomes` argument of [forge()]. -#' -#' Typically, non-standard roles (such as `"id"` or `"case_weights"`) are not -#' required to `bake()` new data. Unless specified by `bake_dependent_roles`, -#' these non-standard role columns are excluded from checks done in [forge()] -#' to validate the column structure of `new_data`, will not be passed to -#' `bake()` even if they existed in `new_data`, and will not be returned in -#' the `forge()$extras$roles` slot. See the documentation of -#' [recipes::add_role()] for more information about roles. -#' #' @rdname new-blueprint #' @export new_recipe_blueprint <- function(intercept = FALSE, allow_novel_levels = FALSE, fresh = TRUE, - bake_dependent_roles = character(), composition = "tibble", ptypes = NULL, recipe = NULL, @@ -31,14 +17,12 @@ new_recipe_blueprint <- function(intercept = FALSE, validate_recipes_available() validate_is_bool(fresh) - validate_bake_dependent_roles(bake_dependent_roles) validate_is_recipe_or_null(recipe) new_blueprint( intercept = intercept, allow_novel_levels = allow_novel_levels, fresh = fresh, - bake_dependent_roles = bake_dependent_roles, composition = composition, ptypes = ptypes, recipe = recipe, @@ -69,17 +53,3 @@ is_recipe <- function(x) { validate_is_recipe_or_null <- function(recipe) { validate_is_or_null(recipe, is_recipe, "recipe") } - -validate_bake_dependent_roles <- function(bake_dependent_roles) { - validate_is_character(bake_dependent_roles, "bake_dependent_roles") - - if (anyNA(bake_dependent_roles)) { - abort("`bake_dependent_roles` can't contain missing values.") - } - - if (any(bake_dependent_roles %in% c("outcome", "predictor"))) { - abort("`bake_dependent_roles` can't be \"outcome\" or \"predictor\", as these are already handled.") - } - - invisible(bake_dependent_roles) -} diff --git a/man/default_recipe_blueprint.Rd b/man/default_recipe_blueprint.Rd index ad35cf89..f18cc26c 100644 --- a/man/default_recipe_blueprint.Rd +++ b/man/default_recipe_blueprint.Rd @@ -9,7 +9,6 @@ default_recipe_blueprint( intercept = FALSE, allow_novel_levels = FALSE, fresh = TRUE, - bake_dependent_roles = character(), composition = "tibble" ) @@ -27,19 +26,6 @@ prediction time? This information is used by the \code{clean} function in the \item{fresh}{Should already trained operations be re-trained when \code{prep()} is called?} -\item{bake_dependent_roles}{A character vector of recipes column "roles" -specifying roles that are required to \code{\link[recipes:bake]{recipes::bake()}} new data. Can't be -\code{"predictor"} or \code{"outcome"}, as predictors are always required and -outcomes are handled by the \code{outcomes} argument of \code{\link[=forge]{forge()}}. - -Typically, non-standard roles (such as \code{"id"} or \code{"case_weights"}) are not -required to \code{bake()} new data. Unless specified by \code{bake_dependent_roles}, -these non-standard role columns are excluded from checks done in \code{\link[=forge]{forge()}} -to validate the column structure of \code{new_data}, will not be passed to -\code{bake()} even if they existed in \code{new_data}, and will not be returned in -the \code{forge()$extras$roles} slot. See the documentation of -\code{\link[recipes:roles]{recipes::add_role()}} for more information about roles.} - \item{composition}{Either "tibble", "matrix", or "dgCMatrix" for the format of the processed predictors. If "matrix" or "dgCMatrix" are chosen, all of the predictors must be numeric after the preprocessing method has been @@ -136,15 +122,21 @@ rec2 <- step_intercept(rec) mold(rec2, iris)$predictors # --------------------------------------------------------------------------- -# Non standard roles +# Matrix output for predictors + +# You can change the `composition` of the predictor data set +bp <- default_recipe_blueprint(composition = "dgCMatrix") +processed <- mold(rec, train, blueprint = bp) +class(processed$predictors) -# If you have custom recipes roles, it is assumed that they are required -# in `prep()` and afterwards for modeling, but are not required at `bake()` -# time and for prediction. This means that they are processed and returned -# in the `$extras$roles` slot of the return value of `mold()`, but they -# are not required to be in `new_data` in `forge()` and are not returned -# in the result. +\dontshow{if (utils::packageVersion("recipes") >= "0.2.0.9002") (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +# --------------------------------------------------------------------------- +# Non standard roles +# If you have custom recipes roles, they are assumed to be required at +# `bake()` time when passing in `new_data`. This is an assumption that both +# recipes and hardhat makes, meaning that those roles are required at +# `forge()` time as well. rec_roles <- recipe(train) \%>\% update_role(Sepal.Width, new_role = "predictor") \%>\% update_role(Species, new_role = "outcome") \%>\% @@ -153,40 +145,26 @@ rec_roles <- recipe(train) \%>\% processed_roles <- mold(rec_roles, train) -# The custom roles will still be in the `mold()` result in case you need +# The custom roles will be in the `mold()` result in case you need # them for modeling. processed_roles$extras -# Notice that the columns with custom roles exist in `test`, -# but they weren't passed to `bake()` and aren't in the output. +# And they are in the `forge()` result forge(test, processed_roles$blueprint)$extras -# They can even be removed from `test` entirely, and it still works. +# If you remove a column with a custom role from the test data, then you +# won't be able to `forge()` even though this recipe technically didn't +# use that column in any steps test2 <- test test2$Petal.Length <- NULL -forge(test2, processed_roles$blueprint)$extras - -# Occasionally, you might have a custom role that is required to be able -# to `bake()` on `new_data`. In those cases, you can specify it with -# `bake_dependent_roles` in `default_recipe_blueprint()`, which will ensure -# that it is a required column when calling `forge()`, that it will be -# passed on to `bake()`, and that it will be returned in the result. -bp <- default_recipe_blueprint(bake_dependent_roles = "important") - -processed_roles <- mold(rec_roles, train, blueprint = bp) - -# Now `"important"` is a required role when `forge()`-ing -forge(test, processed_roles$blueprint)$extras$roles - -# Which means that we can't `forge()` with the data frame that is missing -# the `Petal.Length` column try(forge(test2, processed_roles$blueprint)) -# --------------------------------------------------------------------------- -# Matrix output for predictors - -# You can change the `composition` of the predictor data set -bp <- default_recipe_blueprint(composition = "dgCMatrix") -processed <- mold(rec, train, blueprint = bp) -class(processed$predictors) +# Most of the time, if you find yourself in the above scenario, then we +# suggest that you remove `Petal.Length` from the data that is supplied to +# the recipe. If that isn't an option, you can declare that that column +# isn't required at `bake()` time by using `update_role_requirements()` +rec_roles <- update_role_requirements(rec_roles, "important", bake = FALSE) +processed_roles <- mold(rec_roles, train) +forge(test2, processed_roles$blueprint) +\dontshow{\}) # examplesIf} } diff --git a/man/new-blueprint.Rd b/man/new-blueprint.Rd index 7954eb6b..0ade58ce 100644 --- a/man/new-blueprint.Rd +++ b/man/new-blueprint.Rd @@ -24,7 +24,6 @@ new_recipe_blueprint( intercept = FALSE, allow_novel_levels = FALSE, fresh = TRUE, - bake_dependent_roles = character(), composition = "tibble", ptypes = NULL, recipe = NULL, @@ -94,19 +93,6 @@ subclass this blueprint.} \item{fresh}{Should already trained operations be re-trained when \code{prep()} is called?} -\item{bake_dependent_roles}{A character vector of recipes column "roles" -specifying roles that are required to \code{\link[recipes:bake]{recipes::bake()}} new data. Can't be -\code{"predictor"} or \code{"outcome"}, as predictors are always required and -outcomes are handled by the \code{outcomes} argument of \code{\link[=forge]{forge()}}. - -Typically, non-standard roles (such as \code{"id"} or \code{"case_weights"}) are not -required to \code{bake()} new data. Unless specified by \code{bake_dependent_roles}, -these non-standard role columns are excluded from checks done in \code{\link[=forge]{forge()}} -to validate the column structure of \code{new_data}, will not be passed to -\code{bake()} even if they existed in \code{new_data}, and will not be returned in -the \code{forge()$extras$roles} slot. See the documentation of -\code{\link[recipes:roles]{recipes::add_role()}} for more information about roles.} - \item{recipe}{Either \code{NULL}, or an unprepped recipe. This argument is set automatically at \code{\link[=mold]{mold()}} time.} } diff --git a/man/new-default-blueprint.Rd b/man/new-default-blueprint.Rd index 2eefab03..16760e86 100644 --- a/man/new-default-blueprint.Rd +++ b/man/new-default-blueprint.Rd @@ -24,7 +24,6 @@ new_default_recipe_blueprint( intercept = FALSE, allow_novel_levels = FALSE, fresh = TRUE, - bake_dependent_roles = character(), composition = "tibble", ptypes = NULL, recipe = NULL, @@ -90,19 +89,6 @@ subclass this blueprint.} \item{fresh}{Should already trained operations be re-trained when \code{prep()} is called?} -\item{bake_dependent_roles}{A character vector of recipes column "roles" -specifying roles that are required to \code{\link[recipes:bake]{recipes::bake()}} new data. Can't be -\code{"predictor"} or \code{"outcome"}, as predictors are always required and -outcomes are handled by the \code{outcomes} argument of \code{\link[=forge]{forge()}}. - -Typically, non-standard roles (such as \code{"id"} or \code{"case_weights"}) are not -required to \code{bake()} new data. Unless specified by \code{bake_dependent_roles}, -these non-standard role columns are excluded from checks done in \code{\link[=forge]{forge()}} -to validate the column structure of \code{new_data}, will not be passed to -\code{bake()} even if they existed in \code{new_data}, and will not be returned in -the \code{forge()$extras$roles} slot. See the documentation of -\code{\link[recipes:roles]{recipes::add_role()}} for more information about roles.} - \item{recipe}{Either \code{NULL}, or an unprepped recipe. This argument is set automatically at \code{\link[=mold]{mold()}} time.} diff --git a/tests/testthat/_snaps/forge-recipe.md b/tests/testthat/_snaps/forge-recipe.md new file mode 100644 index 00000000..2eb62e3e --- /dev/null +++ b/tests/testthat/_snaps/forge-recipe.md @@ -0,0 +1,32 @@ +# `forge()` will error if required non standard roles are missing + + Code + forge(iris, x$blueprint) + Condition + Error in `glubort()`: + ! The following required columns are missing: 'Sepal.Width'. + +# `NA` roles are treated as extra roles that are required at `forge()` time + + Code + forge(iris, x$blueprint) + Condition + Error in `glubort()`: + ! The following required columns are missing: 'Petal.Length'. + +# `forge()` is compatible with hardhat 0.2.0 molded blueprints with a basic recipe + + Code + forge(new_data, blueprint) + Condition + Error in `glubort()`: + ! The following required columns are missing: 'x'. + +# `forge()` is compatible with hardhat 0.2.0 molded blueprints with a recipe with a nonstandard role + + Code + forge(new_data, blueprint) + Condition + Error in `glubort()`: + ! The following required columns are missing: 'id'. + diff --git a/tests/testthat/_snaps/mold-recipe.md b/tests/testthat/_snaps/mold-recipe.md deleted file mode 100644 index c7e59c7b..00000000 --- a/tests/testthat/_snaps/mold-recipe.md +++ /dev/null @@ -1,23 +0,0 @@ -# `bake_dependent_roles` is validated - - Code - (expect_error(default_recipe_blueprint(bake_dependent_roles = 1))) - Output - - Error in `glubort()`: - ! bake_dependent_roles should be a character, not a numeric. - Code - (expect_error(default_recipe_blueprint(bake_dependent_roles = c("outcome", "x"))) - ) - Output - - Error in `validate_bake_dependent_roles()`: - ! `bake_dependent_roles` can't be "outcome" or "predictor", as these are already handled. - Code - (expect_error(default_recipe_blueprint(bake_dependent_roles = c("predictor", - "x")))) - Output - - Error in `validate_bake_dependent_roles()`: - ! `bake_dependent_roles` can't be "outcome" or "predictor", as these are already handled. - diff --git a/tests/testthat/data-raw/hardhat-0.2.0.R b/tests/testthat/data-raw/hardhat-0.2.0.R index 870f6a6e..1c1af777 100644 --- a/tests/testthat/data-raw/hardhat-0.2.0.R +++ b/tests/testthat/data-raw/hardhat-0.2.0.R @@ -3,8 +3,7 @@ # devtools::install_version("hardhat", "0.2.0") # ------------------------------------------------------------------------------ -# Testing compatibility of `mold()` after adding `bake_dependent_roles` -# in hardhat 1.0.0 +# Testing compatibility of `mold()` and a basic recipe dir <- here::here("tests", "testthat", "data") file <- fs::path(dir, "hardhat-0.2.0-pre-mold-recipe.rds") @@ -22,8 +21,7 @@ saveRDS( ) # ------------------------------------------------------------------------------ -# Testing compatibility of `forge()` after adding `bake_dependent_roles` -# in hardhat 1.0.0 +# Testing compatibility of `forge()` and a basic recipe dir <- here::here("tests", "testthat", "data") file <- fs::path(dir, "hardhat-0.2.0-post-mold-recipe.rds") @@ -47,3 +45,28 @@ saveRDS( ) # ------------------------------------------------------------------------------ +# Testing compatibility of `forge()` and a recipe with a nonstandard role + +dir <- here::here("tests", "testthat", "data") +file <- fs::path(dir, "hardhat-0.2.0-post-mold-recipe-nonstandard-role.rds") + +data <- tibble::tibble(y = 1:5, x = 6:10, id = 1:5) +new_data <- tibble::tibble(y = 6:10, x = 11:15, id = 6:10) + +rec <- recipes::recipe(y ~ ., data = data) +rec <- recipes::update_role(rec, id, new_role = "id") +rec <- recipes::step_mutate(rec, z = 1) + +blueprint <- hardhat::default_recipe_blueprint() +mold <- hardhat::mold(rec, data = data, blueprint = blueprint) +blueprint <- mold$blueprint + +object <- list(new_data = new_data, blueprint = blueprint) + +saveRDS( + object, + file = file, + version = 2 +) + +# ------------------------------------------------------------------------------ diff --git a/tests/testthat/data/hardhat-0.2.0-post-mold-recipe-nonstandard-role.rds b/tests/testthat/data/hardhat-0.2.0-post-mold-recipe-nonstandard-role.rds new file mode 100644 index 0000000000000000000000000000000000000000..beb36e6f1122962ac851cb5fe54ef672573ffba0 GIT binary patch literal 4701 zcmV-j5~A%NiwFP!000001JxaAcigrST+qS_N~|P~9lMT`rg59J zNxNOGD2wrGSKdQ*+V@)jgZ_^Ggnnp$LVinMoiBZ9trFx6IRrtFly@ap?~OnZgSjvS zzyZMhx|=bUU`efmB|7-!kdfKCt4?uYuXK|z#H`y(*p1wTTe3^DQE8o}g9$3jglpyps{UoSl^81(|TMWJjw z)g_b|srg!}}GJavf%w-78Il9z5R-b-3Q6 zphFVq_W^D{q&cY519h@cXMofh0^AY69f15O^(bvxK1FEG z5Z-fee?j2ADDa+d5$`3!`#juVh4cdOT6qIrmrPiL@--;ubv6KR4Rqnxm!ZxTxX-Vz znsANCd;`*VK(^~ppUb`}@S*L03(_s-ZU37R*-?)7O}Gu$+kzY)Li!QV`2gtLmeBdw zgu8Hk&xB71od=Nq4AO^?ehTR$pnsp)&&ub_GuC9p;gNaEJA=lebWJZAhdK)$T*V=g z)=u2I6+ky*)2#;-C(RDz5blZNuQ1+BmKWse0q$fL7SD;gD}@&v4LER?7A%uZ~d zMH;(BrbDuERi+H~!2#`%65B1!Rwf_ilTpj%PB#y0)sorQ z>$?Jh&HzD|`3$CDw|-$)-zzrDjRFucQ+?McHL5l6i`21Ke4MU?&e6JJnzfrLn|DN3 zU&X&CqWQ9}#)WddAyDa!CzabM4ZU`N1XW`8sl^--&6H*GM$M}l)tt?ist5X;G-=eP zY-JM)?J8lV&PRnqj7tW=d?Q5hiaZtrSvmlvL#TF@++$d|a()k4jPLJLaBIDV77QFrj8P z7KIt7GpVLC8Krf^l+r)P;tdj0Ui{$zd~0mw_|r_@x*4;(IIQ}5`x&*m%6Ux`b!XM; zx{u0X<9qMSjMtuHUXPnsNXyF?)Y^z1Hy7Y>GeOR=sHQJEnd`Zhb^%25YNDdY%}1T( zEeXb34l&;Oc*g4udt5Wn$IH=tj=mDlw&|cYT8(E;DIEsB5%!>)uO$66sFGvepagm0 z5R33-LyULvIWWd=`27dKE&=%@^TeJkGivZ)Vk{=}(yoIs9pSHSaitc=;RcUSV3(xV zE<^3I3(`j0lX&4U=F`#am!kNm7!S<0gD=%5W#>mR-x<>$ou_M$bUXMWT@7Y-oPGw< zZb-Wz?X~g70e=kfVFhCX^Vyho%$|`QTgmqo@YSFT=lOm+e-85lG3_#NMs{f>-&cUg z(;>V56!ICg^V66gvGdM)#yF%9>z={;qbz>EJHcF^MH$*T9z~UFlDR(9PPtxzEIgfJ z@%(U#wcZaqnSW)K+h&=_fi{}9^Rp)zZ)b=u?VN`o&CZv8LT^Hs>u?Uz1zXNJY?paR z=T2akMdo=P=HHg*AG)+MLIvcbnUK!)Cr)MKsF)_5*UsN69mQqCJCG)faOyJ zW(my4ggk+h@Lndc5&{k6QB-X(A|BD_Zc<#?aKHh~Wm zaEAcW`Ix{x0{2e_vG!}2xE+X#ThL&0kuR)d;?u2bvV9&7zs`#Oyx{Ic^Olj-BDyDp z`vZUWpE30=HC)e6GB^Ip=Mov_?)^X<+#iDbLb^y<4|8{}@iYrKoHw77_Y>R!z!@UA zJmt^#EXdF64hnULp?(MG_nE+Z64G-}e*)@{NqEl)xYGoe+vN-9?!Tb*7XUX8=@Qgm z1l(mJD}Rppn8(Mr3vkV!Xa0QRK81BiHz3`F^fJ&|gLb>hV(m|G{A1#%ST?McZlhj6 zKa%X?TMUXxY5|1j9K!p>dZAe{o;CGCsakFxR0_D4Q>0HrtvM!laG4B=02y*rxt(gY z?B$UU(n;765g6I;7T4Du3dsP8()InCQF*G+o!1sNG5EsSX+=5nuWSUZ1*}93w})(A zSP|t5+N+SRmuhmiKu#i;fac1Jjp zB#b@g_fUxqe2;g?{)oJYFYJU^74Ur?DQs-;d1N-8rGnZhAJu~WG4;)P@h_ip!2Kkqft`<`ct%Y*rSn!;{ zq;(#7CZfrhr}!oR@ZsrfLB!D*tTVPSqdN|0!mS^D(10NdrGRoO4 z$Vr0*!e3lFt{{J~(*?{&TxeUN_-7SR>ms()V1Cf11>1|LZ-#mO8yDhwquI^z$KE#B z?Iix#M{I}-Z^R$-r{RydP>Bmtar&2B3+;#tgN)<{Y&vVmZz!64DT;rJfy~8r@YVba zgU&+WsGVQOJS-GC2b+dt+H3f9?X_}7eDi0-cd8wHkcNk`Na@8PXL3~s|08Odo3Qc;*i`5xz3a8DY$n0l?Sd( z-_hn+@Yu!MBgx#rUJ}ar7?y!+JW0)QyF+mCR5r)WGB>!$^<4m8;3@Azc@S`bwCM7h8(^%BrnV{Wi?7U*!?i$HgS$i?l+4Ystv6{vp= zcwLA3JgE8@a5SCyrYnl%fx;AAIy6|^AqMCp5t{i<{qM(1^qak`#|R*k>erM zdj$1hT=tcJZow}|)Gg!gZN*SC=V7Si8AT7dNTAj3C6 zqeys^fXA*0I$ZN;A3rUJ{%k<{9i&fz-ag37Bd0Nf{DE=nj+$N)s%k7&fGhO8TOy(; zsamz{+Z!*$-+jTHq@vvX+X!N2o#QGPmA$G&YKoZTQiW(3a?;SHbyV~%52yn8F%-KT z5P))nBqnd8xD%>NNPyoL4yFNqNuazxu(gj$4UaKsVb_sfDwKAN%C1~1Wf)BrR@deQ zL?K%Kj!39Cq`WVHQMy^FSl>8_6C0lr@|#g&#MC^_!8z+gDutSUSTt&YlOyGs1EXRb zcupCKR*n!_cgSU()k1MOPZDtNh#TJ>h$uxbtSri85Q5pXnRp#9#am$#38C)LXJnWK zc;h#Uiu~^ zat?x&PQz4pzSBz$yAV4&bd^w-5zgcV>Nw#Xv3Ac#OZ{iUOYnVx;8n53h4Xc;kJcMC zlS@H=)+`E3(3_%=Ia0675l^DI%mN_?@O-EXN>xKrCC-j^NZrBrc-ljb2@CtT!(_)Hd1$*S6kcXP77ZAJF;;i$aOLx zh(vSI0-V*NT2tRhn^&t6<~VobIF3qgdJa$`x9=SHGLt*bdFgUx)UPPu&I0Lv3d&wM zh8TK4Hssu3?8DuwYW`y){|oW>haS7XsOIl$u1iebTuaL4Qhy%-dxNOG!l|9SUsLm) zJR#rEMp;+$^)2~tFnOaaDjP+%=r7KE#n+o{k@cop^~eX|FRSUqItWkcULkaOUf7i9 zdyl=QmP6c>upaQbw-BV~^?3FQJa>E}p1p$39S7}maMo5Jwr>R&H1G%9&wy*>8oc(B zwc#s(zlrw~%(HfV4a%=#IjnIz=bNqgG4qa~&a0|e;CW4bTUMl_tmL$VZ>E-ogU`S&nC$RfGaEsEd5Mcgf=JxM<8%Tp;k6 zLh(TYc<03R;0P)@ea~oXRs8Tc8vn&M@!v+eBg}KS^$y;jj3(bE#c$UhFV73G1}~hP zDW2|~V&1Ox9P@U0v&=oo$4Q`v$S{IT=z)LYewpT04!Q(L4B#;HTOyFt=Al^F!HUQp@ z0k;Tj0r11c?=$yf(+DW%Hj8%1_ydCXF#(kGZVb2wa9e=52a`V~@DSh;fzJqF`R4?F zLEv#r_=>kKfHz$N00wsW50(%7Zqd_|ldV7{X=r!e)ZWue(Mh?$!y>QGv8`;Cv zS!{kGh#vp9S`x=D4=!yqzTdZ;uhgpa1g`(UJGHF_l4l@I3W9llLdI3XgOjN=5A>Q* z3dwpak(XGJ0eR)4H9D&`*s#3UC`sS09|(U~U+APeoA5YpwOK2P=1#n1)-Uk< z_l;W6@sy+>oZQZE@dp7pqQRy1PsXsnB;4RJXygU~y1S+vNBk4+ZETn=OwQA)+x4y0aFNRZl zaD_iHdwyp{cDMh@@K5XsG~zo_c{)KJ>>!e5!}?UP*zr?y9ouqQgHeY&DA{tpv;`&CQkl|6#d6bWmrS*0J~r0x z9=!g4An2(OHi}?xm}cItfS|o_@mt&*(r#Fzju`yF(*j|X3dDc&-qJqofDe06%G?7K z!k@>NQJn54wQBQ_Z)(Fn@dR^_1a}g*({(3;+$LxMR%cq;Q}pin+4BYw7A zH0?w%NzV@l1--qv-!K3pcq;u5ii3-#<#PZ4<>FZ$ literal 0 HcmV?d00001 diff --git a/tests/testthat/helper-recipes.R b/tests/testthat/helper-recipes.R new file mode 100644 index 00000000..d7c20a92 --- /dev/null +++ b/tests/testthat/helper-recipes.R @@ -0,0 +1,6 @@ +skip_if_no_update_role_requirements <- function() { + skip_if( + condition = utils::packageVersion("recipes") < "0.2.0.9002", + "Doesn't have `recipes::update_role_requirements()`" + ) +} diff --git a/tests/testthat/test-forge-recipe.R b/tests/testthat/test-forge-recipe.R index f47b1cd0..c8dac418 100644 --- a/tests/testthat/test-forge-recipe.R +++ b/tests/testthat/test-forge-recipe.R @@ -459,10 +459,9 @@ test_that("an `extras` slot exists for `roles`", { ) }) -test_that("non-standard roles generated in the recipe must be listed as `bake_dependent_roles` to be returned", { - # This is a convention we follow to make internal coding of `bake_dependent_roles` more straightforward. - # You ONLY get back `bake_dependent_roles` in the `extras` slot of `forge()`. - # I imagine this should be very rare. +test_that("non-standard roles generated in the recipe are returned by both `mold()` and `forge()`", { + # This is a convention we follow to have consistency between `mold()` and `forge()`. + # Both should have the same `.$extras$roles$` slot names. rec <- recipes::recipe(Species ~ ., iris) rec <- recipes::step_bs(rec, Sepal.Length, role = "dummy", deg_free = 3) @@ -472,75 +471,78 @@ test_that("non-standard roles generated in the recipe must be listed as `bake_de expect_identical(x$blueprint$extra_role_ptypes, NULL) - # Even though `step_bs()` created some columns, we don't get them back in `forge()` + # Same slots in both `mold()` and `forge()` results expect_identical( - xx$extras, - list(roles = NULL) + colnames(x$extras$roles$dummy), + paste("Sepal.Length", c("1", "2", "3"), sep = "_bs_") ) - - bp <- default_recipe_blueprint(bake_dependent_roles = "dummy") - x <- mold(rec, iris, blueprint = bp) - xx <- forge(iris, x$blueprint) - - # Still none because they weren't in the original data - expect_identical(x$blueprint$extra_role_ptypes, NULL) - - # We do get them here, because we specified them as bake dependent expect_identical( colnames(xx$extras$roles$dummy), paste("Sepal.Length", c("1", "2", "3"), sep = "_bs_") ) }) -test_that("non-`bake_dependent_roles` should not be required in `forge()`, and won't be returned", { +test_that("`forge()` returns a slot for non standard roles that aren't required at `bake()` time", { + skip_if_no_update_role_requirements() + + # This is a convention we follow to have consistency between `mold()` and `forge()`. + # Both should have the same `.$extras$roles$` slot names, even if there + # wasn't any data there at `forge()` time (because it wasn't a required role). + rec <- recipes::recipe(Species ~ ., iris) - rec <- recipes::update_role(rec, Sepal.Width, new_role = "dummy1") - rec <- recipes::update_role(rec, Sepal.Length, new_role = "dummy2") + rec <- recipes::update_role(rec, Sepal.Width, new_role = "id") + rec <- recipes::update_role_requirements(rec, "id", bake = FALSE) + rec <- recipes::step_bs(rec, Sepal.Length, role = "dummy", deg_free = 3) x <- mold(rec, iris) xx <- forge(iris, x$blueprint) - expect_named(xx$predictors, c("Petal.Length", "Petal.Width")) - expect_null(xx$outcomes) + expect_identical(x$blueprint$extra_role_ptypes, NULL) + # Same slots in both `mold()` and `forge()` results expect_identical( - xx$extras, - list(roles = NULL) + colnames(x$extras$roles$dummy), + paste("Sepal.Length", c("1", "2", "3"), sep = "_bs_") + ) + expect_identical( + colnames(xx$extras$roles$dummy), + paste("Sepal.Length", c("1", "2", "3"), sep = "_bs_") ) - - bp <- default_recipe_blueprint(bake_dependent_roles = "dummy2") - x <- mold(rec, iris, blueprint = bp) - xx <- forge(iris, x$blueprint) - - expect_named(xx$predictors, c("Petal.Length", "Petal.Width")) - expect_null(xx$outcomes) expect_identical( - xx$extras$roles, - list(dummy2 = tibble::tibble(Sepal.Length = iris$Sepal.Length)) + x$extras$roles$id, + tibble(Sepal.Width = iris$Sepal.Width) + ) + # There is a slot for this, but no data is returned here, because `"id"` + # wasn't a required role, so `Sepal.Width` was removed before being passed + # to `bake()` + expect_identical( + xx$extras$roles$id, + tibble::new_tibble(x = list(), nrow = nrow(iris)) ) }) -test_that("`bake_dependent_roles` can be dropped during the baking process", { +test_that("required non standard roles can be dropped during the baking process", { rec <- recipes::recipe(Species ~ ., iris) rec <- recipes::update_role(rec, Sepal.Width, new_role = "dummy1") rec <- recipes::update_role(rec, Sepal.Length, new_role = "dummy2") rec <- recipes::step_rm(rec, Sepal.Length) - bp <- default_recipe_blueprint(bake_dependent_roles = c("dummy1", "dummy2")) - x <- mold(rec, iris, blueprint = bp) + x <- mold(rec, iris) xx <- forge(iris, x$blueprint) # `extra_role_ptypes` holds ptypes for both expect_identical( - x$blueprint$extra_role_ptypes, - list( - dummy1 = tibble::tibble(Sepal.Width = double()), - dummy2 = tibble::tibble(Sepal.Length = double()) - ) + x$blueprint$extra_role_ptypes$dummy1, + tibble::tibble(Sepal.Width = double()) + ) + expect_identical( + x$blueprint$extra_role_ptypes$dummy2, + tibble::tibble(Sepal.Length = double()) ) - # But `Sepal.Length` got dropped along the way + # But `Sepal.Length` got dropped along the way and isn't in the `term_info` + # of the recipe, so it doesn't appear in the final result expect_identical( x$extras$roles, list(dummy1 = tibble::tibble(Sepal.Width = iris$Sepal.Width)) @@ -551,16 +553,95 @@ test_that("`bake_dependent_roles` can be dropped during the baking process", { ) }) -test_that("recipes should error if `bake_dependent_roles` have been left out", { +test_that("`forge()` will error if required non standard roles are missing", { + rec <- recipes::recipe(Species ~ ., iris) + rec <- recipes::update_role(rec, Sepal.Width, new_role = "dummy1") + + x <- mold(rec, iris) + + iris$Sepal.Width <- NULL + + expect_snapshot(error = TRUE, { + forge(iris, x$blueprint) + }) +}) + +test_that("recipes will error if the role is declared as not required, but really was", { + skip_if_no_update_role_requirements() + rec <- recipes::recipe(Species ~ ., iris) rec <- recipes::update_role(rec, Sepal.Width, new_role = "dummy1") + rec <- recipes::update_role_requirements(rec, "dummy1", bake = FALSE) rec <- recipes::step_log(rec, Sepal.Width) x <- mold(rec, iris) + # Error is specific to however `step_log()` handles this expect_error(forge(iris, x$blueprint)) }) +test_that("`NA` roles are treated as extra roles that are required at `forge()` time", { + # `Petal.Length`, `Petal.Width` have `NA` roles + rec <- recipes::recipe(iris) + rec <- recipes::update_role(rec, Sepal.Length, new_role = "predictor") + rec <- recipes::update_role(rec, Species, new_role = "outcome") + rec <- recipes::update_role(rec, Sepal.Width, new_role = "custom") + + x <- mold(rec, iris) + xx <- forge(iris, x$blueprint) + + # They are returned in `forge()` as extras + expect_identical( + xx$extras$roles$`NA`, + tibble(Petal.Length = iris$Petal.Length, Petal.Width = iris$Petal.Width) + ) + + # They are required at `forge()` time + iris$Petal.Length <- NULL + + expect_snapshot(error = TRUE, { + forge(iris, x$blueprint) + }) +}) + +test_that("`NA` roles can be declared as not required at `forge()` time", { + skip_if_no_update_role_requirements() + + # Petal.Length, Petal.Width have `NA` roles + rec <- recipes::recipe(iris) + rec <- recipes::update_role(rec, Sepal.Length, new_role = "predictor") + rec <- recipes::update_role(rec, Species, new_role = "outcome") + rec <- recipes::update_role(rec, Sepal.Width, new_role = "custom") + rec <- recipes::update_role_requirements(rec, "NA", bake = FALSE) + + x <- mold(rec, iris) + + # No longer required at `forge()` time + iris$Petal.Length <- NULL + + xx <- forge(iris, x$blueprint) + + # And `Petal.Width` (which is still in `iris`) won't be in the output + expect_identical( + xx$extras$roles$`NA`, + tibble::new_tibble(x = list(), nrow = nrow(iris)) + ) +}) + +test_that("`extras` only hold roles that actually exist in the data", { + df <- tibble(y = 1, x = 1, z = 2) + + rec <- recipes::recipe(y ~ ., df) + rec <- recipes::update_role(rec, x, new_role = "dummy") + + # For example `default_bake_role_requirements()` specifies that `NA` is + # a required role, but there aren't any columns with that role in `df` + x <- mold(rec, df) + xx <- forge(df, x$blueprint) + + expect_named(xx$extras$roles, "dummy") +}) + test_that("Missing y value still returns `NULL` if no outcomes are asked for", { sparse_bp <- default_recipe_blueprint(composition = "dgCMatrix") rec <- recipes::recipe(~Sepal.Width, data = iris) @@ -590,10 +671,9 @@ test_that("Predictors with multiple roles are only included once before baking ( rec <- recipes::add_role(rec, Sepal.Length, new_role = "test1") rec <- recipes::add_role(rec, Sepal.Length, new_role = "test2") - bp <- default_recipe_blueprint(bake_dependent_roles = c("test1", "test2")) - sparse_bp <- default_recipe_blueprint(composition = "dgCMatrix", bake_dependent_roles = c("test1", "test2")) + sparse_bp <- default_recipe_blueprint(composition = "dgCMatrix") - x1 <- mold(rec, iris, blueprint = bp) + x1 <- mold(rec, iris) x2 <- mold(rec, iris, blueprint = sparse_bp) xx1 <- forge(iris, x1$blueprint) @@ -607,7 +687,7 @@ test_that("Predictors with multiple roles are only included once before baking ( expect_true("Sepal.Length" %in% colnames(xx2$extras$roles$test2)) }) -test_that("`forge()` is compatible with hardhat 0.2.0 molded blueprints that don't have `bake_dependent_roles`", { +test_that("`forge()` is compatible with hardhat 0.2.0 molded blueprints with a basic recipe", { path <- test_path("data", "hardhat-0.2.0-post-mold-recipe.rds") object <- readRDS(path) @@ -620,4 +700,32 @@ test_that("`forge()` is compatible with hardhat 0.2.0 molded blueprints that don expect_identical(out$predictors, expect) expect_identical(out$extras, list(roles = NULL)) + + new_data$x <- NULL + + expect_snapshot(error = TRUE, { + forge(new_data, blueprint) + }) +}) + +test_that("`forge()` is compatible with hardhat 0.2.0 molded blueprints with a recipe with a nonstandard role", { + path <- test_path("data", "hardhat-0.2.0-post-mold-recipe-nonstandard-role.rds") + object <- readRDS(path) + + new_data <- object$new_data + blueprint <- object$blueprint + + out <- forge(new_data, blueprint) + + expect <- recipes::bake(blueprint$recipe, new_data, recipes::all_predictors()) + expect_identical(out$predictors, expect) + + expect <- recipes::bake(blueprint$recipe, new_data, id) + expect_identical(out$extras, list(roles = list(id = expect))) + + new_data$id <- NULL + + expect_snapshot(error = TRUE, { + forge(new_data, blueprint) + }) }) diff --git a/tests/testthat/test-mold-recipe.R b/tests/testthat/test-mold-recipe.R index 2eabb332..7841ace5 100644 --- a/tests/testthat/test-mold-recipe.R +++ b/tests/testthat/test-mold-recipe.R @@ -133,33 +133,33 @@ test_that("`extras` holds a slot for `roles`", { expect_equal(x3$blueprint$extra_role_ptypes, NULL) }) -test_that("non-standard role ptypes are not retained by default", { +test_that("non-standard roles ptypes are stored by default", { rec <- recipes::recipe(Species ~ ., iris) rec <- recipes::update_role(rec, Sepal.Width, new_role = "dummy") x <- mold(rec, iris) - expect_true("extra_role_ptypes" %in% names(x$blueprint)) - expect_identical( x$blueprint$extra_role_ptypes, - NULL + list(dummy = tibble::tibble(Sepal.Width = double())) ) - # But they are returned from `mold()` for developer usage expect_identical( x$extras$roles, list(dummy = tibble::tibble(Sepal.Width = iris$Sepal.Width)) ) }) -test_that("non-standard roles ptypes can be stored upon request", { - bp <- default_recipe_blueprint(bake_dependent_roles = "dummy") +test_that("case weights is not considered a required extra role by default", { + # If we have this, we also have case weight support + skip_if_no_update_role_requirements() + + iris$weight <- frequency_weights(seq_len(nrow(iris))) rec <- recipes::recipe(Species ~ ., iris) rec <- recipes::update_role(rec, Sepal.Width, new_role = "dummy") - x <- mold(rec, iris, blueprint = bp) + x <- mold(rec, iris) expect_identical( x$blueprint$extra_role_ptypes, @@ -167,21 +167,54 @@ test_that("non-standard roles ptypes can be stored upon request", { ) expect_identical( - x$extras$roles, - list(dummy = tibble::tibble(Sepal.Width = iris$Sepal.Width)) + x$extras$roles$dummy, + tibble::tibble(Sepal.Width = iris$Sepal.Width) + ) + expect_identical( + x$extras$roles$case_weights, + tibble::tibble(weight = iris$weight) + ) +}) + +test_that("case weights can be updated to be a required extra role", { + skip_if_no_update_role_requirements() + + iris$weight <- frequency_weights(seq_len(nrow(iris))) + + rec <- recipes::recipe(Species ~ ., iris) + rec <- recipes::update_role(rec, Sepal.Width, new_role = "dummy") + rec <- recipes::update_role_requirements(rec, "case_weights", bake = TRUE) + + x <- mold(rec, iris) + + expect_identical( + x$blueprint$extra_role_ptypes$dummy, + tibble::tibble(Sepal.Width = double()) + ) + expect_identical( + x$blueprint$extra_role_ptypes$case_weights, + tibble::tibble(weight = frequency_weights(integer())) + ) + + expect_identical( + x$extras$roles$dummy, + tibble::tibble(Sepal.Width = iris$Sepal.Width) + ) + expect_identical( + x$extras$roles$case_weights, + tibble::tibble(weight = iris$weight) ) }) test_that("only original non-standard columns are in the extra roles ptype", { - bp <- default_recipe_blueprint(bake_dependent_roles = "dummy") - sparse_bp <- default_recipe_blueprint(composition = "dgCMatrix", bake_dependent_roles = "dummy") + sparse_bp <- default_recipe_blueprint(composition = "dgCMatrix") # same custom role, but note step_bs() columns aren't original columns rec <- recipes::recipe(Species ~ ., iris) rec <- recipes::update_role(rec, Sepal.Width, new_role = "dummy") rec <- recipes::step_bs(rec, Sepal.Length, role = "dummy", deg_free = 3) - x1 <- mold(rec, iris, blueprint = bp) + x1 <- mold(rec, iris) x2 <- mold(rec, iris, blueprint = sparse_bp) # extra roles ptype only has original columns @@ -205,6 +238,20 @@ test_that("only original non-standard columns are in the extra roles ptype", { ) }) +test_that("`extra_role_ptypes` and `extras` only hold roles that actually exist in the data", { + df <- tibble(y = 1, x = 1, z = 2) + + rec <- recipes::recipe(y ~ ., df) + rec <- recipes::update_role(rec, x, new_role = "dummy") + + # For example `default_bake_role_requirements()` specifies that `NA` is + # a required role, but there aren't any columns with that role in `df` + x <- mold(rec, df) + + expect_named(x$blueprint$extra_role_ptypes, "dummy") + expect_named(x$extras$roles, "dummy") +}) + test_that("multiple extra roles types can be stored", { sparse_bp <- default_recipe_blueprint(composition = "dgCMatrix") @@ -225,8 +272,7 @@ test_that("multiple extra roles types can be stored", { NULL ) - # we don't need to request them with `bake_dependent_roles`, they - # automatically are returned in the `mold()` result + # They are automatically are returned in the `mold()` result expect_equal( names(x1$extras$roles), c("dummy", "dummy2") @@ -237,16 +283,15 @@ test_that("multiple extra roles types can be stored", { ) }) -test_that("`NA` roles are skipped over", { - bp <- default_recipe_blueprint(bake_dependent_roles = "custom") - sparse_bp <- default_recipe_blueprint(composition = "dgCMatrix", bake_dependent_roles = "custom") +test_that("`NA` roles are treated as extra roles", { + sparse_bp <- default_recipe_blueprint(composition = "dgCMatrix") rec <- recipes::recipe(iris) rec <- recipes::update_role(rec, Sepal.Length, new_role = "predictor") rec <- recipes::update_role(rec, Species, new_role = "outcome") rec <- recipes::update_role(rec, Sepal.Width, new_role = "custom") - x1 <- mold(rec, iris, blueprint = bp) + x1 <- mold(rec, iris) x2 <- mold(rec, iris, blueprint = sparse_bp) expect_equal( @@ -294,6 +339,15 @@ test_that("`NA` roles are skipped over", { "Sepal.Width" ) + expect_equal( + colnames(x1$extras$roles$`NA`), + c("Petal.Length", "Petal.Width") + ) + expect_equal( + colnames(x2$extras$roles$`NA`), + c("Petal.Length", "Petal.Width") + ) + expect_equal( colnames(x1$blueprint$extra_role_ptypes$custom), "Sepal.Width" @@ -302,15 +356,26 @@ test_that("`NA` roles are skipped over", { colnames(x2$blueprint$extra_role_ptypes$custom), "Sepal.Width" ) + + expect_equal( + colnames(x1$blueprint$extra_role_ptypes$`NA`), + c("Petal.Length", "Petal.Width") + ) + expect_equal( + colnames(x2$blueprint$extra_role_ptypes$`NA`), + c("Petal.Length", "Petal.Width") + ) }) -test_that("non-`bake_dependent_roles` are not retained as `extra_role_ptypes`, but are in the mold result", { +test_that("roles that aren't required are not retained as `extra_role_ptypes`, but are in the mold result", { + skip_if_no_update_role_requirements() + rec <- recipes::recipe(Species ~ ., iris) rec <- recipes::update_role(rec, Sepal.Width, new_role = "dummy1") rec <- recipes::update_role(rec, Sepal.Length, new_role = "dummy2") + rec <- recipes::update_role_requirements(rec, "dummy1", bake = FALSE) - bp <- default_recipe_blueprint(bake_dependent_roles = "dummy2") - x <- mold(rec, iris, blueprint = bp) + x <- mold(rec, iris) expect_equal( x$blueprint$extra_role_ptypes, @@ -327,14 +392,6 @@ test_that("non-`bake_dependent_roles` are not retained as `extra_role_ptypes`, b ) }) -test_that("`bake_dependent_roles` is validated", { - expect_snapshot({ - (expect_error(default_recipe_blueprint(bake_dependent_roles = 1))) - (expect_error(default_recipe_blueprint(bake_dependent_roles = c("outcome", "x")))) - (expect_error(default_recipe_blueprint(bake_dependent_roles = c("predictor", "x")))) - }) -}) - test_that("Missing y value returns a 0 column tibble for `outcomes`", { sparse_bp <- default_recipe_blueprint(composition = "dgCMatrix") rec <- recipes::recipe(~Sepal.Width, data = iris) @@ -355,7 +412,7 @@ test_that("Missing y value returns a 0 column / 0 row tibble for `ptype`", { expect_equal(x2$blueprint$ptypes$outcomes, tibble()) }) -test_that("`mold()` is compatible with hardhat 0.2.0 blueprints that don't have `bake_dependent_roles`", { +test_that("`mold()` is compatible with hardhat 0.2.0 blueprints", { path <- test_path("data", "hardhat-0.2.0-pre-mold-recipe.rds") object <- readRDS(path) @@ -367,34 +424,6 @@ test_that("`mold()` is compatible with hardhat 0.2.0 blueprints that don't have out <- mold(rec, data = data, blueprint = blueprint) - expect_identical(out$blueprint$bake_dependent_roles, character()) - expect <- tibble::tibble(x = data$x, z = 1) expect_identical(out$predictors, expect) }) - -test_that("`patch_recipe_default_blueprint()` patches `bake_dependent_roles` on pre hardhat 1.0.0 blueprints that haven't gone through `mold()`", { - path <- test_path("data", "hardhat-0.2.0-pre-mold-recipe.rds") - object <- readRDS(path) - - blueprint <- object$blueprint - - expect_null(blueprint$bake_dependent_roles) - - blueprint <- patch_recipe_default_blueprint(blueprint) - - expect_identical(blueprint$bake_dependent_roles, character()) -}) - -test_that("`patch_recipe_default_blueprint()` patches `bake_dependent_roles` on pre hardhat 1.0.0 blueprints that have gone through `mold()`", { - path <- test_path("data", "hardhat-0.2.0-post-mold-recipe.rds") - object <- readRDS(path) - - blueprint <- object$blueprint - - expect_null(blueprint$bake_dependent_roles) - - blueprint <- patch_recipe_default_blueprint(blueprint) - - expect_identical(blueprint$bake_dependent_roles, character()) -}) From 521dccd3c4c4083026dac4ed1b7284ed4e297f0c Mon Sep 17 00:00:00 2001 From: DavisVaughan Date: Mon, 27 Jun 2022 10:56:29 -0400 Subject: [PATCH 2/4] NEWS bullet --- NEWS.md | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/NEWS.md b/NEWS.md index b4f718f1..1440d2e4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,30 @@ # hardhat (development version) +* We have reverted the change made in hardhat 1.0.0 that caused recipe + preprocessors to drop non-standard roles by default when calling `forge()`. + Determining what roles are required at `bake()` time is really something + that should be controlled within recipes, not hardhat. This results in the + following changes (#207): + + * The new argument, `bake_dependent_roles`, that was added to + `default_recipe_blueprint()` in 1.0.0 has been removed. It is no longer + needed with the new behavior. + + * By default, `forge()` will pass on all columns from `new_data` to `bake()` + except those with roles of `"outcome"` or `"case_weights"`. With + `outcomes = TRUE`, it will also pass on the `"outcome"` role. This is + essentially the same as the pre-1.0.0 behavior, and means that, by default, + all non-standard roles are required at `bake()` time. This assumption is + now also enforced by recipes 1.0.0, even if you aren't using hardhat or + a workflow. + + * In the development version of recipes, which will become recipes 1.0.0, + there is a new `update_role_requirements()` function that can be used to + declare that a role is not required at `bake()` time. hardhat now knows how + to respect that feature, and in `forge()` it won't pass on columns of + `new_data` to `bake()` that have roles that aren't required at `bake()` + time. + # hardhat 1.1.0 * Fixed a bug where the results from calling `mold()` using hardhat < 1.0.0 were From 01826e63c52b3dc99ee8b37f4d8b1bbbf20f04c5 Mon Sep 17 00:00:00 2001 From: DavisVaughan Date: Mon, 27 Jun 2022 10:56:56 -0400 Subject: [PATCH 3/4] Add dev recipes workflow to test against recipes 1.0.0 role behavior --- .../workflows/R-CMD-check-dev-recipes.yaml | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 .github/workflows/R-CMD-check-dev-recipes.yaml diff --git a/.github/workflows/R-CMD-check-dev-recipes.yaml b/.github/workflows/R-CMD-check-dev-recipes.yaml new file mode 100644 index 00000000..8f5e7dbc --- /dev/null +++ b/.github/workflows/R-CMD-check-dev-recipes.yaml @@ -0,0 +1,59 @@ +# NOTE: Used to check against dev recipes until recipes 1.0.0 is released. +# Needed to ensure that hardhat's special recipes 1.0.0 +# `update_role_requirements()` support has correct behavior in pre and post +# recipes 1.0.0. After recipes 1.0.0 is release, we can: +# - Remove this workflow +# - Remove `skip_if_no_update_role_requirements()` +# - Put `recipes (>= 1.0.0)` in Suggests + +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples +# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help +# +# NOTE: This workflow is overkill for most R packages and +# check-standard.yaml is likely a better choice. +# usethis::use_github_action("check-standard") will install it. +on: + push: + branches: [main, master] + pull_request: + branches: [main, master] + +name: R-CMD-check-dev-recipes + +jobs: + R-CMD-check: + runs-on: ${{ matrix.config.os }} + + name: ${{ matrix.config.os }} (${{ matrix.config.r }}) + + strategy: + fail-fast: false + matrix: + config: + - {os: macOS-latest, r: 'release'} + - {os: windows-latest, r: 'release'} + - {os: ubuntu-18.04, r: 'devel', http-user-agent: 'release'} + + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + R_KEEP_PKG_SOURCE: yes + + steps: + - uses: actions/checkout@v2 + + - uses: r-lib/actions/setup-pandoc@v2 + + - uses: r-lib/actions/setup-r@v2 + with: + r-version: ${{ matrix.config.r }} + http-user-agent: ${{ matrix.config.http-user-agent }} + use-public-rspm: true + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::rcmdcheck, tidymodels/recipes#1011 + needs: check + + - uses: r-lib/actions/check-r-package@v2 + with: + upload-snapshots: true From 1962182248f7d730b86da286652df3af6c6598f3 Mon Sep 17 00:00:00 2001 From: DavisVaughan Date: Tue, 28 Jun 2022 16:29:29 -0400 Subject: [PATCH 4/4] Refactor to use `compute_bake_role_requirements()` --- R/blueprint-recipe-default.R | 55 ++++++++++++++++++++++++++++-------- 1 file changed, 43 insertions(+), 12 deletions(-) diff --git a/R/blueprint-recipe-default.R b/R/blueprint-recipe-default.R index 7dc841f3..f026f72e 100644 --- a/R/blueprint-recipe-default.R +++ b/R/blueprint-recipe-default.R @@ -539,21 +539,19 @@ get_original_outcome_ptype <- function(rec, data) { get_extra_role_columns_original <- function(rec, data) { # Extra roles that existed before `prep()` has been called. # To get "extra" roles that are required at bake time: - # - Start with the roles that actually exist in `data` - # - Remove the ones that aren't required at bake time + # - Compute the bake role requirements named logical vector. + # It has information about every role in the original data. + # - Subset that vector to only `TRUE` locations, where the role is required # - Remove the `"predictor"` role (it is always required, but isn't "extra") info_type <- "var_info" - data_roles <- rec[[info_type]][["role"]] - data_roles <- chr_explicit_na(data_roles) - data_roles <- unique(data_roles) + requirements <- compute_bake_role_requirements(rec) - required_at_bake <- get_bake_role_requirements(rec) - not_required_at_bake <- required_at_bake[!required_at_bake] - not_required_at_bake_roles <- names(not_required_at_bake) + # Filter down to the roles that are actually required + requirements <- requirements[requirements] + requirement_roles <- names(requirements) - required_roles <- setdiff(data_roles, not_required_at_bake_roles) - extra_roles <- setdiff(required_roles, "predictor") + extra_roles <- setdiff(requirement_roles, "predictor") get_extra_role_columns(rec, data, extra_roles, info_type) } @@ -603,15 +601,25 @@ get_extra_role_columns <- function(rec, data, extra_roles, info_type) { # ------------------------------------------------------------------------------ +new_role_requirements <- function() { + # recipes:::new_role_requirements() + list( + bake = new_bake_role_requirements() + ) +} get_role_requirements <- function(recipe) { # recipes:::get_role_requirements() - recipe$requirements %||% set_names(list(), nm = character()) + recipe$requirements %||% new_role_requirements() } +new_bake_role_requirements <- function() { + # recipes:::new_bake_role_requirements() + set_names(logical(), nms = character()) +} get_bake_role_requirements <- function(recipe) { # recipes:::get_bake_role_requirements() requirements <- get_role_requirements(recipe) - requirements$bake %||% default_bake_role_requirements() + requirements$bake } default_bake_role_requirements <- function() { # recipes:::default_bake_role_requirements() @@ -622,6 +630,29 @@ default_bake_role_requirements <- function() { "NA" = TRUE ) } +compute_bake_role_requirements <- function(recipe) { + # recipes:::compute_bake_role_requirements() + var_info <- recipe$var_info + var_roles <- var_info$role + var_roles <- chr_explicit_na(var_roles) + var_roles <- unique(var_roles) + + # Start with default requirements + requirements <- default_bake_role_requirements() + + # Drop unused default requirements + requirements <- requirements[names(requirements) %in% var_roles] + + # Update with nonstandard roles in the recipe, which are required by default + nonstandard_roles <- var_roles[!var_roles %in% names(requirements)] + requirements[nonstandard_roles] <- TRUE + + # Override with `update_role_requirements()` changes + user_requirements <- get_bake_role_requirements(recipe) + requirements[names(user_requirements)] <- user_requirements + + requirements +} chr_explicit_na <- function(x) { # recipes:::chr_explicit_na()