From cc083f695f8523ea1695699b12c8ee2d95a5ae01 Mon Sep 17 00:00:00 2001 From: DavisVaughan Date: Fri, 24 Mar 2023 10:14:35 -0400 Subject: [PATCH 1/3] Implement `recompose()` --- NAMESPACE | 1 + R/blueprint-formula-default.R | 4 +- R/blueprint-recipe-default.R | 4 +- R/blueprint-xy-default.R | 4 +- R/recompose.R | 119 ++++++++++++++++--------- man/recompose.Rd | 45 ++++++++++ tests/testthat/_snaps/forge-formula.md | 9 ++ tests/testthat/_snaps/recompose.md | 53 +++++++++++ tests/testthat/test-forge-formula.R | 8 +- tests/testthat/test-recompose.R | 66 ++++++++++++++ 10 files changed, 260 insertions(+), 53 deletions(-) create mode 100644 man/recompose.Rd create mode 100644 tests/testthat/_snaps/forge-formula.md create mode 100644 tests/testthat/_snaps/recompose.md create mode 100644 tests/testthat/test-recompose.R diff --git a/NAMESPACE b/NAMESPACE index cfb93aff..7a5e321f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -96,6 +96,7 @@ export(new_importance_weights) export(new_model) export(new_recipe_blueprint) export(new_xy_blueprint) +export(recompose) export(refresh_blueprint) export(run_forge) export(run_mold) diff --git a/R/blueprint-formula-default.R b/R/blueprint-formula-default.R index 406bcdcf..ea9a557f 100644 --- a/R/blueprint-formula-default.R +++ b/R/blueprint-formula-default.R @@ -552,7 +552,7 @@ mold_formula_default_process_predictors <- function(blueprint, data) { terms <- simplify_terms(framed$terms) - predictors <- recompose(predictors, blueprint$composition) + predictors <- recompose(predictors, composition = blueprint$composition) blueprint_terms <- blueprint$terms blueprint_terms$predictors <- terms @@ -704,7 +704,7 @@ forge_formula_default_process_predictors <- function(blueprint, predictors) { data <- reattach_factorish_columns(data, predictors, factorish_names) } - data <- recompose(data, blueprint$composition) + data <- recompose(data, composition = blueprint$composition) offset <- extract_offset(framed$terms, framed$data) diff --git a/R/blueprint-recipe-default.R b/R/blueprint-recipe-default.R index 17041218..05259151 100644 --- a/R/blueprint-recipe-default.R +++ b/R/blueprint-recipe-default.R @@ -266,7 +266,7 @@ mold_recipe_default_process_predictors <- function(blueprint, data) { predictors <- maybe_add_intercept_column(predictors, blueprint$intercept) - predictors <- recompose(predictors, blueprint$composition) + predictors <- recompose(predictors, composition = blueprint$composition) ptype <- get_original_predictor_ptype(blueprint$recipe, data) @@ -477,7 +477,7 @@ forge_recipe_default_process <- function(blueprint, predictors, outcomes, extras forge_recipe_default_process_predictors <- function(blueprint, predictors) { predictors <- maybe_add_intercept_column(predictors, blueprint$intercept) - predictors <- recompose(predictors, blueprint$composition) + predictors <- recompose(predictors, composition = blueprint$composition) new_forge_process_terms( blueprint = blueprint, diff --git a/R/blueprint-xy-default.R b/R/blueprint-xy-default.R index 2d4ae06b..a8e364cf 100644 --- a/R/blueprint-xy-default.R +++ b/R/blueprint-xy-default.R @@ -255,7 +255,7 @@ mold_xy_default_process_predictors <- function(blueprint, x) { x <- maybe_add_intercept_column(x, blueprint$intercept) - x <- recompose(x, blueprint$composition) + x <- recompose(x, composition = blueprint$composition) new_mold_process_terms( blueprint = blueprint, @@ -356,7 +356,7 @@ forge_xy_default_process <- function(blueprint, predictors, outcomes, extras) { forge_xy_default_process_predictors <- function(blueprint, predictors) { predictors <- maybe_add_intercept_column(predictors, blueprint$intercept) - predictors <- recompose(predictors, blueprint$composition) + predictors <- recompose(predictors, composition = blueprint$composition) new_forge_process_terms( blueprint = blueprint, diff --git a/R/recompose.R b/R/recompose.R index 4deb0a41..e6fa1c48 100644 --- a/R/recompose.R +++ b/R/recompose.R @@ -1,50 +1,83 @@ -# This is the same as the "recomposition" at the end of recipes::bake() - -recompose <- function(data, composition) { - if (identical(composition, "tibble")) { - data - } else if (identical(composition, "dgCMatrix")) { - convert_matrix(data, sparse = TRUE) - } else if (identical(composition, "matrix")) { - convert_matrix(data, sparse = FALSE) - } else { - abort("Internal error: Unknown `composition` type.") - } -} +#' Recompose a data frame into another form +#' +#' @description +#' `recompose()` takes a data frame and converts it into one of: +#' - A tibble +#' - A matrix +#' - A sparse matrix (using the Matrix package) +#' +#' This is an internal function used only by hardhat and recipes. +#' +#' @inheritParams rlang::args_dots_empty +#' +#' @param data A data frame. +#' +#' @param composition One of: +#' - `"tibble"` to convert to a tibble. +#' - `"matrix"` to convert to a matrix. All columns must be numeric. +#' - `"dgCMatrix"` to convert to a sparse matrix. All columns must be numeric, +#' and the Matrix package must be installed. +#' +#' @returns +#' The output type is determined from the `composition`. +#' +#' @export +#' @keywords internal +#' +#' @examples +#' df <- vctrs::data_frame(x = 1) +#' +#' recompose(df) +#' recompose(df, composition = "matrix") +#' +#' # All columns must be numeric to convert to a matrix +#' df <- vctrs::data_frame(x = 1, y = "a") +#' try(recompose(df, composition = "matrix")) +recompose <- function(data, + ..., + composition = c("tibble", "matrix", "dgCMatrix")) { + check_dots_empty0(...) + check_data_frame(data) + + composition <- arg_match0( + arg = composition, + values = c("tibble", "matrix", "dgCMatrix") + ) -convert_matrix <- function(x, sparse = TRUE) { - is_num <- vapply(x, is.numeric, logical(1)) - - if (!all(is_num)) { - num_viol <- sum(!is_num) - if (num_viol < 5) { - abort( - paste0( - "Columns (", - paste0("`", names(is_num)[!is_num], "`", collapse = ", "), - ") are not numeric; cannot convert to matrix." - ) - ) - } else { - abort( - paste0( - num_viol, - " columns are not numeric; cannot ", - "convert to matrix." - ) - ) + switch( + composition, + tibble = { + coerce_to_tibble(data) + }, + matrix = { + coerce_to_matrix(data) + }, + dgCMatrix = { + data <- coerce_to_matrix(data) + coerce_to_sparse(data) } - } + ) +} - # At this point, all cols are numeric so we can just use as.matrix() - res <- as.matrix(x) +coerce_to_matrix <- function(data, error_call = caller_env()) { + numeric <- map_lgl(data, is.numeric) - if (sparse) { - if (!is_installed("Matrix")) { - abort("The Matrix package must be installed to use a 'dgCMatrix' `composition`") - } - res <- Matrix::Matrix(res, sparse = TRUE) + if (!all(numeric)) { + loc <- which(!numeric) + loc <- names(data)[loc] + + message <- c( + "{.arg data} must only contain numeric columns.", + i = "These columns aren't numeric: {.str {loc}}." + ) + + cli::cli_abort(message, call = error_call) } - res + as.matrix(data) +} + +coerce_to_sparse <- function(data, error_call = caller_env()) { + check_installed("Matrix", call = error_call) + Matrix::Matrix(data, sparse = TRUE) } diff --git a/man/recompose.Rd b/man/recompose.Rd new file mode 100644 index 00000000..3774c8e0 --- /dev/null +++ b/man/recompose.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/recompose.R +\name{recompose} +\alias{recompose} +\title{Recompose a data frame into another form} +\usage{ +recompose(data, ..., composition = c("tibble", "matrix", "dgCMatrix")) +} +\arguments{ +\item{data}{A data frame.} + +\item{...}{These dots are for future extensions and must be empty.} + +\item{composition}{One of: +\itemize{ +\item \code{"tibble"} to convert to a tibble. +\item \code{"matrix"} to convert to a matrix. All columns must be numeric. +\item \code{"dgCMatrix"} to convert to a sparse matrix. All columns must be numeric, +and the Matrix package must be installed. +}} +} +\value{ +The output type is determined from the \code{composition}. +} +\description{ +\code{recompose()} takes a data frame and converts it into one of: +\itemize{ +\item A tibble +\item A matrix +\item A sparse matrix (using the Matrix package) +} + +This is an internal function used only by hardhat and recipes. +} +\examples{ +df <- vctrs::data_frame(x = 1) + +recompose(df) +recompose(df, composition = "matrix") + +# All columns must be numeric to convert to a matrix +df <- vctrs::data_frame(x = 1, y = "a") +try(recompose(df, composition = "matrix")) +} +\keyword{internal} diff --git a/tests/testthat/_snaps/forge-formula.md b/tests/testthat/_snaps/forge-formula.md new file mode 100644 index 00000000..fcd5d4aa --- /dev/null +++ b/tests/testthat/_snaps/forge-formula.md @@ -0,0 +1,9 @@ +# can be both missing levels and have new levels + + Code + mold(y ~ f, dat, blueprint = bp2) + Condition + Error in `recompose()`: + ! `data` must only contain numeric columns. + i These columns aren't numeric: "f". + diff --git a/tests/testthat/_snaps/recompose.md b/tests/testthat/_snaps/recompose.md new file mode 100644 index 00000000..f135d585 --- /dev/null +++ b/tests/testthat/_snaps/recompose.md @@ -0,0 +1,53 @@ +# columns must be numeric when coercing to matrix + + Code + recompose(df, composition = "matrix") + Condition + Error in `recompose()`: + ! `data` must only contain numeric columns. + i These columns aren't numeric: "y" and "z". + +# columns must be numeric when coercing to sparse matrix + + Code + recompose(df, composition = "dgCMatrix") + Condition + Error in `recompose()`: + ! `data` must only contain numeric columns. + i These columns aren't numeric: "y" and "z". + +# checks for data frame input + + Code + recompose(1) + Condition + Error in `recompose()`: + ! `data` must be a data frame, not the number 1. + +# dots must be empty + + Code + recompose(data.frame(), 1) + Condition + Error in `recompose()`: + ! `...` must be empty. + x Problematic argument: + * ..1 = 1 + i Did you forget to name an argument? + +# validates `composition` + + Code + recompose(data.frame(), composition = "foo") + Condition + Error in `recompose()`: + ! `composition` must be one of "tibble", "matrix", or "dgCMatrix", not "foo". + +--- + + Code + recompose(data.frame(), composition = 1) + Condition + Error in `recompose()`: + ! `composition` must be a string or character vector. + diff --git a/tests/testthat/test-forge-formula.R b/tests/testthat/test-forge-formula.R index 3f36929d..dd965981 100644 --- a/tests/testthat/test-forge-formula.R +++ b/tests/testthat/test-forge-formula.R @@ -744,11 +744,11 @@ test_that("can be both missing levels and have new levels", { bp1 <- default_formula_blueprint(indicators = "none") bp2 <- default_formula_blueprint(indicators = "none", composition = "matrix") + x1 <- mold(y ~ f, dat, blueprint = bp1) - expect_error( - x2 <- mold(y ~ f, dat, blueprint = bp2), - "cannot convert to matrix" - ) + expect_snapshot(error = TRUE, { + mold(y ~ f, dat, blueprint = bp2) + }) # Warning for the extra level expect_warning( diff --git a/tests/testthat/test-recompose.R b/tests/testthat/test-recompose.R new file mode 100644 index 00000000..79f60cbb --- /dev/null +++ b/tests/testthat/test-recompose.R @@ -0,0 +1,66 @@ +test_that("data frames are coerced to tibble", { + expect_identical( + recompose(data.frame(x = 1)), + tibble(x = 1) + ) +}) + +test_that("can coerce to matrix", { + expect_identical( + recompose(data.frame(x = 1), composition = "matrix"), + matrix(1, dimnames = list(NULL, "x")) + ) +}) + +test_that("can coerce to sparse matrix", { + skip_if_not_installed("Matrix") + + expect_identical( + recompose(data.frame(x = 1:2), composition = "dgCMatrix"), + Matrix::Matrix(1:2, dimnames = list(NULL, "x"), sparse = TRUE) + ) +}) + +test_that("non-numeric columns are allowed with tibble output", { + df <- data.frame(x = 1, y = "a", z = "b") + expect_identical(recompose(df), tibble::as_tibble(df)) +}) + +test_that("columns must be numeric when coercing to matrix", { + df <- data.frame(x = 1, y = "a", z = "b") + + expect_snapshot(error = TRUE, { + recompose(df, composition = "matrix") + }) +}) + +test_that("columns must be numeric when coercing to sparse matrix", { + skip_if_not_installed("Matrix") + + df <- data.frame(x = 1, y = "a", z = "b") + + expect_snapshot(error = TRUE, { + recompose(df, composition = "dgCMatrix") + }) +}) + +test_that("checks for data frame input", { + expect_snapshot(error = TRUE, { + recompose(1) + }) +}) + +test_that("dots must be empty", { + expect_snapshot(error = TRUE, { + recompose(data.frame(), 1) + }) +}) + +test_that("validates `composition`", { + expect_snapshot(error = TRUE, { + recompose(data.frame(), composition = "foo") + }) + expect_snapshot(error = TRUE, { + recompose(data.frame(), composition = 1) + }) +}) From cd6899223067fbb675e88f24ce06a509ca2402e5 Mon Sep 17 00:00:00 2001 From: DavisVaughan Date: Fri, 24 Mar 2023 10:21:11 -0400 Subject: [PATCH 2/3] Add support for `"data.frame"` composition Since recipes allows for this. We don't want this in the hardhat blueprints though, as we'd prefer to nudge people towards tibbles, which hardhat tries hard to generate internally. --- R/blueprint.R | 2 ++ R/recompose.R | 9 +++++++-- man/recompose.Rd | 4 +++- tests/testthat/_snaps/recompose.md | 2 +- tests/testthat/test-recompose.R | 12 ++++++++++-- 5 files changed, 23 insertions(+), 6 deletions(-) diff --git a/R/blueprint.R b/R/blueprint.R index 262d8267..9ff51b48 100644 --- a/R/blueprint.R +++ b/R/blueprint.R @@ -308,6 +308,8 @@ check_list <- function(x, } check_composition <- function(composition, error_call = caller_env()) { + # `recompose()` technically also supports `"data.frame"`, + # but that is only for recipes, and we probably don't want that here arg_match0( arg = composition, values = c("tibble", "matrix", "dgCMatrix"), diff --git a/R/recompose.R b/R/recompose.R index e6fa1c48..02ccb530 100644 --- a/R/recompose.R +++ b/R/recompose.R @@ -3,6 +3,7 @@ #' @description #' `recompose()` takes a data frame and converts it into one of: #' - A tibble +#' - A data frame #' - A matrix #' - A sparse matrix (using the Matrix package) #' @@ -14,6 +15,7 @@ #' #' @param composition One of: #' - `"tibble"` to convert to a tibble. +#' - `"data.frame"` to convert to a base data frame. #' - `"matrix"` to convert to a matrix. All columns must be numeric. #' - `"dgCMatrix"` to convert to a sparse matrix. All columns must be numeric, #' and the Matrix package must be installed. @@ -35,13 +37,13 @@ #' try(recompose(df, composition = "matrix")) recompose <- function(data, ..., - composition = c("tibble", "matrix", "dgCMatrix")) { + composition = "tibble") { check_dots_empty0(...) check_data_frame(data) composition <- arg_match0( arg = composition, - values = c("tibble", "matrix", "dgCMatrix") + values = c("tibble", "data.frame", "matrix", "dgCMatrix") ) switch( @@ -49,6 +51,9 @@ recompose <- function(data, tibble = { coerce_to_tibble(data) }, + data.frame = { + new_data_frame(data, n = vec_size(data)) + }, matrix = { coerce_to_matrix(data) }, diff --git a/man/recompose.Rd b/man/recompose.Rd index 3774c8e0..5bf39856 100644 --- a/man/recompose.Rd +++ b/man/recompose.Rd @@ -4,7 +4,7 @@ \alias{recompose} \title{Recompose a data frame into another form} \usage{ -recompose(data, ..., composition = c("tibble", "matrix", "dgCMatrix")) +recompose(data, ..., composition = "tibble") } \arguments{ \item{data}{A data frame.} @@ -14,6 +14,7 @@ recompose(data, ..., composition = c("tibble", "matrix", "dgCMatrix")) \item{composition}{One of: \itemize{ \item \code{"tibble"} to convert to a tibble. +\item \code{"data.frame"} to convert to a base data frame. \item \code{"matrix"} to convert to a matrix. All columns must be numeric. \item \code{"dgCMatrix"} to convert to a sparse matrix. All columns must be numeric, and the Matrix package must be installed. @@ -26,6 +27,7 @@ The output type is determined from the \code{composition}. \code{recompose()} takes a data frame and converts it into one of: \itemize{ \item A tibble +\item A data frame \item A matrix \item A sparse matrix (using the Matrix package) } diff --git a/tests/testthat/_snaps/recompose.md b/tests/testthat/_snaps/recompose.md index f135d585..fe331d48 100644 --- a/tests/testthat/_snaps/recompose.md +++ b/tests/testthat/_snaps/recompose.md @@ -41,7 +41,7 @@ recompose(data.frame(), composition = "foo") Condition Error in `recompose()`: - ! `composition` must be one of "tibble", "matrix", or "dgCMatrix", not "foo". + ! `composition` must be one of "tibble", "data.frame", "matrix", or "dgCMatrix", not "foo". --- diff --git a/tests/testthat/test-recompose.R b/tests/testthat/test-recompose.R index 79f60cbb..c5e2f906 100644 --- a/tests/testthat/test-recompose.R +++ b/tests/testthat/test-recompose.R @@ -1,10 +1,17 @@ -test_that("data frames are coerced to tibble", { +test_that("data frames are coerced to tibble by default", { expect_identical( recompose(data.frame(x = 1)), tibble(x = 1) ) }) +test_that("can coerce to base data frame", { + expect_identical( + recompose(tibble(x = 1), composition = "data.frame"), + data.frame(x = 1) + ) +}) + test_that("can coerce to matrix", { expect_identical( recompose(data.frame(x = 1), composition = "matrix"), @@ -21,9 +28,10 @@ test_that("can coerce to sparse matrix", { ) }) -test_that("non-numeric columns are allowed with tibble output", { +test_that("non-numeric columns are allowed with tibble/data.frame output", { df <- data.frame(x = 1, y = "a", z = "b") expect_identical(recompose(df), tibble::as_tibble(df)) + expect_identical(recompose(df, composition = "data.frame"), df) }) test_that("columns must be numeric when coercing to matrix", { From 1a10167356a62e3529200d671b7c62c9efb9c78c Mon Sep 17 00:00:00 2001 From: DavisVaughan Date: Fri, 24 Mar 2023 10:21:18 -0400 Subject: [PATCH 3/3] NEWS bullet --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index 39a53f8c..a35d02cb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,7 @@ # hardhat (development version) +* New internal `recompose()` helper (#220). + * `default_recipe_blueprint()` has gained a `strings_as_factors` argument, which is passed on to `recipes::prep()` (#212).