From 6c5482a3828a4b86e5113e41519a356f89debaef Mon Sep 17 00:00:00 2001 From: "Simon P. Couch" Date: Wed, 17 Aug 2022 19:32:18 -0400 Subject: [PATCH] patch `params` argument with `xgboost` engine in `boost_tree()` (#787) * patch `params` argument with `xgboost` engine in `boost_tree()` * remove + add snapshots from previous PRs * update snaps with new help-page reference Co-authored-by: Max Kuhn --- NEWS.md | 3 + R/boost_tree.R | 89 ++++++++++++------ man/details_boost_tree_xgboost.Rd | 51 ++++++++++- man/rmd/boost_tree_xgboost.Rmd | 24 ++++- man/rmd/boost_tree_xgboost.md | 44 ++++++++- man/xgb_train.Rd | 5 - tests/testthat/_snaps/boost_tree_xgboost.md | 23 +++++ tests/testthat/_snaps/proportional_hazards.md | 14 --- tests/testthat/_snaps/translate.md | 50 ++++++++++ tests/testthat/test_boost_tree_xgboost.R | 91 +++++++++++++++++++ 10 files changed, 342 insertions(+), 52 deletions(-) create mode 100644 tests/testthat/_snaps/boost_tree_xgboost.md diff --git a/NEWS.md b/NEWS.md index 813baabd7..cfba75b31 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,8 @@ # parsnip (development version) + +* Enabled passing additional engine arguments with the xgboost `boost_tree()` engine. To supply engine-specific arguments that are documented in `xgboost::xgb.train()` as arguments to be passed via `params`, supply the list elements directly as named arguments to `set_engine()`. Read more in `?details_boost_tree_xgboost` (#787). + # parsnip 1.0.0 ## Model Specification Changes diff --git a/R/boost_tree.R b/R/boost_tree.R index cf12e5dea..cce9d1ccd 100644 --- a/R/boost_tree.R +++ b/R/boost_tree.R @@ -213,9 +213,6 @@ check_args.boost_tree <- function(object) { #' @param counts A logical. If `FALSE`, `colsample_bynode` and #' `colsample_bytree` are both assumed to be _proportions_ of the proportion of #' columns affects (instead of counts). -#' @param objective A single string (or NULL) that defines the loss function that -#' `xgboost` uses to create trees. See [xgboost::xgb.train()] for options. If left -#' NULL, an appropriate loss function is chosen. #' @param event_level For binary classification, this is a single string of either #' `"first"` or `"second"` to pass along describing which level of the outcome #' should be considered the "event". @@ -227,7 +224,7 @@ xgb_train <- function( x, y, weights = NULL, max_depth = 6, nrounds = 15, eta = 0.3, colsample_bynode = NULL, colsample_bytree = NULL, min_child_weight = 1, gamma = 0, subsample = 1, - validation = 0, early_stop = NULL, objective = NULL, counts = TRUE, + validation = 0, early_stop = NULL, counts = TRUE, event_level = c("first", "second"), ...) 
{ event_level <- rlang::arg_match(event_level, c("first", "second")) @@ -248,18 +245,6 @@ xgb_train <- function( } } - if (is.null(objective)) { - if (is.numeric(y)) { - objective <- "reg:squarederror" - } else { - if (num_class == 2) { - objective <- "binary:logistic" - } else { - objective <- "multi:softprob" - } - } - } - n <- nrow(x) p <- ncol(x) @@ -300,35 +285,79 @@ xgb_train <- function( colsample_bytree = colsample_bytree, colsample_bynode = colsample_bynode, min_child_weight = min(min_child_weight, n), - subsample = subsample, - objective = objective + subsample = subsample ) - main_args <- list( - data = quote(x$data), - watchlist = quote(x$watchlist), - params = arg_list, - nrounds = nrounds, - early_stopping_rounds = early_stop + others <- process_others(others, arg_list) + + main_args <- c( + list( + data = quote(x$data), + watchlist = quote(x$watchlist), + params = arg_list, + nrounds = nrounds, + early_stopping_rounds = early_stop + ), + others ) + + if (is.null(main_args$objective)) { + if (is.numeric(y)) { + main_args$objective <- "reg:squarederror" + } else { + if (num_class == 2) { + main_args$objective <- "binary:logistic" + } else { + main_args$objective <- "multi:softprob" + } + } + } + if (!is.null(num_class) && num_class > 2) { main_args$num_class <- num_class } call <- make_call(fun = "xgb.train", ns = "xgboost", main_args) - # override or add some other args + eval_tidy(call, env = current_env()) +} + +process_others <- function(others, arg_list) { + guarded <- c("data", "weights", "num_class", "watchlist") + guarded_supplied <- names(others)[names(others) %in% guarded] + + if (length(guarded_supplied) > 0) { + cli::cli_warn( + c( + "!" = "{cli::qty(guarded_supplied)} The argument{?s} {.arg {guarded_supplied}} \ + {?is/are} guarded by parsnip and will not be passed to {.fun xgb.train}." + ), + class = "xgboost_guarded_warning" + ) + } others <- - others[!(names(others) %in% c("data", "weights", "nrounds", "num_class", names(arg_list)))] + others[!(names(others) %in% guarded)] + + if (!is.null(others$params)) { + cli::cli_warn( + c( + "!" = "Please supply elements of the `params` list argument as main arguments \ + to `set_engine()` rather than as part of `params`.", + "i" = "See `?details_boost_tree_xgboost` for more information." + ), + class = "xgboost_params_warning" + ) + + params <- others$params[!names(others$params) %in% names(arg_list)] + others <- c(others[names(others) != "params"], params) + } + if (!(any(names(others) == "verbose"))) { others$verbose <- 0 } - if (length(others) > 0) { - call <- rlang::call_modify(call, !!!others) - } - eval_tidy(call, env = current_env()) + others } recalc_param <- function(x, counts, denom) { diff --git a/man/details_boost_tree_xgboost.Rd b/man/details_boost_tree_xgboost.Rd index 762a5450a..ab05e46bd 100644 --- a/man/details_boost_tree_xgboost.Rd +++ b/man/details_boost_tree_xgboost.Rd @@ -120,6 +120,51 @@ training process. } \subsection{Other details}{ +\subsection{Interfacing with the \code{params} argument}{ + +The xgboost function that parsnip indirectly wraps, +\code{\link[xgboost:xgb.train]{xgboost::xgb.train()}}, takes most arguments via +the \code{params} list argument. To supply engine-specific arguments that are +documented in \code{\link[xgboost:xgb.train]{xgboost::xgb.train()}} as +arguments to be passed via \code{params}, supply the list elements directly +as named arguments to \code{\link[=set_engine]{set_engine()}} rather than as +elements in \code{params}. 
For example, pass a non-default evaluation metric
+like this:
+
+\if{html}{\out{<div class="sourceCode r">}}\preformatted{# good
+boost_tree() \%>\%
+  set_engine("xgboost", eval_metric = "mae")
+}\if{html}{\out{</div>}}
+
+\if{html}{\out{<div class="sourceCode">}}\preformatted{## Boosted Tree Model Specification (unknown)
+##
+## Engine-Specific Arguments:
+##   eval_metric = mae
+##
+## Computational engine: xgboost
+}\if{html}{\out{</div>}}
+
+…rather than this:
+
+\if{html}{\out{<div class="sourceCode r">}}\preformatted{# bad
+boost_tree() \%>\%
+  set_engine("xgboost", params = list(eval_metric = "mae"))
+}\if{html}{\out{</div>}}
+
+\if{html}{\out{<div class="sourceCode">}}\preformatted{## Boosted Tree Model Specification (unknown)
+##
+## Engine-Specific Arguments:
+##   params = list(eval_metric = "mae")
+##
+## Computational engine: xgboost
+}\if{html}{\out{</div>
}} + +parsnip will then route arguments as needed. In the case that arguments +are passed to \code{params} via \code{\link[=set_engine]{set_engine()}}, parsnip will +warn and re-route the arguments as needed. Note, though, that arguments +passed to \code{params} cannot be tuned. +} + \subsection{Sparse matrices}{ xgboost requires the data to be in a sparse format. If your predictor @@ -182,13 +227,17 @@ performance (and stopping early). If the model specification has \code{early_stop >= trees}, \code{early_stop} is converted to \code{trees - 1} and a warning is issued. + +Note that, since the \code{validation} argument provides an alternative +interface to \code{watchlist}, the \code{watchlist} argument is guarded by parsnip +and will be ignored (with a warning) if passed. } \subsection{Objective function}{ parsnip chooses the objective function based on the characteristics of the outcome. To use a different loss, pass the \code{objective} argument to -\code{\link[=set_engine]{set_engine()}}. +\code{\link[=set_engine]{set_engine()}} directly. } } diff --git a/man/rmd/boost_tree_xgboost.Rmd b/man/rmd/boost_tree_xgboost.Rmd index c764f9da5..9394aad07 100644 --- a/man/rmd/boost_tree_xgboost.Rmd +++ b/man/rmd/boost_tree_xgboost.Rmd @@ -60,6 +60,26 @@ For classification, non-numeric outcomes (i.e., factors) are internally converte ## Other details +### Interfacing with the `params` argument + +The xgboost function that parsnip indirectly wraps, [xgboost::xgb.train()], takes most arguments via the `params` list argument. To supply engine-specific arguments that are documented in [xgboost::xgb.train()] as arguments to be passed via `params`, supply the list elements directly as named arguments to [set_engine()] rather than as elements in `params`. For example, pass a non-default evaluation metric like this: + +```{r} +# good +boost_tree() %>% + set_engine("xgboost", eval_metric = "mae") +``` + +...rather than this: + +```{r} +# bad +boost_tree() %>% + set_engine("xgboost", params = list(eval_metric = "mae")) +``` + +parsnip will then route arguments as needed. In the case that arguments are passed to `params` via [set_engine()], parsnip will warn and re-route the arguments as needed. Note, though, that arguments passed to `params` cannot be tuned. + ### Sparse matrices xgboost requires the data to be in a sparse format. If your predictor data are already in this format, then use [fit_xy.model_spec()] to pass it to the model function. Otherwise, parsnip converts the data to this format. @@ -78,9 +98,11 @@ By default, the model is trained without parallel processing. This can be change ```{r child = "template-early-stopping.Rmd"} ``` +Note that, since the `validation` argument provides an alternative interface to `watchlist`, the `watchlist` argument is guarded by parsnip and will be ignored (with a warning) if passed. + ### Objective function -parsnip chooses the objective function based on the characteristics of the outcome. To use a different loss, pass the `objective` argument to [set_engine()]. +parsnip chooses the objective function based on the characteristics of the outcome. To use a different loss, pass the `objective` argument to [set_engine()] directly. 
## Examples diff --git a/man/rmd/boost_tree_xgboost.md b/man/rmd/boost_tree_xgboost.md index 2b6cf2414..a6eb52d39 100644 --- a/man/rmd/boost_tree_xgboost.md +++ b/man/rmd/boost_tree_xgboost.md @@ -109,6 +109,46 @@ For classification, non-numeric outcomes (i.e., factors) are internally converte ## Other details +### Interfacing with the `params` argument + +The xgboost function that parsnip indirectly wraps, [xgboost::xgb.train()], takes most arguments via the `params` list argument. To supply engine-specific arguments that are documented in [xgboost::xgb.train()] as arguments to be passed via `params`, supply the list elements directly as named arguments to [set_engine()] rather than as elements in `params`. For example, pass a non-default evaluation metric like this: + + +```r +# good +boost_tree() %>% + set_engine("xgboost", eval_metric = "mae") +``` + +``` +## Boosted Tree Model Specification (unknown) +## +## Engine-Specific Arguments: +## eval_metric = mae +## +## Computational engine: xgboost +``` + +...rather than this: + + +```r +# bad +boost_tree() %>% + set_engine("xgboost", params = list(eval_metric = "mae")) +``` + +``` +## Boosted Tree Model Specification (unknown) +## +## Engine-Specific Arguments: +## params = list(eval_metric = "mae") +## +## Computational engine: xgboost +``` + +parsnip will then route arguments as needed. In the case that arguments are passed to `params` via [set_engine()], parsnip will warn and re-route the arguments as needed. Note, though, that arguments passed to `params` cannot be tuned. + ### Sparse matrices xgboost requires the data to be in a sparse format. If your predictor data are already in this format, then use [fit_xy.model_spec()] to pass it to the model function. Otherwise, parsnip converts the data to this format. @@ -137,9 +177,11 @@ The best way to use this feature is in conjunction with an _internal validation If the model specification has `early_stop >= trees`, `early_stop` is converted to `trees - 1` and a warning is issued. +Note that, since the `validation` argument provides an alternative interface to `watchlist`, the `watchlist` argument is guarded by parsnip and will be ignored (with a warning) if passed. + ### Objective function -parsnip chooses the objective function based on the characteristics of the outcome. To use a different loss, pass the `objective` argument to [set_engine()]. +parsnip chooses the objective function based on the characteristics of the outcome. To use a different loss, pass the `objective` argument to [set_engine()] directly. ## Examples diff --git a/man/xgb_train.Rd b/man/xgb_train.Rd index 204bd8f7b..d18340d4c 100644 --- a/man/xgb_train.Rd +++ b/man/xgb_train.Rd @@ -19,7 +19,6 @@ xgb_train( subsample = 1, validation = 0, early_stop = NULL, - objective = NULL, counts = TRUE, event_level = c("first", "second"), ... @@ -62,10 +61,6 @@ training iterations without improvement before stopping. If \code{validation} is used, performance is base on the validation set; otherwise, the training set is used.} -\item{objective}{A single string (or NULL) that defines the loss function that -\code{xgboost} uses to create trees. See \code{\link[xgboost:xgb.train]{xgboost::xgb.train()}} for options. If left -NULL, an appropriate loss function is chosen.} - \item{counts}{A logical. 
If \code{FALSE}, \code{colsample_bynode} and \code{colsample_bytree} are both assumed to be \emph{proportions} of the proportion of columns affects (instead of counts).} diff --git a/tests/testthat/_snaps/boost_tree_xgboost.md b/tests/testthat/_snaps/boost_tree_xgboost.md new file mode 100644 index 000000000..32d22a14f --- /dev/null +++ b/tests/testthat/_snaps/boost_tree_xgboost.md @@ -0,0 +1,23 @@ +# interface to param arguments + + ! Please supply elements of the `params` list argument as main arguments to `set_engine()` rather than as part of `params`. + i See `?details_boost_tree_xgboost` for more information. + +--- + + ! Please supply elements of the `params` list argument as main arguments to `set_engine()` rather than as part of `params`. + i See `?details_boost_tree_xgboost` for more information. + +--- + + ! The argument `watchlist` is guarded by parsnip and will not be passed to `xgb.train()`. + +--- + + ! The arguments `watchlist` and `data` are guarded by parsnip and will not be passed to `xgb.train()`. + +--- + + ! Please supply elements of the `params` list argument as main arguments to `set_engine()` rather than as part of `params`. + i See `?details_boost_tree_xgboost` for more information. + diff --git a/tests/testthat/_snaps/proportional_hazards.md b/tests/testthat/_snaps/proportional_hazards.md index dc6cd3515..71b989cdc 100644 --- a/tests/testthat/_snaps/proportional_hazards.md +++ b/tests/testthat/_snaps/proportional_hazards.md @@ -1,17 +1,3 @@ -# printing - - Code - proportional_hazards() - Message - parsnip could not locate an implementation for `proportional_hazards` censored regression model specifications using the `survival` engine. - i The parsnip extension package censored implements support for this specification. - i Please install (if needed) and load to continue. - Output - Proportional Hazards Model Specification (censored regression) - - Computational engine: survival - - # updating Code diff --git a/tests/testthat/_snaps/translate.md b/tests/testthat/_snaps/translate.md index ea638365f..2e9bd8682 100644 --- a/tests/testthat/_snaps/translate.md +++ b/tests/testthat/_snaps/translate.md @@ -2099,3 +2099,53 @@ list(sigma = ~0.2) +# translate tuning paramter names + + Code + .model_param_name_key(mod) + Output + # A tibble: 2 x 3 + user parsnip engine + + 1 number of trees trees nrounds + 2 min_n min_n min_child_weight + +--- + + Code + .model_param_name_key(mod, as_tibble = FALSE) + Output + $user_to_parsnip + trees min_n + "number of trees" "min_n" + + $parsnip_to_engine + nrounds min_child_weight + "trees" "min_n" + + +--- + + Code + .model_param_name_key(linear_reg()) + Output + # A tibble: 0 x 3 + # ... with 3 variables: user , parsnip , engine + # i Use `colnames()` to see all variable names + +--- + + Code + .model_param_name_key(linear_reg(), as_tibble = FALSE) + Output + $user_to_parsnip + named character(0) + + $parsnip_to_engine + named character(0) + + +--- + + 'object' should be a model specification or workflow. 
+
diff --git a/tests/testthat/test_boost_tree_xgboost.R b/tests/testthat/test_boost_tree_xgboost.R
index 690a8a0a1..5adde2957 100644
--- a/tests/testthat/test_boost_tree_xgboost.R
+++ b/tests/testthat/test_boost_tree_xgboost.R
@@ -598,4 +598,95 @@ test_that("count/proportion parameters", {
 })
 
+
+test_that('interface to param arguments', {
+  skip_if_not_installed("xgboost")
+
+  ctrl$verbosity <- 0L
+
+  # define base model spec
+  spec_base <-
+    boost_tree() %>%
+    set_mode("regression")
+
+  # pass params to params argument (bad)
+  spec_1 <-
+    spec_base %>%
+    set_engine("xgboost", params = list(eval_metric = "mae"))
+
+  expect_snapshot_warning(
+    fit_1 <- spec_1 %>% fit(mpg ~ ., data = mtcars),
+    class = "xgboost_params_warning"
+  )
+
+  expect_equal(extract_fit_engine(fit_1)$params$eval_metric, "mae")
+
+  # pass params as main argument (good)
+  spec_2 <-
+    spec_base %>%
+    set_engine("xgboost", eval_metric = "mae")
+
+  expect_silent(
+    fit_2 <- spec_2 %>% fit(mpg ~ ., data = mtcars)
+  )
+
+  expect_equal(extract_fit_engine(fit_2)$params$eval_metric, "mae")
+
+  # pass objective to params argument (bad)
+  spec_3 <-
+    spec_base %>%
+    set_engine("xgboost", params = list(objective = "reg:pseudohubererror"))
+
+  expect_snapshot_warning(
+    fit_3 <- spec_3 %>% fit(mpg ~ ., data = mtcars),
+    class = "xgboost_params_warning"
+  )
+
+  expect_equal(extract_fit_engine(fit_3)$params$objective, "reg:pseudohubererror")
+
+  # pass objective as main argument (good)
+  spec_4 <-
+    spec_base %>%
+    set_engine("xgboost", objective = "reg:pseudohubererror")
+
+  expect_silent(
+    fit_4 <- spec_4 %>% fit(mpg ~ ., data = mtcars)
+  )
+
+  expect_equal(extract_fit_engine(fit_4)$params$objective, "reg:pseudohubererror")
+
+  # pass a guarded argument as a main argument (bad)
+  spec_5 <-
+    spec_base %>%
+    set_engine("xgboost", watchlist = "boop")
+
+  expect_snapshot_warning(
+    fit_5 <- spec_5 %>% fit(mpg ~ ., data = mtcars),
+    class = "xgboost_guarded_warning"
+  )
+
+  expect_null(extract_fit_engine(fit_5)$params$watchlist)
+
+  # pass two guarded arguments as main arguments (bad)
+  spec_6 <-
+    spec_base %>%
+    set_engine("xgboost", watchlist = "boop", data = "beep")
+
+  expect_snapshot_warning(
+    fit_6 <- spec_6 %>% fit(mpg ~ ., data = mtcars),
+    class = "xgboost_guarded_warning"
+  )
+
+  expect_null(extract_fit_engine(fit_6)$params$watchlist)
+
+  # pass an argument that parsnip already specifies, via `params` (bad)
+  spec_7 <-
+    spec_base %>%
+    set_engine("xgboost", params = list(gamma = 0.1))
+
+  expect_snapshot_warning(
+    fit_7 <- spec_7 %>% fit(mpg ~ ., data = mtcars),
+    class = "xgboost_params_warning"
+  )
+
+  expect_equal(extract_fit_engine(fit_7)$params$gamma, 0)
+})
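
For reference, here is a minimal usage sketch of the patched interface. It is not part of the patch above; it assumes a parsnip version containing this change plus xgboost are installed, uses the built-in `mtcars` data, and the `trees` value and metric choice are arbitrary.

```r
library(parsnip)

# Recommended: supply xgb.train()'s `params` elements as named engine arguments.
spec <- boost_tree(trees = 20) %>%
  set_mode("regression") %>%
  set_engine("xgboost", eval_metric = "mae")

# translate() shows how parsnip routes the engine argument into the fit call.
translate(spec)

# After fitting, the routed value appears in the engine object's `params` slot.
fit_good <- fit(spec, mpg ~ ., data = mtcars)
extract_fit_engine(fit_good)$params$eval_metric

# Discouraged: wrapping the argument in `params` still fits, but parsnip warns
# (the `xgboost_params_warning` snapshotted above) and re-routes it, and
# arguments passed this way cannot be tuned.
spec_bad <- boost_tree(trees = 20) %>%
  set_mode("regression") %>%
  set_engine("xgboost", params = list(eval_metric = "mae"))
fit_bad <- fit(spec_bad, mpg ~ ., data = mtcars)
extract_fit_engine(fit_bad)$params$eval_metric
```

Both fits end up with `eval_metric = "mae"` in the engine's `params`; the difference is only whether parsnip has to warn and re-route, which is why the docs steer users toward the named-argument form.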