From 6c5482a3828a4b86e5113e41519a356f89debaef Mon Sep 17 00:00:00 2001 From: "Simon P. Couch" Date: Wed, 17 Aug 2022 19:32:18 -0400 Subject: [PATCH] patch `params` argument with `xgboost` engine in `boost_tree()` (#787) * patch `params` argument with `xgboost` engine in `boost_tree()` * remove + add snapshots from previous PRs * update snaps with new help-page reference Co-authored-by: Max Kuhn --- NEWS.md | 3 + R/boost_tree.R | 89 ++++++++++++------ man/details_boost_tree_xgboost.Rd | 51 ++++++++++- man/rmd/boost_tree_xgboost.Rmd | 24 ++++- man/rmd/boost_tree_xgboost.md | 44 ++++++++- man/xgb_train.Rd | 5 - tests/testthat/_snaps/boost_tree_xgboost.md | 23 +++++ tests/testthat/_snaps/proportional_hazards.md | 14 --- tests/testthat/_snaps/translate.md | 50 ++++++++++ tests/testthat/test_boost_tree_xgboost.R | 91 +++++++++++++++++++ 10 files changed, 342 insertions(+), 52 deletions(-) create mode 100644 tests/testthat/_snaps/boost_tree_xgboost.md diff --git a/NEWS.md b/NEWS.md index 813baabd7..cfba75b31 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,8 @@ # parsnip (development version) + +* Enabled passing additional engine arguments with the xgboost `boost_tree()` engine. To supply engine-specific arguments that are documented in `xgboost::xgb.train()` as arguments to be passed via `params`, supply the list elements directly as named arguments to `set_engine()`. Read more in `?details_boost_tree_xgboost` (#787). + # parsnip 1.0.0 ## Model Specification Changes diff --git a/R/boost_tree.R b/R/boost_tree.R index cf12e5dea..cce9d1ccd 100644 --- a/R/boost_tree.R +++ b/R/boost_tree.R @@ -213,9 +213,6 @@ check_args.boost_tree <- function(object) { #' @param counts A logical. If `FALSE`, `colsample_bynode` and #' `colsample_bytree` are both assumed to be _proportions_ of the proportion of #' columns affects (instead of counts). -#' @param objective A single string (or NULL) that defines the loss function that -#' `xgboost` uses to create trees. See [xgboost::xgb.train()] for options. If left -#' NULL, an appropriate loss function is chosen. #' @param event_level For binary classification, this is a single string of either #' `"first"` or `"second"` to pass along describing which level of the outcome #' should be considered the "event". @@ -227,7 +224,7 @@ xgb_train <- function( x, y, weights = NULL, max_depth = 6, nrounds = 15, eta = 0.3, colsample_bynode = NULL, colsample_bytree = NULL, min_child_weight = 1, gamma = 0, subsample = 1, - validation = 0, early_stop = NULL, objective = NULL, counts = TRUE, + validation = 0, early_stop = NULL, counts = TRUE, event_level = c("first", "second"), ...) 
{ event_level <- rlang::arg_match(event_level, c("first", "second")) @@ -248,18 +245,6 @@ xgb_train <- function( } } - if (is.null(objective)) { - if (is.numeric(y)) { - objective <- "reg:squarederror" - } else { - if (num_class == 2) { - objective <- "binary:logistic" - } else { - objective <- "multi:softprob" - } - } - } - n <- nrow(x) p <- ncol(x) @@ -300,35 +285,79 @@ xgb_train <- function( colsample_bytree = colsample_bytree, colsample_bynode = colsample_bynode, min_child_weight = min(min_child_weight, n), - subsample = subsample, - objective = objective + subsample = subsample ) - main_args <- list( - data = quote(x$data), - watchlist = quote(x$watchlist), - params = arg_list, - nrounds = nrounds, - early_stopping_rounds = early_stop + others <- process_others(others, arg_list) + + main_args <- c( + list( + data = quote(x$data), + watchlist = quote(x$watchlist), + params = arg_list, + nrounds = nrounds, + early_stopping_rounds = early_stop + ), + others ) + + if (is.null(main_args$objective)) { + if (is.numeric(y)) { + main_args$objective <- "reg:squarederror" + } else { + if (num_class == 2) { + main_args$objective <- "binary:logistic" + } else { + main_args$objective <- "multi:softprob" + } + } + } + if (!is.null(num_class) && num_class > 2) { main_args$num_class <- num_class } call <- make_call(fun = "xgb.train", ns = "xgboost", main_args) - # override or add some other args + eval_tidy(call, env = current_env()) +} + +process_others <- function(others, arg_list) { + guarded <- c("data", "weights", "num_class", "watchlist") + guarded_supplied <- names(others)[names(others) %in% guarded] + + if (length(guarded_supplied) > 0) { + cli::cli_warn( + c( + "!" = "{cli::qty(guarded_supplied)} The argument{?s} {.arg {guarded_supplied}} \ + {?is/are} guarded by parsnip and will not be passed to {.fun xgb.train}." + ), + class = "xgboost_guarded_warning" + ) + } others <- - others[!(names(others) %in% c("data", "weights", "nrounds", "num_class", names(arg_list)))] + others[!(names(others) %in% guarded)] + + if (!is.null(others$params)) { + cli::cli_warn( + c( + "!" = "Please supply elements of the `params` list argument as main arguments \ + to `set_engine()` rather than as part of `params`.", + "i" = "See `?details_boost_tree_xgboost` for more information." + ), + class = "xgboost_params_warning" + ) + + params <- others$params[!names(others$params) %in% names(arg_list)] + others <- c(others[names(others) != "params"], params) + } + if (!(any(names(others) == "verbose"))) { others$verbose <- 0 } - if (length(others) > 0) { - call <- rlang::call_modify(call, !!!others) - } - eval_tidy(call, env = current_env()) + others } recalc_param <- function(x, counts, denom) { diff --git a/man/details_boost_tree_xgboost.Rd b/man/details_boost_tree_xgboost.Rd index 762a5450a..ab05e46bd 100644 --- a/man/details_boost_tree_xgboost.Rd +++ b/man/details_boost_tree_xgboost.Rd @@ -120,6 +120,51 @@ training process. } \subsection{Other details}{ +\subsection{Interfacing with the \code{params} argument}{ + +The xgboost function that parsnip indirectly wraps, +\code{\link[xgboost:xgb.train]{xgboost::xgb.train()}}, takes most arguments via +the \code{params} list argument. To supply engine-specific arguments that are +documented in \code{\link[xgboost:xgb.train]{xgboost::xgb.train()}} as +arguments to be passed via \code{params}, supply the list elements directly +as named arguments to \code{\link[=set_engine]{set_engine()}} rather than as +elements in \code{params}. 
For example, pass a non-default evaluation metric
+like this:
+
+\if{html}{\out{<div class="sourceCode r">}}\preformatted{# good
+boost_tree() \%>\%
+  set_engine("xgboost", eval_metric = "mae")
+}\if{html}{\out{</div>}}
+
+\if{html}{\out{<div class="sourceCode">}}\preformatted{## Boosted Tree Model Specification (unknown)
+##
+## Engine-Specific Arguments:
+##   eval_metric = mae
+##
+## Computational engine: xgboost
+}\if{html}{\out{</div>}}
+
+…rather than this:
+
+\if{html}{\out{<div class="sourceCode r">}}\preformatted{# bad
+boost_tree() \%>\%
+  set_engine("xgboost", params = list(eval_metric = "mae"))
+}\if{html}{\out{</div>}}
+
+\if{html}{\out{<div class="sourceCode">}}\preformatted{## Boosted Tree Model Specification (unknown)
+##
+## Engine-Specific Arguments:
+##   params = list(eval_metric = "mae")
+##
+## Computational engine: xgboost
+}\if{html}{\out{</div>
}} + +parsnip will then route arguments as needed. In the case that arguments +are passed to \code{params} via \code{\link[=set_engine]{set_engine()}}, parsnip will +warn and re-route the arguments as needed. Note, though, that arguments +passed to \code{params} cannot be tuned. +} + \subsection{Sparse matrices}{ xgboost requires the data to be in a sparse format. If your predictor @@ -182,13 +227,17 @@ performance (and stopping early). If the model specification has \code{early_stop >= trees}, \code{early_stop} is converted to \code{trees - 1} and a warning is issued. + +Note that, since the \code{validation} argument provides an alternative +interface to \code{watchlist}, the \code{watchlist} argument is guarded by parsnip +and will be ignored (with a warning) if passed. } \subsection{Objective function}{ parsnip chooses the objective function based on the characteristics of the outcome. To use a different loss, pass the \code{objective} argument to -\code{\link[=set_engine]{set_engine()}}. +\code{\link[=set_engine]{set_engine()}} directly. } } diff --git a/man/rmd/boost_tree_xgboost.Rmd b/man/rmd/boost_tree_xgboost.Rmd index c764f9da5..9394aad07 100644 --- a/man/rmd/boost_tree_xgboost.Rmd +++ b/man/rmd/boost_tree_xgboost.Rmd @@ -60,6 +60,26 @@ For classification, non-numeric outcomes (i.e., factors) are internally converte ## Other details +### Interfacing with the `params` argument + +The xgboost function that parsnip indirectly wraps, [xgboost::xgb.train()], takes most arguments via the `params` list argument. To supply engine-specific arguments that are documented in [xgboost::xgb.train()] as arguments to be passed via `params`, supply the list elements directly as named arguments to [set_engine()] rather than as elements in `params`. For example, pass a non-default evaluation metric like this: + +```{r} +# good +boost_tree() %>% + set_engine("xgboost", eval_metric = "mae") +``` + +...rather than this: + +```{r} +# bad +boost_tree() %>% + set_engine("xgboost", params = list(eval_metric = "mae")) +``` + +parsnip will then route arguments as needed. In the case that arguments are passed to `params` via [set_engine()], parsnip will warn and re-route the arguments as needed. Note, though, that arguments passed to `params` cannot be tuned. + ### Sparse matrices xgboost requires the data to be in a sparse format. If your predictor data are already in this format, then use [fit_xy.model_spec()] to pass it to the model function. Otherwise, parsnip converts the data to this format. @@ -78,9 +98,11 @@ By default, the model is trained without parallel processing. This can be change ```{r child = "template-early-stopping.Rmd"} ``` +Note that, since the `validation` argument provides an alternative interface to `watchlist`, the `watchlist` argument is guarded by parsnip and will be ignored (with a warning) if passed. + ### Objective function -parsnip chooses the objective function based on the characteristics of the outcome. To use a different loss, pass the `objective` argument to [set_engine()]. +parsnip chooses the objective function based on the characteristics of the outcome. To use a different loss, pass the `objective` argument to [set_engine()] directly. 
## Examples diff --git a/man/rmd/boost_tree_xgboost.md b/man/rmd/boost_tree_xgboost.md index 2b6cf2414..a6eb52d39 100644 --- a/man/rmd/boost_tree_xgboost.md +++ b/man/rmd/boost_tree_xgboost.md @@ -109,6 +109,46 @@ For classification, non-numeric outcomes (i.e., factors) are internally converte ## Other details +### Interfacing with the `params` argument + +The xgboost function that parsnip indirectly wraps, [xgboost::xgb.train()], takes most arguments via the `params` list argument. To supply engine-specific arguments that are documented in [xgboost::xgb.train()] as arguments to be passed via `params`, supply the list elements directly as named arguments to [set_engine()] rather than as elements in `params`. For example, pass a non-default evaluation metric like this: + + +```r +# good +boost_tree() %>% + set_engine("xgboost", eval_metric = "mae") +``` + +``` +## Boosted Tree Model Specification (unknown) +## +## Engine-Specific Arguments: +## eval_metric = mae +## +## Computational engine: xgboost +``` + +...rather than this: + + +```r +# bad +boost_tree() %>% + set_engine("xgboost", params = list(eval_metric = "mae")) +``` + +``` +## Boosted Tree Model Specification (unknown) +## +## Engine-Specific Arguments: +## params = list(eval_metric = "mae") +## +## Computational engine: xgboost +``` + +parsnip will then route arguments as needed. In the case that arguments are passed to `params` via [set_engine()], parsnip will warn and re-route the arguments as needed. Note, though, that arguments passed to `params` cannot be tuned. + ### Sparse matrices xgboost requires the data to be in a sparse format. If your predictor data are already in this format, then use [fit_xy.model_spec()] to pass it to the model function. Otherwise, parsnip converts the data to this format. @@ -137,9 +177,11 @@ The best way to use this feature is in conjunction with an _internal validation If the model specification has `early_stop >= trees`, `early_stop` is converted to `trees - 1` and a warning is issued. +Note that, since the `validation` argument provides an alternative interface to `watchlist`, the `watchlist` argument is guarded by parsnip and will be ignored (with a warning) if passed. + ### Objective function -parsnip chooses the objective function based on the characteristics of the outcome. To use a different loss, pass the `objective` argument to [set_engine()]. +parsnip chooses the objective function based on the characteristics of the outcome. To use a different loss, pass the `objective` argument to [set_engine()] directly. ## Examples diff --git a/man/xgb_train.Rd b/man/xgb_train.Rd index 204bd8f7b..d18340d4c 100644 --- a/man/xgb_train.Rd +++ b/man/xgb_train.Rd @@ -19,7 +19,6 @@ xgb_train( subsample = 1, validation = 0, early_stop = NULL, - objective = NULL, counts = TRUE, event_level = c("first", "second"), ... @@ -62,10 +61,6 @@ training iterations without improvement before stopping. If \code{validation} is used, performance is base on the validation set; otherwise, the training set is used.} -\item{objective}{A single string (or NULL) that defines the loss function that -\code{xgboost} uses to create trees. See \code{\link[xgboost:xgb.train]{xgboost::xgb.train()}} for options. If left -NULL, an appropriate loss function is chosen.} - \item{counts}{A logical. 
If \code{FALSE}, \code{colsample_bynode} and \code{colsample_bytree} are both assumed to be \emph{proportions} of the proportion of columns affects (instead of counts).} diff --git a/tests/testthat/_snaps/boost_tree_xgboost.md b/tests/testthat/_snaps/boost_tree_xgboost.md new file mode 100644 index 000000000..32d22a14f --- /dev/null +++ b/tests/testthat/_snaps/boost_tree_xgboost.md @@ -0,0 +1,23 @@ +# interface to param arguments + + ! Please supply elements of the `params` list argument as main arguments to `set_engine()` rather than as part of `params`. + i See `?details_boost_tree_xgboost` for more information. + +--- + + ! Please supply elements of the `params` list argument as main arguments to `set_engine()` rather than as part of `params`. + i See `?details_boost_tree_xgboost` for more information. + +--- + + ! The argument `watchlist` is guarded by parsnip and will not be passed to `xgb.train()`. + +--- + + ! The arguments `watchlist` and `data` are guarded by parsnip and will not be passed to `xgb.train()`. + +--- + + ! Please supply elements of the `params` list argument as main arguments to `set_engine()` rather than as part of `params`. + i See `?details_boost_tree_xgboost` for more information. + diff --git a/tests/testthat/_snaps/proportional_hazards.md b/tests/testthat/_snaps/proportional_hazards.md index dc6cd3515..71b989cdc 100644 --- a/tests/testthat/_snaps/proportional_hazards.md +++ b/tests/testthat/_snaps/proportional_hazards.md @@ -1,17 +1,3 @@ -# printing - - Code - proportional_hazards() - Message - parsnip could not locate an implementation for `proportional_hazards` censored regression model specifications using the `survival` engine. - i The parsnip extension package censored implements support for this specification. - i Please install (if needed) and load to continue. - Output - Proportional Hazards Model Specification (censored regression) - - Computational engine: survival - - # updating Code diff --git a/tests/testthat/_snaps/translate.md b/tests/testthat/_snaps/translate.md index ea638365f..2e9bd8682 100644 --- a/tests/testthat/_snaps/translate.md +++ b/tests/testthat/_snaps/translate.md @@ -2099,3 +2099,53 @@ list(sigma = ~0.2) +# translate tuning paramter names + + Code + .model_param_name_key(mod) + Output + # A tibble: 2 x 3 + user parsnip engine + + 1 number of trees trees nrounds + 2 min_n min_n min_child_weight + +--- + + Code + .model_param_name_key(mod, as_tibble = FALSE) + Output + $user_to_parsnip + trees min_n + "number of trees" "min_n" + + $parsnip_to_engine + nrounds min_child_weight + "trees" "min_n" + + +--- + + Code + .model_param_name_key(linear_reg()) + Output + # A tibble: 0 x 3 + # ... with 3 variables: user , parsnip , engine + # i Use `colnames()` to see all variable names + +--- + + Code + .model_param_name_key(linear_reg(), as_tibble = FALSE) + Output + $user_to_parsnip + named character(0) + + $parsnip_to_engine + named character(0) + + +--- + + 'object' should be a model specification or workflow. 
+
diff --git a/tests/testthat/test_boost_tree_xgboost.R b/tests/testthat/test_boost_tree_xgboost.R
index 690a8a0a1..5adde2957 100644
--- a/tests/testthat/test_boost_tree_xgboost.R
+++ b/tests/testthat/test_boost_tree_xgboost.R
@@ -598,4 +598,95 @@ test_that("count/proportion parameters", {
 })
 
+
+test_that('interface to param arguments', {
+  skip_if_not_installed("xgboost")
+
+  ctrl$verbosity <- 0L
+
+  # define base model spec
+  spec_base <-
+    boost_tree() %>%
+    set_mode("regression")
+
+  # pass params to params argument (bad)
+  spec_1 <-
+    spec_base %>%
+    set_engine("xgboost", params = list(eval_metric = "mae"))
+
+  expect_snapshot_warning(
+    fit_1 <- spec_1 %>% fit(mpg ~ ., data = mtcars),
+    class = "xgboost_params_warning"
+  )
+
+  expect_equal(extract_fit_engine(fit_1)$params$eval_metric, "mae")
+
+  # pass params as main argument (good)
+  spec_2 <-
+    spec_base %>%
+    set_engine("xgboost", eval_metric = "mae")
+
+  expect_silent(
+    fit_2 <- spec_2 %>% fit(mpg ~ ., data = mtcars)
+  )
+
+  expect_equal(extract_fit_engine(fit_2)$params$eval_metric, "mae")
+
+  # pass objective to params argument (bad)
+  spec_3 <-
+    spec_base %>%
+    set_engine("xgboost", params = list(objective = "reg:pseudohubererror"))
+
+  expect_snapshot_warning(
+    fit_3 <- spec_3 %>% fit(mpg ~ ., data = mtcars),
+    class = "xgboost_params_warning"
+  )
+
+  expect_equal(extract_fit_engine(fit_3)$params$objective, "reg:pseudohubererror")
+
+  # pass objective as main argument (good)
+  spec_4 <-
+    spec_base %>%
+    set_engine("xgboost", objective = "reg:pseudohubererror")
+
+  expect_silent(
+    fit_4 <- spec_4 %>% fit(mpg ~ ., data = mtcars)
+  )
+
+  expect_equal(extract_fit_engine(fit_4)$params$objective, "reg:pseudohubererror")
+
+  # pass a guarded argument as a main argument (bad)
+  spec_5 <-
+    spec_base %>%
+    set_engine("xgboost", watchlist = "boop")
+
+  expect_snapshot_warning(
+    fit_5 <- spec_5 %>% fit(mpg ~ ., data = mtcars),
+    class = "xgboost_guarded_warning"
+  )
+
+  expect_null(extract_fit_engine(fit_5)$params$watchlist)
+
+  # pass two guarded arguments as main arguments (bad)
+  spec_6 <-
+    spec_base %>%
+    set_engine("xgboost", watchlist = "boop", data = "beep")
+
+  expect_snapshot_warning(
+    fit_6 <- spec_6 %>% fit(mpg ~ ., data = mtcars),
+    class = "xgboost_guarded_warning"
+  )
+
+  expect_null(extract_fit_engine(fit_6)$params$watchlist)
+
+  # pass an argument that parsnip already specifies, via `params` (bad)
+  spec_7 <-
+    spec_base %>%
+    set_engine("xgboost", params = list(gamma = 0.1))
+
+  expect_snapshot_warning(
+    fit_7 <- spec_7 %>% fit(mpg ~ ., data = mtcars),
+    class = "xgboost_params_warning"
+  )
+
+  expect_equal(extract_fit_engine(fit_7)$params$gamma, 0)
+})
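
For reference, here is a minimal usage sketch of the patched interface. It is not part of the patch above; it assumes a parsnip version containing this change plus xgboost are installed, uses the built-in `mtcars` data, and the `trees` value and metric choice are arbitrary.

```r
library(parsnip)

# Recommended: supply xgb.train()'s `params` elements as named engine arguments.
spec <- boost_tree(trees = 20) %>%
  set_mode("regression") %>%
  set_engine("xgboost", eval_metric = "mae")

# translate() shows how parsnip routes the engine argument into the fit call.
translate(spec)

# After fitting, the routed value appears in the engine object's `params` slot.
fit_good <- fit(spec, mpg ~ ., data = mtcars)
extract_fit_engine(fit_good)$params$eval_metric

# Discouraged: wrapping the argument in `params` still fits, but parsnip warns
# (the `xgboost_params_warning` snapshotted above) and re-routes it, and
# arguments passed this way cannot be tuned.
spec_bad <- boost_tree(trees = 20) %>%
  set_mode("regression") %>%
  set_engine("xgboost", params = list(eval_metric = "mae"))
fit_bad <- fit(spec_bad, mpg ~ ., data = mtcars)
extract_fit_engine(fit_bad)$params$eval_metric
```

Both fits end up with `eval_metric = "mae"` in the engine's `params`; the difference is only whether parsnip has to warn and re-route, which is why the docs steer users toward the named-argument form.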