diff --git a/R/boost_tree.R b/R/boost_tree.R
index 2515d7c3f..32d9bc1cd 100644
--- a/R/boost_tree.R
+++ b/R/boost_tree.R
@@ -472,6 +472,17 @@ as_xgb_data <- function(x, y, validation = 0, event_level = "first", ...) {
list(data = dat, watchlist = wlist)
}
+
+get_event_level <- function(model_spec){
+ if ("event_level" %in% names(model_spec$eng_args)) {
+ event_level <- get_expr(model_spec$eng_args$event_level)
+ } else {
+ # "first" is the default for as_xgb_data() and xgb_train()
+ event_level <- "first"
+ }
+ event_level
+}
+
#' @importFrom purrr map_df
#' @export
#' @rdname multi_predict
diff --git a/R/boost_tree_data.R b/R/boost_tree_data.R
index 742354c24..da069d8c9 100644
--- a/R/boost_tree_data.R
+++ b/R/boost_tree_data.R
@@ -158,7 +158,12 @@ set_pred(
pre = NULL,
post = function(x, object) {
if (is.vector(x)) {
- x <- ifelse(x >= 0.5, object$lvl[2], object$lvl[1])
+ event_level <- get_event_level(object$spec)
+ if (event_level == "first") {
+ x <- ifelse(x >= 0.5, object$lvl[1], object$lvl[2])
+ } else {
+ x <- ifelse(x >= 0.5, object$lvl[2], object$lvl[1])
+ }
} else {
x <- object$lvl[apply(x, 1, which.max)]
}
@@ -178,7 +183,12 @@ set_pred(
pre = NULL,
post = function(x, object) {
if (is.vector(x)) {
- x <- tibble(v1 = 1 - x, v2 = x)
+ event_level <- get_event_level(object$spec)
+ if (event_level == "first") {
+ x <- tibble(v1 = x, v2 = 1 - x)
+ } else {
+ x <- tibble(v1 = 1 - x, v2 = x)
+ }
} else {
x <- as_tibble(x, .name_repair = "minimal")
}
diff --git a/man/boost_tree.Rd b/man/boost_tree.Rd
index 9a2efa8c1..1e1b1c0bc 100644
--- a/man/boost_tree.Rd
+++ b/man/boost_tree.Rd
@@ -185,14 +185,17 @@ For this engine, tuning over \code{trees} is very efficient since the same
model object can be used to make predictions over multiple values of
\code{trees}.
-Finally, note that \code{xgboost} models require that non-numeric predictors
-(e.g., factors) must be converted to dummy variables or some other
-numeric representation. By default, when using \code{fit()} with \code{xgboost}, a
-one-hot encoding is used to convert factor predictors to indicator
-variables. In the classification mode, non-numeric outcomes (i.e.,
+Note that \code{xgboost} models require that non-numeric predictors (e.g.,
+factors) must be converted to dummy variables or some other numeric
+representation. By default, when using \code{fit()} with \code{xgboost}, a one-hot
+encoding is used to convert factor predictors to indicator variables.
+
+Finally, in the classification mode, non-numeric outcomes (i.e.,
factors) are converted to numeric. For binary classification, the
\code{event_level} argument of \code{set_engine()} can be set to either \code{"first"}
-or \code{"second"} to specify which level should be used as the event.
+or \code{"second"} to specify which level should be used as the event. This
+can be helpful when a watchlist is used to monitor performance from with
+the xgboost training process.
}
\subsection{C5.0}{\if{html}{\out{
}}\preformatted{boost_tree() \%>\%
diff --git a/tests/testthat/test_boost_tree_xgboost.R b/tests/testthat/test_boost_tree_xgboost.R
index 048b2d0ca..94448b6df 100644
--- a/tests/testthat/test_boost_tree_xgboost.R
+++ b/tests/testthat/test_boost_tree_xgboost.R
@@ -210,7 +210,7 @@ test_that('submodel prediction', {
mp_res <- multi_predict(class_fit, new_data = wa_churn[1:4, vars], trees = 5, type = "prob")
mp_res <- do.call("rbind", mp_res$.pred)
- expect_equal(mp_res[[".pred_No"]], pred_class)
+ expect_equal(mp_res[[".pred_Yes"]], pred_class)
expect_error(
multi_predict(class_fit, newdata = wa_churn[1:4, vars], trees = 5, type = "prob"),
@@ -218,7 +218,6 @@ test_that('submodel prediction', {
)
})
-
test_that('default engine', {
skip_if_not_installed("xgboost")
expect_warning(
@@ -422,43 +421,64 @@ test_that('argument checks for data dimensions', {
})
-test_that("set `event_level` as engine-specific argument", {
+test_that("fit and prediction with `event_level`", {
skip_if_not_installed("xgboost")
data(penguins, package = "modeldata")
penguins <- na.omit(penguins[, -c(1:2)])
- spec <-
- boost_tree(trees = 10, tree_depth = 3) %>%
- set_engine(
- "xgboost",
- eval_metric = "aucpr",
- event_level = "second",
- verbose = 1
- ) %>%
- set_mode("classification")
+ train_x <- as.matrix(penguins[-(1:4), -5])
+ train_y_1 <- -as.numeric(penguins$sex[-(1:4)]) + 2
+ train_y_2 <- as.numeric(penguins$sex[-(1:4)]) - 1
+ x_pred <- xgboost::xgb.DMatrix(as.matrix(penguins[1:4, -5]))
+
+ # event_level = "first"
set.seed(24)
- fit_p <- spec %>% fit(sex ~ ., data = penguins)
+ fit_p_1 <- boost_tree(trees = 10) %>%
+ set_engine("xgboost", eval_metric = "auc"
+ # event_level = "first" is the default
+ ) %>%
+ set_mode("classification") %>%
+ fit(sex ~ ., data = penguins[-(1:4), ])
- penguins_x <- as.matrix(penguins[, -5])
- penguins_y <- as.numeric(penguins$sex) - 1
- xgbmat <- xgb.DMatrix(data = penguins_x, label = penguins_y)
+ xgbmat_train_1 <- xgb.DMatrix(data = train_x, label = train_y_1)
set.seed(24)
- fit_xgb <- xgboost::xgb.train(data = xgbmat,
- params = list(eta = 0.3, max_depth = 3,
- gamma = 0, colsample_bytree = 1,
- min_child_weight = 1,
- subsample = 1),
+ fit_xgb_1 <- xgboost::xgb.train(data = xgbmat_train_1,
nrounds = 10,
- watchlist = list("training" = xgbmat),
+ watchlist = list("training" = xgbmat_train_1),
objective = "binary:logistic",
- verbose = 1,
- eval_metric = "aucpr",
- nthread = 1)
+ eval_metric = "auc")
+
+ expect_equal(fit_p_1$fit$evaluation_log, fit_xgb_1$evaluation_log)
+
+ pred_xgb_1 <- predict(fit_xgb_1, x_pred)
+ pred_p_1 <- predict(fit_p_1, new_data = penguins[1:4, ], type = "prob")
+ expect_equal(pred_p_1[[".pred_female"]], pred_xgb_1)
+
+ # event_level = "second"
+ set.seed(24)
+ fit_p_2 <- boost_tree(trees = 10) %>%
+ set_engine("xgboost", eval_metric = "auc",
+ event_level = "second") %>%
+ set_mode("classification") %>%
+ fit(sex ~ ., data = penguins[-(1:4), ])
+
+ xgbmat_train_2 <- xgb.DMatrix(data = train_x, label = train_y_2)
+
+ set.seed(24)
+ fit_xgb_2 <- xgboost::xgb.train(data = xgbmat_train_2,
+ nrounds = 10,
+ watchlist = list("training" = xgbmat_train_2),
+ objective = "binary:logistic",
+ eval_metric = "auc")
+
+ expect_equal(fit_p_2$fit$evaluation_log, fit_xgb_2$evaluation_log)
- expect_equal(fit_p$fit$evaluation_log, fit_xgb$evaluation_log)
+ pred_xgb_2 <- predict(fit_xgb_2, x_pred)
+ pred_p_2 <- predict(fit_p_2, new_data = penguins[1:4, ], type = "prob")
+ expect_equal(pred_p_2[[".pred_male"]], pred_xgb_2)
})