diff --git a/R/boost_tree.R b/R/boost_tree.R
index 2515d7c3f..32d9bc1cd 100644
--- a/R/boost_tree.R
+++ b/R/boost_tree.R
@@ -472,6 +472,17 @@ as_xgb_data <- function(x, y, validation = 0, event_level = "first", ...) {
   list(data = dat, watchlist = wlist)
 }
 
+
+get_event_level <- function(model_spec) {
+  if ("event_level" %in% names(model_spec$eng_args)) {
+    event_level <- get_expr(model_spec$eng_args$event_level)
+  } else {
+    # "first" is the default for as_xgb_data() and xgb_train()
+    event_level <- "first"
+  }
+  event_level
+}
+
 #' @importFrom purrr map_df
 #' @export
 #' @rdname multi_predict
diff --git a/R/boost_tree_data.R b/R/boost_tree_data.R
index 742354c24..da069d8c9 100644
--- a/R/boost_tree_data.R
+++ b/R/boost_tree_data.R
@@ -158,7 +158,12 @@ set_pred(
     pre = NULL,
     post = function(x, object) {
       if (is.vector(x)) {
-        x <- ifelse(x >= 0.5, object$lvl[2], object$lvl[1])
+        event_level <- get_event_level(object$spec)
+        if (event_level == "first") {
+          x <- ifelse(x >= 0.5, object$lvl[1], object$lvl[2])
+        } else {
+          x <- ifelse(x >= 0.5, object$lvl[2], object$lvl[1])
+        }
       } else {
         x <- object$lvl[apply(x, 1, which.max)]
       }
@@ -178,7 +183,12 @@ set_pred(
     pre = NULL,
     post = function(x, object) {
       if (is.vector(x)) {
-        x <- tibble(v1 = 1 - x, v2 = x)
+        event_level <- get_event_level(object$spec)
+        if (event_level == "first") {
+          x <- tibble(v1 = x, v2 = 1 - x)
+        } else {
+          x <- tibble(v1 = 1 - x, v2 = x)
+        }
       } else {
         x <- as_tibble(x, .name_repair = "minimal")
       }
diff --git a/man/boost_tree.Rd b/man/boost_tree.Rd
index 9a2efa8c1..1e1b1c0bc 100644
--- a/man/boost_tree.Rd
+++ b/man/boost_tree.Rd
@@ -185,14 +185,17 @@ For this engine, tuning over \code{trees} is very efficient since the
 same model object can be used to make predictions over multiple values
 of \code{trees}.
 
-Finally, note that \code{xgboost} models require that non-numeric predictors
-(e.g., factors) must be converted to dummy variables or some other
-numeric representation. By default, when using \code{fit()} with \code{xgboost}, a
-one-hot encoding is used to convert factor predictors to indicator
-variables. In the classification mode, non-numeric outcomes (i.e.,
+Note that \code{xgboost} models require that non-numeric predictors (e.g.,
+factors) be converted to dummy variables or some other numeric
+representation. By default, when using \code{fit()} with \code{xgboost}, a one-hot
+encoding is used to convert factor predictors to indicator variables.
+
+Finally, in the classification mode, non-numeric outcomes (i.e.,
 factors) are converted to numeric. For binary classification, the
 \code{event_level} argument of \code{set_engine()} can be set to either \code{"first"}
-or \code{"second"} to specify which level should be used as the event.
+or \code{"second"} to specify which level should be used as the event. This
+can be helpful when a watchlist is used to monitor performance from
+within the xgboost training process.
 }
 
 \subsection{C5.0}{\if{html}{\out{<div class="sourceCode r">}}\preformatted{boost_tree() \%>\%
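The fix above is easiest to see in isolation. The following is a minimal, standalone sketch of what the patched `post` hooks now do, with `lvl` and `prob` standing in for `object$lvl` and xgboost's raw binary output (these two names are illustrative, not package API):

    library(tibble)

    lvl  <- c("female", "male")   # factor levels, in order (object$lvl)
    prob <- c(0.91, 0.12, 0.55)   # raw xgboost probabilities

    event_level <- "first"        # the default in as_xgb_data()/xgb_train()
    if (event_level == "first") {
      # raw probability is P(first level), so >= 0.5 maps to lvl[1]
      class_pred <- ifelse(prob >= 0.5, lvl[1], lvl[2])
      prob_pred  <- tibble(v1 = prob, v2 = 1 - prob)
    } else {
      # raw probability is P(second level), so >= 0.5 maps to lvl[2]
      class_pred <- ifelse(prob >= 0.5, lvl[2], lvl[1])
      prob_pred  <- tibble(v1 = 1 - prob, v2 = prob)
    }

Before this change, only the `else` branch existed, so class and probability predictions were flipped whenever the model had been trained with the default `event_level = "first"`. The tests below exercise both levels end to end.
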
diff --git a/tests/testthat/test_boost_tree_xgboost.R b/tests/testthat/test_boost_tree_xgboost.R
index 048b2d0ca..94448b6df 100644
--- a/tests/testthat/test_boost_tree_xgboost.R
+++ b/tests/testthat/test_boost_tree_xgboost.R
@@ -210,7 +210,7 @@ test_that('submodel prediction', {
   mp_res <- multi_predict(class_fit, new_data = wa_churn[1:4, vars], trees = 5, type = "prob")
   mp_res <- do.call("rbind", mp_res$.pred)
-  expect_equal(mp_res[[".pred_No"]], pred_class)
+  expect_equal(mp_res[[".pred_Yes"]], pred_class)
 
   expect_error(
     multi_predict(class_fit, newdata = wa_churn[1:4, vars], trees = 5, type = "prob"),
@@ -218,7 +218,6 @@ test_that('submodel prediction', {
   )
 })
 
-
 test_that('default engine', {
   skip_if_not_installed("xgboost")
   expect_warning(
@@ -422,43 +421,64 @@ test_that('argument checks for data dimensions', {
 })
 
-test_that("set `event_level` as engine-specific argument", {
+test_that("fit and prediction with `event_level`", {
   skip_if_not_installed("xgboost")
 
   data(penguins, package = "modeldata")
   penguins <- na.omit(penguins[, -c(1:2)])
 
-  spec <-
-    boost_tree(trees = 10, tree_depth = 3) %>%
-    set_engine(
-      "xgboost",
-      eval_metric = "aucpr",
-      event_level = "second",
-      verbose = 1
-    ) %>%
-    set_mode("classification")
+  train_x <- as.matrix(penguins[-(1:4), -5])
+  train_y_1 <- -as.numeric(penguins$sex[-(1:4)]) + 2
+  train_y_2 <- as.numeric(penguins$sex[-(1:4)]) - 1
+  x_pred <- xgboost::xgb.DMatrix(as.matrix(penguins[1:4, -5]))
+
+  # event_level = "first"
   set.seed(24)
-  fit_p <- spec %>% fit(sex ~ ., data = penguins)
+  fit_p_1 <- boost_tree(trees = 10) %>%
+    set_engine("xgboost", eval_metric = "auc"
+               # event_level = "first" is the default
+               ) %>%
+    set_mode("classification") %>%
+    fit(sex ~ ., data = penguins[-(1:4), ])
 
-  penguins_x <- as.matrix(penguins[, -5])
-  penguins_y <- as.numeric(penguins$sex) - 1
-  xgbmat <- xgb.DMatrix(data = penguins_x, label = penguins_y)
+  xgbmat_train_1 <- xgb.DMatrix(data = train_x, label = train_y_1)
 
   set.seed(24)
-  fit_xgb <- xgboost::xgb.train(data = xgbmat,
-                                params = list(eta = 0.3, max_depth = 3,
-                                              gamma = 0, colsample_bytree = 1,
-                                              min_child_weight = 1,
-                                              subsample = 1),
+  fit_xgb_1 <- xgboost::xgb.train(data = xgbmat_train_1,
                                 nrounds = 10,
-                                watchlist = list("training" = xgbmat),
+                                watchlist = list("training" = xgbmat_train_1),
                                 objective = "binary:logistic",
-                                verbose = 1,
-                                eval_metric = "aucpr",
-                                nthread = 1)
+                                eval_metric = "auc")
+
+  expect_equal(fit_p_1$fit$evaluation_log, fit_xgb_1$evaluation_log)
+
+  pred_xgb_1 <- predict(fit_xgb_1, x_pred)
+  pred_p_1 <- predict(fit_p_1, new_data = penguins[1:4, ], type = "prob")
+  expect_equal(pred_p_1[[".pred_female"]], pred_xgb_1)
+
+  # event_level = "second"
+  set.seed(24)
+  fit_p_2 <- boost_tree(trees = 10) %>%
+    set_engine("xgboost", eval_metric = "auc",
+               event_level = "second") %>%
+    set_mode("classification") %>%
+    fit(sex ~ ., data = penguins[-(1:4), ])
+
+  xgbmat_train_2 <- xgb.DMatrix(data = train_x, label = train_y_2)
+
+  set.seed(24)
+  fit_xgb_2 <- xgboost::xgb.train(data = xgbmat_train_2,
+                                  nrounds = 10,
+                                  watchlist = list("training" = xgbmat_train_2),
+                                  objective = "binary:logistic",
+                                  eval_metric = "auc")
+
+  expect_equal(fit_p_2$fit$evaluation_log, fit_xgb_2$evaluation_log)
 
-  expect_equal(fit_p$fit$evaluation_log, fit_xgb$evaluation_log)
+  pred_xgb_2 <- predict(fit_xgb_2, x_pred)
+  pred_p_2 <- predict(fit_p_2, new_data = penguins[1:4, ], type = "prob")
+  expect_equal(pred_p_2[[".pred_male"]], pred_xgb_2)
 })
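Two details of the new test are worth spelling out. The label encodings match the event level used at training time: `-as.numeric(sex) + 2` codes the first factor level ("female") as 1, the event for `event_level = "first"`, while `as.numeric(sex) - 1` codes the second level ("male") as 1; the `evaluation_log` comparisons above only hold if parsnip encodes the outcome the same way internally. From the user's side, the engine argument is the switch, e.g. (a hedged usage sketch mirroring the spec in the test, with `penguins` as prepared there):

    library(parsnip)

    # Treat the second factor level ("male") as the event.
    spec <- boost_tree(trees = 10) %>%
      set_engine("xgboost", eval_metric = "auc", event_level = "second") %>%
      set_mode("classification")

    fit(spec, sex ~ ., data = penguins)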