From 1e945f43eadda6456a1a03df8022d43417f64d55 Mon Sep 17 00:00:00 2001 From: Hannah Frick Date: Thu, 8 Apr 2021 11:20:13 +0100 Subject: [PATCH 1/5] make predictions respect the event_level --- R/boost_tree.R | 11 +++++++++++ R/boost_tree_data.R | 14 ++++++++++++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/R/boost_tree.R b/R/boost_tree.R index 2515d7c3f..32d9bc1cd 100644 --- a/R/boost_tree.R +++ b/R/boost_tree.R @@ -472,6 +472,17 @@ as_xgb_data <- function(x, y, validation = 0, event_level = "first", ...) { list(data = dat, watchlist = wlist) } + +get_event_level <- function(model_spec){ + if ("event_level" %in% names(model_spec$eng_args)) { + event_level <- get_expr(model_spec$eng_args$event_level) + } else { + # "first" is the default for as_xgb_data() and xgb_train() + event_level <- "first" + } + event_level +} + #' @importFrom purrr map_df #' @export #' @rdname multi_predict diff --git a/R/boost_tree_data.R b/R/boost_tree_data.R index 742354c24..da069d8c9 100644 --- a/R/boost_tree_data.R +++ b/R/boost_tree_data.R @@ -158,7 +158,12 @@ set_pred( pre = NULL, post = function(x, object) { if (is.vector(x)) { - x <- ifelse(x >= 0.5, object$lvl[2], object$lvl[1]) + event_level <- get_event_level(object$spec) + if (event_level == "first") { + x <- ifelse(x >= 0.5, object$lvl[1], object$lvl[2]) + } else { + x <- ifelse(x >= 0.5, object$lvl[2], object$lvl[1]) + } } else { x <- object$lvl[apply(x, 1, which.max)] } @@ -178,7 +183,12 @@ set_pred( pre = NULL, post = function(x, object) { if (is.vector(x)) { - x <- tibble(v1 = 1 - x, v2 = x) + event_level <- get_event_level(object$spec) + if (event_level == "first") { + x <- tibble(v1 = x, v2 = 1 - x) + } else { + x <- tibble(v1 = 1 - x, v2 = x) + } } else { x <- as_tibble(x, .name_repair = "minimal") } From 6244c1d75b4b5d9435568adb5ec5ec48268a0c87 Mon Sep 17 00:00:00 2001 From: Hannah Frick Date: Thu, 8 Apr 2021 11:22:05 +0100 Subject: [PATCH 2/5] fix test: "Yes" is the first level, thus the event level --- tests/testthat/test_boost_tree_xgboost.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test_boost_tree_xgboost.R b/tests/testthat/test_boost_tree_xgboost.R index 048b2d0ca..0366f9bcb 100644 --- a/tests/testthat/test_boost_tree_xgboost.R +++ b/tests/testthat/test_boost_tree_xgboost.R @@ -210,7 +210,7 @@ test_that('submodel prediction', { mp_res <- multi_predict(class_fit, new_data = wa_churn[1:4, vars], trees = 5, type = "prob") mp_res <- do.call("rbind", mp_res$.pred) - expect_equal(mp_res[[".pred_No"]], pred_class) + expect_equal(mp_res[[".pred_Yes"]], pred_class) expect_error( multi_predict(class_fit, newdata = wa_churn[1:4, vars], trees = 5, type = "prob"), From a7b31a8b57fab272c5f8b527372c82abb4488404 Mon Sep 17 00:00:00 2001 From: Hannah Frick Date: Thu, 8 Apr 2021 11:22:55 +0100 Subject: [PATCH 3/5] add test for prediction with event_level --- tests/testthat/test_boost_tree_xgboost.R | 31 ++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/tests/testthat/test_boost_tree_xgboost.R b/tests/testthat/test_boost_tree_xgboost.R index 0366f9bcb..4370dd1ea 100644 --- a/tests/testthat/test_boost_tree_xgboost.R +++ b/tests/testthat/test_boost_tree_xgboost.R @@ -218,6 +218,37 @@ test_that('submodel prediction', { ) }) +test_that('prediction with event_level', { + + skip_if_not_installed("xgboost") + library(xgboost) + + vars <- c("female", "tenure", "total_charges", "phone_service", "monthly_charges") + + # event_level = "first" + fit_1 <- + boost_tree(trees = 20, mode = "classification") %>% + set_engine("xgboost") %>% + fit(churn ~ ., data = wa_churn[-(1:4), c("churn", vars)]) + + x <- xgboost::xgb.DMatrix(as.matrix(wa_churn[1:4, vars])) + + pred_xgb_1 <- predict(fit_1$fit, x) + pred_res_1 <- predict(fit_1, new_data = wa_churn[1:4, vars], type = "prob") + expect_equal(pred_res_1[[".pred_Yes"]], pred_xgb_1) + + # event_level = "second" + fit_2 <- + boost_tree(trees = 20, mode = "classification") %>% + set_engine("xgboost", event_level = "second") %>% + fit(churn ~ ., data = wa_churn[-(1:4), c("churn", vars)]) + + x <- xgboost::xgb.DMatrix(as.matrix(wa_churn[1:4, vars])) + + pred_xgb_2 <- predict(fit_2$fit, x) + pred_res_2 <- predict(fit_2, new_data = wa_churn[1:4, vars], type = "prob") + expect_equal(pred_res_2[[".pred_No"]], pred_xgb_2) +}) test_that('default engine', { skip_if_not_installed("xgboost") From dc5f046d0b2657c14ec4e263bbfecf1cdfe0cb25 Mon Sep 17 00:00:00 2001 From: Hannah Frick Date: Thu, 8 Apr 2021 11:23:25 +0100 Subject: [PATCH 4/5] (leftover) document() --- man/boost_tree.Rd | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/man/boost_tree.Rd b/man/boost_tree.Rd index 9a2efa8c1..1e1b1c0bc 100644 --- a/man/boost_tree.Rd +++ b/man/boost_tree.Rd @@ -185,14 +185,17 @@ For this engine, tuning over \code{trees} is very efficient since the same model object can be used to make predictions over multiple values of \code{trees}. -Finally, note that \code{xgboost} models require that non-numeric predictors -(e.g., factors) must be converted to dummy variables or some other -numeric representation. By default, when using \code{fit()} with \code{xgboost}, a -one-hot encoding is used to convert factor predictors to indicator -variables. In the classification mode, non-numeric outcomes (i.e., +Note that \code{xgboost} models require that non-numeric predictors (e.g., +factors) must be converted to dummy variables or some other numeric +representation. By default, when using \code{fit()} with \code{xgboost}, a one-hot +encoding is used to convert factor predictors to indicator variables. + +Finally, in the classification mode, non-numeric outcomes (i.e., factors) are converted to numeric. For binary classification, the \code{event_level} argument of \code{set_engine()} can be set to either \code{"first"} -or \code{"second"} to specify which level should be used as the event. +or \code{"second"} to specify which level should be used as the event. This +can be helpful when a watchlist is used to monitor performance from with +the xgboost training process. } \subsection{C5.0}{\if{html}{\out{
}}\preformatted{boost_tree() \%>\% From 952170966db68d6c29b6546565f23561d9aa2d23 Mon Sep 17 00:00:00 2001 From: Hannah Frick Date: Fri, 9 Apr 2021 12:00:39 +0100 Subject: [PATCH 5/5] test for fit and prediction --- tests/testthat/test_boost_tree_xgboost.R | 101 ++++++++++------------- 1 file changed, 45 insertions(+), 56 deletions(-) diff --git a/tests/testthat/test_boost_tree_xgboost.R b/tests/testthat/test_boost_tree_xgboost.R index 4370dd1ea..94448b6df 100644 --- a/tests/testthat/test_boost_tree_xgboost.R +++ b/tests/testthat/test_boost_tree_xgboost.R @@ -218,38 +218,6 @@ test_that('submodel prediction', { ) }) -test_that('prediction with event_level', { - - skip_if_not_installed("xgboost") - library(xgboost) - - vars <- c("female", "tenure", "total_charges", "phone_service", "monthly_charges") - - # event_level = "first" - fit_1 <- - boost_tree(trees = 20, mode = "classification") %>% - set_engine("xgboost") %>% - fit(churn ~ ., data = wa_churn[-(1:4), c("churn", vars)]) - - x <- xgboost::xgb.DMatrix(as.matrix(wa_churn[1:4, vars])) - - pred_xgb_1 <- predict(fit_1$fit, x) - pred_res_1 <- predict(fit_1, new_data = wa_churn[1:4, vars], type = "prob") - expect_equal(pred_res_1[[".pred_Yes"]], pred_xgb_1) - - # event_level = "second" - fit_2 <- - boost_tree(trees = 20, mode = "classification") %>% - set_engine("xgboost", event_level = "second") %>% - fit(churn ~ ., data = wa_churn[-(1:4), c("churn", vars)]) - - x <- xgboost::xgb.DMatrix(as.matrix(wa_churn[1:4, vars])) - - pred_xgb_2 <- predict(fit_2$fit, x) - pred_res_2 <- predict(fit_2, new_data = wa_churn[1:4, vars], type = "prob") - expect_equal(pred_res_2[[".pred_No"]], pred_xgb_2) -}) - test_that('default engine', { skip_if_not_installed("xgboost") expect_warning( @@ -453,43 +421,64 @@ test_that('argument checks for data dimensions', { }) -test_that("set `event_level` as engine-specific argument", { +test_that("fit and prediction with `event_level`", { skip_if_not_installed("xgboost") data(penguins, package = "modeldata") penguins <- na.omit(penguins[, -c(1:2)]) - spec <- - boost_tree(trees = 10, tree_depth = 3) %>% - set_engine( - "xgboost", - eval_metric = "aucpr", - event_level = "second", - verbose = 1 - ) %>% - set_mode("classification") + train_x <- as.matrix(penguins[-(1:4), -5]) + train_y_1 <- -as.numeric(penguins$sex[-(1:4)]) + 2 + train_y_2 <- as.numeric(penguins$sex[-(1:4)]) - 1 + + x_pred <- xgboost::xgb.DMatrix(as.matrix(penguins[1:4, -5])) + # event_level = "first" set.seed(24) - fit_p <- spec %>% fit(sex ~ ., data = penguins) + fit_p_1 <- boost_tree(trees = 10) %>% + set_engine("xgboost", eval_metric = "auc" + # event_level = "first" is the default + ) %>% + set_mode("classification") %>% + fit(sex ~ ., data = penguins[-(1:4), ]) - penguins_x <- as.matrix(penguins[, -5]) - penguins_y <- as.numeric(penguins$sex) - 1 - xgbmat <- xgb.DMatrix(data = penguins_x, label = penguins_y) + xgbmat_train_1 <- xgb.DMatrix(data = train_x, label = train_y_1) set.seed(24) - fit_xgb <- xgboost::xgb.train(data = xgbmat, - params = list(eta = 0.3, max_depth = 3, - gamma = 0, colsample_bytree = 1, - min_child_weight = 1, - subsample = 1), + fit_xgb_1 <- xgboost::xgb.train(data = xgbmat_train_1, nrounds = 10, - watchlist = list("training" = xgbmat), + watchlist = list("training" = xgbmat_train_1), objective = "binary:logistic", - verbose = 1, - eval_metric = "aucpr", - nthread = 1) + eval_metric = "auc") + + expect_equal(fit_p_1$fit$evaluation_log, fit_xgb_1$evaluation_log) + + pred_xgb_1 <- predict(fit_xgb_1, x_pred) + pred_p_1 <- predict(fit_p_1, new_data = penguins[1:4, ], type = "prob") + expect_equal(pred_p_1[[".pred_female"]], pred_xgb_1) + + # event_level = "second" + set.seed(24) + fit_p_2 <- boost_tree(trees = 10) %>% + set_engine("xgboost", eval_metric = "auc", + event_level = "second") %>% + set_mode("classification") %>% + fit(sex ~ ., data = penguins[-(1:4), ]) + + xgbmat_train_2 <- xgb.DMatrix(data = train_x, label = train_y_2) + + set.seed(24) + fit_xgb_2 <- xgboost::xgb.train(data = xgbmat_train_2, + nrounds = 10, + watchlist = list("training" = xgbmat_train_2), + objective = "binary:logistic", + eval_metric = "auc") + + expect_equal(fit_p_2$fit$evaluation_log, fit_xgb_2$evaluation_log) - expect_equal(fit_p$fit$evaluation_log, fit_xgb$evaluation_log) + pred_xgb_2 <- predict(fit_xgb_2, x_pred) + pred_p_2 <- predict(fit_p_2, new_data = penguins[1:4, ], type = "prob") + expect_equal(pred_p_2[[".pred_male"]], pred_xgb_2) })