From 1e945f43eadda6456a1a03df8022d43417f64d55 Mon Sep 17 00:00:00 2001
From: Hannah Frick <hannah@rstudio.com>
Date: Thu, 8 Apr 2021 11:20:13 +0100
Subject: [PATCH 1/5] make predictions respect the event_level

---
 R/boost_tree.R      | 11 +++++++++++
 R/boost_tree_data.R | 14 ++++++++++++--
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/R/boost_tree.R b/R/boost_tree.R
index 2515d7c3f..32d9bc1cd 100644
--- a/R/boost_tree.R
+++ b/R/boost_tree.R
@@ -472,6 +472,17 @@ as_xgb_data <- function(x, y, validation = 0, event_level = "first", ...) {
 
   list(data = dat, watchlist = wlist)
 }
+
+get_event_level <- function(model_spec){
+  if ("event_level" %in% names(model_spec$eng_args)) {
+    event_level <- get_expr(model_spec$eng_args$event_level)
+  } else {
+    # "first" is the default for as_xgb_data() and xgb_train()
+    event_level <- "first"
+  }
+  event_level
+}
+
 #' @importFrom purrr map_df
 #' @export
 #' @rdname multi_predict
diff --git a/R/boost_tree_data.R b/R/boost_tree_data.R
index 742354c24..da069d8c9 100644
--- a/R/boost_tree_data.R
+++ b/R/boost_tree_data.R
@@ -158,7 +158,12 @@ set_pred(
     pre = NULL,
     post = function(x, object) {
       if (is.vector(x)) {
-        x <- ifelse(x >= 0.5, object$lvl[2], object$lvl[1])
+        event_level <- get_event_level(object$spec)
+        if (event_level == "first") {
+          x <- ifelse(x >= 0.5, object$lvl[1], object$lvl[2])
+        } else {
+          x <- ifelse(x >= 0.5, object$lvl[2], object$lvl[1])
+        }
       } else {
         x <- object$lvl[apply(x, 1, which.max)]
       }
@@ -178,7 +183,12 @@ set_pred(
     pre = NULL,
     post = function(x, object) {
       if (is.vector(x)) {
-        x <- tibble(v1 = 1 - x, v2 = x)
+        event_level <- get_event_level(object$spec)
+        if (event_level == "first") {
+          x <- tibble(v1 = x, v2 = 1 - x)
+        } else {
+          x <- tibble(v1 = 1 - x, v2 = x)
+        }
       } else {
         x <- as_tibble(x, .name_repair = "minimal")
       }

From 6244c1d75b4b5d9435568adb5ec5ec48268a0c87 Mon Sep 17 00:00:00 2001
From: Hannah Frick <hannah@rstudio.com>
Date: Thu, 8 Apr 2021 11:22:05 +0100
Subject: [PATCH 2/5] fix test: "Yes" is the first level, thus the event level

---
 tests/testthat/test_boost_tree_xgboost.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/testthat/test_boost_tree_xgboost.R b/tests/testthat/test_boost_tree_xgboost.R
index 048b2d0ca..0366f9bcb 100644
--- a/tests/testthat/test_boost_tree_xgboost.R
+++ b/tests/testthat/test_boost_tree_xgboost.R
@@ -210,7 +210,7 @@ test_that('submodel prediction', {
 
   mp_res <- multi_predict(class_fit, new_data = wa_churn[1:4, vars], trees = 5, type = "prob")
   mp_res <- do.call("rbind", mp_res$.pred)
-  expect_equal(mp_res[[".pred_No"]], pred_class)
+  expect_equal(mp_res[[".pred_Yes"]], pred_class)
 
   expect_error(
     multi_predict(class_fit, newdata = wa_churn[1:4, vars], trees = 5, type = "prob"),

From a7b31a8b57fab272c5f8b527372c82abb4488404 Mon Sep 17 00:00:00 2001
From: Hannah Frick <hannah@rstudio.com>
Date: Thu, 8 Apr 2021 11:22:55 +0100
Subject: [PATCH 3/5] add test for prediction with event_level

---
 tests/testthat/test_boost_tree_xgboost.R | 31 ++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/tests/testthat/test_boost_tree_xgboost.R b/tests/testthat/test_boost_tree_xgboost.R
index 0366f9bcb..4370dd1ea 100644
--- a/tests/testthat/test_boost_tree_xgboost.R
+++ b/tests/testthat/test_boost_tree_xgboost.R
@@ -218,6 +218,37 @@ test_that('submodel prediction', {
   )
 })
 
+test_that('prediction with event_level', {
+
+  skip_if_not_installed("xgboost")
+  library(xgboost)
+
+  vars <- c("female", "tenure", "total_charges", "phone_service", "monthly_charges")
+
+  # event_level = "first"
+  fit_1 <-
+    boost_tree(trees = 20, mode = "classification") %>%
+    set_engine("xgboost") %>%
+    fit(churn ~ ., data = wa_churn[-(1:4), c("churn", vars)])
+
+  x <-  xgboost::xgb.DMatrix(as.matrix(wa_churn[1:4, vars]))
+
+  pred_xgb_1 <- predict(fit_1$fit, x)
+  pred_res_1 <- predict(fit_1, new_data = wa_churn[1:4, vars], type = "prob")
+  expect_equal(pred_res_1[[".pred_Yes"]], pred_xgb_1)
+
+  # event_level = "second"
+  fit_2 <-
+    boost_tree(trees = 20, mode = "classification") %>%
+    set_engine("xgboost", event_level = "second") %>%
+    fit(churn ~ ., data = wa_churn[-(1:4), c("churn", vars)])
+
+  x <-  xgboost::xgb.DMatrix(as.matrix(wa_churn[1:4, vars]))
+
+  pred_xgb_2 <- predict(fit_2$fit, x)
+  pred_res_2 <- predict(fit_2, new_data = wa_churn[1:4, vars], type = "prob")
+  expect_equal(pred_res_2[[".pred_No"]], pred_xgb_2)
+})
 
 test_that('default engine', {
   skip_if_not_installed("xgboost")

From dc5f046d0b2657c14ec4e263bbfecf1cdfe0cb25 Mon Sep 17 00:00:00 2001
From: Hannah Frick <hannah@rstudio.com>
Date: Thu, 8 Apr 2021 11:23:25 +0100
Subject: [PATCH 4/5] (leftover) document()

---
 man/boost_tree.Rd | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/man/boost_tree.Rd b/man/boost_tree.Rd
index 9a2efa8c1..1e1b1c0bc 100644
--- a/man/boost_tree.Rd
+++ b/man/boost_tree.Rd
@@ -185,14 +185,17 @@ For this engine, tuning over \code{trees} is very efficient since the same
 model object can be used to make predictions over multiple values of
 \code{trees}.
 
-Finally, note that \code{xgboost} models require that non-numeric predictors
-(e.g., factors) must be converted to dummy variables or some other
-numeric representation. By default, when using \code{fit()} with \code{xgboost}, a
-one-hot encoding is used to convert factor predictors to indicator
-variables. In the classification mode, non-numeric outcomes (i.e.,
+Note that \code{xgboost} models require that non-numeric predictors (e.g.,
+factors) must be converted to dummy variables or some other numeric
+representation. By default, when using \code{fit()} with \code{xgboost}, a one-hot
+encoding is used to convert factor predictors to indicator variables.
+
+Finally, in the classification mode, non-numeric outcomes (i.e.,
 factors) are converted to numeric. For binary classification, the
 \code{event_level} argument of \code{set_engine()} can be set to either \code{"first"}
-or \code{"second"} to specify which level should be used as the event.
+or \code{"second"} to specify which level should be used as the event. This
+can be helpful when a watchlist is used to monitor performance from with
+the xgboost training process.
 }
 
 \subsection{C5.0}{\if{html}{\out{<div class="r">}}\preformatted{boost_tree() \%>\% 

From 952170966db68d6c29b6546565f23561d9aa2d23 Mon Sep 17 00:00:00 2001
From: Hannah Frick <hannah@rstudio.com>
Date: Fri, 9 Apr 2021 12:00:39 +0100
Subject: [PATCH 5/5] test for fit and prediction

---
 tests/testthat/test_boost_tree_xgboost.R | 101 ++++++++++-------------
 1 file changed, 45 insertions(+), 56 deletions(-)

diff --git a/tests/testthat/test_boost_tree_xgboost.R b/tests/testthat/test_boost_tree_xgboost.R
index 4370dd1ea..94448b6df 100644
--- a/tests/testthat/test_boost_tree_xgboost.R
+++ b/tests/testthat/test_boost_tree_xgboost.R
@@ -218,38 +218,6 @@ test_that('submodel prediction', {
   )
 })
 
-test_that('prediction with event_level', {
-
-  skip_if_not_installed("xgboost")
-  library(xgboost)
-
-  vars <- c("female", "tenure", "total_charges", "phone_service", "monthly_charges")
-
-  # event_level = "first"
-  fit_1 <-
-    boost_tree(trees = 20, mode = "classification") %>%
-    set_engine("xgboost") %>%
-    fit(churn ~ ., data = wa_churn[-(1:4), c("churn", vars)])
-
-  x <-  xgboost::xgb.DMatrix(as.matrix(wa_churn[1:4, vars]))
-
-  pred_xgb_1 <- predict(fit_1$fit, x)
-  pred_res_1 <- predict(fit_1, new_data = wa_churn[1:4, vars], type = "prob")
-  expect_equal(pred_res_1[[".pred_Yes"]], pred_xgb_1)
-
-  # event_level = "second"
-  fit_2 <-
-    boost_tree(trees = 20, mode = "classification") %>%
-    set_engine("xgboost", event_level = "second") %>%
-    fit(churn ~ ., data = wa_churn[-(1:4), c("churn", vars)])
-
-  x <-  xgboost::xgb.DMatrix(as.matrix(wa_churn[1:4, vars]))
-
-  pred_xgb_2 <- predict(fit_2$fit, x)
-  pred_res_2 <- predict(fit_2, new_data = wa_churn[1:4, vars], type = "prob")
-  expect_equal(pred_res_2[[".pred_No"]], pred_xgb_2)
-})
-
 test_that('default engine', {
   skip_if_not_installed("xgboost")
   expect_warning(
@@ -453,43 +421,64 @@ test_that('argument checks for data dimensions', {
 
 })
 
-test_that("set `event_level` as engine-specific argument", {
+test_that("fit and prediction with `event_level`", {
 
   skip_if_not_installed("xgboost")
 
   data(penguins, package = "modeldata")
   penguins <- na.omit(penguins[, -c(1:2)])
 
-  spec <-
-    boost_tree(trees = 10, tree_depth = 3) %>%
-    set_engine(
-      "xgboost",
-      eval_metric = "aucpr",
-      event_level = "second",
-      verbose = 1
-    ) %>%
-    set_mode("classification")
+  train_x <- as.matrix(penguins[-(1:4), -5])
+  train_y_1 <- -as.numeric(penguins$sex[-(1:4)]) + 2
+  train_y_2 <- as.numeric(penguins$sex[-(1:4)]) - 1
+
+  x_pred <-  xgboost::xgb.DMatrix(as.matrix(penguins[1:4, -5]))
 
+  # event_level = "first"
   set.seed(24)
-  fit_p <- spec %>% fit(sex ~ ., data = penguins)
+  fit_p_1 <- boost_tree(trees = 10) %>%
+    set_engine("xgboost", eval_metric = "auc"
+               # event_level = "first" is the default
+               ) %>%
+    set_mode("classification") %>%
+    fit(sex ~ ., data = penguins[-(1:4), ])
 
-  penguins_x <- as.matrix(penguins[, -5])
-  penguins_y <- as.numeric(penguins$sex) - 1
-  xgbmat <- xgb.DMatrix(data = penguins_x, label = penguins_y)
+  xgbmat_train_1 <- xgb.DMatrix(data = train_x, label = train_y_1)
 
   set.seed(24)
-  fit_xgb <- xgboost::xgb.train(data = xgbmat,
-                                params = list(eta = 0.3, max_depth = 3,
-                                              gamma = 0, colsample_bytree = 1,
-                                              min_child_weight = 1,
-                                              subsample = 1),
+  fit_xgb_1 <- xgboost::xgb.train(data = xgbmat_train_1,
                                 nrounds = 10,
-                                watchlist = list("training" = xgbmat),
+                                watchlist = list("training" = xgbmat_train_1),
                                 objective = "binary:logistic",
-                                verbose = 1,
-                                eval_metric = "aucpr",
-                                nthread = 1)
+                                eval_metric = "auc")
+
+  expect_equal(fit_p_1$fit$evaluation_log, fit_xgb_1$evaluation_log)
+
+  pred_xgb_1 <- predict(fit_xgb_1, x_pred)
+  pred_p_1 <- predict(fit_p_1, new_data = penguins[1:4, ], type = "prob")
+  expect_equal(pred_p_1[[".pred_female"]], pred_xgb_1)
+
+  # event_level = "second"
+  set.seed(24)
+  fit_p_2 <- boost_tree(trees = 10) %>%
+    set_engine("xgboost", eval_metric = "auc",
+               event_level = "second") %>%
+    set_mode("classification") %>%
+    fit(sex ~ ., data = penguins[-(1:4), ])
+
+  xgbmat_train_2 <- xgb.DMatrix(data = train_x, label = train_y_2)
+
+  set.seed(24)
+  fit_xgb_2 <- xgboost::xgb.train(data = xgbmat_train_2,
+                                  nrounds = 10,
+                                  watchlist = list("training" = xgbmat_train_2),
+                                  objective = "binary:logistic",
+                                  eval_metric = "auc")
+
+  expect_equal(fit_p_2$fit$evaluation_log, fit_xgb_2$evaluation_log)
 
-  expect_equal(fit_p$fit$evaluation_log, fit_xgb$evaluation_log)
+  pred_xgb_2 <- predict(fit_xgb_2, x_pred)
+  pred_p_2 <- predict(fit_p_2, new_data = penguins[1:4, ], type = "prob")
+  expect_equal(pred_p_2[[".pred_male"]], pred_xgb_2)
 
 })