diff --git a/R/boost_tree.R b/R/boost_tree.R
index 2515d7c3f..32d9bc1cd 100644
--- a/R/boost_tree.R
+++ b/R/boost_tree.R
@@ -472,6 +472,17 @@ as_xgb_data <- function(x, y, validation = 0, event_level = "first", ...) {
   list(data = dat, watchlist = wlist)
 }
 
+
+get_event_level <- function(model_spec) {
+  if ("event_level" %in% names(model_spec$eng_args)) {
+    event_level <- get_expr(model_spec$eng_args$event_level)
+  } else {
+    # "first" is the default for as_xgb_data() and xgb_train()
+    event_level <- "first"
+  }
+  event_level
+}
+
 #' @importFrom purrr map_df
 #' @export
 #' @rdname multi_predict
diff --git a/R/boost_tree_data.R b/R/boost_tree_data.R
index 742354c24..da069d8c9 100644
--- a/R/boost_tree_data.R
+++ b/R/boost_tree_data.R
@@ -158,7 +158,12 @@ set_pred(
     pre = NULL,
     post = function(x, object) {
       if (is.vector(x)) {
-        x <- ifelse(x >= 0.5, object$lvl[2], object$lvl[1])
+        event_level <- get_event_level(object$spec)
+        if (event_level == "first") {
+          x <- ifelse(x >= 0.5, object$lvl[1], object$lvl[2])
+        } else {
+          x <- ifelse(x >= 0.5, object$lvl[2], object$lvl[1])
+        }
       } else {
         x <- object$lvl[apply(x, 1, which.max)]
       }
@@ -178,7 +183,12 @@ set_pred(
     pre = NULL,
     post = function(x, object) {
       if (is.vector(x)) {
-        x <- tibble(v1 = 1 - x, v2 = x)
+        event_level <- get_event_level(object$spec)
+        if (event_level == "first") {
+          x <- tibble(v1 = x, v2 = 1 - x)
+        } else {
+          x <- tibble(v1 = 1 - x, v2 = x)
+        }
       } else {
         x <- as_tibble(x, .name_repair = "minimal")
       }
diff --git a/man/boost_tree.Rd b/man/boost_tree.Rd
index 9a2efa8c1..1e1b1c0bc 100644
--- a/man/boost_tree.Rd
+++ b/man/boost_tree.Rd
@@ -185,14 +185,17 @@ For this engine, tuning over \code{trees} is very efficient since the
 same model object can be used to make predictions over multiple values
 of \code{trees}.
 
-Finally, note that \code{xgboost} models require that non-numeric predictors
-(e.g., factors) must be converted to dummy variables or some other
-numeric representation. By default, when using \code{fit()} with \code{xgboost}, a
-one-hot encoding is used to convert factor predictors to indicator
-variables. In the classification mode, non-numeric outcomes (i.e.,
+Note that \code{xgboost} models require that non-numeric predictors (e.g.,
+factors) be converted to dummy variables or some other numeric
+representation. By default, when using \code{fit()} with \code{xgboost}, a one-hot
+encoding is used to convert factor predictors to indicator variables.
+
+Finally, in the classification mode, non-numeric outcomes (i.e.,
 factors) are converted to numeric. For binary classification, the
 \code{event_level} argument of \code{set_engine()} can be set to either \code{"first"}
-or \code{"second"} to specify which level should be used as the event.
+or \code{"second"} to specify which level should be used as the event. This
+can be helpful when a watchlist is used to monitor performance from
+within the xgboost training process.
 }
 
 \subsection{C5.0}{\if{html}{\out{<div class="sourceCode r">}}\preformatted{boost_tree() \%>\%
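The fix above is easiest to see in isolation. The following is a minimal, standalone sketch of what the patched `post` hooks now do, with `lvl` and `prob` standing in for `object$lvl` and xgboost's raw binary output (these two names are illustrative, not package API):

    library(tibble)

    lvl  <- c("female", "male")   # factor levels, in order (object$lvl)
    prob <- c(0.91, 0.12, 0.55)   # raw xgboost probabilities

    event_level <- "first"        # the default in as_xgb_data()/xgb_train()
    if (event_level == "first") {
      # raw probability is P(first level), so >= 0.5 maps to lvl[1]
      class_pred <- ifelse(prob >= 0.5, lvl[1], lvl[2])
      prob_pred  <- tibble(v1 = prob, v2 = 1 - prob)
    } else {
      # raw probability is P(second level), so >= 0.5 maps to lvl[2]
      class_pred <- ifelse(prob >= 0.5, lvl[2], lvl[1])
      prob_pred  <- tibble(v1 = 1 - prob, v2 = prob)
    }

Before this change, only the `else` branch existed, so class and probability predictions were flipped whenever the model had been trained with the default `event_level = "first"`. The tests below exercise both levels end to end.
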
diff --git a/tests/testthat/test_boost_tree_xgboost.R b/tests/testthat/test_boost_tree_xgboost.R
index 048b2d0ca..94448b6df 100644
--- a/tests/testthat/test_boost_tree_xgboost.R
+++ b/tests/testthat/test_boost_tree_xgboost.R
@@ -210,7 +210,7 @@ test_that('submodel prediction', {
   mp_res <- multi_predict(class_fit, new_data = wa_churn[1:4, vars], trees = 5, type = "prob")
   mp_res <- do.call("rbind", mp_res$.pred)
-  expect_equal(mp_res[[".pred_No"]], pred_class)
+  expect_equal(mp_res[[".pred_Yes"]], pred_class)
 
   expect_error(
     multi_predict(class_fit, newdata = wa_churn[1:4, vars], trees = 5, type = "prob"),
@@ -218,7 +218,6 @@ test_that('submodel prediction', {
   )
 })
 
-
 test_that('default engine', {
   skip_if_not_installed("xgboost")
   expect_warning(
@@ -422,43 +421,64 @@ test_that('argument checks for data dimensions', {
 })
 
-test_that("set `event_level` as engine-specific argument", {
+test_that("fit and prediction with `event_level`", {
   skip_if_not_installed("xgboost")
 
   data(penguins, package = "modeldata")
   penguins <- na.omit(penguins[, -c(1:2)])
 
-  spec <-
-    boost_tree(trees = 10, tree_depth = 3) %>%
-    set_engine(
-      "xgboost",
-      eval_metric = "aucpr",
-      event_level = "second",
-      verbose = 1
-    ) %>%
-    set_mode("classification")
+  train_x <- as.matrix(penguins[-(1:4), -5])
+  train_y_1 <- -as.numeric(penguins$sex[-(1:4)]) + 2
+  train_y_2 <- as.numeric(penguins$sex[-(1:4)]) - 1
+  x_pred <- xgboost::xgb.DMatrix(as.matrix(penguins[1:4, -5]))
+
+  # event_level = "first"
   set.seed(24)
-  fit_p <- spec %>% fit(sex ~ ., data = penguins)
+  fit_p_1 <- boost_tree(trees = 10) %>%
+    set_engine("xgboost", eval_metric = "auc"
+               # event_level = "first" is the default
+               ) %>%
+    set_mode("classification") %>%
+    fit(sex ~ ., data = penguins[-(1:4), ])
 
-  penguins_x <- as.matrix(penguins[, -5])
-  penguins_y <- as.numeric(penguins$sex) - 1
-  xgbmat <- xgb.DMatrix(data = penguins_x, label = penguins_y)
+  xgbmat_train_1 <- xgb.DMatrix(data = train_x, label = train_y_1)
 
   set.seed(24)
-  fit_xgb <- xgboost::xgb.train(data = xgbmat,
-                                params = list(eta = 0.3, max_depth = 3,
-                                              gamma = 0, colsample_bytree = 1,
-                                              min_child_weight = 1,
-                                              subsample = 1),
+  fit_xgb_1 <- xgboost::xgb.train(data = xgbmat_train_1,
                                 nrounds = 10,
-                                watchlist = list("training" = xgbmat),
+                                watchlist = list("training" = xgbmat_train_1),
                                 objective = "binary:logistic",
-                                verbose = 1,
-                                eval_metric = "aucpr",
-                                nthread = 1)
+                                eval_metric = "auc")
+
+  expect_equal(fit_p_1$fit$evaluation_log, fit_xgb_1$evaluation_log)
+
+  pred_xgb_1 <- predict(fit_xgb_1, x_pred)
+  pred_p_1 <- predict(fit_p_1, new_data = penguins[1:4, ], type = "prob")
+  expect_equal(pred_p_1[[".pred_female"]], pred_xgb_1)
+
+  # event_level = "second"
+  set.seed(24)
+  fit_p_2 <- boost_tree(trees = 10) %>%
+    set_engine("xgboost", eval_metric = "auc",
+               event_level = "second") %>%
+    set_mode("classification") %>%
+    fit(sex ~ ., data = penguins[-(1:4), ])
+
+  xgbmat_train_2 <- xgb.DMatrix(data = train_x, label = train_y_2)
+
+  set.seed(24)
+  fit_xgb_2 <- xgboost::xgb.train(data = xgbmat_train_2,
+                                  nrounds = 10,
+                                  watchlist = list("training" = xgbmat_train_2),
+                                  objective = "binary:logistic",
+                                  eval_metric = "auc")
+
+  expect_equal(fit_p_2$fit$evaluation_log, fit_xgb_2$evaluation_log)
 
-  expect_equal(fit_p$fit$evaluation_log, fit_xgb$evaluation_log)
+  pred_xgb_2 <- predict(fit_xgb_2, x_pred)
+  pred_p_2 <- predict(fit_p_2, new_data = penguins[1:4, ], type = "prob")
+  expect_equal(pred_p_2[[".pred_male"]], pred_xgb_2)
 })
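Two details of the new test are worth spelling out. The label encodings match the event level used at training time: `-as.numeric(sex) + 2` codes the first factor level ("female") as 1, the event for `event_level = "first"`, while `as.numeric(sex) - 1` codes the second level ("male") as 1; the `evaluation_log` comparisons above only hold if parsnip encodes the outcome the same way internally. From the user's side, the engine argument is the switch, e.g. (a hedged usage sketch mirroring the spec in the test, with `penguins` as prepared there):

    library(parsnip)

    # Treat the second factor level ("male") as the event.
    spec <- boost_tree(trees = 10) %>%
      set_engine("xgboost", eval_metric = "auc", event_level = "second") %>%
      set_mode("classification")

    fit(spec, sex ~ ., data = penguins)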