3 changes: 3 additions & 0 deletions NEWS.md
@@ -1,6 +1,9 @@
# embed (development version)

* Make `step_discretize_xgb()` work with all versions of xgboost. (#267)

# embed 1.2.0

## Improvements

* Adds `step_lencode()` to perform analytical likelihood encoding. (#258)
54 changes: 39 additions & 15 deletions R/discretize_xgb.R
@@ -206,20 +206,38 @@ run_xgboost <- function(
)
}

  xgboost::xgb.train(
    params = .params,
    nrounds = 100,
    data = .train,
    watchlist = list(
      train = .train,
      test = .test
    ),
    tree_method = "hist",
    early_stopping_rounds = 10,
    objective = .objective,
    verbose = 0,
    nthread = 1
  )
  if (utils::packageVersion("xgboost") >= "2.0.0.0") {
    # newer xgboost expects these settings inside `params` and uses
    # `evals` in place of `watchlist`
    .params$tree_method <- "hist"
    .params$nthread <- 1
    .params$objective <- .objective

    xgboost::xgb.train(
      params = .params,
      nrounds = 100,
      data = .train,
      evals = list(
        train = .train,
        test = .test
      ),
      early_stopping_rounds = 10,
      verbose = 0
    )
  } else {
    xgboost::xgb.train(
      params = .params,
      nrounds = 100,
      data = .train,
      watchlist = list(
        train = .train,
        test = .test
      ),
      tree_method = "hist",
      early_stopping_rounds = 10,
      objective = .objective,
      verbose = 0,
      nthread = 1
    )
  }
}
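
Outside the package, the same version dispatch can be exercised directly; a minimal sketch, assuming `dtrain` is an existing `xgb.DMatrix` (the variable name is illustrative, not from the PR):

new_xgb <- utils::packageVersion("xgboost") >= "2.0.0.0"
args <- list(
  params = list(max_depth = 1, eta = 0.3, nthread = 1, objective = "reg:squarederror"),
  data = dtrain,
  nrounds = 10,
  verbose = 0
)
# newer releases renamed the evaluation-set argument from `watchlist` to `evals`
args[[if (new_xgb) "evals" else "watchlist"]] <- list(train = dtrain)
booster <- do.call(xgboost::xgb.train, args)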

xgb_binning <- function(
@@ -373,10 +391,16 @@ xgb_binning <- function(
# inform the user that the dataset is insufficient for this particular case
# https://github.com/dmlc/xgboost/issues/2876
# https://stackoverflow.com/questions/42670033/r-getting-non-tree-model-detected-this-function-can-only-be-used-with-tree-mo
  if (utils::packageVersion("xgboost") >= "2.0.0.0") {
    # newer xgboost stores the early-stopping results in the "early_stop" attribute
    best_iteration <- attr(xgb_mdl, "early_stop")$best_iteration
  } else {
    best_iteration <- xgb_mdl$best_iteration
  }

  xgb_tree <- try(
    xgboost::xgb.model.dt.tree(
      model = xgb_mdl,
      trees = 1:xgb_mdl$best_iteration,
      trees = seq_len(best_iteration),
      use_int_id = TRUE
    ),
    silent = TRUE
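
The new attribute lookup could be wrapped in one place; a minimal sketch (`get_best_iteration` is a hypothetical helper, not part of embed):

get_best_iteration <- function(booster) {
  if (utils::packageVersion("xgboost") >= "2.0.0.0") {
    # newer releases keep early-stopping results in the "early_stop" attribute
    attr(booster, "early_stop")$best_iteration
  } else {
    # older releases expose the value directly on the booster
    booster$best_iteration
  }
}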
241 changes: 241 additions & 0 deletions tests/testthat/_snaps/1.7.11.1/discretize_xgb.md
@@ -0,0 +1,241 @@
# run_xgboost for classification

Code
xgboost
Output
##### xgb.Booster
raw: 74.2 Kb
call:
xgboost::xgb.train(params = .params, data = .train, nrounds = 100,
watchlist = list(train = .train, test = .test), verbose = 0,
early_stopping_rounds = 10, tree_method = "hist", objective = .objective,
nthread = 1)
params (as set within xgb.train):
eta = "0.3", max_bin = "10", max_depth = "1", min_child_weight = "5", tree_method = "hist", objective = "binary:logistic", nthread = "1", validate_parameters = "TRUE"
xgb.attributes:
best_iteration, best_msg, best_ntreelimit, best_score, niter
callbacks:
cb.evaluation.log()
cb.early.stop(stopping_rounds = early_stopping_rounds, maximize = maximize,
verbose = verbose)
# of features: 13
niter: 96
best_iteration : 86
best_ntreelimit : 86
best_score : 0.4421503
best_msg : [86] train-logloss:0.417583 test-logloss:0.442150
nfeatures : 13
evaluation_log:
iter train_logloss test_logloss
<num> <num> <num>
1 0.6279229 0.6303495
2 0.5869984 0.5894989
--- --- ---
95 0.4157892 0.4425857
96 0.4156102 0.4432699

# run_xgboost for multi-classification

Code
xgboost
Output
##### xgb.Booster
raw: 149.7 Kb
call:
xgboost::xgb.train(params = .params, data = .train, nrounds = 100,
watchlist = list(train = .train, test = .test), verbose = 0,
early_stopping_rounds = 10, tree_method = "hist", objective = .objective,
nthread = 1)
params (as set within xgb.train):
eta = "0.3", max_bin = "10", max_depth = "1", min_child_weight = "5", num_class = "6", tree_method = "hist", objective = "multi:softprob", nthread = "1", validate_parameters = "TRUE"
xgb.attributes:
best_iteration, best_msg, best_ntreelimit, best_score, niter
callbacks:
cb.evaluation.log()
cb.early.stop(stopping_rounds = early_stopping_rounds, maximize = maximize,
verbose = verbose)
# of features: 30
niter: 33
best_iteration : 23
best_ntreelimit : 23
best_score : 1.246428
best_msg : [23] train-mlogloss:1.178121 test-mlogloss:1.246428
nfeatures : 30
evaluation_log:
iter train_mlogloss test_mlogloss
<num> <num> <num>
1 1.623174 1.631783
2 1.515108 1.531188
--- --- ---
32 1.159813 1.249701
33 1.158088 1.250462

# run_xgboost for regression

Code
xgboost
Output
##### xgb.Booster
raw: 40.2 Kb
call:
xgboost::xgb.train(params = .params, data = .train, nrounds = 100,
watchlist = list(train = .train, test = .test), verbose = 0,
early_stopping_rounds = 10, tree_method = "hist", objective = .objective,
nthread = 1)
params (as set within xgb.train):
eta = "0.3", max_bin = "10", max_depth = "1", min_child_weight = "5", tree_method = "hist", objective = "reg:squarederror", nthread = "1", validate_parameters = "TRUE"
xgb.attributes:
best_iteration, best_msg, best_ntreelimit, best_score, niter
callbacks:
cb.evaluation.log()
cb.early.stop(stopping_rounds = early_stopping_rounds, maximize = maximize,
verbose = verbose)
# of features: 73
niter: 50
best_iteration : 40
best_ntreelimit : 40
best_score : 0.1165337
best_msg : [40] train-rmse:0.064010 test-rmse:0.116534
nfeatures : 73
evaluation_log:
iter train_rmse test_rmse
<num> <num> <num>
1 3.31007782 3.3068878
2 2.31969213 2.3262197
--- --- ---
49 0.06207940 0.1175223
50 0.06191289 0.1188113

# xgb_binning for classification

Code
xgb_binning
Output
[1] 1 2 3 5 6 9 12 15 20
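
These split points become the interval factor levels shown in the `step_discretize_xgb` snapshots further down. A rough base-R illustration of the idea, not the package's exact labeling code:

breaks <- c(-Inf, 1, 2, 3, 5, 6, 9, 12, 15, 20, Inf)
# each value is assigned to the half-open interval that contains it
cut(c(0, 4, 25), breaks = breaks, right = FALSE)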

# xgb_binning for multi-classification

Code
xgb_binning
Output
[1] 26 31 35 38

# xgb_binning for regression

Code
xgb_binning
Output
[1] 42.01972 42.02510 42.03122 42.03462 42.03840 42.04638 42.05236 42.05917

# step_discretize_xgb for classification

Code
xgb_train_bins[1:10, ]
Output
# A tibble: 10 x 3
x z class
<fct> <fct> <fct>
1 [0.6808, Inf] [0.4208,0.7999) a
2 [0.5749,0.6808) [0.4208,0.7999) b
3 [0.3687,0.5749) [-Inf,0.3327) b
4 [0.5749,0.6808) [0.4208,0.7999) b
5 [0.6808, Inf] [0.4208,0.7999) a
6 [0.5749,0.6808) [0.7999, Inf] a
7 [0.5749,0.6808) [-Inf,0.3327) b
8 [0.5749,0.6808) [-Inf,0.3327) a
9 [0.2779,0.3687) [0.4208,0.7999) a
10 [0.3687,0.5749) [0.4208,0.7999) b

---

Code
xgb_test_bins[1:10, ]
Output
# A tibble: 10 x 3
x z class
<fct> <fct> <fct>
1 [0.5749,0.6808) [-Inf,0.3327) b
2 [0.5749,0.6808) [0.4208,0.7999) b
3 [0.3687,0.5749) [-Inf,0.3327) b
4 [0.2779,0.3687) [0.4208,0.7999) b
5 [0.6808, Inf] [0.7999, Inf] a
6 [0.3687,0.5749) [-Inf,0.3327) b
7 [0.2779,0.3687) [0.4208,0.7999) a
8 [0.6808, Inf] [0.7999, Inf] a
9 [0.3687,0.5749) [0.7999, Inf] b
10 [0.3687,0.5749) [0.4208,0.7999) b

---

Code
prep(step_discretize_xgb(recipe(class ~ ., data = sim_tr_cls[1:9, ]),
all_predictors(), outcome = "class"))
Condition
Error in `step_discretize_xgb()`:
Caused by error in `prep()`:
! Too few observations in the early stopping validation set.
i Consider increasing the `sample_val` parameter.

---

Code
set.seed(1)
prep(step_discretize_xgb(recipe(Status ~ ., data = credit_data_train), Time,
outcome = "Status"), retain = TRUE)
Condition
Warning:
More than 20 unique training set values are required.
i Predictors 'Time' were not processed; their original values will be used.
Message

-- Recipe ----------------------------------------------------------------------

-- Inputs
Number of variables by role
outcome: 1
predictor: 13

-- Training information
Training data contained 3340 data points and 301 incomplete rows.

-- Operations
* Discretizing variables using xgboost: <none> | Trained

# step_discretize_xgb for multi-classification

Code
xgb_train_bins[1:10, ]
Output
# A tibble: 10 x 3
x z class
<fct> <fct> <fct>
1 [0.6879, Inf] [0.3274, Inf] c
2 [0.5863,0.6879) [0.3274, Inf] b
3 [0.3821,0.5863) [-Inf,0.3274) b
4 [0.5863,0.6879) [0.3274, Inf] b
5 [0.6879, Inf] [0.3274, Inf] c
6 [0.5863,0.6879) [0.3274, Inf] c
7 [0.5863,0.6879) [-Inf,0.3274) b
8 [0.5863,0.6879) [-Inf,0.3274) c
9 [-Inf,0.2887) [0.3274, Inf] a
10 [0.3821,0.5863) [0.3274, Inf] b

---

Code
xgb_test_bins[1:10, ]
Output
# A tibble: 10 x 3
x z class
<fct> <fct> <fct>
1 [0.5863,0.6879) [-Inf,0.3274) b
2 [0.5863,0.6879) [0.3274, Inf] b
3 [0.3821,0.5863) [-Inf,0.3274) b
4 [0.2887,0.3821) [0.3274, Inf] b
5 [0.6879, Inf] [0.3274, Inf] c
6 [0.3821,0.5863) [-Inf,0.3274) b
7 [0.2887,0.3821) [0.3274, Inf] a
8 [0.6879, Inf] [0.3274, Inf] c
9 [0.3821,0.5863) [0.3274, Inf] b
10 [0.3821,0.5863) [0.3274, Inf] b
