6 changes: 6 additions & 0 deletions NAMESPACE
@@ -5,6 +5,7 @@ S3method(bake,step_collapse_stringdist)
S3method(bake,step_discretize_cart)
S3method(bake,step_discretize_xgb)
S3method(bake,step_embed)
S3method(bake,step_lencode)
S3method(bake,step_lencode_bayes)
S3method(bake,step_lencode_glm)
S3method(bake,step_lencode_mixed)
@@ -18,6 +19,7 @@ S3method(prep,step_collapse_stringdist)
S3method(prep,step_discretize_cart)
S3method(prep,step_discretize_xgb)
S3method(prep,step_embed)
S3method(prep,step_lencode)
S3method(prep,step_lencode_bayes)
S3method(prep,step_lencode_glm)
S3method(prep,step_lencode_mixed)
@@ -31,6 +33,7 @@ S3method(print,step_collapse_stringdist)
S3method(print,step_discretize_cart)
S3method(print,step_discretize_xgb)
S3method(print,step_embed)
S3method(print,step_lencode)
S3method(print,step_lencode_bayes)
S3method(print,step_lencode_glm)
S3method(print,step_lencode_mixed)
@@ -44,6 +47,7 @@ S3method(required_pkgs,step_collapse_stringdist)
S3method(required_pkgs,step_discretize_cart)
S3method(required_pkgs,step_discretize_xgb)
S3method(required_pkgs,step_embed)
S3method(required_pkgs,step_lencode)
S3method(required_pkgs,step_lencode_bayes)
S3method(required_pkgs,step_lencode_glm)
S3method(required_pkgs,step_lencode_mixed)
@@ -57,6 +61,7 @@ S3method(tidy,step_collapse_stringdist)
S3method(tidy,step_discretize_cart)
S3method(tidy,step_discretize_xgb)
S3method(tidy,step_embed)
S3method(tidy,step_lencode)
S3method(tidy,step_lencode_bayes)
S3method(tidy,step_lencode_glm)
S3method(tidy,step_lencode_mixed)
@@ -83,6 +88,7 @@ export(step_discretize_cart)
export(step_discretize_xgb)
export(step_embed)
export(step_feature_hash)
export(step_lencode)
export(step_lencode_bayes)
export(step_lencode_glm)
export(step_lencode_mixed)
2 changes: 2 additions & 0 deletions NEWS.md
@@ -2,6 +2,8 @@

* `step_feature_hash()` has been fully deprecated in favor of `textrecipes::step_dummy_hash()`. (#253)

* Adds `step_lencode()` to perform analytical likelihood encoding. (#258)

# embed 1.1.5

## Improvements
1 change: 1 addition & 0 deletions R/aaa.R
@@ -7,6 +7,7 @@ utils::globalVariables(
".",
"..level",
"..order",
"..value",
".group",
".rows",
"col_names",
311 changes: 311 additions & 0 deletions R/lencode.R
@@ -0,0 +1,311 @@
#' Likelihood encoding using analytical formula
#'
#' `step_lencode()` creates a *specification* of a recipe step that will convert
#' a nominal (i.e. factor) predictor into a single set of scores derived
#' analytically.
#'
#' @param recipe A recipe object. The step will be added to the sequence of
#' operations for this recipe.
#' @param ... One or more selector functions to choose variables. For
#' `step_lencode()`, this indicates the variables to be encoded into a
#' numeric format. See [recipes::selections()] for more details. For the
#' `tidy` method, these are not currently used.
#' @param role Not used by this step since no new variables are created.
#' @param outcome A call to `vars()` to specify which variable is used as the
#' outcome. Only numeric and two-level factors are currently supported.
#' @param mapping A list of tibble results that define the encoding. This is
#' `NULL` until the step is trained by [recipes::prep()].
#' @param skip A logical. Should the step be skipped when the recipe is baked by
#' [recipes::bake()]? While all operations are baked when [recipes::prep()] is
#' run, some operations may not be able to be conducted on new data (e.g.
#'   processing the outcome variable(s)). Care should be taken when using
#'   `skip = TRUE` as it may affect the computations for subsequent operations.
#' @param trained A logical to indicate if the quantities for preprocessing have
#' been estimated.
#' @param id A character string that is unique to this step to identify it.
#' @return An updated version of `recipe` with the new step added to the
#' sequence of existing steps (if any). For the `tidy` method, a tibble with
#' columns `terms` (the selectors or variables for encoding), `level` (the
#' factor levels), and `value` (the encodings).
#' @keywords datagen
#' @concept preprocessing encoding
#' @details
#'
#' Each selected nominal predictor will be replaced by a numeric predictor in
#' which each unique value of the original variable is mapped to a numeric
#' value. These values are calculated differently depending on the type of the
#' outcome.
#'
#' For **numeric** outcomes, each value is the mean of the outcome within the
#' corresponding level of the predictor. Levels not seen during training are
#' assigned the global mean of the outcome.
#' If case weights are used, a weighted mean is calculated instead.
#'
#' For **nominal** outcomes, each value is the log odds of the first level of
#' the outcome being present, within the corresponding level of the predictor.
#' Levels not seen during training are assigned the global log odds, computed
#' without stratification.
#' If case weights are used, a weighted log odds is calculated instead.
#'
#' If the observed proportion within a level is `0` or `1` (none or all of the
#' outcomes are the first level), the log odds is instead calculated using
#' `p = (2 * nrow(data) - 1) / (2 * nrow(data))` to avoid the infinite values
#' that taking the log of `0` would produce. For example, with 100 training
#' rows this gives `p = 199/200 = 0.995`.
#'
#' # Tidying
#'
#' When you [`tidy()`][recipes::tidy.recipe] this step, a tibble is returned
#' with columns `level`, `value`, `terms`, and `id`:
#'
#' \describe{
#' \item{level}{character, the factor levels}
#' \item{value}{numeric, the encoding}
#' \item{terms}{character, the selectors or variables selected}
#' \item{id}{character, id of this step}
#' }
#'
#' @template case-weights-supervised
#'
#' @references
#'
#' Micci-Barreca D (2001) "A preprocessing scheme for high-cardinality
#' categorical attributes in classification and prediction problems," ACM SIGKDD
#' Explorations Newsletter, 3(1), 27-32.
#'
#' Zumel N and Mount J (2017) "vtreat: a data.frame Processor for Predictive
#' Modeling," arXiv:1611.09477
#'
#' @examplesIf rlang::is_installed("modeldata")
#' library(recipes)
#' library(dplyr)
#' library(modeldata)
#'
#' data(grants)
#'
#' set.seed(1)
#' grants_other <- sample_n(grants_other, 500)
#' reencoded <- recipe(class ~ sponsor_code, data = grants_other) |>
#' step_lencode(sponsor_code, outcome = vars(class)) |>
#' prep()
#'
#' bake(reencoded, grants_other)
#'
#' tidy(reencoded, 1)
#' @export
step_lencode <-
function(
recipe,
...,
role = NA,
trained = FALSE,
outcome = NULL,
mapping = NULL,
skip = FALSE,
id = rand_id("lencode")
) {
if (is.null(outcome)) {
cli::cli_abort("Please list a variable in {.arg outcome}.")
}
add_step(
recipe,
step_lencode_new(
terms = enquos(...),
role = role,
trained = trained,
outcome = outcome,
mapping = mapping,
skip = skip,
id = id,
case_weights = NULL
)
)
}

step_lencode_new <-
function(terms, role, trained, outcome, mapping, skip, id, case_weights) {
step(
subclass = "lencode",
terms = terms,
role = role,
trained = trained,
outcome = outcome,
mapping = mapping,
skip = skip,
id = id,
case_weights = case_weights
)
}

#' @export
prep.step_lencode <- function(x, training, info = NULL, ...) {
col_names <- recipes_eval_select(x$terms, training, info)

wts <- get_case_weights(info, training)
were_weights_used <- are_weights_used(wts)
if (isFALSE(were_weights_used) || is.null(wts)) {
wts <- NULL
}

if (length(col_names) > 0) {
check_type(training[, col_names], types = c("string", "factor", "ordered"))
y_name <- recipes_eval_select(x$outcome, training, info)
res <- purrr::map(
training[, col_names],
lencode_calc,
y = training[[y_name]],
wts = wts
)
} else {
res <- list()
}
step_lencode_new(
terms = x$terms,
role = x$role,
trained = TRUE,
outcome = x$outcome,
mapping = res,
skip = x$skip,
id = x$id,
case_weights = were_weights_used
)
}
lencode_calc <- function(x, y, wts = NULL) {
if (!is.numeric(y) && !is.factor(y) && !is.character(y)) {
cli::cli_abort(
"Only works nominal or numeric {.arg outcome},
not {.obj_type_friendly {y}}."
)
}

data <- tibble::new_tibble(
list(..level = x, ..value = y, wts = wts)
)

if (is.numeric(y)) {
if (is.null(wts)) {
res <- dplyr::summarise(
data,
..value = mean(..value),
.by = ..level
)
unseen_value <- mean(data$..value)
} else {
res <- dplyr::summarise(
data,
..value = stats::weighted.mean(..value, wts),
.by = ..level
)
unseen_value <- stats::weighted.mean(data$..value, data$wts)
}
}

if (is.factor(y) || is.character(y)) {
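    # Fallback log odds used when a level's observed proportion is 0 or 1,
    # which would otherwise produce an infinite value below.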
inf_estimate_p <- (2 * nrow(data) - 1) / (2 * nrow(data))
inf_estimate_log_odds <- log(inf_estimate_p / (1 - inf_estimate_p))
if (is.null(wts)) {
res <- dplyr::summarize(
data,
p = (sum(..value == levels(..value)[1])) / n(),
.by = ..level
)

global_p <- (sum(data$..value == levels(data$..value)[1])) / nrow(data)
} else {
data$wts <- as.numeric(data$wts)
res <- dplyr::summarize(
data,
p = (sum((..value == levels(..value)[1]) * wts)) / sum(wts),
.by = ..level
)

global_p <- (sum((data$..value == levels(data$..value)[1]) * data$wts)) /
sum(data$wts)
}

res <- res |>
dplyr::mutate(..value = log(p / (1 - p))) |>
dplyr::mutate(
..value = dplyr::if_else(
is.infinite(..value),
inf_estimate_log_odds,
..value
)
) |>
dplyr::select(-p)

unseen_value <- log(global_p / (1 - global_p))
}

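  # Extra row used at bake() time for factor levels not seen during training.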
unseen <- tibble::new_tibble(
list(
..level = "..new",
..value = unseen_value
)
)

dplyr::bind_rows(res, unseen)
}
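
# Illustrative sketch (not part of this PR): roughly what lencode_calc()
# returns for a toy two-level factor outcome. The toy vectors below are made
# up for demonstration only; lencode_calc() is internal and not exported.
#
#   x <- factor(c("a", "a", "b", "b", "b"))
#   y <- factor(c("yes", "no", "yes", "yes", "yes"), levels = c("yes", "no"))
#   lencode_calc(x, y)
#
# Expected shape: one row per observed level plus a "..new" row for unseen
# levels, e.g.
#
#   ..level   ..value
#   a         0.000   # log(0.5 / 0.5)
#   b         2.197   # p = 1 capped at (2 * 5 - 1) / (2 * 5) = 0.9; log(0.9 / 0.1)
#   ..new     1.386   # global log odds: log(0.8 / 0.2)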

#' @export
bake.step_lencode <- function(object, new_data, ...) {
col_names <- names(object$mapping)
check_new_data(col_names, object, new_data)

for (col_name in col_names) {
new_data[[col_name]] <- map_glm_coef(
dat = new_data[, col_name], # map_glm_coef() expects a tibble
mapping = object$mapping[[col_name]]
)
}

new_data
}
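
# map_glm_coef() is an existing internal helper shared with the other
# step_lencode_*() steps and is not part of this diff. Conceptually (an
# assumption about its behavior, not the actual implementation), it replaces
# each value of the column with its encoding from the mapping, falling back to
# the "..new" row for levels not present in the mapping, roughly like:
#
#   lookup_value <- function(dat, mapping) {
#     idx <- match(as.character(dat[[1]]), mapping$..level)
#     idx[is.na(idx)] <- match("..new", mapping$..level)  # unseen levels
#     mapping$..value[idx]
#   }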

#' @export
print.step_lencode <-
function(x, width = max(20, options()$width - 31), ...) {
title <- "Linear embedding for factors via GLM for "
print_step(
names(x$mapping),
x$terms,
x$trained,
title,
width,
case_weights = x$case_weights
)
invisible(x)
}

#' @rdname step_lencode
#' @usage NULL
#' @export
tidy.step_lencode <- function(x, ...) {
if (is_trained(x)) {
if (length(x$mapping) == 0) {
res <- tibble(
terms = character(),
level = character(),
value = double()
)
} else {
for (i in seq_along(x$mapping)) {
x$mapping[[i]]$terms <- names(x$mapping)[i]
}
res <- bind_rows(x$mapping)
names(res) <- gsub("^\\.\\.", "", names(res))
}
} else {
term_names <- sel2char(x$terms)
res <- tibble(
terms = term_names,
level = rep(na_chr, length(term_names)),
value = rep(na_dbl, length(term_names))
)
}
res$id <- x$id
res
}

#' @rdname required_pkgs.embed
#' @export
required_pkgs.step_lencode <- function(x, ...) {
c("embed")
}
1 change: 1 addition & 0 deletions _pkgdown.yml
@@ -15,6 +15,7 @@ reference:
contents:
- step_embed
- step_feature_hash
- step_lencode
- step_lencode_bayes
- step_lencode_glm
- step_lencode_mixed