6 changes: 6 additions & 0 deletions NAMESPACE
@@ -5,6 +5,7 @@ S3method(bake,step_collapse_stringdist)
S3method(bake,step_discretize_cart)
S3method(bake,step_discretize_xgb)
S3method(bake,step_embed)
S3method(bake,step_lencode)
S3method(bake,step_lencode_bayes)
S3method(bake,step_lencode_glm)
S3method(bake,step_lencode_mixed)
@@ -18,6 +19,7 @@ S3method(prep,step_collapse_stringdist)
S3method(prep,step_discretize_cart)
S3method(prep,step_discretize_xgb)
S3method(prep,step_embed)
S3method(prep,step_lencode)
S3method(prep,step_lencode_bayes)
S3method(prep,step_lencode_glm)
S3method(prep,step_lencode_mixed)
@@ -31,6 +33,7 @@ S3method(print,step_collapse_stringdist)
S3method(print,step_discretize_cart)
S3method(print,step_discretize_xgb)
S3method(print,step_embed)
S3method(print,step_lencode)
S3method(print,step_lencode_bayes)
S3method(print,step_lencode_glm)
S3method(print,step_lencode_mixed)
@@ -44,6 +47,7 @@ S3method(required_pkgs,step_collapse_stringdist)
S3method(required_pkgs,step_discretize_cart)
S3method(required_pkgs,step_discretize_xgb)
S3method(required_pkgs,step_embed)
S3method(required_pkgs,step_lencode)
S3method(required_pkgs,step_lencode_bayes)
S3method(required_pkgs,step_lencode_glm)
S3method(required_pkgs,step_lencode_mixed)
@@ -57,6 +61,7 @@ S3method(tidy,step_collapse_stringdist)
S3method(tidy,step_discretize_cart)
S3method(tidy,step_discretize_xgb)
S3method(tidy,step_embed)
S3method(tidy,step_lencode)
S3method(tidy,step_lencode_bayes)
S3method(tidy,step_lencode_glm)
S3method(tidy,step_lencode_mixed)
@@ -83,6 +88,7 @@ export(step_discretize_cart)
export(step_discretize_xgb)
export(step_embed)
export(step_feature_hash)
export(step_lencode)
export(step_lencode_bayes)
export(step_lencode_glm)
export(step_lencode_mixed)
2 changes: 2 additions & 0 deletions NEWS.md
@@ -2,6 +2,8 @@

* `step_feature_hash()` has been fully deprecated in favor of `textrecipes::step_dummy_hash()`. (#253)

* Adds `step_lencode()` to perform analytical likelihood encoding. (#258)

# embed 1.1.5

## Improvements
1 change: 1 addition & 0 deletions R/aaa.R
@@ -7,6 +7,7 @@ utils::globalVariables(
".",
"..level",
"..order",
"..value",
".group",
".rows",
"col_names",
311 changes: 311 additions & 0 deletions R/lencode.R
@@ -0,0 +1,311 @@
#' Likelihood encoding using analytical formula
#'
#' `step_lencode()` creates a *specification* of a recipe step that will convert
#' a nominal (i.e. factor) predictor into a single set of scores derived
#' analytically.
#'
#' @param recipe A recipe object. The step will be added to the sequence of
#' operations for this recipe.
#' @param ... One or more selector functions to choose variables. For
#' `step_lencode()`, this indicates the variables to be encoded into a
#' numeric format. See [recipes::selections()] for more details. For the
#' `tidy` method, these are not currently used.
#' @param role Not used by this step since no new variables are created.
#' @param outcome A call to `vars()` to specify which variable is used as the
#' outcome. Only numeric and two-level factors are currently supported.
#' @param mapping A list of tibble results that define the encoding. This is
#' `NULL` until the step is trained by [recipes::prep()].
#' @param skip A logical. Should the step be skipped when the recipe is baked by
#' [recipes::bake()]? While all operations are baked when [recipes::prep()] is
#' run, some operations may not be able to be conducted on new data (e.g.
#'   processing the outcome variable(s)). Care should be taken when using
#'   `skip = TRUE` as it may affect the computations for subsequent operations.
#' @param trained A logical to indicate if the quantities for preprocessing have
#' been estimated.
#' @param id A character string that is unique to this step to identify it.
#' @return An updated version of `recipe` with the new step added to the
#' sequence of existing steps (if any). For the `tidy` method, a tibble with
#' columns `terms` (the selectors or variables for encoding), `level` (the
#' factor levels), and `value` (the encodings).
#' @keywords datagen
#' @concept preprocessing encoding
#' @details
#'
#' Each selected nominal predictor will be replaced by a numeric predictor in
#' which each unique value of the original variable is mapped to a numeric
#' value. These values are calculated differently depending on the type of the
#' outcome.
#'
#' For **numeric** outcomes, each value is the mean of the outcome within the
#' corresponding level of the predictor. Levels not seen during training are
#' assigned the global mean of the outcome.
#' If case weights are used, a weighted mean is calculated instead.
#'
#' For **nominal** outcomes, each value is the log odds of the first level of
#' the outcome being present, within the corresponding level of the predictor.
#' Levels not seen during training are assigned the global log odds, computed
#' without stratification.
#' If case weights are used, a weighted log odds is calculated instead.
#'
#' If the observed proportion within a level is `0` or `1` (none or all of the
#' outcomes are the first level), the log odds is instead calculated using
#' `p = (2 * nrow(data) - 1) / (2 * nrow(data))` to avoid the infinite values
#' that taking the log of `0` would produce. For example, with 100 training
#' rows this gives `p = 199/200 = 0.995`.
#'
#' # Tidying
#'
#' When you [`tidy()`][recipes::tidy.recipe] this step, a tibble is returned
#' with columns `level`, `value`, `terms`, and `id`:
#'
#' \describe{
#' \item{level}{character, the factor levels}
#' \item{value}{numeric, the encoding}
#' \item{terms}{character, the selectors or variables selected}
#' \item{id}{character, id of this step}
#' }
#'
#' @template case-weights-supervised
#'
#' @references
#'
#' Micci-Barreca D (2001) "A preprocessing scheme for high-cardinality
#' categorical attributes in classification and prediction problems," ACM SIGKDD
#' Explorations Newsletter, 3(1), 27-32.
#'
#' Zumel N and Mount J (2017) "vtreat: a data.frame Processor for Predictive
#' Modeling," arXiv:1611.09477
#'
#' @examplesIf rlang::is_installed("modeldata")
#' library(recipes)
#' library(dplyr)
#' library(modeldata)
#'
#' data(grants)
#'
#' set.seed(1)
#' grants_other <- sample_n(grants_other, 500)
#' reencoded <- recipe(class ~ sponsor_code, data = grants_other) |>
#' step_lencode(sponsor_code, outcome = vars(class)) |>
#' prep()
#'
#' bake(reencoded, grants_other)
#'
#' tidy(reencoded, 1)
#' @export
step_lencode <-
function(
recipe,
...,
role = NA,
trained = FALSE,
outcome = NULL,
mapping = NULL,
skip = FALSE,
id = rand_id("lencode")
) {
if (is.null(outcome)) {
cli::cli_abort("Please list a variable in {.arg outcome}.")
}
add_step(
recipe,
step_lencode_new(
terms = enquos(...),
role = role,
trained = trained,
outcome = outcome,
mapping = mapping,
skip = skip,
id = id,
case_weights = NULL
)
)
}

step_lencode_new <-
function(terms, role, trained, outcome, mapping, skip, id, case_weights) {
step(
subclass = "lencode",
terms = terms,
role = role,
trained = trained,
outcome = outcome,
mapping = mapping,
skip = skip,
id = id,
case_weights = case_weights
)
}

#' @export
prep.step_lencode <- function(x, training, info = NULL, ...) {
col_names <- recipes_eval_select(x$terms, training, info)

wts <- get_case_weights(info, training)
were_weights_used <- are_weights_used(wts)
if (isFALSE(were_weights_used) || is.null(wts)) {
wts <- NULL
}

if (length(col_names) > 0) {
check_type(training[, col_names], types = c("string", "factor", "ordered"))
y_name <- recipes_eval_select(x$outcome, training, info)
res <- purrr::map(
training[, col_names],
lencode_calc,
y = training[[y_name]],
wts = wts
)
} else {
res <- list()
}
step_lencode_new(
terms = x$terms,
role = x$role,
trained = TRUE,
outcome = x$outcome,
mapping = res,
skip = x$skip,
id = x$id,
case_weights = were_weights_used
)
}
lencode_calc <- function(x, y, wts = NULL) {
if (!is.numeric(y) && !is.factor(y) && !is.character(y)) {
cli::cli_abort(
"Only works nominal or numeric {.arg outcome},
not {.obj_type_friendly {y}}."
)
}

data <- tibble::new_tibble(
list(..level = x, ..value = y, wts = wts)
)

if (is.numeric(y)) {
if (is.null(wts)) {
res <- dplyr::summarise(
data,
..value = mean(..value),
.by = ..level
)
unseen_value <- mean(data$..value)
} else {
res <- dplyr::summarise(
data,
..value = stats::weighted.mean(..value, wts),
.by = ..level
)
unseen_value <- stats::weighted.mean(data$..value, data$wts)
}
}

if (is.factor(y) || is.character(y)) {
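    # Fallback log odds used when a level's observed proportion is 0 or 1,
    # which would otherwise produce an infinite value below.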
inf_estimate_p <- (2 * nrow(data) - 1) / (2 * nrow(data))
inf_estimate_log_odds <- log(inf_estimate_p / (1 - inf_estimate_p))
if (is.null(wts)) {
res <- dplyr::summarize(
data,
p = (sum(..value == levels(..value)[1])) / n(),
.by = ..level
)

global_p <- (sum(data$..value == levels(data$..value)[1])) / nrow(data)
} else {
data$wts <- as.numeric(data$wts)
res <- dplyr::summarize(
data,
p = (sum((..value == levels(..value)[1]) * wts)) / sum(wts),
.by = ..level
)

global_p <- (sum((data$..value == levels(data$..value)[1]) * data$wts)) /
sum(data$wts)
}

res <- res |>
dplyr::mutate(..value = log(p / (1 - p))) |>
dplyr::mutate(
..value = dplyr::if_else(
is.infinite(..value),
inf_estimate_log_odds,
..value
)
) |>
dplyr::select(-p)

unseen_value <- log(global_p / (1 - global_p))
}

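  # Extra row used at bake() time for factor levels not seen during training.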
unseen <- tibble::new_tibble(
list(
..level = "..new",
..value = unseen_value
)
)

dplyr::bind_rows(res, unseen)
}
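
# Illustrative sketch (not part of this PR): roughly what lencode_calc()
# returns for a toy two-level factor outcome. The toy vectors below are made
# up for demonstration only; lencode_calc() is internal and not exported.
#
#   x <- factor(c("a", "a", "b", "b", "b"))
#   y <- factor(c("yes", "no", "yes", "yes", "yes"), levels = c("yes", "no"))
#   lencode_calc(x, y)
#
# Expected shape: one row per observed level plus a "..new" row for unseen
# levels, e.g.
#
#   ..level   ..value
#   a         0.000   # log(0.5 / 0.5)
#   b         2.197   # p = 1 capped at (2 * 5 - 1) / (2 * 5) = 0.9; log(0.9 / 0.1)
#   ..new     1.386   # global log odds: log(0.8 / 0.2)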

#' @export
bake.step_lencode <- function(object, new_data, ...) {
col_names <- names(object$mapping)
check_new_data(col_names, object, new_data)

for (col_name in col_names) {
new_data[[col_name]] <- map_glm_coef(
dat = new_data[, col_name], # map_glm_coef() expects a tibble
mapping = object$mapping[[col_name]]
)
}

new_data
}
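
# map_glm_coef() is an existing internal helper shared with the other
# step_lencode_*() steps and is not part of this diff. Conceptually (an
# assumption about its behavior, not the actual implementation), it replaces
# each value of the column with its encoding from the mapping, falling back to
# the "..new" row for levels not present in the mapping, roughly like:
#
#   lookup_value <- function(dat, mapping) {
#     idx <- match(as.character(dat[[1]]), mapping$..level)
#     idx[is.na(idx)] <- match("..new", mapping$..level)  # unseen levels
#     mapping$..value[idx]
#   }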

#' @export
print.step_lencode <-
function(x, width = max(20, options()$width - 31), ...) {
title <- "Linear embedding for factors via GLM for "
print_step(
names(x$mapping),
x$terms,
x$trained,
title,
width,
case_weights = x$case_weights
)
invisible(x)
}

#' @rdname step_lencode
#' @usage NULL
#' @export
tidy.step_lencode <- function(x, ...) {
if (is_trained(x)) {
if (length(x$mapping) == 0) {
res <- tibble(
terms = character(),
level = character(),
value = double()
)
} else {
for (i in seq_along(x$mapping)) {
x$mapping[[i]]$terms <- names(x$mapping)[i]
}
res <- bind_rows(x$mapping)
names(res) <- gsub("^\\.\\.", "", names(res))
}
} else {
term_names <- sel2char(x$terms)
res <- tibble(
terms = term_names,
level = rep(na_chr, length(term_names)),
value = rep(na_dbl, length(term_names))
)
}
res$id <- x$id
res
}

#' @rdname required_pkgs.embed
#' @export
required_pkgs.step_lencode <- function(x, ...) {
c("embed")
}
1 change: 1 addition & 0 deletions _pkgdown.yml
@@ -15,6 +15,7 @@ reference:
contents:
- step_embed
- step_feature_hash
- step_lencode
- step_lencode_bayes
- step_lencode_glm
- step_lencode_mixed