diff --git a/NAMESPACE b/NAMESPACE index 82bfb94..7ee9f61 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -5,6 +5,7 @@ S3method(bake,step_collapse_stringdist) S3method(bake,step_discretize_cart) S3method(bake,step_discretize_xgb) S3method(bake,step_embed) +S3method(bake,step_lencode) S3method(bake,step_lencode_bayes) S3method(bake,step_lencode_glm) S3method(bake,step_lencode_mixed) @@ -18,6 +19,7 @@ S3method(prep,step_collapse_stringdist) S3method(prep,step_discretize_cart) S3method(prep,step_discretize_xgb) S3method(prep,step_embed) +S3method(prep,step_lencode) S3method(prep,step_lencode_bayes) S3method(prep,step_lencode_glm) S3method(prep,step_lencode_mixed) @@ -31,6 +33,7 @@ S3method(print,step_collapse_stringdist) S3method(print,step_discretize_cart) S3method(print,step_discretize_xgb) S3method(print,step_embed) +S3method(print,step_lencode) S3method(print,step_lencode_bayes) S3method(print,step_lencode_glm) S3method(print,step_lencode_mixed) @@ -44,6 +47,7 @@ S3method(required_pkgs,step_collapse_stringdist) S3method(required_pkgs,step_discretize_cart) S3method(required_pkgs,step_discretize_xgb) S3method(required_pkgs,step_embed) +S3method(required_pkgs,step_lencode) S3method(required_pkgs,step_lencode_bayes) S3method(required_pkgs,step_lencode_glm) S3method(required_pkgs,step_lencode_mixed) @@ -57,6 +61,7 @@ S3method(tidy,step_collapse_stringdist) S3method(tidy,step_discretize_cart) S3method(tidy,step_discretize_xgb) S3method(tidy,step_embed) +S3method(tidy,step_lencode) S3method(tidy,step_lencode_bayes) S3method(tidy,step_lencode_glm) S3method(tidy,step_lencode_mixed) @@ -83,6 +88,7 @@ export(step_discretize_cart) export(step_discretize_xgb) export(step_embed) export(step_feature_hash) +export(step_lencode) export(step_lencode_bayes) export(step_lencode_glm) export(step_lencode_mixed) diff --git a/NEWS.md b/NEWS.md index c4542c4..baf3618 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,6 +2,8 @@ * `step_feature_hash()` has been fully deprecated in favor of `textrecipes::step_dummy_hash()`. (#253) +* Adds `step_lencode()` to perform analytical likelihood encoding. (#258) + # embed 1.1.5 ## Improvements diff --git a/R/aaa.R b/R/aaa.R index b33f824..d66b755 100644 --- a/R/aaa.R +++ b/R/aaa.R @@ -7,6 +7,7 @@ utils::globalVariables( ".", "..level", "..order", + "..value", ".group", ".rows", "col_names", diff --git a/R/lencode.R b/R/lencode.R new file mode 100644 index 0000000..a987cfc --- /dev/null +++ b/R/lencode.R @@ -0,0 +1,311 @@ +#' Likelihood encoding using analytical formula +#' +#' `step_lencode()` creates a *specification* of a recipe step that will convert +#' a nominal (i.e. factor) predictor into a single set of scores derived +#' analytically. +#' +#' @param recipe A recipe object. The step will be added to the sequence of +#' operations for this recipe. +#' @param ... One or more selector functions to choose variables. For +#' `step_lencode()`, this indicates the variables to be encoded into a +#' numeric format. See [recipes::selections()] for more details. For the +#' `tidy` method, these are not currently used. +#' @param role Not used by this step since no new variables are created. +#' @param outcome A call to `vars` to specify which variable is used as the +#' outcome. Only numeric and two-level factors are currently supported. +#' @param mapping A list of tibble results that define the encoding. This is +#' `NULL` until the step is trained by [recipes::prep()]. +#' @param skip A logical. Should the step be skipped when the recipe is baked by +#' [recipes::bake()]? While all operations are baked when [recipes::prep()] is +#' run, some operations may not be able to be conducted on new data (e.g. +#' processing the outcome variable(s)). Care should be taken when using `skip +#' = TRUE` as it may affect the computations for subsequent operations +#' @param trained A logical to indicate if the quantities for preprocessing have +#' been estimated. +#' @param id A character string that is unique to this step to identify it. +#' @return An updated version of `recipe` with the new step added to the +#' sequence of existing steps (if any). For the `tidy` method, a tibble with +#' columns `terms` (the selectors or variables for encoding), `level` (the +#' factor levels), and `value` (the encodings). +#' @keywords datagen +#' @concept preprocessing encoding +#' @details +#' +#' Each selected nominal predictor will be replaced by a numeric predictor. +#' Each unique value of the nominal predictor is replaced by a numeric value. +#' Thse values are calculated differently depending on the type of the outcome. +#' +#' For **numeric** outcomes each value is the average value of the outcome +#' inside each of the levels of the predictor. Unseen levels of the predictor +#' will be using the global mean of the predictor. +#' If case weights are used then a weighted mean is calculated instead. +#' +#' For **nominal** outcomes each value is the log odds of the of the first level +#' of the outcome variable being present, within each level of the levels of the +#' predictor. Unseen levels will be replaced by the global log odds without +#' stratification. +#' If case weights are used then a weighted log odds is calculated. +#' +#' If no or all occurances happens then the log odds is calculated using +#' `p = (2 * nrow(data) - 1) / (2 * nrow(data))` to avoid infinity that would +#' happen by taking the log of `0`. +#' +#' # Tidying +#' +#' When you [`tidy()`][recipes::tidy.recipe] this step, a tibble is returned +#' with columns `level`, `value`, `terms`, and `id`: +#' +#' \describe{ +#' \item{level}{character, the factor levels} +#' \item{value}{numeric, the encoding} +#' \item{terms}{character, the selectors or variables selected} +#' \item{id}{character, id of this step} +#' } +#' +#' @template case-weights-supervised +#' +#' @references +#' +#' Micci-Barreca D (2001) "A preprocessing scheme for high-cardinality +#' categorical attributes in classification and prediction problems," ACM SIGKDD +#' Explorations Newsletter, 3(1), 27-32. +#' +#' Zumel N and Mount J (2017) "vtreat: a data.frame Processor for Predictive +#' Modeling," arXiv:1611.09477 +#' +#' @examplesIf rlang::is_installed("modeldata") +#' library(recipes) +#' library(dplyr) +#' library(modeldata) +#' +#' data(grants) +#' +#' set.seed(1) +#' grants_other <- sample_n(grants_other, 500) +#' reencoded <- recipe(class ~ sponsor_code, data = grants_other) |> +#' step_lencode(sponsor_code, outcome = vars(class)) |> +#' prep() +#' +#' bake(reencoded, grants_other) +#' +#' tidy(reencoded, 1) +#' @export +step_lencode <- + function( + recipe, + ..., + role = NA, + trained = FALSE, + outcome = NULL, + mapping = NULL, + skip = FALSE, + id = rand_id("lencode") + ) { + if (is.null(outcome)) { + cli::cli_abort("Please list a variable in {.arg outcome}.") + } + add_step( + recipe, + step_lencode_new( + terms = enquos(...), + role = role, + trained = trained, + outcome = outcome, + mapping = mapping, + skip = skip, + id = id, + case_weights = NULL + ) + ) + } + +step_lencode_new <- + function(terms, role, trained, outcome, mapping, skip, id, case_weights) { + step( + subclass = "lencode", + terms = terms, + role = role, + trained = trained, + outcome = outcome, + mapping = mapping, + skip = skip, + id = id, + case_weights = case_weights + ) + } + +#' @export +prep.step_lencode <- function(x, training, info = NULL, ...) { + col_names <- recipes_eval_select(x$terms, training, info) + + wts <- get_case_weights(info, training) + were_weights_used <- are_weights_used(wts) + if (isFALSE(were_weights_used) || is.null(wts)) { + wts <- NULL + } + + if (length(col_names) > 0) { + check_type(training[, col_names], types = c("string", "factor", "ordered")) + y_name <- recipes_eval_select(x$outcome, training, info) + res <- purrr::map( + training[, col_names], + lencode_calc, + y = training[[y_name]], + wts = wts + ) + } else { + res <- list() + } + step_lencode_new( + terms = x$terms, + role = x$role, + trained = TRUE, + outcome = x$outcome, + mapping = res, + skip = x$skip, + id = x$id, + case_weights = were_weights_used + ) +} +lencode_calc <- function(x, y, wts = NULL) { + if (!is.numeric(y) && !is.factor(y) && !is.character(y)) { + cli::cli_abort( + "Only works nominal or numeric {.arg outcome}, + not {.obj_type_friendly {y}}." + ) + } + + data <- tibble::new_tibble( + list(..level = x, ..value = y, wts = wts) + ) + + if (is.numeric(y)) { + if (is.null(wts)) { + res <- dplyr::summarise( + data, + ..value = mean(..value), + .by = ..level + ) + unseen_value <- mean(data$..value) + } else { + res <- dplyr::summarise( + data, + ..value = stats::weighted.mean(..value, wts), + .by = ..level + ) + unseen_value <- stats::weighted.mean(data$..value, data$wts) + } + } + + if (is.factor(y) || is.character(y)) { + inf_estimate_p <- (2 * nrow(data) - 1) / (2 * nrow(data)) + inf_estimate_log_odds <- log(inf_estimate_p / (1 - inf_estimate_p)) + if (is.null(wts)) { + res <- dplyr::summarize( + data, + p = (sum(..value == levels(..value)[1])) / n(), + .by = ..level + ) + + global_p <- (sum(data$..value == levels(data$..value)[1])) / nrow(data) + } else { + data$wts <- as.numeric(data$wts) + res <- dplyr::summarize( + data, + p = (sum((..value == levels(..value)[1]) * wts)) / sum(wts), + .by = ..level + ) + + global_p <- (sum((data$..value == levels(data$..value)[1]) * data$wts)) / + sum(data$wts) + } + + res <- res |> + dplyr::mutate(..value = log(p / (1 - p))) |> + dplyr::mutate( + ..value = dplyr::if_else( + is.infinite(..value), + inf_estimate_log_odds, + ..value + ) + ) |> + dplyr::select(-p) + + unseen_value <- log(global_p / (1 - global_p)) + } + + unseen <- tibble::new_tibble( + list( + ..level = "..new", + ..value = unseen_value + ) + ) + + dplyr::bind_rows(res, unseen) +} + +#' @export +bake.step_lencode <- function(object, new_data, ...) { + col_names <- names(object$mapping) + check_new_data(col_names, object, new_data) + + for (col_name in col_names) { + new_data[[col_name]] <- map_glm_coef( + dat = new_data[, col_name], # map_glm_coef() expects a tibble + mapping = object$mapping[[col_name]] + ) + } + + new_data +} + +#' @export +print.step_lencode <- + function(x, width = max(20, options()$width - 31), ...) { + title <- "Linear embedding for factors via GLM for " + print_step( + names(x$mapping), + x$terms, + x$trained, + title, + width, + case_weights = x$case_weights + ) + invisible(x) + } + +#' @rdname step_lencode +#' @usage NULL +#' @export +tidy.step_lencode <- function(x, ...) { + if (is_trained(x)) { + if (length(x$mapping) == 0) { + res <- tibble( + terms = character(), + level = character(), + value = double() + ) + } else { + for (i in seq_along(x$mapping)) { + x$mapping[[i]]$terms <- names(x$mapping)[i] + } + res <- bind_rows(x$mapping) + names(res) <- gsub("^\\.\\.", "", names(res)) + } + } else { + term_names <- sel2char(x$terms) + res <- tibble( + terms = term_names, + level = rep(na_chr, length(term_names)), + value = rep(na_dbl, length(term_names)) + ) + } + res$id <- x$id + res +} + +#' @rdname required_pkgs.embed +#' @export +required_pkgs.step_lencode <- function(x, ...) { + c("embed") +} diff --git a/_pkgdown.yml b/_pkgdown.yml index 6691947..6cd1e4b 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -15,6 +15,7 @@ reference: contents: - step_embed - step_feature_hash + - step_lencode - step_lencode_bayes - step_lencode_glm - step_lencode_mixed diff --git a/man/required_pkgs.embed.Rd b/man/required_pkgs.embed.Rd index ed60312..b2267c1 100644 --- a/man/required_pkgs.embed.Rd +++ b/man/required_pkgs.embed.Rd @@ -1,14 +1,15 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/collapse_cart.R, R/collapse_stringdist.R, -% R/discretize_cart.R, R/discretize_xgb.R, R/embed.R, R/lencode_bayes.R, -% R/lencode_glm.R, R/lencode_mixed.R, R/pca_sparse.R, R/pca_sparse_bayes.R, -% R/pca_truncated.R, R/umap.R, R/woe.R +% R/discretize_cart.R, R/discretize_xgb.R, R/embed.R, R/lencode.R, +% R/lencode_bayes.R, R/lencode_glm.R, R/lencode_mixed.R, R/pca_sparse.R, +% R/pca_sparse_bayes.R, R/pca_truncated.R, R/umap.R, R/woe.R \name{required_pkgs.step_collapse_cart} \alias{required_pkgs.step_collapse_cart} \alias{required_pkgs.step_collapse_stringdist} \alias{required_pkgs.step_discretize_cart} \alias{required_pkgs.step_discretize_xgb} \alias{required_pkgs.step_embed} +\alias{required_pkgs.step_lencode} \alias{required_pkgs.step_lencode_bayes} \alias{required_pkgs.step_lencode_glm} \alias{required_pkgs.step_lencode_mixed} @@ -29,6 +30,8 @@ \method{required_pkgs}{step_embed}(x, ...) +\method{required_pkgs}{step_lencode}(x, ...) + \method{required_pkgs}{step_lencode_bayes}(x, ...) \method{required_pkgs}{step_lencode_glm}(x, ...) diff --git a/man/step_lencode.Rd b/man/step_lencode.Rd new file mode 100644 index 0000000..87d1062 --- /dev/null +++ b/man/step_lencode.Rd @@ -0,0 +1,125 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lencode.R +\name{step_lencode} +\alias{step_lencode} +\alias{tidy.step_lencode} +\title{Likelihood encoding using analytical formula} +\usage{ +step_lencode( + recipe, + ..., + role = NA, + trained = FALSE, + outcome = NULL, + mapping = NULL, + skip = FALSE, + id = rand_id("lencode") +) +} +\arguments{ +\item{recipe}{A recipe object. The step will be added to the sequence of +operations for this recipe.} + +\item{...}{One or more selector functions to choose variables. For +\code{step_lencode()}, this indicates the variables to be encoded into a +numeric format. See \code{\link[recipes:selections]{recipes::selections()}} for more details. For the +\code{tidy} method, these are not currently used.} + +\item{role}{Not used by this step since no new variables are created.} + +\item{trained}{A logical to indicate if the quantities for preprocessing have +been estimated.} + +\item{outcome}{A call to \code{vars} to specify which variable is used as the +outcome. Only numeric and two-level factors are currently supported.} + +\item{mapping}{A list of tibble results that define the encoding. This is +\code{NULL} until the step is trained by \code{\link[recipes:prep]{recipes::prep()}}.} + +\item{skip}{A logical. Should the step be skipped when the recipe is baked by +\code{\link[recipes:bake]{recipes::bake()}}? While all operations are baked when \code{\link[recipes:prep]{recipes::prep()}} is +run, some operations may not be able to be conducted on new data (e.g. +processing the outcome variable(s)). Care should be taken when using \code{skip = TRUE} as it may affect the computations for subsequent operations} + +\item{id}{A character string that is unique to this step to identify it.} +} +\value{ +An updated version of \code{recipe} with the new step added to the +sequence of existing steps (if any). For the \code{tidy} method, a tibble with +columns \code{terms} (the selectors or variables for encoding), \code{level} (the +factor levels), and \code{value} (the encodings). +} +\description{ +\code{step_lencode()} creates a \emph{specification} of a recipe step that will convert +a nominal (i.e. factor) predictor into a single set of scores derived +analytically. +} +\details{ +Each selected nominal predictor will be replaced by a numeric predictor. +Each unique value of the nominal predictor is replaced by a numeric value. +Thse values are calculated differently depending on the type of the outcome. + +For \strong{numeric} outcomes each value is the average value of the outcome +inside each of the levels of the predictor. Unseen levels of the predictor +will be using the global mean of the predictor. +If case weights are used then a weighted mean is calculated instead. + +For \strong{nominal} outcomes each value is the log odds of the of the first level +of the outcome variable being present, within each level of the levels of the +predictor. Unseen levels will be replaced by the global log odds without +stratification. +If case weights are used then a weighted log odds is calculated. + +If no or all occurances happens then the log odds is calculated using +\code{p = (2 * nrow(data) - 1) / (2 * nrow(data))} to avoid infinity that would +happen by taking the log of \code{0}. +} +\section{Tidying}{ +When you \code{\link[recipes:tidy.recipe]{tidy()}} this step, a tibble is returned +with columns \code{level}, \code{value}, \code{terms}, and \code{id}: + +\describe{ +\item{level}{character, the factor levels} +\item{value}{numeric, the encoding} +\item{terms}{character, the selectors or variables selected} +\item{id}{character, id of this step} +} +} + +\section{Case weights}{ + + +This step performs an supervised operation that can utilize case weights. +To use them, see the documentation in \link[recipes:case_weights]{recipes::case_weights} and the examples on +\code{tidymodels.org}. +} + +\examples{ +\dontshow{if (rlang::is_installed("modeldata")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +library(recipes) +library(dplyr) +library(modeldata) + +data(grants) + +set.seed(1) +grants_other <- sample_n(grants_other, 500) +reencoded <- recipe(class ~ sponsor_code, data = grants_other) |> + step_lencode(sponsor_code, outcome = vars(class)) |> + prep() + +bake(reencoded, grants_other) + +tidy(reencoded, 1) +\dontshow{\}) # examplesIf} +} +\references{ +Micci-Barreca D (2001) "A preprocessing scheme for high-cardinality +categorical attributes in classification and prediction problems," ACM SIGKDD +Explorations Newsletter, 3(1), 27-32. + +Zumel N and Mount J (2017) "vtreat: a data.frame Processor for Predictive +Modeling," arXiv:1611.09477 +} +\concept{preprocessing encoding} +\keyword{datagen} diff --git a/tests/testthat/_snaps/lencode.md b/tests/testthat/_snaps/lencode.md new file mode 100644 index 0000000..ac8225a --- /dev/null +++ b/tests/testthat/_snaps/lencode.md @@ -0,0 +1,147 @@ +# factor outcome - factor predictor + + Code + new_values_ch <- bake(class_test, new_data = new_dat_ch) + Condition + Warning in `bake()`: + ! There was 1 column that was a factor when the recipe was prepped: + * `x3` + i This may cause errors when processing new data. + +# factor outcome - character predictor + + Code + new_values <- bake(class_test, new_data = new_dat_ch) + +# numeric outcome - factor predictor + + Code + new_values_ch <- bake(reg_test, new_data = new_dat_ch) + Condition + Warning in `bake()`: + ! There was 1 column that was a factor when the recipe was prepped: + * `x3` + i This may cause errors when processing new data. + +# bad args + + Code + prep(step_lencode(recipe(Species ~ ., data = three_class), Sepal.Length, + outcome = vars(Species)), training = three_class, retain = TRUE) + Condition + Error in `step_lencode()`: + Caused by error in `prep()`: + x All columns selected for the step should be string, factor, or ordered. + * 1 double variable found: `Sepal.Length` + +--- + + Code + prep(step_lencode(recipe(Species ~ ., data = three_class), Species, outcome = vars( + logical)), training = three_class, retain = TRUE) + Condition + Error in `step_lencode()`: + Caused by error in `purrr::map()`: + i In index: 1. + i With name: Species. + Caused by error in `.f()`: + ! Only works nominal or numeric `outcome`, not a logical vector. + +# case weights + + Code + class_test + Message + + -- Recipe ---------------------------------------------------------------------- + + -- Inputs + Number of variables by role + outcome: 1 + predictor: 3 + case_weights: 1 + + -- Training information + Training data contained 500 data points and no incomplete rows. + + -- Operations + * Linear embedding for factors via GLM for: x3 | Trained, weighted + +# bake method errors when needed non-standard role columns are missing + + Code + bake(rec_trained, new_data = ex_dat[, -3]) + Condition + Error in `step_lencode()`: + ! The following required column is missing from `new_data`: x3. + +# empty printing + + Code + rec + Message + + -- Recipe ---------------------------------------------------------------------- + + -- Inputs + Number of variables by role + outcome: 1 + predictor: 10 + + -- Operations + * Linear embedding for factors via GLM for: + +--- + + Code + rec + Message + + -- Recipe ---------------------------------------------------------------------- + + -- Inputs + Number of variables by role + outcome: 1 + predictor: 10 + + -- Training information + Training data contained 32 data points and no incomplete rows. + + -- Operations + * Linear embedding for factors via GLM for: | Trained + +# printing + + Code + print(rec) + Message + + -- Recipe ---------------------------------------------------------------------- + + -- Inputs + Number of variables by role + outcome: 1 + predictor: 3 + + -- Operations + * Linear embedding for factors via GLM for: x3 + +--- + + Code + prep(rec) + Message + + -- Recipe ---------------------------------------------------------------------- + + -- Inputs + Number of variables by role + outcome: 1 + predictor: 3 + + -- Training information + Training data contained 500 data points and no incomplete rows. + + -- Operations + * Linear embedding for factors via GLM for: x3 | Trained + diff --git a/tests/testthat/test-lencode.R b/tests/testthat/test-lencode.R new file mode 100644 index 0000000..f284f64 --- /dev/null +++ b/tests/testthat/test-lencode.R @@ -0,0 +1,439 @@ +test_that("factor outcome - factor predictor", { + class_test <- recipe(x2 ~ ., data = ex_dat) |> + step_lencode(x3, outcome = vars(x2), id = "id") |> + prep(training = ex_dat, retain = TRUE) + tr_values <- bake(class_test, new_data = NULL)$x3 + new_values <- bake(class_test, new_data = new_dat) + expect_snapshot( + new_values_ch <- bake(class_test, new_data = new_dat_ch) + ) + key <- class_test$steps[[1]]$mapping + td_obj <- tidy(class_test, number = 1) + + expect_equal("x3", names(key)) + + expect_equal( + length(unique(ex_dat$x3)) + 1, + nrow(key$x3) + ) + expect_true(sum(key$x3$..level == "..new") == 1) + + expect_true(is.numeric(tr_values)) + + expect_equal( + new_values$x3[1], + key$x3$..value[key$x3$..level == "..new"] + ) + expect_equal( + new_values$x3[2], + key$x3$..value[key$x3$..level == levels(ex_dat$x3)[1]] + ) + expect_equal( + new_values$x3[3], + key$x3$..value[key$x3$..level == "..new"] + ) + expect_equal( + new_values_ch$x3[1], + key$x3$..value[key$x3$..level == "..new"] + ) + expect_equal( + new_values_ch$x3[2], + key$x3$..value[key$x3$..level == levels(ex_dat$x3)[1]] + ) + expect_equal( + new_values_ch$x3[3], + key$x3$..value[key$x3$..level == "..new"] + ) + + expect_equal( + td_obj$level, + key$x3$..level + ) + expect_equal( + td_obj$value, + key$x3$..value + ) + + new_values +}) + +test_that("factor outcome - character predictor", { + class_test <- recipe(x2 ~ ., data = ex_dat_ch) |> + step_lencode(x3, outcome = vars(x2)) |> + prep(training = ex_dat_ch, retain = TRUE) + tr_values <- bake(class_test, new_data = NULL)$x3 + expect_snapshot( + new_values <- bake(class_test, new_data = new_dat_ch) + ) + new_values_fc <- bake(class_test, new_data = new_dat) + key <- class_test$steps[[1]]$mapping + td_obj <- tidy(class_test, number = 1) + + expect_equal("x3", names(key)) + + expect_equal( + length(unique(ex_dat$x3)) + 1, + nrow(key$x3) + ) + expect_true(sum(key$x3$..level == "..new") == 1) + + expect_true(is.numeric(tr_values)) + + expect_equal( + new_values$x3[1], + key$x3$..value[key$x3$..level == "..new"] + ) + expect_equal( + new_values$x3[2], + key$x3$..value[key$x3$..level == levels(ex_dat$x3)[1]] + ) + expect_equal( + new_values$x3[3], + key$x3$..value[key$x3$..level == "..new"] + ) + expect_equal( + new_values_fc$x3[1], + key$x3$..value[key$x3$..level == "..new"] + ) + expect_equal( + new_values_fc$x3[2], + key$x3$..value[key$x3$..level == levels(ex_dat$x3)[1]] + ) + expect_equal( + new_values_fc$x3[3], + key$x3$..value[key$x3$..level == "..new"] + ) + + expect_equal( + td_obj$level, + key$x3$..level + ) + expect_equal( + td_obj$value, + key$x3$..value + ) + + unseen_level <- data.frame( + x1 = 0, + x2 = factor("a", levels = c("a", "b")), + x3 = "unseen-level", + x4 = factor("A", levels = c("A", "B", "C", "D", "E")) + ) + + expect_equal( + bake(class_test, unseen_level)$x3, + 0 + ) +}) + +test_that("numeric outcome - factor predictor", { + reg_test <- recipe(x1 ~ ., data = ex_dat) |> + step_lencode(x3, outcome = vars(x1)) |> + prep(training = ex_dat, retain = TRUE) + tr_values <- bake(reg_test, new_data = NULL)$x3 + new_values <- bake(reg_test, new_data = new_dat) + expect_snapshot( + new_values_ch <- bake(reg_test, new_data = new_dat_ch) + ) + td_obj <- tidy(reg_test, number = 1) + + key <- reg_test$steps[[1]]$mapping + + expect_equal("x3", names(key)) + + expect_equal( + length(unique(ex_dat$x3)) + 1, + nrow(key$x3) + ) + expect_true(sum(key$x3$..level == "..new") == 1) + + expect_true(is.numeric(tr_values)) + + expect_equal( + new_values$x3[1], + key$x3$..value[key$x3$..level == "..new"] + ) + expect_equal( + new_values$x3[2], + key$x3$..value[key$x3$..level == levels(ex_dat$x3)[1]] + ) + expect_equal( + new_values$x3[3], + key$x3$..value[key$x3$..level == "..new"] + ) + expect_equal( + new_values_ch$x3[1], + key$x3$..value[key$x3$..level == "..new"] + ) + expect_equal( + new_values_ch$x3[2], + key$x3$..value[key$x3$..level == levels(ex_dat$x3)[1]] + ) + expect_equal( + new_values_ch$x3[3], + key$x3$..value[key$x3$..level == "..new"] + ) + + expect_equal( + td_obj$level, + key$x3$..level + ) + expect_equal( + td_obj$value, + key$x3$..value + ) +}) + +test_that("numeric outcome - character predictor", { + reg_test <- recipe(x1 ~ ., data = ex_dat_ch) |> + step_lencode(x3, outcome = vars(x1)) |> + prep(training = ex_dat_ch, retain = TRUE) + tr_values <- bake(reg_test, new_data = NULL)$x3 + new_values <- bake(reg_test, new_data = new_dat_ch) + new_values_fc <- bake(reg_test, new_data = new_dat) + key <- reg_test$steps[[1]]$mapping + td_obj <- tidy(reg_test, number = 1) + + expect_equal("x3", names(key)) + + expect_equal( + length(unique(ex_dat$x3)) + 1, + nrow(key$x3) + ) + expect_true(sum(key$x3$..level == "..new") == 1) + + expect_true(is.numeric(tr_values)) + + expect_equal( + new_values$x3[1], + key$x3$..value[key$x3$..level == "..new"] + ) + expect_equal( + new_values$x3[2], + key$x3$..value[key$x3$..level == levels(ex_dat$x3)[1]] + ) + expect_equal( + new_values$x3[3], + key$x3$..value[key$x3$..level == "..new"] + ) + expect_equal( + new_values_fc$x3[1], + key$x3$..value[key$x3$..level == "..new"] + ) + expect_equal( + new_values_fc$x3[2], + key$x3$..value[key$x3$..level == levels(ex_dat$x3)[1]] + ) + expect_equal( + new_values_fc$x3[3], + key$x3$..value[key$x3$..level == "..new"] + ) + + expect_equal( + td_obj$level, + key$x3$..level + ) + expect_equal( + td_obj$value, + key$x3$..value + ) + + unseen_level <- data.frame( + x1 = 0, + x2 = factor("a", levels = c("a", "b")), + x3 = "unseen-level", + x4 = factor("A", levels = c("A", "B", "C", "D", "E")) + ) + + expect_equal( + bake(reg_test, unseen_level)$x3, + mean(ex_dat_ch$x1) + ) +}) + +test_that("non occurring events doesn't result in infinities", { + data <- data.frame( + outcome = c("a", "a", "b", "b"), + predictor = c("a", "a", "a", "b") + ) + + res <- recipe(outcome ~ ., data = data) |> + step_lencode(predictor, outcome = vars(outcome)) |> + prep() |> + tidy(1) + + exp <- c( + log(2 / 3 / (1 - 2 / 3)), + log( + (2 * nrow(data) - 1) / + (2 * nrow(data)) / + (1 - (2 * nrow(data) - 1) / (2 * nrow(data))) + ), + log(0.5 / (1 - 0.5)) + ) + + expect_identical(res$value, exp) + expect_identical(res$level, c("a", "b", "..new")) +}) + +test_that("non occurring events doesn't result in infinities - case weights", { + data <- data.frame( + outcome = c("a", "a", "b", "b"), + predictor = c("a", "a", "a", "b"), + wts = importance_weights(rep(1, 4)) + ) + + res <- recipe(outcome ~ ., data = data) |> + step_lencode(predictor, outcome = vars(outcome)) |> + prep() |> + tidy(1) + + exp <- c( + log(2 / 3 / (1 - 2 / 3)), + log( + (2 * nrow(data) - 1) / + (2 * nrow(data)) / + (1 - (2 * nrow(data) - 1) / (2 * nrow(data))) + ), + log(0.5 / (1 - 0.5)) + ) + + expect_identical(res$value, exp) + expect_identical(res$level, c("a", "b", "..new")) +}) + + +test_that("bad args", { + three_class <- iris + three_class$fac <- rep(letters[1:3], 50) + three_class$logical <- rep(c(TRUE, FALSE), 75) + + expect_snapshot( + error = TRUE, + recipe(Species ~ ., data = three_class) |> + step_lencode(Sepal.Length, outcome = vars(Species)) |> + prep(training = three_class, retain = TRUE) + ) + + expect_snapshot( + error = TRUE, + recipe(Species ~ ., data = three_class) |> + step_lencode(Species, outcome = vars(logical)) |> + prep(training = three_class, retain = TRUE) + ) +}) + +test_that("case weights", { + wts_int <- rep(c(0.9, 1), times = c(100, 400)) + + ex_dat_cw <- ex_dat |> + mutate(wts = importance_weights(wts_int)) + + class_test <- recipe(x2 ~ ., data = ex_dat_cw) |> + step_lencode(x3, outcome = vars(x2), id = "id") |> + prep(training = ex_dat_cw, retain = TRUE) + + ref_mod <- glm( + x2 ~ 0 + x3, + data = ex_dat_cw, + family = binomial, + na.action = na.omit, + weights = ex_dat_cw$wts + ) + + inf_estimate_p <- (2 * nrow(ex_dat_cw) - 1) / (2 * nrow(ex_dat_cw)) + inf_estimate_log_odds <- log(inf_estimate_p / (1 - inf_estimate_p)) + + exp <- tibble( + ..level = names(coef(ref_mod)), + ..value = unname(coef(ref_mod)) + ) |> + mutate( + ..level = gsub("^x3", "", ..level), + ..value = -..value, + ..value = if_else(abs(..value) < 0.0001, 0, ..value), + ..value = if_else( + abs(round(..value, 0.4)) == max(abs(round(..value, 0.4))), + inf_estimate_log_odds, + ..value + ) + ) |> + arrange(..level) + + res <- slice_head(class_test$steps[[1]]$mapping$x3, n = -1) |> + arrange(..level) + + expect_equal( + res, + exp, + tolerance = 0.00001 + ) + + expect_snapshot(class_test) +}) + +# Infrastructure --------------------------------------------------------------- + +test_that("bake method errors when needed non-standard role columns are missing", { + rec <- recipe(x2 ~ ., data = ex_dat) |> + step_lencode(x3, outcome = vars(x2)) |> + update_role(x3, new_role = "potato") |> + update_role_requirements(role = "potato", bake = FALSE) + + rec_trained <- prep(rec, training = ex_dat, verbose = FALSE) + + expect_snapshot( + error = TRUE, + bake(rec_trained, new_data = ex_dat[, -3]) + ) +}) + +test_that("empty printing", { + rec <- recipe(mpg ~ ., mtcars) + rec <- step_lencode(rec, outcome = vars(mpg)) + + expect_snapshot(rec) + + rec <- prep(rec, mtcars) + + expect_snapshot(rec) +}) + +test_that("empty selection prep/bake is a no-op", { + rec1 <- recipe(mpg ~ ., mtcars) + rec2 <- step_lencode(rec1, outcome = vars(mpg)) + + rec1 <- prep(rec1, mtcars) + rec2 <- prep(rec2, mtcars) + + baked1 <- bake(rec1, mtcars) + baked2 <- bake(rec2, mtcars) + + expect_identical(baked1, baked2) +}) + +test_that("empty selection tidy method works", { + rec <- recipe(mpg ~ ., mtcars) + rec <- step_lencode(rec, outcome = vars(mpg)) + + expect <- tibble( + terms = character(), + level = character(), + value = double(), + id = character() + ) + + expect_identical(tidy(rec, number = 1), expect) + + rec <- prep(rec, mtcars) + + expect_identical(tidy(rec, number = 1), expect) +}) + +test_that("printing", { + rec <- recipe(x2 ~ ., data = ex_dat_ch) |> + step_lencode(x3, outcome = vars(x2)) + + expect_snapshot(print(rec)) + expect_snapshot(prep(rec)) +})