Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 0 additions & 5 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ S3method(bake,step_collapse_stringdist)
S3method(bake,step_discretize_cart)
S3method(bake,step_discretize_xgb)
S3method(bake,step_embed)
S3method(bake,step_feature_hash)
S3method(bake,step_lencode_bayes)
S3method(bake,step_lencode_glm)
S3method(bake,step_lencode_mixed)
Expand All @@ -19,7 +18,6 @@ S3method(prep,step_collapse_stringdist)
S3method(prep,step_discretize_cart)
S3method(prep,step_discretize_xgb)
S3method(prep,step_embed)
S3method(prep,step_feature_hash)
S3method(prep,step_lencode_bayes)
S3method(prep,step_lencode_glm)
S3method(prep,step_lencode_mixed)
Expand All @@ -33,7 +31,6 @@ S3method(print,step_collapse_stringdist)
S3method(print,step_discretize_cart)
S3method(print,step_discretize_xgb)
S3method(print,step_embed)
S3method(print,step_feature_hash)
S3method(print,step_lencode_bayes)
S3method(print,step_lencode_glm)
S3method(print,step_lencode_mixed)
Expand All @@ -47,7 +44,6 @@ S3method(required_pkgs,step_collapse_stringdist)
S3method(required_pkgs,step_discretize_cart)
S3method(required_pkgs,step_discretize_xgb)
S3method(required_pkgs,step_embed)
S3method(required_pkgs,step_feature_hash)
S3method(required_pkgs,step_lencode_bayes)
S3method(required_pkgs,step_lencode_glm)
S3method(required_pkgs,step_lencode_mixed)
Expand All @@ -61,7 +57,6 @@ S3method(tidy,step_collapse_stringdist)
S3method(tidy,step_discretize_cart)
S3method(tidy,step_discretize_xgb)
S3method(tidy,step_embed)
S3method(tidy,step_feature_hash)
S3method(tidy,step_lencode_bayes)
S3method(tidy,step_lencode_glm)
S3method(tidy,step_lencode_mixed)
Expand Down
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# embed (development version)

* `step_feature_hash()` has been fully deprecated in favor of `textrecipes::step_dummy_hash()`. (#253)

# embed 1.1.5

## Improvements
Expand Down
275 changes: 20 additions & 255 deletions R/feature_hash.R
Original file line number Diff line number Diff line change
@@ -1,262 +1,27 @@
#' Dummy Variables Creation via Feature Hashing
#'
#' @description `r lifecycle::badge("soft-deprecated")`
#' @description
#' `r lifecycle::badge("deprecated")`
#'
#' `step_feature_hash()` is being deprecated in favor of
#' [textrecipes::step_dummy_hash()]. This function creates a *specification*
#' of a recipe step that will convert nominal data (e.g. character or factors)
#' into one or more numeric binary columns using the levels of the original
#' data.
#'
#' @inheritParams recipes::step_pca
#' @param num_hash The number of resulting dummy variable columns.
#' @param preserve Use `keep_original_cols` instead to specify whether the
#' selected column(s) should be retained in addition to the new dummy
#' variables.
#' @param columns A character vector for the selected columns. This is `NULL`
#' until the step is trained by [recipes::prep()].
#' @template step-return
#' @details
#'
#' `step_feature_hash()` will create a set of binary dummy variables from a
#' factor or character variable. The values themselves are hashed to determine
#' which column the indicator is assigned to (as opposed to having a
#' specific, pre-determined column that each value maps to).
#'
#' Since this method does not rely on a pre-determined assignment of levels to
#' columns, new factor levels can be added to the selected columns without
#' issue. Missing values result in missing values for all of the hashed columns.
#'
#' Note that the assignment of the levels to the hashing columns does not try to
#' maximize the allocation. It is likely that multiple levels of the column will
#' map to the same hashed columns (even with small data sets). Similarly, it is
#' likely that some columns will have all zeros. A zero-variance filter (via
#' [recipes::step_zv()]) is recommended for any recipe that uses hashed columns.
#'
#' # Tidying
#'
#' When you [`tidy()`][recipes::tidy.recipe] this step, a tibble is returned with
#' columns `terms` and `id`:
#' Please use [textrecipes::step_dummy_hash()] instead.
#'
#' \describe{
#' \item{terms}{character, the selectors or variables selected}
#' \item{id}{character, id of this step}
#' }
#'
#' @template case-weights-not-supported
#'
#' @references
#'
#' Weinberger, K, A Dasgupta, J Langford, A Smola, and J Attenberg. 2009.
#' "Feature Hashing for Large Scale Multitask Learning." In Proceedings of the
#' 26th Annual International Conference on Machine Learning, 1113–20. ACM.
#'
#' Kuhn and Johnson (2020) _Feature Engineering and Selection: A Practical
#' Approach for Predictive Models_. CRC/Chapman Hall
#' \url{https://bookdown.org/max/FES/encoding-predictors-with-many-categories.html}
#' @seealso [recipes::step_dummy()], [recipes::step_zv()]
#' @examplesIf !embed:::is_cran_check() && rlang::is_installed(c("modeldata", "keras"))
#' data(grants, package = "modeldata")
#' rec <-
#' recipe(class ~ sponsor_code, data = grants_other) %>%
#' step_feature_hash(
#' sponsor_code,
#' num_hash = 2^6, keep_original_cols = TRUE
#' ) %>%
#' prep()
#'
#' # How many of the 298 locations ended up in each hash column?
#' results <-
#' bake(rec, new_data = NULL, starts_with("sponsor_code")) %>%
#' distinct()
#'
#' apply(results %>% select(-sponsor_code), 2, sum) %>% table()
#' @keywords internal
#' @export
step_feature_hash <- function(recipe,
                              ...,
                              role = "predictor",
                              trained = FALSE,
                              num_hash = 2^6,
                              preserve = deprecated(),
                              columns = NULL,
                              keep_original_cols = FALSE,
                              skip = FALSE,
                              id = rand_id("feature_hash")) {
  # Every call signals a soft deprecation that points users at the
  # textrecipes replacement.
  lifecycle::deprecate_soft(
    "0.2.0",
    "embed::step_feature_hash()",
    "textrecipes::step_dummy_hash()"
  )

  # `preserve` is the older spelling of `keep_original_cols`; when it is
  # supplied, signal its own deprecation and let it override the new argument.
  if (lifecycle::is_present(preserve)) {
    lifecycle::deprecate_soft(
      "0.1.5",
      "step_feature_hash(preserve = )",
      "step_feature_hash(keep_original_cols = )"
    )
    keep_original_cols <- preserve
  }

  # Warm start for tf to avoid a bug in tensorflow; the result is not used.
  is_tf_available()

  # Build the step specification first, then append it to the recipe.
  hash_step <- step_feature_hash_new(
    terms = enquos(...),
    role = role,
    trained = trained,
    num_hash = num_hash,
    preserve = keep_original_cols,
    columns = columns,
    keep_original_cols = keep_original_cols,
    skip = skip,
    id = id
  )

  add_step(recipe, hash_step)
}

# Low-level constructor for the feature-hash step: forwards every field,
# unchanged, to `step()` under the subclass "feature_hash".
step_feature_hash_new <- function(terms,
                                  role,
                                  trained,
                                  num_hash,
                                  preserve,
                                  columns,
                                  keep_original_cols,
                                  skip,
                                  id) {
  step(
    subclass = "feature_hash",
    terms = terms,
    role = role,
    trained = trained,
    num_hash = num_hash,
    preserve = preserve,
    columns = columns,
    keep_original_cols = keep_original_cols,
    skip = skip,
    id = id
  )
}

#' @export
prep.step_feature_hash <- function(x, training, info = NULL, ...) {
  # Resolve the step's selectors against the training data.
  selected <- recipes_eval_select(x$terms, training, info)

  check_number_whole(x$num_hash, min = 0, arg = "num_hash")

  # The column-type check is only run when something was actually selected.
  has_selection <- length(selected) > 0
  if (has_selection) {
    check_type(training[, selected], types = c("string", "factor", "ordered"))
  }

  # Return a trained copy of the step that records the resolved columns.
  step_feature_hash_new(
    terms = x$terms,
    role = x$role,
    trained = TRUE,
    num_hash = x$num_hash,
    preserve = x$preserve,
    columns = selected,
    keep_original_cols = get_keep_original_cols(x),
    skip = x$skip,
    id = x$id
  )
}

# Turn one vector of values into a tibble of `num_hash` indicator columns
# (named from `prefix`), one row per element of `x`, in the original order.
make_hash_vars <- function(x, prefix, num_hash = 2^8) {
  if (!is.character(x)) {
    x <- as.character(x)
  }

  # Keep each element's original position so the rows can be re-ordered
  # after joining the per-value results back on.
  positions <- tibble(data = x, ..order = seq_along(x))

  # Each distinct value is hashed only once.
  distinct_vals <- unique(x)

  rlang::check_installed("keras")

  hash_index <- purrr::map_int(
    distinct_vals,
    keras::text_hashing_trick,
    n = num_hash,
    filters = "",
    split = "dont split characters",
    lower = FALSE
  )
  # A missing input value gets a missing index.
  hash_index[is.na(distinct_vals)] <- NA

  hashed <- make_hash_tbl(hash_index, names0(num_hash, prefix))
  hashed <- dplyr::mutate(hashed, data = distinct_vals)
  hashed <- dplyr::left_join(hashed, positions, by = "data", multiple = "all")
  hashed <- dplyr::arrange(hashed, ..order)

  # Drop the two helper columns; only the indicator columns are returned.
  dplyr::select(hashed, -data, -..order)
}

# Build one length-`p` numeric row: all zeros with a 1 at position `ind`,
# or all `NA` when `ind` itself is missing.
make_row <- function(ind, p) {
  if (is.na(ind)) {
    return(rep(NA_real_, p))
  }

  indicator <- rep(0, p)
  indicator[ind] <- 1
  indicator
}

# Stack one indicator row per element of `ind` into a tibble whose columns
# are named `nms`.
#
# `ind` is a vector of column positions (possibly `NA`); `nms` is the
# character vector of output column names, and its length sets the row width.
make_hash_tbl <- function(ind, nms) {
  p <- length(nms)

  if (length(ind) == 0) {
    # With nothing to bind, `do.call("rbind", list())` yields NULL and
    # assigning column names to it fails. Build the 0-row matrix directly so
    # zero-length input still returns all `p` columns.
    x <- matrix(numeric(0), nrow = 0, ncol = p)
  } else {
    x <- purrr::map(ind, make_row, p = p)
    x <- do.call("rbind", x)
  }

  colnames(x) <- nms
  tibble::as_tibble(x)
}

#' @export
bake.step_feature_hash <- function(object, new_data, ...) {
col_names <- names(object$columns)
check_new_data(col_names, object, new_data)

# If no terms were selected
if (length(col_names) == 0) {
return(new_data)
}

new_names <- paste0(col_names, "_hash_")

new_cols <- purrr::map2_dfc(
new_data[, col_names],
new_names, make_hash_vars,
num_hash =
object$num_hash
step_feature_hash <- function(
recipe,
...,
role = "predictor",
trained = FALSE,
num_hash = 2^6,
preserve = deprecated(),
columns = NULL,
keep_original_cols = FALSE,
skip = FALSE,
id = rand_id("feature_hash")
) {
lifecycle::deprecate_stop(
"0.2.0",
"embed::step_feature_hash()",
"textrecipes::step_dummy_hash()"
)

new_cols <- recipes::check_name(new_cols, new_data, object, names(new_cols))

new_data <- vec_cbind(new_data, new_cols)

new_data <- remove_original_cols(new_data, object, col_names)

new_data
}

#' @export
print.step_feature_hash <-
  function(x, width = max(20, options()$width - 31), ...) {
    # Print a one-line summary of the step and return `x` invisibly.
    title <- "Feature hashed dummy variables for "
    # The step stores its resolved columns in `columns`; it has no `mapping`
    # field, so reading `x$mapping` always gave NULL and a trained step
    # printed no column names.
    print_step(names(x$columns), x$terms, x$trained, title, width)
    invisible(x)
  }

#' @rdname step_feature_hash
#' @usage NULL
#' @export
tidy.step_feature_hash <- function(x, ...) {
  # A trained step reports its resolved columns; an untrained step reports
  # its selectors rendered as character.
  term_values <- if (is_trained(x)) {
    unname(x$columns)
  } else {
    sel2char(x$terms)
  }

  out <- tibble(terms = term_values)
  out$id <- x$id
  out
}

#' @rdname required_pkgs.embed
#' @export
required_pkgs.step_feature_hash <- function(x, ...) {
  # Packages reported for this step; `x` is not inspected.
  pkgs <- c("keras", "embed")
  pkgs
}
9 changes: 3 additions & 6 deletions man/required_pkgs.embed.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading