Skip to content

Conversation

EmilHvitfeldt
Copy link
Member

Will close #232

# Reprex: wall-clock time to prep() a recipe that encodes one factor predictor,
# once with step_lencode_glm() ("old") and once with step_lencode() ("new").
library(embed)

n_lvls <- 100
n_obs <- 100000

# Simulated input: a standard-normal outcome and a factor predictor whose
# values are drawn with replacement from 1..n_lvls.
sim <- tibble(
  outcome = rnorm(n = n_obs),
  x = factor(sample(seq_len(n_lvls), size = n_obs, replace = TRUE))
)

# Time step_lencode_glm().
tictoc::tic("old")
res <- recipe(outcome ~ x, data = sim) |>
  step_lencode_glm(x, outcome = vars(outcome)) |>
  prep()
tictoc::toc()
#> old: 1.157 sec elapsed

# Time step_lencode() on the same data and formula.
tictoc::tic("new")
res <- recipe(outcome ~ x, data = sim) |>
  step_lencode(x, outcome = vars(outcome)) |>
  prep()
tictoc::toc()
#> new: 0.013 sec elapsed

Regularization has not been added yet; I'll add it as an argument in a future PR to keep this one clean.

@EmilHvitfeldt
Copy link
Member Author

The big benefit of step_lencode() is that its performance is almost constant with respect to the number of levels in the predictor.

# Reprex: how prep() run time and memory allocation change with the number of
# factor levels, for step_lencode_glm() ("old") and step_lencode() ("new").
library(embed)
library(bench)
library(ggplot2)

n_obs <- 10000

# bench::press() evaluates the braced expression once per value of n_lvls.
res <- bench::press(
  n_lvls = c(5, 10, 20, 40, 80),
  {
    data <- tibble(
      outcome = rnorm(n_obs),
      x = factor(sample(seq_len(n_lvls), n_obs, replace = TRUE))
    )

    # The base recipe is built outside bench::mark(), so only adding the step
    # and prep() are timed.
    rec <- recipe(outcome ~ x, data = data)
    bench::mark(
      # The two expressions return different recipe objects, so the
      # result-equality check is turned off.
      check = FALSE,
      old = rec |> step_lencode_glm(x, outcome = vars(outcome)) |> prep(),
      new = rec |> step_lencode(x, outcome = vars(outcome)) |> prep()
    )
  }
)
#> Running with:
#>   n_lvls
#> 1      5
#> 2     10
#> 3     20
#> 4     40
#> 5     80

# Median run time against the number of levels (log-scaled x axis).
res |>
  ggplot(aes(n_lvls, median, color = expression)) +
  geom_point() +
  scale_color_bench_expr() +
  scale_x_log10()

# Memory allocated against the number of levels (log-scaled x axis).
res |>
  ggplot(aes(n_lvls, mem_alloc, color = expression)) +
  geom_point() +
  scale_color_bench_expr() +
  scale_x_log10()

Created on 2025-08-27 with reprex v2.1.1

@EmilHvitfeldt
Copy link
Member Author

It also performs better with respect to the number of rows.

# Reprex: how prep() run time and memory allocation change with the number of
# rows, for step_lencode_glm() ("old") and step_lencode() ("new").
library(embed)
library(bench)
library(ggplot2)

n_lvls <- 10

# bench::press() evaluates the braced expression once per value of n_obs.
res <- bench::press(
  n_obs = c(100, 1000, 10000, 100000),
  {
    data <- tibble(
      outcome = rnorm(n_obs),
      x = factor(sample(seq_len(n_lvls), n_obs, replace = TRUE))
    )

    # The base recipe is built outside bench::mark(), so only adding the step
    # and prep() are timed.
    rec <- recipe(outcome ~ x, data = data)
    bench::mark(
      # The two expressions return different recipe objects, so the
      # result-equality check is turned off.
      check = FALSE,
      old = rec |> step_lencode_glm(x, outcome = vars(outcome)) |> prep(),
      new = rec |> step_lencode(x, outcome = vars(outcome)) |> prep()
    )
  }
)
#> Running with:
#>    n_obs
#> 1    100
#> 2   1000
#> 3  10000
#> 4 100000
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.

# Median run time against the number of rows (log-scaled x axis).
res |>
  ggplot(aes(n_obs, median, color = expression)) +
  geom_point() +
  scale_color_bench_expr() +
  scale_x_log10()

# Memory allocated against the number of rows (log-scaled x axis).
res |>
  ggplot(aes(n_obs, mem_alloc, color = expression)) +
  geom_point() +
  scale_color_bench_expr() +
  scale_x_log10()

Created on 2025-08-27 with reprex v2.1.1

@EmilHvitfeldt EmilHvitfeldt merged commit 55ff0f0 into main Aug 28, 2025
13 of 14 checks passed
@EmilHvitfeldt EmilHvitfeldt deleted the step-lencode branch August 28, 2025 19:18
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

speed improvement for step_lencode_glm()

1 participant