faster internal version of update_tibble_attrs()#717
Conversation
| } | ||
|
|
||
| x | ||
| .Call(`tibble_update_attrs`, x, pairlist2(...)) |
There was a problem hiding this comment.
Probably safer to check if any of the special attributes is supplied, and fall back to the R code in that case?
There was a problem hiding this comment.
I'm not sure how safer that would be, setAttrib() does some internal dispatch to deal with special attributes, i.e. names, row.names, class ...
There was a problem hiding this comment.
oh yes, I was confused!
There's still some special handling at R level in structure() though, but that doesn't seem important:
specials <- c(".Dim", ".Dimnames", ".Names", ".Tsp", ".Label")|
Thanks! I have removed compat code that is no longer necessary with the sf update. It's still very slow, though. I have also added a test regarding overwriting of attributes. I think our implementation will need to account for this. Do we need to adapt? |
|
library(tibble)
library(rlang)
update_tibble_attrs_old <- function(x, ...) {
# Compat for https://github.com/r-spatial/sf/pull/1232
attribs <- list(...)
if (has_length(attribs) && is_named(attribs)) {
attributes(x)[names(attribs)] <- attribs
}
x
}
update_tibble_attrs <- tibble:::update_tibble_attrs
df <- tibble(x = rnorm(1e6), g = sample(rep(1:1e4, 100)))
groups <- tibble(g = unique(df$g))
bench::mark(
new = update_tibble_attrs(df, groups = groups),
old = update_tibble_attrs_old(df, groups = groups),
str = structure(df, groups = groups)
)
#> # A tibble: 3 x 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 new 2.02µs 2.71µs 335090. 0B 0
#> 2 old 3.97ms 4.77ms 211. 3.82MB 22.4
#> 3 str 3.96ms 4.3ms 226. 3.81MB 15.8Created on 2020-02-14 by the reprex package (v0.3.0.9000) |
|
which test @krlmlr ? |
|
I believe vectbl_restore <- function(xo, x) {
n <- fast_nrow(xo)
# FIXME: Use new_tibble()?
forbidden <- attrs_names_only
attrs <- attributes(x)
attrs <- attrs[!(names(attrs) %in% forbidden)]
attributes(xo)[names(attrs)] <- attrs
# Need to patch manually
attr(xo, "row.names") <- .set_row_names(n)
xo
} |
|
Good catch, pushed the tests. I still wonder why library(tibble)
x <- tibble(a = 1:3)
y <- x
vectbl_restore <- tibble:::vectbl_restore
bench::mark(vectbl_restore(x, y), y)
#> # A tibble: 2 x 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 vectbl_restore(x, y) 5.8µs 6.93µs 123302. 3.97KB 24.7
#> 2 y 23.1ns 28.17ns 23492926. 0B 0Created on 2020-02-14 by the reprex package (v0.3.0) |
|
For similar reasons, I've updated library(tibble)
library(rlang)
df1 <- tibble(x = rnorm(1e6), g = sample(rep(1:1e4, 100)))
df2 <- tibble(x = rnorm(1e6))
vectbl_restore <- tibble:::vectbl_restore
fast_nrow <- function(x) {
.row_names_info(x, 2L)
}
attrs_names_only <- c("names", "row.names")
vectbl_restore_old <- function(xo, x) {
n <- fast_nrow(xo)
# FIXME: Use new_tibble()?
forbidden <- attrs_names_only
attrs <- attributes(x)
attrs <- attrs[!(names(attrs) %in% forbidden)]
attributes(xo)[names(attrs)] <- attrs
# Need to patch manually
attr(xo, "row.names") <- .set_row_names(n)
xo
}
bench::mark(
vectbl_restore(df1, df2),
vectbl_restore_old(df1, df2)
)
#> # A tibble: 2 x 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:> <dbl> <bch:byt> <dbl>
#> 1 vectbl_restore(df1, df2) 929ns 1.27µs 753454. 0B 0
#> 2 vectbl_restore_old(df1, df2) 3.97ms 4.81ms 209. 3.81MB 22.7Created on 2020-02-14 by the reprex package (v0.3.0.9000) |
|
closes tidyverse/dplyr#4855 |
5aec45a to
8ca9312
Compare
|
Oh, it's only a problem with long data frames: library(tibble)
library(rlang)
df1 <- tibble(x = rnorm(1e6), g = sample(rep(1:1e4, 100)))
df1a <- df1[1:10000, ]
df1b <- df1[1:1000, ]
df1c <- df1[1:100, ]
vectbl_restore <- tibble:::vectbl_restore
bench::mark(
vectbl_restore(df1, df1),
vectbl_restore(df1a, df1a),
vectbl_restore(df1b, df1b),
vectbl_restore(df1c, df1c),
check = FALSE
)
#> # A tibble: 4 x 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 vectbl_restore(df1, df1) 3.57ms 3.97ms 241. 3.81MB 23.4
#> 2 vectbl_restore(df1a, df1a) 40.44µs 43.49µs 20181. 39.11KB 16.4
#> 3 vectbl_restore(df1b, df1b) 9.54µs 11.04µs 75970. 3.95KB 15.2
#> 4 vectbl_restore(df1c, df1c) 6.44µs 7.39µs 118001. 448B 23.6Created on 2020-02-14 by the reprex package (v0.3.0) Interesting threshold: going from 10k to 100k slows down by a factor of 100. |
|
yes @krlmlr I suppose this is |
This comment has been minimized.
This comment has been minimized.
|
If you revert 752bfa2 the CI errors should go away. Applying library(tibble)
library(rlang)
df1 <- tibble(x = rnorm(1e6), g = sample(rep(1:1e4, 100)))
df1a <- df1[1:100000, ]
df1b <- df1[1:10000, ]
df1c <- df1[1:1000, ]
vectbl_restore <- tibble:::vectbl_restore
bench::mark(
vectbl_restore(df1, df1),
vectbl_restore(df1a, df1a),
vectbl_restore(df1b, df1b),
vectbl_restore(df1c, df1c),
check = FALSE
)
#> # A tibble: 4 x 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 vectbl_restore(df1, df1) 3.54ms 4.05ms 232. 3.81MB 23.4
#> 2 vectbl_restore(df1a, df1a) 345.57µs 401.14µs 2235. 390.67KB 18.3
#> 3 vectbl_restore(df1b, df1b) 40.69µs 46.54µs 19021. 39.11KB 13.2
#> 4 vectbl_restore(df1c, df1c) 9.79µs 11.77µs 71793. 3.95KB 14.4Created on 2020-02-14 by the reprex package (v0.3.0) I guess this no longer holds with the C implementation? |
library(tibble)
library(rlang)
df1 <- tibble(x = rnorm(1e6), g = sample(rep(1:1e4, 100)))
df1a <- df1[1:100000, ]
df1b <- df1[1:10000, ]
df1c <- df1[1:1000, ]
vectbl_restore <- tibble:::vectbl_restore
bench::mark(
vectbl_restore(df1, df1),
vectbl_restore(df1a, df1a),
vectbl_restore(df1b, df1b),
vectbl_restore(df1c, df1c),
check = FALSE
)
#> # A tibble: 4 x 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 vectbl_restore(df1, df1) 947ns 1.09µs 629424. 0B 0
#> 2 vectbl_restore(df1a, df1a) 932ns 1.06µs 635387. 0B 0
#> 3 vectbl_restore(df1b, df1b) 1µs 1.23µs 516068. 0B 0
#> 4 vectbl_restore(df1c, df1c) 949ns 1.1µs 782567. 0B 78.3Created on 2020-02-14 by the reprex package (v0.3.0.9000) |
Co-Authored-By: Kirill Müller <krlmlr@users.noreply.github.com>
|
Thanks! |
* 🧞tidyverse/tibble#717 * prototype vec_sprinkle() * require fallback for more types * handle names when sprinkling * need to vec_cast() all the chunks to common type * make all sprinkle steps internal with short_vec_init() and vec_cast() callables * internal recycling by repeatedly calling vec_assign_impl() with the sprinkling * rows_i_j needs PROTECT() * swing back to master tibble * globalVariables(o_rows) because it is defined lazily * rename to vec_unchop() and move arguments around * Apply suggestions from code review Co-Authored-By: Lionel Henry <lionel.hry@gmail.com> * Apply suggestions from code review Co-Authored-By: Lionel Henry <lionel.hry@gmail.com> * PROTECT(Rf_getAttrib()) * skip again for a different reason * intercept mutate() recycling problems earlier * using vctrs::vec_unchop() * using the callable vctrs::short_vec_recycle() so that chunks are recycled to fit * No need to hang on to rows_lengths * - simplify DataMask constructor - provide DataMask$get_rows() * Move size handling until after all the expressions. use vec_unchop() then * recycling and then call vec_c() instead of using vec_unchop() * use master vctrs * vec_assign_impl() no longer used * shirt_vec_init() no longer used * vec_cast() not used intrenally Co-authored-by: Lionel Henry <lionel.hry@gmail.com>
This has come up as part of performance benchmarks for dplyr 1.0.0:
Created on 2020-02-13 by the reprex package (v0.3.0.9000)
I believe this is because the form
attributes(x)[names(attribs)] <-pays for setting all the attributes, includingrow.nameswhich is expensive to get/set.closes tidyverse/dplyr#4855