Skip to content

Commit

Permalink
Add tidyr 1.2.0 updates to pivot_wider() (#774)
Browse files Browse the repository at this point in the history
* Use `check_pivot_spec()` from `tidyr`

* Support `names_expand` argument

* Support `names_vary` argument

* Use `build_wider_id_cols_expr()`

* Compute `id_cols` by excluding `names_from` and `values_from`

* Test `names_repair` of `pivot_wider()`

* Require at least 1 column selection for `names_from` and `values_from`

* Implement `unused_fn` for summarizing unused columns

* Minor refactoring

* Update snapshots

* document()

* Make test more robust...

* NEWS
  • Loading branch information
mgirlich committed Mar 2, 2022
1 parent 65d6b7e commit aa8a28a
Show file tree
Hide file tree
Showing 9 changed files with 452 additions and 94 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,7 @@ S3method(sql_query_join,SQLiteConnection)
S3method(sql_query_rows,DBIConnection)
S3method(sql_query_save,"Microsoft SQL Server")
S3method(sql_query_save,DBIConnection)
S3method(sql_query_save,Oracle)
S3method(sql_query_select,"Microsoft SQL Server")
S3method(sql_query_select,ACCESS)
S3method(sql_query_select,DBIConnection)
Expand Down
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# dbplyr (development version)

* `pivot_wider()` now supports the arguments `names_vary`, `names_expand`, and
`unused_fn` (@mgirlich, #774).

* The `*_join()` verbs now have arguments `x_as` and `y_as` that allow to
specify the table alias to use in the SQL query (@mgirlich, #637).

Expand Down
28 changes: 2 additions & 26 deletions R/verb-pivot-longer.R
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,6 @@ pivot_longer.tbl_lazy <- function(data,
rlang::check_dots_empty()

cols <- enquo(cols)

spec <- tidyr::build_longer_spec(simulate_vars(data), !!cols,
names_to = names_to,
values_to = values_to,
Expand All @@ -102,7 +101,7 @@ dbplyr_pivot_longer_spec <- function(data,
names_repair = "check_unique",
values_drop_na = FALSE,
values_transform = NULL) {
spec <- check_spec(spec)
spec <- tidyr::check_pivot_spec(spec)
# .seq col needed if different input columns are mapped to the same output
# column
spec <- deduplicate_spec(spec, data)
Expand Down Expand Up @@ -211,29 +210,8 @@ apply_name_repair_pivot_longer <- function(id_cols, spec, names_repair) {
}

# The following is copy-pasted from `tidyr`
# `check_spec()` can be removed once it is exported by `tidyr`
# see https://github.com/tidyverse/tidyr/issues/1087

# nocov start
check_spec <- function(spec) {
# COPIED FROM tidyr

# Eventually should just be vec_assert() on partial_frame()
# Waiting for https://github.com/r-lib/vctrs/issues/198

if (!is.data.frame(spec)) {
abort("`spec` must be a data frame")
}

if (!has_name(spec, ".name") || !has_name(spec, ".value")) {
abort("`spec` must have `.name` and `.value` columns")
}

# Ensure .name and .value come first
vars <- union(c(".name", ".value"), names(spec))
spec[vars]
}

# Ensure that there's a one-to-one match from spec to data by adding
# a special .seq variable which is automatically removed after pivotting.
deduplicate_spec <- function(spec, df) {
Expand Down Expand Up @@ -274,7 +252,7 @@ deduplicate_spec <- function(spec, df) {
}

check_list_of_functions <- function(x, names, arg) {
# COPIED FROM tidyr
# mostly COPIED FROM tidyr
if (is.null(x)) {
x <- set_names(list(), character())
}
Expand All @@ -291,8 +269,6 @@ check_list_of_functions <- function(x, names, arg) {
abort(glue("The names of `{arg}` must be unique."))
}

x <- purrr::map(x, as_function)

# Silently drop user supplied names not found in the data
x <- x[intersect(names(x), names)]

Expand Down
217 changes: 159 additions & 58 deletions R/verb-pivot-wider.R
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,42 @@
#' `.value`) to create custom column names.
#' @param names_sort Should the column names be sorted? If `FALSE`, the default,
#' column names are ordered by first appearance.
#' @param names_vary When `names_from` identifies a column (or columns) with
#' multiple unique values, and multiple `values_from` columns are provided,
#' in what order should the resulting column names be combined?
#'
#' - `"fastest"` varies `names_from` values fastest, resulting in a column
#' naming scheme of the form: `value1_name1, value1_name2, value2_name1,
#' value2_name2`. This is the default.
#'
#' - `"slowest"` varies `names_from` values slowest, resulting in a column
#' naming scheme of the form: `value1_name1, value2_name1, value1_name2,
#' value2_name2`.
#' @param names_repair What happens if the output has invalid column names?
#' @param names_expand Should the values in the `names_from` columns be expanded
#' by [expand()] before pivoting? This results in more columns, the output
#' will contain column names corresponding to a complete expansion of all
#' possible values in `names_from`. Additionally, the column names will be
#' sorted, identical to what `names_sort` would produce.
#' @param values_fill Optionally, a (scalar) value that specifies what each
#' `value` should be filled in with when missing.
#' @param values_fn A function, the default is `max()`, applied to the `value`
#' in each cell in the output. In contrast to local data frames it must not be
#' `NULL`.
#' @param unused_fn Optionally, a function applied to summarize the values from
#' the unused columns (i.e. columns not identified by `id_cols`,
#' `names_from`, or `values_from`).
#'
#' The default drops all unused columns from the result.
#'
#' This can be a named list if you want to apply different aggregations
#' to different unused columns.
#'
#' `id_cols` must be supplied for `unused_fn` to be useful, since otherwise
#' all unspecified columns will be considered `id_cols`.
#'
#' This is similar to grouping by the `id_cols` then summarizing the
#' unused columns using `unused_fn`.
#' @param ... Unused; included for compatibility with generic.
#'
#' @examplesIf rlang::is_installed("tidyr", version = "1.0.0")
Expand All @@ -71,10 +101,13 @@ pivot_wider.tbl_lazy <- function(data,
names_sep = "_",
names_glue = NULL,
names_sort = FALSE,
names_vary = "fastest",
names_expand = FALSE,
names_repair = "check_unique",
values_from = value,
values_fill = NULL,
values_fn = ~ max(.x, na.rm = TRUE),
unused_fn = NULL,
...
) {
rlang::check_dots_empty()
Expand All @@ -87,14 +120,26 @@ pivot_wider.tbl_lazy <- function(data,
names_prefix = names_prefix,
names_sep = names_sep,
names_glue = names_glue,
names_sort = names_sort
names_sort = names_sort,
names_vary = names_vary,
names_expand = names_expand
)

id_cols <- enquo(id_cols)
dbplyr_pivot_wider_spec(data, spec, !!id_cols,
id_cols <- build_wider_id_cols_expr(
data = data,
id_cols = {{id_cols}},
names_from = !!names_from,
values_from = !!values_from
)

dbplyr_pivot_wider_spec(
data = data,
spec = spec,
id_cols = !!id_cols,
names_repair = names_repair,
values_fill = values_fill,
values_fn = values_fn
values_fn = values_fn,
unused_fn = unused_fn
)
}

Expand All @@ -104,7 +149,9 @@ dbplyr_build_wider_spec <- function(data,
names_prefix = "",
names_sep = "_",
names_glue = NULL,
names_sort = FALSE) {
names_sort = FALSE,
names_vary = "fastest",
names_expand = FALSE) {
if (!inherits(data, "tbl_sql")) {
error_message <- c(
"`dbplyr_build_wider_spec()` doesn't work with local lazy tibbles.",
Expand All @@ -119,10 +166,16 @@ dbplyr_build_wider_spec <- function(data,
# row_ids <- vec_unique(data[names_from])
sim_data <- simulate_vars(data)
names_from <- tidyselect::eval_select(enquo(names_from), sim_data) %>% names()
if (is_empty(names_from)) {
abort("`names_from` must select at least one column.")
}
distinct_data <- collect(distinct(data, !!!syms(names_from)))

# 2. add `values_from` column
values_from <- tidyselect::eval_select(enquo(values_from), sim_data) %>% names()
if (is_empty(values_from)) {
abort("`values_from` must select at least one column.")
}
dummy_data <- vctrs::vec_cbind(
distinct_data,
!!!rlang::rep_named(values_from, list(TRUE)),
Expand All @@ -135,7 +188,9 @@ dbplyr_build_wider_spec <- function(data,
names_prefix = names_prefix,
names_sep = names_sep,
names_glue = names_glue,
names_sort = names_sort
names_sort = names_sort,
names_vary = names_vary,
names_expand = names_expand
)
}

Expand All @@ -144,69 +199,45 @@ dbplyr_pivot_wider_spec <- function(data,
names_repair = "check_unique",
id_cols = NULL,
values_fill = NULL,
values_fn = ~ max(.x, na.rm = TRUE)) {
spec <- check_spec(spec)
values_fn = ~ max(.x, na.rm = TRUE),
unused_fn = NULL) {
input <- data

if (is_scalar(values_fill)) {
values_fill <- rep_named(unique(spec$.value), list(values_fill))
}
if (!is.null(values_fill) && !is.list(values_fill)) {
abort("`values_fill` must be NULL, a scalar, or a named list")
}
spec <- tidyr::check_pivot_spec(spec)

values <- vctrs::vec_unique(spec$.value)
spec_cols <- c(names(spec)[-(1:2)], values)
names_from_cols <- names(spec)[-(1:2)]
values_from_cols <- vctrs::vec_unique(spec$.value)
non_id_cols <- c(names_from_cols, values_from_cols)

if (is.null(values_fn)) {
abort(c(
"`values_fn` must not be NULL",
i = "`values_fn` must be a function or a named list of functions"
))
}
id_cols <- select_wider_id_cols(
data = data,
id_cols = {{id_cols}},
non_id_cols = non_id_cols
)

if (!vctrs::vec_is_list(values_fn)) {
values_fn <- rep_named(values, list(values_fn))
if (is.null(values_fill)) {
values_fill <- list()
}
if (is_scalar(values_fill)) {
values_fill <- rep_named(values_from_cols, list(values_fill))
}
if (!vctrs::vec_is_list(values_fill)) {
abort("`values_fill` must be NULL, a scalar, or a named list")
}
values_fill <- values_fill[intersect(names(values_fill), values_from_cols)]

values_fn <- purrr::compact(values_fn)
missing_values <- setdiff(values, names(values_fn))
values_fn <- check_list_of_functions(values_fn, values_from_cols, "values_fn")
missing_values <- setdiff(values_from_cols, names(values_fn))
if (!is_empty(missing_values)) {
abort("`values_fn` must specify a function for each col in `values_from`")
}

id_cols <- enquo(id_cols)
if (!quo_is_null(id_cols)) {
cn <- set_names(colnames(data))
key_vars <- names(tidyselect::eval_select(enquo(id_cols), cn))
} else {
key_vars <- tbl_vars(data)
}
key_vars <- setdiff(key_vars, spec_cols)

key_cols <- syms(names(spec)[-(1:2)])
pivot_exprs <- purrr::map(
vctrs::vec_seq_along(spec),
function(row) {
values_col <- spec[[".value"]][row]
fill_value <- values_fill[[values_col]]

keys <- vctrs::vec_slice(spec[, -(1:2)], row)
keys_cond <- purrr::imap(
keys,
function(value, name) {
expr(!!sym(name) == !!value)
}
) %>%
purrr::reduce(~ expr(!!.x & !!.y))

case_expr <- expr(ifelse(!!keys_cond, !!sym(values_col), !!fill_value))

agg_fn <- values_fn[[values_col]]
resolve_fun(agg_fn, case_expr)
}
) %>%
set_names(spec$.name)
set_names(vctrs::vec_seq_along(spec), spec$.name),
~ build_pivot_wider_exprs(.x, spec, values_fill, values_fn)
)

key_vars <- setdiff(id_cols, non_id_cols)
data_grouped <- group_by(data, !!!syms(key_vars), .add = TRUE)

group_names <- group_vars(data_grouped)
Expand All @@ -218,13 +249,80 @@ dbplyr_pivot_wider_spec <- function(data,
}
pivot_exprs <- set_names(pivot_exprs, out_nms_repaired)

unused_cols <- setdiff(colnames(data), c(id_cols, non_id_cols))
unused_fn <- check_list_of_functions(unused_fn, unused_cols, "unused_fn")
unused_col_expr <- purrr::imap(unused_fn, ~ resolve_fun(.x, sym(.y)))

data_grouped %>%
summarise(!!!pivot_exprs, .groups = "drop") %>%
summarise(
!!!pivot_exprs,
!!!unused_col_expr,
.groups = "drop"
) %>%
group_by(!!!syms(group_vars(data)))
}

globalVariables(c("name", "value"))

build_wider_id_cols_expr <- function(data,
id_cols = NULL,
names_from = name,
values_from = value) {
# COPIED FROM tidyr
# TODO: Use `allow_rename = FALSE`.
# Requires https://github.com/r-lib/tidyselect/issues/225.
sim_data <- simulate_vars(data)
names_from <- names(tidyselect::eval_select(enquo(names_from), sim_data))
values_from <- names(tidyselect::eval_select(enquo(values_from), sim_data))
non_id_cols <- c(names_from, values_from)

out <- select_wider_id_cols(
data = data,
id_cols = {{id_cols}},
non_id_cols = non_id_cols
)

expr(c(!!!out))
}

build_pivot_wider_exprs <- function(row_id, spec, values_fill, values_fn) {
values_col <- spec[[".value"]][row_id]
fill_value <- values_fill[[values_col]]

keys <- vctrs::vec_slice(spec[, -(1:2)], row_id)
keys_cond <- purrr::imap(
keys,
function(value, name) {
expr(!!sym(name) == !!value)
}
) %>%
purrr::reduce(~ expr(!!.x & !!.y))

case_expr <- expr(ifelse(!!keys_cond, !!sym(values_col), !!fill_value))

agg_fn <- values_fn[[values_col]]
resolve_fun(agg_fn, case_expr)
}

select_wider_id_cols <- function(data,
id_cols = NULL,
non_id_cols = character()) {
# COPIED FROM tidyr
id_cols <- enquo(id_cols)
sim_data <- simulate_vars(data)

# Remove known non-id-cols so they are never selected
sim_data <- sim_data[setdiff(names(sim_data), non_id_cols)]

if (quo_is_null(id_cols)) {
names(sim_data)
} else {
# TODO: Use `allow_rename = FALSE`.
# Requires https://github.com/r-lib/tidyselect/issues/225.
names(tidyselect::eval_select(enquo(id_cols), sim_data))
}
}

is_scalar <- function(x) {
if (is.null(x)) {
return(FALSE)
Expand All @@ -244,6 +342,9 @@ resolve_fun <- function(x, var) {
exec(.fn_expr, var)
} else {
fn_name <- find_fun(x)
if (is_null(fn_name)) {
abort("Can't convert to a function.")
}
call2(fn_name, var)
}
}
Loading

0 comments on commit aa8a28a

Please sign in to comment.