Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add tidyr 1.2.0 updates to pivot_wider() #774

Merged
merged 17 commits into from
Mar 2, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,7 @@ S3method(sql_query_join,SQLiteConnection)
S3method(sql_query_rows,DBIConnection)
S3method(sql_query_save,"Microsoft SQL Server")
S3method(sql_query_save,DBIConnection)
S3method(sql_query_save,Oracle)
S3method(sql_query_select,"Microsoft SQL Server")
S3method(sql_query_select,ACCESS)
S3method(sql_query_select,DBIConnection)
Expand Down
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# dbplyr (development version)

* `pivot_wider()` now supports the arguments `names_vary`, `names_expand`, and
`unused_fn` (@mgirlich, #774).

* The `*_join()` verbs now have arguments `x_as` and `y_as` that allow to
specify the table alias to use in the SQL query (@mgirlich, #637).

Expand Down
28 changes: 2 additions & 26 deletions R/verb-pivot-longer.R
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,6 @@ pivot_longer.tbl_lazy <- function(data,
rlang::check_dots_empty()

cols <- enquo(cols)

spec <- tidyr::build_longer_spec(simulate_vars(data), !!cols,
names_to = names_to,
values_to = values_to,
Expand All @@ -102,7 +101,7 @@ dbplyr_pivot_longer_spec <- function(data,
names_repair = "check_unique",
values_drop_na = FALSE,
values_transform = NULL) {
spec <- check_spec(spec)
spec <- tidyr::check_pivot_spec(spec)
# .seq col needed if different input columns are mapped to the same output
# column
spec <- deduplicate_spec(spec, data)
Expand Down Expand Up @@ -211,29 +210,8 @@ apply_name_repair_pivot_longer <- function(id_cols, spec, names_repair) {
}

# The following is copy-pasted from `tidyr`
# `check_spec()` can be removed once it is exported by `tidyr`
# see https://github.com/tidyverse/tidyr/issues/1087

# nocov start
check_spec <- function(spec) {
# COPIED FROM tidyr

# Eventually should just be vec_assert() on partial_frame()
# Waiting for https://github.com/r-lib/vctrs/issues/198

if (!is.data.frame(spec)) {
abort("`spec` must be a data frame")
}

if (!has_name(spec, ".name") || !has_name(spec, ".value")) {
abort("`spec` must have `.name` and `.value` columns")
}

# Ensure .name and .value come first
vars <- union(c(".name", ".value"), names(spec))
spec[vars]
}

# Ensure that there's a one-to-one match from spec to data by adding
# a special .seq variable which is automatically removed after pivotting.
deduplicate_spec <- function(spec, df) {
Expand Down Expand Up @@ -274,7 +252,7 @@ deduplicate_spec <- function(spec, df) {
}

check_list_of_functions <- function(x, names, arg) {
# COPIED FROM tidyr
# mostly COPIED FROM tidyr
if (is.null(x)) {
x <- set_names(list(), character())
}
Expand All @@ -291,8 +269,6 @@ check_list_of_functions <- function(x, names, arg) {
abort(glue("The names of `{arg}` must be unique."))
}

x <- purrr::map(x, as_function)

# Silently drop user supplied names not found in the data
x <- x[intersect(names(x), names)]

Expand Down
217 changes: 159 additions & 58 deletions R/verb-pivot-wider.R
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,42 @@
#' `.value`) to create custom column names.
#' @param names_sort Should the column names be sorted? If `FALSE`, the default,
#' column names are ordered by first appearance.
#' @param names_vary When `names_from` identifies a column (or columns) with
#' multiple unique values, and multiple `values_from` columns are provided,
#' in what order should the resulting column names be combined?
#'
#' - `"fastest"` varies `names_from` values fastest, resulting in a column
#' naming scheme of the form: `value1_name1, value1_name2, value2_name1,
#' value2_name2`. This is the default.
#'
#' - `"slowest"` varies `names_from` values slowest, resulting in a column
#' naming scheme of the form: `value1_name1, value2_name1, value1_name2,
#' value2_name2`.
#' @param names_repair What happens if the output has invalid column names?
#' @param names_expand Should the values in the `names_from` columns be expanded
#' by [expand()] before pivoting? This results in more columns, the output
#' will contain column names corresponding to a complete expansion of all
#' possible values in `names_from`. Additionally, the column names will be
#' sorted, identical to what `names_sort` would produce.
#' @param values_fill Optionally, a (scalar) value that specifies what each
#' `value` should be filled in with when missing.
#' @param values_fn A function, the default is `max()`, applied to the `value`
#' in each cell in the output. In contrast to local data frames it must not be
#' `NULL`.
#' @param unused_fn Optionally, a function applied to summarize the values from
#' the unused columns (i.e. columns not identified by `id_cols`,
#' `names_from`, or `values_from`).
#'
#' The default drops all unused columns from the result.
#'
#' This can be a named list if you want to apply different aggregations
#' to different unused columns.
#'
#' `id_cols` must be supplied for `unused_fn` to be useful, since otherwise
#' all unspecified columns will be considered `id_cols`.
#'
#' This is similar to grouping by the `id_cols` then summarizing the
#' unused columns using `unused_fn`.
#' @param ... Unused; included for compatibility with generic.
#'
#' @examplesIf rlang::is_installed("tidyr", version = "1.0.0")
Expand All @@ -71,10 +101,13 @@ pivot_wider.tbl_lazy <- function(data,
names_sep = "_",
names_glue = NULL,
names_sort = FALSE,
names_vary = "fastest",
names_expand = FALSE,
names_repair = "check_unique",
values_from = value,
values_fill = NULL,
values_fn = ~ max(.x, na.rm = TRUE),
unused_fn = NULL,
...
) {
rlang::check_dots_empty()
Expand All @@ -87,14 +120,26 @@ pivot_wider.tbl_lazy <- function(data,
names_prefix = names_prefix,
names_sep = names_sep,
names_glue = names_glue,
names_sort = names_sort
names_sort = names_sort,
names_vary = names_vary,
names_expand = names_expand
)

id_cols <- enquo(id_cols)
dbplyr_pivot_wider_spec(data, spec, !!id_cols,
id_cols <- build_wider_id_cols_expr(
data = data,
id_cols = {{id_cols}},
names_from = !!names_from,
values_from = !!values_from
)

dbplyr_pivot_wider_spec(
data = data,
spec = spec,
id_cols = !!id_cols,
names_repair = names_repair,
values_fill = values_fill,
values_fn = values_fn
values_fn = values_fn,
unused_fn = unused_fn
)
}

Expand All @@ -104,7 +149,9 @@ dbplyr_build_wider_spec <- function(data,
names_prefix = "",
names_sep = "_",
names_glue = NULL,
names_sort = FALSE) {
names_sort = FALSE,
names_vary = "fastest",
names_expand = FALSE) {
if (!inherits(data, "tbl_sql")) {
error_message <- c(
"`dbplyr_build_wider_spec()` doesn't work with local lazy tibbles.",
Expand All @@ -119,10 +166,16 @@ dbplyr_build_wider_spec <- function(data,
# row_ids <- vec_unique(data[names_from])
sim_data <- simulate_vars(data)
names_from <- tidyselect::eval_select(enquo(names_from), sim_data) %>% names()
if (is_empty(names_from)) {
abort("`names_from` must select at least one column.")
}
distinct_data <- collect(distinct(data, !!!syms(names_from)))

# 2. add `values_from` column
values_from <- tidyselect::eval_select(enquo(values_from), sim_data) %>% names()
if (is_empty(values_from)) {
abort("`values_from` must select at least one column.")
}
dummy_data <- vctrs::vec_cbind(
distinct_data,
!!!rlang::rep_named(values_from, list(TRUE)),
Expand All @@ -135,7 +188,9 @@ dbplyr_build_wider_spec <- function(data,
names_prefix = names_prefix,
names_sep = names_sep,
names_glue = names_glue,
names_sort = names_sort
names_sort = names_sort,
names_vary = names_vary,
names_expand = names_expand
)
}

Expand All @@ -144,69 +199,45 @@ dbplyr_pivot_wider_spec <- function(data,
names_repair = "check_unique",
id_cols = NULL,
values_fill = NULL,
values_fn = ~ max(.x, na.rm = TRUE)) {
spec <- check_spec(spec)
values_fn = ~ max(.x, na.rm = TRUE),
unused_fn = NULL) {
input <- data

if (is_scalar(values_fill)) {
values_fill <- rep_named(unique(spec$.value), list(values_fill))
}
if (!is.null(values_fill) && !is.list(values_fill)) {
abort("`values_fill` must be NULL, a scalar, or a named list")
}
spec <- tidyr::check_pivot_spec(spec)

values <- vctrs::vec_unique(spec$.value)
spec_cols <- c(names(spec)[-(1:2)], values)
names_from_cols <- names(spec)[-(1:2)]
values_from_cols <- vctrs::vec_unique(spec$.value)
non_id_cols <- c(names_from_cols, values_from_cols)

if (is.null(values_fn)) {
abort(c(
"`values_fn` must not be NULL",
i = "`values_fn` must be a function or a named list of functions"
))
}
id_cols <- select_wider_id_cols(
data = data,
id_cols = {{id_cols}},
non_id_cols = non_id_cols
)

if (!vctrs::vec_is_list(values_fn)) {
values_fn <- rep_named(values, list(values_fn))
if (is.null(values_fill)) {
values_fill <- list()
}
if (is_scalar(values_fill)) {
values_fill <- rep_named(values_from_cols, list(values_fill))
}
if (!vctrs::vec_is_list(values_fill)) {
abort("`values_fill` must be NULL, a scalar, or a named list")
}
values_fill <- values_fill[intersect(names(values_fill), values_from_cols)]

values_fn <- purrr::compact(values_fn)
missing_values <- setdiff(values, names(values_fn))
values_fn <- check_list_of_functions(values_fn, values_from_cols, "values_fn")
missing_values <- setdiff(values_from_cols, names(values_fn))
if (!is_empty(missing_values)) {
abort("`values_fn` must specify a function for each col in `values_from`")
}

id_cols <- enquo(id_cols)
if (!quo_is_null(id_cols)) {
cn <- set_names(colnames(data))
key_vars <- names(tidyselect::eval_select(enquo(id_cols), cn))
} else {
key_vars <- tbl_vars(data)
}
key_vars <- setdiff(key_vars, spec_cols)

key_cols <- syms(names(spec)[-(1:2)])
pivot_exprs <- purrr::map(
vctrs::vec_seq_along(spec),
function(row) {
values_col <- spec[[".value"]][row]
fill_value <- values_fill[[values_col]]

keys <- vctrs::vec_slice(spec[, -(1:2)], row)
keys_cond <- purrr::imap(
keys,
function(value, name) {
expr(!!sym(name) == !!value)
}
) %>%
purrr::reduce(~ expr(!!.x & !!.y))

case_expr <- expr(ifelse(!!keys_cond, !!sym(values_col), !!fill_value))

agg_fn <- values_fn[[values_col]]
resolve_fun(agg_fn, case_expr)
}
) %>%
set_names(spec$.name)
set_names(vctrs::vec_seq_along(spec), spec$.name),
~ build_pivot_wider_exprs(.x, spec, values_fill, values_fn)
)

key_vars <- setdiff(id_cols, non_id_cols)
data_grouped <- group_by(data, !!!syms(key_vars), .add = TRUE)

group_names <- group_vars(data_grouped)
Expand All @@ -218,13 +249,80 @@ dbplyr_pivot_wider_spec <- function(data,
}
pivot_exprs <- set_names(pivot_exprs, out_nms_repaired)

unused_cols <- setdiff(colnames(data), c(id_cols, non_id_cols))
unused_fn <- check_list_of_functions(unused_fn, unused_cols, "unused_fn")
unused_col_expr <- purrr::imap(unused_fn, ~ resolve_fun(.x, sym(.y)))

data_grouped %>%
summarise(!!!pivot_exprs, .groups = "drop") %>%
summarise(
!!!pivot_exprs,
!!!unused_col_expr,
.groups = "drop"
) %>%
group_by(!!!syms(group_vars(data)))
}

globalVariables(c("name", "value"))

build_wider_id_cols_expr <- function(data,
id_cols = NULL,
names_from = name,
values_from = value) {
# COPIED FROM tidyr
# TODO: Use `allow_rename = FALSE`.
# Requires https://github.com/r-lib/tidyselect/issues/225.
sim_data <- simulate_vars(data)
names_from <- names(tidyselect::eval_select(enquo(names_from), sim_data))
values_from <- names(tidyselect::eval_select(enquo(values_from), sim_data))
non_id_cols <- c(names_from, values_from)

out <- select_wider_id_cols(
data = data,
id_cols = {{id_cols}},
non_id_cols = non_id_cols
)

expr(c(!!!out))
}

build_pivot_wider_exprs <- function(row_id, spec, values_fill, values_fn) {
values_col <- spec[[".value"]][row_id]
fill_value <- values_fill[[values_col]]

keys <- vctrs::vec_slice(spec[, -(1:2)], row_id)
keys_cond <- purrr::imap(
keys,
function(value, name) {
expr(!!sym(name) == !!value)
}
) %>%
purrr::reduce(~ expr(!!.x & !!.y))

case_expr <- expr(ifelse(!!keys_cond, !!sym(values_col), !!fill_value))

agg_fn <- values_fn[[values_col]]
resolve_fun(agg_fn, case_expr)
}

select_wider_id_cols <- function(data,
id_cols = NULL,
non_id_cols = character()) {
# COPIED FROM tidyr
id_cols <- enquo(id_cols)
sim_data <- simulate_vars(data)

# Remove known non-id-cols so they are never selected
sim_data <- sim_data[setdiff(names(sim_data), non_id_cols)]

if (quo_is_null(id_cols)) {
names(sim_data)
} else {
# TODO: Use `allow_rename = FALSE`.
# Requires https://github.com/r-lib/tidyselect/issues/225.
names(tidyselect::eval_select(enquo(id_cols), sim_data))
}
}

is_scalar <- function(x) {
if (is.null(x)) {
return(FALSE)
Expand All @@ -244,6 +342,9 @@ resolve_fun <- function(x, var) {
exec(.fn_expr, var)
} else {
fn_name <- find_fun(x)
if (is_null(fn_name)) {
abort("Can't convert to a function.")
}
call2(fn_name, var)
}
}
Loading