Add tidyr 1.2.0 updates to pivot_wider() (#774)

* Use `check_pivot_spec()` from `tidyr` * Support `names_expand` argument * Support `names_vary` argument * Use `build_wider_id_cols_expr()` * Compute `id_cols` by excluding `names_from` and `values_from` * Test `names_repair` of `pivot_wider()` * Require at least 1 column selection for `names_from` and `values_from` * Implement `unused_fn` for summarizing unused columns * Minor refactoring * Update snapshots * document() * Make test more robust... * NEWS
tidyverse · Mar 2, 2022 · aa8a28a · aa8a28a
1 parent 65d6b7e
commit aa8a28a
Show file tree

Hide file tree

Showing 9 changed files with 452 additions and 94 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -236,6 +236,7 @@ S3method(sql_query_join,SQLiteConnection)
 S3method(sql_query_rows,DBIConnection)
 S3method(sql_query_save,"Microsoft SQL Server")
 S3method(sql_query_save,DBIConnection)
+S3method(sql_query_save,Oracle)
 S3method(sql_query_select,"Microsoft SQL Server")
 S3method(sql_query_select,ACCESS)
 S3method(sql_query_select,DBIConnection)

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,8 @@
 # dbplyr (development version)
 
+* `pivot_wider()` now supports the arguments `names_vary`, `names_expand`, and
+  `unused_fn` (@mgirlich, #774).
+
 * The `*_join()` verbs now have arguments `x_as` and `y_as` that allow to
   specify the table alias to use in the SQL query (@mgirlich, #637).
 

diff --git a/R/verb-pivot-longer.R b/R/verb-pivot-longer.R
@@ -79,7 +79,6 @@ pivot_longer.tbl_lazy <- function(data,
   rlang::check_dots_empty()
 
   cols <- enquo(cols)
-
   spec <- tidyr::build_longer_spec(simulate_vars(data), !!cols,
     names_to = names_to,
     values_to = values_to,
@@ -102,7 +101,7 @@ dbplyr_pivot_longer_spec <- function(data,
                                      names_repair = "check_unique",
                                      values_drop_na = FALSE,
                                      values_transform = NULL) {
-  spec <- check_spec(spec)
+  spec <- tidyr::check_pivot_spec(spec)
   # .seq col needed if different input columns are mapped to the same output
   # column
   spec <- deduplicate_spec(spec, data)
@@ -211,29 +210,8 @@ apply_name_repair_pivot_longer <- function(id_cols, spec, names_repair) {
 }
 
 # The following is copy-pasted from `tidyr`
-# `check_spec()` can be removed once it is exported by `tidyr`
-# see https://github.com/tidyverse/tidyr/issues/1087
 
 # nocov start
-check_spec <- function(spec) {
-  # COPIED FROM tidyr
-
-  # Eventually should just be vec_assert() on partial_frame()
-  # Waiting for https://github.com/r-lib/vctrs/issues/198
-
-  if (!is.data.frame(spec)) {
-    abort("`spec` must be a data frame")
-  }
-
-  if (!has_name(spec, ".name") || !has_name(spec, ".value")) {
-    abort("`spec` must have `.name` and `.value` columns")
-  }
-
-  # Ensure .name and .value come first
-  vars <- union(c(".name", ".value"), names(spec))
-  spec[vars]
-}
-
 # Ensure that there's a one-to-one match from spec to data by adding
 # a special .seq variable which is automatically removed after pivotting.
 deduplicate_spec <- function(spec, df) {
@@ -274,7 +252,7 @@ deduplicate_spec <- function(spec, df) {
 }
 
 check_list_of_functions <- function(x, names, arg) {
-  # COPIED FROM tidyr
+  # mostly COPIED FROM tidyr
   if (is.null(x)) {
     x <- set_names(list(), character())
   }
@@ -291,8 +269,6 @@ check_list_of_functions <- function(x, names, arg) {
     abort(glue("The names of `{arg}` must be unique."))
   }
 
-  x <- purrr::map(x, as_function)
-
   # Silently drop user supplied names not found in the data
   x <- x[intersect(names(x), names)]
 

diff --git a/R/verb-pivot-wider.R b/R/verb-pivot-wider.R
@@ -45,12 +45,42 @@
 #'   `.value`) to create custom column names.
 #' @param names_sort Should the column names be sorted? If `FALSE`, the default,
 #'   column names are ordered by first appearance.
+#' @param names_vary When `names_from` identifies a column (or columns) with
+#'   multiple unique values, and multiple `values_from` columns are provided,
+#'   in what order should the resulting column names be combined?
+#'
+#'   - `"fastest"` varies `names_from` values fastest, resulting in a column
+#'     naming scheme of the form: `value1_name1, value1_name2, value2_name1,
+#'     value2_name2`. This is the default.
+#'
+#'   - `"slowest"` varies `names_from` values slowest, resulting in a column
+#'     naming scheme of the form: `value1_name1, value2_name1, value1_name2,
+#'     value2_name2`.
 #' @param names_repair What happens if the output has invalid column names?
+#' @param names_expand Should the values in the `names_from` columns be expanded
+#'   by [expand()] before pivoting? This results in more columns, the output
+#'   will contain column names corresponding to a complete expansion of all
+#'   possible values in `names_from`. Additionally, the column names will be
+#'   sorted, identical to what `names_sort` would produce.
 #' @param values_fill Optionally, a (scalar) value that specifies what each
 #'   `value` should be filled in with when missing.
 #' @param values_fn A function, the default is `max()`, applied to the `value`
 #' in each cell in the output. In contrast to local data frames it must not be
 #' `NULL`.
+#' @param unused_fn Optionally, a function applied to summarize the values from
+#'   the unused columns (i.e. columns not identified by `id_cols`,
+#'   `names_from`, or `values_from`).
+#'
+#'   The default drops all unused columns from the result.
+#'
+#'   This can be a named list if you want to apply different aggregations
+#'   to different unused columns.
+#'
+#'   `id_cols` must be supplied for `unused_fn` to be useful, since otherwise
+#'   all unspecified columns will be considered `id_cols`.
+#'
+#'   This is similar to grouping by the `id_cols` then summarizing the
+#'   unused columns using `unused_fn`.
 #' @param ... Unused; included for compatibility with generic.
 #'
 #' @examplesIf rlang::is_installed("tidyr", version = "1.0.0")
@@ -71,10 +101,13 @@ pivot_wider.tbl_lazy <- function(data,
                                  names_sep = "_",
                                  names_glue = NULL,
                                  names_sort = FALSE,
+                                 names_vary = "fastest",
+                                 names_expand = FALSE,
                                  names_repair = "check_unique",
                                  values_from = value,
                                  values_fill = NULL,
                                  values_fn = ~ max(.x, na.rm = TRUE),
+                                 unused_fn = NULL,
                                  ...
                                  ) {
   rlang::check_dots_empty()
@@ -87,14 +120,26 @@ pivot_wider.tbl_lazy <- function(data,
     names_prefix = names_prefix,
     names_sep = names_sep,
     names_glue = names_glue,
-    names_sort = names_sort
+    names_sort = names_sort,
+    names_vary = names_vary,
+    names_expand = names_expand
   )
 
-  id_cols <- enquo(id_cols)
-  dbplyr_pivot_wider_spec(data, spec, !!id_cols,
+  id_cols <- build_wider_id_cols_expr(
+    data = data,
+    id_cols = {{id_cols}},
+    names_from = !!names_from,
+    values_from = !!values_from
+  )
+
+  dbplyr_pivot_wider_spec(
+    data = data,
+    spec = spec,
+    id_cols = !!id_cols,
     names_repair = names_repair,
     values_fill = values_fill,
-    values_fn = values_fn
+    values_fn = values_fn,
+    unused_fn = unused_fn
   )
 }
 
@@ -104,7 +149,9 @@ dbplyr_build_wider_spec <- function(data,
                                     names_prefix = "",
                                     names_sep = "_",
                                     names_glue = NULL,
-                                    names_sort = FALSE) {
+                                    names_sort = FALSE,
+                                    names_vary = "fastest",
+                                    names_expand = FALSE) {
   if (!inherits(data, "tbl_sql")) {
     error_message <- c(
       "`dbplyr_build_wider_spec()` doesn't work with local lazy tibbles.",
@@ -119,10 +166,16 @@ dbplyr_build_wider_spec <- function(data,
   # row_ids <- vec_unique(data[names_from])
   sim_data <- simulate_vars(data)
   names_from <- tidyselect::eval_select(enquo(names_from), sim_data) %>% names()
+  if (is_empty(names_from)) {
+    abort("`names_from` must select at least one column.")
+  }
   distinct_data <- collect(distinct(data, !!!syms(names_from)))
 
   # 2. add `values_from` column
   values_from <- tidyselect::eval_select(enquo(values_from), sim_data) %>% names()
+  if (is_empty(values_from)) {
+    abort("`values_from` must select at least one column.")
+  }
   dummy_data <- vctrs::vec_cbind(
     distinct_data,
     !!!rlang::rep_named(values_from, list(TRUE)),
@@ -135,7 +188,9 @@ dbplyr_build_wider_spec <- function(data,
     names_prefix = names_prefix,
     names_sep = names_sep,
     names_glue = names_glue,
-    names_sort = names_sort
+    names_sort = names_sort,
+    names_vary = names_vary,
+    names_expand = names_expand
   )
 }
 
@@ -144,69 +199,45 @@ dbplyr_pivot_wider_spec <- function(data,
                                     names_repair = "check_unique",
                                     id_cols = NULL,
                                     values_fill = NULL,
-                                    values_fn = ~ max(.x, na.rm = TRUE)) {
-  spec <- check_spec(spec)
+                                    values_fn = ~ max(.x, na.rm = TRUE),
+                                    unused_fn = NULL) {
+  input <- data
 
-  if (is_scalar(values_fill)) {
-    values_fill <- rep_named(unique(spec$.value), list(values_fill))
-  }
-  if (!is.null(values_fill) && !is.list(values_fill)) {
-    abort("`values_fill` must be NULL, a scalar, or a named list")
-  }
+  spec <- tidyr::check_pivot_spec(spec)
 
-  values <- vctrs::vec_unique(spec$.value)
-  spec_cols <- c(names(spec)[-(1:2)], values)
+  names_from_cols <- names(spec)[-(1:2)]
+  values_from_cols <- vctrs::vec_unique(spec$.value)
+  non_id_cols <- c(names_from_cols, values_from_cols)
 
-  if (is.null(values_fn)) {
-    abort(c(
-      "`values_fn` must not be NULL",
-      i = "`values_fn` must be a function or a named list of functions"
-    ))
-  }
+  id_cols <- select_wider_id_cols(
+    data = data,
+    id_cols = {{id_cols}},
+    non_id_cols = non_id_cols
+  )
 
-  if (!vctrs::vec_is_list(values_fn)) {
-    values_fn <- rep_named(values, list(values_fn))
+  if (is.null(values_fill)) {
+    values_fill <- list()
   }
+  if (is_scalar(values_fill)) {
+    values_fill <- rep_named(values_from_cols, list(values_fill))
+  }
+  if (!vctrs::vec_is_list(values_fill)) {
+    abort("`values_fill` must be NULL, a scalar, or a named list")
+  }
+  values_fill <- values_fill[intersect(names(values_fill), values_from_cols)]
 
-  values_fn <- purrr::compact(values_fn)
-  missing_values <- setdiff(values, names(values_fn))
+  values_fn <- check_list_of_functions(values_fn, values_from_cols, "values_fn")
+  missing_values <- setdiff(values_from_cols, names(values_fn))
   if (!is_empty(missing_values)) {
     abort("`values_fn` must specify a function for each col in `values_from`")
   }
 
-  id_cols <- enquo(id_cols)
-  if (!quo_is_null(id_cols)) {
-    cn <- set_names(colnames(data))
-    key_vars <- names(tidyselect::eval_select(enquo(id_cols), cn))
-  } else {
-    key_vars <- tbl_vars(data)
-  }
-  key_vars <- setdiff(key_vars, spec_cols)
-
-  key_cols <- syms(names(spec)[-(1:2)])
   pivot_exprs <- purrr::map(
-    vctrs::vec_seq_along(spec),
-    function(row) {
-      values_col <- spec[[".value"]][row]
-      fill_value <- values_fill[[values_col]]
-
-      keys <- vctrs::vec_slice(spec[, -(1:2)], row)
-      keys_cond <- purrr::imap(
-        keys,
-        function(value, name) {
-          expr(!!sym(name) == !!value)
-        }
-      ) %>%
-        purrr::reduce(~ expr(!!.x & !!.y))
-
-      case_expr <- expr(ifelse(!!keys_cond, !!sym(values_col), !!fill_value))
-
-      agg_fn <- values_fn[[values_col]]
-      resolve_fun(agg_fn, case_expr)
-    }
-  ) %>%
-    set_names(spec$.name)
+    set_names(vctrs::vec_seq_along(spec), spec$.name),
+    ~ build_pivot_wider_exprs(.x, spec, values_fill, values_fn)
+  )
 
+  key_vars <- setdiff(id_cols, non_id_cols)
   data_grouped <- group_by(data, !!!syms(key_vars), .add = TRUE)
 
   group_names <- group_vars(data_grouped)
@@ -218,13 +249,80 @@ dbplyr_pivot_wider_spec <- function(data,
   }
   pivot_exprs <- set_names(pivot_exprs, out_nms_repaired)
 
+  unused_cols <- setdiff(colnames(data), c(id_cols, non_id_cols))
+  unused_fn <- check_list_of_functions(unused_fn, unused_cols, "unused_fn")
+  unused_col_expr <- purrr::imap(unused_fn, ~ resolve_fun(.x, sym(.y)))
+
   data_grouped %>%
-    summarise(!!!pivot_exprs, .groups = "drop") %>%
+    summarise(
+      !!!pivot_exprs,
+      !!!unused_col_expr,
+      .groups = "drop"
+    ) %>%
     group_by(!!!syms(group_vars(data)))
 }
 
 globalVariables(c("name", "value"))
 
+build_wider_id_cols_expr <- function(data,
+                                     id_cols = NULL,
+                                     names_from = name,
+                                     values_from = value) {
+  # COPIED FROM tidyr
+  # TODO: Use `allow_rename = FALSE`.
+  # Requires https://github.com/r-lib/tidyselect/issues/225.
+  sim_data <- simulate_vars(data)
+  names_from <- names(tidyselect::eval_select(enquo(names_from), sim_data))
+  values_from <- names(tidyselect::eval_select(enquo(values_from), sim_data))
+  non_id_cols <- c(names_from, values_from)
+
+  out <- select_wider_id_cols(
+    data = data,
+    id_cols = {{id_cols}},
+    non_id_cols = non_id_cols
+  )
+
+  expr(c(!!!out))
+}
+
+build_pivot_wider_exprs <- function(row_id, spec, values_fill, values_fn) {
+  values_col <- spec[[".value"]][row_id]
+  fill_value <- values_fill[[values_col]]
+
+  keys <- vctrs::vec_slice(spec[, -(1:2)], row_id)
+  keys_cond <- purrr::imap(
+    keys,
+    function(value, name) {
+      expr(!!sym(name) == !!value)
+    }
+  ) %>%
+    purrr::reduce(~ expr(!!.x & !!.y))
+
+  case_expr <- expr(ifelse(!!keys_cond, !!sym(values_col), !!fill_value))
+
+  agg_fn <- values_fn[[values_col]]
+  resolve_fun(agg_fn, case_expr)
+}
+
+select_wider_id_cols <- function(data,
+                                 id_cols = NULL,
+                                 non_id_cols = character()) {
+  # COPIED FROM tidyr
+  id_cols <- enquo(id_cols)
+  sim_data <- simulate_vars(data)
+
+  # Remove known non-id-cols so they are never selected
+  sim_data <- sim_data[setdiff(names(sim_data), non_id_cols)]
+
+  if (quo_is_null(id_cols)) {
+    names(sim_data)
+  } else {
+    # TODO: Use `allow_rename = FALSE`.
+    # Requires https://github.com/r-lib/tidyselect/issues/225.
+    names(tidyselect::eval_select(enquo(id_cols), sim_data))
+  }
+}
+
 is_scalar <- function(x) {
   if (is.null(x)) {
     return(FALSE)
@@ -244,6 +342,9 @@ resolve_fun <- function(x, var) {
     exec(.fn_expr, var)
   } else {
     fn_name <- find_fun(x)
+    if (is_null(fn_name)) {
+      abort("Can't convert to a function.")
+    }
     call2(fn_name, var)
   }
 }