Minimise regrouping work (#4751)

This PR starts to develop the dplyr "interface", i.e. the set of generics that you need to provide methods for if you want to extend dplyr to work with new data frame subclasses. It also uses those methods (along with a count_regroups() helper) to ensure that the existing grouped_df implementations are not needlessly regrouping data. Fixes #4086 because count() can now use dplyr_reconstruct() to restore the original class. Fixes #4051 because I've carefully documented the return value of the major verbs. Fixes #4711 since implementing with_groups() is now easy.
tidyverse · Jan 15, 2020 · 56c2197 · 56c2197
1 parent 8df2d66
commit 56c2197
Show file tree

Hide file tree

Showing 47 changed files with 818 additions and 507 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -9,7 +9,6 @@ S3method("names<-",grouped_df)
 S3method(anti_join,data.frame)
 S3method(arrange,data.frame)
 S3method(arrange,default)
-S3method(arrange,grouped_df)
 S3method(arrange_,data.frame)
 S3method(arrange_,tbl_df)
 S3method(as.data.frame,grouped_df)
@@ -32,7 +31,6 @@ S3method(default_missing,data.frame)
 S3method(default_missing,default)
 S3method(distinct,data.frame)
 S3method(distinct,default)
-S3method(distinct,grouped_df)
 S3method(distinct_,data.frame)
 S3method(distinct_,grouped_df)
 S3method(distinct_,tbl_df)
@@ -45,9 +43,14 @@ S3method(do_,"NULL")
 S3method(do_,data.frame)
 S3method(do_,grouped_df)
 S3method(do_,rowwise_df)
+S3method(dplyr_col_modify,data.frame)
+S3method(dplyr_col_modify,grouped_df)
+S3method(dplyr_reconstruct,data.frame)
+S3method(dplyr_reconstruct,grouped_df)
+S3method(dplyr_row_slice,data.frame)
+S3method(dplyr_row_slice,grouped_df)
 S3method(filter,data.frame)
 S3method(filter,default)
-S3method(filter,grouped_df)
 S3method(filter,ts)
 S3method(filter_,data.frame)
 S3method(filter_,tbl_df)
@@ -88,7 +91,6 @@ S3method(intersect,default)
 S3method(left_join,data.frame)
 S3method(mutate,data.frame)
 S3method(mutate,default)
-S3method(mutate,grouped_df)
 S3method(mutate_,data.frame)
 S3method(mutate_,tbl_df)
 S3method(n_groups,data.frame)
@@ -108,7 +110,6 @@ S3method(recode,factor)
 S3method(recode,numeric)
 S3method(rename,data.frame)
 S3method(rename,default)
-S3method(rename,grouped_df)
 S3method(rename_,data.frame)
 S3method(rename_,grouped_df)
 S3method(right_join,data.frame)
@@ -119,7 +120,6 @@ S3method(sample_n,data.frame)
 S3method(sample_n,default)
 S3method(select,data.frame)
 S3method(select,default)
-S3method(select,grouped_df)
 S3method(select,list)
 S3method(select_,data.frame)
 S3method(select_,grouped_df)
@@ -130,7 +130,6 @@ S3method(setequal,data.frame)
 S3method(setequal,default)
 S3method(slice,data.frame)
 S3method(slice,default)
-S3method(slice,grouped_df)
 S3method(slice_,data.frame)
 S3method(slice_,tbl_df)
 S3method(slice_head,data.frame)
@@ -150,7 +149,6 @@ S3method(tbl_ptype,default)
 S3method(tbl_sum,grouped_df)
 S3method(tbl_vars,data.frame)
 S3method(transmute,data.frame)
-S3method(transmute,grouped_df)
 S3method(transmute_,default)
 S3method(transmute_,grouped_df)
 S3method(ungroup,data.frame)
@@ -241,6 +239,9 @@ export(distinct_if)
 export(distinct_prepare)
 export(do)
 export(do_)
+export(dplyr_col_modify)
+export(dplyr_reconstruct)
+export(dplyr_row_slice)
 export(ends_with)
 export(enexpr)
 export(enexprs)
@@ -424,6 +425,7 @@ export(union)
 export(union_all)
 export(validate_grouped_df)
 export(vars)
+export(with_groups)
 export(with_order)
 export(wrap_dbplyr_obj)
 import(rlang)

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,25 @@
 # dplyr 0.9.0 (in development)
 
+* New, experimental, `with_groups()` makes it easy to temporarily group or
+  ungroup (#4711).
+
+* dplyr now has a rudimentary, experimental, and stop-gap extension mechanism
+  documented in `?dplyr_extending`.
+
+* The implementation of all verbs has been carefully thought through. This 
+  mostly makes implementation simpler but should hopefully increase consistency,
+  and also makes it easier to adapt dplyr to new data structures in the
+  near future. Pragmatically, the biggest difference for most people will be
+  that each verb documents its return value in terms of rows, columns, groups,
+  and data frame attributes.
+
+* Row names are now preserved when working with data frames.
+
+* `count()` and `add_count()` now preserve the type of the input (#4086).
+
+* `add_count(drop = )` is deprecated because it didn't actually affect
+  the output.
+
 * `full_join()` gains keep argument so that you can optionally choose to 
   keep both sets of join keys (#4589). This is useful when you want to
   figure out which rows were missing from either side.

diff --git a/R/all-equal.r b/R/all-equal.r
@@ -66,29 +66,29 @@ equal_data_frame <- function(x, y, ignore_col_order = TRUE, ignore_row_order = T
   # keys must be identical
   msg <- ""
   if (any(wrong <- !vec_in(x_split$key, y_split$key))) {
-    rows <- sort(map_int(x_split$pos[which(wrong)], function(.x) .x[1L]))
+    rows <- sort(map_int(x_split$loc[which(wrong)], function(.x) .x[1L]))
     msg <- paste0(msg, "- Rows in x but not in y: ", glue_collapse(rows, sep = ", "), "\n")
   }
 
   if (any(wrong <- !vec_in(y_split$key, x_split$key))) {
-    rows <- sort(map_int(y_split$pos[which(wrong)], function(.x) .x[1L]))
+    rows <- sort(map_int(y_split$loc[which(wrong)], function(.x) .x[1L]))
     msg <- paste0(msg, "- Rows in y but not in x: ", glue_collapse(rows, sep = ", "), "\n")
   }
   if (msg != "") {
     return(msg)
   }
 
   # keys are identical, check that rows occur the same number of times
-  if (any(wrong <- lengths(x_split$pos) != lengths(y_split$pos))) {
-    rows <- sort(map_int(x_split$pos[which(wrong)], function(.x) .x[1L]))
+  if (any(wrong <- lengths(x_split$loc) != lengths(y_split$loc))) {
+    rows <- sort(map_int(x_split$loc[which(wrong)], function(.x) .x[1L]))
     return(paste0("- Rows with difference occurences in x and y: ",
       glue_collapse(rows, sep = ", "),
       "\n"
     ))
   }
 
   # then if we care about row order, the id need to be identical
-  if (!ignore_row_order && !all(vec_equal(x_split$pos, y_split$pos))) {
+  if (!ignore_row_order && !all(vec_equal(x_split$loc, y_split$loc))) {
     return("Same row values, but different order")
   }
 

diff --git a/R/arrange.R b/R/arrange.R
@@ -16,11 +16,14 @@
 #' * treated differently for remote data, depending on the backend.
 #'
 #' @return
-#' An object of the same type as `.data`. The columns will be left as is;
-#' the rows will be in different order.
+#' An object of the same type as `.data`.
+#'
+#' * All rows appear in the output, but (usually) in a different place.
+#' * Columns are not modified.
+#' * Groups are not modified.
+#' * Data frame attributes are preserved.
 #' @export
 #' @inheritParams filter
-#' @inheritSection filter Tidy data
 #' @param ... <[`tidy-eval`][dplyr_tidy_eval]> Variables, or functions or
 #'   variables. Use [desc()] to sort a variable in descending order.
 #' @family single table verbs
@@ -46,19 +49,8 @@ arrange.data.frame <- function(.data, ..., .by_group = FALSE) {
     return(.data)
   }
 
-  idx <- arrange_rows(.data, ...)
-  .data[idx, , drop = FALSE]
-}
-
-#' @export
-arrange.grouped_df <- function(.data, ..., .by_group = FALSE) {
-  if (missing(...)) {
-    return(.data)
-  }
-
-  # TODO: figure out how to update group_indices more efficiently
-  idx <- arrange_rows(.data, ..., .by_group = .by_group)
-  .data[idx, , drop = FALSE]
+  loc <- arrange_rows(.data, ..., .by_group = .by_group)
+  dplyr_row_slice(.data, loc)
 }
 
 # Helpers -----------------------------------------------------------------

diff --git a/R/count-tally.R b/R/count-tally.R
@@ -25,8 +25,12 @@
 #'
 #'   If omitted, it will default to `n`. If there's already a column called `n`,
 #'   it will error, and require you to specify the name.
-#' @param .drop see [group_by()]
-#' @return A tbl, grouped the same way as the input.
+#' @param .drop For `count()`: if `FALSE` will include counts for empty groups
+#'   (i.e. for levels of factors that don't exist in the data). Deprecated for
+#'   `add_count()` since it didn't actually affect the output.
+#' @return
+#' An object of the same type as `.data`. `count()` and `add_count()`
+#' group transiently, so the output has the same groups as the input.
 #' @export
 #' @examples
 #' # count() is a convenient way to get a sense of the distribution of
@@ -84,26 +88,33 @@ add_tally <- function(x, wt = NULL, sort = FALSE, name = NULL) {
 #' @export
 #' @rdname tally
 count <- function(x, ..., wt = NULL, sort = FALSE, name = NULL, .drop = group_by_drop_default(x)) {
-  groups <- group_vars(x)
+
   if (!missing(...)) {
-    x <- .group_by_static_drop(x, ..., .add = TRUE, .drop = .drop)
+    out <- group_by(x, ..., .add = TRUE, .drop = .drop)
+  } else {
+    out <- x
   }
 
-  x <- tally(x, wt = !!enquo(wt), sort = sort, name = name)
-  x <- .group_by_static_drop(x, !!!syms(groups), .add = FALSE, .drop = .drop)
-  x
+  out <- tally(out, wt = !!enquo(wt), sort = sort, name = name)
+  dplyr_reconstruct(out, x)
 }
 
 #' @rdname tally
 #' @export
-add_count <- function(x, ..., wt = NULL, sort = FALSE, name = NULL, .drop = group_by_drop_default(x)) {
-  groups <- group_vars(x)
+add_count <- function(x, ..., wt = NULL, sort = FALSE, name = NULL, .drop = deprecated()) {
+  if (!missing(.drop)) {
+    lifecycle::deprecate_warn("1.0.0", "add_count(.drop = )")
+  }
+
   if (!missing(...)) {
-    x <- .group_by_static_drop(x, ..., .add = TRUE, .drop = .drop)
+    out <- group_by(x, ..., .add = TRUE)
+  } else {
+    out <- x
   }
+  out <- add_tally(out, wt = !!enquo(wt), sort = sort, name = name)
 
-  x <- add_tally(x, wt = !!enquo(wt), sort = sort, name = name)
-  x <- .group_by_static_drop(x, !!!syms(groups), .add = FALSE, .drop = .drop)
+  name <- check_name(x, name)
+  x[[name]] <- out[[name]]
   x
 }
 

diff --git a/R/deprec-lazyeval.R b/R/deprec-lazyeval.R
@@ -325,7 +325,7 @@ select_.data.frame <- function(.data, ..., .dots = list()) {
 #' @export
 select_.grouped_df <- function(.data, ..., .dots = list()) {
   dots <- compat_lazy_dots(.dots, caller_env(), ...)
-  select.grouped_df(.data, !!!dots)
+  select(.data, !!!dots)
 }
 
 

diff --git a/R/distinct.R b/R/distinct.R
@@ -12,10 +12,13 @@
 #'   If a combination of `...` is not distinct, this keeps the
 #'   first row of values.
 #' @return
-#' An object the same type as `.data`. If `...` is empty or `.keep_all` is
-#' `TRUE`, the columns will be unchanged. Otherwise, it will first perform a
-#' `mutate()`. The rows will be in the same order as the input, but only
-#' distinct elements will be preserved.
+#' An object of the same type as `.data`.
+#'
+#' * Rows are a subset of the input, but appear in the same order.
+#' * Columns are not modified if `...` is empty or `.keep_all` is `TRUE`.
+#'   Otherwise, `distinct()` first calls `mutate()` to create new columns.
+#' * Groups are not modified.
+#' * Data frame attributes are preserved.
 #' @export
 #' @examples
 #' df <- tibble(
@@ -91,24 +94,17 @@ distinct_prepare <- function(.data, vars, group_vars = character(), .keep_all =
 
 #' @export
 distinct.data.frame <- function(.data, ..., .keep_all = FALSE) {
-  prep <- distinct_prepare(.data, enquos(...), .keep_all = .keep_all)
-
-  idx <- vec_unique_loc(prep$data[, prep$vars, drop = FALSE])
-  prep$data[idx, prep$keep, drop = FALSE]
-}
-
-#' @export
-distinct.grouped_df <- function(.data, ..., .keep_all = FALSE) {
-  prep <- distinct_prepare(
-    .data,
+  prep <- distinct_prepare(.data,
     vars = enquos(...),
     group_vars = group_vars(.data),
     .keep_all = .keep_all
   )
 
-  # TODO: figure out how to update group indices more efficiently
-  idx <- vec_unique_loc(prep$data[, prep$vars, drop = FALSE])
-  prep$data[idx, prep$keep, drop = FALSE]
+  # out <- as_tibble(prep$data)
+  out <- prep$data
+  loc <- vec_unique_loc(as_tibble(out)[prep$vars])
+
+  dplyr_row_slice(out[prep$keep], loc)
 }
 
 

diff --git a/R/dplyr.r b/R/dplyr.r
@@ -43,4 +43,4 @@
 #' @importFrom lifecycle deprecated
 "_PACKAGE"
 
-utils::globalVariables(c("old_keys", "old_rows", ".rows", "new_indices", "new_rows", "new_rows_sizes", "needs_recycle", "distinct_vars"))
+utils::globalVariables(c("old_keys", "old_rows", ".rows", "new_indices", "new_rows", "new_rows_sizes", "needs_recycle", "distinct_vars", "out"))
diff --git a/R/filter.R b/R/filter.R
@@ -41,10 +41,6 @@
 #' When applied on a grouped tibble, `filter()` automatically [rearranges][arrange]
 #' the tibble by groups for performance reasons.
 #'
-#' @section Tidy data:
-#' When applied to a data frame, row names are silently dropped. To preserve,
-#' convert to an explicit variable with [tibble::rownames_to_column()].
-#'
 #' @section Scoped filtering:
 #' The three [scoped] variants ([filter_all()], [filter_if()] and
 #' [filter_at()]) make it easy to apply a filtering condition to a
@@ -59,7 +55,13 @@
 #'   condition evaluates to `TRUE` are kept.
 #' @param .preserve when `FALSE` (the default), the grouping structure
 #'   is recalculated based on the resulting data, otherwise it is kept as is.
-#' @inherit arrange return
+#' @return
+#' An object of the same type as `.data`.
+#'
+#' * Rows are a subset of the input, but appear in the same order.
+#' * Columns are not modified.
+#' * The number of groups may be reduced (if `.preserve` is not `TRUE`).
+#' * Data frame attributes are preserved.
 #' @seealso [filter_all()], [filter_if()] and [filter_at()].
 #' @export
 #' @examples
@@ -105,24 +107,8 @@ filter.data.frame <- function(.data, ..., .preserve = FALSE) {
     return(.data)
   }
 
-  idx <- filter_rows(.data, ...)
-  .data[idx[[1]], , drop = FALSE]
-}
-
-#' @export
-filter.grouped_df <- function(.data, ..., .preserve = !group_by_drop_default(.data)) {
-  if (missing(...)) {
-    return(.data)
-  }
-
-  idx <- filter_rows(.data, ...)
-  data <- as.data.frame(.data)[idx[[1]], , drop = FALSE]
-
-  groups <- group_data(.data)
-  groups$.rows <- filter_update_rows(nrow(.data), idx[[3]], idx[[1]], idx[[2]])
-  groups <- group_data_trim(groups, .preserve)
-
-  new_grouped_df(data, groups)
+  loc <- filter_rows(.data, ...)[[1]]
+  dplyr_row_slice(.data, loc, preserve = .preserve)
 }
 
 filter_rows <- function(.data, ...) {
@@ -160,8 +146,3 @@ check_filter <- function(dots) {
 
   }
 }
-
-
-filter_update_rows <- function(n_rows, group_indices, keep, new_rows_sizes) {
-  .Call(`dplyr_filter_update_rows`, n_rows, group_indices, keep, new_rows_sizes)
-}