Skip to content

Commit

Permalink
Minimise regrouping work (#4751)
Browse files Browse the repository at this point in the history
This PR starts to develop the dplyr "interface", i.e. the set of generics that you need to provide methods for if you want to extend dplyr to work with new data frame subclasses. It also uses those methods (along with a count_regroups()) to ensure that the existing grouped_df implementations are not needlessly regrouping data.

Fixes #4086 because count() can now use dplyr_reconstruct() to restore the original class
Fixes #4051 because I've carefully documented the return value of the major verbs
Fixes #4711 since implementing with_groups() is now easy.
  • Loading branch information
hadley committed Jan 15, 2020
1 parent 8df2d66 commit 56c2197
Show file tree
Hide file tree
Showing 47 changed files with 818 additions and 507 deletions.
18 changes: 10 additions & 8 deletions NAMESPACE
Expand Up @@ -9,7 +9,6 @@ S3method("names<-",grouped_df)
S3method(anti_join,data.frame)
S3method(arrange,data.frame)
S3method(arrange,default)
S3method(arrange,grouped_df)
S3method(arrange_,data.frame)
S3method(arrange_,tbl_df)
S3method(as.data.frame,grouped_df)
Expand All @@ -32,7 +31,6 @@ S3method(default_missing,data.frame)
S3method(default_missing,default)
S3method(distinct,data.frame)
S3method(distinct,default)
S3method(distinct,grouped_df)
S3method(distinct_,data.frame)
S3method(distinct_,grouped_df)
S3method(distinct_,tbl_df)
Expand All @@ -45,9 +43,14 @@ S3method(do_,"NULL")
S3method(do_,data.frame)
S3method(do_,grouped_df)
S3method(do_,rowwise_df)
S3method(dplyr_col_modify,data.frame)
S3method(dplyr_col_modify,grouped_df)
S3method(dplyr_reconstruct,data.frame)
S3method(dplyr_reconstruct,grouped_df)
S3method(dplyr_row_slice,data.frame)
S3method(dplyr_row_slice,grouped_df)
S3method(filter,data.frame)
S3method(filter,default)
S3method(filter,grouped_df)
S3method(filter,ts)
S3method(filter_,data.frame)
S3method(filter_,tbl_df)
Expand Down Expand Up @@ -88,7 +91,6 @@ S3method(intersect,default)
S3method(left_join,data.frame)
S3method(mutate,data.frame)
S3method(mutate,default)
S3method(mutate,grouped_df)
S3method(mutate_,data.frame)
S3method(mutate_,tbl_df)
S3method(n_groups,data.frame)
Expand All @@ -108,7 +110,6 @@ S3method(recode,factor)
S3method(recode,numeric)
S3method(rename,data.frame)
S3method(rename,default)
S3method(rename,grouped_df)
S3method(rename_,data.frame)
S3method(rename_,grouped_df)
S3method(right_join,data.frame)
Expand All @@ -119,7 +120,6 @@ S3method(sample_n,data.frame)
S3method(sample_n,default)
S3method(select,data.frame)
S3method(select,default)
S3method(select,grouped_df)
S3method(select,list)
S3method(select_,data.frame)
S3method(select_,grouped_df)
Expand All @@ -130,7 +130,6 @@ S3method(setequal,data.frame)
S3method(setequal,default)
S3method(slice,data.frame)
S3method(slice,default)
S3method(slice,grouped_df)
S3method(slice_,data.frame)
S3method(slice_,tbl_df)
S3method(slice_head,data.frame)
Expand All @@ -150,7 +149,6 @@ S3method(tbl_ptype,default)
S3method(tbl_sum,grouped_df)
S3method(tbl_vars,data.frame)
S3method(transmute,data.frame)
S3method(transmute,grouped_df)
S3method(transmute_,default)
S3method(transmute_,grouped_df)
S3method(ungroup,data.frame)
Expand Down Expand Up @@ -241,6 +239,9 @@ export(distinct_if)
export(distinct_prepare)
export(do)
export(do_)
export(dplyr_col_modify)
export(dplyr_reconstruct)
export(dplyr_row_slice)
export(ends_with)
export(enexpr)
export(enexprs)
Expand Down Expand Up @@ -424,6 +425,7 @@ export(union)
export(union_all)
export(validate_grouped_df)
export(vars)
export(with_groups)
export(with_order)
export(wrap_dbplyr_obj)
import(rlang)
Expand Down
20 changes: 20 additions & 0 deletions NEWS.md
@@ -1,5 +1,25 @@
# dplyr 0.9.0 (in development)

* New, experimental, `with_groups()` makes it easy to temporarily group or
ungroup (#4711).

* dplyr now has a rudimentary, experimental, and stop-gap, extension mechanism
documented in `?dplyr_extending`.

* The implementation of all verbs has been carefully thought through. This
mostly makes implementation simpler but should hopefully increase consistency,
and also makes it easier to adapt dplyr to new data structures in the
near future. Pragmatically, the biggest difference for most people will be
that each verb documents its return value in terms of rows, columns, groups,
and data frame attributes.

* Row names are now preserved when working with data frames.

* `count()` and `add_count()` now preserve the type of the input (#4086).

* `add_count(drop = )` is deprecated because it didn't actually affect
the output.

* `full_join()` gains a `keep` argument so that you can optionally choose to
keep both sets of join keys (#4589). This is useful when you want to
figure out which rows were missing from either side.
Expand Down
10 changes: 5 additions & 5 deletions R/all-equal.r
Expand Up @@ -66,29 +66,29 @@ equal_data_frame <- function(x, y, ignore_col_order = TRUE, ignore_row_order = T
# keys must be identical
msg <- ""
if (any(wrong <- !vec_in(x_split$key, y_split$key))) {
rows <- sort(map_int(x_split$pos[which(wrong)], function(.x) .x[1L]))
rows <- sort(map_int(x_split$loc[which(wrong)], function(.x) .x[1L]))
msg <- paste0(msg, "- Rows in x but not in y: ", glue_collapse(rows, sep = ", "), "\n")
}

if (any(wrong <- !vec_in(y_split$key, x_split$key))) {
rows <- sort(map_int(y_split$pos[which(wrong)], function(.x) .x[1L]))
rows <- sort(map_int(y_split$loc[which(wrong)], function(.x) .x[1L]))
msg <- paste0(msg, "- Rows in y but not in x: ", glue_collapse(rows, sep = ", "), "\n")
}
if (msg != "") {
return(msg)
}

# keys are identical, check that rows occur the same number of times
if (any(wrong <- lengths(x_split$pos) != lengths(y_split$pos))) {
rows <- sort(map_int(x_split$pos[which(wrong)], function(.x) .x[1L]))
if (any(wrong <- lengths(x_split$loc) != lengths(y_split$loc))) {
rows <- sort(map_int(x_split$loc[which(wrong)], function(.x) .x[1L]))
return(paste0("- Rows with difference occurences in x and y: ",
glue_collapse(rows, sep = ", "),
"\n"
))
}

# then if we care about row order, the id need to be identical
if (!ignore_row_order && !all(vec_equal(x_split$pos, y_split$pos))) {
if (!ignore_row_order && !all(vec_equal(x_split$loc, y_split$loc))) {
return("Same row values, but different order")
}

Expand Down
24 changes: 8 additions & 16 deletions R/arrange.R
Expand Up @@ -16,11 +16,14 @@
#' * treated differently for remote data, depending on the backend.
#'
#' @return
#' An object of the same type as `.data`. The columns will be left as is;
#' the rows will be in different order.
#' An object of the same type as `.data`.
#'
#' * All rows appear in the output, but (usually) in a different place.
#' * Columns are not modified.
#' * Groups are not modified.
#' * Data frame attributes are preserved.
#' @export
#' @inheritParams filter
#' @inheritSection filter Tidy data
#' @param ... <[`tidy-eval`][dplyr_tidy_eval]> Variables, or functions of
#' variables. Use [desc()] to sort a variable in descending order.
#' @family single table verbs
Expand All @@ -46,19 +49,8 @@ arrange.data.frame <- function(.data, ..., .by_group = FALSE) {
return(.data)
}

idx <- arrange_rows(.data, ...)
.data[idx, , drop = FALSE]
}

#' @export
arrange.grouped_df <- function(.data, ..., .by_group = FALSE) {
if (missing(...)) {
return(.data)
}

# TODO: figure out how to update group_indices more efficiently
idx <- arrange_rows(.data, ..., .by_group = .by_group)
.data[idx, , drop = FALSE]
loc <- arrange_rows(.data, ..., .by_group = .by_group)
dplyr_row_slice(.data, loc)
}

# Helpers -----------------------------------------------------------------
Expand Down
35 changes: 23 additions & 12 deletions R/count-tally.R
Expand Up @@ -25,8 +25,12 @@
#'
#' If omitted, it will default to `n`. If there's already a column called `n`,
#' it will error, and require you to specify the name.
#' @param .drop see [group_by()]
#' @return A tbl, grouped the same way as the input.
#' @param .drop For `count()`: if `FALSE` will include counts for empty groups
#' (i.e. for levels of factors that don't exist in the data). Deprecated for
#' `add_count()` since it didn't actually affect the output.
#' @return
#' An object of the same type as `.data`. `count()` and `add_count()`
#' group transiently, so the output has the same groups as the input.
#' @export
#' @examples
#' # count() is a convenient way to get a sense of the distribution of
Expand Down Expand Up @@ -84,26 +88,33 @@ add_tally <- function(x, wt = NULL, sort = FALSE, name = NULL) {
#' @export
#' @rdname tally
count <- function(x, ..., wt = NULL, sort = FALSE, name = NULL, .drop = group_by_drop_default(x)) {
groups <- group_vars(x)

if (!missing(...)) {
x <- .group_by_static_drop(x, ..., .add = TRUE, .drop = .drop)
out <- group_by(x, ..., .add = TRUE, .drop = .drop)
} else {
out <- x
}

x <- tally(x, wt = !!enquo(wt), sort = sort, name = name)
x <- .group_by_static_drop(x, !!!syms(groups), .add = FALSE, .drop = .drop)
x
out <- tally(out, wt = !!enquo(wt), sort = sort, name = name)
dplyr_reconstruct(out, x)
}

#' @rdname tally
#' @export
add_count <- function(x, ..., wt = NULL, sort = FALSE, name = NULL, .drop = group_by_drop_default(x)) {
groups <- group_vars(x)
add_count <- function(x, ..., wt = NULL, sort = FALSE, name = NULL, .drop = deprecated()) {
if (!missing(.drop)) {
lifecycle::deprecate_warn("1.0.0", "add_count(.drop = )")
}

if (!missing(...)) {
x <- .group_by_static_drop(x, ..., .add = TRUE, .drop = .drop)
out <- group_by(x, ..., .add = TRUE)
} else {
out <- x
}
out <- add_tally(out, wt = !!enquo(wt), sort = sort, name = name)

x <- add_tally(x, wt = !!enquo(wt), sort = sort, name = name)
x <- .group_by_static_drop(x, !!!syms(groups), .add = FALSE, .drop = .drop)
name <- check_name(x, name)
x[[name]] <- out[[name]]
x
}

Expand Down
2 changes: 1 addition & 1 deletion R/deprec-lazyeval.R
Expand Up @@ -325,7 +325,7 @@ select_.data.frame <- function(.data, ..., .dots = list()) {
#' @export
select_.grouped_df <- function(.data, ..., .dots = list()) {
dots <- compat_lazy_dots(.dots, caller_env(), ...)
select.grouped_df(.data, !!!dots)
select(.data, !!!dots)
}


Expand Down
30 changes: 13 additions & 17 deletions R/distinct.R
Expand Up @@ -12,10 +12,13 @@
#' If a combination of `...` is not distinct, this keeps the
#' first row of values.
#' @return
#' An object the same type as `.data`. If `...` is empty or `.keep_all` is
#' `TRUE`, the columns will be unchanged. Otherwise, it will first perform a
#' `mutate()`. The rows will be in the same order as the input, but only
#' distinct elements will be preserved.
#' An object of the same type as `.data`.
#'
#' * Rows are a subset of the input, but appear in the same order.
#' * Columns are not modified if `...` is empty or `.keep_all` is `TRUE`.
#' Otherwise, `distinct()` first calls `mutate()` to create new columns.
#' * Groups are not modified.
#' * Data frame attributes are preserved.
#' @export
#' @examples
#' df <- tibble(
Expand Down Expand Up @@ -91,24 +94,17 @@ distinct_prepare <- function(.data, vars, group_vars = character(), .keep_all =

#' @export
distinct.data.frame <- function(.data, ..., .keep_all = FALSE) {
prep <- distinct_prepare(.data, enquos(...), .keep_all = .keep_all)

idx <- vec_unique_loc(prep$data[, prep$vars, drop = FALSE])
prep$data[idx, prep$keep, drop = FALSE]
}

#' @export
distinct.grouped_df <- function(.data, ..., .keep_all = FALSE) {
prep <- distinct_prepare(
.data,
prep <- distinct_prepare(.data,
vars = enquos(...),
group_vars = group_vars(.data),
.keep_all = .keep_all
)

# TODO: figure out how to update group indices more efficiently
idx <- vec_unique_loc(prep$data[, prep$vars, drop = FALSE])
prep$data[idx, prep$keep, drop = FALSE]
# out <- as_tibble(prep$data)
out <- prep$data
loc <- vec_unique_loc(as_tibble(out)[prep$vars])

dplyr_row_slice(out[prep$keep], loc)
}


Expand Down
2 changes: 1 addition & 1 deletion R/dplyr.r
Expand Up @@ -43,4 +43,4 @@
#' @importFrom lifecycle deprecated
"_PACKAGE"

utils::globalVariables(c("old_keys", "old_rows", ".rows", "new_indices", "new_rows", "new_rows_sizes", "needs_recycle", "distinct_vars"))
utils::globalVariables(c("old_keys", "old_rows", ".rows", "new_indices", "new_rows", "new_rows_sizes", "needs_recycle", "distinct_vars", "out"))
37 changes: 9 additions & 28 deletions R/filter.R
Expand Up @@ -41,10 +41,6 @@
#' When applied on a grouped tibble, `filter()` automatically [rearranges][arrange]
#' the tibble by groups for performance reasons.
#'
#' @section Tidy data:
#' When applied to a data frame, row names are silently dropped. To preserve,
#' convert to an explicit variable with [tibble::rownames_to_column()].
#'
#' @section Scoped filtering:
#' The three [scoped] variants ([filter_all()], [filter_if()] and
#' [filter_at()]) make it easy to apply a filtering condition to a
Expand All @@ -59,7 +55,13 @@
#' condition evaluates to `TRUE` are kept.
#' @param .preserve when `FALSE` (the default), the grouping structure
#' is recalculated based on the resulting data, otherwise it is kept as is.
#' @inherit arrange return
#' @return
#' An object of the same type as `.data`.
#'
#' * Rows are a subset of the input, but appear in the same order.
#' * Columns are not modified.
#' * The number of groups may be reduced (if `.preserve` is not `TRUE`).
#' * Data frame attributes are preserved.
#' @seealso [filter_all()], [filter_if()] and [filter_at()].
#' @export
#' @examples
Expand Down Expand Up @@ -105,24 +107,8 @@ filter.data.frame <- function(.data, ..., .preserve = FALSE) {
return(.data)
}

idx <- filter_rows(.data, ...)
.data[idx[[1]], , drop = FALSE]
}

#' @export
filter.grouped_df <- function(.data, ..., .preserve = !group_by_drop_default(.data)) {
if (missing(...)) {
return(.data)
}

idx <- filter_rows(.data, ...)
data <- as.data.frame(.data)[idx[[1]], , drop = FALSE]

groups <- group_data(.data)
groups$.rows <- filter_update_rows(nrow(.data), idx[[3]], idx[[1]], idx[[2]])
groups <- group_data_trim(groups, .preserve)

new_grouped_df(data, groups)
loc <- filter_rows(.data, ...)[[1]]
dplyr_row_slice(.data, loc, preserve = .preserve)
}

filter_rows <- function(.data, ...) {
Expand Down Expand Up @@ -160,8 +146,3 @@ check_filter <- function(dots) {

}
}


filter_update_rows <- function(n_rows, group_indices, keep, new_rows_sizes) {
.Call(`dplyr_filter_update_rows`, n_rows, group_indices, keep, new_rows_sizes)
}

0 comments on commit 56c2197

Please sign in to comment.