-
Notifications
You must be signed in to change notification settings - Fork 2.1k
/
arrange.R
280 lines (241 loc) · 9.1 KB
/
arrange.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
#' Order rows using column values
#'
#' @description
#' `arrange()` orders the rows of a data frame by the values of selected
#' columns.
#'
#' Unlike other dplyr verbs, `arrange()` largely ignores grouping; you
#' need to explicitly mention grouping variables (or use `.by_group = TRUE`)
#' in order to group by them, and functions of variables are evaluated
#' once per data frame, not once per group.
#'
#' @details
#' ## Missing values
#' Unlike base sorting with `sort()`, `NA` are:
#' * always sorted to the end for local data, even when wrapped with `desc()`.
#' * treated differently for remote data, depending on the backend.
#'
#' @return
#' An object of the same type as `.data`. The output has the following
#' properties:
#'
#' * All rows appear in the output, but (usually) in a different place.
#' * Columns are not modified.
#' * Groups are not modified.
#' * Data frame attributes are preserved.
#' @section Methods:
#' This function is a **generic**, which means that packages can provide
#' implementations (methods) for other classes. See the documentation of
#' individual methods for extra arguments and differences in behaviour.
#'
#' The following methods are currently available in loaded packages:
#' \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("arrange")}.
#' @export
#' @param .data A data frame, data frame extension (e.g. a tibble), or a
#' lazy data frame (e.g. from dbplyr or dtplyr). See *Methods*, below, for
#' more details.
#' @param ... <[`data-masking`][rlang::args_data_masking]> Variables, or
#' functions of variables. Use [desc()] to sort a variable in descending
#' order.
#' @param .by_group If `TRUE`, will sort first by grouping variable. Applies to
#' grouped data frames only.
#' @param .locale The locale to sort character vectors in.
#'
#' - If `NULL`, the default, uses the `"C"` locale unless the
#' `dplyr.legacy_locale` global option escape hatch is active. See the
#' [dplyr-locale] help page for more details.
#'
#' - If a single string from [stringi::stri_locale_list()] is supplied, then
#' this will be used as the locale to sort with. For example, `"en"` will
#' sort with the American English locale. This requires the stringi package.
#'
#' - If `"C"` is supplied, then character vectors will always be sorted in the
#' C locale. This does not require stringi and is often much faster than
#' supplying a locale identifier.
#'
#' The C locale is not the same as English locales, such as `"en"`,
#' particularly when it comes to data containing a mix of upper and lower case
#' letters. This is explained in more detail on the [locale][dplyr-locale]
#' help page under the `Default locale` section.
#' @family single table verbs
#' @examples
#' arrange(mtcars, cyl, disp)
#' arrange(mtcars, desc(disp))
#'
#' # grouped arrange ignores groups
#' by_cyl <- mtcars %>% group_by(cyl)
#' by_cyl %>% arrange(desc(wt))
#' # Unless you specifically ask:
#' by_cyl %>% arrange(desc(wt), .by_group = TRUE)
#'
#' # use embracing when wrapping in a function;
#' # see ?rlang::args_data_masking for more details
#' tidy_eval_arrange <- function(.data, var) {
#' .data %>%
#' arrange({{ var }})
#' }
#' tidy_eval_arrange(mtcars, mpg)
#'
#' # Use `across()` or `pick()` to select columns with tidy-select
#' iris %>% arrange(pick(starts_with("Sepal")))
#' iris %>% arrange(across(starts_with("Sepal"), desc))
arrange <- function(.data, ..., .by_group = FALSE) {
UseMethod("arrange")
}
#' @rdname arrange
#' @export
arrange.data.frame <- function(.data,
...,
.by_group = FALSE,
.locale = NULL) {
dots <- enquos(...)
if (.by_group) {
dots <- c(quos(!!!groups(.data)), dots)
}
loc <- arrange_rows(.data, dots = dots, locale = .locale)
dplyr_row_slice(.data, loc)
}
# Helpers -----------------------------------------------------------------
arrange_rows <- function(data,
dots,
locale,
error_call = caller_env()) {
dplyr_local_error_call(error_call)
size <- nrow(data)
# `arrange()` implementation always uses bare data frames
data <- new_data_frame(data, n = size)
# Strip out calls to desc() replacing with direction argument
is_desc_call <- function(x) {
quo_is_call(x, "desc", ns = c("", "dplyr"))
}
directions <- map_chr(dots, function(dot) {
if (is_desc_call(dot)) "desc" else "asc"
})
dots <- map(dots, function(dot) {
if (is_desc_call(dot)) {
expr <- quo_get_expr(dot)
if (!has_length(expr, 2L)) {
abort("`desc()` must be called with exactly one argument.", call = error_call)
}
dot <- new_quosure(expr[[2]], quo_get_env(dot))
}
dot
})
n_dots <- length(dots)
seq_dots <- seq_len(n_dots)
cols <- vector("list", length = n_dots)
names(cols) <- as.character(seq_dots)
for (i in seq_dots) {
name <- vec_paste0("..", i)
dot <- dots[[i]]
# Evaluate each `dot` on the original data separately, rather than
# evaluating all at once. We want to avoid the "sequential evaluation"
# feature of `mutate()` where the 2nd expression can access results of the
# 1st (#6495).
elt <- mutate(data, "{name}" := !!dot, .keep = "none")
elt <- elt[[name]]
if (is.null(elt)) {
# In this case, `dot` evaluated to `NULL` for "column removal" so
# `elt[[name]]` won't exist, but we don't want to shorten `cols`.
next
}
cols[[i]] <- elt
}
if (vec_any_missing(cols)) {
# Drop `NULL`s that could result from `dot` evaluating to `NULL` above
not_missing <- !vec_detect_missing(cols)
cols <- vec_slice(cols, not_missing)
directions <- vec_slice(directions, not_missing)
}
data <- new_data_frame(cols, n = size)
if (is.null(locale) && dplyr_legacy_locale()) {
# Temporary legacy support for respecting the system locale.
# Only applied when `.locale` is `NULL` and `dplyr.legacy_locale` is set.
# Matches legacy `group_by()` ordering.
out <- dplyr_order_legacy(data = data, direction = directions)
return(out)
}
na_values <- if_else(directions == "desc", "smallest", "largest")
chr_proxy_collate <- locale_to_chr_proxy_collate(
locale = locale,
error_call = error_call
)
vec_order_radix(
x = data,
direction = directions,
na_value = na_values,
chr_proxy_collate = chr_proxy_collate
)
}
locale_to_chr_proxy_collate <- function(locale,
...,
has_stringi = has_minimum_stringi(),
error_call = caller_env()) {
check_dots_empty0(...)
if (is.null(locale) || is_string(locale, string = "C")) {
return(NULL)
}
if (is_character(locale)) {
if (!is_string(locale)) {
abort("If `.locale` is a character vector, it must be a single string.", call = error_call)
}
if (!has_stringi) {
abort("stringi >=1.5.3 is required to arrange in a different locale.", call = error_call)
}
if (!locale %in% stringi::stri_locale_list()) {
abort("`.locale` must be one of the locales within `stringi::stri_locale_list()`.", call = error_call)
}
return(sort_key_generator(locale))
}
abort("`.locale` must be a string or `NULL`.", call = error_call)
}
sort_key_generator <- function(locale) {
function(x) {
stringi::stri_sort_key(x, locale = locale)
}
}
# ------------------------------------------------------------------------------
dplyr_order_legacy <- function(data, direction = "asc") {
if (df_n_col(data) == 0L) {
# Work around `order(!!!list())` returning `NULL`
return(seq_len(nrow(data)))
}
proxies <- map2(data, direction, dplyr_proxy_order_legacy)
proxies <- unname(proxies)
inject(order(!!!proxies))
}
dplyr_proxy_order_legacy <- function(x, direction) {
# `order()` doesn't have a vectorized `decreasing` argument for most values of
# `method` ("radix" is an exception). So we need to apply this by column ahead
# of time. We have to apply `vec_proxy_order()` by column too, rather than on
# the original data frame, because it flattens df-cols and we can lose track
# of where to apply `direction`.
x <- vec_proxy_order(x)
if (is.data.frame(x)) {
if (any(map_lgl(x, is.data.frame))) {
abort(
"All data frame columns should have been flattened by now.",
.internal = TRUE
)
}
# Special handling for data frame proxies (either from df-cols or from
# vector classes with df proxies, like rcrds), which `order()` can't handle.
# We have to replace the df proxy with a single vector that orders the same
# way, so we use a dense rank that utilizes the system locale.
unique <- vec_unique(x)
order <- dplyr_order_legacy(unique, direction)
sorted_unique <- vec_slice(unique, order)
out <- vec_match(x, sorted_unique)
return(out)
}
if (!is_character(x) && !is_logical(x) && !is_integer(x) && !is_double(x) && !is_complex(x)) {
abort("Invalid type returned by `vec_proxy_order()`.", .internal = TRUE)
}
if (is.object(x)) {
x <- unstructure(x)
}
if (direction == "desc") {
x <- desc(x)
}
x
}