-
Notifications
You must be signed in to change notification settings - Fork 135
Expand file tree
/
Copy pathlump.R
More file actions
237 lines (210 loc) · 6.08 KB
/
lump.R
File metadata and controls
237 lines (210 loc) · 6.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
#' Superseded helper for lumping factor levels
#'
#' @description
#' `r lifecycle::badge("superseded")`
#'
#' `fct_lump()` automatically chooses between [fct_lump_n()],
#' [fct_lump_prop()], and [fct_lump_lowfreq()] depending on whether `n`,
#' `prop`, or neither is supplied. It is kept for backward compatibility,
#' but is superseded and no longer recommended for new code.
#'
#' For new code, prefer the more explicit helpers:
#' [fct_lump_min()], [fct_lump_prop()], [fct_lump_n()], and
#' [fct_lump_lowfreq()].
#'
#' @inheritParams fct_lump_min
#' @keywords internal
#' @examples
#' x <- factor(letters[rpois(100, 5)])
#' table(x)
#' table(fct_lump(x, n = 3))
#' @export
fct_lump <- function(
  f,
  n,
  prop,
  w = NULL,
  other_level = "Other",
  ties.method = c("min", "average", "first", "last", "random", "max")
) {
  # Dispatch is decided solely by which of `n` and `prop` the caller supplied.
  has_n <- !missing(n)
  has_prop <- !missing(prop)

  # Supplying both is ambiguous, so it is rejected up front.
  if (has_n && has_prop) {
    cli::cli_abort("Must supply only one of {.arg n} and {.arg prop}.")
  }

  if (has_n) {
    return(
      fct_lump_n(
        f,
        n,
        w = w,
        other_level = other_level,
        ties.method = ties.method
      )
    )
  }

  if (has_prop) {
    return(fct_lump_prop(f, prop, w = w, other_level = other_level))
  }

  # Neither `n` nor `prop` given: fall back to the low-frequency rule.
  fct_lump_lowfreq(f, w = w, other_level = other_level)
}
#' Lump uncommon factor levels together into "other"
#'
#' @description
#' A family of functions to lump together levels based on different criteria:
#'
#' * `fct_lump_min()`: lumps levels that appear fewer than `min` times.
#' * `fct_lump_prop()`: lumps levels that appear `prop * n` times or fewer,
#' where `n` is the total number of values (or the sum of `w`).
#' * `fct_lump_n()`: lumps all levels except for the `n` most frequent
#' (or least frequent if `n < 0`).
#' * `fct_lump_lowfreq()` lumps together the least frequent levels, ensuring
#' that "other" is still the smallest level.
#'
#' @param f A factor (or character vector).
#' @param n Positive `n` preserves the most common `n` values.
#' Negative `n` preserves the least common `-n` values.
#' If there are ties, you will get at least `abs(n)` values.
#' @param prop Positive `prop` lumps values which appear `prop` of the time
#' or less. Negative `prop` lumps values which appear more than `-prop`
#' of the time.
#' @param min Preserve levels that appear at least `min` number of times.
#' @param w An optional numeric vector giving weights for frequency of
#' each value (not level) in `f`.
#' @param other_level Value of level used for "other" values. Always
#' placed at end of levels.
#' @param ties.method A character string specifying how ties are
#' treated. See [rank()] for details.
#'
#' @name fct_lump_helpers
#' @seealso [fct_other()] to convert specified levels to other.
#' @examples
#' x <- factor(rep(LETTERS[1:9], times = c(40, 10, 5, 27, 1, 1, 1, 1, 1)))
#' x |> table()
#' x |>
#' fct_lump_n(3) |>
#' table()
#' x |>
#' fct_lump_prop(0.10) |>
#' table()
#' x |>
#' fct_lump_min(5) |>
#' table()
#' x |>
#' fct_lump_lowfreq() |>
#' table()
NULL
#' @export
#' @rdname fct_lump_helpers
fct_lump_min <- function(f, min, w = NULL, other_level = "Other") {
  # Argument validation
  f <- check_factor(f)
  check_number_decimal(min, min = 0)
  check_string(other_level, allow_na = TRUE)

  # A level is kept when its (optionally weighted) total reaches `min`
  keep <- compute_weights(f, w) >= min
  lvls_other(f, keep, other_level)
}
#' @export
#' @rdname fct_lump_helpers
fct_lump_prop <- function(f, prop, w = NULL, other_level = "Other") {
  # Argument validation
  f <- check_factor(f)
  check_number_decimal(prop)
  check_string(other_level, allow_na = TRUE)

  level_totals <- compute_weights(f, w)

  # The denominator covers every value of `f`, NAs included: the number of
  # values when unweighted, otherwise the sum of all weights.
  total <- if (is.null(w)) length(f) else sum(w)
  share <- level_totals / total

  # Positive `prop` keeps levels strictly above the threshold; negative
  # `prop` keeps levels at or below `-prop`.
  keep <- if (prop < 0) share <= -prop else share > prop
  lvls_other(f, keep, other_level)
}
#' @export
#' @rdname fct_lump_helpers
fct_lump_n <- function(
  f,
  n,
  w = NULL,
  other_level = "Other",
  ties.method = c("min", "average", "first", "last", "random", "max")
) {
  # Argument validation
  f <- check_factor(f)
  check_number_decimal(n)
  check_string(other_level, allow_na = TRUE)
  ties.method <- arg_match(ties.method)

  level_totals <- compute_weights(f, w)

  # rank() gives rank 1 to the smallest value. For non-negative `n` the
  # totals are negated so the most frequent level ranks first; for negative
  # `n` the raw totals are ranked so the least frequent level ranks first.
  if (n < 0) {
    position <- rank(level_totals, ties.method = ties.method)
  } else {
    position <- rank(-level_totals, ties.method = ties.method)
  }

  # Keep the first `abs(n)` ranks
  lvls_other(f, position <= abs(n), other_level)
}
#' @export
#' @rdname fct_lump_helpers
fct_lump_lowfreq <- function(f, w = NULL, other_level = "Other") {
  # Argument validation
  f <- check_factor(f)
  check_string(other_level, allow_na = TRUE)

  # Flag the levels that in_smallest() assigns to the lumped tail and keep
  # everything else.
  level_totals <- compute_weights(f, w)
  lumped <- in_smallest(level_totals)
  lvls_other(f, !lumped, other_level)
}
# helpers -----------------------------------------------------------------
compute_weights <- function(f, w = NULL, call = caller_env()) {
  # Returns one total per level of `f`, in tapply() order, without names.
  w <- check_weights(w, length(f), call = call)

  # With no weights supplied, every value counts once
  if (is.null(w)) {
    w <- rep(1L, length(f))
  }

  totals <- as.vector(tapply(w, f, sum))

  # tapply() yields NA for levels with no values; treat those as zero
  totals[is.na(totals)] <- 0
  totals
}
# Find where the lumped tail starts in a vector of counts `x`, which is
# assumed to be sorted in descending order. Walks the groups from the
# largest down and returns the index just after the first group that is
# strictly larger than all later groups combined; those later groups can
# be lumped while "other" stays smaller than that group. Returns
# `length(x) + 1` (nothing to lump) when no such group exists.
lump_cutoff <- function(x) {
  n_groups <- length(x)
  remaining <- sum(x)
  i <- 0L

  while (i < n_groups) {
    i <- i + 1L
    # Total of all groups that come after group i
    remaining <- remaining - x[i]
    if (x[i] > remaining) {
      return(i + 1)
    }
  }

  n_groups + 1
}
# Given a vector of counts, return a logical vector of the same length
# that is TRUE for the groups falling at or after the lump_cutoff()
# position once the counts are sorted in descending order.
in_smallest <- function(x) {
  by_size <- order(x, decreasing = TRUE)
  cutoff <- lump_cutoff(x[by_size])

  # Flags in sorted order: positions at or past the cutoff are lumped
  lumped_sorted <- seq_along(x) >= cutoff

  # Scatter the flags back to the original positions
  lumped <- logical(length(x))
  lumped[by_size] <- lumped_sorted
  lumped
}
check_weights <- function(w, n = length(w), call = caller_env()) {
  # Validate an optional weight vector. Returns `w` unchanged when it is
  # NULL or passes every check; otherwise aborts with `call` as the
  # reported call. The locals `w`, `n` and `probs` are interpolated by name
  # in the messages below.

  # NULL means "no weights": nothing to validate
  if (is.null(w)) {
    return(w)
  }

  if (!is.numeric(w)) {
    cli::cli_abort(
      "{.arg w} must be a numeric vector, not {.obj_type_friendly w}.",
      call = call
    )
  }

  if (length(w) != n) {
    cli::cli_abort(
      "{.arg w} must be the same length as {.arg f} ({n}), not length {length(w)}.",
      call = call
    )
  }

  # Positions holding a negative or missing weight
  probs <- which(w < 0 | is.na(w))
  if (length(probs) > 0) {
    cli::cli_abort(
      c(
        "All {.arg w} must be non-negative and non-missing.",
        "{length(probs)} problem{?s} at positions {probs}."
      ),
      call = call
    )
  }

  w
}