/
collectors.R
400 lines (369 loc) · 13 KB
/
collectors.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
collector <- function(type, ...) {
  # The fields of the specification are whatever was supplied through `...`.
  spec <- list(...)
  # The type-specific class comes first so S3 dispatch reaches
  # `collector_<type>` methods before the generic "collector" ones.
  class(spec) <- c(paste0("collector_", type), "collector")
  spec
}
is.collector <- function(x) {
  # TRUE when "collector" appears anywhere in the class vector of `x`.
  inherits(x, what = "collector")
}
#' @export
print.collector <- function(x, ...) {
  # Show only the most specific class, wrapped in angle brackets,
  # e.g. <collector_integer>.
  cat("<", class(x)[1], ">\n", sep = "")
  # Print methods conventionally hand back their input invisibly so the
  # object can still be assigned or piped after printing.
  invisible(x)
}
collector_find <- function(name) {
  # A non-missing name is resolved to the `col_<name>` constructor inside the
  # readr namespace, which is then called with no arguments.
  if (!is.na(name)) {
    constructor <- get(paste0("col_", name), envir = asNamespace("readr"))
    return(constructor())
  }
  # A missing name falls back to the character collector.
  col_character()
}
#' Parse a character vector.
#'
#' @param x Character vector of elements to parse.
#' @param collector Column specification.
#' @inheritParams read_delim
#' @inheritParams tokenizer_delim
#' @keywords internal
#' @export
#' @examples
#' x <- c("1", "2", "3", "NA")
#' parse_vector(x, col_integer())
#' parse_vector(x, col_double())
parse_vector <- function(x, collector, na = c("", "NA"), locale = default_locale(), trim_ws = TRUE) {
  # Only character input can be parsed.
  stopifnot(is.character(x))
  # A collector given by name is swapped for the matching specification;
  # anything else is passed through untouched.
  collector <- if (is.character(collector)) collector_find(collector) else collector
  # Parse, then let warn_problems() inspect the parsed result.
  warn_problems(
    parse_vector_(x, collector, na = na, locale_ = locale, trim_ws = trim_ws)
  )
}
#' Parse logicals, integers, and reals
#'
#' Use `parse_*()` if you have a character vector you want to parse. Use
#' `col_*()` in conjunction with a `read_*()` function to parse the
#' values as they're read in.
#'
#' @name parse_atomic
#' @aliases NULL
#' @param x Character vector of values to parse.
#' @inheritParams tokenizer_delim
#' @inheritParams read_delim
#' @family parsers
#' @examples
#' parse_integer(c("1", "2", "3"))
#' parse_double(c("1", "2", "3.123"))
#' parse_number("$1,123,456.00")
#'
#' # Use locale to override default decimal and grouping marks
#' es_MX <- locale("es", decimal_mark = ",")
#' parse_number("$1.123.456,00", locale = es_MX)
#'
#' # Invalid values are replaced with missing values with a warning.
#' x <- c("1", "2", "3", "-")
#' parse_double(x)
#' # Or flag values as missing
#' parse_double(x, na = "-")
NULL
#' @rdname parse_atomic
#' @export
parse_logical <- function(x, na = c("", "NA"), locale = default_locale(), trim_ws = TRUE) {
  # Delegate to the shared vector parser with a logical column specification.
  parse_vector(
    x = x,
    collector = col_logical(),
    na = na,
    locale = locale,
    trim_ws = trim_ws
  )
}
#' @rdname parse_atomic
#' @export
parse_integer <- function(x, na = c("", "NA"), locale = default_locale(), trim_ws = TRUE) {
  # Delegate to the shared vector parser with an integer column specification.
  parse_vector(
    x = x,
    collector = col_integer(),
    na = na,
    locale = locale,
    trim_ws = trim_ws
  )
}
#' @rdname parse_atomic
#' @export
parse_double <- function(x, na = c("", "NA"), locale = default_locale(), trim_ws = TRUE) {
  # Delegate to the shared vector parser with a double column specification.
  parse_vector(
    x = x,
    collector = col_double(),
    na = na,
    locale = locale,
    trim_ws = trim_ws
  )
}
#' @rdname parse_atomic
#' @export
parse_character <- function(x, na = c("", "NA"), locale = default_locale(), trim_ws = TRUE) {
  # Delegate to the shared vector parser with a character column specification.
  parse_vector(
    x = x,
    collector = col_character(),
    na = na,
    locale = locale,
    trim_ws = trim_ws
  )
}
#' @rdname parse_atomic
#' @export
col_logical <- function() {
  # Field-less specification tagged with the "logical" type.
  collector(type = "logical")
}
#' @rdname parse_atomic
#' @export
col_integer <- function() {
  # Field-less specification tagged with the "integer" type.
  collector(type = "integer")
}
#' @rdname parse_atomic
#' @export
col_double <- function() {
  # Field-less specification tagged with the "double" type.
  collector(type = "double")
}
#' @rdname parse_atomic
#' @export
col_character <- function() {
  # Field-less specification tagged with the "character" type.
  collector(type = "character")
}
#' Skip a column
#'
#' Use this function to ignore a column when reading in a file.
#' To skip all columns not otherwise specified, use \code{\link{cols_only}()}.
#'
#' @family parsers
#' @export
col_skip <- function() {
  # Field-less specification tagged with the "skip" type.
  collector(type = "skip")
}
#' Parse numbers, flexibly
#'
#' This drops any non-numeric characters before or after the first number.
#' The grouping mark specified by the locale is ignored inside the number.
#'
#' @inheritParams parse_atomic
#' @inheritParams tokenizer_delim
#' @inheritParams read_delim
#' @family parsers
#' @export
#' @examples
#' parse_number("$1000")
#' parse_number("1,234,567.78")
parse_number <- function(x, na = c("", "NA"), locale = default_locale(), trim_ws = TRUE) {
  # Delegate to the shared vector parser with a number column specification.
  parse_vector(
    x = x,
    collector = col_number(),
    na = na,
    locale = locale,
    trim_ws = trim_ws
  )
}
#' @rdname parse_number
#' @export
col_number <- function() {
  # Field-less specification tagged with the "number" type.
  collector(type = "number")
}
#' Parse using the "best" type
#'
#' `parse_guess()` returns the parsed vector; `guess_parser()`
#' returns the name of the parser. These functions use a number of heuristics
#' to determine which type of vector is "best". Generally they try to err on
#' the side of safety, as it's straightforward to override the parsing choice
#' if needed.
#'
#' @inheritParams parse_atomic
#' @inheritParams tokenizer_delim
#' @inheritParams read_delim
#' @family parsers
#' @export
#' @examples
#' # Logical vectors
#' parse_guess(c("FALSE", "TRUE", "F", "T"))
#'
#' # Integers and doubles
#' parse_guess(c("1","2","3"))
#' parse_guess(c("1.6","2.6","3.4"))
#'
#' # Numbers containing grouping mark
#' guess_parser("1,234,566")
#' parse_guess("1,234,566")
#'
#' # ISO 8601 date times
#' guess_parser(c("2010-10-10"))
#' parse_guess(c("2010-10-10"))
parse_guess <- function(x, na = c("", "NA"), locale = default_locale(), trim_ws = TRUE) {
  # The collector is whatever guess_parser() picks for `x` under `locale`;
  # parse_vector() resolves it if it comes back as a parser name.
  parse_vector(
    x = x,
    collector = guess_parser(x, locale),
    na = na,
    locale = locale,
    trim_ws = trim_ws
  )
}
#' @rdname parse_guess
#' @export
col_guess <- function() {
  # Field-less specification tagged with the "guess" type.
  collector(type = "guess")
}
#' @rdname parse_guess
#' @export
guess_parser <- function(x, locale = default_locale()) {
  # Refuse anything that is not a locale object before guessing.
  stopifnot(is.locale(locale))
  # The guess itself is made by collectorGuess().
  guessed <- collectorGuess(x, locale)
  guessed
}
#' Parse factors
#'
#' `parse_factor` is similar to [factor()], but will generate
#' warnings if elements of `x` are not found in `levels`.
#'
#' @param levels Character vector providing the set of allowed levels. If `NULL`,
#' will generate levels based on the unique values of `x`, ordered by order
#' of appearance in `x`.
#' @param ordered Is it an ordered factor?
#' @param include_na If `NA` values are present, include `NA` as an explicit factor level?
#' @inheritParams parse_atomic
#' @inheritParams tokenizer_delim
#' @inheritParams read_delim
#' @family parsers
#' @export
#' @examples
#' parse_factor(c("a", "b"), letters)
#'
#' x <- c("cat", "dog", "caw")
#' levels <- c("cat", "dog", "cow")
#'
#' # Base R factor() silently converts unknown levels to NA
#' x1 <- factor(x, levels)
#'
#' # parse_factor generates a warning & problems
#' x2 <- parse_factor(x, levels)
#'
#' # Using an argument of `NULL` will generate levels based on values of `x`
#' x2 <- parse_factor(x, levels = NULL)
parse_factor <- function(x, levels = NULL, ordered = FALSE, na = c("", "NA"),
                         locale = default_locale(), include_na = TRUE, trim_ws = TRUE) {
  # `levels` defaults to NULL, the documented "generate levels from the unique
  # values of `x`" setting, so callers no longer have to spell it out.
  # Existing calls that pass `levels` explicitly are unaffected.
  parse_vector(
    x,
    col_factor(levels, ordered, include_na),
    na = na,
    locale = locale,
    trim_ws = trim_ws
  )
}
#' @rdname parse_factor
#' @export
col_factor <- function(levels = NULL, ordered = FALSE, include_na = FALSE) {
  # `levels` defaults to NULL (documented as "generate levels from the data"),
  # so `col_factor()` can be called without arguments instead of failing on a
  # missing `levels`. The three settings are stored as fields of the
  # "factor" specification.
  collector(
    type = "factor",
    levels = levels,
    ordered = ordered,
    include_na = include_na
  )
}
# More complex ------------------------------------------------------------
#' Parse date/times
#'
#' @section Format specification:
#' `readr` uses a format specification similar to [strptime()].
#' There are three types of element:
#'
#' \enumerate{
#' \item Date components are specified with "\%" followed by a letter.
#' For example "\%Y" matches a 4 digit year, "\%m", matches a 2 digit
#' month and "\%d" matches a 2 digit day. Month and day default to `1`,
#' (i.e. Jan 1st) if not present, for example if only a year is given.
#' \item Whitespace is any sequence of zero or more whitespace characters.
#' \item Any other character is matched exactly.
#' }
#'
#' `parse_datetime()` recognises the following format specifications:
#' \itemize{
#' \item Year: "\%Y" (4 digits). "\%y" (2 digits); 00-69 -> 2000-2069,
#' 70-99 -> 1970-1999.
#' \item Month: "\%m" (2 digits), "\%b" (abbreviated name in current
#' locale), "\%B" (full name in current locale).
#' \item Day: "\%d" (2 digits), "\%e" (optional leading space),
#' "\%a" (abbreviated name in current locale).
#' \item Hour: "\%H" or "\%I", use I (and not H) with AM/PM.
#' \item Minutes: "\%M"
#' \item Seconds: "\%S" (integer seconds), "\%OS" (partial seconds)
#' \item Time zone: "\%Z" (as name, e.g. "America/Chicago"), "\%z" (as
#' offset from UTC, e.g. "+0800")
#' \item AM/PM indicator: "\%p".
#' \item Non-digits: "\%." skips one non-digit character,
#' "\%+" skips one or more non-digit characters,
#' "\%*" skips any number of non-digit characters.
#' \item Automatic parsers: "\%AD" parses with a flexible YMD parser,
#' "\%AT" parses with a flexible HMS parser.
#' \item Shortcuts: "\%D" = "\%m/\%d/\%y", "\%F" = "\%Y-\%m-\%d",
#' "\%R" = "\%H:\%M", "\%T" = "\%H:\%M:\%S", "\%x" = "\%y/\%m/\%d".
#' }
#'
#' @section ISO8601 support:
#'
#' Currently, readr does not support all of ISO8601. Missing features:
#'
#' \itemize{
#' \item Week & weekday specifications, e.g. "2013-W05", "2013-W05-10"
#' \item Ordinal dates, e.g. "2013-095".
#' \item Using commas instead of a period for decimal separator
#' }
#'
#' The parser is also a little laxer than ISO8601:
#'
#' \itemize{
#' \item Dates and times can be separated with a space, not just T.
#' \item Mostly correct specifications like "2009-05-19 14:" and "200912-01" work.
#' }
#'
#' @param x A character vector of dates to parse.
#' @param format A format specification, as described below. If set to "",
#' date times are parsed as ISO8601, dates and times use the date and
#' time formats specified in the [locale()].
#'
#' Unlike [strptime()], the format specification must match
#' the complete string.
#' @inheritParams read_delim
#' @inheritParams tokenizer_delim
#' @return A [POSIXct()] vector with `tzone` attribute set to
#' `tz`. Elements that could not be parsed (or did not generate valid
#' dates) will be set to `NA`, and a warning message will inform
#' you of the total number of failures.
#' @family parsers
#' @export
#' @examples
#' # Format strings --------------------------------------------------------
#' parse_datetime("01/02/2010", "%d/%m/%Y")
#' parse_datetime("01/02/2010", "%m/%d/%Y")
#' # Handle any separator
#' parse_datetime("01/02/2010", "%m%.%d%.%Y")
#'
#' # Dates look the same, but internally they use the number of days since
#' # 1970-01-01 instead of the number of seconds. This avoids a whole lot
#' # of troubles related to time zones, so use if you can.
#' parse_date("01/02/2010", "%d/%m/%Y")
#' parse_date("01/02/2010", "%m/%d/%Y")
#'
#' # You can parse timezones from strings (as listed in OlsonNames())
#' parse_datetime("2010/01/01 12:00 US/Central", "%Y/%m/%d %H:%M %Z")
#' # Or from offsets
#' parse_datetime("2010/01/01 12:00 -0600", "%Y/%m/%d %H:%M %z")
#'
#' # Use the locale parameter to control the default time zone
#' # (but note UTC is considerably faster than other options)
#' parse_datetime("2010/01/01 12:00", "%Y/%m/%d %H:%M",
#' locale = locale(tz = "US/Central"))
#' parse_datetime("2010/01/01 12:00", "%Y/%m/%d %H:%M",
#' locale = locale(tz = "US/Eastern"))
#'
#' # Unlike strptime, the format specification must match the complete
#' # string (ignoring leading and trailing whitespace). This avoids common
#' # errors:
#' strptime("01/02/2010", "%d/%m/%y")
#' parse_datetime("01/02/2010", "%d/%m/%y")
#'
#' # Failures -------------------------------------------------------------
#' parse_datetime("01/01/2010", "%d/%m/%Y")
#' parse_datetime(c("01/ab/2010", "32/01/2010"), "%d/%m/%Y")
#'
#' # Locales --------------------------------------------------------------
#' # By default, readr expects English date/times, but that's easy to change
#' parse_datetime("1 janvier 2015", "%d %B %Y", locale = locale("fr"))
#' parse_datetime("1 enero 2015", "%d %B %Y", locale = locale("es"))
#'
#' # ISO8601 --------------------------------------------------------------
#' # With separators
#' parse_datetime("1979-10-14")
#' parse_datetime("1979-10-14T10")
#' parse_datetime("1979-10-14T10:11")
#' parse_datetime("1979-10-14T10:11:12")
#' parse_datetime("1979-10-14T10:11:12.12345")
#'
#' # Without separators
#' parse_datetime("19791014")
#' parse_datetime("19791014T101112")
#'
#' # Time zones
#' us_central <- locale(tz = "US/Central")
#' parse_datetime("1979-10-14T1010", locale = us_central)
#' parse_datetime("1979-10-14T1010-0500", locale = us_central)
#' parse_datetime("1979-10-14T1010Z", locale = us_central)
#' # Your current time zone
#' parse_datetime("1979-10-14T1010", locale = locale(tz = ""))
parse_datetime <- function(x, format = "", na = c("", "NA"), locale = default_locale(), trim_ws = TRUE) {
  # Wrap `format` in a datetime specification and run the shared vector parser.
  parse_vector(
    x = x,
    collector = col_datetime(format),
    na = na,
    locale = locale,
    trim_ws = trim_ws
  )
}
#' @rdname parse_datetime
#' @export
parse_date <- function(x, format = "", na = c("", "NA"), locale = default_locale(), trim_ws = TRUE) {
  # Wrap `format` in a date specification and run the shared vector parser.
  parse_vector(
    x = x,
    collector = col_date(format),
    na = na,
    locale = locale,
    trim_ws = trim_ws
  )
}
#' @rdname parse_datetime
#' @export
parse_time <- function(x, format = "", na = c("", "NA"), locale = default_locale(), trim_ws = TRUE) {
  # Wrap `format` in a time specification and run the shared vector parser.
  parse_vector(
    x = x,
    collector = col_time(format),
    na = na,
    locale = locale,
    trim_ws = trim_ws
  )
}
#' @rdname parse_datetime
#' @export
col_datetime <- function(format = "") {
  # "datetime" specification carrying the format string as its only field.
  collector(type = "datetime", format = format)
}
#' @rdname parse_datetime
#' @export
col_date <- function(format = "") {
  # "date" specification carrying the format string as its only field.
  collector(type = "date", format = format)
}
#' @rdname parse_datetime
#' @export
col_time <- function(format = "") {
  # "time" specification carrying the format string as its only field.
  collector(type = "time", format = format)
}