Permalink
439 lines (377 sloc) 16.4 KB
##' Guess possible date-times formats from a character vector
##'
##' @param x input vector of date-times
##' @param orders format orders to look for. See examples.
##' @param locale locale to use, default to the current locale
##' @param preproc_wday whether to preprocess week days names. Internal
##' optimization used by ymd_hms family of functions. If TRUE, weekdays are
##' substituted with %a or %A accordingly, so that there is no need to supply
##' this format explicitly.
##' @param print_matches for development purpose mainly. If TRUE prints a matrix
##' of matched templates.
##' @return a vector of matched formats
##' @export
##' @examples
##'
##' x <- c('February 20th 1973',
##' "february 14, 2004",
##' "Sunday, May 1, 2000",
##' "Sunday, May 1, 2000",
##' "february 14, 04",
##' 'Feb 20th 73',
##' "January 5 1999 at 7pm",
##' "jan 3 2010",
##' "Jan 1, 1999",
##' "jan 3 10",
##' "01 3 2010",
##' "1 3 10",
##' '1 13 89',
##' "5/27/1979",
##' "12/31/99",
##' "DOB:12/11/00",
##' "-----------",
##' 'Thu, 1 July 2004 22:30:00',
##' 'Thu, 1st of July 2004 at 22:30:00',
##' 'Thu, 1July 2004 at 22:30:00',
##' 'Thu, 1July2004 22:30:00',
##' 'Thu, 1July04 22:30:00',
##' "21 Aug 2011, 11:15:34 pm",
##' "-----------",
##' "1979-05-27 05:00:59",
##' "1979-05-27",
##' "-----------",
##' "3 jan 2000",
##' "17 april 85",
##' "27/5/1979",
##' '20 01 89',
##' '00/13/10',
##' "-------",
##' "14 12 00",
##' "03:23:22 pm")
##'
##' guess_formats(x, "BdY")
##' guess_formats(x, "Bdy")
##' ## m also matches b and B; y also matches Y
##' guess_formats(x, "mdy", print_matches = TRUE)
##'
##' ## T also matches IMSp order
##' guess_formats(x, "T", print_matches = TRUE)
##'
##' ## b and B are equivalent and match, both, abreviated and full names
##' guess_formats(x, c("mdY", "BdY", "Bdy", "bdY", "bdy"), print_matches = TRUE)
##' guess_formats(x, c("dmy", "dbY", "dBy", "dBY"), print_matches = TRUE)
##'
##'
##' guess_formats(x, c("dBY HMS", "dbY HMS", "dmyHMS", "BdY H"), print_matches = TRUE)
##'
##' guess_formats(x, c("ymd HMS"), print_matches = TRUE)
##'
guess_formats <- function(x, orders, locale = Sys.getlocale("LC_TIME"),
preproc_wday = TRUE, print_matches = FALSE) {
## remove all separators
orders <- gsub("[^[:alpha:]]+", "", orders)
if (any(grepl("hms|hm|ms", orders))) {
.deprecated("hms, hm and ms usage", ", please use HMS, HM or MS instead", "1.5.6")
orders <- gsub("hms", "HMS", orders, ignore.case = TRUE)
orders <- gsub("hm", "HM", orders, ignore.case = TRUE)
orders <- gsub("ms", "MS", orders, ignore.case = TRUE)
}
## redirect some formats to C parser (using perl's lookbehind)
if (length(wp <- grepl("(?<!O)p", orders, perl = T)))
orders <- c(sub("p", "Op", orders[wp], fixed = T), orders)
if (length(wm <- grepl("(?<!O)m", orders, perl = T)))
orders <- c(sub("m", "Om", orders[wm]), orders)
if (length(wm <- grepl("(?<!O)[bB]", orders, perl = T)))
orders <- c(sub("b", "Ob", orders[wm]), orders)
if (length(wT <- grepl("T", orders, fixed = T)))
orders <- c(sub("T", "HMSOp", orders[wT], fixed = T), orders)
if (length(wR <- grepl("R", orders, fixed = T)))
orders <- c(sub("R", "HMOp", orders[wR], fixed = T), orders)
if (length(wr <- grepl("r", orders, fixed = T)))
orders <- c(sub("r", "HOp", orders[wR], fixed = T), orders)
## We split into characterst first and then paste together formats that start
## with O. If perl style lookahead would have worked we wouldn't need this,
## but it doesn't.
osplits <- strsplit(orders, "", fixed = TRUE)
osplits <- lapply(osplits,
function(ospt) {
if (length(which_O <- which(ospt == "O")) > 0) {
ospt[which_O + 1] <- paste("O", ospt[which_O + 1], sep = "")
ospt[-which_O]
} else
ospt
})
reg <- .get_locale_regs(locale)
flex_regs <- c(reg$alpha_flex, reg$num_flex, .c_parser_reg_flex)
exact_regs <- c(reg$alpha_exact, reg$num_exact, .c_parser_reg_exact)
REGS <- unlist(lapply(osplits, function(fnames) {
## fnames are names of smalest valid formats, like a, A, b, z, OS, OZ ...
which <- !fnames %in% c(names(reg$alpha_flex),
names(reg$num_flex),
names(.c_parser_reg_exact))
if (any(which))
stop("Unknown formats supplied: ", paste(fnames[ which ], sep = ", "))
## restriction: no numbers before or after
## fixme: using \\D*? because \\D+ doesn't work in flex match, why?
paste("^\\D*?\\b((",
paste(unlist(flex_regs[fnames]), collapse = "\\D*?"),
")|(",
paste(unlist(exact_regs[fnames]), collapse = "\\D*?"),
"))\\D*$", sep = "")
}))
## print debugging info
if (print_matches) {
subs <- lapply(REGS, .substitute_formats, x, fmts_only = FALSE)
names(subs) <- orders
print(do.call(cbind, c(list(x), subs)))
}
.build_formats <- function(regs, orders, x) {
out <- mapply(
function(reg, name) {
out <- .substitute_formats(reg, x)
if (!is.null(out)) names(out) <- rep.int(name, length(out))
out
}, REGS, orders, SIMPLIFY = F, USE.NAMES = F)
names(out) <- NULL
unlist(out)
}
if (preproc_wday && !any(grepl("[aA]", orders))) {
## replace short/long weak days in current locale
x2 <- gsub(reg$alpha_exact[["A"]], "%A", x, ignore.case = T, perl = T)
x2 <- gsub(reg$alpha_exact[["a"]], "%a", x2, ignore.case = T, perl = T)
formats <- .build_formats(REGS, orders, x2)
## In some locales abreviated day (italian "gio") is part of month name
## ("maggio"). So check if %a or %A is present and append wday-less formats.
if (any(grepl("%[aA]", formats)))
c(formats, .build_formats(REGS, orders, x))
else
formats
} else {
.build_formats(REGS, orders, x)
}
}
.substitute_formats <- function(reg, x, fmts_only = TRUE) {
## Take date X and substitute year with %Y/%y, month with %B/%b etc.
## Return the formatted string if REG matched, or null otherwise.
## REG should be with captures as build by .build_locale_regs
## Captures should start with the strptyme formats. For example Y_e, B_m_e
## Traling _e, _m_e are removed.
## fmts_only == FALSE is for debugging purpose only
m <- regexpr(reg, x, ignore.case = TRUE, perl = TRUE)
## print(regs[[1]])
matched <- m > 0
if (any(matched)) {
nms <- attr(m, "capture.names")
nms <- nms[nzchar(nms)]
## e <- grepl("_e", nms, fixed = TRUE)
## nms_e <- nms[e]
## nms <- nms[!e]
start <- attr(m, "capture.start")[matched, , drop = FALSE]
end <- start + attr(m, "capture.length")[matched, , drop = FALSE] - 1L
lout <- x[matched]
for (n in rev(nms)) { ## start from the end
w <- end[, n] > 0 ## -1 if unmatched subpatern
str_sub(lout[w], start[w, n], end[w, n]) <- paste("%", gsub("_.*$", "", n), sep = "")
}
if (fmts_only)
lout
else {
## developer
out <- character(length(x)) ## dev only
out[matched] <- lout
out
}
} else if (fmts_only)
NULL
else character(length(x))
}
.enclose <- function(fmts)
paste("@", fmts, "@", sep = "")
.enclosed.na <- function(x)
x == "@NA@"
.get_train_set <- function(x) {
## the best irregular guesser I could come up with
x <- x[!.enclosed.na(x)]
len <- length(x)
if (len < 100)
x
else if (len < 3571)
x[.primes[.primes <= length(x) ] ]
else
x[ .primes * (length(x) %/% 3571) ] # 501 primes
}
.train_formats <- function(x, formats, locale) {
## return a numeric vector of size length(formats), with each element giving
## the number of matched elements in X
## can return NULL if formats is NULL
trials <- lapply(formats, function(fmt) .strptime(x, fmt, locale = locale))
successes <- unlist(lapply(trials, function(x) sum(!is.na(x))), use.names = FALSE)
names(successes) <- formats
sort(successes, decreasing = TRUE)
}
.best_formats <- function(x, orders, locale, .select_formats, drop = FALSE) {
## return a vector of formats that matched X at least once.
## Can be zero length vector, if none matched
fmts <- unique(guess_formats(x, orders, locale = locale, preproc_wday = TRUE)) # orders as names
if (length(fmts)) {
trained <- .train_formats(x, fmts, locale = locale)
if (drop)
trained <- trained[ trained > 0 ]
.select_formats(trained, drop)
}
}
.select_formats <- function(trained, drop = FALSE) {
nms <- names(trained)
score <-
nchar(gsub("[^%]", "", nms)) + ## longer formats have priority
grepl("%Y", nms, fixed = T)*1.5 +
grepl("%y(?!%)", nms, perl = T)*1.6 + ## y has priority over Y, but only when not followed by %
grepl("%[Bb]", nms)*.31 + ## B/b format has priority over %Om
## C parser formats have higher priority
grepl("%Om", nms)*.3 +
grepl("%Op", nms)*.3 +
grepl("%Ob", nms)*.32 ## Ob has higher priority than B/b
## ties are broken by `trained`
n0 <- trained != 0
if (drop) {
score <- score[n0]
trained <- trained[n0]
} else {
score[!n0] <- -100
}
## print(rbind(trained, score))
names(trained)[order(score, trained, decreasing = T)]
}
## These are formats that are effectively matched by c parser. But we must get
## through the format guesser first for ymd_hms family.
.c_parser_reg_flex <- list(Op = "(?<Op>(AM|PM))(?![[:alpha:]])",
Om = "((?<Om>1[0-2]|0?[1-9](?!\\d))|(((?<Om_b>Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)|(?<Om_B>January|February|March|April|May|June|July|August|September|October|November|December))(?![[:alpha:]])))",
Ob = "(((?<Ob_b>Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)|(?<Ob_B>January|February|March|April|May|June|July|August|September|October|November|December))(?![[:alpha:]]))")
.c_parser_reg_exact <- list(Op = "(?<Op_e>AM|PM)(?![[:alpha:]])",
Om = "((?<Om_e>1[0-2]|0[1-9])|(((?<Om_b_e>Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)|(?<Om_B_e>January|February|March|April|May|June|July|August|September|October|November|December))(?![[:alpha:]])))",
Ob = "(((?<Ob_b_e>Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)|(?<Ob_B_e>January|February|March|April|May|June|July|August|September|October|November|December))(?![[:alpha:]]))")
.locale_reg_cache <- new.env(hash = FALSE)
.get_locale_regs <- function(locale = Sys.getlocale("LC_TIME")) {
## build locale specific regexps for all posible orders
if (exists(locale, envir = .locale_reg_cache, inherits = FALSE))
return(get(locale, envir = .locale_reg_cache))
orig_locale <- Sys.getlocale("LC_TIME")
Sys.setlocale("LC_TIME", locale)
orig_opt <- options(warn = 5)
on.exit({Sys.setlocale("LC_TIME", orig_locale)
options(orig_opt)})
format <- "%a@%A@%b@%B@%p@"
L <- enc2utf8(unique(format(.date_template, format = format)))
mat <- do.call(rbind, strsplit(L, "@", fixed = TRUE))
mat[] <- gsub("([].|(){^$*+?[])", "\\\\\\1", mat) ## escaping all meta chars
names <- colnames(mat) <- strsplit(format, "[%@]+")[[1]][-1L]
## Captures should be unique. Thus we build captures with the following
## rule. Capture starts with the name of strptime format (B, b, y etc) It ends
## with _e if the expression is an exact match (as oposed to flex), see below.
## It can contain _x where x is a main format in which this format
## occurs. For example <B_b_e> is an exact capture in the strptime format B
## but that also matches b in lubridate. Try lubridate:::.get_loc_regs()
## todo: elaborate
## ALPHABETIC FORMATS
alpha <- list()
alpha["b"] <-
sprintf("((?<b_b>%s)|(?<B_b>%s))(?![[:alpha:]])",
paste(unique(mat[, "b"]), collapse = "|"),
paste(unique(mat[, "B"]), collapse = "|"))
alpha["B"] <-
sprintf("(?<B_B>%s)(?![[:alpha:]])",
paste(unique(mat[, "B"]), collapse = "|"))
alpha["a"] <-
sprintf("((?<a_a>%s)|(?<A_a>%s))(?![[:alpha:]])",
paste(unique(mat[, "a"]), collapse = "|"),
paste(unique(mat[, "A"]), collapse = "|"))
alpha["A"] <-
sprintf("(?<A_A>%s)(?![[:alpha:]])",
paste(unique(mat[, "A"]), collapse = "|"))
## just match Z in ISO8601, (UTC zulu format)
alpha["Ou"] <- "(?<Ou_Ou>Z)(?![[:alpha:]])"
p <- unique(mat[, "p"])
p <- p[nzchar(p)]
alpha["p"] <-
if (length(p) == 0L) ""
else sprintf("(?<p>%s)(?![[:alpha:]])", paste(p, collapse = "|"))
alpha <- unlist(alpha)
## NUMERIC FORMATS
num <- num_flex <- num_exact <- c(
d = "(?<d>[012]?[1-9]|3[01]|[12]0)",
q = "(?<q>[0]?[1-4])",
H = "(?<H>2[0-4]|[01]?\\d)",
h = "(?<H>2[0-4]|[01]?\\d)",
I = "(?<I>1[0-2]|0?[1-9])",
j = "(?<j>[0-3]?\\d?\\d)",
M = "(?<M>[0-5]?\\d)",
S = "((?<OS_S>[0-5]?\\d\\.\\d+)|(?<S>[0-6]?\\d))",
s = "((?<OS_S>[0-5]?\\d\\.\\d+)|(?<S>[0-6]?\\d))",
U = "(?<U>[0-5]?\\d)",
w = "(?<w>[0-6])", # merge with a, A??
u = "(?<u>[1-7])",
W = "(?<W>[0-5]?\\d)",
## x = "(?<x>\\d{2}/[01]?\\d/[0-3]?\\d)",
## X = "(?<X>[012]?\\d:[0-5]?\\d:[0-6]?\\d)",
Y = "(?<Y>\\d{4})",
y = "((?<Y_y>\\d{4})|(?<y>\\d{2}))",
Oz = "(?<Oz_Oz>[-+]\\d{4})", ## sptrtime implements only this format (4 digits)
## F = "(?<F>\\d{4)-\\d{2}-\\d{2})",
OO = "(?<OO>[-+]\\d{2}:\\d{2})",
Oo = "(?<Oo>[-+]\\d{2})")
## special lubridate formats
nms <- c("T", "R", "r")
## in some locales %p = ""
if (length(p) == 0L) {
num <- c(num,
T = sprintf("(%s\\D+%s\\D+%s)", num[["H"]], num[["M"]], num[["S"]]),
R = sprintf("(%s\\D+%s)", num[["H"]], num[["M"]]),
r = sprintf("(%s\\D+)", num[["H"]]))
} else {
num <- c(num,
T = sprintf("((%s\\D+%s\\D+%s\\D*%s)|(%s\\D+%s\\D+%s))",
num[["I"]], num[["M"]], num[["S"]], alpha[["p"]], num[["H"]], num[["M"]], num[["S"]]),
R = sprintf("((%s\\D+%s\\D*%s)|(%s\\D+%s))",
num[["I"]], num[["M"]], alpha[["p"]], num[["H"]], num[["M"]]),
r = sprintf("((%s\\D*%s)|%s)",
num[["I"]], alpha[["p"]], num[["H"]]))
num[nms] <- sub("<M", "<M_T",
sub("<S", "<S_T",
sub("<OS", "<OS_T", num[nms])))
}
## don't let special nms be confused with standard formats
num[nms] <- gsub("(<[IMSpHS]|<OS)", "\\1_s", num[nms])
## The difference between flex and exact regexp is that flex should follow
## by non-digit, thus flexible strings like 12-1-2 can be matched. In exact
## regexp a number may not be followed by a non-number. In this case the
## number of digits in a number should be precisely specified. For example
## 120102 is matched by ymd.
##
## *Note*: the exact regexps is build from flex regex by gsubing below. So,
## pay attention when you modify the regexp above.
alpha_flex <- alpha
alpha_exact <- gsub(">", "_e>", alpha, fixed = TRUE)
num_exact <- num_flex <- num
num_flex[] <- sprintf("%s(?!\\d)", num)
num_exact[] <- gsub("(?<!\\()\\?(?!<)", "", perl = T, # remove ?
gsub("+", "*", fixed = T,
gsub(">", "_e>", num))) # append _e to avoid duplicates
num_flex["m"] <- sprintf("((?<m>1[0-2]|0?[1-9](?!\\d))|(%s))", gsub("_[bB]", "\\1_m", alpha[["b"]]))
num_exact["m"] <- sprintf("((?<m_e>1[0-2]|0[1-9])|(%s))", gsub("_[bB]", "\\1_m_e", alpha[["b"]]))
## canoot be in num above because gsub("+", "*") messes it up
num_flex["OS"] <- "(?<OS_f>[0-5]\\d\\.\\d+)"
num_exact["OS"] <- "(?<OS_e>[0-5]\\d\\.\\d+)"
num_flex["z"] <- sprintf("(%s|%s|%s|%s)", alpha_flex[["Ou"]], num_flex[["Oz"]], num_flex[["OO"]], num_flex[["Oo"]])
num_exact["z"] <- sprintf("(%s|%s|%s|%s)", alpha_exact[["Ou"]], num_exact[["Oz"]], num_exact[["OO"]], num_exact[["Oo"]])
wday_order <- order(wday(.date_template, week_start = 7))
wday_names <- list(abr = unique(mat[, "a"][wday_order]),
full = unique(mat[, "A"][wday_order]))
month_order <- order(month(.date_template))
month_names <- list(abr = unique(mat[, "b"][month_order]),
full = unique(mat[, "B"][month_order]))
out <- list(alpha_flex = alpha_flex, num_flex = num_flex,
alpha_exact = alpha_exact, num_exact = num_exact,
wday_names = wday_names, month_names = month_names)
assign(locale, out, envir = .locale_reg_cache)
out
}