Skip to content

Commit

Permalink
Make defaults consistent (#463)
Browse files Browse the repository at this point in the history
Implement %AD and %AT flexible parsers

* And use in the date and time formats in the default locale

* Fix bug in consumeInteger: previously it did not check that exactly n character were consumed

Fixes #442.
  • Loading branch information
hadley committed Jul 13, 2016
1 parent 8d40216 commit 095bd4a
Show file tree
Hide file tree
Showing 10 changed files with 96 additions and 32 deletions.
10 changes: 10 additions & 0 deletions NEWS.md
@@ -1,5 +1,15 @@
# readr 0.2.2.9000

* `%y` and `%Y` are now stricter and require 2 or 4 characters respectively.

* If omitted `col_date()` and `col_time()` (and `parse_date()` and
`parse_time()`) use the date and time formats specified in the locale.

* The default `date_format` and `time_format`s are `%AD` and `%AT` respectively.
These are "automatic" date and time parsers that are moderately flexible.
The automatic date parser is somewhat flexible than it used to be - it
requires a four digit year, and only accepts `-` and `/` as separators (#442).

* Missing colum name names are now given a default name (`X2`, `X7` etc) (#318).
Duplicated column names are now deduplicated. Both changes generate a warning;
to suppress it supply explicit `col_names` (setting skip = 1 if needed).
Expand Down
14 changes: 8 additions & 6 deletions R/collectors.R
Expand Up @@ -183,6 +183,7 @@ col_guess <- function() {
#' @rdname parse_guess
#' @export
guess_parser <- function(x, locale = default_locale()) {
stopifnot(is.locale(locale))
collectorGuess(x, locale)
}

Expand Down Expand Up @@ -241,6 +242,8 @@ col_factor <- function(levels, ordered = FALSE) {
#' \item Non-digits: "\%." skips one non-digit character,
#' "\%+" skips one or more non-digit characters,
#' "\%*" skips any number of non-digits characters.
#' \item Automatic parsers: "\%AD" parses with a flexible YMD parser,
#' "\%AT" parses with a flexible HMS parser.
#' \item Shortcuts: "\%D" = "\%m/\%d/\%y", "\%F" = "\%Y-\%m-\%d",
#' "\%R" = "\%H:\%M", "\%T" = "\%H:\%M:\%S", "\%x" = "\%y/\%m/\%d".
#' }
Expand All @@ -263,10 +266,9 @@ col_factor <- function(levels, ordered = FALSE) {
#' }
#'
#' @param x A character vector of dates to parse.
#' @param format A format specification, as described below. If omitted,
#' parses dates according to the ISO8601 specification (with caveats,
#' as described below). Times are parsed like ISO8601 times, but also
#' accept an optional am/pm specification.
#' @param format A format specification, as described below. If set to "",
#' date times are parsed as ISO8601, dates and times used the date and
#' time formats specified in the \code{\link{locale}}.
#'
#' Unlike \code{\link{strptime}}, the format specification must match
#' the complete string.
Expand Down Expand Up @@ -343,7 +345,7 @@ parse_datetime <- function(x, format = "", na = c("", "NA"), locale = default_lo

#' @rdname parse_datetime
#' @export
parse_date <- function(x, format = "%Y-%m-%d", na = c("", "NA"), locale = default_locale()) {
parse_date <- function(x, format = "", na = c("", "NA"), locale = default_locale()) {
parse_vector(x, col_date(format), na = na, locale = locale)
}

Expand All @@ -361,7 +363,7 @@ col_datetime <- function(format = "") {

#' @rdname parse_datetime
#' @export
col_date <- function(format = NULL) {
col_date <- function(format = "") {
collector("date", format = format)
}

Expand Down
4 changes: 3 additions & 1 deletion R/locale.R
Expand Up @@ -39,7 +39,7 @@
#' # South American locale
#' locale("es", decimal_mark = ",")
locale <- function(date_names = "en",
date_format = "%Y%.%m%.%d", time_format = "%H:%M",
date_format = "%AD", time_format = "%AT",
decimal_mark = ".", grouping_mark = ",",
tz = "UTC", encoding = "UTF-8",
asciify = FALSE) {
Expand Down Expand Up @@ -80,6 +80,8 @@ locale <- function(date_names = "en",
)
}

is.locale <- function(x) inherits(x, "locale")

#' @export
print.locale <- function(x, ...) {
cat("<locale>\n")
Expand Down
6 changes: 3 additions & 3 deletions man/locale.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 7 additions & 7 deletions man/parse_datetime.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 2 additions & 3 deletions src/Collector.cpp
Expand Up @@ -100,8 +100,7 @@ void CollectorDate::setValue(int i, const Token& t) {
std::string std_string(string.first, string.second);

parser_.setDate(std_string.c_str());
bool res = (format_ == "") ? parser_.parseISO8601()
: parser_.parse(format_);
bool res = (format_ == "") ? parser_.parseLocaleDate() : parser_.parse(format_);

if (!res) {
warn(t.row(), t.col(), "date like " + format_, std_string);
Expand Down Expand Up @@ -337,7 +336,7 @@ void CollectorTime::setValue(int i, const Token& t) {
std::string std_string(string.first, string.second);

parser_.setDate(std_string.c_str());
bool res = (format_ == "") ? parser_.parseTime() : parser_.parse(format_);
bool res = (format_ == "") ? parser_.parseLocaleTime() : parser_.parse(format_);

if (!res) {
warn(t.row(), t.col(), "time like " + format_, std_string);
Expand Down
4 changes: 2 additions & 2 deletions src/CollectorGuess.cpp
Expand Up @@ -73,14 +73,14 @@ bool isTime(const std::string& x, LocaleInfo* pLocale) {
DateTimeParser parser(pLocale);

parser.setDate(x.c_str());
return parser.parseTime();
return parser.parseLocaleTime();
}

bool isDate(const std::string& x, LocaleInfo* pLocale) {
DateTimeParser parser(pLocale);

parser.setDate(x.c_str());
return parser.parse(pLocale->dateFormat_);
return parser.parseLocaleDate();
}

static bool isDateTime(const std::string& x, LocaleInfo* pLocale) {
Expand Down
61 changes: 53 additions & 8 deletions src/DateTimeParser.h
Expand Up @@ -77,6 +77,14 @@ class DateTimeParser {
return isComplete();
}

bool parseLocaleTime() {
return parse(pLocale_->timeFormat_);
}

bool parseLocaleDate() {
return parse(pLocale_->dateFormat_);
}

// A flexible time parser for the most common formats
bool parseTime() {
if (!consumeInteger(2, &hour_))
Expand All @@ -95,6 +103,22 @@ class DateTimeParser {
return isComplete();
}

bool parseDate() {
// Date: YYYY-MM-DD, YYYY/MM/DD
if (!consumeInteger(4, &year_))
return false;
if (!consumeThisChar('-') && !consumeThisChar('/'))
return false;
if (!consumeInteger1(2, &mon_))
return false;
if (!consumeThisChar('-') && !consumeThisChar('/'))
return false;
if (!consumeInteger1(2, &day_))
return false;

return isComplete();
}

bool isComplete() {
return dateItr_ == dateEnd_;
}
Expand Down Expand Up @@ -138,7 +162,7 @@ class DateTimeParser {
year_ += (year_ < 69) ? 2000 : 1900;
break;
case 'm': // month
if (!consumeInteger1(2, &mon_))
if (!consumeInteger1(2, &mon_, false))
return false;
break;
case 'b': // abbreviated month name
Expand All @@ -150,19 +174,19 @@ class DateTimeParser {
return false;
break;
case 'd': // day
if (!consumeInteger1(2, &day_))
if (!consumeInteger1(2, &day_, false))
return false;
break;
case 'e': // day with optional leading space
if (!consumeInteger1WithSpace(2, &day_))
return false;
break;
case 'H': // hour
if (!consumeInteger(2, &hour_))
if (!consumeInteger(2, &hour_, false))
return false;
break;
case 'I': // hour
if (!consumeInteger(2, &hour_))
if (!consumeInteger(2, &hour_, false))
return false;
if (hour_ < 1 || hour_ > 12) {
return false;
Expand Down Expand Up @@ -215,6 +239,24 @@ class DateTimeParser {
consumeNonDigits();
break;

case 'A': // auto date / time
if (formatItr + 1 == formatEnd)
Rcpp::stop("Invalid format: %%A must be followed by another letter");
formatItr++;
switch(*formatItr) {
case 'D':
if (!parseDate())
return false;
break;
case 'T':
if (!parseTime())
return false;
break;
default:
Rcpp::stop("Invalid %%A auto parser");
}
break;

// Compound formats
case 'D':
parse("%m/%d/%y");
Expand Down Expand Up @@ -298,17 +340,20 @@ class DateTimeParser {
return false;
}

inline bool consumeInteger(int n, int* pOut) {
inline bool consumeInteger(int n, int* pOut, bool exact = true) {
if (dateItr_ == dateEnd_ || *dateItr_ == '-' || *dateItr_ == '+')
return false;

const char* start = dateItr_;
const char* end = std::min(dateItr_ + n, dateEnd_);
return parseInt(dateItr_, end, *pOut);
bool ok = parseInt(dateItr_, end, *pOut);

return ok && (!exact || (dateItr_ - start) == n);
}

// Integer indexed from 1 (i.e. month and date)
inline bool consumeInteger1(int n, int* pOut) {
if (!consumeInteger(n, pOut))
inline bool consumeInteger1(int n, int* pOut, bool exact = true) {
if (!consumeInteger(n, pOut, exact))
return false;

(*pOut)--;
Expand Down
4 changes: 2 additions & 2 deletions tests/testthat/test-col-spec.R
Expand Up @@ -134,8 +134,8 @@ test_that("print(col_spec) works with dates", {
regex_escape(
"cols(
a = col_date(format = \"%Y-%m-%d\"),
b = col_date(format = NULL),
c = col_date(format = NULL)
b = col_date(format = \"\"),
c = col_date(format = \"\")
)"))
})

Expand Down
6 changes: 6 additions & 0 deletions tests/testthat/test-parsing-datetime.R
Expand Up @@ -56,6 +56,12 @@ test_that("%OS captures partial seconds", {
expect_equal(as.POSIXlt(x)$sec, 1.333, tol = 1e-6)
})

test_that("%y requries 4 digits", {
expect_warning(parse_date("003-01-01", "%Y-%m-%d"), "parsing failure")
expect_warning(parse_date("03-01-01", "%Y-%m-%d"), "parsing failure")
expect_warning(parse_date("00003-01-01", "%Y-%m-%d"), "parsing failure")
})

test_that("invalid dates return NA", {
expect_warning(x <- parse_datetime("2010-02-30", "%Y-%m-%d"))
expect_true(is.na(x))
Expand Down

0 comments on commit 095bd4a

Please sign in to comment.