From f05d4ce31e7a06da5c7d731d9aa5fbd7212eb398 Mon Sep 17 00:00:00 2001
From: hadley
Date: Thu, 3 Sep 2015 15:17:05 -0500
Subject: [PATCH] Selectively trim_ws. Parse NA in parse_vector. Fixes #137

---
 NEWS.md                                 |  6 ++++
 R/RcppExports.R                         |  4 +--
 R/collectors.R                          | 38 ++++++++++++++-----------
 R/read_delim.R                          | 14 +++++----
 R/tokenizer.R                           | 13 +++++++--
 R/type_convert.R                        |  3 ++
 man/Tokenizers.Rd                       |  9 ++++--
 man/collector.Rd                        | 16 ++++++-----
 man/parse_vector.Rd                     |  8 ++++--
 man/read_delim.Rd                       |  9 ++++--
 man/type_convert.Rd                     |  3 ++
 src/RcppExports.cpp                     |  7 +++--
 src/Tokenizer.cpp                       |  3 +-
 src/TokenizerDelim.h                    | 18 ++++++++----
 src/parse.cpp                           |  5 +++-
 tests/testthat/test-parsing-character.R | 12 ++++++++
 tests/testthat/test-parsing-numeric.R   |  9 ++++++
 17 files changed, 124 insertions(+), 53 deletions(-)
 create mode 100644 tests/testthat/test-parsing-character.R

diff --git a/NEWS.md b/NEWS.md
index 558682cc..0a9fc2d5 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,11 @@
 # readr 0.1.1.9000

+* `read_delim()` etc gains a `trim_ws` argument that controls whether leading
+  and trailing whitespace is automatically removed. It defaults to `TRUE`. (#137)
+
+* `parse_*` gain a `na` argument that allows you to specify which values should
+  be converted to missing.
+
 * Month and abbreviated month name formats (`%b` and `%B`) now ignore case
   when matching (#219).

diff --git a/R/RcppExports.R b/R/RcppExports.R
index 2969b137..b943c21c 100644
--- a/R/RcppExports.R
+++ b/R/RcppExports.R
@@ -29,8 +29,8 @@ tokenize_ <- function(sourceSpec, tokenizerSpec, n_max) {
     .Call('readr_tokenize_', PACKAGE = 'readr', sourceSpec, tokenizerSpec, n_max)
 }

-parse_vector_ <- function(x, collectorSpec) {
-    .Call('readr_parse_vector_', PACKAGE = 'readr', x, collectorSpec)
+parse_vector_ <- function(x, collectorSpec, na) {
+    .Call('readr_parse_vector_', PACKAGE = 'readr', x, collectorSpec, na)
 }

 read_file_ <- function(sourceSpec) {
diff --git a/R/collectors.R b/R/collectors.R
index 17556564..fefd4a45 100644
--- a/R/collectors.R
+++ b/R/collectors.R
@@ -15,10 +15,11 @@ collector_find <- function(name) {
 #'
 #' @param x Character vector of elements to parse.
 #' @param collector Column specification.
+#' @param na Character vector giving strings to treat as missing.
 #' @keywords internal
 #' @export
 #' @examples
-#' x <- c("1", "2", "3", NA)
+#' x <- c("1", "2", "3", "NA")
 #' parse_vector(x, col_integer())
 #' parse_vector(x, col_double())
 #' parse_vector(x, col_character())
@@ -27,8 +28,10 @@
 #' # Invalid values are replaced with missing values with a warning.
 #' x <- c("1", "2", "3", "-")
 #' parse_vector(x, col_double())
-parse_vector <- function(x, collector) {
-  warn_problems(parse_vector_(x, collector))
+#' # Or flag values as missing
+#' parse_vector(x, col_double(), na = "-")
+parse_vector <- function(x, collector, na = "NA") {
+  warn_problems(parse_vector_(x, collector, na = na))
 }

 #' Parse character vectors into typed columns.
@@ -39,6 +42,7 @@
 #'
 #' @name collector
 #' @param x Character vector of values to parse.
+#' @param na Character vector giving strings to treat as missing.
 #' @seealso \code{\link{parse_datetime}}, \code{\link{type_convert}} to
 #'   automatically re-parse all character columns in a data frame.
 #' @examples
@@ -62,8 +66,8 @@ col_character <- function() {

 #' @rdname collector
 #' @export
-parse_character <- function(x) {
-  parse_vector(x, col_character())
+parse_character <- function(x, na = c("", "NA")) {
+  parse_vector(x, col_character(), na = na)
 }

 #' @rdname collector
@@ -74,8 +78,8 @@ col_integer <- function() {

 #' @rdname collector
 #' @export
-parse_integer <- function(x) {
-  parse_vector(x, col_integer())
+parse_integer <- function(x, na = c("", "NA")) {
+  parse_vector(x, col_integer(), na = na)
 }

 #' @rdname collector
@@ -86,8 +90,8 @@ col_double <- function() {

 #' @rdname collector
 #' @export
-parse_double <- function(x) {
-  parse_vector(x, col_double())
+parse_double <- function(x, na = c("", "NA")) {
+  parse_vector(x, col_double(), na = na)
 }

 #' @rdname collector
@@ -98,8 +102,8 @@ col_euro_double <- function() {

 #' @rdname collector
 #' @export
-parse_euro_double <- function(x) {
-  parse_vector(x, col_euro_double())
+parse_euro_double <- function(x, na = c("", "NA")) {
+  parse_vector(x, col_euro_double(), na = na)
 }


@@ -111,8 +115,8 @@ col_numeric <- function() {

 #' @rdname collector
 #' @export
-parse_numeric <- function(x) {
-  parse_vector(x, col_numeric())
+parse_numeric <- function(x, na = c("", "NA")) {
+  parse_vector(x, col_numeric(), na = na)
 }

 #' @rdname collector
@@ -123,8 +127,8 @@ col_logical <- function() {

 #' @rdname collector
 #' @export
-parse_logical <- function(x) {
-  parse_vector(x, col_logical())
+parse_logical <- function(x, na = c("", "NA")) {
+  parse_vector(x, col_logical(), na = na)
 }

 #' @param levels Character vector providing set of allowed levels.
@@ -137,8 +141,8 @@ col_factor <- function(levels, ordered = FALSE) {

 #' @rdname collector
 #' @export
-parse_factor <- function(x, levels, ordered = FALSE) {
-  parse_vector(x, col_factor(levels, ordered))
+parse_factor <- function(x, levels, ordered = FALSE, na = c("", "NA")) {
+  parse_vector(x, col_factor(levels, ordered), na = na)
 }

 #' @rdname collector
diff --git a/R/read_delim.R b/R/read_delim.R
index d0679176..76cbd449 100644
--- a/R/read_delim.R
+++ b/R/read_delim.R
@@ -69,9 +69,10 @@ read_delim <- function(file, delim, quote = '"', escape_backslash = FALSE,
 #' @rdname read_delim
 #' @export
 read_csv <- function(file, col_names = TRUE, col_types = NULL, na = c("", "NA"),
-                     skip = 0, n_max = -1, progress = interactive()) {
+                     trim_ws = TRUE, skip = 0, n_max = -1,
+                     progress = interactive()) {

-  tokenizer <- tokenizer_csv(na = na)
+  tokenizer <- tokenizer_csv(na = na, trim_ws = trim_ws)
   read_delimited(file, tokenizer, col_names = col_names, col_types = col_types,
     skip = skip, n_max = n_max, progress = progress)
 }
@@ -79,10 +80,10 @@ read_csv <- function(file, col_names = TRUE, col_types = NULL, na = c("", "NA"),
 #' @rdname read_delim
 #' @export
 read_csv2 <- function(file, col_names = TRUE, col_types = NULL,
-                      na = c("", "NA"), skip = 0, n_max = -1,
+                      na = c("", "NA"), trim_ws = TRUE, skip = 0, n_max = -1,
                       progress = interactive()) {

-  tokenizer <- tokenizer_delim(delim = ";", na = na)
+  tokenizer <- tokenizer_delim(delim = ";", na = na, trim_ws = trim_ws)
   read_delimited(file, tokenizer, col_names = col_names, col_types = col_types,
     skip = skip, n_max = n_max, progress = progress)
 }
@@ -91,9 +92,10 @@
 #' @rdname read_delim
 #' @export
 read_tsv <- function(file, col_names = TRUE, col_types = NULL, na = c("", "NA"),
-                     skip = 0, n_max = -1, progress = interactive()) {
+                     trim_ws = TRUE, skip = 0, n_max = -1,
+                     progress = interactive()) {

-  tokenizer <- tokenizer_tsv(na = na)
+  tokenizer <- tokenizer_tsv(na = na, trim_ws = trim_ws)
   read_delimited(file, tokenizer, col_names = col_names, col_types = col_types,
     skip = skip, n_max = n_max, progress = progress)
 }
diff --git a/R/tokenizer.R b/R/tokenizer.R
index 153ca14a..ce7fc1eb 100644
--- a/R/tokenizer.R
+++ b/R/tokenizer.R
@@ -38,6 +38,8 @@ NULL
 #'   option to \code{character()} to indicate no missing values.
 #' @param delim Single character used to separate fields within a record.
 #' @param quote Single character used to quote strings.
+#' @param trim_ws Should leading and trailing whitespace be trimmed from
+#'   each field before parsing it?
 #' @param escape_double Does the file escape quotes by doubling them?
 #'   i.e. If this option is \code{TRUE}, the value \code{""""} represents
 #'   a single quote, \code{\"}.
@@ -46,12 +48,15 @@ NULL
 #'   can be used to escape the delimeter character, the quote characer, or
 #'   to add special characters like \code{\\n}.
 tokenizer_delim <- function(delim, quote = '"', na = "NA",
-                            escape_double = TRUE, escape_backslash = FALSE) {
+                            trim_ws = TRUE,
+                            escape_double = TRUE,
+                            escape_backslash = FALSE) {
   structure(
     list(
       delim = delim,
       quote = quote,
       na = na,
+      trim_ws = trim_ws,
       escape_double = escape_double,
       escape_backslash = escape_backslash
     ),
@@ -61,11 +66,12 @@ tokenizer_delim <- function(delim, quote = '"', na = "NA",

 #' @export
 #' @rdname Tokenizers
-tokenizer_csv <- function(na = "NA") {
+tokenizer_csv <- function(na = "NA", trim_ws = TRUE) {
   tokenizer_delim(
     delim = ",",
     quote = '"',
     na = na,
+    trim_ws = trim_ws,
     escape_double = TRUE,
     escape_backslash = FALSE
   )
@@ -73,11 +79,12 @@ tokenizer_csv <- function(na = "NA") {

 #' @export
 #' @rdname Tokenizers
-tokenizer_tsv <- function(na = "NA") {
+tokenizer_tsv <- function(na = "NA", trim_ws = TRUE) {
   tokenizer_delim(
     delim = "\t",
     quote = '"',
     na = na,
+    trim_ws = trim_ws,
     escape_double = TRUE,
     escape_backslash = FALSE
   )
diff --git a/R/type_convert.R b/R/type_convert.R
index a7b5aee2..6a37f75f 100644
--- a/R/type_convert.R
+++ b/R/type_convert.R
@@ -15,6 +15,9 @@
 #' )
 #' str(df)
 #' str(type_convert(df))
+#'
+#' df <- data.frame(x = c("NA", "10"), stringsAsFactors= FALSE)
+#' type_convert(df)
 type_convert <- function(df, col_types = NULL) {
   is_character <- vapply(df, is.character, logical(1))

diff --git a/man/Tokenizers.Rd b/man/Tokenizers.Rd
index cc4c44b6..56856697 100644
--- a/man/Tokenizers.Rd
+++ b/man/Tokenizers.Rd
@@ -11,11 +11,11 @@
 \title{Tokenizers.}
 \usage{
 tokenizer_delim(delim,
-  quote = "\\"", na = "NA", escape_double = TRUE, escape_backslash = FALSE)
+  quote = "\\"", na = "NA", trim_ws = TRUE, escape_double = TRUE, escape_backslash = FALSE)

-tokenizer_csv(na = "NA")
+tokenizer_csv(na = "NA", trim_ws = TRUE)

-tokenizer_tsv(na = "NA")
+tokenizer_tsv(na = "NA", trim_ws = TRUE)

 tokenizer_line()

@@ -31,6 +31,9 @@ tokenizer_fwf(begin, end, na = "NA")
 \item{na}{Character vector of strings to use for missing values. Set this
 option to \code{character()} to indicate no missing values.}

+\item{trim_ws}{Should leading and trailing whitespace be trimmed from
+each field before parsing it?}
+
 \item{escape_double}{Does the file escape quotes by doubling them? i.e.
 If this option is \code{TRUE}, the value \code{""""} represents
 a single quote, \code{\"}.}
diff --git a/man/collector.Rd b/man/collector.Rd
index 5d6e3378..ce784b55 100644
--- a/man/collector.Rd
+++ b/man/collector.Rd
@@ -21,37 +21,39 @@
 \usage{
 col_character()

-parse_character(x)
+parse_character(x, na = c("", "NA"))

 col_integer()

-parse_integer(x)
+parse_integer(x, na = c("", "NA"))

 col_double()

-parse_double(x)
+parse_double(x, na = c("", "NA"))

 col_euro_double()

-parse_euro_double(x)
+parse_euro_double(x, na = c("", "NA"))

 col_numeric()

-parse_numeric(x)
+parse_numeric(x, na = c("", "NA"))

 col_logical()

-parse_logical(x)
+parse_logical(x, na = c("", "NA"))

 col_factor(levels, ordered = FALSE)

-parse_factor(x, levels, ordered = FALSE)
+parse_factor(x, levels, ordered = FALSE, na = c("", "NA"))

 col_skip()
 }
 \arguments{
 \item{x}{Character vector of values to parse.}

+\item{na}{Character vector giving strings to treat as missing.}
+
 \item{levels}{Character vector providing set of allowed levels.}

 \item{ordered}{Is it an ordered factor?}
diff --git a/man/parse_vector.Rd b/man/parse_vector.Rd
index 9f450d41..b805a399 100644
--- a/man/parse_vector.Rd
+++ b/man/parse_vector.Rd
@@ -4,18 +4,20 @@
 \alias{parse_vector}
 \title{Parse a character vector.}
 \usage{
-parse_vector(x, collector)
+parse_vector(x, collector, na = "NA")
 }
 \arguments{
 \item{x}{Character vector of elements to parse.}

 \item{collector}{Column specification.}
+
+\item{na}{Character vector giving strings to treat as missing.}
 }
 \description{
 Parse a character vector.
 }
 \examples{
-x <- c("1", "2", "3", NA)
+x <- c("1", "2", "3", "NA")
 parse_vector(x, col_integer())
 parse_vector(x, col_double())
 parse_vector(x, col_character())
@@ -24,6 +26,8 @@ parse_vector(x, col_skip())
 # Invalid values are replaced with missing values with a warning.
 x <- c("1", "2", "3", "-")
 parse_vector(x, col_double())
+# Or flag values as missing
+parse_vector(x, col_double(), na = "-")
 }
 \keyword{internal}

diff --git a/man/read_delim.Rd b/man/read_delim.Rd
index c2b8b475..bc62561a 100644
--- a/man/read_delim.Rd
+++ b/man/read_delim.Rd
@@ -12,13 +12,13 @@ read_delim(file, delim, quote = '\"', escape_backslash = FALSE,
   skip = 0, n_max = -1, progress = interactive())

 read_csv(file, col_names = TRUE, col_types = NULL, na = c("", "NA"),
-  skip = 0, n_max = -1, progress = interactive())
+  trim_ws = TRUE, skip = 0, n_max = -1, progress = interactive())

 read_csv2(file, col_names = TRUE, col_types = NULL, na = c("", "NA"),
-  skip = 0, n_max = -1, progress = interactive())
+  trim_ws = TRUE, skip = 0, n_max = -1, progress = interactive())

 read_tsv(file, col_names = TRUE, col_types = NULL, na = c("", "NA"),
-  skip = 0, n_max = -1, progress = interactive())
+  trim_ws = TRUE, skip = 0, n_max = -1, progress = interactive())
 }
 \arguments{
 \item{file}{Either a path to a file, a connection, or literal data
@@ -82,6 +82,9 @@ option to \code{character()} to indicate no missing values.}
 \item{progress}{Display a progress bar? By default it will only display in
 an interactive session. The display is updated every 50,000 values and will
 only display if estimated reading time is 5 seconds or more.}
+
+\item{trim_ws}{Should leading and trailing whitespace be trimmed from
+each field before parsing it?}
 }
 \value{
 A data frame. If there are parsing problems, a warning tells you
diff --git a/man/type_convert.Rd b/man/type_convert.Rd
index 29d7b1a8..f33a2458 100644
--- a/man/type_convert.Rd
+++ b/man/type_convert.Rd
@@ -38,5 +38,8 @@ df <- data.frame(
 )
 str(df)
 str(type_convert(df))
+
+df <- data.frame(x = c("NA", "10"), stringsAsFactors= FALSE)
+type_convert(df)
 }

diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
index e1c814a8..7f7f0aaf 100644
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
@@ -94,14 +94,15 @@ BEGIN_RCPP
 END_RCPP
 }
 // parse_vector_
-SEXP parse_vector_(CharacterVector x, List collectorSpec);
-RcppExport SEXP readr_parse_vector_(SEXP xSEXP, SEXP collectorSpecSEXP) {
+SEXP parse_vector_(CharacterVector x, List collectorSpec, const std::vector<std::string>& na);
+RcppExport SEXP readr_parse_vector_(SEXP xSEXP, SEXP collectorSpecSEXP, SEXP naSEXP) {
 BEGIN_RCPP
     Rcpp::RObject __result;
     Rcpp::RNGScope __rngScope;
     Rcpp::traits::input_parameter< CharacterVector >::type x(xSEXP);
     Rcpp::traits::input_parameter< List >::type collectorSpec(collectorSpecSEXP);
-    __result = Rcpp::wrap(parse_vector_(x, collectorSpec));
+    Rcpp::traits::input_parameter< const std::vector<std::string>& >::type na(naSEXP);
+    __result = Rcpp::wrap(parse_vector_(x, collectorSpec, na));
     return __result;
 END_RCPP
 }
diff --git a/src/Tokenizer.cpp b/src/Tokenizer.cpp
index 634c0558..ceb58a98 100644
--- a/src/Tokenizer.cpp
+++ b/src/Tokenizer.cpp
@@ -15,11 +15,12 @@ TokenizerPtr Tokenizer::create(List spec) {
     char delim = as<char>(spec["delim"]);
     char quote = as<char>(spec["quote"]);
     std::vector<std::string> na = as<std::vector<std::string> >(spec["na"]);
+    bool trimWs = as<bool>(spec["trim_ws"]);
     bool escapeDouble = as<bool>(spec["escape_double"]);
     bool escapeBackslash = as<bool>(spec["escape_backslash"]);

     return TokenizerPtr(new
-      TokenizerDelim(delim, quote, na, escapeBackslash, escapeDouble)
+      TokenizerDelim(delim, quote, na, trimWs, escapeBackslash, escapeDouble)
     );
   } else if (subclass == "tokenizer_fwf") {
     std::vector<int>
diff --git a/src/TokenizerDelim.h b/src/TokenizerDelim.h
index 2f42859e..a53ee155 100644
--- a/src/TokenizerDelim.h
+++ b/src/TokenizerDelim.h
@@ -20,7 +20,7 @@ class TokenizerDelim : public Tokenizer {

   char delim_, quote_;
   std::vector<std::string> NA_;
-  bool escapeBackslash_, escapeDouble_;
+  bool trimWS_, escapeBackslash_, escapeDouble_;

   SourceIterator begin_, cur_, end_;
   DelimState state_;
@@ -31,10 +31,12 @@

   TokenizerDelim(char delim = ',', char quote = '"',
                  std::vector<std::string> NA = std::vector<std::string>(1, "NA"),
-                 bool escapeBackslash = false, bool escapeDouble = true):
+                 bool trimWS = true, bool escapeBackslash = false,
+                 bool escapeDouble = true):
     delim_(delim),
     quote_(quote),
     NA_(NA),
+    trimWS_(trimWS),
     escapeBackslash_(escapeBackslash),
     escapeDouble_(escapeDouble),
     moreTokens_(false)
@@ -211,13 +213,19 @@ class TokenizerDelim : public Tokenizer {

   Token fieldToken(SourceIterator begin, SourceIterator end, bool hasEscapeB,
                    int row, int col) {
-    return Token(begin, end, row, col, (hasEscapeB) ? this : NULL).flagNA(NA_);
+    Token t(begin, end, row, col, (hasEscapeB) ? this : NULL);
+    t.flagNA(NA_);
+    if (trimWS_)
+      t.trim();
+    return t;
   }

   Token stringToken(SourceIterator begin, SourceIterator end, bool hasEscapeB,
                     bool hasEscapeD, int row, int col) {
-    return Token(begin, end, row, col,
-                 (hasEscapeD || hasEscapeB) ? this : NULL);
+    Token t(begin, end, row, col, (hasEscapeD || hasEscapeB) ? this : NULL);
+    if (trimWS_)
+      t.trim();
+    return t;
   }

diff --git a/src/parse.cpp b/src/parse.cpp
index 475c5051..59df7b5e 100644
--- a/src/parse.cpp
+++ b/src/parse.cpp
@@ -74,7 +74,8 @@ RObject tokenize_(List sourceSpec, List tokenizerSpec, int n_max) {
 }

 // [[Rcpp::export]]
-SEXP parse_vector_(CharacterVector x, List collectorSpec) {
+SEXP parse_vector_(CharacterVector x, List collectorSpec,
+                   const std::vector<std::string>& na) {
   Warnings warnings;
   int n = x.size();

@@ -89,6 +90,8 @@ SEXP parse_vector_(CharacterVector x, List collectorSpec) {
     } else {
       SEXP string = x[i];
       t = Token(CHAR(string), CHAR(string) + Rf_length(string), i, -1);
+      t.flagNA(na);
+      t.trim();
     }
     col->setValue(i, t);
   }
diff --git a/tests/testthat/test-parsing-character.R b/tests/testthat/test-parsing-character.R
new file mode 100644
index 00000000..ff621a47
--- /dev/null
+++ b/tests/testthat/test-parsing-character.R
@@ -0,0 +1,12 @@
+context("Parsing, character")
+
+test_that("ws dropped by default", {
+  df <- read_csv("x\n a \n b\n")
+  expect_equal(df$x, c("a", "b"))
+})
+
+test_that("trim_ws = FALSE keeps ws", {
+  df <- read_csv("x\n a\nb \n", trim_ws = FALSE)
+  expect_equal(df$x, c(" a", "b "))
+})
+
diff --git a/tests/testthat/test-parsing-numeric.R b/tests/testthat/test-parsing-numeric.R
index ce8640cb..4b57f219 100644
--- a/tests/testthat/test-parsing-numeric.R
+++ b/tests/testthat/test-parsing-numeric.R
@@ -9,3 +9,12 @@ test_that("partial integer/double matches fail", {
   expect_true(is.na(parse_double("3d")))
   expect_true(is.na(parse_integer("3d")))
 })
+
+test_that("parse functions converts NAs", {
+  expect_equal(parse_double(c("1.5", "NA")), c(1.5, NA))
+})
+
+test_that("leading/trailing ws ignored when parsing", {
+  expect_equal(parse_double(c(" 1.5", "1.5", "1.5 ")), rep(1.5, 3))
+  expect_equal(read_csv("x\n 1.5\n1.5\n1.5 \n")$x, rep(1.5, 3))
+})
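
Usage notes (not part of the patch): a quick R sketch of the two behaviours this commit adds, taken from the roxygen examples and tests in the diff above. It assumes a readr build that includes this commit; all function and argument names (parse_double, parse_vector, col_double, read_csv, na, trim_ws) come straight from the diff.

    library(readr)

    # parse_*() now take `na` (default c("", "NA")); matching strings become NA
    parse_double(c("1.5", "NA"))                        # 1.5 NA
    parse_vector(c("1", "2", "3", "-"), col_double(), na = "-")

    # read_csv()/read_csv2()/read_tsv() now take `trim_ws` (default TRUE):
    # leading/trailing whitespace is stripped from each field before parsing
    read_csv("x\n a \n b\n")$x                          # "a" "b"
    read_csv("x\n a\nb \n", trim_ws = FALSE)$x          # " a" "b "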