Selectively trim_ws. Parse NA in parse_vector.
Fixes #137
hadley committed Sep 3, 2015
1 parent 6b11595 commit f05d4ce
Showing 17 changed files with 124 additions and 53 deletions.
6 changes: 6 additions & 0 deletions NEWS.md
@@ -1,5 +1,11 @@
# readr 0.1.1.9000

* `read_delim()` etc. gain a `trim_ws` argument that controls whether leading
  and trailing whitespace is automatically removed. It defaults to `TRUE`. (#137)

* `parse_*` functions gain an `na` argument that allows you to specify which
  values should be treated as missing.

* Month and abbreviated month name formats (`%b` and `%B`) now ignore
case when matching (#219).

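To illustrate what these two entries mean in practice, here is a rough sketch using the new arguments; the inline CSV string and the "-" marker are invented for illustration, not taken from the commit:

library(readr)

# Fields padded with spaces, with "-" used as a missing-value marker
csv <- "x,y\n 1 , - \n 2 , 3 \n"

# trim_ws = TRUE (the default) strips the padding before fields are parsed,
# and na = "-" turns the placeholder into a proper missing value
read_csv(csv, na = "-")

# The parse_* functions now accept na directly as well
parse_integer(c("1", "-", "3"), na = "-")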
4 changes: 2 additions & 2 deletions R/RcppExports.R
@@ -29,8 +29,8 @@ tokenize_ <- function(sourceSpec, tokenizerSpec, n_max) {
.Call('readr_tokenize_', PACKAGE = 'readr', sourceSpec, tokenizerSpec, n_max)
}

parse_vector_ <- function(x, collectorSpec) {
.Call('readr_parse_vector_', PACKAGE = 'readr', x, collectorSpec)
parse_vector_ <- function(x, collectorSpec, na) {
.Call('readr_parse_vector_', PACKAGE = 'readr', x, collectorSpec, na)
}

read_file_ <- function(sourceSpec) {
38 changes: 21 additions & 17 deletions R/collectors.R
@@ -15,10 +15,11 @@ collector_find <- function(name) {
#'
#' @param x Character vector of elements to parse.
#' @param collector Column specification.
#' @param na Character vector giving strings to treat as missing.
#' @keywords internal
#' @export
#' @examples
#' x <- c("1", "2", "3", NA)
#' x <- c("1", "2", "3", "NA")
#' parse_vector(x, col_integer())
#' parse_vector(x, col_double())
#' parse_vector(x, col_character())
@@ -27,8 +28,10 @@ collector_find <- function(name) {
#' # Invalid values are replaced with missing values with a warning.
#' x <- c("1", "2", "3", "-")
#' parse_vector(x, col_double())
parse_vector <- function(x, collector) {
warn_problems(parse_vector_(x, collector))
#' # Or flag values as missing
#' parse_vector(x, col_double(), na = "-")
parse_vector <- function(x, collector, na = "NA") {
warn_problems(parse_vector_(x, collector, na = na))
}

#' Parse character vectors into typed columns.
@@ -39,6 +42,7 @@ parse_vector <- function(x, collector) {
#'
#' @name collector
#' @param x Character vector of values to parse.
#' @param na Character vector giving strings to treat as missing.
#' @seealso \code{\link{parse_datetime}}, \code{\link{type_convert}} to
#' automatically re-parse all character columns in a data frame.
#' @examples
@@ -62,8 +66,8 @@ col_character <- function() {

#' @rdname collector
#' @export
parse_character <- function(x) {
parse_vector(x, col_character())
parse_character <- function(x, na = c("", "NA")) {
parse_vector(x, col_character(), na = na)
}

#' @rdname collector
@@ -74,8 +78,8 @@ col_integer <- function() {

#' @rdname collector
#' @export
parse_integer <- function(x) {
parse_vector(x, col_integer())
parse_integer <- function(x, na = c("", "NA")) {
parse_vector(x, col_integer(), na = na)
}

#' @rdname collector
@@ -86,8 +90,8 @@ col_double <- function() {

#' @rdname collector
#' @export
parse_double <- function(x) {
parse_vector(x, col_double())
parse_double <- function(x, na = c("", "NA")) {
parse_vector(x, col_double(), na = na)
}

#' @rdname collector
@@ -98,8 +102,8 @@ col_euro_double <- function() {

#' @rdname collector
#' @export
parse_euro_double <- function(x) {
parse_vector(x, col_euro_double())
parse_euro_double <- function(x, na = c("", "NA")) {
parse_vector(x, col_euro_double(), na = na)
}


@@ -111,8 +115,8 @@ col_numeric <- function() {

#' @rdname collector
#' @export
parse_numeric <- function(x) {
parse_vector(x, col_numeric())
parse_numeric <- function(x, na = c("", "NA")) {
parse_vector(x, col_numeric(), na = na)
}

#' @rdname collector
@@ -123,8 +127,8 @@ col_logical <- function() {

#' @rdname collector
#' @export
parse_logical <- function(x) {
parse_vector(x, col_logical())
parse_logical <- function(x, na = c("", "NA")) {
parse_vector(x, col_logical(), na = na)
}

#' @param levels Character vector providing set of allowed levels.
@@ -137,8 +141,8 @@ col_factor <- function(levels, ordered = FALSE) {

#' @rdname collector
#' @export
parse_factor <- function(x, levels, ordered = FALSE) {
parse_vector(x, col_factor(levels, ordered))
parse_factor <- function(x, levels, ordered = FALSE, na = c("", "NA")) {
parse_vector(x, col_factor(levels, ordered), na = na)
}

#' @rdname collector
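Note the two different defaults introduced above: `parse_vector()` itself defaults to `na = "NA"`, while the higher-level `parse_*()` wrappers default to `na = c("", "NA")`. A small sketch of the difference, using a made-up input vector:

x <- c("1", "", "NA", "3")

parse_integer(x)                # "" and "NA" both become missing values
parse_vector(x, col_integer())  # only "NA" is missing; "" should fail to parse and warn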
14 changes: 8 additions & 6 deletions R/read_delim.R
@@ -69,20 +69,21 @@ read_delim <- function(file, delim, quote = '"', escape_backslash = FALSE,
#' @rdname read_delim
#' @export
read_csv <- function(file, col_names = TRUE, col_types = NULL, na = c("", "NA"),
skip = 0, n_max = -1, progress = interactive()) {
trim_ws = TRUE, skip = 0, n_max = -1,
progress = interactive()) {

tokenizer <- tokenizer_csv(na = na)
tokenizer <- tokenizer_csv(na = na, trim_ws = trim_ws)
read_delimited(file, tokenizer, col_names = col_names, col_types = col_types,
skip = skip, n_max = n_max, progress = progress)
}

#' @rdname read_delim
#' @export
read_csv2 <- function(file, col_names = TRUE, col_types = NULL,
na = c("", "NA"), skip = 0, n_max = -1,
na = c("", "NA"), trim_ws = TRUE, skip = 0, n_max = -1,
progress = interactive()) {

tokenizer <- tokenizer_delim(delim = ";", na = na)
tokenizer <- tokenizer_delim(delim = ";", na = na, trim_ws = trim_ws)
read_delimited(file, tokenizer, col_names = col_names, col_types = col_types,
skip = skip, n_max = n_max, progress = progress)
}
@@ -91,9 +92,10 @@ read_csv2 <- function(file, col_names = TRUE, col_types = NULL,
#' @rdname read_delim
#' @export
read_tsv <- function(file, col_names = TRUE, col_types = NULL, na = c("", "NA"),
skip = 0, n_max = -1, progress = interactive()) {
trim_ws = TRUE, skip = 0, n_max = -1,
progress = interactive()) {

tokenizer <- tokenizer_tsv(na = na)
tokenizer <- tokenizer_tsv(na = na, trim_ws = trim_ws)
read_delimited(file, tokenizer, col_names = col_names, col_types = col_types,
skip = skip, n_max = n_max, progress = progress)
}
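The `trim_ws` flag added to read_csv(), read_csv2() and read_tsv() is simply forwarded to the tokenizer, so turning it off preserves any padding around fields. A quick sketch with an invented inline CSV:

csv <- "a,b\n x , y \n"

read_csv(csv)                   # default trim_ws = TRUE: values read as "x" and "y"
read_csv(csv, trim_ws = FALSE)  # values keep their surrounding spaces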
13 changes: 10 additions & 3 deletions R/tokenizer.R
@@ -38,6 +38,8 @@ NULL
#' option to \code{character()} to indicate no missing values.
#' @param delim Single character used to separate fields within a record.
#' @param quote Single character used to quote strings.
#' @param trim_ws Should leading and trailing whitespace be trimmed from
#' each field before parsing it?
#' @param escape_double Does the file escape quotes by doubling them?
#' i.e. If this option is \code{TRUE}, the value \code{""""} represents
#' a single quote, \code{\"}.
@@ -46,12 +48,15 @@ NULL
#' can be used to escape the delimiter character, the quote character, or
#' to add special characters like \code{\\n}.
tokenizer_delim <- function(delim, quote = '"', na = "NA",
escape_double = TRUE, escape_backslash = FALSE) {
trim_ws = TRUE,
escape_double = TRUE,
escape_backslash = FALSE) {
structure(
list(
delim = delim,
quote = quote,
na = na,
trim_ws = trim_ws,
escape_double = escape_double,
escape_backslash = escape_backslash
),
@@ -61,23 +66,25 @@ tokenizer_delim <- function(delim, quote = '"', na = "NA",

#' @export
#' @rdname Tokenizers
tokenizer_csv <- function(na = "NA") {
tokenizer_csv <- function(na = "NA", trim_ws = TRUE) {
tokenizer_delim(
delim = ",",
quote = '"',
na = na,
trim_ws = trim_ws,
escape_double = TRUE,
escape_backslash = FALSE
)
}

#' @export
#' @rdname Tokenizers
tokenizer_tsv <- function(na = "NA") {
tokenizer_tsv <- function(na = "NA", trim_ws = TRUE) {
tokenizer_delim(
delim = "\t",
quote = '"',
na = na,
trim_ws = trim_ws,
escape_double = TRUE,
escape_backslash = FALSE
)
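tokenizer_csv() and tokenizer_tsv() just pass `trim_ws` through to tokenizer_delim(), which records it in the spec list built by structure() above. A minimal sketch:

tok <- tokenizer_csv(na = c("", "NA"), trim_ws = FALSE)
tok$trim_ws  # FALSE -- stored in the spec and read by the C++ tokenizer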
3 changes: 3 additions & 0 deletions R/type_convert.R
@@ -15,6 +15,9 @@
#' )
#' str(df)
#' str(type_convert(df))
#'
#' df <- data.frame(x = c("NA", "10"), stringsAsFactors= FALSE)
#' type_convert(df)
type_convert <- function(df, col_types = NULL) {
is_character <- vapply(df, is.character, logical(1))

9 changes: 6 additions & 3 deletions man/Tokenizers.Rd
@@ -11,11 +11,11 @@
\title{Tokenizers.}
\usage{
tokenizer_delim(delim,
quote = "\\"", na = "NA", escape_double = TRUE, escape_backslash = FALSE)
quote = "\\"", na = "NA", trim_ws = TRUE, escape_double = TRUE, escape_backslash = FALSE)
tokenizer_csv(na = "NA")
tokenizer_csv(na = "NA", trim_ws = TRUE)
tokenizer_tsv(na = "NA")
tokenizer_tsv(na = "NA", trim_ws = TRUE)
tokenizer_line()
@@ -31,6 +31,9 @@ tokenizer_fwf(begin, end, na = "NA")
\item{na}{Character vector of strings to use for missing values. Set this
option to \code{character()} to indicate no missing values.}
\item{trim_ws}{Should leading and trailing whitespace be trimmed from
each field before parsing it?}
\item{escape_double}{Does the file escape quotes by doubling them?
i.e. If this option is \code{TRUE}, the value \code{""""} represents
a single quote, \code{\"}.}
16 changes: 9 additions & 7 deletions man/collector.Rd
@@ -21,37 +21,39 @@
\usage{
col_character()

parse_character(x)
parse_character(x, na = c("", "NA"))

col_integer()

parse_integer(x)
parse_integer(x, na = c("", "NA"))

col_double()

parse_double(x)
parse_double(x, na = c("", "NA"))

col_euro_double()

parse_euro_double(x)
parse_euro_double(x, na = c("", "NA"))

col_numeric()

parse_numeric(x)
parse_numeric(x, na = c("", "NA"))

col_logical()

parse_logical(x)
parse_logical(x, na = c("", "NA"))

col_factor(levels, ordered = FALSE)

parse_factor(x, levels, ordered = FALSE)
parse_factor(x, levels, ordered = FALSE, na = c("", "NA"))

col_skip()
}
\arguments{
\item{x}{Character vector of values to parse.}

\item{na}{Character vector giving strings to treat as missing.}

\item{levels}{Character vector providing set of allowed levels.}

\item{ordered}{Is it an ordered factor?}
8 changes: 6 additions & 2 deletions man/parse_vector.Rd
@@ -4,18 +4,20 @@
\alias{parse_vector}
\title{Parse a character vector.}
\usage{
parse_vector(x, collector)
parse_vector(x, collector, na = "NA")
}
\arguments{
\item{x}{Character vector of elements to parse.}

\item{collector}{Column specification.}

\item{na}{Character vector giving strings to treat as missing.}
}
\description{
Parse a character vector.
}
\examples{
x <- c("1", "2", "3", NA)
x <- c("1", "2", "3", "NA")
parse_vector(x, col_integer())
parse_vector(x, col_double())
parse_vector(x, col_character())
@@ -24,6 +26,8 @@ parse_vector(x, col_skip())
# Invalid values are replaced with missing values with a warning.
x <- c("1", "2", "3", "-")
parse_vector(x, col_double())
# Or flag values as missing
parse_vector(x, col_double(), na = "-")
}
\keyword{internal}

9 changes: 6 additions & 3 deletions man/read_delim.Rd
@@ -12,13 +12,13 @@ read_delim(file, delim, quote = '\"', escape_backslash = FALSE,
skip = 0, n_max = -1, progress = interactive())

read_csv(file, col_names = TRUE, col_types = NULL, na = c("", "NA"),
skip = 0, n_max = -1, progress = interactive())
trim_ws = TRUE, skip = 0, n_max = -1, progress = interactive())

read_csv2(file, col_names = TRUE, col_types = NULL, na = c("", "NA"),
skip = 0, n_max = -1, progress = interactive())
trim_ws = TRUE, skip = 0, n_max = -1, progress = interactive())

read_tsv(file, col_names = TRUE, col_types = NULL, na = c("", "NA"),
skip = 0, n_max = -1, progress = interactive())
trim_ws = TRUE, skip = 0, n_max = -1, progress = interactive())
}
\arguments{
\item{file}{Either a path to a file, a connection, or literal data
@@ -82,6 +82,9 @@ option to \code{character()} to indicate no missing values.}
\item{progress}{Display a progress bar? By default it will only display
in an interactive session. The display is updated every 50,000 values
and will only display if estimated reading time is 5 seconds or more.}
\item{trim_ws}{Should leading and trailing whitespace be trimmed from
each field before parsing it?}
}
\value{
A data frame. If there are parsing problems, a warning tells you
3 changes: 3 additions & 0 deletions man/type_convert.Rd
@@ -38,5 +38,8 @@ df <- data.frame(
)
str(df)
str(type_convert(df))
df <- data.frame(x = c("NA", "10"), stringsAsFactors= FALSE)
type_convert(df)
}
7 changes: 4 additions & 3 deletions src/RcppExports.cpp
@@ -94,14 +94,15 @@ BEGIN_RCPP
END_RCPP
}
// parse_vector_
SEXP parse_vector_(CharacterVector x, List collectorSpec);
RcppExport SEXP readr_parse_vector_(SEXP xSEXP, SEXP collectorSpecSEXP) {
SEXP parse_vector_(CharacterVector x, List collectorSpec, const std::vector<std::string>& na);
RcppExport SEXP readr_parse_vector_(SEXP xSEXP, SEXP collectorSpecSEXP, SEXP naSEXP) {
BEGIN_RCPP
Rcpp::RObject __result;
Rcpp::RNGScope __rngScope;
Rcpp::traits::input_parameter< CharacterVector >::type x(xSEXP);
Rcpp::traits::input_parameter< List >::type collectorSpec(collectorSpecSEXP);
__result = Rcpp::wrap(parse_vector_(x, collectorSpec));
Rcpp::traits::input_parameter< const std::vector<std::string>& >::type na(naSEXP);
__result = Rcpp::wrap(parse_vector_(x, collectorSpec, na));
return __result;
END_RCPP
}