From f05d4ce31e7a06da5c7d731d9aa5fbd7212eb398 Mon Sep 17 00:00:00 2001
From: hadley
Date: Thu, 3 Sep 2015 15:17:05 -0500
Subject: [PATCH] Selectively trim_ws. Parse NA in parse_vector. Fixes #137

---
 NEWS.md                                 |  6 ++++
 R/RcppExports.R                         |  4 +--
 R/collectors.R                          | 38 ++++++++++++++-----------
 R/read_delim.R                          | 14 +++++----
 R/tokenizer.R                           | 13 +++++++--
 R/type_convert.R                        |  3 ++
 man/Tokenizers.Rd                       |  9 ++++--
 man/collector.Rd                        | 16 ++++++-----
 man/parse_vector.Rd                     |  8 ++++--
 man/read_delim.Rd                       |  9 ++++--
 man/type_convert.Rd                     |  3 ++
 src/RcppExports.cpp                     |  7 +++--
 src/Tokenizer.cpp                       |  3 +-
 src/TokenizerDelim.h                    | 18 ++++++++----
 src/parse.cpp                           |  5 +++-
 tests/testthat/test-parsing-character.R | 12 ++++++++
 tests/testthat/test-parsing-numeric.R   |  9 ++++++
 17 files changed, 124 insertions(+), 53 deletions(-)
 create mode 100644 tests/testthat/test-parsing-character.R

diff --git a/NEWS.md b/NEWS.md
index 558682cc..0a9fc2d5 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,11 @@
 # readr 0.1.1.9000

+* `read_delim()` etc gains a `trim_ws` argument that controls whether leading
+  and trailing whitespace is automatically removed. It defaults to `TRUE`. (#137)
+
+* `parse_*` gain a `na` argument that allows you to specify which values should
+  be converted to missing.
+
 * Month and abbreviated month name formats (`%b` and `%B`) now ignore case
   when matching (#219).

diff --git a/R/RcppExports.R b/R/RcppExports.R
index 2969b137..b943c21c 100644
--- a/R/RcppExports.R
+++ b/R/RcppExports.R
@@ -29,8 +29,8 @@ tokenize_ <- function(sourceSpec, tokenizerSpec, n_max) {
     .Call('readr_tokenize_', PACKAGE = 'readr', sourceSpec, tokenizerSpec, n_max)
 }

-parse_vector_ <- function(x, collectorSpec) {
-    .Call('readr_parse_vector_', PACKAGE = 'readr', x, collectorSpec)
+parse_vector_ <- function(x, collectorSpec, na) {
+    .Call('readr_parse_vector_', PACKAGE = 'readr', x, collectorSpec, na)
 }

 read_file_ <- function(sourceSpec) {
diff --git a/R/collectors.R b/R/collectors.R
index 17556564..fefd4a45 100644
--- a/R/collectors.R
+++ b/R/collectors.R
@@ -15,10 +15,11 @@ collector_find <- function(name) {
 #'
 #' @param x Character vector of elements to parse.
 #' @param collector Column specification.
+#' @param na Character vector giving strings to treat as missing.
 #' @keywords internal
 #' @export
 #' @examples
-#' x <- c("1", "2", "3", NA)
+#' x <- c("1", "2", "3", "NA")
 #' parse_vector(x, col_integer())
 #' parse_vector(x, col_double())
 #' parse_vector(x, col_character())
@@ -27,8 +28,10 @@
 #' # Invalid values are replaced with missing values with a warning.
 #' x <- c("1", "2", "3", "-")
 #' parse_vector(x, col_double())
-parse_vector <- function(x, collector) {
-  warn_problems(parse_vector_(x, collector))
+#' # Or flag values as missing
+#' parse_vector(x, col_double(), na = "-")
+parse_vector <- function(x, collector, na = "NA") {
+  warn_problems(parse_vector_(x, collector, na = na))
 }

 #' Parse character vectors into typed columns.
@@ -39,6 +42,7 @@
 #'
 #' @name collector
 #' @param x Character vector of values to parse.
+#' @param na Character vector giving strings to treat as missing.
 #' @seealso \code{\link{parse_datetime}}, \code{\link{type_convert}} to
 #'   automatically re-parse all character columns in a data frame.
 #' @examples
@@ -62,8 +66,8 @@ col_character <- function() {

 #' @rdname collector
 #' @export
-parse_character <- function(x) {
-  parse_vector(x, col_character())
+parse_character <- function(x, na = c("", "NA")) {
+  parse_vector(x, col_character(), na = na)
 }

 #' @rdname collector
@@ -74,8 +78,8 @@ col_integer <- function() {

 #' @rdname collector
 #' @export
-parse_integer <- function(x) {
-  parse_vector(x, col_integer())
+parse_integer <- function(x, na = c("", "NA")) {
+  parse_vector(x, col_integer(), na = na)
 }

 #' @rdname collector
@@ -86,8 +90,8 @@ col_double <- function() {

 #' @rdname collector
 #' @export
-parse_double <- function(x) {
-  parse_vector(x, col_double())
+parse_double <- function(x, na = c("", "NA")) {
+  parse_vector(x, col_double(), na = na)
 }

 #' @rdname collector
@@ -98,8 +102,8 @@ col_euro_double <- function() {

 #' @rdname collector
 #' @export
-parse_euro_double <- function(x) {
-  parse_vector(x, col_euro_double())
+parse_euro_double <- function(x, na = c("", "NA")) {
+  parse_vector(x, col_euro_double(), na = na)
 }


@@ -111,8 +115,8 @@ col_numeric <- function() {

 #' @rdname collector
 #' @export
-parse_numeric <- function(x) {
-  parse_vector(x, col_numeric())
+parse_numeric <- function(x, na = c("", "NA")) {
+  parse_vector(x, col_numeric(), na = na)
 }

 #' @rdname collector
@@ -123,8 +127,8 @@ col_logical <- function() {

 #' @rdname collector
 #' @export
-parse_logical <- function(x) {
-  parse_vector(x, col_logical())
+parse_logical <- function(x, na = c("", "NA")) {
+  parse_vector(x, col_logical(), na = na)
 }

 #' @param levels Character vector providing set of allowed levels.
@@ -137,8 +141,8 @@ col_factor <- function(levels, ordered = FALSE) {

 #' @rdname collector
 #' @export
-parse_factor <- function(x, levels, ordered = FALSE) {
-  parse_vector(x, col_factor(levels, ordered))
+parse_factor <- function(x, levels, ordered = FALSE, na = c("", "NA")) {
+  parse_vector(x, col_factor(levels, ordered), na = na)
 }

 #' @rdname collector
diff --git a/R/read_delim.R b/R/read_delim.R
index d0679176..76cbd449 100644
--- a/R/read_delim.R
+++ b/R/read_delim.R
@@ -69,9 +69,10 @@ read_delim <- function(file, delim, quote = '"', escape_backslash = FALSE,
 #' @rdname read_delim
 #' @export
 read_csv <- function(file, col_names = TRUE, col_types = NULL, na = c("", "NA"),
-                     skip = 0, n_max = -1, progress = interactive()) {
+                     trim_ws = TRUE, skip = 0, n_max = -1,
+                     progress = interactive()) {

-  tokenizer <- tokenizer_csv(na = na)
+  tokenizer <- tokenizer_csv(na = na, trim_ws = trim_ws)
   read_delimited(file, tokenizer, col_names = col_names, col_types = col_types,
     skip = skip, n_max = n_max, progress = progress)
 }
@@ -79,10 +80,10 @@ read_csv <- function(file, col_names = TRUE, col_types = NULL, na = c("", "NA"),
 #' @rdname read_delim
 #' @export
 read_csv2 <- function(file, col_names = TRUE, col_types = NULL,
-                      na = c("", "NA"), skip = 0, n_max = -1,
+                      na = c("", "NA"), trim_ws = TRUE, skip = 0, n_max = -1,
                       progress = interactive()) {

-  tokenizer <- tokenizer_delim(delim = ";", na = na)
+  tokenizer <- tokenizer_delim(delim = ";", na = na, trim_ws = trim_ws)
   read_delimited(file, tokenizer, col_names = col_names, col_types = col_types,
     skip = skip, n_max = n_max, progress = progress)
 }
@@ -91,9 +92,10 @@
 #' @rdname read_delim
 #' @export
 read_tsv <- function(file, col_names = TRUE, col_types = NULL, na = c("", "NA"),
-                     skip = 0, n_max = -1, progress = interactive()) {
+                     trim_ws = TRUE, skip = 0, n_max = -1,
+                     progress = interactive()) {

-  tokenizer <- tokenizer_tsv(na = na)
+  tokenizer <- tokenizer_tsv(na = na, trim_ws = trim_ws)
   read_delimited(file, tokenizer, col_names = col_names, col_types = col_types,
     skip = skip, n_max = n_max, progress = progress)
 }
diff --git a/R/tokenizer.R b/R/tokenizer.R
index 153ca14a..ce7fc1eb 100644
--- a/R/tokenizer.R
+++ b/R/tokenizer.R
@@ -38,6 +38,8 @@ NULL
 #'   option to \code{character()} to indicate no missing values.
 #' @param delim Single character used to separate fields within a record.
 #' @param quote Single character used to quote strings.
+#' @param trim_ws Should leading and trailing whitespace be trimmed from
+#'   each field before parsing it?
 #' @param escape_double Does the file escape quotes by doubling them?
 #'   i.e. If this option is \code{TRUE}, the value \code{""""} represents
 #'   a single quote, \code{\"}.
@@ -46,12 +48,15 @@ NULL
 #'   can be used to escape the delimeter character, the quote characer, or
 #'   to add special characters like \code{\\n}.
 tokenizer_delim <- function(delim, quote = '"', na = "NA",
-                            escape_double = TRUE, escape_backslash = FALSE) {
+                            trim_ws = TRUE,
+                            escape_double = TRUE,
+                            escape_backslash = FALSE) {
   structure(
     list(
       delim = delim,
       quote = quote,
       na = na,
+      trim_ws = trim_ws,
       escape_double = escape_double,
       escape_backslash = escape_backslash
     ),
@@ -61,11 +66,12 @@ tokenizer_delim <- function(delim, quote = '"', na = "NA",

 #' @export
 #' @rdname Tokenizers
-tokenizer_csv <- function(na = "NA") {
+tokenizer_csv <- function(na = "NA", trim_ws = TRUE) {
   tokenizer_delim(
     delim = ",",
     quote = '"',
     na = na,
+    trim_ws = trim_ws,
     escape_double = TRUE,
     escape_backslash = FALSE
   )
@@ -73,11 +79,12 @@ tokenizer_csv <- function(na = "NA") {

 #' @export
 #' @rdname Tokenizers
-tokenizer_tsv <- function(na = "NA") {
+tokenizer_tsv <- function(na = "NA", trim_ws = TRUE) {
   tokenizer_delim(
     delim = "\t",
     quote = '"',
     na = na,
+    trim_ws = trim_ws,
     escape_double = TRUE,
     escape_backslash = FALSE
   )
diff --git a/R/type_convert.R b/R/type_convert.R
index a7b5aee2..6a37f75f 100644
--- a/R/type_convert.R
+++ b/R/type_convert.R
@@ -15,6 +15,9 @@
 #' )
 #' str(df)
 #' str(type_convert(df))
+#'
+#' df <- data.frame(x = c("NA", "10"), stringsAsFactors= FALSE)
+#' type_convert(df)
 type_convert <- function(df, col_types = NULL) {
   is_character <- vapply(df, is.character, logical(1))

diff --git a/man/Tokenizers.Rd b/man/Tokenizers.Rd
index cc4c44b6..56856697 100644
--- a/man/Tokenizers.Rd
+++ b/man/Tokenizers.Rd
@@ -11,11 +11,11 @@
 \title{Tokenizers.}
 \usage{
 tokenizer_delim(delim,
-  quote = "\\"", na = "NA", escape_double = TRUE, escape_backslash = FALSE)
+  quote = "\\"", na = "NA", trim_ws = TRUE, escape_double = TRUE, escape_backslash = FALSE)

-tokenizer_csv(na = "NA")
+tokenizer_csv(na = "NA", trim_ws = TRUE)

-tokenizer_tsv(na = "NA")
+tokenizer_tsv(na = "NA", trim_ws = TRUE)

 tokenizer_line()

@@ -31,6 +31,9 @@ tokenizer_fwf(begin, end, na = "NA")
 \item{na}{Character vector of strings to use for missing values. Set this
 option to \code{character()} to indicate no missing values.}

+\item{trim_ws}{Should leading and trailing whitespace be trimmed from
+each field before parsing it?}
+
 \item{escape_double}{Does the file escape quotes by doubling them? i.e.
 If this option is \code{TRUE}, the value \code{""""} represents
 a single quote, \code{\"}.}
diff --git a/man/collector.Rd b/man/collector.Rd
index 5d6e3378..ce784b55 100644
--- a/man/collector.Rd
+++ b/man/collector.Rd
@@ -21,37 +21,39 @@
 \usage{
 col_character()

-parse_character(x)
+parse_character(x, na = c("", "NA"))

 col_integer()

-parse_integer(x)
+parse_integer(x, na = c("", "NA"))

 col_double()

-parse_double(x)
+parse_double(x, na = c("", "NA"))

 col_euro_double()

-parse_euro_double(x)
+parse_euro_double(x, na = c("", "NA"))

 col_numeric()

-parse_numeric(x)
+parse_numeric(x, na = c("", "NA"))

 col_logical()

-parse_logical(x)
+parse_logical(x, na = c("", "NA"))

 col_factor(levels, ordered = FALSE)

-parse_factor(x, levels, ordered = FALSE)
+parse_factor(x, levels, ordered = FALSE, na = c("", "NA"))

 col_skip()
 }
 \arguments{
 \item{x}{Character vector of values to parse.}

+\item{na}{Character vector giving strings to treat as missing.}
+
 \item{levels}{Character vector providing set of allowed levels.}

 \item{ordered}{Is it an ordered factor?}
diff --git a/man/parse_vector.Rd b/man/parse_vector.Rd
index 9f450d41..b805a399 100644
--- a/man/parse_vector.Rd
+++ b/man/parse_vector.Rd
@@ -4,18 +4,20 @@
 \alias{parse_vector}
 \title{Parse a character vector.}
 \usage{
-parse_vector(x, collector)
+parse_vector(x, collector, na = "NA")
 }
 \arguments{
 \item{x}{Character vector of elements to parse.}

 \item{collector}{Column specification.}
+
+\item{na}{Character vector giving strings to treat as missing.}
 }
 \description{
 Parse a character vector.
 }
 \examples{
-x <- c("1", "2", "3", NA)
+x <- c("1", "2", "3", "NA")
 parse_vector(x, col_integer())
 parse_vector(x, col_double())
 parse_vector(x, col_character())
@@ -24,6 +26,8 @@ parse_vector(x, col_skip())
 # Invalid values are replaced with missing values with a warning.
 x <- c("1", "2", "3", "-")
 parse_vector(x, col_double())
+# Or flag values as missing
+parse_vector(x, col_double(), na = "-")
 }
 \keyword{internal}

diff --git a/man/read_delim.Rd b/man/read_delim.Rd
index c2b8b475..bc62561a 100644
--- a/man/read_delim.Rd
+++ b/man/read_delim.Rd
@@ -12,13 +12,13 @@ read_delim(file, delim, quote = '\"', escape_backslash = FALSE,
   skip = 0, n_max = -1, progress = interactive())

 read_csv(file, col_names = TRUE, col_types = NULL, na = c("", "NA"),
-  skip = 0, n_max = -1, progress = interactive())
+  trim_ws = TRUE, skip = 0, n_max = -1, progress = interactive())

 read_csv2(file, col_names = TRUE, col_types = NULL, na = c("", "NA"),
-  skip = 0, n_max = -1, progress = interactive())
+  trim_ws = TRUE, skip = 0, n_max = -1, progress = interactive())

 read_tsv(file, col_names = TRUE, col_types = NULL, na = c("", "NA"),
-  skip = 0, n_max = -1, progress = interactive())
+  trim_ws = TRUE, skip = 0, n_max = -1, progress = interactive())
 }
 \arguments{
 \item{file}{Either a path to a file, a connection, or literal data
@@ -82,6 +82,9 @@ option to \code{character()} to indicate no missing values.}
 \item{progress}{Display a progress bar? By default it will only display in
 an interactive session. The display is updated every 50,000 values and will
 only display if estimated reading time is 5 seconds or more.}
+
+\item{trim_ws}{Should leading and trailing whitespace be trimmed from
+each field before parsing it?}
 }
 \value{
 A data frame. If there are parsing problems, a warning tells you
diff --git a/man/type_convert.Rd b/man/type_convert.Rd
index 29d7b1a8..f33a2458 100644
--- a/man/type_convert.Rd
+++ b/man/type_convert.Rd
@@ -38,5 +38,8 @@ df <- data.frame(
 )
 str(df)
 str(type_convert(df))
+
+df <- data.frame(x = c("NA", "10"), stringsAsFactors= FALSE)
+type_convert(df)
 }

diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
index e1c814a8..7f7f0aaf 100644
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
@@ -94,14 +94,15 @@ BEGIN_RCPP
 END_RCPP
 }
 // parse_vector_
-SEXP parse_vector_(CharacterVector x, List collectorSpec);
-RcppExport SEXP readr_parse_vector_(SEXP xSEXP, SEXP collectorSpecSEXP) {
+SEXP parse_vector_(CharacterVector x, List collectorSpec, const std::vector<std::string>& na);
+RcppExport SEXP readr_parse_vector_(SEXP xSEXP, SEXP collectorSpecSEXP, SEXP naSEXP) {
 BEGIN_RCPP
     Rcpp::RObject __result;
     Rcpp::RNGScope __rngScope;
     Rcpp::traits::input_parameter< CharacterVector >::type x(xSEXP);
     Rcpp::traits::input_parameter< List >::type collectorSpec(collectorSpecSEXP);
-    __result = Rcpp::wrap(parse_vector_(x, collectorSpec));
+    Rcpp::traits::input_parameter< const std::vector<std::string>& >::type na(naSEXP);
+    __result = Rcpp::wrap(parse_vector_(x, collectorSpec, na));
     return __result;
 END_RCPP
 }
diff --git a/src/Tokenizer.cpp b/src/Tokenizer.cpp
index 634c0558..ceb58a98 100644
--- a/src/Tokenizer.cpp
+++ b/src/Tokenizer.cpp
@@ -15,11 +15,12 @@ TokenizerPtr Tokenizer::create(List spec) {
     char delim = as<char>(spec["delim"]);
     char quote = as<char>(spec["quote"]);
     std::vector<std::string> na = as<std::vector<std::string> >(spec["na"]);
+    bool trimWs = as<bool>(spec["trim_ws"]);
     bool escapeDouble = as<bool>(spec["escape_double"]);
     bool escapeBackslash = as<bool>(spec["escape_backslash"]);

     return TokenizerPtr(new
-      TokenizerDelim(delim, quote, na, escapeBackslash, escapeDouble)
+      TokenizerDelim(delim, quote, na, trimWs, escapeBackslash, escapeDouble)
     );
   } else if (subclass == "tokenizer_fwf") {
     std::vector<int>
diff --git a/src/TokenizerDelim.h b/src/TokenizerDelim.h
index 2f42859e..a53ee155 100644
--- a/src/TokenizerDelim.h
+++ b/src/TokenizerDelim.h
@@ -20,7 +20,7 @@ class TokenizerDelim : public Tokenizer {

   char delim_, quote_;
   std::vector<std::string> NA_;
-  bool escapeBackslash_, escapeDouble_;
+  bool trimWS_, escapeBackslash_, escapeDouble_;

   SourceIterator begin_, cur_, end_;
   DelimState state_;
@@ -31,10 +31,12 @@

   TokenizerDelim(char delim = ',', char quote = '"',
                  std::vector<std::string> NA = std::vector<std::string>(1, "NA"),
-                 bool escapeBackslash = false, bool escapeDouble = true):
+                 bool trimWS = true, bool escapeBackslash = false,
+                 bool escapeDouble = true):
     delim_(delim),
     quote_(quote),
     NA_(NA),
+    trimWS_(trimWS),
     escapeBackslash_(escapeBackslash),
     escapeDouble_(escapeDouble),
     moreTokens_(false)
@@ -211,13 +213,19 @@ class TokenizerDelim : public Tokenizer {

   Token fieldToken(SourceIterator begin, SourceIterator end, bool hasEscapeB,
                    int row, int col) {
-    return Token(begin, end, row, col, (hasEscapeB) ? this : NULL).flagNA(NA_);
+    Token t(begin, end, row, col, (hasEscapeB) ? this : NULL);
+    t.flagNA(NA_);
+    if (trimWS_)
+      t.trim();
+    return t;
   }

   Token stringToken(SourceIterator begin, SourceIterator end, bool hasEscapeB,
                     bool hasEscapeD, int row, int col) {
-    return Token(begin, end, row, col,
-                 (hasEscapeD || hasEscapeB) ? this : NULL);
+    Token t(begin, end, row, col, (hasEscapeD || hasEscapeB) ? this : NULL);
+    if (trimWS_)
+      t.trim();
+    return t;
   }

diff --git a/src/parse.cpp b/src/parse.cpp
index 475c5051..59df7b5e 100644
--- a/src/parse.cpp
+++ b/src/parse.cpp
@@ -74,7 +74,8 @@ RObject tokenize_(List sourceSpec, List tokenizerSpec, int n_max) {
 }

 // [[Rcpp::export]]
-SEXP parse_vector_(CharacterVector x, List collectorSpec) {
+SEXP parse_vector_(CharacterVector x, List collectorSpec,
+                   const std::vector<std::string>& na) {
   Warnings warnings;
   int n = x.size();

@@ -89,6 +90,8 @@ SEXP parse_vector_(CharacterVector x, List collectorSpec) {
     } else {
       SEXP string = x[i];
       t = Token(CHAR(string), CHAR(string) + Rf_length(string), i, -1);
+      t.flagNA(na);
+      t.trim();
     }
     col->setValue(i, t);
   }
diff --git a/tests/testthat/test-parsing-character.R b/tests/testthat/test-parsing-character.R
new file mode 100644
index 00000000..ff621a47
--- /dev/null
+++ b/tests/testthat/test-parsing-character.R
@@ -0,0 +1,12 @@
+context("Parsing, character")
+
+test_that("ws dropped by default", {
+  df <- read_csv("x\n a \n b\n")
+  expect_equal(df$x, c("a", "b"))
+})
+
+test_that("trim_ws = FALSE keeps ws", {
+  df <- read_csv("x\n a\nb \n", trim_ws = FALSE)
+  expect_equal(df$x, c(" a", "b "))
+})
+
diff --git a/tests/testthat/test-parsing-numeric.R b/tests/testthat/test-parsing-numeric.R
index ce8640cb..4b57f219 100644
--- a/tests/testthat/test-parsing-numeric.R
+++ b/tests/testthat/test-parsing-numeric.R
@@ -9,3 +9,12 @@ test_that("partial integer/double matches fail", {
   expect_true(is.na(parse_double("3d")))
   expect_true(is.na(parse_integer("3d")))
 })
+
+test_that("parse functions converts NAs", {
+  expect_equal(parse_double(c("1.5", "NA")), c(1.5, NA))
+})
+
+test_that("leading/trailing ws ignored when parsing", {
+  expect_equal(parse_double(c(" 1.5", "1.5", "1.5 ")), rep(1.5, 3))
+  expect_equal(read_csv("x\n 1.5\n1.5\n1.5 \n")$x, rep(1.5, 3))
+})
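
Usage notes (not part of the patch): a quick R sketch of the two behaviours this commit adds, taken from the roxygen examples and tests in the diff above. It assumes a readr build that includes this commit; all function and argument names (parse_double, parse_vector, col_double, read_csv, na, trim_ws) come straight from the diff.

    library(readr)

    # parse_*() now take `na` (default c("", "NA")); matching strings become NA
    parse_double(c("1.5", "NA"))                        # 1.5 NA
    parse_vector(c("1", "2", "3", "-"), col_double(), na = "-")

    # read_csv()/read_csv2()/read_tsv() now take `trim_ws` (default TRUE):
    # leading/trailing whitespace is stripped from each field before parsing
    read_csv("x\n a \n b\n")$x                          # "a" "b"
    read_csv("x\n a\nb \n", trim_ws = FALSE)$x          # " a" "b "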