Allow multiple comment patterns
This commit allows users to define multiple comment patterns using a character vector.

Additionally, comment detection is now handled exclusively by the datasource; previously it was split between the datasource and the tokenizer.

The `comment` argument in the tokenizer functions is deprecated and will be removed in future versions.
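
For example (an illustrative sketch; `csv` is made-up data, and this assumes a readr build containing this commit):

```r
library(readr)

# Two comment styles in one input; lines beginning with either marker
# are now dropped by the datasource before the tokenizer sees them.
csv <- "// machine-generated\n# edit with care\nx,y\n1,2\n3,4\n"
read_csv(csv, comment = c("//", "#"))  # a 2-row, 2-column tibble
```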
zeehio committed Dec 17, 2017
1 parent 1323253 commit b5760c3
Showing 35 changed files with 318 additions and 257 deletions.
6 changes: 6 additions & 0 deletions NEWS.md
@@ -31,6 +31,12 @@
* Allow files to be read via FTP over SSH by recognising `sftp` as a URL protocol (#707, @jdeboer).
* `read_*()` now converts string `file`s to UTF-8 before parsing, which is convenient for non-UTF-8 platforms
in most cases (#730, @yutannihilation).

* Comments are now handled by the `datasource` instead of by the `tokenizer` (@zeehio, #766)

* Allow a character vector of comments. Using `comment = c("//", "#")` will
skip all the lines starting with either `//` or `#`. (@zeehio, #766)

# readr 1.1.1

* Point release for test compatibility with tibble v1.3.1.
16 changes: 8 additions & 8 deletions R/RcppExports.R
@@ -5,14 +5,6 @@ collectorGuess <- function(input, locale_) {
.Call(`_readr_collectorGuess`, input, locale_)
}

source_encoding <- function(spec) {
.Call(`_readr_source_encoding`, spec)
}

whitespaceColumns <- function(sourceSpec, n = 100L, comment = "") {
.Call(`_readr_whitespaceColumns`, sourceSpec, n, comment)
}

read_connection_ <- function(con, chunk_size = 64 * 1024L) {
.Call(`_readr_read_connection_`, con, chunk_size)
}
@@ -73,6 +65,14 @@ guess_types_ <- function(sourceSpec, tokenizerSpec, locale_, n = 100L) {
.Call(`_readr_guess_types_`, sourceSpec, tokenizerSpec, locale_, n)
}

source_encoding <- function(spec) {
.Call(`_readr_source_encoding`, spec)
}

whitespaceColumns <- function(sourceSpec, n = 100L) {
.Call(`_readr_whitespaceColumns`, sourceSpec, n)
}

type_convert_col <- function(x, spec, locale_, col, na, trim_ws) {
.Call(`_readr_type_convert_col`, x, spec, locale_, col, na, trim_ws)
}
8 changes: 4 additions & 4 deletions R/read_delim.R
@@ -101,7 +101,7 @@ read_delim <- function(file, delim, quote = '"',
}
tokenizer <- tokenizer_delim(delim, quote = quote,
escape_backslash = escape_backslash, escape_double = escape_double,
na = na, quoted_na = quoted_na, comment = comment, trim_ws = trim_ws)
na = na, quoted_na = quoted_na, trim_ws = trim_ws)
read_delimited(file, tokenizer, col_names = col_names, col_types = col_types,
locale = locale, skip = skip, comment = comment, n_max = n_max, guess_max =
guess_max, progress = progress)
@@ -115,7 +115,7 @@ read_csv <- function(file, col_names = TRUE, col_types = NULL,
skip = 0, n_max = Inf, guess_max = min(1000, n_max),
progress = show_progress()) {
tokenizer <- tokenizer_csv(na = na, quoted_na = quoted_na, quote = quote,
comment = comment, trim_ws = trim_ws)
trim_ws = trim_ws)
read_delimited(file, tokenizer, col_names = col_names, col_types = col_types,
locale = locale, skip = skip, comment = comment, n_max = n_max, guess_max =
guess_max, progress = progress)
@@ -135,7 +135,7 @@ read_csv2 <- function(file, col_names = TRUE, col_types = NULL,
locale$grouping_mark <- "."
}
tokenizer <- tokenizer_delim(delim = ";", na = na, quoted_na = quoted_na,
quote = quote, comment = comment, trim_ws = trim_ws)
quote = quote, trim_ws = trim_ws)
read_delimited(file, tokenizer, col_names = col_names, col_types = col_types,
locale = locale, skip = skip, comment = comment, n_max = n_max,
guess_max = guess_max, progress = progress)
@@ -150,7 +150,7 @@ read_tsv <- function(file, col_names = TRUE, col_types = NULL,
comment = "", trim_ws = TRUE, skip = 0, n_max = Inf,
guess_max = min(1000, n_max), progress = show_progress()) {
tokenizer <- tokenizer_tsv(na = na, quoted_na = quoted_na, quote = quote,
comment = comment, trim_ws = trim_ws)
trim_ws = trim_ws)
read_delimited(file, tokenizer, col_names = col_names, col_types = col_types,
locale = locale, skip = skip, comment = comment, n_max = n_max,
guess_max = guess_max, progress = progress)
8 changes: 4 additions & 4 deletions R/read_fwf.R
@@ -37,12 +37,12 @@ read_fwf <- function(file, col_positions, col_types = NULL,
locale = default_locale(), na = c("", "NA"),
comment = "", trim_ws = TRUE, skip = 0, n_max = Inf,
guess_max = min(n_max, 1000), progress = show_progress()) {
ds <- datasource(file, skip = skip, encoding = locale$encoding)
ds <- datasource(file, skip = skip, comment = comment, encoding = locale$encoding)
if (inherits(ds, "source_file") && empty_file(file)) {
return(tibble::tibble())
}

tokenizer <- tokenizer_fwf(col_positions$begin, col_positions$end, na = na, comment = comment, trim_ws = trim_ws)
tokenizer <- tokenizer_fwf(col_positions$begin, col_positions$end, na = na, trim_ws = trim_ws)

spec <- col_spec_standardise(
file,
@@ -73,9 +73,9 @@ read_fwf <- function(file, col_positions, col_types = NULL,
#' @param n Number of lines the tokenizer will read to determine file structure. By default
#' it is set to 100.
fwf_empty <- function(file, skip = 0, col_names = NULL, comment = "", n = 100L, encoding = "UTF-8") {
ds <- datasource(file, skip = skip, encoding = encoding)
ds <- datasource(file, skip = skip, comment = comment, encoding = encoding)

out <- whitespaceColumns(ds, comment = comment, n = n)
out <- whitespaceColumns(ds, n = n)
out$end[length(out$end)] <- NA

col_names <- fwf_col_names(col_names, length(out$begin))
11 changes: 6 additions & 5 deletions R/read_table.R
@@ -36,19 +36,20 @@ read_table <- function(file, col_names = TRUE, col_types = NULL,
locale = default_locale(), na = "NA", skip = 0,
n_max = Inf, guess_max = min(n_max, 1000),
progress = show_progress(), comment = "") {
ds <- datasource(file, skip = skip, encoding = locale$encoding)
ds <- datasource(file, skip = skip, comment = comment, encoding = locale$encoding)
columns <- fwf_empty(ds, skip = skip, n = guess_max, comment = comment)
skip <- skip + columns$skip

tokenizer <- tokenizer_fwf(columns$begin, columns$end, na = na, comment = comment)
tokenizer <- tokenizer_fwf(columns$begin, columns$end, na = na)

spec <- col_spec_standardise(
file = ds, skip = skip, guess_max = guess_max,
col_names = col_names, col_types = col_types,
comment = comment,
locale = locale, tokenizer = tokenizer
)

ds <- datasource(file = ds, skip = skip + isTRUE(col_names), encoding = locale$encoding)
ds <- datasource(file = ds, skip = skip + isTRUE(col_names), comment = comment,
encoding = locale$encoding)
if (is.null(col_types) && !inherits(ds, "source_string")) {
show_cols_spec(spec)
}
@@ -67,7 +68,7 @@ read_table2 <- function(file, col_names = TRUE, col_types = NULL,
n_max = Inf, guess_max = min(n_max, 1000),
progress = show_progress(), comment = "") {

tokenizer <- tokenizer_ws(na = na, comment = comment)
tokenizer <- tokenizer_ws(na = na)
read_delimited(file, tokenizer, col_names = col_names, col_types = col_types,
locale = locale, skip = skip, comment = comment, n_max = n_max, guess_max =
guess_max, progress = progress)
3 changes: 3 additions & 0 deletions R/source.R
@@ -16,6 +16,9 @@
#' Using a value of [clipboard()] will read from the system clipboard.
#'
#' @param skip Number of lines to skip before reading data.
#' @param comment A string used to identify comments. Any text after the
#' comment characters will be silently ignored. Multiple comments can be given
#' using a character vector.
#' @param encoding The text encoding of the data given in `file`. "UTF-8" as default.
#' @keywords internal
#' @export
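A sketch of the internal plumbing (the file below is hypothetical, and `datasource()` is not exported, so `:::` is used purely for illustration):

```r
library(readr)

# Hypothetical input mixing two comment markers.
path <- tempfile(fileext = ".csv")
writeLines(c("// generated", "# do not edit", "x,y", "1,2"), path)

# The datasource now owns the comment patterns; reading functions
# forward their `comment` argument here rather than to the tokenizer.
ds <- readr:::datasource(path, skip = 0, comment = c("//", "#"),
                         encoding = "UTF-8")
```
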
26 changes: 19 additions & 7 deletions R/tokenizer.R
@@ -35,8 +35,8 @@ NULL

#' @export
#' @rdname Tokenizers
#' @param comment A string used to identify comments. Any text after the
#' comment characters will be silently ignored.
#' @param comment This argument is deprecated and will be ignored. Comments are
#' now handled by [datasource()].
#' @param na Character vector of strings to use for missing values. Set this
#' option to `character()` to indicate no missing values.
#' @param quoted_na Should missing values inside quotes be treated as missing
@@ -56,13 +56,15 @@ tokenizer_delim <- function(delim, quote = '"', na = "NA", quoted_na = TRUE, com
trim_ws = TRUE,
escape_double = TRUE,
escape_backslash = FALSE) {
if (!missing(comment)) {
warning("comment argument in tokenizer is deprecated and will be ignored. Give it to the datasource instead.")
}
structure(
list(
delim = delim,
quote = quote,
na = na,
quoted_na = quoted_na,
comment = comment,
trim_ws = trim_ws,
escape_double = escape_double,
escape_backslash = escape_backslash
@@ -75,12 +77,14 @@
#' @rdname Tokenizers
tokenizer_csv <- function(na = "NA", quoted_na = TRUE, quote = "\"",
comment = "", trim_ws = TRUE) {
if (!missing(comment)) {
warning("comment argument in tokenizer is deprecated and will be ignored. Give it to the datasource instead.")
}
tokenizer_delim(
delim = ",",
na = na,
quoted_na = quoted_na,
quote = quote,
comment = comment,
trim_ws = trim_ws,
escape_double = TRUE,
escape_backslash = FALSE
@@ -91,12 +95,14 @@
#' @rdname Tokenizers
tokenizer_tsv <- function(na = "NA", quoted_na = TRUE, quote = "\"",
comment = "", trim_ws = TRUE) {
if (!missing(comment)) {
warning("comment argument in tokenizer is deprecated and will be ignored. Give it to the datasource instead.")
}
tokenizer_delim(
delim = "\t",
na = na,
quoted_na = quoted_na,
quote = quote,
comment = comment,
trim_ws = trim_ws,
escape_double = TRUE,
escape_backslash = FALSE
@@ -122,11 +128,17 @@ tokenizer_log <- function() {
#' offsets so the first column is column zero, and the ranges are
#' [begin, end) (i.e. inclusive-exclusive).
tokenizer_fwf <- function(begin, end, na = "NA", comment = "", trim_ws = TRUE) {
structure(list(begin = begin, end = end, na = na, comment = comment, trim_ws = trim_ws), class = "tokenizer_fwf")
if (!missing(comment)) {
warning("comment argument in tokenizer is deprecated and will be ignored. Give it to the datasource instead.")
}
structure(list(begin = begin, end = end, na = na, trim_ws = trim_ws), class = "tokenizer_fwf")
}

#' @export
#' @rdname Tokenizers
tokenizer_ws <- function(na = "NA", comment = "") {
structure(list(na = na, comment = comment), class = "tokenizer_ws")
if (!missing(comment)) {
warning("comment argument in tokenizer is deprecated and will be ignored. Give it to the datasource instead.")
}
structure(list(na = na), class = "tokenizer_ws")
}
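
The migration this implies, in sketch form: giving `comment` to a tokenizer constructor now only raises the deprecation warning, while the reading functions keep accepting it and forward it to the datasource.

```r
library(readr)

# Deprecated: warns, and the argument is ignored.
tok <- tokenizer_csv(comment = "#")

# Supported: the reader passes `comment` through to the datasource.
read_csv("# a header comment\nx,y\n1,2\n", comment = "#")
```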
4 changes: 2 additions & 2 deletions man/Tokenizers.Rd

4 changes: 4 additions & 0 deletions man/datasource.Rd

3 changes: 2 additions & 1 deletion man/read_delim.Rd

3 changes: 2 additions & 1 deletion man/read_delim_chunked.Rd

3 changes: 2 additions & 1 deletion man/read_fwf.Rd

3 changes: 2 additions & 1 deletion man/read_table.Rd

3 changes: 2 additions & 1 deletion man/spec_delim.Rd

2 changes: 1 addition & 1 deletion notes/design.Rmd
@@ -99,7 +99,7 @@ The tokenizer (`Tokenizer.h`) turns a source (a stream of characters) into a str
TokenizerPtr tokenizer = Tokenizer::create(tokenizerSpec);

# Initialise it with a source
tokenizer->tokenize(source->begin(), source->end());
tokenizer->tokenize(source);

# Call nextToken until there are no tokens left
for (Token t = tokenizer->nextToken(); t.type() != TOKEN_EOF; t = tokenizer->nextToken());
Expand Down
51 changes: 25 additions & 26 deletions src/RcppExports.cpp
@@ -17,30 +17,6 @@ BEGIN_RCPP
return rcpp_result_gen;
END_RCPP
}
// source_encoding
CharacterVector source_encoding(List spec);
RcppExport SEXP _readr_source_encoding(SEXP specSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< List >::type spec(specSEXP);
rcpp_result_gen = Rcpp::wrap(source_encoding(spec));
return rcpp_result_gen;
END_RCPP
}
// whitespaceColumns
List whitespaceColumns(List sourceSpec, int n, std::string comment);
RcppExport SEXP _readr_whitespaceColumns(SEXP sourceSpecSEXP, SEXP nSEXP, SEXP commentSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< List >::type sourceSpec(sourceSpecSEXP);
Rcpp::traits::input_parameter< int >::type n(nSEXP);
Rcpp::traits::input_parameter< std::string >::type comment(commentSEXP);
rcpp_result_gen = Rcpp::wrap(whitespaceColumns(sourceSpec, n, comment));
return rcpp_result_gen;
END_RCPP
}
// read_connection_
RawVector read_connection_(RObject con, int chunk_size);
RcppExport SEXP _readr_read_connection_(SEXP conSEXP, SEXP chunk_sizeSEXP) {
@@ -250,6 +226,29 @@ BEGIN_RCPP
return rcpp_result_gen;
END_RCPP
}
// source_encoding
CharacterVector source_encoding(List spec);
RcppExport SEXP _readr_source_encoding(SEXP specSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< List >::type spec(specSEXP);
rcpp_result_gen = Rcpp::wrap(source_encoding(spec));
return rcpp_result_gen;
END_RCPP
}
// whitespaceColumns
List whitespaceColumns(List sourceSpec, int n);
RcppExport SEXP _readr_whitespaceColumns(SEXP sourceSpecSEXP, SEXP nSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< List >::type sourceSpec(sourceSpecSEXP);
Rcpp::traits::input_parameter< int >::type n(nSEXP);
rcpp_result_gen = Rcpp::wrap(whitespaceColumns(sourceSpec, n));
return rcpp_result_gen;
END_RCPP
}
// type_convert_col
RObject type_convert_col(CharacterVector x, List spec, List locale_, int col, const std::vector<std::string>& na, bool trim_ws);
RcppExport SEXP _readr_type_convert_col(SEXP xSEXP, SEXP specSEXP, SEXP locale_SEXP, SEXP colSEXP, SEXP naSEXP, SEXP trim_wsSEXP) {
@@ -332,8 +331,6 @@ END_RCPP

static const R_CallMethodDef CallEntries[] = {
{"_readr_collectorGuess", (DL_FUNC) &_readr_collectorGuess, 2},
{"_readr_source_encoding", (DL_FUNC) &_readr_source_encoding, 1},
{"_readr_whitespaceColumns", (DL_FUNC) &_readr_whitespaceColumns, 3},
{"_readr_read_connection_", (DL_FUNC) &_readr_read_connection_, 2},
{"_readr_utctime", (DL_FUNC) &_readr_utctime, 7},
{"_readr_dim_tokens_", (DL_FUNC) &_readr_dim_tokens_, 2},
@@ -349,6 +346,8 @@ static const R_CallMethodDef CallEntries[] = {
{"_readr_read_tokens_", (DL_FUNC) &_readr_read_tokens_, 7},
{"_readr_read_tokens_chunked_", (DL_FUNC) &_readr_read_tokens_chunked_, 8},
{"_readr_guess_types_", (DL_FUNC) &_readr_guess_types_, 4},
{"_readr_source_encoding", (DL_FUNC) &_readr_source_encoding, 1},
{"_readr_whitespaceColumns", (DL_FUNC) &_readr_whitespaceColumns, 2},
{"_readr_type_convert_col", (DL_FUNC) &_readr_type_convert_col, 6},
{"_readr_write_lines_", (DL_FUNC) &_readr_write_lines_, 4},
{"_readr_write_lines_raw_", (DL_FUNC) &_readr_write_lines_raw_, 3},
2 changes: 1 addition & 1 deletion src/Reader.cpp
@@ -30,7 +30,7 @@ Reader::Reader(
}

void Reader::init(CharacterVector colNames) {
tokenizer_->tokenize(source_->begin(), source_->end());
tokenizer_->tokenize(source_);
tokenizer_->setWarnings(&warnings_);

// Work out which output columns we are keeping and set warnings for each