Allow multiple comment patterns
This commit allows users to define multiple comment patterns using a character vector.

Additionally, comment detection is now handled exclusively by the datasource; previously it was split between the datasource and the tokenizer.

The `comment` argument in the tokenizer functions is deprecated and will be removed in future versions.
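
For example (an illustrative sketch; `csv` is made-up data, and this assumes a readr build containing this commit):

```r
library(readr)

# Two comment styles in one input; lines beginning with either marker
# are now dropped by the datasource before the tokenizer sees them.
csv <- "// machine-generated\n# edit with care\nx,y\n1,2\n3,4\n"
read_csv(csv, comment = c("//", "#"))  # a 2-row, 2-column tibble
```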
zeehio committed Dec 17, 2017
1 parent 1323253 commit b5760c3
Showing 35 changed files with 318 additions and 257 deletions.
6 changes: 6 additions & 0 deletions NEWS.md
@@ -31,6 +31,12 @@
* Allow files to be read via FTP over SSH by recognising `sftp` as a URL protocol (#707, @jdeboer).
* `read_*()` now converts string `file`s to UTF-8 before parsing, which is convenient for non-UTF-8 platforms
in most cases (#730, @yutannihilation).

* Comments are now handled by the `datasource` instead of by the `tokenizer` (@zeehio, #766)

* Allow a character vector of comments. Using `comment = c("//", "#")` will
skip all the lines starting with either `//` or `#`. (@zeehio, #766)

# readr 1.1.1

* Point release for test compatibility with tibble v1.3.1.
16 changes: 8 additions & 8 deletions R/RcppExports.R
@@ -5,14 +5,6 @@ collectorGuess <- function(input, locale_) {
.Call(`_readr_collectorGuess`, input, locale_)
}

source_encoding <- function(spec) {
.Call(`_readr_source_encoding`, spec)
}

whitespaceColumns <- function(sourceSpec, n = 100L, comment = "") {
.Call(`_readr_whitespaceColumns`, sourceSpec, n, comment)
}

read_connection_ <- function(con, chunk_size = 64 * 1024L) {
.Call(`_readr_read_connection_`, con, chunk_size)
}
@@ -73,6 +65,14 @@ guess_types_ <- function(sourceSpec, tokenizerSpec, locale_, n = 100L) {
.Call(`_readr_guess_types_`, sourceSpec, tokenizerSpec, locale_, n)
}

source_encoding <- function(spec) {
.Call(`_readr_source_encoding`, spec)
}

whitespaceColumns <- function(sourceSpec, n = 100L) {
.Call(`_readr_whitespaceColumns`, sourceSpec, n)
}

type_convert_col <- function(x, spec, locale_, col, na, trim_ws) {
.Call(`_readr_type_convert_col`, x, spec, locale_, col, na, trim_ws)
}
8 changes: 4 additions & 4 deletions R/read_delim.R
@@ -101,7 +101,7 @@ read_delim <- function(file, delim, quote = '"',
}
tokenizer <- tokenizer_delim(delim, quote = quote,
escape_backslash = escape_backslash, escape_double = escape_double,
na = na, quoted_na = quoted_na, comment = comment, trim_ws = trim_ws)
na = na, quoted_na = quoted_na, trim_ws = trim_ws)
read_delimited(file, tokenizer, col_names = col_names, col_types = col_types,
locale = locale, skip = skip, comment = comment, n_max = n_max, guess_max =
guess_max, progress = progress)
@@ -115,7 +115,7 @@ read_csv <- function(file, col_names = TRUE, col_types = NULL,
skip = 0, n_max = Inf, guess_max = min(1000, n_max),
progress = show_progress()) {
tokenizer <- tokenizer_csv(na = na, quoted_na = quoted_na, quote = quote,
comment = comment, trim_ws = trim_ws)
trim_ws = trim_ws)
read_delimited(file, tokenizer, col_names = col_names, col_types = col_types,
locale = locale, skip = skip, comment = comment, n_max = n_max, guess_max =
guess_max, progress = progress)
@@ -135,7 +135,7 @@ read_csv2 <- function(file, col_names = TRUE, col_types = NULL,
locale$grouping_mark <- "."
}
tokenizer <- tokenizer_delim(delim = ";", na = na, quoted_na = quoted_na,
quote = quote, comment = comment, trim_ws = trim_ws)
quote = quote, trim_ws = trim_ws)
read_delimited(file, tokenizer, col_names = col_names, col_types = col_types,
locale = locale, skip = skip, comment = comment, n_max = n_max,
guess_max = guess_max, progress = progress)
@@ -150,7 +150,7 @@ read_tsv <- function(file, col_names = TRUE, col_types = NULL,
comment = "", trim_ws = TRUE, skip = 0, n_max = Inf,
guess_max = min(1000, n_max), progress = show_progress()) {
tokenizer <- tokenizer_tsv(na = na, quoted_na = quoted_na, quote = quote,
comment = comment, trim_ws = trim_ws)
trim_ws = trim_ws)
read_delimited(file, tokenizer, col_names = col_names, col_types = col_types,
locale = locale, skip = skip, comment = comment, n_max = n_max,
guess_max = guess_max, progress = progress)
8 changes: 4 additions & 4 deletions R/read_fwf.R
@@ -37,12 +37,12 @@ read_fwf <- function(file, col_positions, col_types = NULL,
locale = default_locale(), na = c("", "NA"),
comment = "", trim_ws = TRUE, skip = 0, n_max = Inf,
guess_max = min(n_max, 1000), progress = show_progress()) {
ds <- datasource(file, skip = skip, encoding = locale$encoding)
ds <- datasource(file, skip = skip, comment = comment, encoding = locale$encoding)
if (inherits(ds, "source_file") && empty_file(file)) {
return(tibble::tibble())
}

tokenizer <- tokenizer_fwf(col_positions$begin, col_positions$end, na = na, comment = comment, trim_ws = trim_ws)
tokenizer <- tokenizer_fwf(col_positions$begin, col_positions$end, na = na, trim_ws = trim_ws)

spec <- col_spec_standardise(
file,
@@ -73,9 +73,9 @@ read_fwf <- function(file, col_positions, col_types = NULL,
#' @param n Number of lines the tokenizer will read to determine file structure. By default
#' it is set to 100.
fwf_empty <- function(file, skip = 0, col_names = NULL, comment = "", n = 100L, encoding = "UTF-8") {
ds <- datasource(file, skip = skip, encoding = encoding)
ds <- datasource(file, skip = skip, comment = comment, encoding = encoding)

out <- whitespaceColumns(ds, comment = comment, n = n)
out <- whitespaceColumns(ds, n = n)
out$end[length(out$end)] <- NA

col_names <- fwf_col_names(col_names, length(out$begin))
11 changes: 6 additions & 5 deletions R/read_table.R
@@ -36,19 +36,20 @@ read_table <- function(file, col_names = TRUE, col_types = NULL,
locale = default_locale(), na = "NA", skip = 0,
n_max = Inf, guess_max = min(n_max, 1000),
progress = show_progress(), comment = "") {
ds <- datasource(file, skip = skip, encoding = locale$encoding)
ds <- datasource(file, skip = skip, comment = comment, encoding = locale$encoding)
columns <- fwf_empty(ds, skip = skip, n = guess_max, comment = comment)
skip <- skip + columns$skip

tokenizer <- tokenizer_fwf(columns$begin, columns$end, na = na, comment = comment)
tokenizer <- tokenizer_fwf(columns$begin, columns$end, na = na)

spec <- col_spec_standardise(
file = ds, skip = skip, guess_max = guess_max,
col_names = col_names, col_types = col_types,
comment = comment,
locale = locale, tokenizer = tokenizer
)

ds <- datasource(file = ds, skip = skip + isTRUE(col_names), encoding = locale$encoding)
ds <- datasource(file = ds, skip = skip + isTRUE(col_names), comment = comment,
encoding = locale$encoding)
if (is.null(col_types) && !inherits(ds, "source_string")) {
show_cols_spec(spec)
}
@@ -67,7 +68,7 @@ read_table2 <- function(file, col_names = TRUE, col_types = NULL,
n_max = Inf, guess_max = min(n_max, 1000),
progress = show_progress(), comment = "") {

tokenizer <- tokenizer_ws(na = na, comment = comment)
tokenizer <- tokenizer_ws(na = na)
read_delimited(file, tokenizer, col_names = col_names, col_types = col_types,
locale = locale, skip = skip, comment = comment, n_max = n_max, guess_max =
guess_max, progress = progress)
3 changes: 3 additions & 0 deletions R/source.R
@@ -16,6 +16,9 @@
#' Using a value of [clipboard()] will read from the system clipboard.
#'
#' @param skip Number of lines to skip before reading data.
#' @param comment A string used to identify comments. Any text after the
#' comment characters will be silently ignored. Multiple comments can be given
#' using a character vector.
#' @param encoding The text encoding of the data given in `file`. "UTF-8" as default.
#' @keywords internal
#' @export
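A sketch of the internal plumbing (the file below is hypothetical, and `datasource()` is not exported, so `:::` is used purely for illustration):

```r
library(readr)

# Hypothetical input mixing two comment markers.
path <- tempfile(fileext = ".csv")
writeLines(c("// generated", "# do not edit", "x,y", "1,2"), path)

# The datasource now owns the comment patterns; reading functions
# forward their `comment` argument here rather than to the tokenizer.
ds <- readr:::datasource(path, skip = 0, comment = c("//", "#"),
                         encoding = "UTF-8")
```
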
26 changes: 19 additions & 7 deletions R/tokenizer.R
@@ -35,8 +35,8 @@ NULL

#' @export
#' @rdname Tokenizers
#' @param comment A string used to identify comments. Any text after the
#' comment characters will be silently ignored.
#' @param comment This argument is deprecated and will be ignored. Comments are
#' now handled by [datasource()].
#' @param na Character vector of strings to use for missing values. Set this
#' option to `character()` to indicate no missing values.
#' @param quoted_na Should missing values inside quotes be treated as missing
@@ -56,13 +56,15 @@ tokenizer_delim <- function(delim, quote = '"', na = "NA", quoted_na = TRUE, com
trim_ws = TRUE,
escape_double = TRUE,
escape_backslash = FALSE) {
if (!missing(comment)) {
warning("comment argument in tokenizer is deprecated and will be ignored. Give it to the datasource instead.")
}
structure(
list(
delim = delim,
quote = quote,
na = na,
quoted_na = quoted_na,
comment = comment,
trim_ws = trim_ws,
escape_double = escape_double,
escape_backslash = escape_backslash
@@ -75,12 +77,14 @@
#' @rdname Tokenizers
tokenizer_csv <- function(na = "NA", quoted_na = TRUE, quote = "\"",
comment = "", trim_ws = TRUE) {
if (!missing(comment)) {
warning("comment argument in tokenizer is deprecated and will be ignored. Give it to the datasource instead.")
}
tokenizer_delim(
delim = ",",
na = na,
quoted_na = quoted_na,
quote = quote,
comment = comment,
trim_ws = trim_ws,
escape_double = TRUE,
escape_backslash = FALSE
@@ -91,12 +95,14 @@
#' @rdname Tokenizers
tokenizer_tsv <- function(na = "NA", quoted_na = TRUE, quote = "\"",
comment = "", trim_ws = TRUE) {
if (!missing(comment)) {
warning("comment argument in tokenizer is deprecated and will be ignored. Give it to the datasource instead.")
}
tokenizer_delim(
delim = "\t",
na = na,
quoted_na = quoted_na,
quote = quote,
comment = comment,
trim_ws = trim_ws,
escape_double = TRUE,
escape_backslash = FALSE
@@ -122,11 +128,17 @@ tokenizer_log <- function() {
#' offsets so the first column is column zero, and the ranges are
#' [begin, end) (i.e. inclusive-exclusive).
tokenizer_fwf <- function(begin, end, na = "NA", comment = "", trim_ws = TRUE) {
structure(list(begin = begin, end = end, na = na, comment = comment, trim_ws = trim_ws), class = "tokenizer_fwf")
if (!missing(comment)) {
warning("comment argument in tokenizer is deprecated and will be ignored. Give it to the datasource instead.")
}
structure(list(begin = begin, end = end, na = na, trim_ws = trim_ws), class = "tokenizer_fwf")
}

#' @export
#' @rdname Tokenizers
tokenizer_ws <- function(na = "NA", comment = "") {
structure(list(na = na, comment = comment), class = "tokenizer_ws")
if (!missing(comment)) {
warning("comment argument in tokenizer is deprecated and will be ignored. Give it to the datasource instead.")
}
structure(list(na = na), class = "tokenizer_ws")
}
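
The migration this implies, in sketch form: giving `comment` to a tokenizer constructor now only raises the deprecation warning, while the reading functions keep accepting it and forward it to the datasource.

```r
library(readr)

# Deprecated: warns, and the argument is ignored.
tok <- tokenizer_csv(comment = "#")

# Supported: the reader passes `comment` through to the datasource.
read_csv("# a header comment\nx,y\n1,2\n", comment = "#")
```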
4 changes: 2 additions & 2 deletions man/Tokenizers.Rd

4 changes: 4 additions & 0 deletions man/datasource.Rd

3 changes: 2 additions & 1 deletion man/read_delim.Rd

3 changes: 2 additions & 1 deletion man/read_delim_chunked.Rd

3 changes: 2 additions & 1 deletion man/read_fwf.Rd

3 changes: 2 additions & 1 deletion man/read_table.Rd

3 changes: 2 additions & 1 deletion man/spec_delim.Rd

2 changes: 1 addition & 1 deletion notes/design.Rmd
@@ -99,7 +99,7 @@ The tokenizer (`Tokenizer.h`) turns a source (a stream of characters) into a str
TokenizerPtr tokenizer = Tokenizer::create(tokenizerSpec);

# Initialise it with a source
tokenizer->tokenize(source->begin(), source->end());
tokenizer->tokenize(source);

# Call nextToken until there are no tokens left
for (Token t = tokenizer->nextToken(); t.type() != TOKEN_EOF; t = tokenizer->nextToken());
Expand Down
51 changes: 25 additions & 26 deletions src/RcppExports.cpp
@@ -17,30 +17,6 @@ BEGIN_RCPP
return rcpp_result_gen;
END_RCPP
}
// source_encoding
CharacterVector source_encoding(List spec);
RcppExport SEXP _readr_source_encoding(SEXP specSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< List >::type spec(specSEXP);
rcpp_result_gen = Rcpp::wrap(source_encoding(spec));
return rcpp_result_gen;
END_RCPP
}
// whitespaceColumns
List whitespaceColumns(List sourceSpec, int n, std::string comment);
RcppExport SEXP _readr_whitespaceColumns(SEXP sourceSpecSEXP, SEXP nSEXP, SEXP commentSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< List >::type sourceSpec(sourceSpecSEXP);
Rcpp::traits::input_parameter< int >::type n(nSEXP);
Rcpp::traits::input_parameter< std::string >::type comment(commentSEXP);
rcpp_result_gen = Rcpp::wrap(whitespaceColumns(sourceSpec, n, comment));
return rcpp_result_gen;
END_RCPP
}
// read_connection_
RawVector read_connection_(RObject con, int chunk_size);
RcppExport SEXP _readr_read_connection_(SEXP conSEXP, SEXP chunk_sizeSEXP) {
@@ -250,6 +226,29 @@ BEGIN_RCPP
return rcpp_result_gen;
END_RCPP
}
// source_encoding
CharacterVector source_encoding(List spec);
RcppExport SEXP _readr_source_encoding(SEXP specSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< List >::type spec(specSEXP);
rcpp_result_gen = Rcpp::wrap(source_encoding(spec));
return rcpp_result_gen;
END_RCPP
}
// whitespaceColumns
List whitespaceColumns(List sourceSpec, int n);
RcppExport SEXP _readr_whitespaceColumns(SEXP sourceSpecSEXP, SEXP nSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< List >::type sourceSpec(sourceSpecSEXP);
Rcpp::traits::input_parameter< int >::type n(nSEXP);
rcpp_result_gen = Rcpp::wrap(whitespaceColumns(sourceSpec, n));
return rcpp_result_gen;
END_RCPP
}
// type_convert_col
RObject type_convert_col(CharacterVector x, List spec, List locale_, int col, const std::vector<std::string>& na, bool trim_ws);
RcppExport SEXP _readr_type_convert_col(SEXP xSEXP, SEXP specSEXP, SEXP locale_SEXP, SEXP colSEXP, SEXP naSEXP, SEXP trim_wsSEXP) {
@@ -332,8 +331,6 @@ END_RCPP

static const R_CallMethodDef CallEntries[] = {
{"_readr_collectorGuess", (DL_FUNC) &_readr_collectorGuess, 2},
{"_readr_source_encoding", (DL_FUNC) &_readr_source_encoding, 1},
{"_readr_whitespaceColumns", (DL_FUNC) &_readr_whitespaceColumns, 3},
{"_readr_read_connection_", (DL_FUNC) &_readr_read_connection_, 2},
{"_readr_utctime", (DL_FUNC) &_readr_utctime, 7},
{"_readr_dim_tokens_", (DL_FUNC) &_readr_dim_tokens_, 2},
@@ -349,6 +346,8 @@ static const R_CallMethodDef CallEntries[] = {
{"_readr_read_tokens_", (DL_FUNC) &_readr_read_tokens_, 7},
{"_readr_read_tokens_chunked_", (DL_FUNC) &_readr_read_tokens_chunked_, 8},
{"_readr_guess_types_", (DL_FUNC) &_readr_guess_types_, 4},
{"_readr_source_encoding", (DL_FUNC) &_readr_source_encoding, 1},
{"_readr_whitespaceColumns", (DL_FUNC) &_readr_whitespaceColumns, 2},
{"_readr_type_convert_col", (DL_FUNC) &_readr_type_convert_col, 6},
{"_readr_write_lines_", (DL_FUNC) &_readr_write_lines_, 4},
{"_readr_write_lines_raw_", (DL_FUNC) &_readr_write_lines_raw_, 3},
2 changes: 1 addition & 1 deletion src/Reader.cpp
@@ -30,7 +30,7 @@ Reader::Reader(
}

void Reader::init(CharacterVector colNames) {
tokenizer_->tokenize(source_->begin(), source_->end());
tokenizer_->tokenize(source_);
tokenizer_->setWarnings(&warnings_);

// Work out which output columns we are keeping and set warnings for each