Selectively trim_ws. Parse NA in parse_vector.
Fixes #137
hadley committed Sep 3, 2015
1 parent 6b11595 commit f05d4ce
Showing 17 changed files with 124 additions and 53 deletions.
6 changes: 6 additions & 0 deletions NEWS.md
@@ -1,5 +1,11 @@
# readr 0.1.1.9000

* `read_delim()` etc. gain a `trim_ws` argument that controls whether leading
  and trailing whitespace is automatically removed. It defaults to `TRUE`. (#137)

* `parse_*` functions gain an `na` argument that allows you to specify which
  values should be treated as missing.

* Month and abbreviated month name formats (`%b` and `%B`) now ignore
case when matching (#219).

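To illustrate what these two entries mean in practice, here is a rough sketch using the new arguments; the inline CSV string and the "-" marker are invented for illustration, not taken from the commit:

library(readr)

# Fields padded with spaces, with "-" used as a missing-value marker
csv <- "x,y\n 1 , - \n 2 , 3 \n"

# trim_ws = TRUE (the default) strips the padding before fields are parsed,
# and na = "-" turns the placeholder into a proper missing value
read_csv(csv, na = "-")

# The parse_* functions now accept na directly as well
parse_integer(c("1", "-", "3"), na = "-")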
4 changes: 2 additions & 2 deletions R/RcppExports.R
@@ -29,8 +29,8 @@ tokenize_ <- function(sourceSpec, tokenizerSpec, n_max) {
.Call('readr_tokenize_', PACKAGE = 'readr', sourceSpec, tokenizerSpec, n_max)
}

parse_vector_ <- function(x, collectorSpec) {
.Call('readr_parse_vector_', PACKAGE = 'readr', x, collectorSpec)
parse_vector_ <- function(x, collectorSpec, na) {
.Call('readr_parse_vector_', PACKAGE = 'readr', x, collectorSpec, na)
}

read_file_ <- function(sourceSpec) {
38 changes: 21 additions & 17 deletions R/collectors.R
@@ -15,10 +15,11 @@ collector_find <- function(name) {
#'
#' @param x Character vector of elements to parse.
#' @param collector Column specification.
#' @param na Character vector giving strings to treat as missing.
#' @keywords internal
#' @export
#' @examples
#' x <- c("1", "2", "3", NA)
#' x <- c("1", "2", "3", "NA")
#' parse_vector(x, col_integer())
#' parse_vector(x, col_double())
#' parse_vector(x, col_character())
@@ -27,8 +28,10 @@ collector_find <- function(name) {
#' # Invalid values are replaced with missing values with a warning.
#' x <- c("1", "2", "3", "-")
#' parse_vector(x, col_double())
parse_vector <- function(x, collector) {
warn_problems(parse_vector_(x, collector))
#' # Or flag values as missing
#' parse_vector(x, col_double(), na = "-")
parse_vector <- function(x, collector, na = "NA") {
warn_problems(parse_vector_(x, collector, na = na))
}

#' Parse character vectors into typed columns.
@@ -39,6 +42,7 @@ parse_vector <- function(x, collector) {
#'
#' @name collector
#' @param x Character vector of values to parse.
#' @param na Character vector giving strings to treat as missing.
#' @seealso \code{\link{parse_datetime}}, \code{\link{type_convert}} to
#' automatically re-parse all character columns in a data frame.
#' @examples
@@ -62,8 +66,8 @@ col_character <- function() {

#' @rdname collector
#' @export
parse_character <- function(x) {
parse_vector(x, col_character())
parse_character <- function(x, na = c("", "NA")) {
parse_vector(x, col_character(), na = na)
}

#' @rdname collector
@@ -74,8 +78,8 @@ col_integer <- function() {

#' @rdname collector
#' @export
parse_integer <- function(x) {
parse_vector(x, col_integer())
parse_integer <- function(x, na = c("", "NA")) {
parse_vector(x, col_integer(), na = na)
}

#' @rdname collector
@@ -86,8 +90,8 @@ col_double <- function() {

#' @rdname collector
#' @export
parse_double <- function(x) {
parse_vector(x, col_double())
parse_double <- function(x, na = c("", "NA")) {
parse_vector(x, col_double(), na = na)
}

#' @rdname collector
@@ -98,8 +102,8 @@ col_euro_double <- function() {

#' @rdname collector
#' @export
parse_euro_double <- function(x) {
parse_vector(x, col_euro_double())
parse_euro_double <- function(x, na = c("", "NA")) {
parse_vector(x, col_euro_double(), na = na)
}


@@ -111,8 +115,8 @@ col_numeric <- function() {

#' @rdname collector
#' @export
parse_numeric <- function(x) {
parse_vector(x, col_numeric())
parse_numeric <- function(x, na = c("", "NA")) {
parse_vector(x, col_numeric(), na = na)
}

#' @rdname collector
@@ -123,8 +127,8 @@ col_logical <- function() {

#' @rdname collector
#' @export
parse_logical <- function(x) {
parse_vector(x, col_logical())
parse_logical <- function(x, na = c("", "NA")) {
parse_vector(x, col_logical(), na = na)
}

#' @param levels Character vector providing set of allowed levels.
@@ -137,8 +141,8 @@ col_factor <- function(levels, ordered = FALSE) {

#' @rdname collector
#' @export
parse_factor <- function(x, levels, ordered = FALSE) {
parse_vector(x, col_factor(levels, ordered))
parse_factor <- function(x, levels, ordered = FALSE, na = c("", "NA")) {
parse_vector(x, col_factor(levels, ordered), na = na)
}

#' @rdname collector
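Note the two different defaults introduced above: `parse_vector()` itself defaults to `na = "NA"`, while the higher-level `parse_*()` wrappers default to `na = c("", "NA")`. A small sketch of the difference, using a made-up input vector:

x <- c("1", "", "NA", "3")

parse_integer(x)                # "" and "NA" both become missing values
parse_vector(x, col_integer())  # only "NA" is missing; "" should fail to parse and warn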
14 changes: 8 additions & 6 deletions R/read_delim.R
@@ -69,20 +69,21 @@ read_delim <- function(file, delim, quote = '"', escape_backslash = FALSE,
#' @rdname read_delim
#' @export
read_csv <- function(file, col_names = TRUE, col_types = NULL, na = c("", "NA"),
skip = 0, n_max = -1, progress = interactive()) {
trim_ws = TRUE, skip = 0, n_max = -1,
progress = interactive()) {

tokenizer <- tokenizer_csv(na = na)
tokenizer <- tokenizer_csv(na = na, trim_ws = trim_ws)
read_delimited(file, tokenizer, col_names = col_names, col_types = col_types,
skip = skip, n_max = n_max, progress = progress)
}

#' @rdname read_delim
#' @export
read_csv2 <- function(file, col_names = TRUE, col_types = NULL,
na = c("", "NA"), skip = 0, n_max = -1,
na = c("", "NA"), trim_ws = TRUE, skip = 0, n_max = -1,
progress = interactive()) {

tokenizer <- tokenizer_delim(delim = ";", na = na)
tokenizer <- tokenizer_delim(delim = ";", na = na, trim_ws = trim_ws)
read_delimited(file, tokenizer, col_names = col_names, col_types = col_types,
skip = skip, n_max = n_max, progress = progress)
}
@@ -91,9 +92,10 @@ read_csv2 <- function(file, col_names = TRUE, col_types = NULL,
#' @rdname read_delim
#' @export
read_tsv <- function(file, col_names = TRUE, col_types = NULL, na = c("", "NA"),
skip = 0, n_max = -1, progress = interactive()) {
trim_ws = TRUE, skip = 0, n_max = -1,
progress = interactive()) {

tokenizer <- tokenizer_tsv(na = na)
tokenizer <- tokenizer_tsv(na = na, trim_ws = trim_ws)
read_delimited(file, tokenizer, col_names = col_names, col_types = col_types,
skip = skip, n_max = n_max, progress = progress)
}
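The `trim_ws` flag added to read_csv(), read_csv2() and read_tsv() is simply forwarded to the tokenizer, so turning it off preserves any padding around fields. A quick sketch with an invented inline CSV:

csv <- "a,b\n x , y \n"

read_csv(csv)                   # default trim_ws = TRUE: values read as "x" and "y"
read_csv(csv, trim_ws = FALSE)  # values keep their surrounding spaces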
13 changes: 10 additions & 3 deletions R/tokenizer.R
@@ -38,6 +38,8 @@ NULL
#' option to \code{character()} to indicate no missing values.
#' @param delim Single character used to separate fields within a record.
#' @param quote Single character used to quote strings.
#' @param trim_ws Should leading and trailing whitespace be trimmed from
#' each field before parsing it?
#' @param escape_double Does the file escape quotes by doubling them?
#' i.e. If this option is \code{TRUE}, the value \code{""""} represents
#' a single quote, \code{\"}.
@@ -46,12 +48,15 @@ NULL
#' can be used to escape the delimiter character, the quote character, or
#' to add special characters like \code{\\n}.
tokenizer_delim <- function(delim, quote = '"', na = "NA",
escape_double = TRUE, escape_backslash = FALSE) {
trim_ws = TRUE,
escape_double = TRUE,
escape_backslash = FALSE) {
structure(
list(
delim = delim,
quote = quote,
na = na,
trim_ws = trim_ws,
escape_double = escape_double,
escape_backslash = escape_backslash
),
@@ -61,23 +66,25 @@ tokenizer_delim <- function(delim, quote = '"', na = "NA",

#' @export
#' @rdname Tokenizers
tokenizer_csv <- function(na = "NA") {
tokenizer_csv <- function(na = "NA", trim_ws = TRUE) {
tokenizer_delim(
delim = ",",
quote = '"',
na = na,
trim_ws = trim_ws,
escape_double = TRUE,
escape_backslash = FALSE
)
}

#' @export
#' @rdname Tokenizers
tokenizer_tsv <- function(na = "NA") {
tokenizer_tsv <- function(na = "NA", trim_ws = TRUE) {
tokenizer_delim(
delim = "\t",
quote = '"',
na = na,
trim_ws = trim_ws,
escape_double = TRUE,
escape_backslash = FALSE
)
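tokenizer_csv() and tokenizer_tsv() just pass `trim_ws` through to tokenizer_delim(), which records it in the spec list built by structure() above. A minimal sketch:

tok <- tokenizer_csv(na = c("", "NA"), trim_ws = FALSE)
tok$trim_ws  # FALSE -- stored in the spec and read by the C++ tokenizer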
3 changes: 3 additions & 0 deletions R/type_convert.R
@@ -15,6 +15,9 @@
#' )
#' str(df)
#' str(type_convert(df))
#'
#' df <- data.frame(x = c("NA", "10"), stringsAsFactors= FALSE)
#' type_convert(df)
type_convert <- function(df, col_types = NULL) {
is_character <- vapply(df, is.character, logical(1))

9 changes: 6 additions & 3 deletions man/Tokenizers.Rd
@@ -11,11 +11,11 @@
\title{Tokenizers.}
\usage{
tokenizer_delim(delim,
quote = "\\"", na = "NA", escape_double = TRUE, escape_backslash = FALSE)
quote = "\\"", na = "NA", trim_ws = TRUE, escape_double = TRUE, escape_backslash = FALSE)
tokenizer_csv(na = "NA")
tokenizer_csv(na = "NA", trim_ws = TRUE)
tokenizer_tsv(na = "NA")
tokenizer_tsv(na = "NA", trim_ws = TRUE)
tokenizer_line()
@@ -31,6 +31,9 @@ tokenizer_fwf(begin, end, na = "NA")
\item{na}{Character vector of strings to use for missing values. Set this
option to \code{character()} to indicate no missing values.}
\item{trim_ws}{Should leading and trailing whitespace be trimmed from
each field before parsing it?}
\item{escape_double}{Does the file escape quotes by doubling them?
i.e. If this option is \code{TRUE}, the value \code{""""} represents
a single quote, \code{\"}.}
16 changes: 9 additions & 7 deletions man/collector.Rd
@@ -21,37 +21,39 @@
\usage{
col_character()

parse_character(x)
parse_character(x, na = c("", "NA"))

col_integer()

parse_integer(x)
parse_integer(x, na = c("", "NA"))

col_double()

parse_double(x)
parse_double(x, na = c("", "NA"))

col_euro_double()

parse_euro_double(x)
parse_euro_double(x, na = c("", "NA"))

col_numeric()

parse_numeric(x)
parse_numeric(x, na = c("", "NA"))

col_logical()

parse_logical(x)
parse_logical(x, na = c("", "NA"))

col_factor(levels, ordered = FALSE)

parse_factor(x, levels, ordered = FALSE)
parse_factor(x, levels, ordered = FALSE, na = c("", "NA"))

col_skip()
}
\arguments{
\item{x}{Character vector of values to parse.}

\item{na}{Character vector giving strings to treat as missing.}

\item{levels}{Character vector providing set of allowed levels.}

\item{ordered}{Is it an ordered factor?}
8 changes: 6 additions & 2 deletions man/parse_vector.Rd
@@ -4,18 +4,20 @@
\alias{parse_vector}
\title{Parse a character vector.}
\usage{
parse_vector(x, collector)
parse_vector(x, collector, na = "NA")
}
\arguments{
\item{x}{Character vector of elements to parse.}

\item{collector}{Column specification.}

\item{na}{Character vector giving strings to treat as missing.}
}
\description{
Parse a character vector.
}
\examples{
x <- c("1", "2", "3", NA)
x <- c("1", "2", "3", "NA")
parse_vector(x, col_integer())
parse_vector(x, col_double())
parse_vector(x, col_character())
@@ -24,6 +26,8 @@ parse_vector(x, col_skip())
# Invalid values are replaced with missing values with a warning.
x <- c("1", "2", "3", "-")
parse_vector(x, col_double())
# Or flag values as missing
parse_vector(x, col_double(), na = "-")
}
\keyword{internal}

9 changes: 6 additions & 3 deletions man/read_delim.Rd
@@ -12,13 +12,13 @@ read_delim(file, delim, quote = '\"', escape_backslash = FALSE,
skip = 0, n_max = -1, progress = interactive())

read_csv(file, col_names = TRUE, col_types = NULL, na = c("", "NA"),
skip = 0, n_max = -1, progress = interactive())
trim_ws = TRUE, skip = 0, n_max = -1, progress = interactive())

read_csv2(file, col_names = TRUE, col_types = NULL, na = c("", "NA"),
skip = 0, n_max = -1, progress = interactive())
trim_ws = TRUE, skip = 0, n_max = -1, progress = interactive())

read_tsv(file, col_names = TRUE, col_types = NULL, na = c("", "NA"),
skip = 0, n_max = -1, progress = interactive())
trim_ws = TRUE, skip = 0, n_max = -1, progress = interactive())
}
\arguments{
\item{file}{Either a path to a file, a connection, or literal data
@@ -82,6 +82,9 @@ option to \code{character()} to indicate no missing values.}
\item{progress}{Display a progress bar? By default it will only display
in an interactive session. The display is updated every 50,000 values
and will only display if estimated reading time is 5 seconds or more.}
\item{trim_ws}{Should leading and trailing whitespace be trimmed from
each field before parsing it?}
}
\value{
A data frame. If there are parsing problems, a warning tells you
3 changes: 3 additions & 0 deletions man/type_convert.Rd
@@ -38,5 +38,8 @@ df <- data.frame(
)
str(df)
str(type_convert(df))
df <- data.frame(x = c("NA", "10"), stringsAsFactors= FALSE)
type_convert(df)
}
7 changes: 4 additions & 3 deletions src/RcppExports.cpp
@@ -94,14 +94,15 @@ BEGIN_RCPP
END_RCPP
}
// parse_vector_
SEXP parse_vector_(CharacterVector x, List collectorSpec);
RcppExport SEXP readr_parse_vector_(SEXP xSEXP, SEXP collectorSpecSEXP) {
SEXP parse_vector_(CharacterVector x, List collectorSpec, const std::vector<std::string>& na);
RcppExport SEXP readr_parse_vector_(SEXP xSEXP, SEXP collectorSpecSEXP, SEXP naSEXP) {
BEGIN_RCPP
Rcpp::RObject __result;
Rcpp::RNGScope __rngScope;
Rcpp::traits::input_parameter< CharacterVector >::type x(xSEXP);
Rcpp::traits::input_parameter< List >::type collectorSpec(collectorSpecSEXP);
__result = Rcpp::wrap(parse_vector_(x, collectorSpec));
Rcpp::traits::input_parameter< const std::vector<std::string>& >::type na(naSEXP);
__result = Rcpp::wrap(parse_vector_(x, collectorSpec, na));
return __result;
END_RCPP
}