Skip to content

Commit

Permalink
Trim leading and trailing whitespace, by default; closes #211 (#326)
Browse files Browse the repository at this point in the history
* Trim leading and trailing whitespace, by default; closes #211

* Honor trim_ws when checking against NA values
  • Loading branch information
jennybc committed Apr 8, 2017
1 parent 38e3510 commit 00a8891
Show file tree
Hide file tree
Showing 18 changed files with 182 additions and 70 deletions.
2 changes: 2 additions & 0 deletions NEWS.md
Expand Up @@ -30,6 +30,8 @@

* `guess_max` is a new argument that lets user adjust the number of rows used to guess column types. (#223, #257 @tklebel and @jennybc)

* `trim_ws` is a new argument to remove leading and trailing whitespace. It defaults to `TRUE`. (#326, #211)

* `na` can now hold multiple NA values, e.g., `read_excel("missing-values.xls", na = c("NA", "1"))`. (#13, #56, @jmarshallnz)

* Coercions and cell data:
Expand Down
8 changes: 4 additions & 4 deletions R/RcppExports.R
Expand Up @@ -9,8 +9,8 @@ xls_date_formats <- function(path) {
.Call('readxl_xls_date_formats', PACKAGE = 'readxl', path)
}

read_xls_ <- function(path, sheet_i, limits, shim, col_names, col_types, na, guess_max = 1000L) {
.Call('readxl_read_xls_', PACKAGE = 'readxl', path, sheet_i, limits, shim, col_names, col_types, na, guess_max)
read_xls_ <- function(path, sheet_i, limits, shim, col_names, col_types, na, trim_ws, guess_max = 1000L) {
.Call('readxl_read_xls_', PACKAGE = 'readxl', path, sheet_i, limits, shim, col_names, col_types, na, trim_ws, guess_max)
}

xlsx_sheets <- function(path) {
Expand All @@ -29,8 +29,8 @@ parse_ref <- function(ref) {
.Call('readxl_parse_ref', PACKAGE = 'readxl', ref)
}

read_xlsx_ <- function(path, sheet_i, limits, shim, col_names, col_types, na, guess_max = 1000L) {
.Call('readxl_read_xlsx_', PACKAGE = 'readxl', path, sheet_i, limits, shim, col_names, col_types, na, guess_max)
read_xlsx_ <- function(path, sheet_i, limits, shim, col_names, col_types, na, trim_ws, guess_max = 1000L) {
.Call('readxl_read_xlsx_', PACKAGE = 'readxl', path, sheet_i, limits, shim, col_names, col_types, na, trim_ws, guess_max)
}

zip_xml <- function(zip_path, file_path) {
Expand Down
24 changes: 17 additions & 7 deletions R/read_excel.R
Expand Up @@ -29,6 +29,7 @@ NULL
#' on a cell-by-cell basis.
#' @param na Character vector of strings to use for missing values. By default,
#' readxl treats blank cells as missing data.
#' @param trim_ws Should leading and trailing whitespace be trimmed?
#' @param skip Minimum number of rows to skip before reading anything, be it
#' column names or data. Leading empty rows are automatically skipped, so this
#' is a lower bound. Ignored if `range` is given.
Expand Down Expand Up @@ -79,12 +80,13 @@ NULL
#' read_excel(datasets, range = cell_cols("B:D"))
read_excel <- function(path, sheet = NULL, range = NULL,
col_names = TRUE, col_types = NULL,
na = "", skip = 0, n_max = Inf,
na = "", trim_ws = TRUE, skip = 0, n_max = Inf,
guess_max = min(1000, n_max)) {
read_excel_(
path = path, sheet = sheet, range = range,
col_names = col_names, col_types = col_types,
na = na, skip = skip, n_max = n_max, guess_max = guess_max,
na = na, trim_ws = trim_ws, skip = skip,
n_max = n_max, guess_max = guess_max,
excel_format(path)
)
}
Expand All @@ -97,7 +99,7 @@ read_excel <- function(path, sheet = NULL, range = NULL,
#' @export
read_xls <- function(path, sheet = NULL, range = NULL,
col_names = TRUE, col_types = NULL,
na = "", skip = 0, n_max = Inf,
na = "", trim_ws = TRUE, skip = 0, n_max = Inf,
guess_max = min(1000, n_max)) {
read_excel_(
path = path, sheet = sheet, range = range,
Expand All @@ -111,7 +113,7 @@ read_xls <- function(path, sheet = NULL, range = NULL,
#' @export
read_xlsx <- function(path, sheet = NULL, range = NULL,
col_names = TRUE, col_types = NULL,
na = "", skip = 0, n_max = Inf,
na = "", trim_ws = TRUE, skip = 0, n_max = Inf,
guess_max = min(1000, n_max)) {
read_excel_(
path = path, sheet = sheet, range = range,
Expand All @@ -123,7 +125,7 @@ read_xlsx <- function(path, sheet = NULL, range = NULL,

read_excel_ <- function(path, sheet = NULL, range = NULL,
col_names = TRUE, col_types = NULL,
na = "", skip = 0, n_max = Inf,
na = "", trim_ws = TRUE, skip = 0, n_max = Inf,
guess_max = min(1000, n_max), format) {
if (format == "xls") {
sheets_fun <- xls_sheets
Expand All @@ -135,14 +137,15 @@ read_excel_ <- function(path, sheet = NULL, range = NULL,
sheet <- standardise_sheet(sheet, range, sheets_fun(path))
shim <- !is.null(range)
limits <- standardise_limits(range, skip, n_max, has_col_names = isTRUE(col_names))
guess_max <- check_guess_max(guess_max)
col_types <- check_col_types(col_types)
guess_max <- check_guess_max(guess_max)
trim_ws <- check_bool(trim_ws, "trim_ws")
tibble::repair_names(
tibble::as_tibble(
read_fun(path = path, sheet = sheet,
limits = limits, shim = shim,
col_names = col_names, col_types = col_types,
na = na, guess_max = guess_max),
na = na, trim_ws = trim_ws, guess_max = guess_max),
validate = FALSE
),
prefix = "X", sep = "__"
Expand Down Expand Up @@ -255,6 +258,13 @@ check_col_types <- function(col_types) {
col_types
}

# Validate that a flag argument is exactly `TRUE` or exactly `FALSE`.
#
# `bool` is the value to validate; `arg_name` is the name of the argument it
# came from and is used only to build the error message. A valid `bool` is
# returned unchanged; anything else raises an error that omits the call.
check_bool <- function(bool, arg_name) {
  is_flag <- isTRUE(bool) || identical(bool, FALSE)
  if (is_flag) {
    return(bool)
  }
  stop("`", arg_name, "` must be either TRUE or FALSE", call. = FALSE)
}

check_non_negative_integer <- function(i, arg_name) {
if (length(i) != 1 || !is.numeric(i) || !is_integerish(i) ||
is.na(i) || i < 0) {
Expand Down
1 change: 1 addition & 0 deletions docs/news/index.html

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 7 additions & 3 deletions docs/reference/read_excel.html

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 5 additions & 3 deletions man/read_excel.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 8 additions & 6 deletions src/RcppExports.cpp
Expand Up @@ -28,8 +28,8 @@ BEGIN_RCPP
END_RCPP
}
// read_xls_
List read_xls_(std::string path, int sheet_i, IntegerVector limits, bool shim, RObject col_names, RObject col_types, std::vector<std::string> na, int guess_max);
RcppExport SEXP readxl_read_xls_(SEXP pathSEXP, SEXP sheet_iSEXP, SEXP limitsSEXP, SEXP shimSEXP, SEXP col_namesSEXP, SEXP col_typesSEXP, SEXP naSEXP, SEXP guess_maxSEXP) {
List read_xls_(std::string path, int sheet_i, IntegerVector limits, bool shim, RObject col_names, RObject col_types, std::vector<std::string> na, bool trim_ws, int guess_max);
RcppExport SEXP readxl_read_xls_(SEXP pathSEXP, SEXP sheet_iSEXP, SEXP limitsSEXP, SEXP shimSEXP, SEXP col_namesSEXP, SEXP col_typesSEXP, SEXP naSEXP, SEXP trim_wsSEXP, SEXP guess_maxSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Expand All @@ -40,8 +40,9 @@ BEGIN_RCPP
Rcpp::traits::input_parameter< RObject >::type col_names(col_namesSEXP);
Rcpp::traits::input_parameter< RObject >::type col_types(col_typesSEXP);
Rcpp::traits::input_parameter< std::vector<std::string> >::type na(naSEXP);
Rcpp::traits::input_parameter< bool >::type trim_ws(trim_wsSEXP);
Rcpp::traits::input_parameter< int >::type guess_max(guess_maxSEXP);
rcpp_result_gen = Rcpp::wrap(read_xls_(path, sheet_i, limits, shim, col_names, col_types, na, guess_max));
rcpp_result_gen = Rcpp::wrap(read_xls_(path, sheet_i, limits, shim, col_names, col_types, na, trim_ws, guess_max));
return rcpp_result_gen;
END_RCPP
}
Expand Down Expand Up @@ -90,8 +91,8 @@ BEGIN_RCPP
END_RCPP
}
// read_xlsx_
List read_xlsx_(std::string path, int sheet_i, IntegerVector limits, bool shim, RObject col_names, RObject col_types, std::vector<std::string> na, int guess_max);
RcppExport SEXP readxl_read_xlsx_(SEXP pathSEXP, SEXP sheet_iSEXP, SEXP limitsSEXP, SEXP shimSEXP, SEXP col_namesSEXP, SEXP col_typesSEXP, SEXP naSEXP, SEXP guess_maxSEXP) {
List read_xlsx_(std::string path, int sheet_i, IntegerVector limits, bool shim, RObject col_names, RObject col_types, std::vector<std::string> na, bool trim_ws, int guess_max);
RcppExport SEXP readxl_read_xlsx_(SEXP pathSEXP, SEXP sheet_iSEXP, SEXP limitsSEXP, SEXP shimSEXP, SEXP col_namesSEXP, SEXP col_typesSEXP, SEXP naSEXP, SEXP trim_wsSEXP, SEXP guess_maxSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Expand All @@ -102,8 +103,9 @@ BEGIN_RCPP
Rcpp::traits::input_parameter< RObject >::type col_names(col_namesSEXP);
Rcpp::traits::input_parameter< RObject >::type col_types(col_typesSEXP);
Rcpp::traits::input_parameter< std::vector<std::string> >::type na(naSEXP);
Rcpp::traits::input_parameter< bool >::type trim_ws(trim_wsSEXP);
Rcpp::traits::input_parameter< int >::type guess_max(guess_maxSEXP);
rcpp_result_gen = Rcpp::wrap(read_xlsx_(path, sheet_i, limits, shim, col_names, col_types, na, guess_max));
rcpp_result_gen = Rcpp::wrap(read_xlsx_(path, sheet_i, limits, shim, col_names, col_types, na, trim_ws, guess_max));
return rcpp_result_gen;
END_RCPP
}
Expand Down
4 changes: 4 additions & 0 deletions src/StringSet.h
Expand Up @@ -2,6 +2,7 @@
#define READXL_STRINGSET_

#include <Rcpp.h>
#include "utils.h"

class StringSet
{
Expand All @@ -22,6 +23,9 @@ class StringSet
// Report whether `s` is present in the underlying set_ (exact match).
bool contains(const std::string &s) const {
  const bool found = (set_.end() != set_.find(s));
  return found;
}
// Membership test that optionally passes `s` through trim() before the
// lookup; with trimWs == false the string is looked up as given.
bool contains(const std::string &s, const bool trimWs) const {
  if (trimWs) {
    return contains(trim(s));
  }
  return contains(s);
}
bool contains(const double d) const {
std::ostringstream str; str << d;
return contains(str.str());
Expand Down
17 changes: 10 additions & 7 deletions src/XlsCell.h
Expand Up @@ -49,6 +49,7 @@ class XlsCell {
}

void inferType(const StringSet& na,
const bool trimWs,
const std::set<int>& dateFormats) {
// 1. Review of Excel's declared cell types, then
// 2. Summary of how Excel's cell types map to our CellType enum
Expand Down Expand Up @@ -129,7 +130,7 @@ class XlsCell {
switch(cell_->id) {
case XLS_RECORD_LABELSST:
case XLS_RECORD_LABEL:
ct = na.contains((char*) cell_->str) ? CELL_BLANK : CELL_TEXT;
ct = na.contains((char*) cell_->str, trimWs) ? CELL_BLANK : CELL_TEXT;
break;

case XLS_RECORD_FORMULA:
Expand Down Expand Up @@ -176,7 +177,7 @@ class XlsCell {

// string (or #NULL! error)
// d = 0 and str holds string formula result
ct = na.contains((char*) cell_->str) ? CELL_BLANK : CELL_TEXT;
ct = na.contains((char*) cell_->str, trimWs) ? CELL_BLANK : CELL_TEXT;
}
break;

Expand Down Expand Up @@ -221,7 +222,7 @@ class XlsCell {
type_ = ct;
}

std::string asStdString() const {
std::string asStdString(const bool trimWs) const {
switch(type_) {

case CELL_UNKNOWN:
Expand Down Expand Up @@ -252,8 +253,10 @@ class XlsCell {
return out_string;
}

case CELL_TEXT:
return std::string((char*) cell_->str);
case CELL_TEXT: {
std::string out_string = (char*) cell_->str;
return trimWs ? trim(out_string) : out_string;
}

default:
Rcpp::warning("Unrecognized cell type at [%i, %i]: '%s'",
Expand All @@ -262,8 +265,8 @@ class XlsCell {
}
}

Rcpp::RObject asCharSxp() const {
std::string out_string = asStdString();
Rcpp::RObject asCharSxp(const bool trimWs) const {
std::string out_string = asStdString(trimWs);
return out_string.empty() ? NA_STRING : Rf_mkCharCE(out_string.c_str(), CE_UTF8);
}

Expand Down
8 changes: 4 additions & 4 deletions src/XlsWorkSheet.cpp
Expand Up @@ -9,7 +9,7 @@ using namespace Rcpp;
List read_xls_(std::string path, int sheet_i,
IntegerVector limits, bool shim,
RObject col_names, RObject col_types,
std::vector<std::string> na, int guess_max = 1000) {
std::vector<std::string> na, bool trim_ws, int guess_max = 1000) {

// Construct worksheet ----------------------------------------------
XlsWorkSheet ws(path, sheet_i, limits, shim);
Expand All @@ -28,7 +28,7 @@ List read_xls_(std::string path, int sheet_i,
break;
case LGLSXP:
has_col_names = as<bool>(col_names);
colNames = has_col_names ? ws.colNames(na) : CharacterVector(ws.ncol(), "");
colNames = has_col_names ? ws.colNames(na, trim_ws) : CharacterVector(ws.ncol(), "");
break;
default:
Rcpp::stop("`col_names` must be a logical or character vector");
Expand All @@ -45,13 +45,13 @@ List read_xls_(std::string path, int sheet_i,
sheet_i + 1, ws.ncol(), colTypes.size());
}
if (requiresGuess(colTypes)) {
colTypes = ws.colTypes(colTypes, na, guess_max, has_col_names);
colTypes = ws.colTypes(colTypes, na, trim_ws, guess_max, has_col_names);
}
colTypes = finalizeTypes(colTypes);

// Reconcile column names and types ----------------------------------
colNames = reconcileNames(colNames, colTypes, sheet_i);

// Get data ----------------------------------------------------------
return ws.readCols(colNames, colTypes, na, has_col_names);
return ws.readCols(colNames, colTypes, na, trim_ws, has_col_names);
}

0 comments on commit 00a8891

Please sign in to comment.