Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Trim leading and trailing whitespace, by default; closes #211 #326

Merged
merged 2 commits into from Apr 8, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 2 additions & 0 deletions NEWS.md
Expand Up @@ -30,6 +30,8 @@

* `guess_max` is a new argument that lets users adjust the number of rows used to guess column types. (#223, #257 @tklebel and @jennybc)

* `trim_ws` is a new argument to remove leading and trailing whitespace. It defaults to `TRUE`. (#326, #211)

* `na` can now hold multiple NA values, e.g., `read_excel("missing-values.xls", na = c("NA", "1"))`. (#13, #56, @jmarshallnz)

* Coercions and cell data:
Expand Down
8 changes: 4 additions & 4 deletions R/RcppExports.R
Expand Up @@ -9,8 +9,8 @@ xls_date_formats <- function(path) {
.Call('readxl_xls_date_formats', PACKAGE = 'readxl', path)
}

read_xls_ <- function(path, sheet_i, limits, shim, col_names, col_types, na, guess_max = 1000L) {
.Call('readxl_read_xls_', PACKAGE = 'readxl', path, sheet_i, limits, shim, col_names, col_types, na, guess_max)
read_xls_ <- function(path, sheet_i, limits, shim, col_names, col_types, na, trim_ws, guess_max = 1000L) {
.Call('readxl_read_xls_', PACKAGE = 'readxl', path, sheet_i, limits, shim, col_names, col_types, na, trim_ws, guess_max)
}

xlsx_sheets <- function(path) {
Expand All @@ -29,8 +29,8 @@ parse_ref <- function(ref) {
.Call('readxl_parse_ref', PACKAGE = 'readxl', ref)
}

read_xlsx_ <- function(path, sheet_i, limits, shim, col_names, col_types, na, guess_max = 1000L) {
.Call('readxl_read_xlsx_', PACKAGE = 'readxl', path, sheet_i, limits, shim, col_names, col_types, na, guess_max)
read_xlsx_ <- function(path, sheet_i, limits, shim, col_names, col_types, na, trim_ws, guess_max = 1000L) {
.Call('readxl_read_xlsx_', PACKAGE = 'readxl', path, sheet_i, limits, shim, col_names, col_types, na, trim_ws, guess_max)
}

zip_xml <- function(zip_path, file_path) {
Expand Down
24 changes: 17 additions & 7 deletions R/read_excel.R
Expand Up @@ -29,6 +29,7 @@ NULL
#' on a cell-by-cell basis.
#' @param na Character vector of strings to use for missing values. By default,
#' readxl treats blank cells as missing data.
#' @param trim_ws Should leading and trailing whitespace be trimmed?
#' @param skip Minimum number of rows to skip before reading anything, be it
#' column names or data. Leading empty rows are automatically skipped, so this
#' is a lower bound. Ignored if `range` is given.
Expand Down Expand Up @@ -79,12 +80,13 @@ NULL
#' read_excel(datasets, range = cell_cols("B:D"))
read_excel <- function(path, sheet = NULL, range = NULL,
col_names = TRUE, col_types = NULL,
na = "", skip = 0, n_max = Inf,
na = "", trim_ws = TRUE, skip = 0, n_max = Inf,
guess_max = min(1000, n_max)) {
read_excel_(
path = path, sheet = sheet, range = range,
col_names = col_names, col_types = col_types,
na = na, skip = skip, n_max = n_max, guess_max = guess_max,
na = na, trim_ws = trim_ws, skip = skip,
n_max = n_max, guess_max = guess_max,
excel_format(path)
)
}
Expand All @@ -97,7 +99,7 @@ read_excel <- function(path, sheet = NULL, range = NULL,
#' @export
read_xls <- function(path, sheet = NULL, range = NULL,
col_names = TRUE, col_types = NULL,
na = "", skip = 0, n_max = Inf,
na = "", trim_ws = TRUE, skip = 0, n_max = Inf,
guess_max = min(1000, n_max)) {
read_excel_(
path = path, sheet = sheet, range = range,
Expand All @@ -111,7 +113,7 @@ read_xls <- function(path, sheet = NULL, range = NULL,
#' @export
read_xlsx <- function(path, sheet = NULL, range = NULL,
col_names = TRUE, col_types = NULL,
na = "", skip = 0, n_max = Inf,
na = "", trim_ws = TRUE, skip = 0, n_max = Inf,
guess_max = min(1000, n_max)) {
read_excel_(
path = path, sheet = sheet, range = range,
Expand All @@ -123,7 +125,7 @@ read_xlsx <- function(path, sheet = NULL, range = NULL,

read_excel_ <- function(path, sheet = NULL, range = NULL,
col_names = TRUE, col_types = NULL,
na = "", skip = 0, n_max = Inf,
na = "", trim_ws = TRUE, skip = 0, n_max = Inf,
guess_max = min(1000, n_max), format) {
if (format == "xls") {
sheets_fun <- xls_sheets
Expand All @@ -135,14 +137,15 @@ read_excel_ <- function(path, sheet = NULL, range = NULL,
sheet <- standardise_sheet(sheet, range, sheets_fun(path))
shim <- !is.null(range)
limits <- standardise_limits(range, skip, n_max, has_col_names = isTRUE(col_names))
guess_max <- check_guess_max(guess_max)
col_types <- check_col_types(col_types)
guess_max <- check_guess_max(guess_max)
trim_ws <- check_bool(trim_ws, "trim_ws")
tibble::repair_names(
tibble::as_tibble(
read_fun(path = path, sheet = sheet,
limits = limits, shim = shim,
col_names = col_names, col_types = col_types,
na = na, guess_max = guess_max),
na = na, trim_ws = trim_ws, guess_max = guess_max),
validate = FALSE
),
prefix = "X", sep = "__"
Expand Down Expand Up @@ -255,6 +258,13 @@ check_col_types <- function(col_types) {
col_types
}

# Validate a logical flag argument.
#
# `bool` is the value supplied by the caller; `arg_name` is the name of the
# argument, used only to build the error message.
# Returns `bool` unchanged when it is TRUE or FALSE; otherwise stops with an
# error that names the offending argument (NA, non-logical and multi-element
# inputs are all rejected).
check_bool <- function(bool, arg_name) {
  is_flag <- isTRUE(bool) || identical(bool, FALSE)
  if (is_flag) {
    return(bool)
  }
  # call. = FALSE keeps this internal helper out of the user-facing message
  stop("`", arg_name, "` must be either TRUE or FALSE", call. = FALSE)
}

check_non_negative_integer <- function(i, arg_name) {
if (length(i) != 1 || !is.numeric(i) || !is_integerish(i) ||
is.na(i) || i < 0) {
Expand Down
1 change: 1 addition & 0 deletions docs/news/index.html

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 7 additions & 3 deletions docs/reference/read_excel.html

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 5 additions & 3 deletions man/read_excel.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 8 additions & 6 deletions src/RcppExports.cpp
Expand Up @@ -28,8 +28,8 @@ BEGIN_RCPP
END_RCPP
}
// read_xls_
List read_xls_(std::string path, int sheet_i, IntegerVector limits, bool shim, RObject col_names, RObject col_types, std::vector<std::string> na, int guess_max);
RcppExport SEXP readxl_read_xls_(SEXP pathSEXP, SEXP sheet_iSEXP, SEXP limitsSEXP, SEXP shimSEXP, SEXP col_namesSEXP, SEXP col_typesSEXP, SEXP naSEXP, SEXP guess_maxSEXP) {
List read_xls_(std::string path, int sheet_i, IntegerVector limits, bool shim, RObject col_names, RObject col_types, std::vector<std::string> na, bool trim_ws, int guess_max);
RcppExport SEXP readxl_read_xls_(SEXP pathSEXP, SEXP sheet_iSEXP, SEXP limitsSEXP, SEXP shimSEXP, SEXP col_namesSEXP, SEXP col_typesSEXP, SEXP naSEXP, SEXP trim_wsSEXP, SEXP guess_maxSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Expand All @@ -40,8 +40,9 @@ BEGIN_RCPP
Rcpp::traits::input_parameter< RObject >::type col_names(col_namesSEXP);
Rcpp::traits::input_parameter< RObject >::type col_types(col_typesSEXP);
Rcpp::traits::input_parameter< std::vector<std::string> >::type na(naSEXP);
Rcpp::traits::input_parameter< bool >::type trim_ws(trim_wsSEXP);
Rcpp::traits::input_parameter< int >::type guess_max(guess_maxSEXP);
rcpp_result_gen = Rcpp::wrap(read_xls_(path, sheet_i, limits, shim, col_names, col_types, na, guess_max));
rcpp_result_gen = Rcpp::wrap(read_xls_(path, sheet_i, limits, shim, col_names, col_types, na, trim_ws, guess_max));
return rcpp_result_gen;
END_RCPP
}
Expand Down Expand Up @@ -90,8 +91,8 @@ BEGIN_RCPP
END_RCPP
}
// read_xlsx_
List read_xlsx_(std::string path, int sheet_i, IntegerVector limits, bool shim, RObject col_names, RObject col_types, std::vector<std::string> na, int guess_max);
RcppExport SEXP readxl_read_xlsx_(SEXP pathSEXP, SEXP sheet_iSEXP, SEXP limitsSEXP, SEXP shimSEXP, SEXP col_namesSEXP, SEXP col_typesSEXP, SEXP naSEXP, SEXP guess_maxSEXP) {
List read_xlsx_(std::string path, int sheet_i, IntegerVector limits, bool shim, RObject col_names, RObject col_types, std::vector<std::string> na, bool trim_ws, int guess_max);
RcppExport SEXP readxl_read_xlsx_(SEXP pathSEXP, SEXP sheet_iSEXP, SEXP limitsSEXP, SEXP shimSEXP, SEXP col_namesSEXP, SEXP col_typesSEXP, SEXP naSEXP, SEXP trim_wsSEXP, SEXP guess_maxSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Expand All @@ -102,8 +103,9 @@ BEGIN_RCPP
Rcpp::traits::input_parameter< RObject >::type col_names(col_namesSEXP);
Rcpp::traits::input_parameter< RObject >::type col_types(col_typesSEXP);
Rcpp::traits::input_parameter< std::vector<std::string> >::type na(naSEXP);
Rcpp::traits::input_parameter< bool >::type trim_ws(trim_wsSEXP);
Rcpp::traits::input_parameter< int >::type guess_max(guess_maxSEXP);
rcpp_result_gen = Rcpp::wrap(read_xlsx_(path, sheet_i, limits, shim, col_names, col_types, na, guess_max));
rcpp_result_gen = Rcpp::wrap(read_xlsx_(path, sheet_i, limits, shim, col_names, col_types, na, trim_ws, guess_max));
return rcpp_result_gen;
END_RCPP
}
Expand Down
4 changes: 4 additions & 0 deletions src/StringSet.h
Expand Up @@ -2,6 +2,7 @@
#define READXL_STRINGSET_

#include <Rcpp.h>
#include "utils.h"

class StringSet
{
Expand All @@ -22,6 +23,9 @@ class StringSet
// Exact-match membership test: true when `s` is stored in set_.
bool contains(const std::string &s) const {
  return set_.count(s) != 0;
}
// Membership test with optional trimming: when trimWs is true the lookup is
// done on trim(s), otherwise on s as given.
// NOTE(review): trim() is expected to come from utils.h and to strip leading
// and trailing whitespace - confirm against that header.
bool contains(const std::string &s, const bool trimWs) const {
  if (trimWs) {
    return contains(trim(s));
  }
  return contains(s);
}
bool contains(const double d) const {
std::ostringstream str; str << d;
return contains(str.str());
Expand Down
17 changes: 10 additions & 7 deletions src/XlsCell.h
Expand Up @@ -49,6 +49,7 @@ class XlsCell {
}

void inferType(const StringSet& na,
const bool trimWs,
const std::set<int>& dateFormats) {
// 1. Review of Excel's declared cell types, then
// 2. Summary of how Excel's cell types map to our CellType enum
Expand Down Expand Up @@ -129,7 +130,7 @@ class XlsCell {
switch(cell_->id) {
case XLS_RECORD_LABELSST:
case XLS_RECORD_LABEL:
ct = na.contains((char*) cell_->str) ? CELL_BLANK : CELL_TEXT;
ct = na.contains((char*) cell_->str, trimWs) ? CELL_BLANK : CELL_TEXT;
break;

case XLS_RECORD_FORMULA:
Expand Down Expand Up @@ -176,7 +177,7 @@ class XlsCell {

// string (or #NULL! error)
// d = 0 and str holds string formula result
ct = na.contains((char*) cell_->str) ? CELL_BLANK : CELL_TEXT;
ct = na.contains((char*) cell_->str, trimWs) ? CELL_BLANK : CELL_TEXT;
}
break;

Expand Down Expand Up @@ -221,7 +222,7 @@ class XlsCell {
type_ = ct;
}

std::string asStdString() const {
std::string asStdString(const bool trimWs) const {
switch(type_) {

case CELL_UNKNOWN:
Expand Down Expand Up @@ -252,8 +253,10 @@ class XlsCell {
return out_string;
}

case CELL_TEXT:
return std::string((char*) cell_->str);
case CELL_TEXT: {
std::string out_string = (char*) cell_->str;
return trimWs ? trim(out_string) : out_string;
}

default:
Rcpp::warning("Unrecognized cell type at [%i, %i]: '%s'",
Expand All @@ -262,8 +265,8 @@ class XlsCell {
}
}

Rcpp::RObject asCharSxp() const {
std::string out_string = asStdString();
Rcpp::RObject asCharSxp(const bool trimWs) const {
std::string out_string = asStdString(trimWs);
return out_string.empty() ? NA_STRING : Rf_mkCharCE(out_string.c_str(), CE_UTF8);
}

Expand Down
8 changes: 4 additions & 4 deletions src/XlsWorkSheet.cpp
Expand Up @@ -9,7 +9,7 @@ using namespace Rcpp;
List read_xls_(std::string path, int sheet_i,
IntegerVector limits, bool shim,
RObject col_names, RObject col_types,
std::vector<std::string> na, int guess_max = 1000) {
std::vector<std::string> na, bool trim_ws, int guess_max = 1000) {

// Construct worksheet ----------------------------------------------
XlsWorkSheet ws(path, sheet_i, limits, shim);
Expand All @@ -28,7 +28,7 @@ List read_xls_(std::string path, int sheet_i,
break;
case LGLSXP:
has_col_names = as<bool>(col_names);
colNames = has_col_names ? ws.colNames(na) : CharacterVector(ws.ncol(), "");
colNames = has_col_names ? ws.colNames(na, trim_ws) : CharacterVector(ws.ncol(), "");
break;
default:
Rcpp::stop("`col_names` must be a logical or character vector");
Expand All @@ -45,13 +45,13 @@ List read_xls_(std::string path, int sheet_i,
sheet_i + 1, ws.ncol(), colTypes.size());
}
if (requiresGuess(colTypes)) {
colTypes = ws.colTypes(colTypes, na, guess_max, has_col_names);
colTypes = ws.colTypes(colTypes, na, trim_ws, guess_max, has_col_names);
}
colTypes = finalizeTypes(colTypes);

// Reconcile column names and types ----------------------------------
colNames = reconcileNames(colNames, colTypes, sheet_i);

// Get data ----------------------------------------------------------
return ws.readCols(colNames, colTypes, na, has_col_names);
return ws.readCols(colNames, colTypes, na, trim_ws, has_col_names);
}