Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add n_max argument; closes #281 #306

Merged
merged 1 commit into from Mar 29, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 2 additions & 0 deletions NEWS.md
@@ -1,5 +1,7 @@
# readxl 0.1.1.9000

* Limit the rows read: new argument `n_max` puts a maximum on the number of rows read from the spreadsheet. (#306, #281 @jennybc)

* Nonstandard XML namespace prefixes in xlsx: Some 3rd party tools use different XML namespace prefixes than MS Excel. These are now stripped from element names and attributes during parsing. (#295, #268, #202, #80 @jennybc)

* The [Lotus 1-2-3 leap year bug](https://support.microsoft.com/en-us/help/214326/excel-incorrectly-assumes-that-the-year-1900-is-a-leap-year) is accounted for. Date-times prior to March 1, 1900 import correctly. Date-times on the non-existent leap day February 29, 1900 import as NA and throw a warning. (#264, #148, #292 @jennybc)
Expand Down
28 changes: 12 additions & 16 deletions R/RcppExports.R
Expand Up @@ -9,16 +9,16 @@ xls_date_formats <- function(path) {
.Call('readxl_xls_date_formats', PACKAGE = 'readxl', path)
}

xls_col_names <- function(path, na, sheet_i = 0L, skip = 0L) {
.Call('readxl_xls_col_names', PACKAGE = 'readxl', path, na, sheet_i, skip)
xls_col_names <- function(path, na, sheet_i = 0L, skip = 0L, n_max = 1L) {
.Call('readxl_xls_col_names', PACKAGE = 'readxl', path, na, sheet_i, skip, n_max)
}

xls_col_types <- function(path, na, sheet_i = 0L, skip = 0L, guess_max = 1000L, has_col_names = FALSE) {
.Call('readxl_xls_col_types', PACKAGE = 'readxl', path, na, sheet_i, skip, guess_max, has_col_names)
xls_col_types <- function(path, na, sheet_i = 0L, skip = 0L, n_max = -1L, guess_max = 1000L, has_col_names = FALSE) {
.Call('readxl_xls_col_types', PACKAGE = 'readxl', path, na, sheet_i, skip, n_max, guess_max, has_col_names)
}

read_xls_ <- function(path, sheet_i, col_names, col_types, na, skip = 0L, guess_max = 1000L) {
.Call('readxl_read_xls_', PACKAGE = 'readxl', path, sheet_i, col_names, col_types, na, skip, guess_max)
read_xls_ <- function(path, sheet_i, col_names, col_types, na, skip = 0L, n_max = -1L, guess_max = 1000L) {
.Call('readxl_read_xls_', PACKAGE = 'readxl', path, sheet_i, col_names, col_types, na, skip, n_max, guess_max)
}

xlsx_sheets <- function(path) {
Expand All @@ -33,24 +33,20 @@ xlsx_date_formats <- function(path) {
.Call('readxl_xlsx_date_formats', PACKAGE = 'readxl', path)
}

# Thin wrapper: hands `path`, `sheet_i` and `skip` unchanged to the compiled
# routine `readxl_xlsx_dim`, looked up in the readxl package.
xlsx_dim <- function(path, sheet_i = 0L, skip = 0L) {
  .Call(
    "readxl_xlsx_dim",
    path, sheet_i, skip,
    PACKAGE = "readxl"
  )
}

# Thin wrapper: hands `ref` unchanged to the compiled routine
# `readxl_parse_ref`, looked up in the readxl package.
parse_ref <- function(ref) {
  .Call(
    "readxl_parse_ref",
    ref,
    PACKAGE = "readxl"
  )
}

xlsx_col_types <- function(path, sheet_i = 0L, na = character(), skip = 0L, guess_max = 1000L, has_col_names = FALSE) {
.Call('readxl_xlsx_col_types', PACKAGE = 'readxl', path, sheet_i, na, skip, guess_max, has_col_names)
xlsx_col_types <- function(path, sheet_i = 0L, na = character(), skip = 0L, n_max = -1L, guess_max = 1000L, has_col_names = FALSE) {
.Call('readxl_xlsx_col_types', PACKAGE = 'readxl', path, sheet_i, na, skip, n_max, guess_max, has_col_names)
}

xlsx_col_names <- function(path, na = character(), sheet_i = 0L, skip = 0L) {
.Call('readxl_xlsx_col_names', PACKAGE = 'readxl', path, na, sheet_i, skip)
xlsx_col_names <- function(path, na = character(), sheet_i = 0L, skip = 0L, n_max = 1L) {
.Call('readxl_xlsx_col_names', PACKAGE = 'readxl', path, na, sheet_i, skip, n_max)
}

read_xlsx_ <- function(path, sheet_i, col_names, col_types, na, skip = 0L, guess_max = 1000L) {
.Call('readxl_read_xlsx_', PACKAGE = 'readxl', path, sheet_i, col_names, col_types, na, skip, guess_max)
read_xlsx_ <- function(path, sheet_i, col_names, col_types, na, skip = 0L, n_max = -1L, guess_max = 1000L) {
.Call('readxl_read_xlsx_', PACKAGE = 'readxl', path, sheet_i, col_names, col_types, na, skip, n_max, guess_max)
}

zip_xml <- function(zip_path, file_path) {
Expand Down
64 changes: 44 additions & 20 deletions R/read_excel.R
Expand Up @@ -14,44 +14,51 @@ NULL
#' unskipped column.
#' @param col_types Either `NULL` to guess all from the spreadsheet or a
#' character vector containing one entry per column from these options:
#' "skip", "guess", "logical", "numeric", "date", "text" or "list". The
#' content of a cell in a skipped column is never read and that column will
#' not appear in the data frame output. A list cell loads a column as a list
#' of length 1 vectors, which are typed using the type guessing logic from
#' `col_types = NULL`, but on a cell-by-cell basis.
#' "skip", "guess", "logical", "numeric", "date", "text" or "list". If exactly
#' one `col_type` is specified, it will be recycled. The content of a cell in a
#' skipped column is never read and that column will not appear in the data
#' frame output. A list cell loads a column as a list of length 1 vectors,
#' which are typed using the type guessing logic from `col_types = NULL`, but
#' on a cell-by-cell basis.
#' @param na Character vector of strings to use for missing values. By default,
#' readxl treats blank cells as missing data.
#' @param skip Number of rows to skip before reading any data. Leading blank
#' rows are automatically skipped.
#' @param guess_max Maximum number of rows to use for guessing column types.
#' @param skip Number of rows to skip before reading anything (column names or
#' data). Leading blank rows are automatically skipped.
#' @param n_max Maximum number of data rows to read.
#' @param guess_max Maximum number of data rows to use for guessing column
#' types.
#' @export
#' @examples
#' datasets <- readxl_example("datasets.xlsx")
#' read_excel(datasets)
#'
#' # Specific sheet either by position or by name
#' # Specify sheet either by position or by name
#' read_excel(datasets, 2)
#' read_excel(datasets, "mtcars")
#'
#' # Skipping rows and using default column names
#' # Skip rows and use default column names
#' read_excel(datasets, skip = 148, col_names = FALSE)
#'
#' # if col_types is of length one, it will be recycled
#' # Recycle a length-one col_types
#' read_excel(datasets, col_types = "text")
#'
#' # you can specify some col_types and guess others
#' # Specify some col_types and guess others
#' read_excel(datasets, col_types = c("text", "guess", "numeric", "guess", "guess"))
#'
#' # "list" col_type can handle information of disparate types
#' df <- read_excel(readxl_example("clippy.xlsx"), col_types = c("text", "list"))
#' df
#' df$value
#'
#' # Limit the number of data rows read
#' read_excel(datasets, n_max = 3)
read_excel <- function(path, sheet = 1L, col_names = TRUE, col_types = NULL,
na = "", skip = 0, guess_max = 1000) {
na = "", skip = 0, n_max = Inf,
guess_max = min(1000, n_max)) {
read_excel_(
path = path, sheet = sheet,
col_names = col_names, col_types = col_types,
na = na, skip = skip, guess_max = guess_max,
na = na, skip = skip, n_max = n_max, guess_max = guess_max,
excel_format(path)
)
}
Expand All @@ -63,29 +70,32 @@ read_excel <- function(path, sheet = 1L, col_names = TRUE, col_types = NULL,
#' @rdname read_excel
#' @export
read_xls <- function(path, sheet = 1L, col_names = TRUE, col_types = NULL,
na = "", skip = 0, guess_max = 1000) {
na = "", skip = 0, n_max = Inf,
guess_max = min(1000, n_max)) {
read_excel_(
path = path, sheet = sheet,
col_names = col_names, col_types = col_types,
na = na, skip = skip, guess_max = guess_max,
na = na, skip = skip, n_max = n_max, guess_max = guess_max,
format = "xls"
)
}

#' @rdname read_excel
#' @export
read_xlsx <- function(path, sheet = 1L, col_names = TRUE, col_types = NULL,
na = "", skip = 0, guess_max = 1000) {
na = "", skip = 0, n_max = Inf,
guess_max = min(1000, n_max)) {
read_excel_(
path = path, sheet = sheet,
col_names = col_names, col_types = col_types,
na = na, skip = skip, guess_max = guess_max,
na = na, skip = skip, n_max = n_max, guess_max = guess_max,
format = "xlsx"
)
}

read_excel_ <- function(path, sheet = 1L, col_names = TRUE, col_types = NULL,
na = "", skip = 0, guess_max = 1000, format) {
na = "", skip = 0, n_max = Inf,
guess_max = min(1000, n_max), format) {
if (format == "xls") {
sheets_fun <- xls_sheets
read_fun <- read_xls_
Expand All @@ -94,13 +104,14 @@ read_excel_ <- function(path, sheet = 1L, col_names = TRUE, col_types = NULL,
read_fun <- read_xlsx_
}
sheet <- standardise_sheet(sheet, sheets_fun(path))
n_max <- check_n_max(n_max)
guess_max <- check_guess_max(guess_max)
col_types <- check_col_types(col_types)
tibble::repair_names(
tibble::as_tibble(
read_fun(path = path, sheet = sheet,
col_names = col_names, col_types = col_types,
na = na, skip = skip, guess_max = guess_max),
na = na, skip = skip, n_max = n_max, guess_max = guess_max),
validate = FALSE
),
prefix = "X", sep = "__"
Expand Down Expand Up @@ -170,6 +181,19 @@ check_col_types <- function(col_types) {
col_types
}

# Validate the user-supplied `n_max` and translate it for the low-level readers.
#
# Accepts a single, non-missing, whole-valued number that is not negative
# (zero and `Inf` both pass). Anything else raises an error. `Inf` is
# mapped to -1, the default `n_max` of the low-level read functions; every
# other valid value is returned unchanged.
check_n_max <- function(n_max) {
  is_scalar_number <- length(n_max) == 1 && is.numeric(n_max)

  if (!is_scalar_number || !is_integerish(n_max) || is.na(n_max) ||
      n_max < 0) {
    stop("`n_max` must be a positive integer", call. = FALSE)
  }

  # Negative values were rejected above, so "infinite" can only mean +Inf here.
  if (is.infinite(n_max)) -1 else n_max
}

## from readr
check_guess_max <- function(guess_max, max_limit = .Machine$integer.max %/% 100) {

Expand Down
Binary file modified docs/favicon.ico
Binary file not shown.
1 change: 1 addition & 0 deletions docs/news/index.html

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

43 changes: 27 additions & 16 deletions docs/reference/read_excel.html

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.