tidyverse · jennybc · Mar 29, 2017 · Mar 29, 2017
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,7 @@
 # readxl 0.1.1.9000
 
+* Limit the rows read: new argument `n_max` puts a maximum on the number of rows read from the spreadsheet. (#306, #281 @jennybc)
+
 * Nonstandard XML namespace prefixes in xlsx: Some 3rd party tools use different XML namespace prefixes than MS Excel. These are now stripped from element names and attributes during parsing. (#295, #268, #202, #80 @jennybc)
 
 * The [Lotus 1-2-3 leap year bug](https://support.microsoft.com/en-us/help/214326/excel-incorrectly-assumes-that-the-year-1900-is-a-leap-year) is accounted for. Date-times prior to March 1, 1900 import correctly. Date-times on the non-existent leap day February 29, 1900 import as NA and throw a warning. (#264, #148, #292 @jennybc)

diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -9,16 +9,16 @@ xls_date_formats <- function(path) {
     .Call('readxl_xls_date_formats', PACKAGE = 'readxl', path)
 }
 
-xls_col_names <- function(path, na, sheet_i = 0L, skip = 0L) {
-    .Call('readxl_xls_col_names', PACKAGE = 'readxl', path, na, sheet_i, skip)
+xls_col_names <- function(path, na, sheet_i = 0L, skip = 0L, n_max = 1L) {
+    .Call('readxl_xls_col_names', PACKAGE = 'readxl', path, na, sheet_i, skip, n_max)
 }
 
-xls_col_types <- function(path, na, sheet_i = 0L, skip = 0L, guess_max = 1000L, has_col_names = FALSE) {
-    .Call('readxl_xls_col_types', PACKAGE = 'readxl', path, na, sheet_i, skip, guess_max, has_col_names)
+xls_col_types <- function(path, na, sheet_i = 0L, skip = 0L, n_max = -1L, guess_max = 1000L, has_col_names = FALSE) {
+    .Call('readxl_xls_col_types', PACKAGE = 'readxl', path, na, sheet_i, skip, n_max, guess_max, has_col_names)
 }
 
-read_xls_ <- function(path, sheet_i, col_names, col_types, na, skip = 0L, guess_max = 1000L) {
-    .Call('readxl_read_xls_', PACKAGE = 'readxl', path, sheet_i, col_names, col_types, na, skip, guess_max)
+read_xls_ <- function(path, sheet_i, col_names, col_types, na, skip = 0L, n_max = -1L, guess_max = 1000L) {
+    .Call('readxl_read_xls_', PACKAGE = 'readxl', path, sheet_i, col_names, col_types, na, skip, n_max, guess_max)
 }
 
 xlsx_sheets <- function(path) {
@@ -33,24 +33,20 @@ xlsx_date_formats <- function(path) {
     .Call('readxl_xlsx_date_formats', PACKAGE = 'readxl', path)
 }
 
-xlsx_dim <- function(path, sheet_i = 0L, skip = 0L) {
-    .Call('readxl_xlsx_dim', PACKAGE = 'readxl', path, sheet_i, skip)
-}
-
 parse_ref <- function(ref) {
     .Call('readxl_parse_ref', PACKAGE = 'readxl', ref)
 }
 
-xlsx_col_types <- function(path, sheet_i = 0L, na = character(), skip = 0L, guess_max = 1000L, has_col_names = FALSE) {
-    .Call('readxl_xlsx_col_types', PACKAGE = 'readxl', path, sheet_i, na, skip, guess_max, has_col_names)
+xlsx_col_types <- function(path, sheet_i = 0L, na = character(), skip = 0L, n_max = -1L, guess_max = 1000L, has_col_names = FALSE) {
+    .Call('readxl_xlsx_col_types', PACKAGE = 'readxl', path, sheet_i, na, skip, n_max, guess_max, has_col_names)
 }
 
-xlsx_col_names <- function(path, na = character(), sheet_i = 0L, skip = 0L) {
-    .Call('readxl_xlsx_col_names', PACKAGE = 'readxl', path, na, sheet_i, skip)
+xlsx_col_names <- function(path, na = character(), sheet_i = 0L, skip = 0L, n_max = 1L) {
+    .Call('readxl_xlsx_col_names', PACKAGE = 'readxl', path, na, sheet_i, skip, n_max)
 }
 
-read_xlsx_ <- function(path, sheet_i, col_names, col_types, na, skip = 0L, guess_max = 1000L) {
-    .Call('readxl_read_xlsx_', PACKAGE = 'readxl', path, sheet_i, col_names, col_types, na, skip, guess_max)
+read_xlsx_ <- function(path, sheet_i, col_names, col_types, na, skip = 0L, n_max = -1L, guess_max = 1000L) {
+    .Call('readxl_read_xlsx_', PACKAGE = 'readxl', path, sheet_i, col_names, col_types, na, skip, n_max, guess_max)
 }
 
 zip_xml <- function(zip_path, file_path) {

diff --git a/R/read_excel.R b/R/read_excel.R
@@ -14,44 +14,51 @@ NULL
 #'   unskipped column.
 #' @param col_types Either `NULL` to guess all from the spreadsheet or a
 #'   character vector containing one entry per column from these options:
-#'   "skip", "guess", "logical", "numeric", "date", "text" or "list". The
-#'   content of a cell in a skipped column is never read and that column will
-#'   not appear in the data frame output. A list cell loads a column as a list
-#'   of length 1 vectors, which are typed using the type guessing logic from
-#'   `col_types = NULL`, but on a cell-by-cell basis.
+#'   "skip", "guess", "logical", "numeric", "date", "text" or "list". If exactly
+#'   one `col_type` is specified, it will be recycled. The content of a cell in a
+#'   skipped column is never read and that column will not appear in the data
+#'   frame output. A list cell loads a column as a list of length 1 vectors,
+#'   which are typed using the type guessing logic from `col_types = NULL`, but
+#'   on a cell-by-cell basis.
 #' @param na Character vector of strings to use for missing values. By default,
 #'   readxl treats blank cells as missing data.
-#' @param skip Number of rows to skip before reading any data. Leading blank
-#'   rows are automatically skipped.
-#' @param guess_max Maximum number of rows to use for guessing column types.
+#' @param skip Number of rows to skip before reading anything (column names or
+#'   data). Leading blank rows are automatically skipped.
+#' @param n_max Maximum number of data rows to read.
+#' @param guess_max Maximum number of data rows to use for guessing column
+#'   types.
 #' @export
 #' @examples
 #' datasets <- readxl_example("datasets.xlsx")
 #' read_excel(datasets)
 #'
-#' # Specific sheet either by position or by name
+#' # Specify sheet either by position or by name
 #' read_excel(datasets, 2)
 #' read_excel(datasets, "mtcars")
 #'
-#' # Skipping rows and using default column names
+#' # Skip rows and use default column names
 #' read_excel(datasets, skip = 148, col_names = FALSE)
 #'
-#' # if col_types is of length one, it will be recycled
+#' # Recycle a length-one col_types
 #' read_excel(datasets, col_types = "text")
 #'
-#' # you can specify some col_types and guess others
+#' # Specify some col_types and guess others
 #' read_excel(datasets, col_types = c("text", "guess", "numeric", "guess", "guess"))
 #'
 #' # "list" col_type can handle information of disparate types
 #' df <- read_excel(readxl_example("clippy.xlsx"), col_types = c("text", "list"))
 #' df
 #' df$value
+#'
+#' # Limit the number of data rows read
+#' read_excel(datasets, n_max = 3)
 read_excel <- function(path, sheet = 1L, col_names = TRUE, col_types = NULL,
-                       na = "", skip = 0, guess_max = 1000) {
+                       na = "", skip = 0, n_max = Inf,
+                       guess_max = min(1000, n_max)) {
   read_excel_(
     path = path, sheet = sheet,
     col_names = col_names, col_types = col_types,
-    na = na, skip = skip, guess_max = guess_max,
+    na = na, skip = skip, n_max = n_max, guess_max = guess_max,
     excel_format(path)
   )
 }
@@ -63,29 +70,32 @@ read_excel <- function(path, sheet = 1L, col_names = TRUE, col_types = NULL,
 #' @rdname read_excel
 #' @export
 read_xls <- function(path, sheet = 1L, col_names = TRUE, col_types = NULL,
-                     na = "", skip = 0, guess_max = 1000) {
+                     na = "", skip = 0, n_max = Inf,
+                     guess_max = min(1000, n_max)) {
   read_excel_(
     path = path, sheet = sheet,
     col_names = col_names, col_types = col_types,
-    na = na, skip = skip, guess_max = guess_max,
+    na = na, skip = skip, n_max = n_max, guess_max = guess_max,
     format = "xls"
   )
 }
 
 #' @rdname read_excel
 #' @export
 read_xlsx <- function(path, sheet = 1L, col_names = TRUE, col_types = NULL,
-                      na = "", skip = 0, guess_max = 1000) {
+                      na = "", skip = 0, n_max = Inf,
+                      guess_max = min(1000, n_max)) {
   read_excel_(
     path = path, sheet = sheet,
     col_names = col_names, col_types = col_types,
-    na = na, skip = skip, guess_max = guess_max,
+    na = na, skip = skip, n_max = n_max, guess_max = guess_max,
     format = "xlsx"
   )
 }
 
 read_excel_ <- function(path, sheet = 1L, col_names = TRUE, col_types = NULL,
-                        na = "", skip = 0, guess_max = 1000, format) {
+                        na = "", skip = 0, n_max = Inf,
+                        guess_max = min(1000, n_max), format) {
   if (format == "xls") {
     sheets_fun <- xls_sheets
     read_fun <- read_xls_
@@ -94,13 +104,14 @@ read_excel_ <- function(path, sheet = 1L, col_names = TRUE, col_types = NULL,
     read_fun <- read_xlsx_
   }
   sheet <- standardise_sheet(sheet, sheets_fun(path))
+  n_max <- check_n_max(n_max)
   guess_max <- check_guess_max(guess_max)
   col_types <- check_col_types(col_types)
   tibble::repair_names(
     tibble::as_tibble(
       read_fun(path = path, sheet = sheet,
                col_names = col_names, col_types = col_types,
-               na = na, skip = skip, guess_max = guess_max),
+               na = na, skip = skip, n_max = n_max, guess_max = guess_max),
       validate = FALSE
     ),
     prefix = "X", sep = "__"
@@ -170,6 +181,19 @@ check_col_types <- function(col_types) {
   col_types
 }
 
+check_n_max <- function(n_max) {
+
+  if (length(n_max) != 1 || !is.numeric(n_max) || !is_integerish(n_max) ||
+      is.na(n_max) || n_max < 0) {
+    stop("`n_max` must be a positive integer", call. = FALSE)
+  }
+
+  if (n_max == Inf) {
+    n_max <- -1
+  }
+  n_max
+}
+
 ## from readr
 check_guess_max <- function(guess_max, max_limit = .Machine$integer.max %/% 100) {
 

diff --git a/docs/favicon.ico b/docs/favicon.ico
diff --git a/docs/news/index.html b/docs/news/index.html
diff --git a/docs/reference/read_excel.html b/docs/reference/read_excel.html