Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add fwf_cols function #616

Merged
merged 24 commits into from Feb 27, 2017
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions NAMESPACE
Expand Up @@ -40,6 +40,7 @@ export(default_locale)
export(format_csv)
export(format_delim)
export(format_tsv)
export(fwf_cols)
export(fwf_empty)
export(fwf_positions)
export(fwf_widths)
Expand Down
69 changes: 48 additions & 21 deletions R/read_fwf.R
@@ -1,3 +1,5 @@


#' Read a fixed width file into a tibble
#'
#' A fixed width file can be a very compact representation of numeric data.
Expand All @@ -20,20 +22,24 @@
#' fwf_sample <- readr_example("fwf-sample.txt")
#' cat(read_lines(fwf_sample))
#'
#' # You can specify column positions in three ways:
#' # You can specify column positions in several ways:
#' # 1. Guess based on position of empty columns
#' read_fwf(fwf_sample, fwf_empty(fwf_sample, col_names = c("first", "last", "state", "ssn")))
#' # 2. A vector of field widths
#' read_fwf(fwf_sample, fwf_widths(c(20, 10, 12), c("name", "state", "ssn")))
#' # 3. Paired vectors of start and end positions
#' read_fwf(fwf_sample, fwf_positions(c(1, 30), c(10, 42), c("name", "ssn")))
#' # 4. Named arguments with start and end positions
#' read_fwf(fwf_sample, fwf_cols(name = c(1, 10), ssn = c(30, 42)))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you include the width form here too?

#' # 5. Named arguments with column widths
#' read_fwf(fwf_sample, fwf_cols(name = 20, state = 10, ssn = 12))
read_fwf <- function(file, col_positions, col_types = NULL,
locale = default_locale(), na = c("", "NA"),
comment = "", skip = 0, n_max = Inf,
guess_max = min(n_max, 1000), progress = show_progress()) {
ds <- datasource(file, skip = skip)
if (inherits(ds, "source_file") && empty_file(file)) {
return(tibble::data_frame())
return(tibble::tibble())
}

tokenizer <- tokenizer_fwf(col_positions$begin, col_positions$end, na = na, comment = comment)
Expand All @@ -54,7 +60,8 @@ read_fwf <- function(file, col_positions, col_types = NULL,
}

out <- read_tokens(ds, tokenizer, spec$cols, names(spec$cols),
locale_ = locale, n_max = if (n_max == Inf) -1 else n_max, progress = progress)
locale_ = locale, n_max = if (n_max == Inf) -1 else n_max,
progress = progress)

out <- name_problems(out, names(spec$cols), source_name(file))
attr(out, "spec") <- spec
Expand All @@ -71,13 +78,8 @@ fwf_empty <- function(file, skip = 0, col_names = NULL, comment = "", n = 100L)
out <- whitespaceColumns(ds, comment = comment, n = n)
out$end[length(out$end)] <- NA

if (is.null(col_names)) {
col_names <- paste0("X", seq_along(out$begin))
} else {
stopifnot(length(out$begin) == length(col_names))
}
col_names <- fwf_col_names(col_names, length(out$begin))
out$col_names <- col_names

out
}

Expand All @@ -87,28 +89,53 @@ fwf_empty <- function(file, skip = 0, col_names = NULL, comment = "", n = 100L)
#' reading a ragged fwf file.
#' @param col_names Either NULL, or a character vector column names.
fwf_widths <- function(widths, col_names = NULL) {
pos <- cumsum(c(1, abs(widths)))

fwf_positions(pos[-length(pos)], pos[-1] - 1, col_names)
pos <- cumsum(c(1L, abs(widths)))
fwf_positions(pos[-length(pos)], pos[-1] - 1L, col_names)
}

#' @rdname read_fwf
#' @export
#' @param start,end Starting and ending (inclusive) positions of each field.
#' Use NA as last end field when reading a ragged fwf file.
fwf_positions <- function(start, end, col_names = NULL) {
fwf_positions <- function(start, end = NULL, col_names = NULL) {

stopifnot(length(start) == length(end))
col_names <- fwf_col_names(col_names, length(start))

if (is.null(col_names)) {
col_names <- paste0("X", seq_along(start))
} else {
stopifnot(length(start) == length(col_names))
}

list(
begin = start - 1,
tibble(
begin = start - 1L,
end = end, # -1 to change to 0 offset, +1 to be exclusive,
col_names = col_names
)
}


#' @rdname read_fwf
#' @export
#' @param ... If the first element is a data frame,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This feels too flexible to me. But if you really think it's a good idea to keep it, the function signature should be x, ...

#' then it must have all numeric columns and either one or two rows.
#' The column names are the variable names, and the column values are the
#' variable widths if a length one vector, and variable start and end
#' positions.
#' Otherwise, the elements of `...` are used to construct a data frame
#' with or or two rows as above.
fwf_cols <- function(...) {
x <- lapply(list(...), as.integer)
names(x) <- fwf_col_names(names(x), length(x))
x <- tibble::as_tibble(x)
if (nrow(x) == 2) {
fwf_positions(as.integer(x[1, ]), as.integer(x[2, ]), names(x))
} else if (nrow(x) == 1) {
fwf_widths(as.integer(x[1, ]), names(x))
} else {
stop("All variables must have either one (width) two (start, end) values.",
call. = FALSE)
}
}

fwf_col_names <- function(nm, n) {
nm <- nm %||% rep("", n)
nm_empty <- (nm == "")
nm[nm_empty] <- paste0("X", seq_len(n))[nm_empty]
nm
}
19 changes: 17 additions & 2 deletions man/read_fwf.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

29 changes: 29 additions & 0 deletions tests/testthat/test-read-fwf.R
Expand Up @@ -127,6 +127,35 @@ test_that("error on empty spec (#511, #519)", {
expect_error(read_fwf(txt, pos), "Zero-length.*specifications not supported")
})

# fwf_cols
test_that("fwf_cols produces correct fwf_positions object with elements of length 2", {
expected <- fwf_positions(c(1L, 9L, 4L), c(2L, 12L, 6L), c("a", "b", "d"))
expect_equivalent(fwf_cols(a = c(1, 2), b = c(9, 12), d = c(4, 6)), expected)
})

test_that("fwf_cols produces correct fwf_positions object with elements of length 1", {
expected <- fwf_widths(c(2L, 4L, 3L), c("a", "b", "c"))
expect_equivalent(fwf_cols(a = 2, b = 4, c = 3), expected)
})


test_that("fwf_cols throws error when arguments are not length 1 or 2", {
pattern <- "Variables must be length 1 or .*"
expect_error(fwf_cols(a = 1:3, b = 4:5), pattern)
expect_error(fwf_cols(a = c(), b = 4:5), pattern)
})

test_that("fwf_cols works with unnamed columns", {
expect_equivalent(
fwf_cols(c(1, 2), c(9, 12), c(4, 6)),
fwf_positions(c(1L, 9L, 4L), c(2L, 12L, 6L), c("X1", "X2", "X3"))
)
expect_equivalent(
fwf_cols(a = c(1, 2), c(9, 12), c(4, 6)),
fwf_positions(c(1L, 9L, 4L), c(2L, 12L, 6L), c("a", "X2", "X3"))
)
})

# read_table -------------------------------------------------------------------

test_that("read_table silently reads ragged last column", {
Expand Down