Skip to content
Permalink
master
Switch branches/tags
Go to file
 
 
Cannot retrieve contributors at this time
444 lines (391 sloc) 14.4 KB
#' @useDynLib haven, .registration = TRUE
#' @import rlang
#' @import vctrs
#' @importFrom tibble tibble
#' @importFrom hms hms
NULL
#' Read and write SAS files.
#'
#' `read_sas()` supports both sas7bdat files and the accompanying sas7bcat files
#' that SAS uses to record value labels. `write_sas()` is currently experimental
#' and only works for limited datasets.
#'
#' @param data_file,catalog_file Path to data and catalog files. The files are
#' processed with [readr::datasource()].
#' @param data Data frame to write.
#' @param path Path to file where the data will be written.
#' @param encoding,catalog_encoding The character encoding used for the
#' `data_file` and `catalog_encoding` respectively. A value of `NULL` uses the
#' encoding specified in the file; use this argument to override it if it is
#' incorrect.
#' @inheritParams tibble::as_tibble
#' @param col_select One or more selection expressions, like in
#' [dplyr::select()]. Use `c()` or `list()` to use more than one expression.
#' See `?dplyr::select` for details on available selection options. Only the
#' specified columns will be read from `data_file`.
#' @param skip Number of lines to skip before reading data.
#' @param n_max Maximum number of lines to read.
#' @param cols_only **Deprecated**: Use `col_select` instead.
#' @return A tibble, data frame variant with nice defaults.
#'
#' Variable labels are stored in the "label" attribute of each variable. It is
#' not printed on the console, but the RStudio viewer will show it.
#'
#' `write_sas()` returns the input `data` invisibly.
#' @export
#' @examples
#' path <- system.file("examples", "iris.sas7bdat", package = "haven")
#' read_sas(path)
read_sas <- function(data_file, catalog_file = NULL,
encoding = NULL, catalog_encoding = encoding,
col_select = NULL, skip = 0L, n_max = Inf, cols_only = "DEPRECATED",
.name_repair = "unique"
) {
if (!missing(cols_only)) {
warning("`cols_only` is deprecated. Please use `col_select` instead.", call. = FALSE)
stopifnot(is.character(cols_only)) # used to only work with a char vector
# guarantee a quosure to keep NULL and tidyselect logic clean downstream
col_select <- quo(c(!!!cols_only))
} else {
col_select <- enquo(col_select)
}
if (is.null(encoding)) {
encoding <- ""
}
cols_skip <- skip_cols(read_sas, !!col_select, data_file, encoding = encoding)
n_max <- validate_n_max(n_max)
spec_data <- readr::datasource(data_file)
if (is.null(catalog_file)) {
spec_cat <- list()
} else {
spec_cat <- readr::datasource(catalog_file)
}
switch(class(spec_data)[1],
source_file = df_parse_sas_file(spec_data, spec_cat, encoding = encoding, catalog_encoding = catalog_encoding, cols_skip = cols_skip, n_max = n_max, rows_skip = skip, name_repair = .name_repair),
source_raw = df_parse_sas_raw(spec_data, spec_cat, encoding = encoding, catalog_encoding = catalog_encoding, cols_skip = cols_skip, n_max = n_max, rows_skip = skip, name_repair = .name_repair),
stop("This kind of input is not handled", call. = FALSE)
)
}
#' @export
#' @rdname read_sas
write_sas <- function(data, path) {
validate_sas(data)
write_sas_(data, normalizePath(path, mustWork = FALSE))
invisible(data)
}
#' Read and write SAS transport files
#'
#' The SAS transport format is a open format, as is required for submission
#' of the data to the FDA.
#'
#' @inheritParams read_spss
#' @return A tibble, data frame variant with nice defaults.
#'
#' Variable labels are stored in the "label" attribute of each variable.
#' It is not printed on the console, but the RStudio viewer will show it.
#'
#' `write_xpt()` returns the input `data` invisibly.
#' @export
#' @examples
#' tmp <- tempfile(fileext = ".xpt")
#' write_xpt(mtcars, tmp)
#' read_xpt(tmp)
read_xpt <- function(file, col_select = NULL, skip = 0, n_max = Inf, .name_repair = "unique"
) {
cols_skip <- skip_cols(read_xpt, {{ col_select }}, file)
n_max <- validate_n_max(n_max)
spec <- readr::datasource(file)
switch(class(spec)[1],
source_file = df_parse_xpt_file(spec, cols_skip, n_max, skip, name_repair = .name_repair),
source_raw = df_parse_xpt_raw(spec, cols_skip, n_max, skip, name_repair = .name_repair),
stop("This kind of input is not handled", call. = FALSE)
)
}
#' @export
#' @rdname read_xpt
#' @param version Version of transport file specification to use: either 5 or 8.
#' @param name Member name to record in file. Defaults to file name sans
#' extension. Must be <= 8 characters for version 5, and <= 32 characters
#' for version 8.
write_xpt <- function(data, path, version = 8, name = NULL) {
stopifnot(version %in% c(5, 8))
if (is.null(name)) {
name <- tools::file_path_sans_ext(basename(path))
}
name <- validate_xpt_name(name, version)
write_xpt_(
data,
normalizePath(path, mustWork = FALSE),
version = version,
name = name
)
invisible(data)
}
validate_xpt_name <- function(name, version) {
if (version == 5) {
if (nchar(name) > 8) {
stop("`name` must be 8 characters or fewer", call. = FALSE)
}
} else {
if (nchar(name) > 32) {
stop("`name` must be 32 characters or fewer", call. = FALSE)
}
}
name
}
#' Read SPSS (`.sav`, `.zsav`, `.por`) files. Write `.sav` and `.zsav` files.
#'
#' `read_sav()` reads both `.sav` and `.zsav` files; `write_sav()` creates
#' `.zsav` files when `compress = TRUE`. `read_por()` reads `.por` files.
#' `read_spss()` uses either `read_por()` or `read_sav()` based on the
#' file extension.
#'
#' Currently haven can read and write logical, integer, numeric, character
#' and factors. See [labelled_spss()] for how labelled variables in
#' SPSS are handled in R.
#'
#' @inheritParams read_sas
#' @inheritParams readr::datasource
#' @param path Path to a file where the data will be written.
#' @param data Data frame to write.
#' @param encoding The character encoding used for the file. The default,
#' `NULL`, use the encoding specified in the file, but sometimes this
#' value is incorrect and it is useful to be able to override it.
#' @return A tibble, data frame variant with nice defaults.
#'
#' Variable labels are stored in the "label" attribute of each variable.
#' It is not printed on the console, but the RStudio viewer will show it.
#'
#' `write_sav()` returns the input `data` invisibly.
#' @name read_spss
#' @examples
#' path <- system.file("examples", "iris.sav", package = "haven")
#' read_sav(path)
#'
#' tmp <- tempfile(fileext = ".sav")
#' write_sav(mtcars, tmp)
#' read_sav(tmp)
NULL
#' @export
#' @rdname read_spss
read_sav <- function(file, encoding = NULL, user_na = FALSE, col_select = NULL, skip = 0, n_max = Inf, .name_repair = "unique") {
if (is.null(encoding)) {
encoding <- ""
}
cols_skip <- skip_cols(read_sav, {{ col_select }}, file, encoding)
n_max <- validate_n_max(n_max)
spec <- readr::datasource(file)
switch(class(spec)[1],
source_file = df_parse_sav_file(spec, encoding, user_na, cols_skip, n_max, skip, name_repair = .name_repair),
source_raw = df_parse_sav_raw(spec, encoding, user_na, cols_skip, n_max, skip, name_repair = .name_repair),
stop("This kind of input is not handled", call. = FALSE)
)
}
#' @export
#' @rdname read_spss
read_por <- function(file, user_na = FALSE, col_select = NULL, skip = 0, n_max = Inf, .name_repair = "unique") {
cols_skip <- skip_cols(read_por, {{ col_select }}, file)
n_max <- validate_n_max(n_max)
spec <- readr::datasource(file)
switch(class(spec)[1],
source_file = df_parse_por_file(spec, encoding = "", user_na = user_na, cols_skip, n_max, skip, name_repair = .name_repair),
source_raw = df_parse_por_raw(spec, encoding = "", user_na = user_na, cols_skip, n_max, skip, name_repair = .name_repair),
stop("This kind of input is not handled", call. = FALSE)
)
}
#' @export
#' @rdname read_spss
#' @param compress If `TRUE`, will compress the file, resulting in a `.zsav`
#' file.
write_sav <- function(data, path, compress = FALSE) {
validate_sav(data)
write_sav_(data, normalizePath(path, mustWork = FALSE), compress = compress)
invisible(data)
}
#' @export
#' @rdname read_spss
#' @param user_na If `TRUE` variables with user defined missing will
#' be read into [labelled_spss()] objects. If `FALSE`, the
#' default, user-defined missings will be converted to `NA`.
read_spss <- function(file, user_na = FALSE, col_select = NULL, skip = 0, n_max = Inf, .name_repair = "unique") {
ext <- tolower(tools::file_ext(file))
switch(ext,
sav = read_sav(file, user_na = user_na, col_select = {{ col_select }}, n_max = n_max, skip = skip, .name_repair = .name_repair),
zsav = read_sav(file, user_na = user_na, col_select = {{ col_select }}, n_max = n_max, skip = skip, .name_repair = .name_repair),
por = read_por(file, user_na = user_na, col_select = {{ col_select }}, n_max = n_max, skip = skip, .name_repair = .name_repair),
stop("Unknown extension '.", ext, "'", call. = FALSE)
)
}
#' Read and write Stata DTA files.
#'
#' Currently haven can read and write logical, integer, numeric, character
#' and factors. See [labelled()] for how labelled variables in
#' Stata are handled in R.
#'
#' @section Character encoding:
#' Prior to Stata 14, files did not declare a text encoding, and the
#' default encoding differed across platforms. If `encoding = NULL`,
#' haven assumes the encoding is windows-1252, the text encoding used by
#' Stata on Windows. Unfortunately Stata on Mac and Linux use a different
#' default encoding, "latin1". If you encounter an error such as
#' "Unable to convert string to the requested encoding", try
#' `encoding = "latin1"`
#'
#' For Stata 14 and later, you should not need to manually specify `encoding`
#' value unless the value was incorrectly recorded in the source file.
#'
#' @inheritParams readr::datasource
#' @inheritParams read_spss
#' @param encoding The character encoding used for the file. Generally,
#' only needed for Stata 13 files and earlier. See Encoding section
#' for details.
#' @return A tibble, data frame variant with nice defaults.
#'
#' Variable labels are stored in the "label" attribute of each variable.
#' It is not printed on the console, but the RStudio viewer will show it.
#'
#' If a dataset label is defined in Stata, it will stored in the "label"
#' attribute of the tibble.
#'
#' `write_dta()` returns the input `data` invisibly.
#' @export
#' @examples
#' path <- system.file("examples", "iris.dta", package = "haven")
#' read_dta(path)
#'
#' tmp <- tempfile(fileext = ".dta")
#' write_dta(mtcars, tmp)
#' read_dta(tmp)
#' read_stata(tmp)
read_dta <- function(file, encoding = NULL, col_select = NULL, skip = 0, n_max = Inf, .name_repair = "unique") {
if (is.null(encoding)) {
encoding <- ""
}
cols_skip <- skip_cols(read_dta, {{ col_select }}, file, encoding)
n_max <- validate_n_max(n_max)
spec <- readr::datasource(file)
switch(class(spec)[1],
source_file = df_parse_dta_file(spec, encoding, cols_skip, n_max, skip, name_repair = .name_repair),
source_raw = df_parse_dta_raw(spec, encoding, cols_skip, n_max, skip, name_repair = .name_repair),
stop("This kind of input is not handled", call. = FALSE)
)
}
#' @export
#' @rdname read_dta
read_stata <- read_dta
#' @export
#' @rdname read_dta
#' @param version File version to use. Supports versions 8-15.
#' @param label Dataset label to use, or `NULL`. Defaults to the value stored in
#' the "label" attribute of `data`. Must be <= 80 characters.
write_dta <- function(data, path, version = 14, label = attr(data, "label")) {
validate_dta(data, version = version)
validate_dta_label(label)
write_dta_(data,
normalizePath(path, mustWork = FALSE),
version = stata_file_format(version),
label = label
)
invisible(data)
}
stata_file_format <- function(version) {
stopifnot(is.numeric(version), length(version) == 1)
version <- as.integer(version)
if (version == 15L) {
119
} else if (version == 14L) {
118
} else if (version == 13L) {
117
} else if (version == 12L) {
115
} else if (version %in% c(10L, 11L)) {
114
} else if (version %in% c(8L, 9L)) {
113
} else {
stop("Version ", version, " not currently supported", call. = FALSE)
}
}
validate_dta <- function(data, version) {
stopifnot(is.data.frame(data))
# Check variable names
bad_names <- !grepl("^[A-Za-z_]{1}[A-Za-z0-9_]{0,31}$", names(data))
if (version < 14 && any(bad_names)) {
stop(
"The following variable names are not valid Stata variables: ",
var_names(data, bad_names),
call. = FALSE
)
}
# Check double vectors can only have labelled integers
bad_labels <- vapply(data, has_non_integer_labels, logical(1))
if (any(bad_labels)) {
stop(
"Stata only supports labelling with integers.\nProblems: ",
var_names(data, bad_labels),
call. = FALSE
)
}
}
validate_dta_label <- function(label) {
if (!is.null(label)) {
stopifnot(is.character(label), length(label) == 1)
if (nchar(label) > 80) {
stop("Stata data labels must be 80 characters or fewer", call. = FALSE)
}
}
}
has_non_integer_labels <- function(x) {
if (!is.labelled(x)) {
return(FALSE)
}
if (!is.double(x)) {
return(FALSE)
}
!is_integerish(attr(x, "labels"))
}
validate_sav <- function(data) {
stopifnot(is.data.frame(data))
# Check factor lengths
level_lengths <- vapply(data, max_level_length, integer(1))
bad_lengths <- level_lengths > 120
if (any(bad_lengths)) {
stop(
"SPSS only supports levels with <= 120 characters\n",
"Problems: ", var_names(data, bad_lengths),
call. = FALSE
)
}
}
validate_sas <- function(data) {
stopifnot(is.data.frame(data))
}
var_names <- function(data, i) {
x <- names(data)[i]
paste(encodeString(x, quote = "`"), collapse = ", ")
}
skip_cols <- function(reader, col_select = NULL, ...) {
col_select <- enquo(col_select)
if (quo_is_null(col_select)) {
return(character())
}
cols <- names(reader(..., n_max = 0L))
sels <- tidyselect::vars_select(cols, !!col_select)
if (length(sels) == 0) {
stop("Can't find any columns matching `col_select` in data.", call. = FALSE)
}
setdiff(cols, sels)
}
validate_n_max <- function(n) {
if (!is.numeric(n) && !is.na(n)) {
stop("`n_max` must be numeric, not ", class(n)[1], ".", call. = FALSE)
}
if (length(n) != 1) {
stop("`n_max` must have length 1, not ", length(n), ".", call. = FALSE)
}
if (is.na(n) || is.infinite(n) || n < 0) {
return(-1L)
}
as.integer(n)
}