R/pivot-wide.R

#' Pivot data from long to wide
#
#' @description
#' `pivot_wider()` "widens" data, increasing the number of columns and
#' decreasing the number of rows. The inverse transformation is
#' [pivot_longer()].
#'
#' Learn more in `vignette("pivot")`.
#'
#' @details
#' `pivot_wider()` is an updated approach to [spread()], designed to be both
#' simpler to use and to handle more use cases. We recommend you use
#' `pivot_wider()` for new code; `spread()` isn't going away but is no longer
#' under active development.
#'
#' @seealso [pivot_wider_spec()] to pivot "by hand" with a data frame that
#'   defines a pivotting specification.
#' @inheritParams pivot_longer
#' @param id_cols <[`tidy-select`][tidyr_tidy_select]> A set of columns that
#'   uniquely identifies each observation. Defaults to all columns in `data`
#'   except for the columns specified in `names_from` and `values_from`.
#'   Typically used when you have redundant variables, i.e. variables whose
#'   values are perfectly correlated with existing variables.
#' @param id_expand Should the values in the `id_cols` columns be expanded by
#'   [expand()] before pivoting? This results in more rows, the output will
#'   contain a complete expansion of all possible values in `id_cols`. Implicit
#'   factor levels that aren't represented in the data will become explicit.
#'   Additionally, the row values corresponding to the expanded `id_cols` will
#'   be sorted.
#' @param names_from,values_from <[`tidy-select`][tidyr_tidy_select]> A pair of
#'   arguments describing which column (or columns) to get the name of the
#'   output column (`names_from`), and which column (or columns) to get the
#'   cell values from (`values_from`).
#'
#'   If `values_from` contains multiple values, the value will be added to the
#'   front of the output column.
#' @param names_sep If `names_from` or `values_from` contains multiple
#'   variables, this will be used to join their values together into a single
#'   string to use as a column name.
#' @param names_prefix String added to the start of every variable name. This is
#'   particularly useful if `names_from` is a numeric vector and you want to
#'   create syntactic variable names.
#' @param names_glue Instead of `names_sep` and `names_prefix`, you can supply
#'   a glue specification that uses the `names_from` columns (and special
#'   `.value`) to create custom column names.
#' @param names_sort Should the column names be sorted? If `FALSE`, the default,
#'   column names are ordered by first appearance.
#' @param names_vary When `names_from` identifies a column (or columns) with
#'   multiple unique values, and multiple `values_from` columns are provided,
#'   in what order should the resulting column names be combined?
#'
#'   - `"fastest"` varies `names_from` values fastest, resulting in a column
#'     naming scheme of the form: `value1_name1, value1_name2, value2_name1,
#'     value2_name2`. This is the default.
#'
#'   - `"slowest"` varies `names_from` values slowest, resulting in a column
#'     naming scheme of the form: `value1_name1, value2_name1, value1_name2,
#'     value2_name2`.
#' @param names_expand Should the values in the `names_from` columns be expanded
#'   by [expand()] before pivoting? This results in more columns, the output
#'   will contain column names corresponding to a complete expansion of all
#'   possible values in `names_from`. Implicit factor levels that aren't
#'   represented in the data will become explicit. Additionally, the column
#'   names will be sorted, identical to what `names_sort` would produce.
#' @param values_fill Optionally, a (scalar) value that specifies what each
#'   `value` should be filled in with when missing.
#'
#'   This can be a named list if you want to apply different fill values to
#'   different value columns.
#' @param values_fn Optionally, a function applied to the value in each cell
#'   in the output. You will typically use this when the combination of
#'   `id_cols` and `names_from` columns does not uniquely identify an
#'   observation.
#'
#'   This can be a named list if you want to apply different aggregations
#'   to different `values_from` columns.
#' @param unused_fn Optionally, a function applied to summarize the values from
#'   the unused columns (i.e. columns not identified by `id_cols`,
#'   `names_from`, or `values_from`).
#'
#'   The default drops all unused columns from the result.
#'
#'   This can be a named list if you want to apply different aggregations
#'   to different unused columns.
#'
#'   `id_cols` must be supplied for `unused_fn` to be useful, since otherwise
#'   all unspecified columns will be considered `id_cols`.
#'
#'   This is similar to grouping by the `id_cols` then summarizing the
#'   unused columns using `unused_fn`.
#' @param ... Additional arguments passed on to methods.
#' @export
#' @examples
#' # See vignette("pivot") for examples and explanation
#'
#' fish_encounters
#' fish_encounters %>%
#'   pivot_wider(names_from = station, values_from = seen)
#' # Fill in missing values
#' fish_encounters %>%
#'   pivot_wider(names_from = station, values_from = seen, values_fill = 0)
#'
#' # Generate column names from multiple variables
#' us_rent_income
#' us_rent_income %>%
#'   pivot_wider(
#'     names_from = variable,
#'     values_from = c(estimate, moe)
#'   )
#'
#' # You can control whether `names_from` values vary fastest or slowest
#' # relative to the `values_from` column names using `names_vary`.
#' us_rent_income %>%
#'   pivot_wider(
#'     names_from = variable,
#'     values_from = c(estimate, moe),
#'     names_vary = "slowest"
#'   )
#'
#' # When there are multiple `names_from` or `values_from`, you can use
#' # use `names_sep` or `names_glue` to control the output variable names
#' us_rent_income %>%
#'   pivot_wider(
#'     names_from = variable,
#'     names_sep = ".",
#'     values_from = c(estimate, moe)
#'   )
#' us_rent_income %>%
#'   pivot_wider(
#'     names_from = variable,
#'     names_glue = "{variable}_{.value}",
#'     values_from = c(estimate, moe)
#'   )
#'
#' # Can perform aggregation with `values_fn`
#' warpbreaks <- as_tibble(warpbreaks[c("wool", "tension", "breaks")])
#' warpbreaks
#' warpbreaks %>%
#'   pivot_wider(
#'     names_from = wool,
#'     values_from = breaks,
#'     values_fn = mean
#'   )
#'
#' # Can pass an anonymous function to `values_fn` when you
#' # need to supply additional arguments
#' warpbreaks$breaks[1] <- NA
#' warpbreaks %>%
#'   pivot_wider(
#'     names_from = wool,
#'     values_from = breaks,
#'     values_fn = ~mean(.x, na.rm = TRUE)
#'   )
pivot_wider <- function(data,
                        id_cols = NULL,
                        id_expand = FALSE,
                        names_from = name,
                        names_prefix = "",
                        names_sep = "_",
                        names_glue = NULL,
                        names_sort = FALSE,
                        names_vary = "fastest",
                        names_expand = FALSE,
                        names_repair = "check_unique",
                        values_from = value,
                        values_fill = NULL,
                        values_fn = NULL,
                        unused_fn = NULL,
                        ...) {
  ellipsis::check_dots_used()
  UseMethod("pivot_wider")
}

#' @export
pivot_wider.data.frame <- function(data,
                                   id_cols = NULL,
                                   id_expand = FALSE,
                                   names_from = name,
                                   names_prefix = "",
                                   names_sep = "_",
                                   names_glue = NULL,
                                   names_sort = FALSE,
                                   names_vary = "fastest",
                                   names_expand = FALSE,
                                   names_repair = "check_unique",
                                   values_from = value,
                                   values_fill = NULL,
                                   values_fn = NULL,
                                   unused_fn = NULL,
                                   ...) {
  names_from <- enquo(names_from)
  values_from <- enquo(values_from)

  spec <- build_wider_spec(
    data = data,
    names_from = !!names_from,
    values_from = !!values_from,
    names_prefix = names_prefix,
    names_sep = names_sep,
    names_glue = names_glue,
    names_sort = names_sort,
    names_vary = names_vary,
    names_expand = names_expand
  )

  id_cols <- build_wider_id_cols_expr(
    data = data,
    id_cols = {{id_cols}},
    names_from = !!names_from,
    values_from = !!values_from
  )

  pivot_wider_spec(
    data = data,
    spec = spec,
    id_cols = !!id_cols,
    id_expand = id_expand,
    names_repair = names_repair,
    values_fill = values_fill,
    values_fn = values_fn,
    unused_fn = unused_fn
  )
}

#' Pivot data from long to wide using a spec
#'
#' This is a low level interface to pivotting, inspired by the cdata package,
#' that allows you to describe pivotting with a data frame.
#'
#' @keywords internal
#' @export
#' @inheritParams pivot_wider
#' @param spec A specification data frame. This is useful for more complex
#'  pivots because it gives you greater control on how metadata stored in the
#'  columns become column names in the result.
#'
#'   Must be a data frame containing character `.name` and `.value` columns.
#'   Additional columns in `spec` should be named to match columns in the
#'   long format of the dataset and contain values corresponding to columns
#'   pivoted from the wide format.
#'   The special `.seq` variable is used to disambiguate rows internally;
#'   it is automatically removed after pivotting.
#' @param id_cols <[`tidy-select`][tidyr_tidy_select]> A set of columns that
#'   uniquely identifies each observation. Defaults to all columns in `data`
#'   except for the columns specified in `spec$.value` and the columns of the
#'   `spec` that aren't named `.name` or `.value`. Typically used when you have
#'   redundant variables, i.e. variables whose values are perfectly correlated
#'   with existing variables.
#'
#' @examples
#' # See vignette("pivot") for examples and explanation
#'
#' us_rent_income
#' spec1 <- us_rent_income %>%
#'   build_wider_spec(names_from = variable, values_from = c(estimate, moe))
#' spec1
#'
#' us_rent_income %>%
#'   pivot_wider_spec(spec1)
#'
#' # Is equivalent to
#' us_rent_income %>%
#'   pivot_wider(names_from = variable, values_from = c(estimate, moe))
#'
#' # `pivot_wider_spec()` provides more control over column names and output format
#' # instead of creating columns with estimate_ and moe_ prefixes,
#' # keep original variable name for estimates and attach _moe as suffix
#' spec2 <- tibble(
#'   .name = c("income", "rent", "income_moe", "rent_moe"),
#'   .value = c("estimate", "estimate", "moe", "moe"),
#'   variable = c("income", "rent", "income", "rent")
#' )
#'
#' us_rent_income %>%
#'   pivot_wider_spec(spec2)
pivot_wider_spec <- function(data,
                             spec,
                             names_repair = "check_unique",
                             id_cols = NULL,
                             id_expand = FALSE,
                             values_fill = NULL,
                             values_fn = NULL,
                             unused_fn = NULL) {
  input <- data

  spec <- check_pivot_spec(spec)

  names_from_cols <- names(spec)[-(1:2)]
  values_from_cols <- vec_unique(spec$.value)
  non_id_cols <- c(names_from_cols, values_from_cols)

  id_cols <- select_wider_id_cols(
    data = data,
    id_cols = {{id_cols}},
    non_id_cols = non_id_cols
  )

  values_fn <- check_list_of_functions(values_fn, values_from_cols, "values_fn")

  unused_cols <- setdiff(names(data), c(id_cols, non_id_cols))
  unused_fn <- check_list_of_functions(unused_fn, unused_cols, "unused_fn")
  unused_cols <- names(unused_fn)

  if (is.null(values_fill)) {
    values_fill <- list()
  }
  if (is_scalar(values_fill)) {
    values_fill <- rep_named(values_from_cols, list(values_fill))
  }
  if (!vec_is_list(values_fill)) {
    abort("`values_fill` must be NULL, a scalar, or a named list")
  }
  values_fill <- values_fill[intersect(names(values_fill), values_from_cols)]

  if (!is_bool(id_expand)) {
    abort("`id_expand` must be a single `TRUE` or `FALSE`.")
  }

  # Early conversion to tibble because data.table returns zero rows if
  # zero cols are selected. Also want to avoid the grouped-df behavior
  # of `complete()`.
  data <- as_tibble(data)
  data <- data[vec_unique(c(id_cols, names_from_cols, values_from_cols, unused_cols))]

  if (id_expand) {
    data <- complete(data, !!!syms(id_cols), fill = values_fill, explicit = FALSE)
  }

  # Figure out rows in output
  rows <- data[id_cols]
  row_id <- vec_group_id(rows)
  nrow <- attr(row_id, "n")
  rows <- vec_slice(rows, vec_unique_loc(row_id))

  n_unused_fn <- length(unused_fn)

  unused <- vector("list", length = n_unused_fn)
  names(unused) <- unused_cols

  if (n_unused_fn > 0L) {
    # This can be expensive, only compute if we are using `unused_fn`
    unused_locs <- vec_group_loc(row_id)$loc
  }

  for (i in seq_len(n_unused_fn)) {
    unused_col <- unused_cols[[i]]
    unused_fn_i <- unused_fn[[i]]

    unused_value <- data[[unused_col]]

    unused[[i]] <- value_summarize(
      value = unused_value,
      value_locs = unused_locs,
      value_name = unused_col,
      fn = unused_fn_i,
      fn_name = "unused_fn"
    )
  }

  unused <- tibble::new_tibble(unused, nrow = nrow)

  duplicate_names <- character(0L)

  value_specs <- unname(split(spec, spec$.value))
  value_out <- vec_init(list(), length(value_specs))

  for (i in seq_along(value_out)) {
    value_spec <- value_specs[[i]]
    value_name <- value_spec$.value[[1]]
    value <- data[[value_name]]

    cols <- data[names(value_spec)[-(1:2)]]
    col_id <- vec_match(as_tibble(cols), value_spec[-(1:2)])
    value_id <- data.frame(row = row_id, col = col_id)

    value_fn <- values_fn[[value_name]]

    if (is.null(value_fn) && vec_duplicate_any(value_id)) {
      # There are unhandled duplicates. Handle them with `list()` and warn.
      value_fn <- list
      duplicate_names <- c(duplicate_names, value_name)
    }

    if (!is.null(value_fn)) {
      result <- vec_group_loc(value_id)
      value_id <- result$key
      value_locs <- result$loc

      value <- value_summarize(
        value = value,
        value_locs = value_locs,
        value_name = value_name,
        fn = value_fn,
        fn_name = "values_fn"
      )
    }

    ncol <- nrow(value_spec)

    fill <- values_fill[[value_name]]
    if (is.null(fill)) {
      out <- vec_init(value, nrow * ncol)
    } else {
      stopifnot(vec_size(fill) == 1)
      fill <- vec_cast(fill, value)
      out <- vec_rep_each(fill, nrow * ncol)
    }
    vec_slice(out, value_id$row + nrow * (value_id$col - 1L)) <- value

    value_out[[i]] <- chop_rectangular_df(out, value_spec$.name)
  }

  if (length(duplicate_names) > 0L) {
    duplicate_names <- glue::backtick(duplicate_names)
    duplicate_names <- glue::glue_collapse(duplicate_names, sep = ", ", last = " and ")

    group_cols <- c(id_cols, names_from_cols)
    group_cols <- backtick_if_not_syntactic(group_cols)
    group_cols <- glue::glue_collapse(group_cols, sep = ", ")

    warn(glue::glue(
      "Values from {duplicate_names} are not uniquely identified; output will contain list-cols.\n",
      "* Use `values_fn = list` to suppress this warning.\n",
      "* Use `values_fn = {{summary_fun}}` to summarise duplicates.\n",
      "* Use the following dplyr code to identify duplicates.\n",
      "  {{data}} %>%\n",
      "    dplyr::group_by({group_cols}) %>%\n",
      "    dplyr::summarise(n = dplyr::n(), .groups = \"drop\") %>%\n",
      "    dplyr::filter(n > 1L)"
    ))
  }

  # `check_pivot_spec()` ensures `.name` is unique. Name repair shouldn't be needed.
  values <- vec_cbind(!!!value_out, .name_repair = "minimal")

  # Recreate desired column order of the new spec columns (#569)
  values <- values[spec$.name]

  out <- wrap_error_names(vec_cbind(
    rows,
    values,
    unused,
    .name_repair = names_repair
  ))

  reconstruct_tibble(input, out)
}

#' @export
#' @rdname pivot_wider_spec
#' @inheritParams pivot_wider
build_wider_spec <- function(data,
                             names_from = name,
                             values_from = value,
                             names_prefix = "",
                             names_sep = "_",
                             names_glue = NULL,
                             names_sort = FALSE,
                             names_vary = "fastest",
                             names_expand = FALSE) {
  names_from <- tidyselect::eval_select(enquo(names_from), data)
  values_from <- tidyselect::eval_select(enquo(values_from), data)

  if (is_empty(names_from)) {
    abort("`names_from` must select at least one column.")
  }
  if (is_empty(values_from)) {
    abort("`values_from` must select at least one column.")
  }

  names_vary <- arg_match0(names_vary, c("fastest", "slowest"), arg_nm = "names_vary")

  if (!is_bool(names_expand)) {
    abort("`names_expand` must be a single `TRUE` or `FALSE`.")
  }

  data <- as_tibble(data)
  data <- data[names_from]

  if (names_expand) {
    # `expand()` always does sort + unique
    row_ids <- expand(data, !!!syms(names(data)))
  } else {
    row_ids <- vec_unique(data)

    if (names_sort) {
      row_ids <- vec_sort(row_ids)
    }
  }

  row_names <- exec(paste, !!!row_ids, sep = names_sep)

  out <- tibble(
    .name = vec_paste0(names_prefix, row_names)
  )

  if (length(values_from) == 1) {
    out$.value <- names(values_from)
  } else {
    if (names_vary == "fastest") {
      out <- vec_rep(out, vec_size(values_from))
      out$.value <- vec_rep_each(names(values_from), vec_size(row_ids))
      row_ids <- vec_rep(row_ids, vec_size(values_from))
    } else {
      out <- vec_rep_each(out, vec_size(values_from))
      out$.value <- vec_rep(names(values_from), vec_size(row_ids))
      row_ids <- vec_rep_each(row_ids, vec_size(values_from))
    }

    out$.name <- vec_paste0(out$.value, names_sep, out$.name)
  }

  out <- vec_cbind(out, as_tibble(row_ids), .name_repair = "minimal")
  if (!is.null(names_glue)) {
    out$.name <- as.character(glue::glue_data(out, names_glue))
  }

  out
}

build_wider_id_cols_expr <- function(data,
                                     id_cols = NULL,
                                     names_from = name,
                                     values_from = value) {
  # TODO: Use `allow_rename = FALSE`.
  # Requires https://github.com/r-lib/tidyselect/issues/225.
  names_from <- names(tidyselect::eval_select(enquo(names_from), data))
  values_from <- names(tidyselect::eval_select(enquo(values_from), data))
  non_id_cols <- c(names_from, values_from)

  out <- select_wider_id_cols(
    data = data,
    id_cols = {{id_cols}},
    non_id_cols = non_id_cols
  )

  expr(c(!!!out))
}

select_wider_id_cols <- function(data,
                                 id_cols = NULL,
                                 non_id_cols = character()) {
  id_cols <- enquo(id_cols)

  # Remove known non-id-cols so they are never selected
  data <- data[setdiff(names(data), non_id_cols)]

  if (quo_is_null(id_cols)) {
    names(data)
  } else {
    # TODO: Use `allow_rename = FALSE`.
    # Requires https://github.com/r-lib/tidyselect/issues/225.
    names(tidyselect::eval_select(enquo(id_cols), data))
  }
}

# Helpers -----------------------------------------------------------------

value_summarize <- function(value, value_locs, value_name, fn, fn_name) {
  value <- vec_chop(value, value_locs)

  if (identical(fn, list)) {
    # The no-op case, for performance
    return(value)
  }

  value <- map(value, fn)

  sizes <- list_sizes(value)
  invalid_sizes <- sizes != 1L

  if (any(invalid_sizes)) {
    size <- sizes[invalid_sizes][[1]]

    header <- glue(
      "Applying `{fn_name}` to `{value_name}` must result in ",
      "a single summary value per key."
    )
    bullet <- c(
      x = glue("Applying `{fn_name}` resulted in a value with length {size}.")
    )

    abort(c(header, bullet))
  }

  value <- vec_c(!!!value)

  value
}

# Wrap a "rectangular" vector into a data frame
chop_rectangular_df <- function(x, names) {
  n_col <- vec_size(names)
  n_row <- vec_size(x) / n_col

  indices <- vector("list", n_col)

  start <- 1L
  stop <- n_row

  for (i in seq_len(n_col)) {
    indices[[i]] <- seq2(start, stop)
    start <- start + n_row
    stop <- stop + n_row
  }

  out <- vec_chop(x, indices)
  names(out) <- names
  tibble::new_tibble(out, nrow = n_row)
}

is_scalar <- function(x) {
  if (is.null(x)) {
    return(FALSE)
  }

  if (vec_is_list(x)) {
    (vec_size(x) == 1) && !have_name(x)
  } else {
    vec_size(x) == 1
  }
}

backtick_if_not_syntactic <- function(x) {
  ok <- make.names(x) == x
  ok[is.na(x)] <- FALSE

  x[!ok] <- glue::backtick(x[!ok])

  x
}