New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement multi-gather #150

Open
hadley opened this Issue Jan 2, 2016 · 11 comments

Comments

Projects
None yet
5 participants
@hadley
Copy link
Member

hadley commented Jan 2, 2016

library(dplyr, warn.conflicts = FALSE)
library(tidyr)

# From Jenny Bryan --------------------------------------------------------

input <- tribble(
  ~hw,   ~name,  ~mark,   ~pr,
  "hw1", "anna",    95,  "ok",
  "hw1", "alan",    90, "meh",
  "hw1", "carl",    85,  "ok",
  "hw2", "alan",    70, "meh",
  "hw2", "carl",    80,  "ok"
)

# Want:
input %>%
  gather(key = element, value = score, mark, pr) %>%
  unite(thing, hw, element, remove = TRUE) %>%
  spread(thing, score, convert = TRUE)
#> # A tibble: 3 x 5
#>   name  hw1_mark hw1_pr hw2_mark hw2_pr
#>   <chr>    <int> <chr>     <int> <chr> 
#> 1 alan        90 meh          70 meh   
#> 2 anna        95 ok           NA <NA>  
#> 3 carl        85 ok           80 ok

# http://stackoverflow.com/questions/33599665 -----------------------------

anscombe %>%
  gather() %>%
  separate(key, c("var", "ex"), 1) %>%
  group_by(var) %>%
  mutate(id = row_number()) %>%
  spread(var, value)
#> # A tibble: 44 x 4
#>    ex       id     x     y
#>    <chr> <int> <dbl> <dbl>
#>  1 1         1    10  8.04
#>  2 1         2     8  6.95
#>  3 1         3    13  7.58
#>  4 1         4     9  8.81
#>  5 1         5    11  8.33
#>  6 1         6    14  9.96
#>  7 1         7     6  7.24
#>  8 1         8     4  4.26
#>  9 1         9    12 10.8 
#> 10 1        10     7  4.82
#> # … with 34 more rows

# http://stackoverflow.com/questions/27247078 -----------------------------

df <- tribble(
  ~id, ~type,     ~transactions, ~amount,
  20,  "income",  20,            100,
  20,  "expense", 25,            95,
  30,  "income",  50,            300,
  30,  "expense", 45,            250
)

df %>%
  gather(var, val, transactions:amount) %>%
  unite(var2, type, var) %>%
  spread(var2, val)
#> # A tibble: 2 x 5
#>      id expense_amount expense_transactio… income_amount income_transactio…
#>   <dbl>          <dbl>               <dbl>         <dbl>              <dbl>
#> 1    20             95                  25           100                 20
#> 2    30            250                  45           300                 50

# http://stackoverflow.com/questions/25925556 -----------------------------

df <- tibble(
  id = 1:10,
  time = as.Date('2009-01-01') + 0:9,
  Q3.2.1. = rnorm(10, 0, 1),
  Q3.2.2. = rnorm(10, 0, 1),
  Q3.2.3. = rnorm(10, 0, 1),
  Q3.3.1. = rnorm(10, 0, 1),
  Q3.3.2. = rnorm(10, 0, 1),
  Q3.3.3. = rnorm(10, 0, 1)
)

df %>%
  gather(-id, -time, key = key, value = value) %>%
  extract(key, c("question", "loop_number"), "(Q.\\..)\\.(.)", convert = TRUE) %>%
  spread(question, value)
#> # A tibble: 30 x 5
#>       id time       loop_number    Q3.2   Q3.3
#>    <int> <date>           <int>   <dbl>  <dbl>
#>  1     1 2009-01-01           1  0.847   0.196
#>  2     1 2009-01-01           2 -2.29   -1.94 
#>  3     1 2009-01-01           3  2.23    0.825
#>  4     2 2009-01-02           1 -0.620   1.36 
#>  5     2 2009-01-02           2  0.0409  0.731
#>  6     2 2009-01-02           3  0.625  -0.275
#>  7     3 2009-01-03           1  0.375  -0.365
#>  8     3 2009-01-03           2  0.414  -0.838
#>  9     3 2009-01-03           3 -0.0355 -0.757
#> 10     4 2009-01-04           1 -0.298  -2.13 
#> # … with 20 more rows

# http://stackoverflow.com/questions/32934400 -----------------------------

tibble(
  id = c("v1", "v2", "v3"),
  X_a = c(1,2,3),
  X_b = c(4,5,6),
  Y_a = c(7,8,9),
  Y_b = c(10,11,12)
) %>% 
  gather(key, val, X_a:Y_b) %>% 
  separate(key, c("type", "subtype")) %>% 
  spread(type, val)
#> # A tibble: 6 x 4
#>   id    subtype     X     Y
#>   <chr> <chr>   <dbl> <dbl>
#> 1 v1    a           1     7
#> 2 v1    b           4    10
#> 3 v2    a           2     8
#> 4 v2    b           5    11
#> 5 v3    a           3     9
#> 6 v3    b           6    12

# https://github.com/jennybc/lotr -----------------------------------------
# https://github.com/datacarpentry/archive-datacarpentry/tree/master/lessons/tidy-data

x <- tribble(
  ~Race,~Female_LoTR,~Male_LoTR,~Female_TT,~Male_TT,~Female_RoTK,~Male_RoTK,
  "Elf",        1229,       971,       331,     513,         183,       510,
  "Hobbit",       14,      3644,         0,    2463,           2,      2673,
  "Man",           0,      1995,       401,    3589,         268,      2459
)

x %>%
  gather(-Race, key = "key", value = "words") %>%
  separate(key, into = c("gender", "film")) %>%
  spread(key = "gender", value = "words")
#> # A tibble: 9 x 4
#>   Race   film  Female  Male
#>   <chr>  <chr>  <dbl> <dbl>
#> 1 Elf    LoTR    1229   971
#> 2 Elf    RoTK     183   510
#> 3 Elf    TT       331   513
#> 4 Hobbit LoTR      14  3644
#> 5 Hobbit RoTK       2  2673
#> 6 Hobbit TT         0  2463
#> 7 Man    LoTR       0  1995
#> 8 Man    RoTK     268  2459
#> 9 Man    TT       401  3589

Created on 2019-02-13 by the reprex package (v0.2.1.9000)

@hadley

This comment was marked as outdated.

Copy link
Member Author

hadley commented Jan 4, 2016

## https://github.com/jennybc/lotr
## https://github.com/datacarpentry/archive-datacarpentry/tree/master/lessons/tidy-data
x <- frame_data(
  ~Race,~Female_LoTR,~Male_LoTR,~Female_TT,~Male_TT,~Female_RoTK,~Male_RoTK,
  "Elf",        1229,       971,       331,     513,         183,       510,
  "Hobbit",       14,      3644,         0,    2463,           2,      2673,
  "Man",           0,      1995,       401,    3589,         268,      2459
)

x %>%
  gather(-Race, key = "key", value = "words") %>%
  separate(key, into = c("gender", "film")) %>%
  spread(key = "gender", value = "words")
@paleolimbot

This comment was marked as outdated.

Copy link

paleolimbot commented Aug 28, 2017

Here's a (very) hacky implentation I've been using for summarised chemistry data that I had no control over:

library(tidyverse)
#> Loading tidyverse: ggplot2
#> Loading tidyverse: tibble
#> Loading tidyverse: tidyr
#> Loading tidyverse: readr
#> Loading tidyverse: purrr
#> Loading tidyverse: dplyr
#> Conflicts with tidy packages ----------------------------------------------
#> filter(): dplyr, stats
#> lag():    dplyr, stats

parallel_gather <- function(x, key, ..., convert = FALSE, factor_key = FALSE) {
  # enquos arguments
  lst <- rlang::quos(...)
  
  # check arguments
  if(length(lst) == 0) stop("Must pass at least one value = columns in parallel_gather()")
  if(is.null(names(lst)) || any(names(lst) == "")) {
    stop("All arguments to parallel_gather() must be named")
  }
  
  # use a hack to get column names as character using tidyeval and dplyr
  col_names <- tibble::as_tibble(stats::setNames(as.list(colnames(x)), colnames(x)))
  lst_as_colnames <- lapply(lst, function(name_quo) {
    dplyr::select(col_names, !!name_quo) %>% colnames()
  })
  
  # check length (each argument should refer to the same number of columns)
  arg_col_count <- vapply(lst_as_colnames, length, integer(1))
  if(!length(unique(arg_col_count)) == 1) {
    stop("All named arguments must refer to the same number of columns")
  }
  
  # id variables are those not mentioned in ...
  id_vars <- setdiff(colnames(x), unlist(lst_as_colnames))
  
  # do gather for each item in ..., using id_vars and cols mentioned in 
  # each argument
  gathered <- lapply(seq_along(lst_as_colnames), function(i) {
    tidyr::gather_(x[c(id_vars, lst_as_colnames[[i]])], 
                   key = key, value = names(lst_as_colnames)[i],
                   gather_cols = lst_as_colnames[[i]],
                   na.rm = FALSE, convert = convert, factor_key = factor_key)
  })
  
  # get id data
  id_data <- gathered[[1]][c(id_vars, key)]
  
  # select non-id vars for each melt operation
  gathered <- lapply(gathered, function(df) df[setdiff(colnames(df), c(id_vars, key))])
  
  # return cbind operation
  dplyr::bind_cols(id_data, gathered)
}


# some sample chemistry data, summarised to mean/sd
sample_data <- pocmaj_raw <- tribble(
  ~sample_id, ~Ca, ~Ti, ~V,  
  "poc15-2 0",  1036, 1337, 29,
  "poc15-2 0", 1951, 2427, 31,
  "poc15-2 0", 1879, 2350, 39,
  "poc15-2 1", 1488, 2016, 36,
  "poc15-2 2", 2416, 3270, 79,
  "poc15-2 3", 2253, 3197, 79,
  "poc15-2 4", 2372, 3536, 87,
  "poc15-2 5", 2621, 3850, 86,
  "poc15-2 5", 2785, 3939, 95,
  "poc15-2 5", 2500, 3881, 80,
  "maj15-1 0", 1623, 2104, 73,
  "maj15-1 0", 1624, 2174, 73,
  "maj15-1 0", 2407, 2831, 89,
  "maj15-1 1", 1418, 2409, 70,
  "maj15-1 2", 1550, 2376, 70,
  "maj15-1 3", 1448, 2485, 64,
  "maj15-1 4", 1247, 2414, 57,
  "maj15-1 5", 1463, 1869, 78,
  "maj15-1 5", 1269, 1834, 71,
  "maj15-1 5", 1505, 1989, 94
) %>% 
  group_by(sample_id) %>%
  summarise_all(.funs = list("mean", "sd"))

parallel_gather(sample_data, key = "param", value = ends_with("mean"), sd = ends_with("sd"))
#> # A tibble: 36 x 4
#>    sample_id   param    value       sd
#>        <chr>   <chr>    <dbl>    <dbl>
#>  1 maj15-1 0 Ca_mean 1884.667 452.3542
#>  2 maj15-1 1 Ca_mean 1418.000       NA
#>  3 maj15-1 2 Ca_mean 1550.000       NA
#>  4 maj15-1 3 Ca_mean 1448.000       NA
#>  5 maj15-1 4 Ca_mean 1247.000       NA
#>  6 maj15-1 5 Ca_mean 1412.333 125.8941
#>  7 poc15-2 0 Ca_mean 1622.000 508.7662
#>  8 poc15-2 1 Ca_mean 1488.000       NA
#>  9 poc15-2 2 Ca_mean 2416.000       NA
#> 10 poc15-2 3 Ca_mean 2253.000       NA
#> # ... with 26 more rows
@hadley

This comment has been minimized.

Copy link
Member Author

hadley commented Sep 11, 2017

Maybe we could have a tiny DSL like:

x <- frame_data(
  ~Race,~Female_LoTR,~Male_LoTR,~Female_TT,~Male_TT,~Female_RoTK,~Male_RoTK,
  "Elf",        1229,       971,       331,     513,         183,       510,
  "Hobbit",       14,      3644,         0,    2463,           2,      2673,
  "Man",           0,      1995,       401,    3589,         268,      2459
)

x %>%
  gather(Race, Female_LotR:Male_RoTK := multikey("(gender)_(film)"), value = "words")

x %>%
  gather(Race, colnames = multikey("(gender)_(film)"), value = "words")
@paleolimbot

This comment has been minimized.

Copy link

paleolimbot commented Sep 11, 2017

Not quite the same as what I was trying to do, but why not keep the key/value syntax with some special syntax on the 'key' argument similar to extract or separate?

x %>% gather(-Race, key = separate("_", into = c("gender", "film")), value = "words")
x %>% gather(-Race, key = extract("^(.*?)_(.*)$", into = c("gender", "film")), value = "words")
@hadley

This comment was marked as resolved.

Copy link
Member Author

hadley commented Sep 11, 2017

That would definitely be there - I was also wondering if we could be even more compact (would also bring the same syntax to a new separate() equivalent.

I wonder if there's some way to hack in named groups support to stringi.

@cjyetman

This comment was marked as off-topic.

Copy link

cjyetman commented May 3, 2018

Is it worth giving a warning that coercion is occurring when gathering columns of different types? I suppose that should be obvious, but it's not explicitly stated in the help file or anywhere I can find. In the first example above, you run the risk of losing precision on a numeric column (as alluded to in #174). The only ways around it (currently) that I can figure out are rather convoluted, like...

full_join(filter(input, hw == "hw1") %>% select(-hw),
          filter(input, hw == "hw2") %>% select(-hw),
          by = "name", suffix = c("_hw1", "_hw2"))

or

input %>%
  mutate(hw_pr = paste0(hw, '_pr')) %>%
  mutate(hw_mark = paste0(hw, '_mark')) %>%
  select(-hw) %>% 
  spread(hw_pr, pr) %>%
  spread(hw_mark, mark) %>%
  group_by(name) %>%
  summarise_all(funs(first(na.omit(.))))

something like this would be really nice/ideal (I suppose this what's proposed in #149?)...

input %>% 
  spread(key = hw, value = vars(mark, pr))
@krlmlr

This comment was marked as outdated.

Copy link
Member

krlmlr commented Jan 8, 2019

Hierarchical columns can help here too:

library(tidyverse)

x <- tribble(
  ~Race,~Female_LoTR,~Male_LoTR,~Female_TT,~Male_TT,~Female_RoTK,~Male_RoTK,
  "Elf",        1229,       971,       331,     513,         183,       510,
  "Hobbit",       14,      3644,         0,    2463,           2,      2673,
  "Man",           0,      1995,       401,    3589,         268,      2459
)

# Result of hierarchize(x, sep = "_", -Race):
xh <- tibble(
  Race = x$Race,
  Female = tibble(
    LoTR = x$Female_LoTR,
    TT = x$Female_TT,
    RoTK = x$Female_RoTK
  ),
  Male = tibble(
    LoTR = x$Male_LoTR,
    TT = x$Male_TT,
    RoTK = x$Male_RoTK
  )
)
xh
#> # A tibble: 3 x 3
#>   Race   Female$LoTR   $TT $RoTK Male$LoTR   $TT $RoTK
#>   <chr>        <dbl> <dbl> <dbl>     <dbl> <dbl> <dbl>
#> 1 Elf           1229   331   183       971   513   510
#> 2 Hobbit          14     0     2      3644  2463  2673
#> 3 Man              0   401   268      1995  3589  2459

# Result of gathering:
tibble(
  Race = c(xh$Race, xh$Race),
  key = c(rep("Female", 3), rep("Male", 3)),
  value = tibble(
    LoTR = c(xh$Female$LoTR, xh$Male$LoTR),
    TT = c(xh$Female$TT, xh$Male$TT),
    RoTK = c(xh$Female$RoTK, xh$Male$RoTK)
  )
)
#> # A tibble: 6 x 3
#>   Race   key    value$LoTR   $TT $RoTK
#>   <chr>  <chr>       <dbl> <dbl> <dbl>
#> 1 Elf    Female       1229   331   183
#> 2 Hobbit Female         14     0     2
#> 3 Man    Female          0   401   268
#> 4 Elf    Male          971   513   510
#> 5 Hobbit Male         3644  2463  2673
#> 6 Man    Male         1995  3589  2459

Created on 2019-01-08 by the reprex package (v0.2.1.9000)

@krlmlr

This comment was marked as outdated.

Copy link
Member

krlmlr commented Jan 19, 2019

@yutannihilation suggests that we also need a transpose_df() function to flip a data frame column inside out:

library(tidyverse)

x <- tribble(
  ~Race, ~Female_LoTR, ~Male_LoTR, ~Female_TT, ~Male_TT, ~Female_RoTK, ~Male_RoTK,
  "Elf", 1229, 971, 331, 513, 183, 510,
  "Hobbit", 14, 3644, 0, 2463, 2, 2673,
  "Man", 0, 1995, 401, 3589, 268, 2459
)

xh <- tibble(
  Race = x$Race,
  Female = tibble(
    LoTR = x$Female_LoTR,
    TT = x$Female_TT,
    RoTK = x$Female_RoTK
  ),
  Male = tibble(
    LoTR = x$Male_LoTR,
    TT = x$Male_TT,
    RoTK = x$Male_RoTK
  )
)
xh
#> # A tibble: 3 x 3
#>   Race   Female$LoTR   $TT $RoTK Male$LoTR   $TT $RoTK
#>   <chr>        <dbl> <dbl> <dbl>     <dbl> <dbl> <dbl>
#> 1 Elf           1229   331   183       971   513   510
#> 2 Hobbit          14     0     2      3644  2463  2673
#> 3 Man              0   401   268      1995  3589  2459

# Result of
# xh %>%
#   tie(Female, Male) %>%
#   mutate(.data = transpose_df(.data)) %>%
#   untie(.data)
# :
xht <- tibble(
  Race = x$Race,
  LoTR = tibble(
    Female = xh$Female$LoTR,
    Male = xh$Male$LoTR
  ),
  TT = tibble(
    Female = xh$Female$TT,
    Male = xh$Male$TT
  ),
  RoTK = tibble(
    Female = xh$Female$RoTK,
    Male = xh$Male$RoTK
  )
)
xht
#> # A tibble: 3 x 4
#>   Race   LoTR$Female $Male TT$Female $Male RoTK$Female $Male
#>   <chr>        <dbl> <dbl>     <dbl> <dbl>       <dbl> <dbl>
#> 1 Elf           1229   971       331   513         183   510
#> 2 Hobbit          14  3644         0  2463           2  2673
#> 3 Man              0  1995       401  3589         268  2459

Created on 2019-01-19 by the reprex package (v0.2.1)

@yutannihilation

This comment was marked as outdated.

Copy link
Member

yutannihilation commented Jan 19, 2019

tie <- function(data, ..., .key = "data") {
  key_var <- rlang::as_string(rlang::ensym(.key))
  
  tie_vars <- unname(tidyselect::vars_select(names(data), ...))
  if (rlang::is_empty(tie_vars)) {
    tie_vars <- names(data)
  }
  
  if (dplyr::is_grouped_df(data)) {
    group_vars <- dplyr::group_vars(data)
  } else {
    group_vars <- setdiff(names(data), tie_vars)
  }
  tie_vars <- setdiff(tie_vars, group_vars)
  
  data <- dplyr::ungroup(data)
  if (rlang::is_empty(group_vars)) {
    return(tibble::tibble(!! key_var := data))
  }

  out <- dplyr::select(data, !!! rlang::syms(group_vars))
  out[[key_var]] <-   tied <- dplyr::select(data, !!! rlang::syms(tie_vars))
  out
}

untie <- function(data, ...) {
  quos <- rlang::quos(...)
  if (rlang::is_empty(quos)) {
    list_cols <- names(data)[purrr::map_lgl(data, rlang::is_list)]
    quos <- rlang::syms(list_cols)
  }
  
  if (length(quos) == 0) {
    return(data)
  }
  
  tied <- as.list(dplyr::transmute(data, !!! quos))
  
  group_vars <- setdiff(names(data), names(tied))
  
  rest <- dplyr::select(data, !!!rlang::syms(group_vars))
  dplyr::bind_cols(rest, tied)
}

head(tie(iris, -Species))
#>   Species data.Sepal.Length data.Sepal.Width data.Petal.Length
#> 1  setosa               5.1              3.5               1.4
#> 2  setosa               4.9              3.0               1.4
#> 3  setosa               4.7              3.2               1.3
#> 4  setosa               4.6              3.1               1.5
#> 5  setosa               5.0              3.6               1.4
#> 6  setosa               5.4              3.9               1.7
#>   data.Petal.Width
#> 1              0.2
#> 2              0.2
#> 3              0.2
#> 4              0.2
#> 5              0.2
#> 6              0.4
head(untie(tie(iris, -Species)))
#>   Species Sepal.Length Sepal.Width Petal.Length Petal.Width
#> 1  setosa          5.1         3.5          1.4         0.2
#> 2  setosa          4.9         3.0          1.4         0.2
#> 3  setosa          4.7         3.2          1.3         0.2
#> 4  setosa          4.6         3.1          1.5         0.2
#> 5  setosa          5.0         3.6          1.4         0.2
#> 6  setosa          5.4         3.9          1.7         0.4

Created on 2019-01-20 by the reprex package (v0.2.1)

@hadley

This comment has been minimized.

Copy link
Member Author

hadley commented Feb 13, 2019

See https://yutani.rbind.io/post/enhancing-gather-and-spread-by-using-bundled-data-frames/ for discussion about the interaction between df-cols and multi-spread/gather.

@hadley

This comment has been minimized.

Copy link
Member Author

hadley commented Feb 13, 2019

Note to self: multi-gather is primarily about generating multiple keys from column names. This naturally connects to packed data frames, because you end up with nested column (i.e. there is a hierarchy of names).

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment