Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement multi-gather #150

Closed
hadley opened this issue Jan 2, 2016 · 12 comments
Closed

Implement multi-gather #150

hadley opened this issue Jan 2, 2016 · 12 comments

Comments

@hadley
Copy link
Member

@hadley hadley commented Jan 2, 2016

library(dplyr, warn.conflicts = FALSE)
library(tidyr)

# From Jenny Bryan --------------------------------------------------------

# One row per homework (`hw`) and student (`name`), with two measured
# columns: `mark` and `pr`.
input <- tibble(
  hw = c("hw1", "hw1", "hw1", "hw2", "hw2"),
  name = c("anna", "alan", "carl", "alan", "carl"),
  mark = c(95, 90, 85, 70, 80),
  pr = c("ok", "meh", "ok", "meh", "ok")
)

# Want: one row per student, with a <hw>_<element> column for every
# homework/measure combination.
input_long <- gather(input, key = element, value = score, mark, pr)
input_keyed <- unite(input_long, col = thing, hw, element, remove = TRUE)
spread(input_keyed, key = thing, value = score, convert = TRUE)
#> # A tibble: 3 x 5
#>   name  hw1_mark hw1_pr hw2_mark hw2_pr
#>   <chr>    <int> <chr>     <int> <chr> 
#> 1 alan        90 meh          70 meh   
#> 2 anna        95 ok           NA <NA>  
#> 3 carl        85 ok           80 ok

# http://stackoverflow.com/questions/33599665 -----------------------------

# Stack every anscombe column into key/value pairs, split each column name
# after its first character into `var` and `ex`, number the rows within each
# `var`, then spread `var` back out into columns.
anscombe_long <- gather(anscombe)
anscombe_split <- separate(anscombe_long, key, into = c("var", "ex"), sep = 1)
anscombe_ids <- mutate(group_by(anscombe_split, var), id = row_number())
spread(anscombe_ids, key = var, value = value)
#> # A tibble: 44 x 4
#>    ex       id     x     y
#>    <chr> <int> <dbl> <dbl>
#>  1 1         1    10  8.04
#>  2 1         2     8  6.95
#>  3 1         3    13  7.58
#>  4 1         4     9  8.81
#>  5 1         5    11  8.33
#>  6 1         6    14  9.96
#>  7 1         7     6  7.24
#>  8 1         8     4  4.26
#>  9 1         9    12 10.8 
#> 10 1        10     7  4.82
#> # … with 34 more rows

# http://stackoverflow.com/questions/27247078 -----------------------------

# Two measured columns (`transactions`, `amount`) per id and type.
df <- tibble(
  id = c(20, 20, 30, 30),
  type = c("income", "expense", "income", "expense"),
  transactions = c(20, 25, 50, 45),
  amount = c(100, 95, 300, 250)
)

# Stack the measured columns, glue `type` onto the measure name, and spread
# the combined name so each id ends up on a single row.
txn_long <- gather(df, key = var, value = val, transactions:amount)
txn_keyed <- unite(txn_long, col = var2, type, var)
spread(txn_keyed, key = var2, value = val)
#> # A tibble: 2 x 5
#>      id expense_amount expense_transactio… income_amount income_transactio…
#>   <dbl>          <dbl>               <dbl>         <dbl>              <dbl>
#> 1    20             95                  25           100                 20
#> 2    30            250                  45           300                 50

# http://stackoverflow.com/questions/25925556 -----------------------------

# Ten rows with an id, a date, and six columns named <question>.<loop_number>.
# that are filled with standard-normal draws.
df <- tibble(
  id = 1:10,
  time = as.Date("2009-01-01") + 0:9,
  Q3.2.1. = rnorm(n = 10, mean = 0, sd = 1),
  Q3.2.2. = rnorm(n = 10, mean = 0, sd = 1),
  Q3.2.3. = rnorm(n = 10, mean = 0, sd = 1),
  Q3.3.1. = rnorm(n = 10, mean = 0, sd = 1),
  Q3.3.2. = rnorm(n = 10, mean = 0, sd = 1),
  Q3.3.3. = rnorm(n = 10, mean = 0, sd = 1)
)

# Stack the six Q columns, pull the question and loop number out of each
# column name (with type conversion), then give each question its own column.
q_long <- gather(df, -id, -time, key = key, value = value)
q_split <- extract(
  q_long,
  key,
  into = c("question", "loop_number"),
  regex = "(Q.\\..)\\.(.)",
  convert = TRUE
)
spread(q_split, key = question, value = value)
#> # A tibble: 30 x 5
#>       id time       loop_number    Q3.2   Q3.3
#>    <int> <date>           <int>   <dbl>  <dbl>
#>  1     1 2009-01-01           1  0.847   0.196
#>  2     1 2009-01-01           2 -2.29   -1.94 
#>  3     1 2009-01-01           3  2.23    0.825
#>  4     2 2009-01-02           1 -0.620   1.36 
#>  5     2 2009-01-02           2  0.0409  0.731
#>  6     2 2009-01-02           3  0.625  -0.275
#>  7     3 2009-01-03           1  0.375  -0.365
#>  8     3 2009-01-03           2  0.414  -0.838
#>  9     3 2009-01-03           3 -0.0355 -0.757
#> 10     4 2009-01-04           1 -0.298  -2.13 
#> # … with 20 more rows

# http://stackoverflow.com/questions/32934400 -----------------------------

# Columns are named <type>_<subtype>; reshape so that each type becomes a
# column, with one row per id and subtype.
xy_wide <- tibble(
  id = c("v1", "v2", "v3"),
  X_a = c(1, 2, 3),
  X_b = c(4, 5, 6),
  Y_a = c(7, 8, 9),
  Y_b = c(10, 11, 12)
)
xy_long <- gather(xy_wide, key = key, value = val, X_a:Y_b)
xy_split <- separate(xy_long, key, into = c("type", "subtype"))
spread(xy_split, key = type, value = val)
#> # A tibble: 6 x 4
#>   id    subtype     X     Y
#>   <chr> <chr>   <dbl> <dbl>
#> 1 v1    a           1     7
#> 2 v1    b           4    10
#> 3 v2    a           2     8
#> 4 v2    b           5    11
#> 5 v3    a           3     9
#> 6 v3    b           6    12

# https://github.com/jennybc/lotr -----------------------------------------
# https://github.com/datacarpentry/archive-datacarpentry/tree/master/lessons/tidy-data

# One row per Race; the remaining columns are named <gender>_<film>.
x <- tibble(
  Race = c("Elf", "Hobbit", "Man"),
  Female_LoTR = c(1229, 14, 0),
  Male_LoTR = c(971, 3644, 1995),
  Female_TT = c(331, 0, 401),
  Male_TT = c(513, 2463, 3589),
  Female_RoTK = c(183, 2, 268),
  Male_RoTK = c(510, 2673, 2459)
)

# Stack every non-Race column, split its name into gender and film, then
# spread gender back out so each Race/film pair is one row.
words_long <- gather(x, -Race, key = "key", value = "words")
words_split <- separate(words_long, key, into = c("gender", "film"))
spread(words_split, key = "gender", value = "words")
#> # A tibble: 9 x 4
#>   Race   film  Female  Male
#>   <chr>  <chr>  <dbl> <dbl>
#> 1 Elf    LoTR    1229   971
#> 2 Elf    RoTK     183   510
#> 3 Elf    TT       331   513
#> 4 Hobbit LoTR      14  3644
#> 5 Hobbit RoTK       2  2673
#> 6 Hobbit TT         0  2463
#> 7 Man    LoTR       0  1995
#> 8 Man    RoTK     268  2459
#> 9 Man    TT       401  3589

Created on 2019-02-13 by the reprex package (v0.2.1.9000)

@hadley

This comment has been hidden.

@paleolimbot

This comment has been hidden.

@hadley
Copy link
Member Author

@hadley hadley commented Sep 11, 2017

Maybe we could have a tiny DSL like:

# Example table: one row per Race, remaining columns named <gender>_<film>.
# Built with tribble() (frame_data() is its deprecated alias).
x <- tribble(
  ~Race,~Female_LoTR,~Male_LoTR,~Female_TT,~Male_TT,~Female_RoTK,~Male_RoTK,
  "Elf",        1229,       971,       331,     513,         183,       510,
  "Hobbit",       14,      3644,         0,    2463,           2,      2673,
  "Man",           0,      1995,       401,    3589,         268,      2459
)

# Sketch of the proposed DSL (not runnable): `multikey()` is a proposed helper
# whose pattern names the keys to generate from each column name.
# Variant 1: attach the multikey pattern to an explicit column range.
x %>%
  gather(Race, Female_LoTR:Male_RoTK := multikey("(gender)_(film)"), value = "words")

# Variant 2: pass the multikey pattern through a `colnames` argument instead.
x %>%
  gather(Race, colnames = multikey("(gender)_(film)"), value = "words")
@paleolimbot
Copy link

@paleolimbot paleolimbot commented Sep 11, 2017

Not quite the same as what I was trying to do, but why not keep the key/value syntax with some special syntax on the 'key' argument similar to extract or separate?

# Proposed syntax (not implemented): keep gather()'s key/value arguments and
# let `key` take a separate()-style spec that splits each column name on "_".
x %>% gather(-Race, key = separate("_", into = c("gender", "film")), value = "words")
# Same idea with an extract()-style spec; presumably the two regex capture
# groups would become the `gender` and `film` keys.
x %>% gather(-Race, key = extract("^(.*?)_(.*)$", into = c("gender", "film")), value = "words")
@hadley

This comment has been hidden.

@cjyetman

This comment was marked as off-topic.

@krlmlr

This comment has been hidden.

@krlmlr

This comment has been hidden.

@yutannihilation

This comment has been hidden.

@hadley
Copy link
Member Author

@hadley hadley commented Feb 13, 2019

See https://yutani.rbind.io/post/enhancing-gather-and-spread-by-using-bundled-data-frames/ for discussion about the interaction between df-cols and multi-spread/gather.

@hadley
Copy link
Member Author

@hadley hadley commented Feb 13, 2019

Note to self: multi-gather is primarily about generating multiple keys from column names. This naturally connects to packed data frames, because you end up with nested columns (i.e. there is a hierarchy of names).

@hadley
Copy link
Member Author

@hadley hadley commented Mar 3, 2019

Now solvable with pivot():

library(dplyr, warn.conflicts = FALSE)
library(tidyr)

# http://stackoverflow.com/questions/33599665 -----------------------------

# Shadow the built-in anscombe data with a tibble copy that carries a
# row-number `id` column, then print it.
anscombe <- anscombe %>% 
  as_tibble() %>% 
  mutate(id = row_number())
anscombe
#> # A tibble: 11 x 9
#>       x1    x2    x3    x4    y1    y2    y3    y4    id
#>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
#>  1    10    10    10     8  8.04  9.14  7.46  6.58     1
#>  2     8     8     8     8  6.95  8.14  6.77  5.76     2
#>  3    13    13    13     8  7.58  8.74 12.7   7.71     3
#>  4     9     9     9     8  8.81  8.77  7.11  8.84     4
#>  5    11    11    11     8  8.33  9.26  7.81  8.47     5
#>  6    14    14    14     8  9.96  8.1   8.84  7.04     6
#>  7     6     6     6     8  7.24  6.13  6.08  5.25     7
#>  8     4     4     4    19  4.26  3.1   5.39 12.5      8
#>  9    12    12    12     8 10.8   9.13  8.15  5.56     9
#> 10     7     7     7     8  4.82  7.26  6.42  7.91    10
#> 11     5     5     5     8  5.68  4.74  5.73  6.89    11

# Build a long-pivot spec from every column except `id`, then split the spec's
# `variable` column after its first character into `var` and `ex`
# (type-converted), and print the spec.
spec <- anscombe %>% 
  pivot_spec_long(-id) %>% 
  separate(variable, c("var", "ex"), 1, convert = TRUE)
spec
#> # A tibble: 8 x 4
#>   col_name measure var      ex
#>   <chr>    <chr>   <chr> <int>
#> 1 x1       value   x         1
#> 2 x2       value   x         2
#> 3 x3       value   x         3
#> 4 x4       value   x         4
#> 5 y1       value   y         1
#> 6 y2       value   y         2
#> 7 y3       value   y         3
#> 8 y4       value   y         4

# Pivot to long form with `spec`, then pivot again with a wide spec built from
# the intermediate result (`.`) so that `var` is spread into columns.
# NOTE(review): the recorded output below does not match the anscombe values
# printed earlier (id 1 has y1 = 8.04, but the output shows y = 11 for id 1,
# ex 1) -- looks like a bug in the development pivot() used here; confirm.
anscombe %>% 
  pivot(spec) %>% 
  pivot(pivot_spec_wide(., var, value))
#> # A tibble: 44 x 4
#>       id    ex     x     y
#>    <int> <int> <dbl> <dbl>
#>  1     1     1    10    11
#>  2     1     2     8    14
#>  3     1     3    13     6
#>  4     1     4     9     4
#>  5     2     1    12     8
#>  6     2     2     7    13
#>  7     2     3     5     9
#>  8     2     4    10    11
#>  9     3     1    14     7
#> 10     3     2     6     5
#> # … with 34 more rows

# http://stackoverflow.com/questions/25925556 -----------------------------

# Ten rows with an id, a date, and six Q columns of standard-normal draws.
df <- tibble(
  id = 1:10,
  time = as.Date('2009-01-01') + 0:9,
  Q3.2.1. = rnorm(10, 0, 1),
  Q3.2.2. = rnorm(10, 0, 1),
  Q3.2.3. = rnorm(10, 0, 1),
  Q3.3.1. = rnorm(10, 0, 1),
  Q3.3.2. = rnorm(10, 0, 1),
  Q3.3.3. = rnorm(10, 0, 1)
)

# Build a long-pivot spec from every column except `id` and `time`, then use
# the regex to pull `question` and `loop_number` out of the spec's `variable`
# column (type-converted), and print the spec.
spec <- df %>% 
  pivot_spec_long(-c(id, time)) %>% 
  extract(variable, c("question", "loop_number"), "(Q.\\..)\\.(.)", convert = TRUE)
spec
#> # A tibble: 6 x 4
#>   col_name measure question loop_number
#>   <chr>    <chr>   <chr>          <int>
#> 1 Q3.2.1.  value   Q3.2               1
#> 2 Q3.2.2.  value   Q3.2               2
#> 3 Q3.2.3.  value   Q3.2               3
#> 4 Q3.3.1.  value   Q3.3               1
#> 5 Q3.3.2.  value   Q3.3               2
#> 6 Q3.3.3.  value   Q3.3               3

# Apply the spec to pivot `df` into long form (the Q columns become rows keyed
# by `question` and `loop_number`).
df %>% pivot(spec)
#> # A tibble: 60 x 5
#>       id time       question loop_number    value
#>    <int> <date>     <chr>          <int>    <dbl>
#>  1     1 2009-01-01 Q3.2               1  1.53   
#>  2     1 2009-01-01 Q3.2               2 -1.11   
#>  3     1 2009-01-01 Q3.2               3 -1.12   
#>  4     1 2009-01-01 Q3.3               1 -0.00144
#>  5     1 2009-01-01 Q3.3               2  0.191  
#>  6     1 2009-01-01 Q3.3               3  0.00916
#>  7     2 2009-01-02 Q3.2               1  1.00   
#>  8     2 2009-01-02 Q3.2               2  0.516  
#>  9     2 2009-01-02 Q3.2               3 -0.353  
#> 10     2 2009-01-02 Q3.3               1 -0.429  
#> # … with 50 more rows

# https://github.com/jennybc/lotr -----------------------------------------
# https://github.com/datacarpentry/archive-datacarpentry/tree/master/lessons/tidy-data

# One row per Race; the remaining columns are named <gender>_<film>.
x <- tribble(
  ~Race,~Female_LoTR,~Male_LoTR,~Female_TT,~Male_TT,~Female_RoTK,~Male_RoTK,
  "Elf",        1229,       971,       331,     513,         183,       510,
  "Hobbit",       14,      3644,         0,    2463,           2,      2673,
  "Man",           0,      1995,       401,    3589,         268,      2459
)

# Build a long-pivot spec from every column except `Race`, naming the measure
# "words", then split the spec's `variable` column into `gender` and `film`,
# and print the spec.
spec <- x %>% 
  pivot_spec_long(-Race, measure = "words") %>% 
  separate(variable, into = c("gender", "film"))
spec
#> # A tibble: 6 x 4
#>   col_name    measure gender film 
#>   <chr>       <chr>   <chr>  <chr>
#> 1 Female_LoTR words   Female LoTR 
#> 2 Male_LoTR   words   Male   LoTR 
#> 3 Female_TT   words   Female TT   
#> 4 Male_TT     words   Male   TT   
#> 5 Female_RoTK words   Female RoTK 
#> 6 Male_RoTK   words   Male   RoTK

# Apply the spec and show the first rows of the long result.
# NOTE(review): the recorded output below does not line up with `x` (Elf /
# Male / LoTR prints 14, but `x` has Male_LoTR = 971 for Elf) -- looks like a
# bug in the development pivot() used for this reprex; confirm.
x %>% pivot(spec) %>% head()
#> # A tibble: 6 x 4
#>   Race  gender film  words
#>   <chr> <chr>  <chr> <dbl>
#> 1 Elf   Female LoTR   1229
#> 2 Elf   Male   LoTR     14
#> 3 Elf   Female TT        0
#> 4 Elf   Male   TT      971
#> 5 Elf   Female RoTK   3644
#> 6 Elf   Male   RoTK   1995

Created on 2019-03-03 by the reprex package (v0.2.1.9000)

@hadley hadley closed this Mar 3, 2019
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Linked pull requests

Successfully merging a pull request may close this issue.

None yet
5 participants