Allow bind_rows() to create an identifier column

Fixes #1337
tidyverse · Aug 27, 2015 · 6cf553a · 6cf553a
1 parent 07b723c
commit 6cf553a
Show file tree

Hide file tree

Showing 8 changed files with 118 additions and 19 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -21,6 +21,10 @@ Until now, dplyr's support for non-UTF8 encodings has been rather shaky. This re
   cleaning you no longer need to convert lists to data frames, but can
   instead feed them to `bind_rows()` directly.
 
+* `bind_rows()` gains a `.id` argument. When supplied, it creates a
+  new column of identifiers that links each row to its original data
+  frame (#1337, @lionel-).
+
 * `bind_rows()` respects the `ordered` attribute of factors (#1112), and
   does better at comparing `POSIXct`s (#1125). The `tz` attribute is ignored
   when determining if two `POSIXct` vectors are comparable. If the `tz` of

diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -38,8 +38,8 @@ between <- function(x, left, right) {
 }
 
 #' @export
-rbind_all <- function(dots) {
-    .Call('dplyr_rbind_all', PACKAGE = 'dplyr', dots)
+rbind_all <- function(dots, id = NULL) {
+    .Call('dplyr_rbind_all', PACKAGE = 'dplyr', dots, id)
 }
 
 rbind_list__impl <- function(dots) {

diff --git a/R/rbind.r b/R/rbind.r
@@ -18,6 +18,14 @@
 #'   frames must have the same number of rows. To match by value, not
 #'   position, see \code{left_join} etc. When row-binding, columns are
 #'   matched by name, and any values that don't match will be filled with NA.
+#' @param .id Data frames identifier.
+#'
+#'   When \code{.id} is supplied, a new column of identifiers is
+#'   created to link each row to its original data frame. The labels
+#'   are taken from the named arguments to \code{bind_rows()}. When a
+#'   list of data frames is supplied, the labels are taken from the
+#'   names of the list. If no names are found a numeric sequence is
+#'   used instead.
 #' @return \code{bind_rows} and \code{bind_cols} always return a \code{tbl_df}
 #' @aliases rbind_all rbind_list
 #' @examples
@@ -30,6 +38,11 @@
 #' bind_rows(list(one, two))
 #' bind_rows(split(mtcars, mtcars$cyl))
 #'
+#' # When you supply a column name with the `.id` argument, a new
+#' # column is created to link each row to its original data frame
+#' bind_rows(list(a = one, b = two), .id = "id")
+#' bind_rows("group 1" = one, "group 2" = two, .id = "groups")
+#'
 #' # Columns don't need to match when row-binding
 #' bind_rows(data.frame(x = 1:3), data.frame(y = 1:4))
 #' \dontrun{
@@ -54,12 +67,31 @@ NULL
 
 #' @export
 #' @rdname bind
-bind_rows <- function(x, ...) {
-  if (is.list(x) && !is.data.frame(x) && !length(list(...)) ) {
-    rbind_all(x)
-  } else {
-    rbind_all(list(x, ...))
+bind_rows <- function(..., .id = NULL) {
+  dots <- list(...)
+  if (is.list(dots[[1]]) &&
+        !is.data.frame(dots[[1]]) &&
+        !length(dots[-1])) {
+    x <- dots[[1]]
   }
+  else {
+    x <- dots
+  }
+
+  if (!is.null(.id)) {
+    if (!(is.character(.id) && length(.id) == 1)) {
+      stop(".id is not a string", call. = FALSE)
+    }
+    names(x) <- names(x) %||% seq_along(x)
+    if (any(is.na(names(x)))) {
+      warning("Some ID labels are missing", call. = FALSE)
+    }
+    if (any(na.omit(names(x) == ""))) {
+      warning("Some ID labels are empty", call. = FALSE)
+    }
+  }
+
+  rbind_all(x, .id)
 }
 
 #' @export

diff --git a/inst/include/tools/DotsOf.h b/inst/include/tools/DotsOf.h
@@ -67,6 +67,10 @@ namespace Rcpp {
             return data.size();
         }
 
+        inline int names() const{
+            return data.names();
+        }
+
     private:
         List data ;
 

diff --git a/man/bind.Rd b/man/bind.Rd
@@ -9,13 +9,22 @@
 \alias{rbind_list}
 \title{Efficiently bind multiple data frames by row and column.}
 \usage{
-bind_rows(x, ...)
+bind_rows(..., .id = NULL)
 
 bind_cols(x, ...)
 
 combine(x, ...)
 }
 \arguments{
+\item{.id}{Data frames identifier.
+
+  When \code{.id} is supplied, a new column of identifiers is
+  created to link each row to its original data frame. The labels
+  are taken from the named arguments to \code{bind_rows()}. When a
+  list of data frames is supplied, the labels are taken from the
+  names of the list. If no names are found a numeric sequence is
+  used instead.}
+
 \item{x,...}{Data frames to combine.
 
   You can either supply one data frame per argument, or a list of
@@ -50,6 +59,11 @@ bind_rows(one, two)
 bind_rows(list(one, two))
 bind_rows(split(mtcars, mtcars$cyl))
 
+# When you supply a column name with the `.id` argument, a new
+# column is created to link each row to its original data frame
+bind_rows(list(a = one, b = two), .id = "id")
+bind_rows("group 1" = one, "group 2" = two, .id = "groups")
+
 # Columns don't need to match when row-binding
 bind_rows(data.frame(x = 1:3), data.frame(y = 1:4))
 \dontrun{

diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
@@ -76,13 +76,14 @@ BEGIN_RCPP
 END_RCPP
 }
 // rbind_all
-List rbind_all(List dots);
-RcppExport SEXP dplyr_rbind_all(SEXP dotsSEXP) {
+List rbind_all(List dots, SEXP id);
+RcppExport SEXP dplyr_rbind_all(SEXP dotsSEXP, SEXP idSEXP) {
 BEGIN_RCPP
     Rcpp::RObject __result;
     Rcpp::RNGScope __rngScope;
     Rcpp::traits::input_parameter< List >::type dots(dotsSEXP);
-    __result = Rcpp::wrap(rbind_all(dots));
+    Rcpp::traits::input_parameter< SEXP >::type id(idSEXP);
+    __result = Rcpp::wrap(rbind_all(dots, id));
     return __result;
 END_RCPP
 }

diff --git a/src/bind.cpp b/src/bind.cpp
@@ -4,14 +4,18 @@ using namespace Rcpp ;
 using namespace dplyr ;
 
 template <typename Dots>
-List rbind__impl( Dots dots ){
+List rbind__impl( Dots dots, SEXP id = R_NilValue ){
     int ndata = dots.size() ;
     int n = 0 ;
     std::vector<DataFrameAble> chunks ;
+    std::vector<int> df_nrows ;
 
     for( int i=0; i<ndata; i++) {
-      chunks.push_back( DataFrameAble( dots[i] ) );
-      n += chunks[i].nrows() ;
+      chunks.push_back( DataFrameAble( dots[i] ) ) ;
+
+      int nrows = chunks[i].nrows() ;
+      df_nrows.push_back(nrows) ;
+      n += nrows ;
     }
     pointer_vector<Collecter> columns ;
 
@@ -82,23 +86,42 @@ List rbind__impl( Dots dots ){
 
         k += nrows ;
     }
+
     int nc = columns.size() ;
-    List out(nc) ;
-    CharacterVector out_names(nc) ;
+    int nc_id = nc + (Rf_isNull(id) ? 0 : 1) ;
+
+    List out(nc_id) ;
+    CharacterVector out_names(nc_id) ;
     for( int i=0; i<nc; i++){
         out[i] = columns[i]->get() ;
         out_names[i] = names[i] ;
     }
+
+    // Add vector of identifiers if .id is supplied
+    if (!Rf_isNull(id)) {
+      CharacterVector df_names = dots.names() ;
+      CharacterVector id_col = no_init(n) ;
+
+      CharacterVector::iterator it = id_col.begin() ;
+      for (int i=0; i<ndata; ++i) {
+        std::fill( it, it + df_nrows[i], df_names[i] ) ;
+        it += df_nrows[i] ;
+      }
+
+      out[nc_id - 1] = id_col ;
+      out_names[nc_id - 1] = Rcpp::as<std::string>(id) ;
+    }
+
     out.attr( "names" ) = out_names ;
-    set_rownames( out, n );
+    set_rownames( out, n ) ;
     out.attr( "class" ) = classes_not_grouped() ;
     return out ;
 }
 
 //' @export
 // [[Rcpp::export]]
-List rbind_all( List dots ){
-    return rbind__impl(dots) ;
+List rbind_all( List dots, SEXP id = R_NilValue ){
+    return rbind__impl(dots, id) ;
 }
 
 // [[Rcpp::export]]

diff --git a/tests/testthat/test-rbind.r b/tests/testthat/test-rbind.r
@@ -247,3 +247,24 @@ test_that("bind handles POSIXct of different tz ", {
   expect_equal( attr(res$date, "tzone"), "UTC" )
 
 })
+
+test_that("bind_rows() creates a column of identifiers (#1337)", {
+  data1 <- mtcars[c(2, 3), ]
+  data2 <- mtcars[1, ]
+
+  out <- bind_rows(data1, data2, .id = "col")
+  out_list <- bind_rows(list(data1, data2), .id = "col")
+  expect_equal(names(out)[12], "col")
+  expect_equal(out$col, c("1", "1", "2"))
+  expect_equal(out_list$col, c("1", "1", "2"))
+
+  out_labelled <- bind_rows(one = data1, two = data2, .id = "col")
+  out_list_labelled <- bind_rows(list(one = data1, two = data2), .id = "col")
+  expect_equal(out_labelled$col, c("one", "one", "two"))
+  expect_equal(out_list_labelled$col, c("one", "one", "two"))
+
+  list_missing <- list(data1, data2)
+  names(list_missing) <- c(NA, "two")
+  expect_warning(bind_rows(list_missing, .id = "col"))
+  expect_warning(bind_rows(list(data1, two = data2), .id = "col"))
+})