Skip to content

Commit

Permalink
Allow bind_rows() to create an identifier column
Browse files Browse the repository at this point in the history
Fixes #1337
  • Loading branch information
lionel- committed Aug 27, 2015
1 parent 07b723c commit 6cf553a
Show file tree
Hide file tree
Showing 8 changed files with 118 additions and 19 deletions.
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ Until now, dplyr's support for non-UTF8 encodings has been rather shaky. This re
cleaning you no longer need to convert lists to data frames, but can
instead feed them to `bind_rows()` directly.

* `bind_rows()` gains a `.id` argument. When supplied, it creates a
new column of identifiers that links each row to its original data
frame (#1337, @lionel-).

* `bind_rows()` respects the `ordered` attribute of factors (#1112), and
does better at comparing `POSIXct`s (#1125). The `tz` attribute is ignored
when determining if two `POSIXct` vectors are comparable. If the `tz` of
Expand Down
4 changes: 2 additions & 2 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ between <- function(x, left, right) {
}

#' @export
rbind_all <- function(dots) {
.Call('dplyr_rbind_all', PACKAGE = 'dplyr', dots)
rbind_all <- function(dots, id = NULL) {
.Call('dplyr_rbind_all', PACKAGE = 'dplyr', dots, id)
}

rbind_list__impl <- function(dots) {
Expand Down
42 changes: 37 additions & 5 deletions R/rbind.r
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,14 @@
#' frames must have the same number of rows. To match by value, not
#' position, see \code{left_join} etc. When row-binding, columns are
#' matched by name, and any values that don't match will be filled with NA.
#' @param .id Data frames identifier.
#'
#' When \code{.id} is supplied, a new column of identifiers is
#' created to link each row to its original data frame. The labels
#' are taken from the named arguments to \code{bind_rows()}. When a
#' list of data frames is supplied, the labels are taken from the
#' names of the list. If no names are found a numeric sequence is
#' used instead.
#' @return \code{bind_rows} and \code{bind_cols} always return a \code{tbl_df}
#' @aliases rbind_all rbind_list
#' @examples
Expand All @@ -30,6 +38,11 @@
#' bind_rows(list(one, two))
#' bind_rows(split(mtcars, mtcars$cyl))
#'
#' # When you supply a column name with the `.id` argument, a new
#' # column is created to link each row to its original data frame
#' bind_rows(list(a = one, b = two), .id = "id")
#' bind_rows("group 1" = one, "group 2" = two, .id = "groups")
#'
#' # Columns don't need to match when row-binding
#' bind_rows(data.frame(x = 1:3), data.frame(y = 1:4))
#' \dontrun{
Expand All @@ -54,12 +67,31 @@ NULL

#' @export
#' @rdname bind
bind_rows <- function(x, ...) {
if (is.list(x) && !is.data.frame(x) && !length(list(...)) ) {
rbind_all(x)
} else {
rbind_all(list(x, ...))
bind_rows <- function(..., .id = NULL) {
dots <- list(...)
if (is.list(dots[[1]]) &&
!is.data.frame(dots[[1]]) &&
!length(dots[-1])) {
x <- dots[[1]]
}
else {
x <- dots
}

if (!is.null(.id)) {
if (!(is.character(.id) && length(.id) == 1)) {
stop(".id is not a string", call. = FALSE)
}
names(x) <- names(x) %||% seq_along(x)
if (any(is.na(names(x)))) {
warning("Some ID labels are missing", call. = FALSE)
}
if (any(na.omit(names(x) == ""))) {
warning("Some ID labels are empty", call. = FALSE)
}
}

rbind_all(x, .id)
}

#' @export
Expand Down
4 changes: 4 additions & 0 deletions inst/include/tools/DotsOf.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,10 @@ namespace Rcpp {
return data.size();
}

inline int names() const{
return data.names();
}

private:
List data ;

Expand Down
16 changes: 15 additions & 1 deletion man/bind.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,22 @@
\alias{rbind_list}
\title{Efficiently bind multiple data frames by row and column.}
\usage{
bind_rows(x, ...)
bind_rows(..., .id = NULL)

bind_cols(x, ...)

combine(x, ...)
}
\arguments{
\item{.id}{Data frames identifier.

When \code{.id} is supplied, a new column of identifiers is
created to link each row to its original data frame. The labels
are taken from the named arguments to \code{bind_rows()}. When a
list of data frames is supplied, the labels are taken from the
names of the list. If no names are found a numeric sequence is
used instead.}

\item{x,...}{Data frames to combine.

You can either supply one data frame per argument, or a list of
Expand Down Expand Up @@ -50,6 +59,11 @@ bind_rows(one, two)
bind_rows(list(one, two))
bind_rows(split(mtcars, mtcars$cyl))
# When you supply a column name with the `.id` argument, a new
# column is created to link each row to its original data frame
bind_rows(list(a = one, b = two), .id = "id")
bind_rows("group 1" = one, "group 2" = two, .id = "groups")
# Columns don't need to match when row-binding
bind_rows(data.frame(x = 1:3), data.frame(y = 1:4))
\dontrun{
Expand Down
7 changes: 4 additions & 3 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,13 +76,14 @@ BEGIN_RCPP
END_RCPP
}
// rbind_all
List rbind_all(List dots);
RcppExport SEXP dplyr_rbind_all(SEXP dotsSEXP) {
List rbind_all(List dots, SEXP id);
RcppExport SEXP dplyr_rbind_all(SEXP dotsSEXP, SEXP idSEXP) {
BEGIN_RCPP
Rcpp::RObject __result;
Rcpp::RNGScope __rngScope;
Rcpp::traits::input_parameter< List >::type dots(dotsSEXP);
__result = Rcpp::wrap(rbind_all(dots));
Rcpp::traits::input_parameter< SEXP >::type id(idSEXP);
__result = Rcpp::wrap(rbind_all(dots, id));
return __result;
END_RCPP
}
Expand Down
39 changes: 31 additions & 8 deletions src/bind.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,18 @@ using namespace Rcpp ;
using namespace dplyr ;

template <typename Dots>
List rbind__impl( Dots dots ){
List rbind__impl( Dots dots, SEXP id = R_NilValue ){
int ndata = dots.size() ;
int n = 0 ;
std::vector<DataFrameAble> chunks ;
std::vector<int> df_nrows ;

for( int i=0; i<ndata; i++) {
chunks.push_back( DataFrameAble( dots[i] ) );
n += chunks[i].nrows() ;
chunks.push_back( DataFrameAble( dots[i] ) ) ;

int nrows = chunks[i].nrows() ;
df_nrows.push_back(nrows) ;
n += nrows ;
}
pointer_vector<Collecter> columns ;

Expand Down Expand Up @@ -82,23 +86,42 @@ List rbind__impl( Dots dots ){

k += nrows ;
}

int nc = columns.size() ;
List out(nc) ;
CharacterVector out_names(nc) ;
int nc_id = nc + (Rf_isNull(id) ? 0 : 1) ;

List out(nc_id) ;
CharacterVector out_names(nc_id) ;
for( int i=0; i<nc; i++){
out[i] = columns[i]->get() ;
out_names[i] = names[i] ;
}

// Add vector of identifiers if .id is supplied
if (!Rf_isNull(id)) {
CharacterVector df_names = dots.names() ;
CharacterVector id_col = no_init(n) ;

CharacterVector::iterator it = id_col.begin() ;
for (int i=0; i<ndata; ++i) {
std::fill( it, it + df_nrows[i], df_names[i] ) ;
it += df_nrows[i] ;
}

out[nc_id - 1] = id_col ;
out_names[nc_id - 1] = Rcpp::as<std::string>(id) ;
}

out.attr( "names" ) = out_names ;
set_rownames( out, n );
set_rownames( out, n ) ;
out.attr( "class" ) = classes_not_grouped() ;
return out ;
}

//' @export
// [[Rcpp::export]]
List rbind_all( List dots ){
return rbind__impl(dots) ;
List rbind_all( List dots, SEXP id = R_NilValue ){
return rbind__impl(dots, id) ;
}

// [[Rcpp::export]]
Expand Down
21 changes: 21 additions & 0 deletions tests/testthat/test-rbind.r
Original file line number Diff line number Diff line change
Expand Up @@ -247,3 +247,24 @@ test_that("bind handles POSIXct of different tz ", {
expect_equal( attr(res$date, "tzone"), "UTC" )

})

test_that("bind_rows() creates a column of identifiers (#1337)", {
data1 <- mtcars[c(2, 3), ]
data2 <- mtcars[1, ]

out <- bind_rows(data1, data2, .id = "col")
out_list <- bind_rows(list(data1, data2), .id = "col")
expect_equal(names(out)[12], "col")
expect_equal(out$col, c("1", "1", "2"))
expect_equal(out_list$col, c("1", "1", "2"))

out_labelled <- bind_rows(one = data1, two = data2, .id = "col")
out_list_labelled <- bind_rows(list(one = data1, two = data2), .id = "col")
expect_equal(out_labelled$col, c("one", "one", "two"))
expect_equal(out_list_labelled$col, c("one", "one", "two"))

list_missing <- list(data1, data2)
names(list_missing) <- c(NA, "two")
expect_warning(bind_rows(list_missing, .id = "col"))
expect_warning(bind_rows(list(data1, two = data2), .id = "col"))
})

0 comments on commit 6cf553a

Please sign in to comment.