Permalink
Browse files

Initial commit of dplyr

  • Loading branch information...
hadley committed Oct 28, 2012
0 parents commit 80dc69b144711ec095db1d62cf0b73e09560eaf0
Showing with 439 additions and 0 deletions.
  1. +10 −0 DESCRIPTION
  2. +34 −0 R/arrange.r
  3. +63 −0 R/mutate.r
  4. +135 −0 R/parse.r
  5. +36 −0 R/subset.r
  6. +85 −0 R/summarise.r
  7. +53 −0 bench/access.r
  8. +23 −0 bench/equality.r
@@ -0,0 +1,10 @@
+Package: plyr2
+Type: Package
+Title: Tools for splitting, applying and combining data
+Version: 0.01
+Author: Hadley Wickham <h.wickham@gmail.com>
+Maintainer: Hadley Wickham <h.wickham@gmail.com>
+Description: ddply on crack
+Depends:
+ R (>= 2.15.1)
+License: MIT
@@ -0,0 +1,34 @@
+# system.time({
+# grp <- plyr:::split_indices(id(baseball["id"]))
+# a <- arrange_by(baseball, grp, quote(g))
+# })
+# a <- arrange(a, id)
+# system.time(b <- ddply(baseball, "id", arrange, g))
+arrange_by <- function(data, groups, call) {
+ n <- length(groups)
+
+ out <- rep(NA_integer_, nrow(data))
+ order_call <- as.call(c(list(quote(order)), call))
+
+ grp <- new.env(size = ncol(data), parent = parent.frame())
+ get_input <- function(j) {
+ force(j)
+ function(v) {
+ if (!missing(v)) stop("Immutable view")
+ # equivalent to data[[j]][rows] but avoids costly S3 dispatch
+ .subset2(data, j)[rows]
+ }
+ }
+ for (name in names(data)) {
+ makeActiveBinding(name, get_input(name), grp)
+ }
+
+ for (i in seq_len(n)) {
+ rows <- groups[[i]]
+
+ ord <- eval(order_call, grp)
+ out[rows] <- rows[ord]
+ }
+
+ data[out, ]
+}
@@ -0,0 +1,63 @@
+# system.time({
+# grp <- plyr:::split_indices(id(baseball["id"]))
+# vars <- list(cyear = quote(year - min(year) + 1))
+# a <- mutate_by(baseball, grp, vars)
+# })
+# a <- a[order(baseball$id), , drop = FALSE]
+#
+# system.time(b <- ddply(baseball, "id", mutate, cyear = year - min(year) + 1))
+#
+# all.equal(a$cyear, b$cyear)
+mutate_by <- function(data, groups, cols) {
+ n <- nrow(data)
+ p <- length(cols)
+
+ out <- vector("list", p)
+ names(out) <- names(cols)
+
+ grp <- new.env(size = p, parent = parent.frame())
+ get_input <- function(j) {
+ force(j)
+ function(v) {
+ if (!missing(v)) stop("Immutable view")
+ # equivalent to data[[j]][rows] but avoids costly S3 dispatch
+ .subset2(data, j)[rows]
+ }
+ }
+ get_output <- function(j) {
+ force(j)
+ function(v) {
+ if (!missing(v)) stop("Immutable view")
+ .subset2(out, j)[i]
+ }
+ }
+ for (name in names(data)) {
+ makeActiveBinding(name, get_input(name), grp)
+ }
+
+ for (i in seq_along(groups)) {
+ rows <- groups[[i]]
+
+ for (j in seq_len(p)) {
+ if (i == 1L) {
+ # Run mutate once to make vector of right type
+ template <- eval(cols[[j]], grp)
+ out[[j]] <- template[1]
+ length(out[[j]]) <- n
+ out[[j]][rows] <- template
+
+ name <- names(cols)[[i]]
+ makeActiveBinding(name, get_output(name), grp)
+ } else {
+ out[[j]][rows] <- eval(cols[[j]], grp)
+ }
+ }
+ }
+
+ # Coerce to data frame in place to avoid copying
+ class(out) <- "data.frame"
+ attr(out, "row.names") <- c(NA_integer_, -n)
+
+ out
+}
+
135 R/parse.r
@@ -0,0 +1,135 @@
+# For sql:
+#
+# y = var(x) -> sql_var(quote(x)) -> VAR(X) as y
+#
+# a <- 1:5
+# id %in% a -> sql_in(quote(id), a) -> id IN (1, 2, 3, 4, 5)
+#
+# x == y && x < 4 -> sql_and(sql_eq(quote(x), quote(y)), sql_lt(quote(x), 4))
+#
+# median(x) -> NO SQL EQUIV
+#
+# Challenge is distinguishing between local and remote vars - maybe best
+# solution is to delay translation until we know the data def. Still probably
+# want explicit way of specifying local/remote variables if you want to be
+# extra careful though.
+#
+# Need process to register UDFs:
+#
+# f <- function(x) mean(x) + sd(x) ->
+# sql_f <- function(x) sql_plus(sql_mean(x), sql_sd(x))
+#
+# (also need way for user to add their SQL UDFS)
+#
+# Standard set of renaming:
+# * > gt, >= gte, < lt, <= lte, == eq, != neq
+# * ! not, && and, || or
+# * %in% in, %% mod
+# * + plus, - minus, * times, / divide, ...
+# * ( = parens
+#
+# Otherwise everything gets standard prefix
+# Will need to write wrapper functions by hand to check for arity and type
+
+# Recurse through call, replacing function names with their
+# equivalents, local variables with their values, and remote vars with
+# quoted values
+translate_sql <- function(call, source_vars, env = parent.frame()) {
+ if (!is.recursive(call)) {
+ # Base case
+
+ if (is.atomic(call)) {
+ call
+ } else if (is.symbol(call)) {
+ name <- as.character(call)
+ if (name %in% source_vars) {
+ substitute(sql_var(var), list(var = as.character(call)))
+ } else if (exists(name, env)) {
+ get(name, env)
+ } else {
+ stop(name, " not defined locally or in data source")
+ }
+ } else {
+ message("How did you get here?")
+ browser()
+ }
+ } else {
+ # Recursive case
+ if (!is.call(call)) {
+ message("How did you get here?")
+ browser()
+ }
+
+ new_name <- trans_name(call[[1]], "sql")
+ if (!exists(as.character(new_name))) {
+ # Don't know how to turn into sql equivalent - so give up
+ stop("Don't know how to translate ", as.character(call[[1]]), " to sql",
+ call. = FALSE)
+ } else {
+ args <- lapply(call[-1], translate_sql,
+ source_vars = source_vars, env = env)
+ as.call(c(list(new_name), args))
+ }
+
+ }
+
+}
+
+sql_var <- function(x) {
+ x
+}
+
+sql_mean <- function(x) {
+ str_c("MEAN(", x, ")")
+}
+sql_sum <- function(x) {
+ str_c("SUM(", x, ")")
+}
+
+sql_and <- function(x, y) {
+ str_c(x, " AND ", y)
+}
+sql_or <- function(x, y) {
+ str_c(x, " OR ", y)
+}
+sql_not <- function(x) {
+ str_c("NOT", x)
+}
+sql_parens <- function(x) {
+ str_c("(", x, ")")
+}
+sql_eq <- function(x, y) {
+ str_c(x, " == ", y)
+}
+sql_gt <- function(x, y) {
+ str_c(x, " > ", y)
+}
+
+trans_name <- function(symbol, type) {
+ x <- as.character(symbol)
+ if (x %in% names(mappings)) x <- mappings[[x]]
+
+ as.name(str_c(type, "_", x))
+}
+mappings <- c(
+ # Logical operators
+ "==" = "eq",
+ "!=" = "neq",
+ "<" = "lt",
+ ">" = "gt",
+ "<=" = "lte",
+ ">=" = "gte",
+
+ # Boolean comparison
+ "&&" = "and",
+ "||" = "or",
+ "!" = "not",
+
+ # Numerical
+ "+" = "plus",
+ "%%" = "mod",
+
+ # Misc
+ "%in%" = "in",
+ "(" = "parens"
+)
@@ -0,0 +1,36 @@
+# system.time({
+# grp <- plyr:::split_indices(id(baseball["id"]))
+# a <- subset_by(baseball, grp, quote(g == max(g)))
+# })
+#
+# system.time(b <- ddply(baseball, "id", subset, g == max(g)))
+#
+# a <- arrange(a, id)
+subset_by <- function(data, groups, call) {
+ n <- length(groups)
+
+ out <- rep(NA, nrow(data))
+
+ grp <- new.env(size = ncol(data), parent = parent.frame())
+ get_input <- function(j) {
+ force(j)
+ function(v) {
+ if (!missing(v)) stop("Immutable view")
+ # equivalent to data[[j]][rows] but avoids costly S3 dispatch
+ .subset2(data, j)[rows]
+ }
+ }
+ for (name in names(data)) {
+ makeActiveBinding(name, get_input(name), grp)
+ }
+
+ for (i in seq_len(n)) {
+ rows <- groups[[i]]
+
+ r <- eval(call, grp)
+ out[rows] <- r & !is.na(r)
+ }
+
+ data[out, ]
+}
+
@@ -0,0 +1,85 @@
+library(plyr)
+
+# Restrictions:
+# * summary functions must return single value
+# * types must be the same in all groups
+#
+# Need to benchmark with and without checks for broken assumptions.
+# Compare to aggregate, and to data.table
+#
+# Test with:
+# * dates and factors
+# * variables that depend on previous
+#
+# system.time({
+# grp <- plyr:::split_indices(id(baseball["id"]))
+# vars <- list(n = quote(length(id)), m = quote(n + 1))
+# a <- summarise_by(baseball, grp, vars)
+# })
+#
+# system.time(b <- ddply(baseball, "id", summarise, n = length(id)))
+# stopifnot(all.equal(a$n, b$n))
+# # ~20x slower
+#
+# system.time(count(baseball, "id"))
+# # ~2x faster - in this case it's basically id + tabulate
+# # so maybe able to eke out a little more with a C loop ?
+#
+# baseball2 <- data.table(baseball)
+# system.time(baseball2[, list(n = length(year), m = n + 1), by = id])
+# # ~ 0.007 - holy shit that's fast
+# # but now only ~10x faster than summarise_by
+# setkey(baseball2, id)
+# system.time(baseball2[, length(year), by = id])
+# # ~ 0.002 - even more insanely fast
+summarise_by <- function(data, groups, cols) {
+ n <- length(groups)
+ p <- length(cols)
+
+ out <- vector("list", p)
+ names(out) <- names(cols)
+
+ grp <- new.env(size = p, parent = parent.frame())
+ get_input <- function(j) {
+ force(j)
+ function(v) {
+ if (!missing(v)) stop("Immutable view")
+ # equivalent to data[[j]][rows] but avoids costly S3 dispatch
+ .subset2(data, j)[rows]
+ }
+ }
+ get_output <- function(j) {
+ force(j)
+ function(v) {
+ if (!missing(v)) stop("Immutable view")
+ .subset2(out, j)[i]
+ }
+ }
+ for (name in names(data)) {
+ makeActiveBinding(name, get_input(name), grp)
+ }
+
+ for (i in seq_len(n)) {
+ rows <- groups[[i]]
+
+ for (j in seq_len(p)) {
+ if (i == 1L) {
+ # Run summarise once to make vector of right type
+ out[[j]] <- eval(cols[[j]], grp)
+ length(out[[j]]) <- n
+
+ name <- names(cols)[[i]]
+ makeActiveBinding(name, get_output(name), grp)
+ } else {
+ out[[j]][[i]] <- eval(cols[[j]], grp)
+ }
+ }
+ }
+
+ # Coerce to data frame in place to avoid copying
+ class(out) <- "data.frame"
+ attr(out, "row.names") <- c(NA_integer_, -n)
+
+ out
+}
+
Oops, something went wrong.

0 comments on commit 80dc69b

Please sign in to comment.