Skip to content
Permalink
Browse files
Initial commit of dplyr
  • Loading branch information
hadley committed Oct 28, 2012
0 parents commit 80dc69b144711ec095db1d62cf0b73e09560eaf0
Showing with 439 additions and 0 deletions.
  1. +10 −0 DESCRIPTION
  2. +34 −0 R/arrange.r
  3. +63 −0 R/mutate.r
  4. +135 −0 R/parse.r
  5. +36 −0 R/subset.r
  6. +85 −0 R/summarise.r
  7. +53 −0 bench/access.r
  8. +23 −0 bench/equality.r
@@ -0,0 +1,10 @@
Package: plyr2
Type: Package
Title: Tools for splitting, applying and combining data
Version: 0.01
Author: Hadley Wickham <h.wickham@gmail.com>
Maintainer: Hadley Wickham <h.wickham@gmail.com>
Description: ddply on crack
Depends:
R (>= 2.15.1)
License: MIT
@@ -0,0 +1,34 @@
# system.time({
# grp <- plyr:::split_indices(id(baseball["id"]))
# a <- arrange_by(baseball, grp, quote(g))
# })
# a <- arrange(a, id)
# system.time(b <- ddply(baseball, "id", arrange, g))
arrange_by <- function(data, groups, call) {
n <- length(groups)

out <- rep(NA_integer_, nrow(data))
order_call <- as.call(c(list(quote(order)), call))

grp <- new.env(size = ncol(data), parent = parent.frame())
get_input <- function(j) {
force(j)
function(v) {
if (!missing(v)) stop("Immutable view")
# equivalent to data[[j]][rows] but avoids costly S3 dispatch
.subset2(data, j)[rows]
}
}
for (name in names(data)) {
makeActiveBinding(name, get_input(name), grp)
}

for (i in seq_len(n)) {
rows <- groups[[i]]

ord <- eval(order_call, grp)
out[rows] <- rows[ord]
}

data[out, ]
}
@@ -0,0 +1,63 @@
# system.time({
# grp <- plyr:::split_indices(id(baseball["id"]))
# vars <- list(cyear = quote(year - min(year) + 1))
# a <- mutate_by(baseball, grp, vars)
# })
# a <- a[order(baseball$id), , drop = FALSE]
#
# system.time(b <- ddply(baseball, "id", mutate, cyear = year - min(year) + 1))
#
# all.equal(a$cyear, b$cyear)
mutate_by <- function(data, groups, cols) {
n <- nrow(data)
p <- length(cols)

out <- vector("list", p)
names(out) <- names(cols)

grp <- new.env(size = p, parent = parent.frame())
get_input <- function(j) {
force(j)
function(v) {
if (!missing(v)) stop("Immutable view")
# equivalent to data[[j]][rows] but avoids costly S3 dispatch
.subset2(data, j)[rows]
}
}
get_output <- function(j) {
force(j)
function(v) {
if (!missing(v)) stop("Immutable view")
.subset2(out, j)[i]
}
}
for (name in names(data)) {
makeActiveBinding(name, get_input(name), grp)
}

for (i in seq_along(groups)) {
rows <- groups[[i]]

for (j in seq_len(p)) {
if (i == 1L) {
# Run mutate once to make vector of right type
template <- eval(cols[[j]], grp)
out[[j]] <- template[1]
length(out[[j]]) <- n
out[[j]][rows] <- template

name <- names(cols)[[i]]
makeActiveBinding(name, get_output(name), grp)
} else {
out[[j]][rows] <- eval(cols[[j]], grp)
}
}
}

# Coerce to data frame in place to avoid copying
class(out) <- "data.frame"
attr(out, "row.names") <- c(NA_integer_, -n)

out
}

135 R/parse.r
@@ -0,0 +1,135 @@
# For sql:
#
# y = var(x) -> sql_var(quote(x)) -> VAR(X) as y
#
# a <- 1:5
# id %in% a -> sql_in(quote(id), a) -> id IN (1, 2, 3, 4, 5)
#
# x == y && x < 4 -> sql_and(sql_eq(quote(x), quote(y)), sql_lt(quote(x), 4))
#
# median(x) -> NO SQL EQUIV
#
# Challenge is distinguishing between local and remote vars - maybe best
# solution is to delay translation until we know the data def. Still probably
# want explicit way of specifying local/remote variables if you want to be
# extra careful though.
#
# Need process to register UDFs:
#
# f <- function(x) mean(x) + sd(x) ->
# sql_f <- function(x) sql_plus(sql_mean(x), sql_sd(x))
#
# (also need way for user to add their SQL UDFS)
#
# Standard set of renaming:
# * > gt, >= gte, < lt, <= lte, == eq, != neq
# * ! not, && and, || or
# * %in% in, %% mod
# * + plus, - minus, * times, / divide, ...
# * ( = parens
#
# Otherwise everything gets standard prefix
# Will need to write wrapper functions by hand to check for arity and type

# Recurse through call, replacing function names with their
# equivalents, local variables with their values, and remote vars with
# quoted values
translate_sql <- function(call, source_vars, env = parent.frame()) {
if (!is.recursive(call)) {
# Base case

if (is.atomic(call)) {
call
} else if (is.symbol(call)) {
name <- as.character(call)
if (name %in% source_vars) {
substitute(sql_var(var), list(var = as.character(call)))
} else if (exists(name, env)) {
get(name, env)
} else {
stop(name, " not defined locally or in data source")
}
} else {
message("How did you get here?")
browser()
}
} else {
# Recursive case
if (!is.call(call)) {
message("How did you get here?")
browser()
}

new_name <- trans_name(call[[1]], "sql")
if (!exists(as.character(new_name))) {
# Don't know how to turn into sql equivalent - so give up
stop("Don't know how to translate ", as.character(call[[1]]), " to sql",
call. = FALSE)
} else {
args <- lapply(call[-1], translate_sql,
source_vars = source_vars, env = env)
as.call(c(list(new_name), args))
}

}

}

sql_var <- function(x) {
x
}

sql_mean <- function(x) {
str_c("MEAN(", x, ")")
}
sql_sum <- function(x) {
str_c("SUM(", x, ")")
}

sql_and <- function(x, y) {
str_c(x, " AND ", y)
}
sql_or <- function(x, y) {
str_c(x, " OR ", y)
}
sql_not <- function(x) {
str_c("NOT", x)
}
sql_parens <- function(x) {
str_c("(", x, ")")
}
sql_eq <- function(x, y) {
str_c(x, " == ", y)
}
sql_gt <- function(x, y) {
str_c(x, " > ", y)
}

trans_name <- function(symbol, type) {
x <- as.character(symbol)
if (x %in% names(mappings)) x <- mappings[[x]]

as.name(str_c(type, "_", x))
}
mappings <- c(
# Logical operators
"==" = "eq",
"!=" = "neq",
"<" = "lt",
">" = "gt",
"<=" = "lte",
">=" = "gte",

# Boolean comparison
"&&" = "and",
"||" = "or",
"!" = "not",

# Numerical
"+" = "plus",
"%%" = "mod",

# Misc
"%in%" = "in",
"(" = "parens"
)
@@ -0,0 +1,36 @@
# system.time({
# grp <- plyr:::split_indices(id(baseball["id"]))
# a <- subset_by(baseball, grp, quote(g == max(g)))
# })
#
# system.time(b <- ddply(baseball, "id", subset, g == max(g)))
#
# a <- arrange(a, id)
subset_by <- function(data, groups, call) {
n <- length(groups)

out <- rep(NA, nrow(data))

grp <- new.env(size = ncol(data), parent = parent.frame())
get_input <- function(j) {
force(j)
function(v) {
if (!missing(v)) stop("Immutable view")
# equivalent to data[[j]][rows] but avoids costly S3 dispatch
.subset2(data, j)[rows]
}
}
for (name in names(data)) {
makeActiveBinding(name, get_input(name), grp)
}

for (i in seq_len(n)) {
rows <- groups[[i]]

r <- eval(call, grp)
out[rows] <- r & !is.na(r)
}

data[out, ]
}

@@ -0,0 +1,85 @@
library(plyr)

# Restrictions:
# * summary functions must return single value
# * types must be the same in all groups
#
# Need to benchmark with and without checks for broken assumptions.
# Compare to aggregate, and to data.table
#
# Test with:
# * dates and factors
# * variables that depend on previous
#
# system.time({
# grp <- plyr:::split_indices(id(baseball["id"]))
# vars <- list(n = quote(length(id)), m = quote(n + 1))
# a <- summarise_by(baseball, grp, vars)
# })
#
# system.time(b <- ddply(baseball, "id", summarise, n = length(id)))
# stopifnot(all.equal(a$n, b$n))
# # ~20x slower
#
# system.time(count(baseball, "id"))
# # ~2x faster - in this case it's basically id + tabulate
# # so maybe able to eke out a little more with a C loop ?
#
# baseball2 <- data.table(baseball)
# system.time(baseball2[, list(n = length(year), m = n + 1), by = id])
# # ~ 0.007 - holy shit that's fast
# # but now only ~10x faster than summarise_by
# setkey(baseball2, id)
# system.time(baseball2[, length(year), by = id])
# # ~ 0.002 - even more insanely fast
summarise_by <- function(data, groups, cols) {
n <- length(groups)
p <- length(cols)

out <- vector("list", p)
names(out) <- names(cols)

grp <- new.env(size = p, parent = parent.frame())
get_input <- function(j) {
force(j)
function(v) {
if (!missing(v)) stop("Immutable view")
# equivalent to data[[j]][rows] but avoids costly S3 dispatch
.subset2(data, j)[rows]
}
}
get_output <- function(j) {
force(j)
function(v) {
if (!missing(v)) stop("Immutable view")
.subset2(out, j)[i]
}
}
for (name in names(data)) {
makeActiveBinding(name, get_input(name), grp)
}

for (i in seq_len(n)) {
rows <- groups[[i]]

for (j in seq_len(p)) {
if (i == 1L) {
# Run summarise once to make vector of right type
out[[j]] <- eval(cols[[j]], grp)
length(out[[j]]) <- n

name <- names(cols)[[i]]
makeActiveBinding(name, get_output(name), grp)
} else {
out[[j]][[i]] <- eval(cols[[j]], grp)
}
}
}

# Coerce to data frame in place to avoid copying
class(out) <- "data.frame"
attr(out, "row.names") <- c(NA_integer_, -n)

out
}

0 comments on commit 80dc69b

Please sign in to comment.