Skip to content

Commit

Permalink
Merge 7849688 into 5791429
Browse files Browse the repository at this point in the history
  • Loading branch information
robertzk committed Jan 18, 2016
2 parents 5791429 + 7849688 commit f9beaaf
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 6 deletions.
7 changes: 6 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
# Version 0.1.0.9000
### Version 0.1.0.9001

* Fix a bug in `column_transformation` wherein `NULL` return
values corrupted the data.frame instead of dropping columns.

### Version 0.1.0.9000

* Start development version.
* Change `munge` function to use an [objectdiff](https://github.com/robertzk/objectdiff)-compatible
Expand Down
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ Description: A way of thinking about data preparation that
online prediction so that both can be described by the same codebase.
With mungebits, you can save time on having to re-implement your
R code to work in production and instead re-use the same codebase.
Version: 0.1.0.9000
Version: 0.1.0.9001
Author: Robert Krzyzanowski <technoguyrob@gmail.com>
Maintainer: Robert Krzyzanowski <technoguyrob@gmail.com>
Authors@R: c(person("Robert", "Krzyzanowski",
Expand Down
37 changes: 33 additions & 4 deletions R/column_transformation.R
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ column_transformation <- function(transformation, nonstandard = FALSE) {
full_transformation <- function(data, columns = colnames(data), ...) { }
was_debugged <- isdebugged(transformation)
environment(transformation) <- list2env(list(
input = NULL, trained = NULL
input = NULL, trained = NULL, has_no_null = NULL
), parent = environment(transformation) %||% baseenv())

## We create a copy of the `standard_column_format` helper
Expand Down Expand Up @@ -205,6 +205,17 @@ column_transformation_body <- quote({
), .Names = input$columns)
}

## Dataframe subset assignment (`[<-.data.frame`) does not behave in the
## same manner as list assignment (`[<-`). Since we stripped the data.frame
## of its class earlier, the next line will perform *list assignment*.
## This has advantageous speedups, but in particular if we drop some of
## the columns by including `NULL` in the output of the transformation,
## this will corrupt the data.frame with actual `NULL` values
## instead of dropping columns. We work around this with performance
## considerations by recording whether any of the values in the inner loop
## are `NULL`.
env$has_no_null <- TRUE

data[indices] <- lapply(seq_along(indices), function(j, ...) {
## Since `indices` match the column names to iterate over on
## the nose, `sub_inputs[[j]]` will be the correct environment to
Expand Down Expand Up @@ -246,17 +257,25 @@ column_transformation_body <- quote({
arguments[[1L]] <- bquote(.(data_expr)[[.(
if (named) arguments$name else .subset2(names(data), .subset2(indices, j))
)]])
.Internal(do.call(transformation, arguments, eval_frame))
result <- .Internal(do.call(transformation, arguments, eval_frame))
} else {
## If NSE should not be carried over we do not bother with the
## magic and simply send the function the value.
if (named) {
transformation(.subset2(data, .subset2(indices, j)), ...,
result <- transformation(.subset2(data, .subset2(indices, j)), ...,
name = .subset2(names(data), .subset2(indices, j)))
} else {
transformation(.subset2(data, .subset2(indices, j)), ...)
result <- transformation(.subset2(data, .subset2(indices, j)), ...)
}
}

## Using a `has_no_null` flag is slightly faster than `has_null`,
## since we can save on a call to `!` in the condition below.
if (env$has_no_null && is.null(result)) {
env$has_no_null <- FALSE
}

result
}, ...)

## After training, we lock the `input` environments so that the
Expand All @@ -265,6 +284,16 @@ column_transformation_body <- quote({
lapply(input$sub_inputs, lockEnvironment, bindings = TRUE)
}

## Finally, if some of the columns *were* dropped, explicitly
## remove them from the dataframe using `[[<-` list assignment.
## This ensures that we do not drop any attributes and is faster
## than subsetting to non-`NULL` columns.
if (!env$has_no_null) {
for (i in which(vapply(data, is.null, logical(1)))) {
data[[i]] <- NULL
}
}

## Finally, we reset the class to `data.frame` after stripping it
## for a speed optimization. If you study the code of ``(`[.data.frame`)``,
## you will see this is exactly the same trick the R base library uses
Expand Down
8 changes: 8 additions & 0 deletions tests/testthat/test-column_transformation.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
context("column transformation")

describe("dropping columns", {
test_that("it can drop columns using a column_transformation", {
dropper <- mungebit$new(column_transformation(function(x) NULL))
expect_equal(dropper$run(iris, 1), iris[-1])
})
})

describe("Simplest examples", {
test_that("it can run an identity column transformation", {
expect_equal(mungebit$new(column_transformation(identity))$run(iris), iris)
Expand Down Expand Up @@ -184,3 +191,4 @@ describe("debugging", {
})
})


0 comments on commit f9beaaf

Please sign in to comment.