Skip to content

Commit

Permalink
Merge pull request #21 from syberia/fix_column_transformation_nulls
Browse files Browse the repository at this point in the history
Ensure NULL values in column_transformation result in column dropping
  • Loading branch information
robertzk committed Jan 19, 2016
2 parents 5791429 + 73cfe3a commit c37fab6
Show file tree
Hide file tree
Showing 4 changed files with 106 additions and 6 deletions.
7 changes: 6 additions & 1 deletion CHANGELOG.md
@@ -1,4 +1,9 @@
# Version 0.1.0.9000
### Version 0.1.0.9001

* Fix a bug in `column_transformation` wherein `NULL` return
values corrupted the data.frame instead of dropping columns.

### Version 0.1.0.9000

* Start development version.
* Change `munge` function to use an [objectdiff](https://github.com/robertzk/objectdiff)-compatible
Expand Down
2 changes: 1 addition & 1 deletion DESCRIPTION
Expand Up @@ -6,7 +6,7 @@ Description: A way of thinking about data preparation that
online prediction so that both can be described by the same codebase.
With mungebits, you can save time on having to re-implement your
R code to work in production and instead re-use the same codebase.
Version: 0.1.0.9000
Version: 0.1.0.9001
Author: Robert Krzyzanowski <technoguyrob@gmail.com>
Maintainer: Robert Krzyzanowski <technoguyrob@gmail.com>
Authors@R: c(person("Robert", "Krzyzanowski",
Expand Down
37 changes: 33 additions & 4 deletions R/column_transformation.R
Expand Up @@ -110,7 +110,7 @@ column_transformation <- function(transformation, nonstandard = FALSE) {
full_transformation <- function(data, columns = colnames(data), ...) { }
was_debugged <- isdebugged(transformation)
environment(transformation) <- list2env(list(
input = NULL, trained = NULL
input = NULL, trained = NULL, has_no_null = NULL
), parent = environment(transformation) %||% baseenv())

## We create a copy of the `standard_column_format` helper
Expand Down Expand Up @@ -205,6 +205,17 @@ column_transformation_body <- quote({
), .Names = input$columns)
}

## Dataframe subset assignment (`[<-.data.frame`) does not behave in the
## same manner as list assignment (`[<-`). Since we stripped the data.frame
## of its class earlier, the next line will perform *list assignment*.
## This has advantageous speedups, but in particular if we drop some of
## the columns by including `NULL` in the output of the transformation,
## this will corrupt the data.frame with actual `NULL` values
## instead of dropping columns. We work around this with performance
## considerations by recording whether any of the values in the inner loop
## are `NULL`.
env$has_no_null <- TRUE

data[indices] <- lapply(seq_along(indices), function(j, ...) {
## Since `indices` match the column names to iterate over on
## the nose, `sub_inputs[[j]]` will be the correct environment to
Expand Down Expand Up @@ -246,17 +257,25 @@ column_transformation_body <- quote({
arguments[[1L]] <- bquote(.(data_expr)[[.(
if (named) arguments$name else .subset2(names(data), .subset2(indices, j))
)]])
.Internal(do.call(transformation, arguments, eval_frame))
result <- .Internal(do.call(transformation, arguments, eval_frame))
} else {
## If NSE should not be carried over we do not bother with the
## magic and simply send the function the value.
if (named) {
transformation(.subset2(data, .subset2(indices, j)), ...,
result <- transformation(.subset2(data, .subset2(indices, j)), ...,
name = .subset2(names(data), .subset2(indices, j)))
} else {
transformation(.subset2(data, .subset2(indices, j)), ...)
result <- transformation(.subset2(data, .subset2(indices, j)), ...)
}
}

## Using a `has_no_null` flag is slightly faster than `has_null`,
## since we can save on a call to `!` in the condition below.
if (env$has_no_null && is.null(result)) {
env$has_no_null <- FALSE
}

result
}, ...)

## After training, we lock the `input` environments so that the
Expand All @@ -265,6 +284,16 @@ column_transformation_body <- quote({
lapply(input$sub_inputs, lockEnvironment, bindings = TRUE)
}

## Finally, if some of the columns *were* dropped, explicitly
## remove them from the dataframe using `[[<-` list assignment.
## This ensures that we do not drop any attributes and is faster
## than subsetting to non-`NULL` columns.
if (!env$has_no_null) {
for (i in which(vapply(data, is.null, logical(1)))) {
data[[i]] <- NULL
}
}

## Finally, we reset the class to `data.frame` after stripping it
## for a speed optimization. If you study the code of ``(`[.data.frame`)``,
## you will see this is exactly the same trick the R base library uses
Expand Down
66 changes: 66 additions & 0 deletions tests/testthat/test-column_transformation.R
Expand Up @@ -87,6 +87,71 @@ describe("Simplest examples", {

})

describe("Dropping columns", {
test_that("it can drop columns using a column_transformation", {
dropper <- mungebit$new(column_transformation(function(x) NULL))
expect_equal(dropper$run(iris, 1), iris[-1])
})

test_that("it can partially drop columns using a column_transformation", {
dropper <- mungebit$new(column_transformation(function(x) if (is.numeric(x)) x))
expect_equal(dropper$run(iris), Filter(is.numeric, iris))
})

test_that("it can partially drop columns in predict using a column_transformation", {
dropper <- mungebit$new(column_transformation(function(x) if (is.numeric(x)) x))
dropper$run(iris)
expect_equal(dropper$run(iris), Filter(is.numeric, iris))
})

test_that("it can drop columns using a column_transformation in predict", {
dropper <- mungebit$new(column_transformation(function(x) NULL))
dropper$run(iris, 1)
expect_equal(dropper$run(iris, 1), iris[-1])
})
})


describe("Passing arguments", {

test_that("it correctly passes dots arguments", {
scaler <- mungebit$new(column_transformation(function(x, v) { v * x }))
iris2 <- scaler$run(iris[1:4], , 2)
expect_equal(iris2[1:4], 2 * iris[1:4],
info = "column_transformation must double first column of iris")
})

test_that('accepts transformation calls with missing arguments', {
doubler <- mungebit$new(column_transformation(function(x) { 2 * x }))
iris2 <- doubler$run(iris[1: 4])
expect_equal(iris2, 2 * iris[, 1:4],
info = "column_transformation must double first column of iris")
})

describe("Nonstandard evaluation pass-along", {
it("passes along nonstandard evaluation", {
nse <- mungebit$new(nse = TRUE, column_transformation(nonstandard = TRUE, function(x) {
paste0(deparse(substitute(x)), x) }))
balloo <- iris
iris2 <- nse$run(balloo, 5)
expect_equal(iris2, transform(iris, Species = paste0('balloo[["Species"]]', Species)),
info = "column_transformation should pass along non-standard evaluation")
})
})

describe("Nonstandard evaluation pass-along with a call rather than a name", {
it("passes along nonstandard evaluation", {
nse <- mungebit$new(nse = TRUE, column_transformation(nonstandard = TRUE, function(x) {
paste0(deparse(substitute(x)), x) }))
balloo <- function() iris
iris2 <- nse$run(balloo(), 5)
expect_equal(iris2, transform(iris, Species = paste0('balloo()[["Species"]]', Species)),
info = "column_transformation should pass along non-standard evaluation")
})
})
})


describe("Passing arguments", {

test_that("it correctly passes dots arguments", {
Expand Down Expand Up @@ -184,3 +249,4 @@ describe("debugging", {
})
})


0 comments on commit c37fab6

Please sign in to comment.