Merge pull request #21 from syberia/fix_column_transformation_nulls

Ensure NULL values in column_transformation result in column dropping
syberia · Jan 19, 2016 · c37fab6 · c37fab6
2 parents 5791429 + 73cfe3a
commit c37fab6
Show file tree

Hide file tree

Showing 4 changed files with 106 additions and 6 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,9 @@
-# Version 0.1.0.9000
+### Version 0.1.0.9001
+
+  * Fix a bug in `column_transformation` wherein `NULL` return
+    values corrupted the data.frame instead of dropping columns.
+
+### Version 0.1.0.9000
 
   * Start development version.
   * Change `munge` function to use an [objectdiff](https://github.com/robertzk/objectdiff)-compatible

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -6,7 +6,7 @@ Description: A way of thinking about data preparation that
     online prediction so that both can be described by the same codebase.
     With mungebits, you can save time on having to re-implement your
     R code to work in production and instead re-use the same codebase.
-Version: 0.1.0.9000
+Version: 0.1.0.9001
 Author: Robert Krzyzanowski <technoguyrob@gmail.com>
 Maintainer: Robert Krzyzanowski <technoguyrob@gmail.com>
 Authors@R: c(person("Robert", "Krzyzanowski",

diff --git a/R/column_transformation.R b/R/column_transformation.R
@@ -110,7 +110,7 @@ column_transformation <- function(transformation, nonstandard = FALSE) {
   full_transformation <- function(data, columns = colnames(data), ...) { }
   was_debugged <- isdebugged(transformation)
   environment(transformation) <- list2env(list(
-    input = NULL, trained = NULL
+    input = NULL, trained = NULL, has_no_null = NULL
   ), parent = environment(transformation) %||% baseenv())
 
   ## We create a copy of the `standard_column_format` helper
@@ -205,6 +205,17 @@ column_transformation_body <- quote({
     ), .Names = input$columns)
   }
 
+  ## Dataframe subset assignment (`[<-.data.frame`) does not behave in the
+  ## same manner as list assignment (`[<-`). Since we stripped the data.frame
+  ## of its class earlier, the next line will perform *list assignment*. 
+  ## This has advantageous speedups, but in particular if we drop some of
+  ## the columns by including `NULL` in the output of the transformation,
+  ## this will corrupt the data.frame with actual `NULL` values 
+  ## instead of dropping columns. We work around this with performance 
+  ## considerations by recording whether any of the values in the inner loop
+  ## are `NULL`.
+  env$has_no_null <- TRUE
+
   data[indices] <- lapply(seq_along(indices), function(j, ...) {
     ## Since `indices` match the column names to iterate over on
     ## the nose, `sub_inputs[[j]]` will be the correct environment to
@@ -246,17 +257,25 @@ column_transformation_body <- quote({
       arguments[[1L]] <- bquote(.(data_expr)[[.(
         if (named) arguments$name else .subset2(names(data), .subset2(indices, j))
       )]])
-      .Internal(do.call(transformation, arguments, eval_frame))
+      result <- .Internal(do.call(transformation, arguments, eval_frame))
     } else {
       ## If NSE should not be carried over we do not bother with the
       ## magic and simply send the function the value.
       if (named) {
-        transformation(.subset2(data, .subset2(indices, j)), ...,
+        result <- transformation(.subset2(data, .subset2(indices, j)), ...,
                        name = .subset2(names(data), .subset2(indices, j)))
       } else {
-        transformation(.subset2(data, .subset2(indices, j)), ...)
+        result <- transformation(.subset2(data, .subset2(indices, j)), ...)
       }
     }
+
+    ## Using a `has_no_null` flag is slightly faster than `has_null`,
+    ## since we can save on a call to `!` in the condition below.
+    if (env$has_no_null && is.null(result)) {
+      env$has_no_null <- FALSE
+    }
+
+    result
   }, ...)
 
   ## After training, we lock the `input` environments so that the
@@ -265,6 +284,16 @@ column_transformation_body <- quote({
     lapply(input$sub_inputs, lockEnvironment, bindings = TRUE)
   }
 
+  ## Finally, if some of the columns *were* dropped, explicitly 
+  ## remove them from the dataframe using `[[<-` list assignment.
+  ## This ensures that we do not drop any attributes and is faster
+  ## than subsetting to non-`NULL` columns.
+  if (!env$has_no_null) {
+    for (i in which(vapply(data, is.null, logical(1)))) {
+      data[[i]] <- NULL
+    }
+  }
+
   ## Finally, we reset the class to `data.frame` after stripping it
   ## for a speed optimization. If you study the code of ``(`[.data.frame`)``,
   ## you will see this is exactly the same trick the R base library uses

diff --git a/tests/testthat/test-column_transformation.R b/tests/testthat/test-column_transformation.R
@@ -87,6 +87,71 @@ describe("Simplest examples", {
 
 })
 
+describe("Dropping columns", {
+  test_that("it can drop columns using a column_transformation", {
+    dropper <- mungebit$new(column_transformation(function(x) NULL))
+    expect_equal(dropper$run(iris, 1), iris[-1])
+  })
+
+  test_that("it can partially drop columns using a column_transformation", {
+    dropper <- mungebit$new(column_transformation(function(x) if (is.numeric(x)) x))
+    expect_equal(dropper$run(iris), Filter(is.numeric, iris))
+  })
+
+  test_that("it can partially drop columns in predict using a column_transformation", {
+    dropper <- mungebit$new(column_transformation(function(x) if (is.numeric(x)) x))
+    dropper$run(iris)
+    expect_equal(dropper$run(iris), Filter(is.numeric, iris))
+  })
+
+  test_that("it can drop columns using a column_transformation in predict", {
+    dropper <- mungebit$new(column_transformation(function(x) NULL))
+    dropper$run(iris, 1)
+    expect_equal(dropper$run(iris, 1), iris[-1])
+  })
+})
+
+
+describe("Passing arguments", {
+
+  test_that("it correctly passes dots arguments", {
+    scaler <- mungebit$new(column_transformation(function(x, v) { v * x }))
+    iris2  <- scaler$run(iris[1:4], , 2)
+    expect_equal(iris2[1:4], 2 * iris[1:4],
+                 info = "column_transformation must double first column of iris")
+  })
+
+  test_that('accepts transformation calls with missing arguments', {
+    doubler <- mungebit$new(column_transformation(function(x) { 2 * x }))
+    iris2   <- doubler$run(iris[1: 4])
+    expect_equal(iris2, 2 * iris[, 1:4],
+                 info = "column_transformation must double first column of iris")
+  })
+
+  describe("Nonstandard evaluation pass-along", {
+    it("passes along nonstandard evaluation", {
+      nse <- mungebit$new(nse = TRUE, column_transformation(nonstandard = TRUE, function(x) {
+        paste0(deparse(substitute(x)), x) }))
+      balloo <- iris
+      iris2  <- nse$run(balloo, 5)
+      expect_equal(iris2, transform(iris, Species = paste0('balloo[["Species"]]', Species)),
+                   info = "column_transformation should pass along non-standard evaluation")
+    })
+  })
+
+  describe("Nonstandard evaluation pass-along with a call rather than a name", {
+    it("passes along nonstandard evaluation", {
+      nse <- mungebit$new(nse = TRUE, column_transformation(nonstandard = TRUE, function(x) {
+        paste0(deparse(substitute(x)), x) }))
+      balloo <- function() iris
+      iris2  <- nse$run(balloo(), 5)
+      expect_equal(iris2, transform(iris, Species = paste0('balloo()[["Species"]]', Species)),
+                   info = "column_transformation should pass along non-standard evaluation")
+    })
+  })
+})
+
+
 describe("Passing arguments", {
 
   test_that("it correctly passes dots arguments", {
@@ -184,3 +249,4 @@ describe("debugging", {
   })
 })
 
+