Merge 7849688 into 5791429

syberia · Jan 18, 2016 · f9beaaf · f9beaaf
2 parents 5791429 + 7849688
commit f9beaaf
Show file tree

Hide file tree

Showing 4 changed files with 48 additions and 6 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,9 @@
-# Version 0.1.0.9000
+### Version 0.1.0.9001
+
+  * Fix a bug in `column_transformation` wherein `NULL` return
+    values corrupted the data.frame instead of dropping columns.
+
+### Version 0.1.0.9000
 
   * Start development version.
   * Change `munge` function to use an [objectdiff](https://github.com/robertzk/objectdiff)-compatible

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -6,7 +6,7 @@ Description: A way of thinking about data preparation that
     online prediction so that both can be described by the same codebase.
     With mungebits, you can save time on having to re-implement your
     R code to work in production and instead re-use the same codebase.
-Version: 0.1.0.9000
+Version: 0.1.0.9001
 Author: Robert Krzyzanowski <technoguyrob@gmail.com>
 Maintainer: Robert Krzyzanowski <technoguyrob@gmail.com>
 Authors@R: c(person("Robert", "Krzyzanowski",

diff --git a/R/column_transformation.R b/R/column_transformation.R
@@ -110,7 +110,7 @@ column_transformation <- function(transformation, nonstandard = FALSE) {
   full_transformation <- function(data, columns = colnames(data), ...) { }
   was_debugged <- isdebugged(transformation)
   environment(transformation) <- list2env(list(
-    input = NULL, trained = NULL
+    input = NULL, trained = NULL, has_no_null = NULL
   ), parent = environment(transformation) %||% baseenv())
 
   ## We create a copy of the `standard_column_format` helper
@@ -205,6 +205,17 @@ column_transformation_body <- quote({
     ), .Names = input$columns)
   }
 
+  ## Dataframe subset assignment (`[<-.data.frame`) does not behave in the
+  ## same manner as list assignment (`[<-`). Since we stripped the data.frame
+  ## of its class earlier, the next line will perform *list assignment*. 
+  ## This has advantageous speedups, but in particular if we drop some of
+  ## the columns by including `NULL` in the output of the transformation,
+  ## this will corrupt the data.frame with actual `NULL` values 
+  ## instead of dropping columns. We work around this with performance 
+  ## considerations by recording whether any of the values in the inner loop
+  ## are `NULL`.
+  env$has_no_null <- TRUE
+
   data[indices] <- lapply(seq_along(indices), function(j, ...) {
     ## Since `indices` match the column names to iterate over on
     ## the nose, `sub_inputs[[j]]` will be the correct environment to
@@ -246,17 +257,25 @@ column_transformation_body <- quote({
       arguments[[1L]] <- bquote(.(data_expr)[[.(
         if (named) arguments$name else .subset2(names(data), .subset2(indices, j))
       )]])
-      .Internal(do.call(transformation, arguments, eval_frame))
+      result <- .Internal(do.call(transformation, arguments, eval_frame))
     } else {
       ## If NSE should not be carried over we do not bother with the
       ## magic and simply send the function the value.
       if (named) {
-        transformation(.subset2(data, .subset2(indices, j)), ...,
+        result <- transformation(.subset2(data, .subset2(indices, j)), ...,
                        name = .subset2(names(data), .subset2(indices, j)))
       } else {
-        transformation(.subset2(data, .subset2(indices, j)), ...)
+        result <- transformation(.subset2(data, .subset2(indices, j)), ...)
       }
     }
+
+    ## Using a `has_no_null` flag is slightly faster than `has_null`,
+    ## since we can save on a call to `!` in the condition below.
+    if (env$has_no_null && is.null(result)) {
+      env$has_no_null <- FALSE
+    }
+
+    result
   }, ...)
 
   ## After training, we lock the `input` environments so that the
@@ -265,6 +284,16 @@ column_transformation_body <- quote({
     lapply(input$sub_inputs, lockEnvironment, bindings = TRUE)
   }
 
+  ## Finally, if some of the columns *were* dropped, explicitly 
+  ## remove them from the dataframe using `[[<-` list assignment.
+  ## This ensures that we do not drop any attributes and is faster
+  ## than subsetting to non-`NULL` columns.
+  if (!env$has_no_null) {
+    for (i in which(vapply(data, is.null, logical(1)))) {
+      data[[i]] <- NULL
+    }
+  }
+
   ## Finally, we reset the class to `data.frame` after stripping it
   ## for a speed optimization. If you study the code of ``(`[.data.frame`)``,
   ## you will see this is exactly the same trick the R base library uses

diff --git a/tests/testthat/test-column_transformation.R b/tests/testthat/test-column_transformation.R
@@ -1,5 +1,12 @@
 context("column transformation")
 
+describe("dropping columns", {
+  test_that("it can drop columns using a column_transformation", {
+    dropper <- mungebit$new(column_transformation(function(x) NULL))
+    expect_equal(dropper$run(iris, 1), iris[-1])
+  })
+})
+
 describe("Simplest examples", {
   test_that("it can run an identity column transformation", {
     expect_equal(mungebit$new(column_transformation(identity))$run(iris), iris)
@@ -184,3 +191,4 @@ describe("debugging", {
   })
 })
 
+