-
Notifications
You must be signed in to change notification settings - Fork 2
/
mungepiece.R
95 lines (86 loc) · 3.21 KB
/
mungepiece.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#' @include mungepiece-initialize.R mungepiece-run.R
NULL
## Mungebits are intended to record the dichotomy between computations
## that occur at training time, such as computing the means of variables
## during imputation, and prediction time, such as restoring `NA` values
## with the precomputed means at prediction time.
##
## While a mungebit records the general computation that can apply to
## arbitrary datasets, a *mungepiece* records the training and prediction
## arguments applicable to the mungebit. For example, if we used an imputation
## mungebit that looked as follows
##
## ```r
## imputation_mungebit <- mungebit$new(function(data, columns) {
## data[columns] <- lapply(columns, function(column) {
## if (isTRUE(trained)) {
## input[[column]] <- mean(data[[column]], na.rm = TRUE)
## }
## ifelse(is.na(data[[column]]), input[[column]], data[[column]])
## })
## })
## ```
##
## then we may wish to record the columns to which the imputation
## applies. In this case, we can use a *mungepiece*.
##
## ```r
## piece <- mungepiece$new(imputation_mungebit, imputed_columns, imputed_columns)
## ```
##
## To run the mungepiece on our data set we can say `piece$run(data, column_names)`.
## The advantage of this approach is that after the mungepiece has been trained,
## it will remember the means and can be used on single row data.frames (i.e.,
## those coming in from production) without a change in syntax or calling
## convention. This means that the typical disproportion of spending the
## majority of one's time "munging data" is drastically reduced and no
## further code has to be written to ensure the transformations run correctly
## in a production setting.
#' Mungepiece.
#'
#' @name mungepiece
#' @docType class
#' @export
mungepiece <- R6::R6Class("mungepiece",
public = list(
.mungebit = NULL, # mungebit
.train_args = NULL, # list
.predict_args = NULL,
initialize = mungepiece_initialize,
run = mungepiece_run,
debug = function() { debug(self$.mungebit) },
undebug = function() { undebug(self$.mungebit) },
trained = function() { self$.mungebit$trained() },
mungebit = function() { self$.mungebit },
train_args = function() { env2list(self$.train_args) },
predict_args = function() { env2list(self$.predict_args) },
duplicate = function(...) { duplicate_mungepiece(self, ...) }
)
)
## A helper used to make a fresh untrained replica of an
## existing mungepiece.
duplicate_mungepiece <- function(piece, ...) {
## To ensure backwards compatibility with
## [legacy mungebits](https://github.com/robertzk/mungebits),
## we perform nothing is the piece is not an R6 object (and hence
## a new mungepiece in the style of this package).
if (is.legacy_mungepiece(piece)) {
piece
} else {
mungepiece$new(piece$mungebit()$duplicate(...),
piece$train_args(), piece$predict_args())
}
}
#' Determine whether an object is a mungepiece.
#'
#' @keywords typecheck
#' @param x ANY. An R object to check.
#' @return TRUE or FALSE according as it has class mungepiece
#' @export
is.mungepiece <- function(x) {
inherits(x, "mungepiece")
}
#' @export
print.mungepiece <- function(x, ...) {
print_mungepiece(x, ...)
}