diff --git a/.travis.yml b/.travis.yml index 8c66d198d..7871b5512 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,11 +8,16 @@ sudo: true warnings_are_errors: false r: +- 3.1 +- 3.2 +- oldrel - release - devel env: - KERAS_BACKEND="tensorflow" + global: + - MAKEFLAGS="-j 2" r_binary_packages: - rstan diff --git a/DESCRIPTION b/DESCRIPTION index e1635b0d1..c7c333099 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: parsnip -Version: 0.0.0.9003 +Version: 0.0.0.9004 Title: A Common API to Modeling and analysis Functions Description: A common interface is provided to allow users to specify a model without having to remember the different argument names across different functions or computational engines (e.g. R, spark, stan, etc). Authors@R: c( diff --git a/NAMESPACE b/NAMESPACE index c24da566c..1a9dc2bc8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -58,12 +58,12 @@ S3method(varying_args,model_spec) S3method(varying_args,recipe) S3method(varying_args,step) export("%>%") +export(.cols) export(.dat) -export(.n_cols) -export(.n_facts) -export(.n_levs) -export(.n_obs) -export(.n_preds) +export(.facts) +export(.lvls) +export(.obs) +export(.preds) export(.x) export(.y) export(C5.0_train) diff --git a/NEWS.md b/NEWS.md index 523583198..b8bfad6f6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,7 +1,13 @@ +# parsnip 0.0.0.9004 + +* Arguments to modeling functions are now captured as quosures. +* `others` has been replaced by `...`. +* Data descriptor names have been changed and are now functions. The descriptor definitions for "cols" and "preds" have been switched. + # parsnip 0.0.0.9003 * `regularization` was changed to `penalty` in a few models to be consistent with [this change](tidymodels/model-implementation-principles@08d3afd). -* if a mode is not chosen in the model specification, it is assigned at the time of fit. [51](https://github.com/topepo/parsnip/issues/51) +* If a mode is not chosen in the model specification, it is assigned at the time of fit. [51](https://github.com/topepo/parsnip/issues/51) * The underlying modeling packages now are loaded by namespace. There will be some exceptions noted in the documentation for each model. For example, in some `predict` methods, the `earth` package will need to be attached to be fully operational. # parsnip 0.0.0.9002 diff --git a/R/descriptors.R b/R/descriptors.R index 85444a776..9ff68f0df 100644 --- a/R/descriptors.R +++ b/R/descriptors.R @@ -1,21 +1,21 @@ #' @name descriptors -#' @aliases descriptors .n_obs .n_cols .n_preds .n_facts .n_levs .x .y .dat +#' @aliases descriptors .obs .cols .preds .facts .lvls .x .y .dat #' @title Data Set Characteristics Available when Fitting Models #' @description When using the `fit()` functions there are some #' variables that will be available for use in arguments. For #' example, if the user would like to choose an argument value -#' based on the current number of rows in a data set, the `.n_obs()` +#' based on the current number of rows in a data set, the `.obs()` #' function can be used. See Details below. #' @details #' Existing functions: #' \itemize{ -#' \item `.n_obs()`: The current number of rows in the data set. -#' \item `.n_cols()`: The number of columns in the data set that are +#' \item `.obs()`: The current number of rows in the data set. +#' \item `.preds()`: The number of columns in the data set that are +#' associated with the predictors prior to dummy variable creation. -#' \item `.n_preds()`: The number of predictors after dummy variables -#' are created (if any).
-#' \item `.n_facts()`: The number of factor predictors in the dat set. -#' \item `.n_levs()`: If the outcome is a factor, this is a table +#' \item `.cols()`: The number of predictor columns available after dummy +#' variables are created (if any). +#' \item `.facts()`: The number of factor predictors in the data set. +#' \item `.lvls()`: If the outcome is a factor, this is a table #' with the counts for each level (and `NA` otherwise). #' \item `.x()`: The predictors returned in the format given. Either a #' data frame or a matrix. @@ -29,26 +29,26 @@ #' For example, if you use the model formula `Sepal.Width ~ .` with the `iris` #' data, the values would be #' \preformatted{ -#' .n_cols() = 4 (the 4 columns in `iris`) -#' .n_preds() = 5 (3 numeric columns + 2 from Species dummy variables) -#' .n_obs() = 150 -#' .n_levs() = NA (no factor outcome) -#' .n_facts() = 1 (the Species predictor) -#' .y() = (Sepal.Width as a vector) -#' .x() = (The other 4 columns as a data frame) -#' .dat() = (The full data set) +#' .preds() = 4 (the 4 columns in `iris`) +#' .cols() = 5 (3 numeric columns + 2 from Species dummy variables) +#' .obs() = 150 +#' .lvls() = NA (no factor outcome) +#' .facts() = 1 (the Species predictor) +#' .y() = (Sepal.Width as a vector) +#' .x() = (The other 4 columns as a data frame) +#' .dat() = (The full data set) #' } #' #' If the formula `Species ~ .` where used: #' \preformatted{ -#' .n_cols() = 4 (the 4 numeric columns in `iris`) -#' .n_preds() = 4 (same) -#' .n_obs() = 150 -#' .n_levs() = c(setosa = 50, versicolor = 50, virginica = 50) -#' .n_facts() = 0 -#' .y() = (Species as a vector) -#' .x() = (The other 4 columns as a data frame) -#' .dat() = (The full data set) +#' .preds() = 4 (the 4 numeric columns in `iris`) +#' .cols() = 4 (same) +#' .obs() = 150 +#' .lvls() = c(setosa = 50, versicolor = 50, virginica = 50) +#' .facts() = 0 +#' .y() = (Species as a vector) +#' .x() = (The other 4 columns as a data frame) +#' .dat() = (The full data set) #' } #' #' To use these in a model fit, pass them to a model specification.
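To make the renaming above concrete, here is a minimal R sketch (assuming this development version of parsnip is loaded); the mapping comments summarize the definitions given in the documentation above, and the final call is the same specification used later in this file.

# How the old descriptor names map to the new functions; note that the
# definitions of "cols" and "preds" are switched, as described in NEWS.md.
#   .n_obs()   -> .obs()    current number of rows
#   .n_cols()  -> .preds()  predictor columns before dummy variables
#   .n_preds() -> .cols()   predictor columns after dummy variables
#   .n_levs()  -> .lvls()   outcome level counts (NA for non-factor outcomes)
#   .n_facts() -> .facts()  number of factor predictors
library(parsnip)

# The descriptor is stored as a quosure and is only evaluated when the data
# are available at fit time.
rand_forest(mode = "classification", mtry = .cols() - 2)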
@@ -60,7 +60,7 @@ #' #' data("lending_club") #' -#' rand_forest(mode = "classification", mtry = .n_cols() - 2) +#' rand_forest(mode = "classification", mtry = .cols() - 2) #' } #' #' When no descriptors are found, the computation of the descriptor values @@ -70,23 +70,23 @@ NULL #' @export #' @rdname descriptors -.n_cols <- function() descr_env$.n_cols() +.cols <- function() descr_env$.cols() #' @export #' @rdname descriptors -.n_preds <- function() descr_env$.n_preds() +.preds <- function() descr_env$.preds() #' @export #' @rdname descriptors -.n_obs <- function() descr_env$.n_obs() +.obs <- function() descr_env$.obs() #' @export #' @rdname descriptors -.n_levs <- function() descr_env$.n_levs() +.lvls <- function() descr_env$.lvls() #' @export #' @rdname descriptors -.n_facts <- function() descr_env$.n_facts() +.facts <- function() descr_env$.facts() #' @export #' @rdname descriptors @@ -116,24 +116,24 @@ get_descr_df <- function(formula, data) { tmp_dat <- convert_form_to_xy_fit(formula, data, indicators = FALSE) if(is.factor(tmp_dat$y)) { - .n_levs <- function() { + .lvls <- function() { table(tmp_dat$y, dnn = NULL) } - } else .n_levs <- function() { NA } + } else .lvls <- function() { NA } - .n_cols <- function() { + .preds <- function() { ncol(tmp_dat$x) } - .n_preds <- function() { + .cols <- function() { ncol(convert_form_to_xy_fit(formula, data, indicators = TRUE)$x) } - .n_obs <- function() { + .obs <- function() { nrow(data) } - .n_facts <- function() { + .facts <- function() { sum(vapply(tmp_dat$x, is.factor, logical(1))) } @@ -150,11 +150,11 @@ get_descr_df <- function(formula, data) { } list( - .n_cols = .n_cols, - .n_preds = .n_preds, - .n_obs = .n_obs, - .n_levs = .n_levs, - .n_facts = .n_facts, + .cols = .cols, + .preds = .preds, + .obs = .obs, + .lvls = .lvls, + .facts = .facts, .dat = .dat, .x = .x, .y = .y @@ -233,11 +233,11 @@ get_descr_spark <- function(formula, data) { obs <- dplyr::tally(data) %>% dplyr::pull() - .n_cols <- function() length(f_term_labels) - .n_preds <- function() all_preds - .n_obs <- function() obs - .n_levs <- function() y_vals - .n_facts <- function() factor_pred + .cols <- function() all_preds + .preds <- function() length(f_term_labels) + .obs <- function() obs + .lvls <- function() y_vals + .facts <- function() factor_pred .x <- function() abort("Descriptor `.x()` not defined for Spark.") .y <- function() abort("Descriptor `.y()` not defined for Spark.") .dat <- function() abort("Descriptor `.dat()` not defined for Spark.") @@ -245,11 +245,11 @@ get_descr_spark <- function(formula, data) { # still need .x(), .y(), .dat() ? 
list( - .n_cols = .n_cols, - .n_preds = .n_preds, - .n_obs = .n_obs, - .n_levs = .n_levs, - .n_facts = .n_facts, + .cols = .cols, + .preds = .preds, + .obs = .obs, + .lvls = .lvls, + .facts = .facts, .dat = .dat, .x = .x, .y = .y @@ -258,25 +258,25 @@ get_descr_spark <- function(formula, data) { get_descr_xy <- function(x, y) { - .n_levs <- if (is.factor(y)) { + .lvls <- if (is.factor(y)) { function() table(y, dnn = NULL) } else { function() NA } - .n_cols <- function() { + .cols <- function() { ncol(x) } - .n_preds <- function() { + .preds <- function() { ncol(x) } - .n_obs <- function() { + .obs <- function() { nrow(x) } - .n_facts <- function() { + .facts <- function() { if(is.data.frame(x)) sum(vapply(x, is.factor, logical(1))) else @@ -296,11 +296,11 @@ get_descr_xy <- function(x, y) { } list( - .n_cols = .n_cols, - .n_preds = .n_preds, - .n_obs = .n_obs, - .n_levs = .n_levs, - .n_facts = .n_facts, + .cols = .cols, + .preds = .preds, + .obs = .obs, + .lvls = .lvls, + .facts = .facts, .dat = .dat, .x = .x, .y = .y @@ -363,11 +363,11 @@ has_any_descrs <- function(x) { is_descr <- function(x) { descrs <- list( - ".n_cols", - ".n_preds", - ".n_obs", - ".n_levs", - ".n_facts", + ".cols", + ".preds", + ".obs", + ".lvls", + ".facts", ".x", ".y", ".dat" @@ -378,7 +378,7 @@ is_descr <- function(x) { # Helpers for overwriting descriptors temporarily ------------------------------ -# descrs = list of functions that actually eval to .n_cols() +# descrs = list of functions that actually eval to .cols() poke_descrs <- function(descrs) { descr_names <- names(descr_env) @@ -414,13 +414,13 @@ scoped_descrs <- function(descrs, frame = caller_env()) { # with their actual implementations descr_env <- rlang::new_environment( data = list( - .n_cols = function() abort("Descriptor context not set"), - .n_preds = function() abort("Descriptor context not set"), - .n_obs = function() abort("Descriptor context not set"), - .n_levs = function() abort("Descriptor context not set"), - .n_facts = function() abort("Descriptor context not set"), - .x = function() abort("Descriptor context not set"), - .y = function() abort("Descriptor context not set"), - .dat = function() abort("Descriptor context not set") + .cols = function() abort("Descriptor context not set"), + .preds = function() abort("Descriptor context not set"), + .obs = function() abort("Descriptor context not set"), + .lvls = function() abort("Descriptor context not set"), + .facts = function() abort("Descriptor context not set"), + .x = function() abort("Descriptor context not set"), + .y = function() abort("Descriptor context not set"), + .dat = function() abort("Descriptor context not set") ) ) diff --git a/R/model_object_docs.R b/R/model_object_docs.R index 308ce15c3..ed563f788 100644 --- a/R/model_object_docs.R +++ b/R/model_object_docs.R @@ -1,73 +1,186 @@ #' Model Specification Information -#' -#' +#' +#' #' An object with class "model_spec" is a container for #' information about a model that will be fit. -#' +#' #' The main elements of the object are: -#' -#' * `args`: A vector of the main arguments for the model. The +#' +#' * `args`: A vector of the main arguments for the model. The #' names of these arguments may be different form their #' counterparts n the underlying model function. For example, for a #' `glmnet` model, the argument name for the amount of the penalty -#' is called "penalty" instead of "lambda" to make it more -#' general and usable across different types of models (and to not -#' be specific to a particular model function). 
The elements of -#' `args` can be quoted expressions or `varying()`. If left to -#' their defaults (`NULL`), the arguments will use the underlying -#' model functions default value. -#' -#' * `other`: An optional vector of model-function-specific -#' parameters. As with `args`, these can also be quoted or +#' is called "penalty" instead of "lambda" to make it more general +#' and usable across different types of models (and to not be +#' specific to a particular model function). The elements of `args` +#' can be `varying()`. If left to their defaults (`NULL`), the +#' arguments will use the underlying model function's default value. +#' As discussed below, the arguments in `args` are captured as +#' quosures and are not immediately executed. +#' +#' * `...`: Optional model-function-specific +#' parameters. As with `args`, these will be quosures and can be #' `varying()`. -#' +#' #' * `mode`: The type of model, such as "regression" or #' "classification". Other modes will be added once the package #' adds more functionality. - -#' +#' #' * `method`: This is a slot that is filled in later by the #' model's constructor function. It generally contains lists of #' information that are used to create the fit and prediction code #' as well as required packages and similar data. -#' +#' #' * `engine`: This character string declares exactly what #' software will be used. It can be a package name or a technology #' type. -#' +#' #' This class and structure is the basis for how \pkg{parsnip} #' stores model objects prior to seeing the data. -#' @rdname model_spec +#' +#' @section Argument Details: +#' +#' An important detail to understand when creating model +#' specifications is that they are intended to be functionally +#' independent of the data. While it is true that some tuning +#' parameters are _data dependent_, the model specification does +#' not interact with the data at all. +#' +#' Most R functions immediately evaluate their +#' arguments. For example, when calling `mean(dat_vec)`, the object +#' `dat_vec` is immediately evaluated inside of the function. +#' +#' `parsnip` model functions do not do this. For example, using +#' +#'\preformatted{ +#' rand_forest(mtry = ncol(iris) - 1) +#' } +#' +#' **does not** execute `ncol(iris) - 1` when creating the specification. +#' This can be seen in the output: +#' +#'\preformatted{ +#' > rand_forest(mtry = ncol(iris) - 1) +#' Random Forest Model Specification (unknown) +#' +#' Main Arguments: +#' mtry = ncol(iris) - 1 +#'} +#' +#' The model functions save the argument _expressions_ and their +#' associated environments (a.k.a. quosures) to be evaluated later +#' when either [fit()] or [fit_xy()] are called with the actual +#' data. +#' +#' The consequence of this strategy is that any data required to +#' get the parameter values must be available when the model is +#' fit. There are two main ways that this can fail: +#' +#' \enumerate{ +#' \item The data have been modified between the creation of the +#' model specification and when the model fit function is invoked. +#' +#' \item The model specification is saved and loaded into a new +#' session where those same data objects do not exist. +#' } +#' +#' The best way to avoid these issues is to not reference any data +#' objects in the global environment but to use data descriptors +#' such as `.cols()`.
Another way of writing the previous +#' specification is +#' +#'\preformatted{ +#' rand_forest(mtry = .cols() - 1) +#' } +#' +#' This is not dependent on any specific data object and +#' is evaluated immediately before the model fitting process begins. +#' +#' One less advantageous approach to solving this issue is to use +#' quasiquotation. This would insert the actual R object into the +#' model specification and might be the best idea when the data +#' object is small. For example, using +#' +#'\preformatted{ +#' rand_forest(mtry = ncol(!!iris) - 1) +#' } +#' +#' would work (and be reproducible between sessions) but embeds +#' the entire iris data set into the `mtry` expression: +#' +#'\preformatted{ +#' > rand_forest(mtry = ncol(!!iris) - 1) +#' Random Forest Model Specification (unknown) +#' +#' Main Arguments: +#' mtry = ncol(structure(list(Sepal.Length = c(5.1, 4.9, 4.7, 4.6, 5, +#'} +#' +#' However, if there were an object with the number of columns in +#' it, this wouldn't be too bad: +#' +#'\preformatted{ +#' > mtry_val <- ncol(iris) - 1 +#' > mtry_val +#' [1] 4 +#' > rand_forest(mtry = !!mtry_val) +#' Random Forest Model Specification (unknown) +#' +#' Main Arguments: +#' mtry = 4 +#'} +#' +#' More information on quosures and quasiquotation can be found at +#' \url{https://tidyeval.tidyverse.org}. +#' +#' @rdname model_spec #' @name model_spec NULL #' Model Fit Object Information -#' -#' +#' +#' #' An object with class "model_fit" is a container for #' information about a model that has been fit to the data. -#' +#' #' The main elements of the object are: -#' -#' * `lvl`: A vector of factor levels when the outcome is +#' +#' * `lvl`: A vector of factor levels when the outcome #' is a factor. This is `NULL` when the outcome is not a factor -#' vector. -#' +#' vector. +#' #' * `spec`: A `model_spec` object. -#' +#' #' * `fit`: The object produced by the fitting function. -#' +#' #' * `preproc`: This contains any data-specific information #' required to process new a sample point for prediction. For #' example, if the underlying model function requires arguments `x` #' and `y` and the user passed a formula to `fit`, the `preproc` #' object would contain items such as the terms object and so on. #' When no information is required, this is `NA`. -#' -#' +#' +#' As discussed in the documentation for [`model_spec`], the +#' original arguments to the specification are saved as quosures. +#' These are evaluated for the `model_fit` object prior to fitting. +#' If the resulting model object prints its call, any user-defined +#' options are shown in the call preceded by a tilde (see the +#' example below). This is a result of the use of quosures in the +#' specification. +#' #' This class and structure is the basis for how \pkg{parsnip} #' stores model objects after to seeing the data and applying a model. -#' @rdname model_fit +#' @rdname model_fit #' @name model_fit +#' @examples +#' +#' # Keep the `x` matrix if the data are not too big.
+#' spec_obj <- linear_reg(x = ifelse(.obs() < 500, TRUE, FALSE)) +#' spec_obj +#' +#' fit_obj <- fit(spec_obj, mpg ~ ., data = mtcars, engine = "lm") +#' fit_obj +#' +#' nrow(fit_obj$fit$x) NULL diff --git a/README.md b/README.md index f31c44574..4905606f3 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,4 @@ -# parsnip - [![Travis build status](https://travis-ci.org/topepo/parsnip.svg?branch=master)](https://travis-ci.org/topepo/parsnip) [![Coverage status](https://codecov.io/gh/topepo/parsnip/branch/master/graph/badge.svg)](https://codecov.io/github/topepo/parsnip?branch=master) ![](https://img.shields.io/badge/lifecycle-experimental-orange.svg) diff --git a/_pkgdown.yml b/_pkgdown.yml index aaae7fa7a..054c37756 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -1,6 +1,8 @@ template: package: tidytemplate - default_assets: false + params: + part_of: tidymodels + footer: parsnip is a part of the tidymodels ecosystem, a collection of modeling packages designed with common APIs and a shared philosophy. # https://github.com/tidyverse/tidytemplate for css diff --git a/docs/articles/articles/Classification.html b/docs/articles/articles/Classification.html index 9b3e6090c..4728624de 100644 --- a/docs/articles/articles/Classification.html +++ b/docs/articles/articles/Classification.html @@ -1,5 +1,5 @@ - + @@ -8,18 +8,29 @@ Classification Example • parsnip - - + + + + + +
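The `others`-to-`...` change noted in NEWS.md and documented for `model_spec` can be sketched as follows; this is a hedged example in which `x` and `model` are arguments of stats::lm() (not parsnip main arguments) passed through the dots, mirroring the `linear_reg()`/`fit()` example added above.

library(parsnip)

# Engine-specific options are now supplied directly in the call and captured
# as quosures along with the main arguments (previously: others = list(...)).
spec <- linear_reg(x = TRUE, model = FALSE)

# The quosures are evaluated when fit() sees the data; if the underlying lm
# object prints its call, the user-supplied options appear preceded by a tilde.
fit(spec, mpg ~ ., data = mtcars, engine = "lm")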
diff --git a/docs/articles/articles/Models.html b/docs/articles/articles/Models.html index f65b97003..d93d853e0 100644 --- a/docs/articles/articles/Models.html +++ b/docs/articles/articles/Models.html @@ -1,5 +1,5 @@ - + @@ -8,18 +8,29 @@ List of Models • parsnip - - + + + + + +
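Finally, a minimal end-to-end sketch of how the renamed descriptors and the quosure capture work together at fit time; the `ranger` engine named here is an assumption (any random forest engine supported by this parsnip version would do).

library(parsnip)

# No data are touched yet: the mtry expression is captured as a quosure when
# the specification is created.
spec <- rand_forest(mode = "regression", mtry = .cols() - 1)
spec

# When fit() is called, .cols() evaluates against the data: mpg ~ . on mtcars
# gives 10 predictor columns, so mtry resolves to 9.
fit(spec, mpg ~ ., data = mtcars, engine = "ranger")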