From d18f581ab7d763279c3fa5646b111d9075b7b2c0 Mon Sep 17 00:00:00 2001 From: qiushi Date: Wed, 15 Jun 2022 17:53:41 -0500 Subject: [PATCH 1/6] add auto_ml --- NAMESPACE | 1 + R/auto_ml.R | 32 +++++++++++++ R/auto_ml_h2o.R | 12 +++++ R/print.R | 1 + inst/models.tsv | 2 + man/auto_ml.Rd | 33 ++++++++++++++ man/details_auto_ml_h2o.Rd | 92 ++++++++++++++++++++++++++++++++++++++ man/rmd/auto_ml_h2o.Rmd | 45 +++++++++++++++++++ man/rmd/auto_ml_h2o.md | 70 +++++++++++++++++++++++++++++ 9 files changed, 288 insertions(+) create mode 100644 R/auto_ml.R create mode 100644 R/auto_ml_h2o.R create mode 100644 man/auto_ml.Rd create mode 100644 man/details_auto_ml_h2o.Rd create mode 100644 man/rmd/auto_ml_h2o.Rmd create mode 100644 man/rmd/auto_ml_h2o.md diff --git a/NAMESPACE b/NAMESPACE index 4bc035a24..95291f6ef 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -137,6 +137,7 @@ export(C5.0_train) export(C5_rules) export(add_rowindex) export(augment) +export(auto_ml) export(autoplot) export(bag_mars) export(bag_tree) diff --git a/R/auto_ml.R b/R/auto_ml.R new file mode 100644 index 000000000..65ee06223 --- /dev/null +++ b/R/auto_ml.R @@ -0,0 +1,32 @@ +#' Automatic Machine Learning +#' +#' @description +#' +#' `auto_ml` defines an automated searching and tuning process where +#' many models of different families are trained and ranked given their +#' performance on the training data. +#' +#' \Sexpr[stage=render,results=rd]{parsnip:::make_engine_list("auto_ml")} +#' +#' More information on how \pkg{parsnip} is used for modeling is at +#' \url{https://www.tidymodels.org/}. 
+#' +#' +#' @template spec-details +#' +#' @template spec-references +#' +#' @seealso \Sexpr[stage=render,results=rd]{parsnip:::make_seealso_list("auto_ml")} +#' @export +auto_ml <- function(mode = "unknown", engine = "h2o") { + args <- list() + out <- list(args = args, eng_args = NULL, + mode = mode, method = NULL, engine = NULL) + class(out) <- make_classes("auto_ml") + out +} + +# ------------------------------------------------------------------------------ +set_new_model("auto_ml") +set_model_mode("auto_ml", "regression") +set_model_mode("auto_ml", "classification") diff --git a/R/auto_ml_h2o.R b/R/auto_ml_h2o.R new file mode 100644 index 000000000..3ac860244 --- /dev/null +++ b/R/auto_ml_h2o.R @@ -0,0 +1,12 @@ +#' Automatic machine learning via h2o +#' +#' [h2o::h2o.automl] defines an automated model training process and returns a +#' leaderboard of models with best performances. +#' +#' @includeRmd man/rmd/auto_ml_h2o.md details +#' +#' @name details_auto_ml_h2o +#' @keywords internal +NULL + +# See inst/README-DOCS.md for a description of how these files are processed diff --git a/R/print.R b/R/print.R index 005ed0bfa..ff6236f97 100644 --- a/R/print.R +++ b/R/print.R @@ -32,6 +32,7 @@ get_model_desc <- function(cls) { model_descs <- tibble::tribble( ~cls, ~desc, + "auto_ml", "Automatic Machine Learning", "bag_mars", "Bagged MARS", "bag_tree", "Bagged Decision Tree", "bart", "BART", diff --git a/inst/models.tsv b/inst/models.tsv index 402febbe4..e823ac58c 100644 --- a/inst/models.tsv +++ b/inst/models.tsv @@ -1,4 +1,6 @@ "model" "mode" "engine" "pkg" +"auto_ml" "classification" "h2o" "agua" +"auto_ml" "regression" "h2o" "agua" "bag_mars" "classification" "earth" "baguette" "bag_mars" "regression" "earth" "baguette" "bag_tree" "censored regression" "rpart" "censored" diff --git a/man/auto_ml.Rd b/man/auto_ml.Rd new file mode 100644 index 000000000..0bd7d18ef --- /dev/null +++ b/man/auto_ml.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% 
Please edit documentation in R/auto_ml.R +\name{auto_ml} +\alias{auto_ml} +\title{Automatic Machine Learning} +\usage{ +auto_ml(mode = "unknown", engine = "h2o") +} +\description{ +\code{auto_ml} defines an automated searching and tuning process where +many models of different families are trained and ranked given their +performance on the training data. + +\Sexpr[stage=render,results=rd]{parsnip:::make_engine_list("auto_ml")} + +More information on how \pkg{parsnip} is used for modeling is at +\url{https://www.tidymodels.org/}. +} +\details{ +This function only defines what \emph{type} of model is being fit. Once an engine +is specified, the \emph{method} to fit the model is also defined. See +\code{\link[=set_engine]{set_engine()}} for more on setting the engine, including how to set engine +arguments. + +The model is not trained or fit until the \code{\link[=fit.model_spec]{fit()}} function is used +with the data. +} +\references{ +\url{https://www.tidymodels.org}, \href{https://www.tmwr.org/}{\emph{Tidy Modeling with R}}, \href{https://www.tidymodels.org/find/parsnip/}{searchable table of parsnip models} +} +\seealso{ +\Sexpr[stage=render,results=rd]{parsnip:::make_seealso_list("auto_ml")} +} diff --git a/man/details_auto_ml_h2o.Rd b/man/details_auto_ml_h2o.Rd new file mode 100644 index 000000000..ec43558ea --- /dev/null +++ b/man/details_auto_ml_h2o.Rd @@ -0,0 +1,92 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/auto_ml_h2o.R +\name{details_auto_ml_h2o} +\alias{details_auto_ml_h2o} +\title{Automatic machine learning via h2o} +\description{ +\link[h2o:h2o.automl]{h2o::h2o.automl} defines an automated model training process and returns a +leaderboard of models with best performances. +} +\details{ +For this engine, there are multiple modes: classification and regression +\subsection{Tuning Parameters}{ + +This model has no tuning parameters. 
+ +Engine arguments of interest +\itemize{ +\item \code{max_runtime_secs} and \code{max_models}: controls the maximum running +time and number of models to build in the automatic process. +\item \code{exclude_algos} and \code{include_algos}: a character vector indicating +the excluded or included algorithms during model building. To see a +full list of supported models, see the details section in +\code{\link[h2o:h2o.automl]{h2o::h2o.automl()}}. +} +} + +\subsection{Translation from parsnip to the original package (regression)}{ + +\code{\link[agua:h2o_train]{agua::h2o_train_auto()}} is a wrapper around +\code{\link[h2o:h2o.automl]{h2o::h2o.automl()}}. + +\if{html}{\out{
<div class="sourceCode r">}}\preformatted{auto_ml() \%>\% + set_engine("h2o") \%>\% + set_mode("regression") \%>\% + translate() +}\if{html}{\out{</div>}} + +\if{html}{\out{<div class="sourceCode">}}\preformatted{## Automatic Machine Learning Model Specification (regression) +## +## Computational engine: h2o +## +## Model fit template: +## agua::h2o_train_auto(x = missing_arg(), y = missing_arg(), weights = missing_arg()) +}\if{html}{\out{</div>}} +} + +\subsection{Translation from parsnip to the original package (classification)}{ + +\if{html}{\out{<div class="sourceCode r">}}\preformatted{auto_ml() \%>\% + set_engine("h2o") \%>\% + set_mode("classification") \%>\% + translate() +}\if{html}{\out{</div>}} + +\if{html}{\out{<div class="sourceCode">}}\preformatted{## Automatic Machine Learning Model Specification (classification) +## +## Computational engine: h2o +## +## Model fit template: +## agua::h2o_train_auto(x = missing_arg(), y = missing_arg(), weights = missing_arg(), +## verbosity = NULL) +}\if{html}{\out{</div>
}} +} + +\subsection{Preprocessing requirements}{ + +Factor/categorical predictors need to be converted to numeric values +(e.g., dummy or indicator variables) for this engine. When using the +formula method via \code{\link[=fit.model_spec]{fit()}}, parsnip +will convert factor columns to indicators. +} + +\subsection{Initializing h2o}{ + +To use the h2o engine with tidymodels, please run \code{h2o::h2o.init()} +first. By default, this connects R to the local h2o server. This needs +to be done in every new R session. You can also connect to a remote h2o +server with an IP address; for more details, see +\code{\link[h2o:h2o.init]{h2o::h2o.init()}}. + +You can control the number of threads in the thread pool used by h2o +with the \code{nthreads} argument. By default, it uses all CPUs on the host. +This is different from the usual parallel processing mechanism in +tidymodels for tuning: while tidymodels parallelizes over resamples, h2o +parallelizes over hyperparameter combinations for a given resample. + +h2o will automatically shut down the local h2o instance started by R +when R is terminated. To manually stop the h2o server, run +\code{h2o::h2o.shutdown()}. +} +} +\keyword{internal} diff --git a/man/rmd/auto_ml_h2o.Rmd b/man/rmd/auto_ml_h2o.Rmd new file mode 100644 index 000000000..43965c75d --- /dev/null +++ b/man/rmd/auto_ml_h2o.Rmd @@ -0,0 +1,45 @@ +```{r, child = "aaa.Rmd", include = FALSE} +``` + +`r descr_models("auto_ml", "h2o")` + +## Tuning Parameters + +This model has no tuning parameters. + +Engine arguments of interest + +- `max_runtime_secs` and `max_models`: controls the maximum running time and number of models to build in the automatic process. + +- `exclude_algos` and `include_algos`: a character vector indicating the excluded or included algorithms during model building. To see a full list of supported models, see the details section in [h2o::h2o.automl()]. 
+ +## Translation from parsnip to the original package (regression) + +[agua::h2o_train_auto()] is a wrapper around [h2o::h2o.automl()]. + +```{r h2o-reg} +auto_ml() %>% + set_engine("h2o") %>% + set_mode("regression") %>% + translate() +``` + + +## Translation from parsnip to the original package (classification) + +```{r h2o-cls} +auto_ml() %>% + set_engine("h2o") %>% + set_mode("classification") %>% + translate() +``` + +## Preprocessing requirements + +```{r child = "template-makes-dummies.Rmd"} +``` + +## Initializing h2o + +```{r child = "template-h2o-init.Rmd"} +``` diff --git a/man/rmd/auto_ml_h2o.md b/man/rmd/auto_ml_h2o.md new file mode 100644 index 000000000..f7ff63269 --- /dev/null +++ b/man/rmd/auto_ml_h2o.md @@ -0,0 +1,70 @@ + + + +For this engine, there are multiple modes: classification and regression + +## Tuning Parameters + +This model has no tuning parameters. + +Engine arguments of interest + +- `max_runtime_secs` and `max_models`: controls the maximum running time and number of models to build in the automatic process. + +- `exclude_algos` and `include_algos`: a character vector indicating the excluded or included algorithms during model building. To see a full list of supported models, see the details section in [h2o::h2o.automl()]. + +## Translation from parsnip to the original package (regression) + +[agua::h2o_train_auto()] is a wrapper around [h2o::h2o.automl()]. 
+ + +```r +auto_ml() %>% + set_engine("h2o") %>% + set_mode("regression") %>% + translate() +``` + +``` +## Automatic Machine Learning Model Specification (regression) +## +## Computational engine: h2o +## +## Model fit template: +## agua::h2o_train_auto(x = missing_arg(), y = missing_arg(), weights = missing_arg()) +``` + + +## Translation from parsnip to the original package (classification) + + +```r +auto_ml() %>% + set_engine("h2o") %>% + set_mode("classification") %>% + translate() +``` + +``` +## Automatic Machine Learning Model Specification (classification) +## +## Computational engine: h2o +## +## Model fit template: +## agua::h2o_train_auto(x = missing_arg(), y = missing_arg(), weights = missing_arg(), +## verbosity = NULL) +``` + +## Preprocessing requirements + + +Factor/categorical predictors need to be converted to numeric values (e.g., dummy or indicator variables) for this engine. When using the formula method via \\code{\\link[=fit.model_spec]{fit()}}, parsnip will convert factor columns to indicators. + +## Initializing h2o + + +To use the h2o engine with tidymodels, please run `h2o::h2o.init()` first. By default, this connects R to the local h2o server. This needs to be done in every new R session. You can also connect to a remote h2o server with an IP address; for more details, see [h2o::h2o.init()]. + +You can control the number of threads in the thread pool used by h2o with the `nthreads` argument. By default, it uses all CPUs on the host. This is different from the usual parallel processing mechanism in tidymodels for tuning: while tidymodels parallelizes over resamples, h2o parallelizes over hyperparameter combinations for a given resample. + +h2o will automatically shut down the local h2o instance started by R when R is terminated. To manually stop the h2o server, run `h2o::h2o.shutdown()`. 
From 8bc0e123328aeee8f9219826f78ca6b1d2aeb62f Mon Sep 17 00:00:00 2001 From: qiushi Date: Wed, 15 Jun 2022 20:21:30 -0500 Subject: [PATCH 2/6] use default engine --- R/auto_ml.R | 9 +++++++-- man/auto_ml.Rd | 10 +++++++++- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/R/auto_ml.R b/R/auto_ml.R index 65ee06223..acad136b9 100644 --- a/R/auto_ml.R +++ b/R/auto_ml.R @@ -2,7 +2,7 @@ #' #' @description #' -#' `auto_ml` defines an automated searching and tuning process where +#' `auto_ml()` defines an automated searching and tuning process where #' many models of different families are trained and ranked given their #' performance on the training data. #' @@ -11,6 +11,11 @@ #' More information on how \pkg{parsnip} is used for modeling is at #' \url{https://www.tidymodels.org/}. #' +#' @param mode A single character string for the prediction outcome mode. +#' Possible values for this model are "unknown", "regression", or +#' "classification". +#' @param engine A single character string specifying what computational engine +#' to use for fitting. #' #' @template spec-details #' @@ -21,7 +26,7 @@ auto_ml <- function(mode = "unknown", engine = "h2o") { args <- list() out <- list(args = args, eng_args = NULL, - mode = mode, method = NULL, engine = NULL) + mode = mode, method = NULL, engine = engine) class(out) <- make_classes("auto_ml") out } diff --git a/man/auto_ml.Rd b/man/auto_ml.Rd index 0bd7d18ef..e3380bbf9 100644 --- a/man/auto_ml.Rd +++ b/man/auto_ml.Rd @@ -6,8 +6,16 @@ \usage{ auto_ml(mode = "unknown", engine = "h2o") } +\arguments{ +\item{mode}{A single character string for the prediction outcome mode. 
+Possible values for this model are "unknown", "regression", or +"classification".} + +\item{engine}{A single character string specifying what computational engine +to use for fitting.} +} \description{ -\code{auto_ml} defines an automated searching and tuning process where +\code{auto_ml()} defines an automated searching and tuning process where many models of different families are trained and ranked given their performance on the training data. From bfb8806b61068c5365c0562b1fc9cef32b4b3395 Mon Sep 17 00:00:00 2001 From: topepo Date: Thu, 16 Jun 2022 07:52:43 -0400 Subject: [PATCH 3/6] missing pkgdown entry --- _pkgdown.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/_pkgdown.yml b/_pkgdown.yml index 648156d09..edf40da23 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -33,6 +33,7 @@ figures: reference: - title: Models contents: + - automl - bag_mars - bag_tree - bart From a4a01d5c5f70f881cc6b6a80bd7d9044c18f3812 Mon Sep 17 00:00:00 2001 From: qiushi Date: Wed, 22 Jun 2022 12:30:57 -0500 Subject: [PATCH 4/6] automl -> auto_ml --- _pkgdown.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_pkgdown.yml b/_pkgdown.yml index edf40da23..8e36a6d1d 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -33,7 +33,7 @@ figures: reference: - title: Models contents: - - automl + - auto_ml - bag_mars - bag_tree - bart From f217761bfe86c18f94afe94bb2222e2b3f55060c Mon Sep 17 00:00:00 2001 From: qiushi Date: Wed, 22 Jun 2022 14:08:31 -0500 Subject: [PATCH 5/6] validation parameter --- man/details_auto_ml_h2o.Rd | 8 ++++++-- man/rmd/auto_ml_h2o.Rmd | 4 +++- man/rmd/auto_ml_h2o.md | 9 ++++++--- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/man/details_auto_ml_h2o.Rd b/man/details_auto_ml_h2o.Rd index ec43558ea..b2aea1ab4 100644 --- a/man/details_auto_ml_h2o.Rd +++ b/man/details_auto_ml_h2o.Rd @@ -21,6 +21,9 @@ time and number of models to build in the automatic process. the excluded or included algorithms during model building. 
To see a full list of supported models, see the details section in \code{\link[h2o:h2o.automl]{h2o::h2o.automl()}}. +\item \code{validation}: A number between 0 and 1 specifying the \emph{proportion} +of training data reserved as validation set. This is used by h2o for +performance assessment and potential early stopping. For } } @@ -40,7 +43,8 @@ full list of supported models, see the details section in ## Computational engine: h2o ## ## Model fit template: -## agua::h2o_train_auto(x = missing_arg(), y = missing_arg(), weights = missing_arg()) +## agua::h2o_train_auto(x = missing_arg(), y = missing_arg(), weights = missing_arg(), +## validation_frame = missing_arg(), verbosity = NULL) }\if{html}{\out{}} } @@ -58,7 +62,7 @@ full list of supported models, see the details section in ## ## Model fit template: ## agua::h2o_train_auto(x = missing_arg(), y = missing_arg(), weights = missing_arg(), -## verbosity = NULL) +## validation_frame = missing_arg(), verbosity = NULL) }\if{html}{\out{}} } diff --git a/man/rmd/auto_ml_h2o.Rmd b/man/rmd/auto_ml_h2o.Rmd index 43965c75d..ece7ca2a8 100644 --- a/man/rmd/auto_ml_h2o.Rmd +++ b/man/rmd/auto_ml_h2o.Rmd @@ -11,7 +11,9 @@ Engine arguments of interest - `max_runtime_secs` and `max_models`: controls the maximum running time and number of models to build in the automatic process. -- `exclude_algos` and `include_algos`: a character vector indicating the excluded or included algorithms during model building. To see a full list of supported models, see the details section in [h2o::h2o.automl()]. +- `exclude_algos` and `include_algos`: a character vector indicating the excluded or included algorithms during model building. To see a full list of supported models, see the details section in [h2o::h2o.automl()]. + +- `validation`: A number between 0 and 1 specifying the _proportion_ of training data reserved as validation set. This is used by h2o for performance assessment and potential early stopping. 
For ## Translation from parsnip to the original package (regression) diff --git a/man/rmd/auto_ml_h2o.md b/man/rmd/auto_ml_h2o.md index f7ff63269..a432d9823 100644 --- a/man/rmd/auto_ml_h2o.md +++ b/man/rmd/auto_ml_h2o.md @@ -11,7 +11,9 @@ Engine arguments of interest - `max_runtime_secs` and `max_models`: controls the maximum running time and number of models to build in the automatic process. -- `exclude_algos` and `include_algos`: a character vector indicating the excluded or included algorithms during model building. To see a full list of supported models, see the details section in [h2o::h2o.automl()]. +- `exclude_algos` and `include_algos`: a character vector indicating the excluded or included algorithms during model building. To see a full list of supported models, see the details section in [h2o::h2o.automl()]. + +- `validation`: A number between 0 and 1 specifying the _proportion_ of training data reserved as validation set. This is used by h2o for performance assessment and potential early stopping. 
For ## Translation from parsnip to the original package (regression) @@ -31,7 +33,8 @@ auto_ml() %>% ## Computational engine: h2o ## ## Model fit template: -## agua::h2o_train_auto(x = missing_arg(), y = missing_arg(), weights = missing_arg()) +## agua::h2o_train_auto(x = missing_arg(), y = missing_arg(), weights = missing_arg(), +## validation_frame = missing_arg(), verbosity = NULL) ``` @@ -52,7 +55,7 @@ auto_ml() %>% ## ## Model fit template: ## agua::h2o_train_auto(x = missing_arg(), y = missing_arg(), weights = missing_arg(), -## verbosity = NULL) +## validation_frame = missing_arg(), verbosity = NULL) ``` ## Preprocessing requirements From 7d9417c42d907c9226c5933c331d12b1d1c28196 Mon Sep 17 00:00:00 2001 From: qiushi Date: Wed, 22 Jun 2022 14:12:09 -0500 Subject: [PATCH 6/6] typo --- man/details_auto_ml_h2o.Rd | 2 +- man/rmd/auto_ml_h2o.Rmd | 2 +- man/rmd/auto_ml_h2o.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/man/details_auto_ml_h2o.Rd b/man/details_auto_ml_h2o.Rd index b2aea1ab4..4c21dad53 100644 --- a/man/details_auto_ml_h2o.Rd +++ b/man/details_auto_ml_h2o.Rd @@ -23,7 +23,7 @@ full list of supported models, see the details section in \code{\link[h2o:h2o.automl]{h2o::h2o.automl()}}. \item \code{validation}: A number between 0 and 1 specifying the \emph{proportion} of training data reserved as validation set. This is used by h2o for -performance assessment and potential early stopping. For +performance assessment and potential early stopping. } } diff --git a/man/rmd/auto_ml_h2o.Rmd b/man/rmd/auto_ml_h2o.Rmd index ece7ca2a8..cf987b668 100644 --- a/man/rmd/auto_ml_h2o.Rmd +++ b/man/rmd/auto_ml_h2o.Rmd @@ -13,7 +13,7 @@ Engine arguments of interest - `exclude_algos` and `include_algos`: a character vector indicating the excluded or included algorithms during model building. To see a full list of supported models, see the details section in [h2o::h2o.automl()]. 
-- `validation`: A number between 0 and 1 specifying the _proportion_ of training data reserved as validation set. This is used by h2o for performance assessment and potential early stopping. For +- `validation`: A number between 0 and 1 specifying the _proportion_ of training data reserved as validation set. This is used by h2o for performance assessment and potential early stopping. ## Translation from parsnip to the original package (regression) diff --git a/man/rmd/auto_ml_h2o.md b/man/rmd/auto_ml_h2o.md index a432d9823..2eae961b6 100644 --- a/man/rmd/auto_ml_h2o.md +++ b/man/rmd/auto_ml_h2o.md @@ -13,7 +13,7 @@ Engine arguments of interest - `exclude_algos` and `include_algos`: a character vector indicating the excluded or included algorithms during model building. To see a full list of supported models, see the details section in [h2o::h2o.automl()]. -- `validation`: A number between 0 and 1 specifying the _proportion_ of training data reserved as validation set. This is used by h2o for performance assessment and potential early stopping. For +- `validation`: A number between 0 and 1 specifying the _proportion_ of training data reserved as validation set. This is used by h2o for performance assessment and potential early stopping. ## Translation from parsnip to the original package (regression)