diff --git a/NAMESPACE b/NAMESPACE index 4bc035a24..95291f6ef 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -137,6 +137,7 @@ export(C5.0_train) export(C5_rules) export(add_rowindex) export(augment) +export(auto_ml) export(autoplot) export(bag_mars) export(bag_tree) diff --git a/R/auto_ml.R b/R/auto_ml.R new file mode 100644 index 000000000..acad136b9 --- /dev/null +++ b/R/auto_ml.R @@ -0,0 +1,37 @@ +#' Automatic Machine Learning +#' +#' @description +#' +#' `auto_ml()` defines an automated searching and tuning process where +#' many models of different families are trained and ranked given their +#' performance on the training data. +#' +#' \Sexpr[stage=render,results=rd]{parsnip:::make_engine_list("auto_ml")} +#' +#' More information on how \pkg{parsnip} is used for modeling is at +#' \url{https://www.tidymodels.org/}. +#' +#' @param mode A single character string for the prediction outcome mode. +#' Possible values for this model are "unknown", "regression", or +#' "classification". +#' @param engine A single character string specifying what computational engine +#' to use for fitting. +#' +#' @template spec-details +#' +#' @template spec-references +#' +#' @seealso \Sexpr[stage=render,results=rd]{parsnip:::make_seealso_list("auto_ml")} +#' @export +auto_ml <- function(mode = "unknown", engine = "h2o") { # mode and engine are stored as given; nothing is validated in this constructor + args <- list() # auto_ml() exposes no main model arguments, so this list stays empty + out <- list(args = args, eng_args = NULL, + mode = mode, method = NULL, engine = engine) + class(out) <- make_classes("auto_ml") # class vector is supplied by make_classes() + out # return the assembled specification object +} + +# ------------------------------------------------------------------------------ +set_new_model("auto_ml") # NOTE(review): presumably registers "auto_ml" with parsnip's model registry; confirm against set_new_model() +set_model_mode("auto_ml", "regression") +set_model_mode("auto_ml", "classification") diff --git a/R/auto_ml_h2o.R b/R/auto_ml_h2o.R new file mode 100644 index 000000000..3ac860244 --- /dev/null +++ b/R/auto_ml_h2o.R @@ -0,0 +1,12 @@ +#' Automatic machine learning via h2o +#' +#' [h2o::h2o.automl] defines an automated model training process and returns a +#' leaderboard of models with best performances.
+#' +#' @includeRmd man/rmd/auto_ml_h2o.md details +#' +#' @name details_auto_ml_h2o +#' @keywords internal +NULL + +# See inst/README-DOCS.md for a description of how these files are processed diff --git a/R/print.R b/R/print.R index 005ed0bfa..ff6236f97 100644 --- a/R/print.R +++ b/R/print.R @@ -32,6 +32,7 @@ get_model_desc <- function(cls) { model_descs <- tibble::tribble( ~cls, ~desc, + "auto_ml", "Automatic Machine Learning", "bag_mars", "Bagged MARS", "bag_tree", "Bagged Decision Tree", "bart", "BART", diff --git a/_pkgdown.yml b/_pkgdown.yml index 648156d09..8e36a6d1d 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -33,6 +33,7 @@ figures: reference: - title: Models contents: + - auto_ml - bag_mars - bag_tree - bart diff --git a/inst/models.tsv b/inst/models.tsv index 402febbe4..e823ac58c 100644 --- a/inst/models.tsv +++ b/inst/models.tsv @@ -1,4 +1,6 @@ "model" "mode" "engine" "pkg" +"auto_ml" "classification" "h2o" "agua" +"auto_ml" "regression" "h2o" "agua" "bag_mars" "classification" "earth" "baguette" "bag_mars" "regression" "earth" "baguette" "bag_tree" "censored regression" "rpart" "censored" diff --git a/man/auto_ml.Rd b/man/auto_ml.Rd new file mode 100644 index 000000000..e3380bbf9 --- /dev/null +++ b/man/auto_ml.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/auto_ml.R +\name{auto_ml} +\alias{auto_ml} +\title{Automatic Machine Learning} +\usage{ +auto_ml(mode = "unknown", engine = "h2o") +} +\arguments{ +\item{mode}{A single character string for the prediction outcome mode. +Possible values for this model are "unknown", "regression", or +"classification".} + +\item{engine}{A single character string specifying what computational engine +to use for fitting.} +} +\description{ +\code{auto_ml()} defines an automated searching and tuning process where +many models of different families are trained and ranked given their +performance on the training data. 
+ +\Sexpr[stage=render,results=rd]{parsnip:::make_engine_list("auto_ml")} + +More information on how \pkg{parsnip} is used for modeling is at +\url{https://www.tidymodels.org/}. +} +\details{ +This function only defines what \emph{type} of model is being fit. Once an engine +is specified, the \emph{method} to fit the model is also defined. See +\code{\link[=set_engine]{set_engine()}} for more on setting the engine, including how to set engine +arguments. + +The model is not trained or fit until the \code{\link[=fit.model_spec]{fit()}} function is used +with the data. +} +\references{ +\url{https://www.tidymodels.org}, \href{https://www.tmwr.org/}{\emph{Tidy Modeling with R}}, \href{https://www.tidymodels.org/find/parsnip/}{searchable table of parsnip models} +} +\seealso{ +\Sexpr[stage=render,results=rd]{parsnip:::make_seealso_list("auto_ml")} +} diff --git a/man/details_auto_ml_h2o.Rd b/man/details_auto_ml_h2o.Rd new file mode 100644 index 000000000..4c21dad53 --- /dev/null +++ b/man/details_auto_ml_h2o.Rd @@ -0,0 +1,96 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/auto_ml_h2o.R +\name{details_auto_ml_h2o} +\alias{details_auto_ml_h2o} +\title{Automatic machine learning via h2o} +\description{ +\link[h2o:h2o.automl]{h2o::h2o.automl} defines an automated model training process and returns a +leaderboard of models with best performances. +} +\details{ +For this engine, there are multiple modes: classification and regression +\subsection{Tuning Parameters}{ + +This model has no tuning parameters. + +Engine arguments of interest +\itemize{ +\item \code{max_runtime_secs} and \code{max_models}: control the maximum running +time and number of models to build in the automatic process. +\item \code{exclude_algos} and \code{include_algos}: a character vector indicating +the excluded or included algorithms during model building.
To see a +full list of supported models, see the details section in +\code{\link[h2o:h2o.automl]{h2o::h2o.automl()}}. +\item \code{validation}: A number between 0 and 1 specifying the \emph{proportion} +of training data reserved as a validation set. This is used by h2o for +performance assessment and potential early stopping. +} +} + +\subsection{Translation from parsnip to the original package (regression)}{ + +\code{\link[agua:h2o_train]{agua::h2o_train_auto()}} is a wrapper around +\code{\link[h2o:h2o.automl]{h2o::h2o.automl()}}. + +\if{html}{\out{
}}\preformatted{auto_ml() \%>\% + set_engine("h2o") \%>\% + set_mode("regression") \%>\% + translate() +}\if{html}{\out{
}} + +\if{html}{\out{
}}\preformatted{## Automatic Machine Learning Model Specification (regression) +## +## Computational engine: h2o +## +## Model fit template: +## agua::h2o_train_auto(x = missing_arg(), y = missing_arg(), weights = missing_arg(), +## validation_frame = missing_arg(), verbosity = NULL) +}\if{html}{\out{
}} +} + +\subsection{Translation from parsnip to the original package (classification)}{ + +\if{html}{\out{
}}\preformatted{auto_ml() \%>\% + set_engine("h2o") \%>\% + set_mode("classification") \%>\% + translate() +}\if{html}{\out{
}} + +\if{html}{\out{
}}\preformatted{## Automatic Machine Learning Model Specification (classification) +## +## Computational engine: h2o +## +## Model fit template: +## agua::h2o_train_auto(x = missing_arg(), y = missing_arg(), weights = missing_arg(), +## validation_frame = missing_arg(), verbosity = NULL) +}\if{html}{\out{
}} +} + +\subsection{Preprocessing requirements}{ + +Factor/categorical predictors need to be converted to numeric values +(e.g., dummy or indicator variables) for this engine. When using the +formula method via \code{\link[=fit.model_spec]{fit()}}, parsnip +will convert factor columns to indicators. +} + +\subsection{Initializing h2o}{ + +To use the h2o engine with tidymodels, please run \code{h2o::h2o.init()} +first. By default, this connects R to the local h2o server. This needs +to be done in every new R session. You can also connect to a remote h2o +server with an IP address; for more details, see +\code{\link[h2o:h2o.init]{h2o::h2o.init()}}. + +You can control the number of threads in the thread pool used by h2o +with the \code{nthreads} argument. By default, it uses all CPUs on the host. +This is different from the usual parallel processing mechanism in +tidymodels for tuning: while tidymodels parallelizes over resamples, h2o +parallelizes over hyperparameter combinations for a given resample. + +h2o will automatically shut down the local h2o instance started by R +when R is terminated. To manually stop the h2o server, run +\code{h2o::h2o.shutdown()}. +} +} +\keyword{internal} diff --git a/man/rmd/auto_ml_h2o.Rmd b/man/rmd/auto_ml_h2o.Rmd new file mode 100644 index 000000000..cf987b668 --- /dev/null +++ b/man/rmd/auto_ml_h2o.Rmd @@ -0,0 +1,47 @@ +```{r, child = "aaa.Rmd", include = FALSE} +``` + +`r descr_models("auto_ml", "h2o")` + +## Tuning Parameters + +This model has no tuning parameters. + +Engine arguments of interest + +- `max_runtime_secs` and `max_models`: control the maximum running time and number of models to build in the automatic process. + +- `exclude_algos` and `include_algos`: a character vector indicating the excluded or included algorithms during model building. To see a full list of supported models, see the details section in [h2o::h2o.automl()].
+ +- `validation`: A number between 0 and 1 specifying the _proportion_ of training data reserved as a validation set. This is used by h2o for performance assessment and potential early stopping. + +## Translation from parsnip to the original package (regression) + +[agua::h2o_train_auto()] is a wrapper around [h2o::h2o.automl()]. + +```{r h2o-reg} +auto_ml() %>% + set_engine("h2o") %>% + set_mode("regression") %>% + translate() +``` + + +## Translation from parsnip to the original package (classification) + +```{r h2o-cls} +auto_ml() %>% + set_engine("h2o") %>% + set_mode("classification") %>% + translate() +``` + +## Preprocessing requirements + +```{r child = "template-makes-dummies.Rmd"} +``` + +## Initializing h2o + +```{r child = "template-h2o-init.Rmd"} +``` diff --git a/man/rmd/auto_ml_h2o.md b/man/rmd/auto_ml_h2o.md new file mode 100644 index 000000000..2eae961b6 --- /dev/null +++ b/man/rmd/auto_ml_h2o.md @@ -0,0 +1,73 @@ + + + +For this engine, there are multiple modes: classification and regression + +## Tuning Parameters + +This model has no tuning parameters. + +Engine arguments of interest + +- `max_runtime_secs` and `max_models`: control the maximum running time and number of models to build in the automatic process. + +- `exclude_algos` and `include_algos`: a character vector indicating the excluded or included algorithms during model building. To see a full list of supported models, see the details section in [h2o::h2o.automl()]. + +- `validation`: A number between 0 and 1 specifying the _proportion_ of training data reserved as a validation set. This is used by h2o for performance assessment and potential early stopping. + +## Translation from parsnip to the original package (regression) + +[agua::h2o_train_auto()] is a wrapper around [h2o::h2o.automl()].
+ + +```r +auto_ml() %>% + set_engine("h2o") %>% + set_mode("regression") %>% + translate() +``` + +``` +## Automatic Machine Learning Model Specification (regression) +## +## Computational engine: h2o +## +## Model fit template: +## agua::h2o_train_auto(x = missing_arg(), y = missing_arg(), weights = missing_arg(), +## validation_frame = missing_arg(), verbosity = NULL) +``` + + +## Translation from parsnip to the original package (classification) + + +```r +auto_ml() %>% + set_engine("h2o") %>% + set_mode("classification") %>% + translate() +``` + +``` +## Automatic Machine Learning Model Specification (classification) +## +## Computational engine: h2o +## +## Model fit template: +## agua::h2o_train_auto(x = missing_arg(), y = missing_arg(), weights = missing_arg(), +## validation_frame = missing_arg(), verbosity = NULL) +``` + +## Preprocessing requirements + + +Factor/categorical predictors need to be converted to numeric values (e.g., dummy or indicator variables) for this engine. When using the formula method via \\code{\\link[=fit.model_spec]{fit()}}, parsnip will convert factor columns to indicators. + +## Initializing h2o + + +To use the h2o engine with tidymodels, please run `h2o::h2o.init()` first. By default, this connects R to the local h2o server. This needs to be done in every new R session. You can also connect to a remote h2o server with an IP address; for more details, see [h2o::h2o.init()]. + +You can control the number of threads in the thread pool used by h2o with the `nthreads` argument. By default, it uses all CPUs on the host. This is different from the usual parallel processing mechanism in tidymodels for tuning: while tidymodels parallelizes over resamples, h2o parallelizes over hyperparameter combinations for a given resample. + +h2o will automatically shut down the local h2o instance started by R when R is terminated. To manually stop the h2o server, run `h2o::h2o.shutdown()`.