From bd101e80182963e3fbbe155e95f447d97dfa2f34 Mon Sep 17 00:00:00 2001 From: EmilHvitfeldt Date: Wed, 23 Jan 2019 14:35:47 -0800 Subject: [PATCH 1/6] adding null model --- NAMESPACE | 5 + R/nullmodel.R | 165 ++++++++++++++++++++++++++++++++ R/nullmodel_data.R | 58 +++++++++++ man/null_model.Rd | 69 +++++++++++++ man/nullmodel.Rd | 47 +++++++++ tests/testthat/test-nullmodel.R | 121 +++++++++++++++++++++++ 6 files changed, 465 insertions(+) create mode 100644 R/nullmodel.R create mode 100644 R/nullmodel_data.R create mode 100644 man/null_model.Rd create mode 100644 man/nullmodel.Rd create mode 100644 tests/testthat/test-nullmodel.R diff --git a/NAMESPACE b/NAMESPACE index 8ce771ce7..d201d151e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -9,10 +9,12 @@ S3method(multi_predict,"_lognet") S3method(multi_predict,"_multnet") S3method(multi_predict,"_xgb.Booster") S3method(multi_predict,default) +S3method(null_model,default) S3method(predict,"_elnet") S3method(predict,"_lognet") S3method(predict,"_multnet") S3method(predict,model_fit) +S3method(predict,null_model) S3method(predict_class,"_lognet") S3method(predict_class,model_fit) S3method(predict_classprob,"_lognet") @@ -37,6 +39,7 @@ S3method(print,model_fit) S3method(print,model_spec) S3method(print,multinom_reg) S3method(print,nearest_neighbor) +S3method(print,null_model) S3method(print,rand_forest) S3method(print,surv_reg) S3method(print,svm_poly) @@ -95,6 +98,8 @@ export(model_printer) export(multi_predict) export(multinom_reg) export(nearest_neighbor) +export(null_model) +export(nullmodel) export(predict.model_fit) export(predict_class) export(predict_class.model_fit) diff --git a/R/nullmodel.R b/R/nullmodel.R new file mode 100644 index 000000000..5c564210c --- /dev/null +++ b/R/nullmodel.R @@ -0,0 +1,165 @@ +#' Fit a simple, non-informative model +#' +#' Fit a single mean or largest class model +#' +#' \code{null_model} emulates other model building functions, but returns the +#' simplest model possible given a training set: a single mean for numeric +#' outcomes and the most prevalent class for factor outcomes. When class +#' probabilities are requested, the percentage of the training set samples with +#' the most prevalent class is returned. +#' +#' @aliases null_model null_model.default predict.null_model +#' @param x An optional matrix or data frame of predictors. These values are +#' not used in the model fit +#' @param y A numeric vector (for regression) or factor (for classification) of +#' outcomes +#' @param \dots Optional arguments (not yet used) +#' @param object An object of class \code{null_model} +#' @param newdata A matrix or data frame of predictors (only used to determine +#' the number of predictions to return) +#' @param type Either "raw" (for regression), "class" or "prob" (for +#' classification) +#' @return The output of \code{null_model} is a list of class \code{null_model} +#' with elements \item{call }{the function call} \item{value }{the mean of +#' \code{y} or the most prevalent class} \item{levels }{when \code{y} is a +#' factor, a vector of levels. \code{NULL} otherwise} \item{pct }{when \code{y} +#' is a factor, a data frame with a column for each class (\code{NULL} +#' otherwise). The column for the most prevalent class has the proportion of +#' the training samples with that class (the other columns are zero). } \item{n +#' }{the number of elements in \code{y}} +#' +#' \code{predict.null_model} returns a either a factor or numeric vector +#' depending on the class of \code{y}. All predictions are always the same. +#' @keywords models +#' @examples +#' +#' outcome <- factor(sample(letters[1:2], +#' size = 100, +#' prob = c(.1, .9), +#' replace = TRUE)) +#' useless <- null_model(y = outcome) +#' useless +#' predict(useless, matrix(NA, nrow = 10)) +#' +#' @export +null_model <- function (x, ...) UseMethod("null_model") + +#' @export +#' @rdname null_model +null_model.default <- function(x = NULL, y, ...) +{ + + if(is.factor(y)) + { + lvls <- levels(y) + tab <- table(y) + value <- names(tab)[which.max(tab)] + pct <- tab/sum(tab) + } else { + lvls <- NULL + pct <- NULL + value <- mean(y, na.rm = TRUE) + } + structure( + list(call = match.call(), + value = value, + levels = lvls, + pct = pct, + n = length(y)), + class = "null_model") +} + +#' @export +#' @rdname null_model +print.null_model <- function(x, ...) +{ + cat("Null", + ifelse(is.null(x$levels), "Classification", "Regression"), + "Model\n") + x$call + + cat("Predicted Value:", + ifelse(is.null(x$levels), format(x$value), x$value), + "\n") +} + +#' @export +#' @rdname null_model +predict.null_model <- function (object, newdata = NULL, type = NULL, ...) +{ + if(is.null(type)) + { + type <- if(is.null(object$levels)) "raw" else "class" + } + + n <- if(is.null(newdata)) object$n else nrow(newdata) + if(!is.null(object$levels)) + { + if(type == "prob") + { + out <- matrix(rep(object$pct, n), nrow = n, byrow = TRUE) + colnames(out) <- object$levels + out <- as.data.frame(out) + } else { + out <- factor(rep(object$value, n), levels = object$levels) + } + } else { + if(type %in% c("prob", "class")) stop("ony raw predicitons are applicable to regression models") + out <- rep(object$value, n) + } + out +} + +#' General Interface for null models +#' +#' `nullmodel` is a way to generate a _specification_ of a model before +#' fitting and allows the model to be created using R. It doens't have any +#' main arguments. +#' +#' @param mode A single character string for the type of model. +#' Possible values for this model are "unknown", "regression", or +#' "classification". +#' @details The model can be created using the `fit()` function using the +#' following _engines_: +#' \itemize{ +#' \item \pkg{R}: `"parsnip"` +#' } +#' +#' @section Engine Details: +#' +#' Engines may have pre-set default arguments when executing the +#' model fit call. For this type of +#' model, the template of the fit calls are: +#' +#' \pkg{parsnip} classification +#' +#' \Sexpr[results=rd]{parsnip:::show_fit(parsnip:::nullmodel(mode = "classification"), "parsnip")} +#' +#' \pkg{parsnip} regression +#' +#' \Sexpr[results=rd]{parsnip:::show_fit(parsnip:::nullmodel(mode = "regression"), "parsnip")} +#' +#' @importFrom purrr map_lgl +#' @seealso [varying()], [fit()] +#' @examples +#' nullmodel(mode = "regression") +#' @export +nullmodel <- + function(mode = "classification") { + # Check for correct mode + if (!(mode %in% nullmodel_modes)) + stop("`mode` should be one of: ", + paste0("'", nullmodel_modes, "'", collapse = ", "), + call. = FALSE) + + # Capture the arguments in quosures + args <- list() + + # Save some empty slots for future parts of the specification + out <- list(args = args, eng_args = NULL, + mode = mode, method = NULL, engine = NULL) + + # set classes in the correct order + class(out) <- make_classes("nullmodel") + out + } diff --git a/R/nullmodel_data.R b/R/nullmodel_data.R new file mode 100644 index 000000000..158bad2bb --- /dev/null +++ b/R/nullmodel_data.R @@ -0,0 +1,58 @@ +nullmodel_arg_key <- data.frame( + parsnip = NULL, + row.names = NULL, + stringsAsFactors = FALSE +) + +nullmodel_modes <- c("classification", "regression", "unknown") + +nullmodel_engines <- data.frame( + parsnip = c(TRUE, TRUE, FALSE), + row.names = c("classification", "regression", "unknown") +) + +# ------------------------------------------------------------------------------ + +nullmodel_parsnip_data <- + list( + libs = "parsnip", + fit = list( + interface = "matrix", + protect = c("x", "y"), + func = c(fun = "null_model"), + defaults = list() + ), + class = list( + pre = NULL, + post = NULL, + func = c(fun = "predict"), + args = + list( + object = quote(object$fit), + new_data = quote(new_data), + type = "class" + ) + ), + numeric = list( + pre = NULL, + post = NULL, + func = c(fun = "predict"), + args = + list( + object = quote(object$fit), + new_data = quote(new_data), + type = "raw" + ) + ), + raw = list( + pre = NULL, + post = NULL, + func = c(fun = "predict"), + args = + list( + object = quote(object$fit), + new_data = quote(new_data), + type = "raw" + ) + ) + ) diff --git a/man/null_model.Rd b/man/null_model.Rd new file mode 100644 index 000000000..bde665ef6 --- /dev/null +++ b/man/null_model.Rd @@ -0,0 +1,69 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/nullmodel.R +\name{null_model} +\alias{null_model} +\alias{null_model.default} +\alias{predict.null_model} +\alias{print.null_model} +\title{Fit a simple, non-informative model} +\usage{ +null_model(x, ...) + +\method{null_model}{default}(x = NULL, y, ...) + +\method{print}{null_model}(x, ...) + +\method{predict}{null_model}(object, newdata = NULL, type = NULL, ...) +} +\arguments{ +\item{x}{An optional matrix or data frame of predictors. These values are +not used in the model fit} + +\item{\dots}{Optional arguments (not yet used)} + +\item{y}{A numeric vector (for regression) or factor (for classification) of +outcomes} + +\item{object}{An object of class \code{null_model}} + +\item{newdata}{A matrix or data frame of predictors (only used to determine +the number of predictions to return)} + +\item{type}{Either "raw" (for regression), "class" or "prob" (for +classification)} +} +\value{ +The output of \code{null_model} is a list of class \code{null_model} +with elements \item{call }{the function call} \item{value }{the mean of +\code{y} or the most prevalent class} \item{levels }{when \code{y} is a +factor, a vector of levels. \code{NULL} otherwise} \item{pct }{when \code{y} +is a factor, a data frame with a column for each class (\code{NULL} +otherwise). The column for the most prevalent class has the proportion of +the training samples with that class (the other columns are zero). } \item{n +}{the number of elements in \code{y}} + +\code{predict.null_model} returns a either a factor or numeric vector +depending on the class of \code{y}. All predictions are always the same. +} +\description{ +Fit a single mean or largest class model +} +\details{ +\code{null_model} emulates other model building functions, but returns the +simplest model possible given a training set: a single mean for numeric +outcomes and the most prevalent class for factor outcomes. When class +probabilities are requested, the percentage of the training set samples with +the most prevalent class is returned. +} +\examples{ + +outcome <- factor(sample(letters[1:2], + size = 100, + prob = c(.1, .9), + replace = TRUE)) +useless <- null_model(y = outcome) +useless +predict(useless, matrix(NA, nrow = 10)) + +} +\keyword{models} diff --git a/man/nullmodel.Rd b/man/nullmodel.Rd new file mode 100644 index 000000000..65445b0ca --- /dev/null +++ b/man/nullmodel.Rd @@ -0,0 +1,47 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/nullmodel.R +\name{nullmodel} +\alias{nullmodel} +\title{General Interface for null models} +\usage{ +nullmodel(mode = "classification") +} +\arguments{ +\item{mode}{A single character string for the type of model. +Possible values for this model are "unknown", "regression", or +"classification".} +} +\description{ +\code{nullmodel} is a way to generate a \emph{specification} of a model before +fitting and allows the model to be created using R. It doens't have any +main arguments. +} +\details{ +The model can be created using the \code{fit()} function using the +following \emph{engines}: +\itemize{ +\item \pkg{R}: \code{"parsnip"} +} +} +\section{Engine Details}{ + + +Engines may have pre-set default arguments when executing the +model fit call. For this type of +model, the template of the fit calls are: + +\pkg{parsnip} classification + +\Sexpr[results=rd]{parsnip:::show_fit(parsnip:::nullmodel(mode = "classification"), "parsnip")} + +\pkg{parsnip} regression + +\Sexpr[results=rd]{parsnip:::show_fit(parsnip:::nullmodel(mode = "regression"), "parsnip")} +} + +\examples{ +nullmodel(mode = "regression") +} +\seealso{ +\code{\link[=varying]{varying()}}, \code{\link[=fit]{fit()}} +} diff --git a/tests/testthat/test-nullmodel.R b/tests/testthat/test-nullmodel.R new file mode 100644 index 000000000..436095baa --- /dev/null +++ b/tests/testthat/test-nullmodel.R @@ -0,0 +1,121 @@ +library(testthat) +library(parsnip) +library(rlang) +library(tibble) + +context("test-nullmodel") +source("helpers.R") + + +test_that('primary arguments', { + basic <- nullmodel(mode = "regression") + basic_nullmodel <- translate(basic %>% set_engine("parsnip")) + expect_equal(basic_nullmodel$method$fit$args, + list( + x = expr(missing_arg()), + y = expr(missing_arg()) + ) + ) +}) + + +test_that('engine arguments', { + nullmodel_keep <- nullmodel(mode = "regression") + expect_equal(translate(nullmodel_keep %>% set_engine("parsnip", keepxy = FALSE))$method$fit$args, + list( + x = expr(missing_arg()), + y = expr(missing_arg()), + keepxy = new_empty_quosure(FALSE) + ) + ) +}) + +test_that('bad input', { + expect_error(translate(nullmodel() %>% set_engine("wat?"))) + expect_error(translate(nullmodel(mode = "regression") %>% set_engine())) + expect_error(translate(nullmodel(formula = y ~ x))) + expect_warning( + translate( + nullmodel(mode = "regression") %>% set_engine("parsnip", x = iris[,1:3], y = iris$Species) + ) + ) +}) + +# ------------------------------------------------------------------------------ + +num_pred <- c("Sepal.Width", "Petal.Width", "Petal.Length") +iris_bad_form <- as.formula(Species ~ term) +iris_basic <- nullmodel(mode = "regression") %>% set_engine("parsnip") + +# ------------------------------------------------------------------------------ + +test_that('nullmodel execution', { + + expect_error( + res <- fit( + iris_basic, + Sepal.Length ~ log(Sepal.Width) + Species, + data = iris + ), + regexp = NA + ) + expect_error( + res <- fit_xy( + iris_basic, + x = iris[, num_pred], + y = iris$Sepal.Length + ), + regexp = NA + ) + + expect_error( + res <- fit( + iris_basic, + iris_bad_form, + data = iris + ) + ) + +}) + +test_that('nullmodel prediction', { + + uni_pred <- tibble(.pred = rep(5.843333, 5)) + inl_pred <- rep(5.843333, 5) + + mv_pred <- + structure( + list(Sepal.Width = seq_len(10), + Petal.Width = seq_len(10)), + class = "data.frame", + row.names = c(NA, -10L)) + + res_xy <- fit_xy( + iris_basic, + x = iris[, num_pred], + y = iris$Sepal.Length + ) + + expect_equal(uni_pred, predict(res_xy, new_data = iris[1, num_pred])) + + res_form <- fit( + iris_basic, + Sepal.Length ~ log(Sepal.Width) + Species, + data = iris + ) + expect_equal(inl_pred, predict_numeric(res_form, iris[1:5, ])) +}) + +# ------------------------------------------------------------------------------ + +test_that('classification', { + + expect_error( + nullmodel <- nullmodel(mode = "classification") %>% + set_engine("parsnip") %>% + fit(Species ~ ., data = iris), + regexp = NA + ) + expect_true(!is.null(nullmodel$fit)) +}) + From 47423935dc78b9c6255e83dd6ba7b442ac38feae Mon Sep 17 00:00:00 2001 From: EmilHvitfeldt Date: Thu, 24 Jan 2019 20:21:51 -0800 Subject: [PATCH 2/6] name change nullmodel -> null_model --- NAMESPACE | 6 +-- R/nullmodel.R | 46 +++++++++--------- R/nullmodel_data.R | 10 ++-- man/null_model.Rd | 84 ++++++++++++--------------------- man/nullmodel.Rd | 84 +++++++++++++++++++++------------ tests/testthat/test-nullmodel.R | 18 +++---- 6 files changed, 124 insertions(+), 124 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index d201d151e..6cbc69a7c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -9,12 +9,12 @@ S3method(multi_predict,"_lognet") S3method(multi_predict,"_multnet") S3method(multi_predict,"_xgb.Booster") S3method(multi_predict,default) -S3method(null_model,default) +S3method(nullmodel,default) S3method(predict,"_elnet") S3method(predict,"_lognet") S3method(predict,"_multnet") S3method(predict,model_fit) -S3method(predict,null_model) +S3method(predict,nullmodel) S3method(predict_class,"_lognet") S3method(predict_class,model_fit) S3method(predict_classprob,"_lognet") @@ -39,7 +39,7 @@ S3method(print,model_fit) S3method(print,model_spec) S3method(print,multinom_reg) S3method(print,nearest_neighbor) -S3method(print,null_model) +S3method(print,nullmodel) S3method(print,rand_forest) S3method(print,surv_reg) S3method(print,svm_poly) diff --git a/R/nullmodel.R b/R/nullmodel.R index 5c564210c..3c9cb5f00 100644 --- a/R/nullmodel.R +++ b/R/nullmodel.R @@ -2,24 +2,24 @@ #' #' Fit a single mean or largest class model #' -#' \code{null_model} emulates other model building functions, but returns the +#' \code{nullmodel} emulates other model building functions, but returns the #' simplest model possible given a training set: a single mean for numeric #' outcomes and the most prevalent class for factor outcomes. When class #' probabilities are requested, the percentage of the training set samples with #' the most prevalent class is returned. #' -#' @aliases null_model null_model.default predict.null_model +#' @aliases nullmodel nullmodel.default predict.nullmodel #' @param x An optional matrix or data frame of predictors. These values are #' not used in the model fit #' @param y A numeric vector (for regression) or factor (for classification) of #' outcomes #' @param \dots Optional arguments (not yet used) -#' @param object An object of class \code{null_model} +#' @param object An object of class \code{nullmodel} #' @param newdata A matrix or data frame of predictors (only used to determine #' the number of predictions to return) #' @param type Either "raw" (for regression), "class" or "prob" (for #' classification) -#' @return The output of \code{null_model} is a list of class \code{null_model} +#' @return The output of \code{nullmodel} is a list of class \code{nullmodel} #' with elements \item{call }{the function call} \item{value }{the mean of #' \code{y} or the most prevalent class} \item{levels }{when \code{y} is a #' factor, a vector of levels. \code{NULL} otherwise} \item{pct }{when \code{y} @@ -28,7 +28,7 @@ #' the training samples with that class (the other columns are zero). } \item{n #' }{the number of elements in \code{y}} #' -#' \code{predict.null_model} returns a either a factor or numeric vector +#' \code{predict.nullmodel} returns a either a factor or numeric vector #' depending on the class of \code{y}. All predictions are always the same. #' @keywords models #' @examples @@ -37,16 +37,16 @@ #' size = 100, #' prob = c(.1, .9), #' replace = TRUE)) -#' useless <- null_model(y = outcome) +#' useless <- nullmodel(y = outcome) #' useless -#' predict(useless, matrix(NA, nrow = 10)) +#' predict(useless, matrix(NA, nrow = 5)) #' #' @export -null_model <- function (x, ...) UseMethod("null_model") +nullmodel <- function (x, ...) UseMethod("nullmodel") #' @export -#' @rdname null_model -null_model.default <- function(x = NULL, y, ...) +#' @rdname nullmodel +nullmodel.default <- function(x = NULL, y, ...) { if(is.factor(y)) @@ -66,12 +66,12 @@ null_model.default <- function(x = NULL, y, ...) levels = lvls, pct = pct, n = length(y)), - class = "null_model") + class = "nullmodel") } #' @export -#' @rdname null_model -print.null_model <- function(x, ...) +#' @rdname nullmodel +print.nullmodel <- function(x, ...) { cat("Null", ifelse(is.null(x$levels), "Classification", "Regression"), @@ -84,8 +84,8 @@ print.null_model <- function(x, ...) } #' @export -#' @rdname null_model -predict.null_model <- function (object, newdata = NULL, type = NULL, ...) +#' @rdname nullmodel +predict.nullmodel <- function (object, newdata = NULL, type = NULL, ...) { if(is.null(type)) { @@ -112,7 +112,7 @@ predict.null_model <- function (object, newdata = NULL, type = NULL, ...) #' General Interface for null models #' -#' `nullmodel` is a way to generate a _specification_ of a model before +#' `null_model` is a way to generate a _specification_ of a model before #' fitting and allows the model to be created using R. It doens't have any #' main arguments. #' @@ -133,23 +133,23 @@ predict.null_model <- function (object, newdata = NULL, type = NULL, ...) #' #' \pkg{parsnip} classification #' -#' \Sexpr[results=rd]{parsnip:::show_fit(parsnip:::nullmodel(mode = "classification"), "parsnip")} +#' \Sexpr[results=rd]{parsnip:::show_fit(parsnip:::null_model(mode = "classification"), "parsnip")} #' #' \pkg{parsnip} regression #' -#' \Sexpr[results=rd]{parsnip:::show_fit(parsnip:::nullmodel(mode = "regression"), "parsnip")} +#' \Sexpr[results=rd]{parsnip:::show_fit(parsnip:::null_model(mode = "regression"), "parsnip")} #' #' @importFrom purrr map_lgl #' @seealso [varying()], [fit()] #' @examples -#' nullmodel(mode = "regression") +#' null_model(mode = "regression") #' @export -nullmodel <- +null_model <- function(mode = "classification") { # Check for correct mode - if (!(mode %in% nullmodel_modes)) + if (!(mode %in% null_model_modes)) stop("`mode` should be one of: ", - paste0("'", nullmodel_modes, "'", collapse = ", "), + paste0("'", null_model_modes, "'", collapse = ", "), call. = FALSE) # Capture the arguments in quosures @@ -160,6 +160,6 @@ nullmodel <- mode = mode, method = NULL, engine = NULL) # set classes in the correct order - class(out) <- make_classes("nullmodel") + class(out) <- make_classes("null_model") out } diff --git a/R/nullmodel_data.R b/R/nullmodel_data.R index 158bad2bb..632ad609a 100644 --- a/R/nullmodel_data.R +++ b/R/nullmodel_data.R @@ -1,25 +1,25 @@ -nullmodel_arg_key <- data.frame( +null_model_arg_key <- data.frame( parsnip = NULL, row.names = NULL, stringsAsFactors = FALSE ) -nullmodel_modes <- c("classification", "regression", "unknown") +null_model_modes <- c("classification", "regression", "unknown") -nullmodel_engines <- data.frame( +null_model_engines <- data.frame( parsnip = c(TRUE, TRUE, FALSE), row.names = c("classification", "regression", "unknown") ) # ------------------------------------------------------------------------------ -nullmodel_parsnip_data <- +null_model_parsnip_data <- list( libs = "parsnip", fit = list( interface = "matrix", protect = c("x", "y"), - func = c(fun = "null_model"), + func = c(fun = "nullmodel"), defaults = list() ), class = list( diff --git a/man/null_model.Rd b/man/null_model.Rd index bde665ef6..d1b709214 100644 --- a/man/null_model.Rd +++ b/man/null_model.Rd @@ -2,68 +2,46 @@ % Please edit documentation in R/nullmodel.R \name{null_model} \alias{null_model} -\alias{null_model.default} -\alias{predict.null_model} -\alias{print.null_model} -\title{Fit a simple, non-informative model} +\title{General Interface for null models} \usage{ -null_model(x, ...) - -\method{null_model}{default}(x = NULL, y, ...) - -\method{print}{null_model}(x, ...) - -\method{predict}{null_model}(object, newdata = NULL, type = NULL, ...) +null_model(mode = "classification") } \arguments{ -\item{x}{An optional matrix or data frame of predictors. These values are -not used in the model fit} +\item{mode}{A single character string for the type of model. +Possible values for this model are "unknown", "regression", or +"classification".} +} +\description{ +\code{null_model} is a way to generate a \emph{specification} of a model before +fitting and allows the model to be created using R. It doens't have any +main arguments. +} +\details{ +The model can be created using the \code{fit()} function using the +following \emph{engines}: +\itemize{ +\item \pkg{R}: \code{"parsnip"} +} +} +\section{Engine Details}{ -\item{\dots}{Optional arguments (not yet used)} -\item{y}{A numeric vector (for regression) or factor (for classification) of -outcomes} +Engines may have pre-set default arguments when executing the +model fit call. For this type of +model, the template of the fit calls are: -\item{object}{An object of class \code{null_model}} +\pkg{parsnip} classification -\item{newdata}{A matrix or data frame of predictors (only used to determine -the number of predictions to return)} +\Sexpr[results=rd]{parsnip:::show_fit(parsnip:::null_model(mode = "classification"), "parsnip")} -\item{type}{Either "raw" (for regression), "class" or "prob" (for -classification)} -} -\value{ -The output of \code{null_model} is a list of class \code{null_model} -with elements \item{call }{the function call} \item{value }{the mean of -\code{y} or the most prevalent class} \item{levels }{when \code{y} is a -factor, a vector of levels. \code{NULL} otherwise} \item{pct }{when \code{y} -is a factor, a data frame with a column for each class (\code{NULL} -otherwise). The column for the most prevalent class has the proportion of -the training samples with that class (the other columns are zero). } \item{n -}{the number of elements in \code{y}} +\pkg{parsnip} regression -\code{predict.null_model} returns a either a factor or numeric vector -depending on the class of \code{y}. All predictions are always the same. -} -\description{ -Fit a single mean or largest class model -} -\details{ -\code{null_model} emulates other model building functions, but returns the -simplest model possible given a training set: a single mean for numeric -outcomes and the most prevalent class for factor outcomes. When class -probabilities are requested, the percentage of the training set samples with -the most prevalent class is returned. +\Sexpr[results=rd]{parsnip:::show_fit(parsnip:::null_model(mode = "regression"), "parsnip")} } -\examples{ - -outcome <- factor(sample(letters[1:2], - size = 100, - prob = c(.1, .9), - replace = TRUE)) -useless <- null_model(y = outcome) -useless -predict(useless, matrix(NA, nrow = 10)) +\examples{ +null_model(mode = "regression") +} +\seealso{ +\code{\link[=varying]{varying()}}, \code{\link[=fit]{fit()}} } -\keyword{models} diff --git a/man/nullmodel.Rd b/man/nullmodel.Rd index 65445b0ca..cffbd87cb 100644 --- a/man/nullmodel.Rd +++ b/man/nullmodel.Rd @@ -2,46 +2,68 @@ % Please edit documentation in R/nullmodel.R \name{nullmodel} \alias{nullmodel} -\title{General Interface for null models} +\alias{nullmodel.default} +\alias{predict.nullmodel} +\alias{print.nullmodel} +\title{Fit a simple, non-informative model} \usage{ -nullmodel(mode = "classification") +nullmodel(x, ...) + +\method{nullmodel}{default}(x = NULL, y, ...) + +\method{print}{nullmodel}(x, ...) + +\method{predict}{nullmodel}(object, newdata = NULL, type = NULL, ...) } \arguments{ -\item{mode}{A single character string for the type of model. -Possible values for this model are "unknown", "regression", or -"classification".} -} -\description{ -\code{nullmodel} is a way to generate a \emph{specification} of a model before -fitting and allows the model to be created using R. It doens't have any -main arguments. -} -\details{ -The model can be created using the \code{fit()} function using the -following \emph{engines}: -\itemize{ -\item \pkg{R}: \code{"parsnip"} -} -} -\section{Engine Details}{ +\item{x}{An optional matrix or data frame of predictors. These values are +not used in the model fit} +\item{\dots}{Optional arguments (not yet used)} -Engines may have pre-set default arguments when executing the -model fit call. For this type of -model, the template of the fit calls are: +\item{y}{A numeric vector (for regression) or factor (for classification) of +outcomes} -\pkg{parsnip} classification +\item{object}{An object of class \code{nullmodel}} -\Sexpr[results=rd]{parsnip:::show_fit(parsnip:::nullmodel(mode = "classification"), "parsnip")} +\item{newdata}{A matrix or data frame of predictors (only used to determine +the number of predictions to return)} -\pkg{parsnip} regression - -\Sexpr[results=rd]{parsnip:::show_fit(parsnip:::nullmodel(mode = "regression"), "parsnip")} +\item{type}{Either "raw" (for regression), "class" or "prob" (for +classification)} } +\value{ +The output of \code{nullmodel} is a list of class \code{nullmodel} +with elements \item{call }{the function call} \item{value }{the mean of +\code{y} or the most prevalent class} \item{levels }{when \code{y} is a +factor, a vector of levels. \code{NULL} otherwise} \item{pct }{when \code{y} +is a factor, a data frame with a column for each class (\code{NULL} +otherwise). The column for the most prevalent class has the proportion of +the training samples with that class (the other columns are zero). } \item{n +}{the number of elements in \code{y}} -\examples{ -nullmodel(mode = "regression") +\code{predict.nullmodel} returns a either a factor or numeric vector +depending on the class of \code{y}. All predictions are always the same. } -\seealso{ -\code{\link[=varying]{varying()}}, \code{\link[=fit]{fit()}} +\description{ +Fit a single mean or largest class model +} +\details{ +\code{nullmodel} emulates other model building functions, but returns the +simplest model possible given a training set: a single mean for numeric +outcomes and the most prevalent class for factor outcomes. When class +probabilities are requested, the percentage of the training set samples with +the most prevalent class is returned. +} +\examples{ + +outcome <- factor(sample(letters[1:2], + size = 100, + prob = c(.1, .9), + replace = TRUE)) +useless <- nullmodel(y = outcome) +useless +predict(useless, matrix(NA, nrow = 5)) + } +\keyword{models} diff --git a/tests/testthat/test-nullmodel.R b/tests/testthat/test-nullmodel.R index 436095baa..14f2af5d7 100644 --- a/tests/testthat/test-nullmodel.R +++ b/tests/testthat/test-nullmodel.R @@ -8,7 +8,7 @@ source("helpers.R") test_that('primary arguments', { - basic <- nullmodel(mode = "regression") + basic <- null_model(mode = "regression") basic_nullmodel <- translate(basic %>% set_engine("parsnip")) expect_equal(basic_nullmodel$method$fit$args, list( @@ -20,7 +20,7 @@ test_that('primary arguments', { test_that('engine arguments', { - nullmodel_keep <- nullmodel(mode = "regression") + nullmodel_keep <- null_model(mode = "regression") expect_equal(translate(nullmodel_keep %>% set_engine("parsnip", keepxy = FALSE))$method$fit$args, list( x = expr(missing_arg()), @@ -31,12 +31,12 @@ test_that('engine arguments', { }) test_that('bad input', { - expect_error(translate(nullmodel() %>% set_engine("wat?"))) - expect_error(translate(nullmodel(mode = "regression") %>% set_engine())) - expect_error(translate(nullmodel(formula = y ~ x))) + expect_error(translate(null_model() %>% set_engine("wat?"))) + expect_error(translate(null_model(mode = "regression") %>% set_engine())) + expect_error(translate(null_model(formula = y ~ x))) expect_warning( translate( - nullmodel(mode = "regression") %>% set_engine("parsnip", x = iris[,1:3], y = iris$Species) + null_model(mode = "regression") %>% set_engine("parsnip", x = iris[,1:3], y = iris$Species) ) ) }) @@ -45,7 +45,7 @@ test_that('bad input', { num_pred <- c("Sepal.Width", "Petal.Width", "Petal.Length") iris_bad_form <- as.formula(Species ~ term) -iris_basic <- nullmodel(mode = "regression") %>% set_engine("parsnip") +iris_basic <- null_model(mode = "regression") %>% set_engine("parsnip") # ------------------------------------------------------------------------------ @@ -111,11 +111,11 @@ test_that('nullmodel prediction', { test_that('classification', { expect_error( - nullmodel <- nullmodel(mode = "classification") %>% + null_model <- null_model(mode = "classification") %>% set_engine("parsnip") %>% fit(Species ~ ., data = iris), regexp = NA ) - expect_true(!is.null(nullmodel$fit)) + expect_true(!is.null(null_model$fit)) }) From 3723c877a2d147cb2cc98a07f2b805be36be4bf2 Mon Sep 17 00:00:00 2001 From: EmilHvitfeldt Date: Fri, 25 Jan 2019 17:35:32 -0800 Subject: [PATCH 3/6] linting + length of predict fix --- R/nullmodel.R | 16 ++++++---------- R/nullmodel_data.R | 2 +- man/nullmodel.Rd | 4 ++-- tests/testthat/test-nullmodel.R | 21 +++++++-------------- 4 files changed, 16 insertions(+), 27 deletions(-) diff --git a/R/nullmodel.R b/R/nullmodel.R index 3c9cb5f00..7cb8babd5 100644 --- a/R/nullmodel.R +++ b/R/nullmodel.R @@ -15,7 +15,7 @@ #' outcomes #' @param \dots Optional arguments (not yet used) #' @param object An object of class \code{nullmodel} -#' @param newdata A matrix or data frame of predictors (only used to determine +#' @param new_data A matrix or data frame of predictors (only used to determine #' the number of predictions to return) #' @param type Either "raw" (for regression), "class" or "prob" (for #' classification) @@ -85,18 +85,14 @@ print.nullmodel <- function(x, ...) #' @export #' @rdname nullmodel -predict.nullmodel <- function (object, newdata = NULL, type = NULL, ...) -{ - if(is.null(type)) - { +predict.nullmodel <- function (object, new_data = NULL, type = NULL, ...) { + if(is.null(type)) { type <- if(is.null(object$levels)) "raw" else "class" } - n <- if(is.null(newdata)) object$n else nrow(newdata) - if(!is.null(object$levels)) - { - if(type == "prob") - { + n <- if(is.null(new_data)) object$n else nrow(new_data) + if(!is.null(object$levels)) { + if(type == "prob") { out <- matrix(rep(object$pct, n), nrow = n, byrow = TRUE) colnames(out) <- object$levels out <- as.data.frame(out) diff --git a/R/nullmodel_data.R b/R/nullmodel_data.R index 632ad609a..3de382220 100644 --- a/R/nullmodel_data.R +++ b/R/nullmodel_data.R @@ -41,7 +41,7 @@ null_model_parsnip_data <- list( object = quote(object$fit), new_data = quote(new_data), - type = "raw" + type = "numeric" ) ), raw = list( diff --git a/man/nullmodel.Rd b/man/nullmodel.Rd index cffbd87cb..d13596d93 100644 --- a/man/nullmodel.Rd +++ b/man/nullmodel.Rd @@ -13,7 +13,7 @@ nullmodel(x, ...) \method{print}{nullmodel}(x, ...) -\method{predict}{nullmodel}(object, newdata = NULL, type = NULL, ...) +\method{predict}{nullmodel}(object, new_data = NULL, type = NULL, ...) } \arguments{ \item{x}{An optional matrix or data frame of predictors. These values are @@ -26,7 +26,7 @@ outcomes} \item{object}{An object of class \code{nullmodel}} -\item{newdata}{A matrix or data frame of predictors (only used to determine +\item{new_data}{A matrix or data frame of predictors (only used to determine the number of predictions to return)} \item{type}{Either "raw" (for regression), "class" or "prob" (for diff --git a/tests/testthat/test-nullmodel.R b/tests/testthat/test-nullmodel.R index 14f2af5d7..8f775ccbc 100644 --- a/tests/testthat/test-nullmodel.R +++ b/tests/testthat/test-nullmodel.R @@ -43,7 +43,7 @@ test_that('bad input', { # ------------------------------------------------------------------------------ -num_pred <- c("Sepal.Width", "Petal.Width", "Petal.Length") +num_pred <-c("Sepal.Length", "Sepal.Width", "Petal.Width") iris_bad_form <- as.formula(Species ~ term) iris_basic <- null_model(mode = "regression") %>% set_engine("parsnip") @@ -63,7 +63,7 @@ test_that('nullmodel execution', { res <- fit_xy( iris_basic, x = iris[, num_pred], - y = iris$Sepal.Length + y = iris$Petal.Length ), regexp = NA ) @@ -80,27 +80,20 @@ test_that('nullmodel execution', { test_that('nullmodel prediction', { - uni_pred <- tibble(.pred = rep(5.843333, 5)) - inl_pred <- rep(5.843333, 5) - - mv_pred <- - structure( - list(Sepal.Width = seq_len(10), - Petal.Width = seq_len(10)), - class = "data.frame", - row.names = c(NA, -10L)) + uni_pred <- tibble(.pred = rep(3.758, 5)) + inl_pred <- rep(3.758, 5) res_xy <- fit_xy( iris_basic, x = iris[, num_pred], - y = iris$Sepal.Length + y = iris$Petal.Length ) - expect_equal(uni_pred, predict(res_xy, new_data = iris[1, num_pred])) + expect_equal(uni_pred, predict(res_xy, new_data = iris[1:5, num_pred])) res_form <- fit( iris_basic, - Sepal.Length ~ log(Sepal.Width) + Species, + Petal.Length ~ log(Sepal.Width) + Species, data = iris ) expect_equal(inl_pred, predict_numeric(res_form, iris[1:5, ])) From d767d267b95ad702f5c05e7e1b8b408ce0bece43 Mon Sep 17 00:00:00 2001 From: EmilHvitfeldt Date: Fri, 25 Jan 2019 18:10:47 -0800 Subject: [PATCH 4/6] more lint --- R/nullmodel.R | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/R/nullmodel.R b/R/nullmodel.R index 7cb8babd5..4e205ce64 100644 --- a/R/nullmodel.R +++ b/R/nullmodel.R @@ -46,11 +46,9 @@ nullmodel <- function (x, ...) UseMethod("nullmodel") #' @export #' @rdname nullmodel -nullmodel.default <- function(x = NULL, y, ...) -{ +nullmodel.default <- function(x = NULL, y, ...) { - if(is.factor(y)) - { + if(is.factor(y)) { lvls <- levels(y) tab <- table(y) value <- names(tab)[which.max(tab)] @@ -71,8 +69,7 @@ nullmodel.default <- function(x = NULL, y, ...) #' @export #' @rdname nullmodel -print.nullmodel <- function(x, ...) -{ +print.nullmodel <- function(x, ...) { cat("Null", ifelse(is.null(x$levels), "Classification", "Regression"), "Model\n") From 4a37edb772a57aae8858f8a5992dbe43e0f5eff0 Mon Sep 17 00:00:00 2001 From: EmilHvitfeldt Date: Fri, 25 Jan 2019 18:49:33 -0800 Subject: [PATCH 5/6] library(parsnip) iris_basic <- null_model(mode = "regression") %>% set_engine("parsnip") %>% fit(cbind(Sepal.Width) ~ ., data = iris) predict_numeric(iris_basic, iris[1:5,]) iris_basic <- null_model(mode = "regression") %>% set_engine("parsnip") %>% fit(cbind(Sepal.Width, Petal.Width) ~ ., data = iris) predict_numeric(iris_basic, iris[1:5,]) --- R/nullmodel.R | 32 ++++++++++++++++++++++++++------ tests/testthat/test-nullmodel.R | 22 ++++++++++++++++++++++ 2 files changed, 48 insertions(+), 6 deletions(-) diff --git a/R/nullmodel.R b/R/nullmodel.R index 4e205ce64..84ec41cc6 100644 --- a/R/nullmodel.R +++ b/R/nullmodel.R @@ -48,6 +48,7 @@ nullmodel <- function (x, ...) UseMethod("nullmodel") #' @rdname nullmodel nullmodel.default <- function(x = NULL, y, ...) { + if(is.factor(y)) { lvls <- levels(y) tab <- table(y) @@ -56,14 +57,19 @@ nullmodel.default <- function(x = NULL, y, ...) { } else { lvls <- NULL pct <- NULL - value <- mean(y, na.rm = TRUE) + if(is.null(dim(y))) { + value <- mean(y, na.rm = TRUE) + } else { + value <- colMeans(y, na.rm = TRUE) + } } + structure( list(call = match.call(), value = value, levels = lvls, pct = pct, - n = length(y)), + n = length(y[[1]])), class = "nullmodel") } @@ -75,9 +81,16 @@ print.nullmodel <- function(x, ...) { "Model\n") x$call - cat("Predicted Value:", - ifelse(is.null(x$levels), format(x$value), x$value), - "\n") + if (length(x$value) == 1) { + cat("Predicted Value:", + ifelse(is.null(x$levels), format(x$value), x$value), + "\n") + } else { + cat("Predicted Value:\n", + names(x$value), "\n", + x$value, + "\n") + } } #' @export @@ -98,7 +111,14 @@ predict.nullmodel <- function (object, new_data = NULL, type = NULL, ...) { } } else { if(type %in% c("prob", "class")) stop("ony raw predicitons are applicable to regression models") - out <- rep(object$value, n) + if(length(object$value) == 1) { + out <- rep(object$value, n) + } else { + out <- as_tibble(matrix(rep(object$value, n), + ncol = length(object$value), byrow = TRUE)) + + names(out) <- names(object$value) + } } out } diff --git a/tests/testthat/test-nullmodel.R b/tests/testthat/test-nullmodel.R index 8f775ccbc..2d82f59c6 100644 --- a/tests/testthat/test-nullmodel.R +++ b/tests/testthat/test-nullmodel.R @@ -76,12 +76,25 @@ test_that('nullmodel execution', { ) ) + ## multivariate y + + expect_error( + res <- fit( + iris_basic, + cbind(Sepal.Width, Petal.Width) ~ ., + data = iris + ), + regexp = NA + ) + }) test_that('nullmodel prediction', { uni_pred <- tibble(.pred = rep(3.758, 5)) inl_pred <- rep(3.758, 5) + mw_pred <- tibble(gear = rep(3.6875, 5), + carb = rep(2.8125, 5)) res_xy <- fit_xy( iris_basic, @@ -97,6 +110,15 @@ test_that('nullmodel prediction', { data = iris ) expect_equal(inl_pred, predict_numeric(res_form, iris[1:5, ])) + + # Multivariate y + res <- fit( + iris_basic, + cbind(gear, carb) ~ ., + data = mtcars + ) + + expect_equal(mw_pred, predict_numeric(res, mtcars[1:5, ])) }) # ------------------------------------------------------------------------------ From 0ca0445c134eafc659c33a7e4aea548719b53362 Mon Sep 17 00:00:00 2001 From: EmilHvitfeldt Date: Tue, 29 Jan 2019 13:09:05 -0800 Subject: [PATCH 6/6] added classprob prediction type --- R/nullmodel_data.R | 18 ++++++++++++++++-- .../{test-nullmodel.R => test_nullmodel.R} | 1 + 2 files changed, 17 insertions(+), 2 deletions(-) rename tests/testthat/{test-nullmodel.R => test_nullmodel.R} (99%) diff --git a/R/nullmodel_data.R b/R/nullmodel_data.R index 3de382220..80380a98a 100644 --- a/R/nullmodel_data.R +++ b/R/nullmodel_data.R @@ -23,14 +23,28 @@ null_model_parsnip_data <- defaults = list() ), class = list( + pre = NULL, + post = NULL, + func = c(fun = "predict"), + args = + list( + object = quote(object$fit), + new_data = quote(new_data), + type = "class" + ) + ), + classprob = list( pre = NULL, - post = NULL, + post = function(x, object) { + str(as_tibble(x)) + as_tibble(x) + }, func = c(fun = "predict"), args = list( object = quote(object$fit), new_data = quote(new_data), - type = "class" + type = "prob" ) ), numeric = list( diff --git a/tests/testthat/test-nullmodel.R b/tests/testthat/test_nullmodel.R similarity index 99% rename from tests/testthat/test-nullmodel.R rename to tests/testthat/test_nullmodel.R index 2d82f59c6..45a6a9932 100644 --- a/tests/testthat/test-nullmodel.R +++ b/tests/testthat/test_nullmodel.R @@ -134,3 +134,4 @@ test_that('classification', { expect_true(!is.null(null_model$fit)) }) +