diff --git a/NAMESPACE b/NAMESPACE index 8ce771ce7..6cbc69a7c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -9,10 +9,12 @@ S3method(multi_predict,"_lognet") S3method(multi_predict,"_multnet") S3method(multi_predict,"_xgb.Booster") S3method(multi_predict,default) +S3method(nullmodel,default) S3method(predict,"_elnet") S3method(predict,"_lognet") S3method(predict,"_multnet") S3method(predict,model_fit) +S3method(predict,nullmodel) S3method(predict_class,"_lognet") S3method(predict_class,model_fit) S3method(predict_classprob,"_lognet") @@ -37,6 +39,7 @@ S3method(print,model_fit) S3method(print,model_spec) S3method(print,multinom_reg) S3method(print,nearest_neighbor) +S3method(print,nullmodel) S3method(print,rand_forest) S3method(print,surv_reg) S3method(print,svm_poly) @@ -95,6 +98,8 @@ export(model_printer) export(multi_predict) export(multinom_reg) export(nearest_neighbor) +export(null_model) +export(nullmodel) export(predict.model_fit) export(predict_class) export(predict_class.model_fit) diff --git a/R/nullmodel.R b/R/nullmodel.R new file mode 100644 index 000000000..84ec41cc6 --- /dev/null +++ b/R/nullmodel.R @@ -0,0 +1,178 @@ +#' Fit a simple, non-informative model +#' +#' Fit a single mean or largest class model +#' +#' \code{nullmodel} emulates other model building functions, but returns the +#' simplest model possible given a training set: a single mean for numeric +#' outcomes and the most prevalent class for factor outcomes. When class +#' probabilities are requested, the percentage of the training set samples with +#' the most prevalent class is returned. +#' +#' @aliases nullmodel nullmodel.default predict.nullmodel +#' @param x An optional matrix or data frame of predictors. These values are +#' not used in the model fit +#' @param y A numeric vector (for regression) or factor (for classification) of +#' outcomes +#' @param \dots Optional arguments (not yet used) +#' @param object An object of class \code{nullmodel} +#' @param new_data A matrix or data frame of predictors (only used to determine +#' the number of predictions to return) +#' @param type Either "raw" (for regression), "class" or "prob" (for +#' classification) +#' @return The output of \code{nullmodel} is a list of class \code{nullmodel} +#' with elements \item{call }{the function call} \item{value }{the mean of +#' \code{y} or the most prevalent class} \item{levels }{when \code{y} is a +#' factor, a vector of levels. \code{NULL} otherwise} \item{pct }{when \code{y} +#' is a factor, a data frame with a column for each class (\code{NULL} +#' otherwise). The column for the most prevalent class has the proportion of +#' the training samples with that class (the other columns are zero). } \item{n +#' }{the number of elements in \code{y}} +#' +#' \code{predict.nullmodel} returns a either a factor or numeric vector +#' depending on the class of \code{y}. All predictions are always the same. +#' @keywords models +#' @examples +#' +#' outcome <- factor(sample(letters[1:2], +#' size = 100, +#' prob = c(.1, .9), +#' replace = TRUE)) +#' useless <- nullmodel(y = outcome) +#' useless +#' predict(useless, matrix(NA, nrow = 5)) +#' +#' @export +nullmodel <- function (x, ...) UseMethod("nullmodel") + +#' @export +#' @rdname nullmodel +nullmodel.default <- function(x = NULL, y, ...) { + + + if(is.factor(y)) { + lvls <- levels(y) + tab <- table(y) + value <- names(tab)[which.max(tab)] + pct <- tab/sum(tab) + } else { + lvls <- NULL + pct <- NULL + if(is.null(dim(y))) { + value <- mean(y, na.rm = TRUE) + } else { + value <- colMeans(y, na.rm = TRUE) + } + } + + structure( + list(call = match.call(), + value = value, + levels = lvls, + pct = pct, + n = length(y[[1]])), + class = "nullmodel") +} + +#' @export +#' @rdname nullmodel +print.nullmodel <- function(x, ...) { + cat("Null", + ifelse(is.null(x$levels), "Classification", "Regression"), + "Model\n") + x$call + + if (length(x$value) == 1) { + cat("Predicted Value:", + ifelse(is.null(x$levels), format(x$value), x$value), + "\n") + } else { + cat("Predicted Value:\n", + names(x$value), "\n", + x$value, + "\n") + } +} + +#' @export +#' @rdname nullmodel +predict.nullmodel <- function (object, new_data = NULL, type = NULL, ...) { + if(is.null(type)) { + type <- if(is.null(object$levels)) "raw" else "class" + } + + n <- if(is.null(new_data)) object$n else nrow(new_data) + if(!is.null(object$levels)) { + if(type == "prob") { + out <- matrix(rep(object$pct, n), nrow = n, byrow = TRUE) + colnames(out) <- object$levels + out <- as.data.frame(out) + } else { + out <- factor(rep(object$value, n), levels = object$levels) + } + } else { + if(type %in% c("prob", "class")) stop("ony raw predicitons are applicable to regression models") + if(length(object$value) == 1) { + out <- rep(object$value, n) + } else { + out <- as_tibble(matrix(rep(object$value, n), + ncol = length(object$value), byrow = TRUE)) + + names(out) <- names(object$value) + } + } + out +} + +#' General Interface for null models +#' +#' `null_model` is a way to generate a _specification_ of a model before +#' fitting and allows the model to be created using R. It doens't have any +#' main arguments. +#' +#' @param mode A single character string for the type of model. +#' Possible values for this model are "unknown", "regression", or +#' "classification". +#' @details The model can be created using the `fit()` function using the +#' following _engines_: +#' \itemize{ +#' \item \pkg{R}: `"parsnip"` +#' } +#' +#' @section Engine Details: +#' +#' Engines may have pre-set default arguments when executing the +#' model fit call. For this type of +#' model, the template of the fit calls are: +#' +#' \pkg{parsnip} classification +#' +#' \Sexpr[results=rd]{parsnip:::show_fit(parsnip:::null_model(mode = "classification"), "parsnip")} +#' +#' \pkg{parsnip} regression +#' +#' \Sexpr[results=rd]{parsnip:::show_fit(parsnip:::null_model(mode = "regression"), "parsnip")} +#' +#' @importFrom purrr map_lgl +#' @seealso [varying()], [fit()] +#' @examples +#' null_model(mode = "regression") +#' @export +null_model <- + function(mode = "classification") { + # Check for correct mode + if (!(mode %in% null_model_modes)) + stop("`mode` should be one of: ", + paste0("'", null_model_modes, "'", collapse = ", "), + call. = FALSE) + + # Capture the arguments in quosures + args <- list() + + # Save some empty slots for future parts of the specification + out <- list(args = args, eng_args = NULL, + mode = mode, method = NULL, engine = NULL) + + # set classes in the correct order + class(out) <- make_classes("null_model") + out + } diff --git a/R/nullmodel_data.R b/R/nullmodel_data.R new file mode 100644 index 000000000..80380a98a --- /dev/null +++ b/R/nullmodel_data.R @@ -0,0 +1,72 @@ +null_model_arg_key <- data.frame( + parsnip = NULL, + row.names = NULL, + stringsAsFactors = FALSE +) + +null_model_modes <- c("classification", "regression", "unknown") + +null_model_engines <- data.frame( + parsnip = c(TRUE, TRUE, FALSE), + row.names = c("classification", "regression", "unknown") +) + +# ------------------------------------------------------------------------------ + +null_model_parsnip_data <- + list( + libs = "parsnip", + fit = list( + interface = "matrix", + protect = c("x", "y"), + func = c(fun = "nullmodel"), + defaults = list() + ), + class = list( + pre = NULL, + post = NULL, + func = c(fun = "predict"), + args = + list( + object = quote(object$fit), + new_data = quote(new_data), + type = "class" + ) + ), + classprob = list( + pre = NULL, + post = function(x, object) { + str(as_tibble(x)) + as_tibble(x) + }, + func = c(fun = "predict"), + args = + list( + object = quote(object$fit), + new_data = quote(new_data), + type = "prob" + ) + ), + numeric = list( + pre = NULL, + post = NULL, + func = c(fun = "predict"), + args = + list( + object = quote(object$fit), + new_data = quote(new_data), + type = "numeric" + ) + ), + raw = list( + pre = NULL, + post = NULL, + func = c(fun = "predict"), + args = + list( + object = quote(object$fit), + new_data = quote(new_data), + type = "raw" + ) + ) + ) diff --git a/man/null_model.Rd b/man/null_model.Rd new file mode 100644 index 000000000..d1b709214 --- /dev/null +++ b/man/null_model.Rd @@ -0,0 +1,47 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/nullmodel.R +\name{null_model} +\alias{null_model} +\title{General Interface for null models} +\usage{ +null_model(mode = "classification") +} +\arguments{ +\item{mode}{A single character string for the type of model. +Possible values for this model are "unknown", "regression", or +"classification".} +} +\description{ +\code{null_model} is a way to generate a \emph{specification} of a model before +fitting and allows the model to be created using R. It doens't have any +main arguments. +} +\details{ +The model can be created using the \code{fit()} function using the +following \emph{engines}: +\itemize{ +\item \pkg{R}: \code{"parsnip"} +} +} +\section{Engine Details}{ + + +Engines may have pre-set default arguments when executing the +model fit call. For this type of +model, the template of the fit calls are: + +\pkg{parsnip} classification + +\Sexpr[results=rd]{parsnip:::show_fit(parsnip:::null_model(mode = "classification"), "parsnip")} + +\pkg{parsnip} regression + +\Sexpr[results=rd]{parsnip:::show_fit(parsnip:::null_model(mode = "regression"), "parsnip")} +} + +\examples{ +null_model(mode = "regression") +} +\seealso{ +\code{\link[=varying]{varying()}}, \code{\link[=fit]{fit()}} +} diff --git a/man/nullmodel.Rd b/man/nullmodel.Rd new file mode 100644 index 000000000..d13596d93 --- /dev/null +++ b/man/nullmodel.Rd @@ -0,0 +1,69 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/nullmodel.R +\name{nullmodel} +\alias{nullmodel} +\alias{nullmodel.default} +\alias{predict.nullmodel} +\alias{print.nullmodel} +\title{Fit a simple, non-informative model} +\usage{ +nullmodel(x, ...) + +\method{nullmodel}{default}(x = NULL, y, ...) + +\method{print}{nullmodel}(x, ...) + +\method{predict}{nullmodel}(object, new_data = NULL, type = NULL, ...) +} +\arguments{ +\item{x}{An optional matrix or data frame of predictors. These values are +not used in the model fit} + +\item{\dots}{Optional arguments (not yet used)} + +\item{y}{A numeric vector (for regression) or factor (for classification) of +outcomes} + +\item{object}{An object of class \code{nullmodel}} + +\item{new_data}{A matrix or data frame of predictors (only used to determine +the number of predictions to return)} + +\item{type}{Either "raw" (for regression), "class" or "prob" (for +classification)} +} +\value{ +The output of \code{nullmodel} is a list of class \code{nullmodel} +with elements \item{call }{the function call} \item{value }{the mean of +\code{y} or the most prevalent class} \item{levels }{when \code{y} is a +factor, a vector of levels. \code{NULL} otherwise} \item{pct }{when \code{y} +is a factor, a data frame with a column for each class (\code{NULL} +otherwise). The column for the most prevalent class has the proportion of +the training samples with that class (the other columns are zero). } \item{n +}{the number of elements in \code{y}} + +\code{predict.nullmodel} returns a either a factor or numeric vector +depending on the class of \code{y}. All predictions are always the same. +} +\description{ +Fit a single mean or largest class model +} +\details{ +\code{nullmodel} emulates other model building functions, but returns the +simplest model possible given a training set: a single mean for numeric +outcomes and the most prevalent class for factor outcomes. When class +probabilities are requested, the percentage of the training set samples with +the most prevalent class is returned. +} +\examples{ + +outcome <- factor(sample(letters[1:2], + size = 100, + prob = c(.1, .9), + replace = TRUE)) +useless <- nullmodel(y = outcome) +useless +predict(useless, matrix(NA, nrow = 5)) + +} +\keyword{models} diff --git a/tests/testthat/test_nullmodel.R b/tests/testthat/test_nullmodel.R new file mode 100644 index 000000000..45a6a9932 --- /dev/null +++ b/tests/testthat/test_nullmodel.R @@ -0,0 +1,137 @@ +library(testthat) +library(parsnip) +library(rlang) +library(tibble) + +context("test-nullmodel") +source("helpers.R") + + +test_that('primary arguments', { + basic <- null_model(mode = "regression") + basic_nullmodel <- translate(basic %>% set_engine("parsnip")) + expect_equal(basic_nullmodel$method$fit$args, + list( + x = expr(missing_arg()), + y = expr(missing_arg()) + ) + ) +}) + + +test_that('engine arguments', { + nullmodel_keep <- null_model(mode = "regression") + expect_equal(translate(nullmodel_keep %>% set_engine("parsnip", keepxy = FALSE))$method$fit$args, + list( + x = expr(missing_arg()), + y = expr(missing_arg()), + keepxy = new_empty_quosure(FALSE) + ) + ) +}) + +test_that('bad input', { + expect_error(translate(null_model() %>% set_engine("wat?"))) + expect_error(translate(null_model(mode = "regression") %>% set_engine())) + expect_error(translate(null_model(formula = y ~ x))) + expect_warning( + translate( + null_model(mode = "regression") %>% set_engine("parsnip", x = iris[,1:3], y = iris$Species) + ) + ) +}) + +# ------------------------------------------------------------------------------ + +num_pred <-c("Sepal.Length", "Sepal.Width", "Petal.Width") +iris_bad_form <- as.formula(Species ~ term) +iris_basic <- null_model(mode = "regression") %>% set_engine("parsnip") + +# ------------------------------------------------------------------------------ + +test_that('nullmodel execution', { + + expect_error( + res <- fit( + iris_basic, + Sepal.Length ~ log(Sepal.Width) + Species, + data = iris + ), + regexp = NA + ) + expect_error( + res <- fit_xy( + iris_basic, + x = iris[, num_pred], + y = iris$Petal.Length + ), + regexp = NA + ) + + expect_error( + res <- fit( + iris_basic, + iris_bad_form, + data = iris + ) + ) + + ## multivariate y + + expect_error( + res <- fit( + iris_basic, + cbind(Sepal.Width, Petal.Width) ~ ., + data = iris + ), + regexp = NA + ) + +}) + +test_that('nullmodel prediction', { + + uni_pred <- tibble(.pred = rep(3.758, 5)) + inl_pred <- rep(3.758, 5) + mw_pred <- tibble(gear = rep(3.6875, 5), + carb = rep(2.8125, 5)) + + res_xy <- fit_xy( + iris_basic, + x = iris[, num_pred], + y = iris$Petal.Length + ) + + expect_equal(uni_pred, predict(res_xy, new_data = iris[1:5, num_pred])) + + res_form <- fit( + iris_basic, + Petal.Length ~ log(Sepal.Width) + Species, + data = iris + ) + expect_equal(inl_pred, predict_numeric(res_form, iris[1:5, ])) + + # Multivariate y + res <- fit( + iris_basic, + cbind(gear, carb) ~ ., + data = mtcars + ) + + expect_equal(mw_pred, predict_numeric(res, mtcars[1:5, ])) +}) + +# ------------------------------------------------------------------------------ + +test_that('classification', { + + expect_error( + null_model <- null_model(mode = "classification") %>% + set_engine("parsnip") %>% + fit(Species ~ ., data = iris), + regexp = NA + ) + expect_true(!is.null(null_model$fit)) +}) + +