tidymodels · topepo · Feb 25, 2019 · Jan 23, 2019 · Jan 25, 2019 · Jan 26, 2019
diff --git a/NAMESPACE b/NAMESPACE
@@ -9,10 +9,12 @@ S3method(multi_predict,"_lognet")
 S3method(multi_predict,"_multnet")
 S3method(multi_predict,"_xgb.Booster")
 S3method(multi_predict,default)
+S3method(nullmodel,default)
 S3method(predict,"_elnet")
 S3method(predict,"_lognet")
 S3method(predict,"_multnet")
 S3method(predict,model_fit)
+S3method(predict,nullmodel)
 S3method(predict_class,"_lognet")
 S3method(predict_class,model_fit)
 S3method(predict_classprob,"_lognet")
@@ -37,6 +39,7 @@ S3method(print,model_fit)
 S3method(print,model_spec)
 S3method(print,multinom_reg)
 S3method(print,nearest_neighbor)
+S3method(print,nullmodel)
 S3method(print,rand_forest)
 S3method(print,surv_reg)
 S3method(print,svm_poly)
@@ -95,6 +98,8 @@ export(model_printer)
 export(multi_predict)
 export(multinom_reg)
 export(nearest_neighbor)
+export(null_model)
+export(nullmodel)
 export(predict.model_fit)
 export(predict_class)
 export(predict_class.model_fit)

diff --git a/R/nullmodel.R b/R/nullmodel.R
@@ -0,0 +1,178 @@
+#' Fit a simple, non-informative model
+#'
+#' Fit a single mean or largest class model
+#'
+#' \code{nullmodel} emulates other model building functions, but returns the
+#' simplest model possible given a training set: a single mean for numeric
+#' outcomes and the most prevalent class for factor outcomes. When class
+#' probabilities are requested, the percentage of the training set samples with
+#' the most prevalent class is returned.
+#'
+#' @aliases nullmodel nullmodel.default predict.nullmodel
+#' @param x An optional matrix or data frame of predictors. These values are
+#' not used in the model fit
+#' @param y A numeric vector (for regression) or factor (for classification) of
+#' outcomes
+#' @param \dots Optional arguments (not yet used)
+#' @param object An object of class \code{nullmodel}
+#' @param new_data A matrix or data frame of predictors (only used to determine
+#' the number of predictions to return)
+#' @param type Either "raw" (for regression), "class" or "prob" (for
+#' classification)
+#' @return The output of \code{nullmodel} is a list of class \code{nullmodel}
+#' with elements \item{call }{the function call} \item{value }{the mean of
+#' \code{y} or the most prevalent class} \item{levels }{when \code{y} is a
+#' factor, a vector of levels. \code{NULL} otherwise} \item{pct }{when \code{y}
+#' is a factor, a data frame with a column for each class (\code{NULL}
+#' otherwise). The column for the most prevalent class has the proportion of
+#' the training samples with that class (the other columns are zero). } \item{n
+#' }{the number of elements in \code{y}}
+#'
+#' \code{predict.nullmodel} returns a either a factor or numeric vector
+#' depending on the class of \code{y}. All predictions are always the same.
+#' @keywords models
+#' @examples
+#'
+#' outcome <- factor(sample(letters[1:2],
+#'                          size = 100,
+#'                          prob = c(.1, .9),
+#'                          replace = TRUE))
+#' useless <- nullmodel(y = outcome)
+#' useless
+#' predict(useless, matrix(NA, nrow = 5))
+#'
+#' @export
+nullmodel <- function (x, ...) UseMethod("nullmodel")
+
+#' @export
+#' @rdname nullmodel
+nullmodel.default <- function(x = NULL, y, ...) {
+
+
+  if(is.factor(y)) {
+    lvls <- levels(y)
+    tab <- table(y)
+    value <- names(tab)[which.max(tab)]
+    pct <- tab/sum(tab)
+  } else {
+    lvls <- NULL
+    pct <- NULL
+    if(is.null(dim(y))) {
+      value <- mean(y, na.rm = TRUE)
+    } else {
+      value <- colMeans(y, na.rm = TRUE)
+    }
+  }
+
+  structure(
+    list(call = match.call(),
+         value = value,
+         levels = lvls,
+         pct = pct,
+         n = length(y[[1]])),
+    class = "nullmodel")
+}
+
+#' @export
+#' @rdname nullmodel
+print.nullmodel <- function(x, ...) {
+  cat("Null",
+      ifelse(is.null(x$levels), "Classification", "Regression"),
+      "Model\n")
+  x$call
+
+  if (length(x$value) == 1) {
+    cat("Predicted Value:",
+        ifelse(is.null(x$levels), format(x$value), x$value),
+        "\n")
+  } else {
+    cat("Predicted Value:\n",
+        names(x$value), "\n",
+        x$value,
+        "\n")
+  }
+}
+
+#' @export
+#' @rdname nullmodel
+predict.nullmodel <- function (object, new_data = NULL, type  = NULL, ...) {
+  if(is.null(type)) {
+    type <- if(is.null(object$levels)) "raw" else "class"
+  }
+
+  n <- if(is.null(new_data)) object$n else nrow(new_data)
+  if(!is.null(object$levels)) {
+    if(type == "prob") {
+      out <- matrix(rep(object$pct, n), nrow = n, byrow = TRUE)
+      colnames(out) <- object$levels
+      out <- as.data.frame(out)
+    } else {
+      out <- factor(rep(object$value, n), levels = object$levels)
+    }
+  } else {
+    if(type %in% c("prob", "class")) stop("ony raw predicitons are applicable to regression models")
+    if(length(object$value) == 1) {
+      out <- rep(object$value, n)
+    } else {
+      out <- as_tibble(matrix(rep(object$value, n),
+                              ncol = length(object$value), byrow = TRUE))
+
+      names(out) <- names(object$value)
+    }
+  }
+  out
+}
+
+#' General Interface for null models
+#'
+#' `null_model` is a way to generate a _specification_ of a model before
+#'  fitting and allows the model to be created using R. It doens't have any
+#'  main arguments.
+#'
+#' @param mode A single character string for the type of model.
+#'  Possible values for this model are "unknown", "regression", or
+#'  "classification".
+#' @details The model can be created using the `fit()` function using the
+#'  following _engines_:
+#' \itemize{
+#' \item \pkg{R}:  `"parsnip"`
+#' }
+#'
+#' @section Engine Details:
+#'
+#' Engines may have pre-set default arguments when executing the
+#'  model fit call.  For this type of
+#'  model, the template of the fit calls are:
+#'
+#' \pkg{parsnip} classification
+#'
+#' \Sexpr[results=rd]{parsnip:::show_fit(parsnip:::null_model(mode = "classification"), "parsnip")}
+#'
+#' \pkg{parsnip} regression
+#'
+#' \Sexpr[results=rd]{parsnip:::show_fit(parsnip:::null_model(mode = "regression"), "parsnip")}
+#'
+#' @importFrom purrr map_lgl
+#' @seealso [varying()], [fit()]
+#' @examples
+#' null_model(mode = "regression")
+#' @export
+null_model <-
+  function(mode = "classification") {
+    # Check for correct mode
+    if (!(mode %in% null_model_modes))
+      stop("`mode` should be one of: ",
+           paste0("'", null_model_modes, "'", collapse = ", "),
+           call. = FALSE)
+
+    # Capture the arguments in quosures
+    args <- list()
+
+    # Save some empty slots for future parts of the specification
+    out <- list(args = args, eng_args = NULL,
+                mode = mode, method = NULL, engine = NULL)
+
+    # set classes in the correct order
+    class(out) <- make_classes("null_model")
+    out
+  }
diff --git a/R/nullmodel_data.R b/R/nullmodel_data.R
@@ -0,0 +1,72 @@
+null_model_arg_key <- data.frame(
+  parsnip   =  NULL,
+  row.names =  NULL,
+  stringsAsFactors = FALSE
+)
+
+null_model_modes <- c("classification", "regression", "unknown")
+
+null_model_engines <- data.frame(
+  parsnip   = c(TRUE, TRUE, FALSE),
+  row.names =  c("classification", "regression", "unknown")
+)
+
+# ------------------------------------------------------------------------------
+
+null_model_parsnip_data <-
+  list(
+    libs = "parsnip",
+    fit = list(
+      interface = "matrix",
+      protect = c("x", "y"),
+      func = c(fun = "nullmodel"),
+      defaults = list()
+      ),
+    class = list(
+      pre = NULL,
+      post = NULL,
+      func = c(fun = "predict"),
+      args =
+        list(
+          object = quote(object$fit),
+          new_data = quote(new_data),
+          type = "class"
+        )
+    ),
+    classprob = list(
+        pre = NULL,
+        post = function(x, object) {
+          str(as_tibble(x))
+          as_tibble(x)
+        },
+        func = c(fun = "predict"),
+        args =
+          list(
+            object = quote(object$fit),
+            new_data = quote(new_data),
+            type = "prob"
+          )
+      ),
+    numeric = list(
+      pre = NULL,
+      post = NULL,
+      func = c(fun = "predict"),
+      args =
+        list(
+          object = quote(object$fit),
+          new_data = quote(new_data),
+          type = "numeric"
+        )
+      ),
+    raw = list(
+      pre = NULL,
+      post = NULL,
+      func = c(fun = "predict"),
+      args =
+        list(
+          object = quote(object$fit),
+          new_data = quote(new_data),
+          type = "raw"
+        )
+      )
+    )
diff --git a/man/null_model.Rd b/man/null_model.Rd
diff --git a/man/nullmodel.Rd b/man/nullmodel.Rd