tidymodels · topepo · Oct 5, 2020 · Sep 29, 2020 · Sep 29, 2020 · Sep 29, 2020
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -31,7 +31,7 @@ Imports:
     prettyunits,
     vctrs (>= 0.2.0)
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.1.1
+RoxygenNote: 7.1.1.9000
 Suggests: 
     testthat,
     knitr,
@@ -46,9 +46,10 @@ Suggests:
     kernlab,
     kknn,
     randomForest,
-    ranger,
+    ranger (>= 0.12.0),
     rpart,
     MASS,
     nlme,
     modeldata,
-    liquidSVM
+    liquidSVM,
+    Matrix
diff --git a/NAMESPACE b/NAMESPACE
@@ -130,6 +130,8 @@ export(linear_reg)
 export(logistic_reg)
 export(make_classes)
 export(mars)
+export(maybe_data_frame)
+export(maybe_matrix)
 export(mlp)
 export(model_printer)
 export(multi_predict)

diff --git a/NEWS.md b/NEWS.md
@@ -2,6 +2,8 @@
 
 * `show_engines()` will provide information on the current set for a model. 
 
+* Sparse matrices can now be passed to `fit_xy()` for three engines (`glmnet`, `xgboost`, and `ranger`) (#373).
+
 # parsnip 0.1.3
 
  * A `glance()` method for `model_fit` objects was added (#325)

diff --git a/R/aaa_models.R b/R/aaa_models.R
@@ -819,7 +819,8 @@ check_encodings <- function(x) {
   }
   req_args <- list(predictor_indicators = rlang::na_chr,
                    compute_intercept = rlang::na_lgl,
-                   remove_intercept = rlang::na_lgl)
+                   remove_intercept = rlang::na_lgl,
+                   allow_sparse_x = rlang::na_lgl)
 
   missing_args <- setdiff(names(req_args), names(x))
   if (length(missing_args) > 0) {
@@ -896,7 +897,8 @@ get_encoding <- function(model) {
         model = model,
         predictor_indicators = "traditional",
         compute_intercept = TRUE,
-        remove_intercept = TRUE
+        remove_intercept = TRUE,
+        allow_sparse_x = FALSE
       ) %>%
       dplyr::select(model, engine, mode, predictor_indicators,
                     compute_intercept, remove_intercept)

diff --git a/R/arguments.R b/R/arguments.R
@@ -191,8 +191,8 @@ make_xy_call <- function(object, target) {
     switch(
       target,
       none = rlang::expr(x),
-      data.frame = rlang::expr(as.data.frame(x)),
-      matrix = rlang::expr(as.matrix(x)),
+      data.frame = rlang::expr(maybe_data_frame(x)),
+      matrix = rlang::expr(maybe_matrix(x)),
       rlang::abort(glue::glue("Invalid data type target: {target}."))
     )
 

diff --git a/R/boost_tree.R b/R/boost_tree.R
@@ -290,11 +290,8 @@ xgb_train <- function(
   min_child_weight = 1, gamma = 0, subsample = 1, validation = 0,
   early_stop = NULL, ...) {
 
-  if (length(levels(y)) > 2) {
-    num_class <- length(levels(y))
-  }  else {
-    num_class <- NULL
-  }
+  num_class <- length(levels(y))
+
   if (!is.numeric(validation) || validation < 0 || validation >= 1) {
     rlang::abort("`validation` should be on [0, 1).")
   }
@@ -311,36 +308,17 @@ xgb_train <- function(
   if (is.numeric(y)) {
     loss <- "reg:squarederror"
   } else {
-    lvl <- levels(y)
-    y <- as.numeric(y) - 1
-    if (length(lvl) == 2) {
+    if (num_class == 2) {
       loss <- "binary:logistic"
     } else {
       loss <- "multi:softprob"
     }
   }
 
-  if (is.data.frame(x)) {
-    x <- as.matrix(x) # maybe use model.matrix here?
-  }
-
   n <- nrow(x)
   p <- ncol(x)
 
-  if (!inherits(x, "xgb.DMatrix")) {
-    if (validation > 0) {
-      trn_index <- sample(1:n, size = floor(n * validation) + 1)
-      wlist <-
-        list(validation = xgboost::xgb.DMatrix(x[-trn_index, ], label = y[-trn_index], missing = NA))
-      x <- xgboost::xgb.DMatrix(x[trn_index, ], label = y[trn_index], missing = NA)
-
-    } else {
-      x <- xgboost::xgb.DMatrix(x, label = y, missing = NA)
-      wlist <- list(training = x)
-    }
-  } else {
-    xgboost::setinfo(x, "label", y)
-  }
+  x <- as_xgb_data(x, y, validation)
 
   # translate `subsample` and `colsample_bytree` to be on (0, 1] if not
   if (subsample > 1) {
@@ -366,17 +344,15 @@ xgb_train <- function(
     subsample = subsample
   )
 
-  # eval if contains expressions?
-
   main_args <- list(
-    data = quote(x),
-    watchlist = quote(wlist),
+    data = quote(x$data),
+    watchlist = quote(x$watchlist),
     params = arg_list,
     nrounds = nrounds,
     objective = loss,
     early_stopping_rounds = early_stop
   )
-  if (!is.null(num_class)) {
+  if (!is.null(num_class) && num_class > 2) {
     main_args$num_class <- num_class
   }
 
@@ -399,7 +375,7 @@ xgb_train <- function(
 #' @importFrom stats binomial
 xgb_pred <- function(object, newdata, ...) {
   if (!inherits(newdata, "xgb.DMatrix")) {
-    newdata <- as.matrix(newdata)
+    newdata <- maybe_matrix(newdata)
     newdata <- xgboost::xgb.DMatrix(data = newdata, missing = NA)
   }
 
@@ -415,6 +391,37 @@ xgb_pred <- function(object, newdata, ...) {
   x
 }
 
+
+as_xgb_data <- function(x, y, validation = 0, ...) {
+  # Build the xgboost inputs: returns list(data =, watchlist =). `...` is unused.
+  n <- nrow(x)
+  if (is.data.frame(x)) {
+    x <- as.matrix(x)
+  }
+  # Factor outcomes are recoded as zero-based numeric class codes.
+  if (is.factor(y)) {
+    y <- as.numeric(y) - 1
+  }
+  if (!inherits(x, "xgb.DMatrix")) {
+    if (validation > 0) {
+      trn_index <- sample(seq_len(n), size = floor(n * (1 - validation)) + 1)
+      # drop = FALSE keeps a one-row subset as a matrix instead of a vector.
+      wlist <-
+        list(validation = xgboost::xgb.DMatrix(x[-trn_index, , drop = FALSE], label = y[-trn_index], missing = NA))
+      dat <- xgboost::xgb.DMatrix(x[trn_index, , drop = FALSE], label = y[trn_index], missing = NA)
+    } else {
+      dat <- xgboost::xgb.DMatrix(x, label = y, missing = NA)
+      wlist <- list(training = dat)
+    }
+  } else {
+    # Use `x` itself, not setinfo()'s return value (may not be the DMatrix).
+    xgboost::setinfo(x, "label", y)
+    dat <- x
+    wlist <- list(training = dat)
+  }
+
+  list(data = dat, watchlist = wlist)
+}
 #' @importFrom purrr map_df
 #' @export
 #' @rdname multi_predict

diff --git a/R/boost_tree_data.R b/R/boost_tree_data.R
@@ -94,7 +94,8 @@ set_encoding(
   options = list(
     predictor_indicators = "one_hot",
     compute_intercept = FALSE,
-    remove_intercept = TRUE
+    remove_intercept = TRUE,
+    allow_sparse_x = TRUE
   )
 )
 
@@ -143,7 +144,8 @@ set_encoding(
   options = list(
     predictor_indicators = "one_hot",
     compute_intercept = FALSE,
-    remove_intercept = TRUE
+    remove_intercept = TRUE,
+    allow_sparse_x = TRUE
   )
 )
 
@@ -250,7 +252,8 @@ set_encoding(
   options = list(
     predictor_indicators = "none",
     compute_intercept = FALSE,
-    remove_intercept = FALSE
+    remove_intercept = FALSE,
+    allow_sparse_x = FALSE
   )
 )
 
@@ -384,7 +387,8 @@ set_encoding(
   options = list(
     predictor_indicators = "none",
     compute_intercept = FALSE,
-    remove_intercept = FALSE
+    remove_intercept = FALSE,
+    allow_sparse_x = FALSE
   )
 )
 
@@ -408,7 +412,8 @@ set_encoding(
   options = list(
     predictor_indicators = "none",
     compute_intercept = FALSE,
-    remove_intercept = FALSE
+    remove_intercept = FALSE,
+    allow_sparse_x = FALSE
   )
 )
 

diff --git a/R/convert_data.R b/R/convert_data.R
@@ -323,3 +323,31 @@ check_dup_names <- function(x, y) {
     )
   invisible(NULL)
 }
+
+## -----------------------------------------------------------------------------
+
+#' Fuzzy conversions
+#'
+#' Drop-in replacements for `as.matrix()` and `as.data.frame()` that return a
+#'  sparse matrix (class `dgCMatrix`) unchanged instead of converting it.
+#' @param x A data frame, matrix, or sparse matrix.
+#' @return A data frame, matrix, or sparse matrix.
+#' @export
+maybe_matrix <- function(x) {
+  # `inher()` is defined elsewhere; presumably it errors on other classes.
+  inher(x, c("data.frame", "matrix", "dgCMatrix"), cl = match.call())
+  if (!is.data.frame(x)) {
+    return(x) # dense and sparse matrices are left alone
+  }
+  as.matrix(x)
+}
+
+#' @rdname maybe_matrix
+#' @export
+maybe_data_frame <- function(x) {
+  # A sparse matrix (class "dgCMatrix") is handed back untouched; any other
+  # input goes through as.data.frame().
+  is_sparse <- inherits(x, "dgCMatrix")
+  if (is_sparse) x else as.data.frame(x)
+}
+
diff --git a/R/decision_tree_data.R b/R/decision_tree_data.R
@@ -55,7 +55,8 @@ set_encoding(
   options = list(
     predictor_indicators = "none",
     compute_intercept = FALSE,
-    remove_intercept = FALSE
+    remove_intercept = FALSE,
+    allow_sparse_x = FALSE
   )
 )
 
@@ -78,7 +79,8 @@ set_encoding(
   options = list(
     predictor_indicators = "none",
     compute_intercept = FALSE,
-    remove_intercept = FALSE
+    remove_intercept = FALSE,
+    allow_sparse_x = FALSE
   )
 )
 
@@ -187,7 +189,8 @@ set_encoding(
   options = list(
     predictor_indicators = "none",
     compute_intercept = FALSE,
-    remove_intercept = FALSE
+    remove_intercept = FALSE,
+    allow_sparse_x = FALSE
   )
 )
 
@@ -285,7 +288,8 @@ set_encoding(
   options = list(
     predictor_indicators = "none",
     compute_intercept = FALSE,
-    remove_intercept = FALSE
+    remove_intercept = FALSE,
+    allow_sparse_x = FALSE
   )
 )
 
@@ -310,7 +314,8 @@ set_encoding(
   options = list(
     predictor_indicators = "none",
     compute_intercept = FALSE,
-    remove_intercept = FALSE
+    remove_intercept = FALSE,
+    allow_sparse_x = FALSE
   )
 )