tidymodels · topepo · Mar 17, 2019 · Feb 27, 2019 · Feb 27, 2019 · Feb 28, 2019
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -7,3 +7,6 @@
 ^\.Rproj\.user$
 ^.travis.yml$
 ^R/README\.md$
+derby.log
+^logs$
+^tests/testthat/logs$
diff --git a/.gitignore b/.gitignore
@@ -5,3 +5,5 @@
 tests/testthat/derby.log
 tests/testthat/logs/
 *.history
+derby.log
+logs/*
diff --git a/.travis.yml b/.travis.yml
@@ -8,67 +8,76 @@ sudo: true
 warnings_are_errors: false
 
 r:
-- 3.1
-- 3.2
-- oldrel
-- release
-- devel
+  - 3.1
+  - 3.2
+  - oldrel
+  - release
+  - devel
 
-env:
-  global:
-  - KERAS_BACKEND="tensorflow"
-  - MAKEFLAGS="-j 2"
 
-# until we troubleshoot these issues
 matrix:
   allow_failures:
     - r: 3.1
     - r: 3.2
 
 r_binary_packages: 
- - rstan
- - rstanarm
- - RCurl
- - dplyr
- - glue
- - magrittr
- - stringi
- - stringr
- - munsell
- - rlang
- - reshape2
- - scales
- - tibble
- - ggplot2
- - StanHeaders
- - Rcpp
- - RcppEigen
- - BH
- - glmnet
- - earth
- - sparklyr
- - flexsurv
- - ranger
- - randomforest
- - xgboost
- - C50
+  - RCurl
+  - dplyr
+  - glue
+  - magrittr
+  - stringi
+  - stringr
+  - munsell
+  - rlang
+  - reshape2
+  - scales
+  - tibble
+  - ggplot2
+  - Rcpp
+  - RcppEigen
+  - BH
+  - glmnet
+  - earth
+  - sparklyr
+  - flexsurv
+  - ranger
+  - randomforest
+  - xgboost
+  - C50
+
 
 cache:
   packages: true
   directories:
     - $HOME/.keras
     - $HOME/.cache/pip
 
+env:
+  global:
+  - KERAS_BACKEND="tensorflow"
+  - MAKEFLAGS="-j 2"
+
+addons:
+  apt:
+    sources:
+      - ubuntu-toolchain-r-test
+    packages:
+      - g++-6
 
 before_script:
   - python -m pip install --upgrade --ignore-installed --user travis pip setuptools wheel virtualenv
   - python -m pip install --upgrade --ignore-installed --user travis keras h5py pyyaml requests Pillow scipy theano
   - R -e 'tensorflow::install_tensorflow()'
 
+
 before_install:
   - sudo apt-get -y install libnlopt-dev 
   - sudo apt-get update
   - sudo apt-get -y install python3
+  - mkdir -p ~/.R && echo "CXX14=g++-6" > ~/.R/Makevars
+  - echo "CXX14FLAGS += -fPIC" >> ~/.R/Makevars
+
 
 after_success:
   - Rscript -e 'covr::codecov()'
+
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -38,4 +38,14 @@ Suggests:
     keras,
     xgboost,
     covr,
-    sparklyr
+    C50,
+    sparklyr,
+    earth,
+    glmnet,
+    kernlab,
+    kknn,
+    randomForest,
+    ranger,
+    rpart,
+    MASS,
+    nlme
diff --git a/NEWS.md b/NEWS.md
@@ -14,8 +14,11 @@ that are actually varying).
 
 * `fit_control()` not returns an S3 method. 
 
+* For classification models, an error occurs if the outcome data are not encoded as factors (#115). 
+
 * The prediction modules (e.g. `predict_class`, `predict_numeric`, etc) were de-exported. These were internal functions that were not to be used by the users and the users were using them. 
 
+
 ## Bug Fixes
 
 * `varying_args()` now uses the version from the `generics` package. This means

diff --git a/R/fit_helpers.R b/R/fit_helpers.R
@@ -6,21 +6,21 @@
 #' @importFrom  stats model.frame model.response terms as.formula model.matrix
 form_form <-
   function(object, control, env, ...) {
-    opts <- quos(...)
 
-    if (object$mode != "regression") {
-      y_levels <- levels_from_formula( # prob rewrite this as simple subset/levels
-        env$formula,
-        env$data
-      )
+    if (object$mode == "classification") {
+      # TODO: this could probably be rewritten as a simple subset + levels() call
+      y_levels <- levels_from_formula(env$formula, env$data)
+      if (!inherits(env$data, "tbl_spark") && is.null(y_levels))
+        stop("For classification models, the outcome should be a factor.",
+             call. =  FALSE)
     } else {
       y_levels <- NULL
     }
 
     object <- check_mode(object, y_levels)
 
     # if descriptors are needed, update descr_env with the calculated values
-    if(requires_descrs(object)) {
+    if (requires_descrs(object)) {
       data_stats <- get_descr_form(env$formula, env$data)
       scoped_descrs(data_stats)
     }
@@ -71,8 +71,14 @@ xy_xy <- function(object, env, control, target = "none", ...) {
 
   object <- check_mode(object, levels(env$y))
 
+  if (object$mode == "classification") {
+    if (is.null(levels(env$y)))
+      stop("For classification models, the outcome should be a factor.",
+           call. =  FALSE)
+  }
+
   # if descriptors are needed, update descr_env with the calculated values
-  if(requires_descrs(object)) {
+  if (requires_descrs(object)) {
     data_stats <- get_descr_form(env$formula, env$data)
     scoped_descrs(data_stats)
   }
@@ -125,13 +131,12 @@ form_xy <- function(object, control, env,
   env$x <- data_obj$x
   env$y <- data_obj$y
 
-  res <- list(
-    lvl = levels_from_formula(
-      env$formula,
-      env$data
-    ),
-    spec = object
-  )
+  res <- list(lvl = levels_from_formula(env$formula, env$data), spec = object)
+  if (object$mode == "classification") {
+    if (is.null(res$lvl))
+      stop("For classification models, the outcome should be a factor.",
+           call. =  FALSE)
+  }
 
   res <- xy_xy(
     object = object,
@@ -148,6 +153,13 @@ form_xy <- function(object, control, env,
 }
 
 xy_form <- function(object, env, control, ...) {
+
+  if (object$mode == "classification") {
+    if (is.null(levels(env$y)))
+      stop("For classification models, the outcome should be a factor.",
+           call. =  FALSE)
+  }
+
   data_obj <-
     convert_xy_to_form_fit(
       x = env$x,

diff --git a/R/multinom_reg.R b/R/multinom_reg.R
@@ -168,7 +168,7 @@ check_args.multinom_reg <- function(object) {
 
   args <- lapply(object$args, rlang::eval_tidy)
 
-  if (is.numeric(args$penalty) && args$penalty < 0)
+  if (all(is.numeric(args$penalty)) && any(args$penalty < 0))
     stop("The amount of regularization should be >= 0", call. = FALSE)
   if (is.numeric(args$mixture) && (args$mixture < 0 | args$mixture > 1))
     stop("The mixture proportion should be within [0,1]", call. = FALSE)

diff --git a/tests/testthat/test_boost_tree_C50.R b/tests/testthat/test_boost_tree_C50.R
@@ -1,13 +1,17 @@
 library(testthat)
 library(parsnip)
 library(tibble)
+library(dplyr)
 
 # ------------------------------------------------------------------------------
 
 context("boosted tree execution with C5.0")
 
 data("lending_club")
 lending_club <- head(lending_club, 200)
+lending_club_fail <-
+  lending_club %>%
+  mutate(bad = Inf, miss = NA)
 num_pred <- c("funded_amnt", "annual_inc", "num_il_tl")
 lc_basic <-
   boost_tree(mode = "classification")  %>%
@@ -41,6 +45,8 @@ test_that('C5.0 execution', {
     ),
     regexp = NA
   )
+
+  # outcome is not a factor:
   expect_error(
     res <- fit(
       lc_basic,
@@ -51,19 +57,21 @@ test_that('C5.0 execution', {
     )
   )
 
+  # Model fails: the only predictor (`miss`) is entirely NA
   C5.0_form_catch <- fit(
     lc_basic,
-    funded_amnt ~ term,
-    data = lending_club,
+    Class ~ miss,
+    data = lending_club_fail,
     control = caught_ctrl
   )
   expect_true(inherits(C5.0_form_catch$fit, "try-error"))
 
+  # Model fails: the only predictor column (`miss`) is entirely NA
   C5.0_xy_catch <- fit_xy(
     lc_basic,
     control = caught_ctrl,
-    x = lending_club[, num_pred],
-    y = lending_club$total_bal_il
+    x = lending_club_fail[, "miss"],
+    y = lending_club_fail$Class
   )
   expect_true(inherits(C5.0_xy_catch$fit, "try-error"))
 })
@@ -108,11 +116,12 @@ test_that('C5.0 probabilities', {
 test_that('submodel prediction', {
 
   skip_if_not_installed("C50")
+  library(C50)
 
   vars <- c("female", "tenure", "total_charges", "phone_service", "monthly_charges")
   class_fit <-
     boost_tree(trees = 20, mode = "classification") %>%
-    set_engine("C5.0", control = C50::C5.0Control(earlyStopping = FALSE)) %>%
+    set_engine("C5.0", control = C5.0Control(earlyStopping = FALSE)) %>%
     fit(churn ~ ., data = wa_churn[-(1:4), c("churn", vars)])
 
   pred_class <- predict(class_fit$fit, wa_churn[1:4, vars], trials = 4, type = "prob")

diff --git a/tests/testthat/test_linear_reg_stan.R b/tests/testthat/test_linear_reg_stan.R
@@ -102,14 +102,21 @@ test_that('stan intervals', {
             type = "pred_int",
             level = 0.93)
 
-  prediction_stan <-
-    predictive_interval(res_xy$fit, newdata = iris[1:5, ], seed = 13,
-                        prob = 0.93)
-
-  stan_post <- posterior_linpred(res_xy$fit, newdata = iris[1:5, ],
-                                 seed = 13)
-  stan_lower <- apply(stan_post, 2, quantile, prob = 0.035)
-  stan_upper <- apply(stan_post, 2, quantile, prob = 0.965)
+  # prediction_stan <-
+  #   predictive_interval(res_xy$fit, newdata = iris[1:5, ], seed = 13,
+  #                       prob = 0.93)
+  #
+  # stan_post <- posterior_linpred(res_xy$fit, newdata = iris[1:5, ],
+  #                                seed = 13)
+  # stan_lower <- apply(stan_post, 2, quantile, prob = 0.035)
+  # stan_upper <- apply(stan_post, 2, quantile, prob = 0.965)
+
+  stan_lower <- c(`1` = 4.93164991101342, `2` = 4.60197941230393,
+                  `3` = 4.6671442757811, `4` = 4.74402724639963,
+                  `5` = 4.99248110476701)
+  stan_upper <- c(`1` = 5.1002837047058, `2` = 4.77617561853506,
+                  `3` = 4.83183673602725, `4` = 4.90844811805409,
+                  `5` = 5.16979395659009)
 
   expect_equivalent(confidence_parsnip$.pred_lower, stan_lower)
   expect_equivalent(confidence_parsnip$.pred_upper, stan_upper)

diff --git a/tests/testthat/test_logistic_reg.R b/tests/testthat/test_logistic_reg.R
@@ -244,23 +244,24 @@ test_that('glm execution', {
     )
   )
 
-  # passes interactively but not on R CMD check
-  # glm_form_catch <- fit(
-  #   lc_basic,
-  #   funded_amnt ~ term,
-  #   data = lending_club,
-  #
-  #   control = caught_ctrl
-  # )
-  # expect_true(inherits(glm_form_catch$fit, "try-error"))
+  # wrong outcome type
+  expect_error(
+    glm_form_catch <- fit(
+      lc_basic,
+      funded_amnt ~ term,
+      data = lending_club,
+      control = caught_ctrl
+    )
+  )
 
-  glm_xy_catch <- fit_xy(
-    lc_basic,
-    control = caught_ctrl,
-    x = lending_club[, num_pred],
-    y = lending_club$total_bal_il
+  expect_error(
+    glm_xy_catch <- fit_xy(
+      lc_basic,
+      control = caught_ctrl,
+      x = lending_club[, num_pred],
+      y = lending_club$total_bal_il
+    )
   )
-  expect_true(inherits(glm_xy_catch$fit, "try-error"))
 })
 
 test_that('glm prediction', {