Skip to content

Commit

Permalink
Merge branch 'master' into lintr
Browse files Browse the repository at this point in the history
  • Loading branch information
zachmayer committed May 13, 2015
2 parents 4e6cf06 + 1b631d6 commit 668dc97
Show file tree
Hide file tree
Showing 5 changed files with 13 additions and 319 deletions.
10 changes: 8 additions & 2 deletions R/caretList.R
Expand Up @@ -127,6 +127,7 @@ extractCaretTarget.formula <- function(form, data, ...){
#' @param trControl a \code{\link{trainControl}} object. We are going to intercept this object and check that it has the "index" slot defined, and define the indexes if they are not.
#' @param methodList optional, a character vector of caret models to ensemble. One of methodList or tuneList must be specified.
#' @param tuneList optional, a NAMED list of caretModelSpec objects. This is much more flexible than methodList and allows the specification of model-specific parameters (e.g. passing trace=FALSE to nnet)
#' @param continue_on_fail logical, should a valid caretList be returned that excludes models that fail; default is FALSE
#' @return A list of \code{\link{train}} objects. If the model fails to build,
#' it is dropped from the list.
#' @import caret
Expand All @@ -153,7 +154,8 @@ caretList <- function(
...,
trControl = trainControl(),
methodList = NULL,
tuneList = NULL) {
tuneList = NULL,
continue_on_fail = FALSE) {

#Checks
if(is.null(tuneList) & is.null(methodList)){
Expand Down Expand Up @@ -189,7 +191,11 @@ caretList <- function(
#Loop through the tuneLists and fit caret models with those specs
# Fit one caret model per tuneList specification. When continue_on_fail is
# TRUE, a model that errors during fitting is replaced by NULL (and dropped
# downstream) instead of aborting the whole caretList call; when FALSE the
# error propagates immediately.
modelList <- lapply(tuneList, function(m){
  model_args <- c(global_args, m)
  if (continue_on_fail) {
    # Swallow the fitting error so the remaining models can still be trained
    model <- tryCatch(do.call(train, model_args), error = function(e) NULL)
  } else {
    model <- do.call(train, model_args)
  }
  model
})
names(modelList) <- names(tuneList)
Expand Down
4 changes: 3 additions & 1 deletion man/caretList.Rd
Expand Up @@ -5,7 +5,7 @@
\title{Create a list of several train models from the caret package}
\usage{
caretList(..., trControl = trainControl(), methodList = NULL,
tuneList = NULL)
tuneList = NULL, continue_on_fail = FALSE)
}
\arguments{
\item{...}{arguments to pass to \code{\link{train}}. These arguments will determine which train method gets dispatched.}
Expand All @@ -15,6 +15,8 @@ caretList(..., trControl = trainControl(), methodList = NULL,
\item{methodList}{optional, a character vector of caret models to ensemble. One of methodList or tuneList must be specified.}

\item{tuneList}{optional, a NAMED list of caretModelSpec objects. This is much more flexible than methodList and allows the specification of model-specific parameters (e.g. passing trace=FALSE to nnet)}

\item{continue_on_fail}{logical, should a valid caretList be returned that excludes models that fail; default is FALSE}
}
\value{
A list of \code{\link{train}} objects. If the model fails to build,
Expand Down
240 changes: 1 addition & 239 deletions tests/testthat/test-ensemble.R
Expand Up @@ -60,242 +60,4 @@ test_that("We can ensemble models of different predictors", {
expect_true(length(pred.nest)==150)
})

context("Does ensembling work with missingness")

# Regression test: ensembling models fit on data with missing values should
# warn, and predict() should honor the keepNA switch on both new data and
# the training data.
test_that("Warnings issued for missing data correctly", {
skip_on_cran()
# Pre-generate CV seeds (10 folds + final model) so the glm fits below are reproducible
mseeds <- vector(mode = "list", length = 12)
for(i in 1:11) mseeds[[i]] <- sample.int(1000, 1)
mseeds[[12]] <- sample.int(1000, 1)
myControl = trainControl(method = "cv", number = 10, repeats = 1,
p = 0.75, savePrediction = TRUE,
classProbs = TRUE, returnResamp = "final",
returnData = TRUE, seeds = mseeds)

# Simulated two-class data: 2000 training rows, 1000 test rows
trainC <- twoClassSim(n = 2000, intercept = -9, linearVars = 6, noiseVars = 4, corrVars = 2,
corrType = "AR1", corrValue = 0.6, mislabel = 0)

testC <- twoClassSim(n = 1000, intercept = -9, linearVars = 6, noiseVars = 4, corrVars = 2,
corrType = "AR1", corrValue = 0.6, mislabel = 0)
# Helper: knock values out Missing Completely At Random with probability p;
# if p has length > 1, one probability is sampled and used for every column
MCAR.df <- function(df, p){
MCARx <- function(x, p){
z <- rbinom(length(x), 1, prob=p)
x[z==1] <- NA
return(x)
}
if(length(p) == 1){
df <- apply(df, 2, MCARx, p)
} else if(length(p) > 1) {
df <- apply(df, 2, MCARx, sample(p, 1))
}
df <- as.data.frame(df)
return(df)
}
set.seed(3256)
# 15% missingness in train, 5% in test, across all 17 predictors
trainC[, c(1:17)] <- MCAR.df(trainC[, c(1:17)], 0.15)
testC[, c(1:17)] <- MCAR.df(testC[, c(1:17)], 0.05)
# Four glms on overlapping predictor subsets; identical seed before each fit
# so the CV folds line up across models (glm2 additionally median-imputes)
set.seed(482)
glm1 <- train(x = trainC[, c(1:17)], y = trainC[, "Class"], method = 'glm',
trControl = myControl)
set.seed(482)
glm2 <- train(x = trainC[, c(1:17)], y = trainC[, "Class"], method = 'glm',
trControl = myControl, preProcess = "medianImpute")
set.seed(482)
glm3 <- train(x = trainC[, c(2:9)], y = trainC[, "Class"], method = 'glm',
trControl = myControl)
set.seed(482)
glm4 <- train(x = trainC[, c(1, 9:17)], y = trainC[, "Class"], method = 'glm',
trControl = myControl)

# Hand-build a caretList from the four fits and ensemble it
nestedList <- list(glm1, glm2, glm3, glm4)
class(nestedList) <- 'caretList'
set.seed(482)
ensNest <- caretEnsemble(nestedList, iter=2000)
# Predictions with and without NA preservation, on new data and on train
pred.nest1 <- predict(ensNest, keepNA = TRUE, newdata=testC[, c(1:17)])
pred.nest1a <- predict(ensNest, newdata = testC[, c(1:17)])
pred.nest2 <- predict(ensNest, keepNA=FALSE, newdata = testC[, c(1:17)])
pred.nestTrain_a <- predict(ensNest, keepNA = FALSE)
pred.nestTrain_b <- predict(ensNest, keepNA = TRUE)
# Ensembling models fit on incomplete data must warn about the missingness
expect_warning(caretEnsemble(nestedList, iter=20), "Missing values found")
expect_warning(caretEnsemble(nestedList, iter=20), "not consistent across")
expect_that(ensNest, is_a("caretEnsemble"))
pred.nest1 <- predict(ensNest, newdata=testC[, c(1:17)])
expect_message(predict(ensNest, newdata = testC[, c(1:17)]))
expect_message(predict(ensNest, keepNA=FALSE, newdata=testC[, c(1:17)]))
expect_true(is.numeric(pred.nest1))
expect_false(is.list(pred.nest2))
expect_true(is.numeric(pred.nest2))
# Default predictions retain NAs; keepNA = FALSE returns complete predictions
expect_true(anyNA(pred.nest1))
expect_false(anyNA(pred.nest2))
expect_true(is.numeric(pred.nest1a))
# Prediction lengths match the data predicted on (2000 train, 1000 test rows)
expect_true(length(pred.nestTrain_b)==2000)
expect_true(length(pred.nest1)==1000)
expect_true(length(pred.nestTrain_a)==2000)
expect_true(length(pred.nest2)==1000)
})

test_that("Predictions the same for non-missing data under predict", {
skip_on_cran()
# Load the pre-fit classification and regression model lists shipped with the package
load(system.file("testdata/models_class.rda",
package="caretEnsemble", mustWork=TRUE))
load(system.file("testdata/models_reg.rda",
package="caretEnsemble", mustWork=TRUE))
ens.reg <- caretEnsemble(models_reg, iter=1000)
ens.class <- caretEnsemble(models_class, iter=1000)
# On fully observed data, keepNA must make no difference to the output.
# Each check calls predict with keepNA = FALSE first, then TRUE, matching
# the original call order exactly.
check_keepNA_equal <- function(ens, ...) {
dropped <- predict(ens, keepNA = FALSE, ...)
kept <- predict(ens, keepNA = TRUE, ...)
expect_true(identical(dropped, kept))
}
check_keepNA_equal(ens.reg)
check_keepNA_equal(ens.class)
check_keepNA_equal(ens.reg, se = TRUE)
check_keepNA_equal(ens.class, se = TRUE)
})

# Exercises the predict() method's keepNA, se, and return_weights options:
# return classes (numeric vs data.frame vs list), output names/dimensions,
# and the messages/warnings emitted for each code path.
test_that("NA preservation and standard errors work right", {
skip_on_cran()
# Pre-generate CV seeds (10 folds + final model) for reproducible glm fits
mseeds <- vector(mode = "list", length = 12)
for(i in 1:11) mseeds[[i]] <- sample.int(1000, 1)
mseeds[[12]] <- sample.int(1000, 1)
myControl = trainControl(method = "cv", number = 10, repeats = 1,
p = 0.75, savePrediction = TRUE,
classProbs = TRUE, returnResamp = "final",
returnData = TRUE, seeds = mseeds)

# Simulated two-class data: 2000 training rows, 1000 test rows
trainC <- twoClassSim(n = 2000, intercept = -9, linearVars = 6, noiseVars = 4, corrVars = 2,
corrType = "AR1", corrValue = 0.6, mislabel = 0)

testC <- twoClassSim(n = 1000, intercept = -9, linearVars = 6, noiseVars = 4, corrVars = 2,
corrType = "AR1", corrValue = 0.6, mislabel = 0)
# Helper: knock values out Missing Completely At Random with probability p;
# if p has length > 1, one probability is sampled and used for every column
MCAR.df <- function(df, p){
MCARx <- function(x, p){
z <- rbinom(length(x), 1, prob=p)
x[z==1] <- NA
return(x)
}
if(length(p) == 1){
df <- apply(df, 2, MCARx, p)
} else if(length(p) > 1) {
df <- apply(df, 2, MCARx, sample(p, 1))
}
df <- as.data.frame(df)
return(df)
}

set.seed(3256)
# 15% missingness in train, 5% in test, across all 17 predictors
trainC[, c(1:17)] <- MCAR.df(trainC[, c(1:17)], 0.15)
testC[, c(1:17)] <- MCAR.df(testC[, c(1:17)], 0.05)
# Four glms on overlapping predictor subsets; identical seed before each fit
# so the CV folds line up across models (glm2 additionally median-imputes)
set.seed(482)
glm1 <- train(x = trainC[, c(1:17)], y = trainC[, "Class"], method = 'glm',
trControl = myControl)
set.seed(482)
glm2 <- train(x = trainC[, c(1:17)], y = trainC[, "Class"], method = 'glm',
trControl = myControl, preProcess = "medianImpute")
set.seed(482)
glm3 <- train(x = trainC[, c(2:9)], y = trainC[, "Class"], method = 'glm',
trControl = myControl)
set.seed(482)
glm4 <- train(x = trainC[, c(1, 9:17)], y = trainC[, "Class"], method = 'glm',
trControl = myControl)
nestedList <- list(glm1, glm2, glm3, glm4)
class(nestedList) <- 'caretList'
set.seed(482)
ensNest <- caretEnsemble(nestedList, iter=200)
# Also check the pre-fit model lists shipped with the package
load(system.file("testdata/models_class.rda",
package="caretEnsemble", mustWork=TRUE))
load(system.file("testdata/models_reg.rda",
package="caretEnsemble", mustWork=TRUE))
ens.reg <- caretEnsemble(models_reg, iter=1000)
ens.class <- caretEnsemble(models_class, iter=1000)
# Without se, predictions are a numeric vector regardless of keepNA
pred1 <- predict(ens.reg, keepNA = FALSE)
pred2 <- predict(ens.reg, keepNA = TRUE)
expect_is(pred1, "numeric")
expect_is(pred2, "numeric")
pred1 <- predict(ens.class, keepNA = FALSE)
pred2 <- predict(ens.class, keepNA = TRUE)
expect_is(pred1, "numeric")
expect_is(pred2, "numeric")
# With se = TRUE, predictions come back as a data.frame
pred1 <- predict(ens.class, keepNA = FALSE, se = TRUE)
pred2 <- predict(ens.class, keepNA = TRUE, se = TRUE)
expect_is(pred1, "data.frame")
expect_is(pred2, "data.frame")
pred1 <- predict(ens.reg, keepNA = FALSE, se = TRUE)
pred2 <- predict(ens.reg, keepNA = TRUE, se = TRUE)
expect_is(pred1, "data.frame")
expect_is(pred2, "data.frame")
# Rebuild the nested ensemble (no seed reset here) and check se output on
# the models with missing data
nestedList <- list(glm1, glm2, glm3, glm4)
class(nestedList) <- 'caretList'
ensNest <- caretEnsemble(nestedList, iter=2000)
pred.nest1 <- predict(ensNest, keepNA = TRUE, newdata=testC[, c(1:17)], se =TRUE)
pred.nest1a <- predict(ensNest, newdata = testC[, c(1:17)], se = TRUE)
pred.nest2 <- predict(ensNest, keepNA = FALSE, newdata = testC[, c(1:17)], se = TRUE)
pred.nestTrain_a <- predict(ensNest, keepNA = FALSE, se = TRUE)
pred.nestTrain_b <- predict(ensNest, keepNA = TRUE, se = TRUE)
expect_is(pred.nest1, "data.frame")
expect_is(pred.nest1a, "data.frame")
expect_is(pred.nest2, "data.frame")
expect_is(pred.nestTrain_a, "data.frame")
expect_is(pred.nestTrain_b, "data.frame")
# keepNA = TRUE (and the default) message about "complete data";
# keepNA = FALSE messages about "available data"
expect_message(predict(ensNest, keepNA = TRUE, newdata=testC[, c(1:17)]), "complete data")
expect_message(predict(ensNest, newdata = testC[, c(1:17)]), "complete data")
expect_message(predict(ensNest, keepNA=FALSE, newdata = testC[, c(1:17)]), "available data")
expect_message(predict(ensNest, keepNA = FALSE), "available data")
expect_message(predict(ensNest, keepNA = TRUE), "complete data")
# An invalid return_weights value falls back to a default with a warning
expect_warning(predict(ensNest, keepNA = TRUE, return_weights = "car"), "default set to")
# Rebuild once more with the seed reset to pin down output shape checks
nestedList <- list(glm1, glm2, glm3, glm4)
class(nestedList) <- 'caretList'
set.seed(482)
ensNest <- caretEnsemble(nestedList, iter=2000)
pred.nest1 <- predict(ensNest, keepNA = FALSE, newdata = testC[, c(1:17)], se = TRUE)
pred.nest2 <- predict(ensNest, keepNA = FALSE, se = TRUE)
pred.nest3 <- predict(ensNest, keepNA = TRUE, se = TRUE)
pred.nest4 <- predict(ensNest, keepNA = TRUE, se = FALSE)
expect_is(pred.nest1, "data.frame")
expect_is(pred.nest2, "data.frame")
expect_is(pred.nest3, "data.frame")
expect_is(pred.nest4, "numeric")
expect_identical(names(pred.nest1), names(pred.nest2))
expect_identical(names(pred.nest2), names(pred.nest3))
# return_weights = TRUE wraps the result in a list for every keepNA/se combo
pred.nest1 <- predict(ensNest, keepNA = FALSE, newdata = testC[, c(1:17)], se = TRUE,
return_weights = TRUE)
pred.nest2 <- predict(ensNest, keepNA = FALSE, se = TRUE, return_weights = TRUE)
pred.nest3 <- predict(ensNest, keepNA = TRUE, se = TRUE, return_weights = TRUE)
pred.nest4 <- predict(ensNest, keepNA = TRUE, se = FALSE, return_weights = TRUE)
pred.nest5 <- predict(ensNest, keepNA = FALSE, se = FALSE, return_weights = TRUE)
expect_is(pred.nest1, "list")
expect_is(pred.nest2, "list")
expect_is(pred.nest3, "list")
expect_is(pred.nest4, "list")
expect_is(pred.nest5, "list")
# Repeat the same calls and check the list structure: "preds" + "weight"
pred.nest1 <- predict(ensNest, keepNA = FALSE, newdata = testC[, c(1:17)], se = TRUE,
return_weights = TRUE)
pred.nest2 <- predict(ensNest, keepNA = FALSE, se = TRUE, return_weights = TRUE)
pred.nest3 <- predict(ensNest, keepNA = TRUE, se = TRUE, return_weights = TRUE)
pred.nest4 <- predict(ensNest, keepNA = TRUE, se = FALSE, return_weights = TRUE)
pred.nest5 <- predict(ensNest, keepNA = FALSE, se = FALSE, return_weights = TRUE)
expect_identical(names(pred.nest1), c("preds", "weight"))
expect_identical(names(pred.nest1), names(pred.nest2))
expect_identical(names(pred.nest1), names(pred.nest3))
expect_identical(names(pred.nest1), names(pred.nest4))
expect_identical(names(pred.nest1), names(pred.nest5))
expect_identical(names(pred.nest1$preds), names(pred.nest2$preds))
expect_identical(names(pred.nest3$preds), names(pred.nest2$preds))
# se = FALSE yields unnamed preds
expect_null(names(pred.nest4$preds))
expect_null(names(pred.nest5$preds))
# Row counts follow the data predicted on: 1000 test rows vs 2000 train rows
expect_identical(nrow(pred.nest1$preds), 1000L)
expect_identical(nrow(pred.nest2$preds), nrow(pred.nest3$preds))
expect_is(pred.nest1$weight, "matrix")
expect_is(pred.nest2$weight, "matrix")
expect_is(pred.nest3$weight, "matrix")
expect_is(pred.nest4$weight, "matrix")
expect_is(pred.nest5$weight, "matrix")
# Weight matrices: one row per observation when per-row weights apply,
# otherwise a single 1 x 4 row (one column per component model)
expect_identical(dim(pred.nest1$weight), c(1000L, 4L))
expect_identical(dim(pred.nest2$weight), c(2000L, 4L))
expect_identical(dim(pred.nest3$weight), c(1L, 4L))
expect_identical(dim(pred.nest4$weight), c(1L, 4L))
expect_identical(dim(pred.nest5$weight), c(2000L, 4L))
})
# context("Does ensembling work with missingness")
76 changes: 0 additions & 76 deletions tests/testthat/test-ensembleMethods.R
Expand Up @@ -300,72 +300,6 @@ context("Does prediction method work for classification")

test_that("We can ensemble models and handle missingness across predictors", {
skip_on_cran()
mseeds <- vector(mode = "list", length = 12)
for(i in 1:11) mseeds[[i]] <- sample.int(1000, 1)
mseeds[[12]] <- sample.int(1000, 1)
myControl = trainControl(method = "cv", number = 10, repeats = 1,
p = 0.75, savePrediction = TRUE,
classProbs = TRUE, returnResamp = "final",
returnData = TRUE, seeds = mseeds)

trainC <- twoClassSim(n = 2000, intercept = -9, linearVars = 6, noiseVars = 4, corrVars = 2,
corrType = "AR1", corrValue = 0.6, mislabel = 0)

testC <- twoClassSim(n = 1000, intercept = -9, linearVars = 6, noiseVars = 4, corrVars = 2,
corrType = "AR1", corrValue = 0.6, mislabel = 0)

MCAR.df <- function(df, p){
MCARx <- function(x, p){
z <- rbinom(length(x), 1, prob=p)
x[z==1] <- NA
return(x)
}
if(length(p) == 1){
df <- apply(df, 2, MCARx, p)
} else if(length(p) > 1) {
df <- apply(df, 2, MCARx, sample(p, 1))
}
df <- as.data.frame(df)
return(df)
}

set.seed(3256)
trainC[, c(1:17)] <- MCAR.df(trainC[, c(1:17)], 0.15)
testC[, c(1:17)] <- MCAR.df(testC[, c(1:17)], 0.05)

set.seed(482)
glm1 <- train(x = trainC[, c(1:17)], y = trainC[, "Class"], method = 'glm',
trControl = myControl)
set.seed(482)
glm2 <- train(x = trainC[, c(1:17)], y = trainC[, "Class"], method = 'glm',
trControl = myControl, preProcess = "medianImpute")
set.seed(482)
glm3 <- train(x = trainC[, c(2:9)], y = trainC[, "Class"], method = 'glm',
trControl = myControl)
set.seed(482)
glm4 <- train(x = trainC[, c(1, 9:17)], y = trainC[, "Class"], method = 'glm',
trControl = myControl)
nestedList <- list(glm1, glm2, glm3, glm4)
class(nestedList) <- 'caretList'
set.seed(482)
ensNest <- caretEnsemble(nestedList, iter=2000)
pred.nest1 <- predict(ensNest, keepNA = TRUE, newdata=testC[, c(1:17)])
pred.nest1a <- predict(ensNest, newdata = testC[, c(1:17)])
pred.nest2 <- predict(ensNest, keepNA=FALSE, newdata = testC[, c(1:17)])
pred.nestTrain_a <- predict(ensNest, keepNA = FALSE)
pred.nestTrain_b <- predict(ensNest, keepNA = TRUE)
expect_that(ensNest, is_a("caretEnsemble"))
pred.nest1 <- predict(ensNest, newdata=testC[, c(1:17)])
expect_message(predict(ensNest, newdata=testC[, c(1:17)]))
expect_message(predict(ensNest, keepNA=TRUE, newdata=testC[1:20, c(1:17)]))
expect_true(is.numeric(pred.nest1))
expect_is(pred.nest2, "numeric")
expect_true(is.numeric(pred.nest1a))
expect_true(length(pred.nestTrain_b)==2000)
expect_true(length(pred.nest1)==1000)
expect_true(length(pred.nestTrain_a)==2000)
expect_true(length(pred.nest2)==1000)
expect_true(anyNA(pred.nest1))
load(system.file("testdata/models_reg.rda",
package="caretEnsemble", mustWork=TRUE))
load(system.file("testdata/models_class.rda",
Expand Down Expand Up @@ -402,22 +336,12 @@ test_that("We can ensemble models and handle missingness across predictors", {
expect_identical(modres2[1, 3], caretEnsemble:::getMetricSD.train(ens.reg$models[[1]], "RMSE", which = "all"))
expect_false(identical(modres2[2, 3], caretEnsemble:::getMetricSD.train(ens.reg$models[[2]],
"RMSE", which = "all")))
modres3 <- caretEnsemble:::extractModRes(ensNest)
expect_equal(modres3[1, 3], caretEnsemble:::getMetricSD.train(ensNest$models[[1]], "AUC", which = "best"))
expect_equal(modres3[2, 3], caretEnsemble:::getMetricSD.train(ensNest$models[[2]], "AUC", which = "best"))
expect_equal(modres3[3, 3], caretEnsemble:::getMetricSD.train(ensNest$models[[3]], "AUC", which = "best"))
expect_equal(modres3[4, 3], caretEnsemble:::getMetricSD.train(ensNest$models[[4]], "AUC", which = "best"))
modF <- caretEnsemble:::extractModFrame(ens.class)
modF2 <- caretEnsemble:::extractModFrame(ens.reg)
modF3 <- caretEnsemble:::extractModFrame(ensNest)
expect_true(ncol(modF) > ncol(ens.class$models[[2]]$trainingData))
expect_true(ncol(modF2) > ncol(ens.reg$models[[1]]$trainingData))
expect_true(ncol(modF3) > ncol(ensNest$models[[3]]$trainingData))
expect_true(ncol(modF3) > ncol(ensNest$models[[4]]$trainingData))
expect_true(nrow(modF) == nrow(ens.class$models[[2]]$trainingData))
expect_true(nrow(modF2) == nrow(ens.reg$models[[1]]$trainingData))
expect_true(nrow(modF3) == nrow(ensNest$models[[3]]$trainingData))
expect_true(nrow(modF3) == nrow(ensNest$models[[4]]$trainingData))
})

context("Does prediction method work for regression")
Expand Down
2 changes: 1 addition & 1 deletion tests/testthat/test-optimizers.R
Expand Up @@ -110,7 +110,7 @@ test_that("Warnings and fallbacks in degenerate cases", {

expect_false(identical(wghts1, wghts2))
expect_equal(wghts1, c(1, 0, 0, 0))
expect_equal(wghts2, c(39, 1, 60, 0))
expect_equal(wghts2, c(46, 0, 54, 0))

ens1 <- caretEnsemble(out, optFUN = safeOptAUC)
ens2 <- caretEnsemble(out, optFUN = greedOptAUC)
Expand Down

0 comments on commit 668dc97

Please sign in to comment.