Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

Already on GitHub? Sign in to your account

Implemented optimism bootstrap #544

Merged
merged 3 commits into from Dec 5, 2016
Jump to file or symbol
Failed to load files and symbols.
+176 −31
Split
View
@@ -21,6 +21,9 @@
NULL
+.onUnload <- function(libpath) { library.dynam.unload("caret", libpath) }
+
+
###################################################################
## Global Variables
###################################################################
@@ -572,7 +572,8 @@ resampName <- function(x, numbers = TRUE){
ifelse(x$control$fixedWindow, " a ", " no "),
"fixed window)", sep = ""),
oob = "Out of Bag Resampling",
- boot =, boot632 = paste("Bootstrapped (", numResamp, " reps)", sep = ""),
+ boot =, optimism_boot =, boot_all =,
+ boot632 = paste("Bootstrapped (", numResamp, " reps)", sep = ""),
cv = paste("Cross-Validated (", x$control$number, " fold)", sep = ""),
repeatedcv = paste("Cross-Validated (", x$control$number, " fold, repeated ",
x$control$repeats, " times)", sep = ""),
@@ -593,6 +594,8 @@ resampName <- function(x, numbers = TRUE){
timeslice = "Rolling Forecasting Origin Resampling",
oob = "Out of Bag Resampling",
boot = "(Bootstrap)",
+ optimism_boot = "(Optimism Bootstrap)",
+ boot_all = "(Bootstrap All)",
boot632 = "(Bootstrap 632 Rule)",
cv = "(Cross-Validation)",
repeatedcv = "(Repeated Cross-Validation)",
@@ -132,7 +132,7 @@ stringFunc <- function (x) {
cat("Resampling results")
if(dim(tuneAcc)[1] > 1) cat(" across tuning parameters")
- if(showSD) cat(" (values above are 'mean (sd)')")
+ if(showSD) cat(" (values below are 'mean (sd)')")
cat(":\n\n")
if(dim(tuneAcc)[1] > 1) {
@@ -199,7 +199,7 @@ stringFunc <- function (x) {
} else constString <- NULL
} else constString <- NULL
- tuneAcc <- tuneAcc[,!grepl("Apparent$", names(tuneAcc)),drop = FALSE]
+ tuneAcc <- tuneAcc[,!grepl("Apparent$|Optimism$", names(tuneAcc)), drop = FALSE]
colnames(tuneAcc)[colnames(tuneAcc) == ".B"] <- "Resamples"
nms <- names(tuneAcc)[names(tuneAcc) %in% params]
sort_args <- vector(mode = "list", length = length(nms))
@@ -264,7 +264,8 @@ stringFunc <- function (x) {
cat(truncateText(met))
}
- cat(truncateText(optString), "\n")
+ cat(truncateText(optString))
+ if(nzchar(optString)) cat("\n")
} else printMat <- NULL
if(details) {
@@ -89,7 +89,7 @@
#' tuning parameters.} \item{bestTune }{a data frame with the final
#' parameters.}
#'
-#' \item{call }{the (matched) function call with dots expanded} \item{dots}{a
+#' \item{call}{the (matched) function call with dots expanded} \item{dots}{a
#' list containing any ... values passed to the original call} \item{metric}{a
#' string that specifies what summary metric will be used to select the optimal
#' model.} \item{control}{the list of control parameters.} \item{preProcess
@@ -359,7 +359,8 @@ train.default <- function(x, y,
alt_cv =, cv = createFolds(y, trControl$number, returnTrain = TRUE),
repeatedcv =, adaptive_cv = createMultiFolds(y, trControl$number, trControl$repeats),
loocv = createFolds(y, n, returnTrain = TRUE),
- boot =, boot632 =, adaptive_boot = createResample(y, trControl$number),
+ boot =, boot632 =, optimism_boot =, boot_all =,
+ adaptive_boot = createResample(y, trControl$number),
test = createDataPartition(y, 1, trControl$p),
adaptive_lgocv =, lgocv = createDataPartition(y, trControl$number, trControl$p),
timeslice = createTimeSlices(seq(along = y),
@@ -394,13 +395,16 @@ train.default <- function(x, y,
stop('`savePredictions` should be either logical or "all", "final" or "none"')
}
- ## Create hold--out indicies
- if(is.null(trControl$indexOut) & trControl$method != "oob"){
+ ## Create holdout indices
+ if(is.null(trControl$indexOut) && trControl$method != "oob"){
if(tolower(trControl$method) != "timeslice") {
y_index <- if(class(y)[1] == "Surv") 1:nrow(y) else seq(along = y)
- trControl$indexOut <- lapply(trControl$index,
- function(training, allSamples) allSamples[-unique(training)],
- allSamples = y_index)
+ trControl$indexOut <- lapply(trControl$index, function(training) setdiff(y_index, training))
+ if(trControl$method %in% c("optimism_boot", "boot_all")) {
+ trControl$indexExtra <- lapply(trControl$index, function(training) {
+ list(origIndex = y_index, bootIndex = training)
+ })
+ }
names(trControl$indexOut) <- prettySeq(trControl$indexOut)
} else {
trControl$indexOut <- createTimeSlices(seq(along = y),
@@ -559,7 +563,7 @@ train.default <- function(x, y,
num_rs <- if(trControl$method != "oob") length(trControl$index) else 1L
- if(trControl$method == "boot632") num_rs <- num_rs + 1L
+ if(trControl$method %in% c("boot632", "optimism_boot", "boot_all")) num_rs <- num_rs + 1L
## Set or check the seeds when needed
if(is.null(trControl$seeds) || all(is.na(trControl$seeds))) {
seeds <- sample.int(n = 1000000L, size = num_rs * nrow(trainInfo$loop) + 1L)
@@ -653,6 +657,9 @@ train.default <- function(x, y,
}
}
}
+
+ ## Remove extra indices
+ trControl$indexExtra <- NULL
## TODO we used to give resampled results for LOO
if(!(trControl$method %in% c("LOOCV", "oob"))) {
View
@@ -34,11 +34,22 @@
#' Bengio, 2012). See \url{http://topepo.github.io/caret/random-hyperparameter-search.html} for
#' details and an example.
#'
-#' The \code{"boot632"} method uses the 0.632 estimator presented in Efron
-#' (1983), not to be confused with the 0.632+ estimator proposed later by the
-#' same author.
+#' The supported bootstrap methods are:
+#'
+#' \itemize{
+#' \item \code{"boot"}: the usual bootstrap.
+#' \item \code{"boot632"}: the 0.632 bootstrap estimator (Efron, 1983).
+#' \item \code{"optimism_boot"}: the optimism bootstrap estimator
+#' (Efron and Tibshirani, 1994).
+#' \item \code{"boot_all"}: all of the above (for efficiency,
+#' but "boot" will be used for calculations).
+#' }
+#'
+#' The \code{"boot632"} method should not be confused with the 0.632+
+#' estimator proposed later by the same author.
#'
#' @param method The resampling method: \code{"boot"}, \code{"boot632"},
+#' \code{"optimism_boot"}, \code{"boot_all"},
#' \code{"cv"}, \code{"repeatedcv"}, \code{"LOOCV"}, \code{"LGOCV"} (for
#' repeated training/test splits), \code{"none"} (only fits one model to the
#' entire training set), \code{"oob"} (only for random forest, bagged trees,
@@ -126,6 +137,9 @@
#' improvement on cross-validation''. Journal of the American Statistical
#' Association, 78(382):316-331
#'
+#' Efron, B., & Tibshirani, R. J. (1994). ``An introduction to the bootstrap'',
+#' pages 249-252. CRC press.
+#'
#' Bergstra and Bengio (2012), ``Random Search for Hyper-Parameter
#' Optimization'', Journal of Machine Learning Research, 13(Feb):281-305
#'
View
@@ -65,10 +65,11 @@ nominalTrainWorkflow <- function(x, y, wts, info, method, ppOpts, ctrl, lev, tes
## fitting and predicting the full data set.
resampleIndex <- ctrl$index
- if(ctrl$method %in% c("boot632"))
+ if(ctrl$method %in% c("boot632", "optimism_boot", "boot_all"))
{
resampleIndex <- c(list("AllData" = rep(0, nrow(x))), resampleIndex)
ctrl$indexOut <- c(list("AllData" = rep(0, nrow(x))), ctrl$indexOut)
+ if(!is.null(ctrl$indexExtra)) ctrl$indexExtra <- c(list("AllData" = NULL), ctrl$indexExtra)
}
`%op%` <- getOper(ctrl$allowParallel && getDoParWorkers() > 1)
keep_pred <- isTRUE(ctrl$savePredictions) || ctrl$savePredictions %in% c("all", "final")
@@ -94,6 +95,8 @@ nominalTrainWorkflow <- function(x, y, wts, info, method, ppOpts, ctrl, lev, tes
holdoutIndex <- modelIndex
}
+ extraIndex <- if(is.null(ctrl$indexExtra)) NULL else ctrl$indexExtra[[iter]]
+
if(testing) cat("pre-model\n")
if(!is.null(info$submodels[[parm]]) && nrow(info$submodels[[parm]]) > 0) {
@@ -115,6 +118,7 @@ nominalTrainWorkflow <- function(x, y, wts, info, method, ppOpts, ctrl, lev, tes
if(testing) print(mod)
+ predictedExtra <- NULL
if(class(mod)[1] != "try-error")
{
predicted <- try(
@@ -153,6 +157,14 @@ nominalTrainWorkflow <- function(x, y, wts, info, method, ppOpts, ctrl, lev, tes
for(i in seq(along = predicted)) predicted[[i]] <- tmp
rm(tmp)
}
+ } else if(!is.null(extraIndex)) {
+ predictedExtra <- lapply(extraIndex, function(idx) {
+ try(predictionFunction(method = method,
+ modelFit = mod$fit,
+ newdata = x[idx, , drop = FALSE],
+ preProc = mod$preProc,
+ param = submod))
+ })
}
} else {
wrn <- paste(colnames(printed[parm,,drop = FALSE]),
@@ -258,6 +270,21 @@ nominalTrainWorkflow <- function(x, y, wts, info, method, ppOpts, ctrl, lev, tes
wts = wts[holdoutIndex],
lv = lev,
rows = holdoutIndex)
+ if(!is.null(predictedExtra))
+ predictedExtra <- mapply(predictedExtra, extraIndex,
+ SIMPLIFY = FALSE, USE.NAMES = FALSE,
+ FUN = function(pred, rows) {
+ lapply(pred, function(x) {
+ y <- y[rows]
+ wts <- wts[rows]
+
+ x <- outcome_conversion(x, lv = lev)
+ out <- data.frame(pred = x, obs = y, stringsAsFactors = FALSE)
+ if(!is.null(wts)) out$weights <- wts
+ out$rowIndex <- rows
+ out
+ })
+ })
if(testing) print(head(predicted))
## same for the class probabilities
@@ -266,12 +293,11 @@ nominalTrainWorkflow <- function(x, y, wts, info, method, ppOpts, ctrl, lev, tes
for(k in seq(along = predicted)) predicted[[k]] <- cbind(predicted[[k]], probValues[[k]])
}
- if(keep_pred)
+ if(keep_pred || (ctrl$method == "boot_all" && names(resampleIndex)[iter] == "AllData"))
{
tmpPred <- predicted
for(modIndex in seq(along = tmpPred))
{
- tmpPred[[modIndex]]$rowIndex <- holdoutIndex
tmpPred[[modIndex]] <- merge(tmpPred[[modIndex]],
allParam[modIndex,,drop = FALSE],
all = TRUE)
@@ -286,6 +312,26 @@ nominalTrainWorkflow <- function(x, y, wts, info, method, ppOpts, ctrl, lev, tes
lev = lev,
model = method)
if(testing) print(head(thisResample))
+
+ if(!is.null(predictedExtra)) {
+ thisResampleExtra <- lapply(predictedExtra, function(predicted) {
+ lapply(predicted,
+ ctrl$summaryFunction,
+ lev = lev,
+ model = method)
+ })
+ thisResampleExtra[[1L]] <- lapply(thisResampleExtra[[1L]], function(res) {
+ names(res) <- paste0(names(res), "Orig")
+ res
+ })
+ thisResampleExtra[[2L]] <- lapply(thisResampleExtra[[2L]], function(res) {
+ names(res) <- paste0(names(res), "Boot")
+ res
+ })
+ thisResampleExtra <- do.call(cbind, lapply(thisResampleExtra, function(x) do.call(rbind, x)))
+ thisResampleExtra <- cbind(allParam, thisResampleExtra)
+ } else thisResampleExtra <- NULL
+
## for classification, add the cell counts
if(length(lev) > 1 && length(lev) <= 50)
{
@@ -308,7 +354,7 @@ nominalTrainWorkflow <- function(x, y, wts, info, method, ppOpts, ctrl, lev, tes
if(ctrl$classProbs) tmp <- cbind(tmp, probValues)
tmp$rowIndex <- holdoutIndex
- if(keep_pred)
+ if(keep_pred || (ctrl$method == "boot_all" && names(resampleIndex)[iter] == "AllData"))
{
tmpPred <- tmp
tmpPred$rowIndex <- holdoutIndex
@@ -328,19 +374,47 @@ nominalTrainWorkflow <- function(x, y, wts, info, method, ppOpts, ctrl, lev, tes
thisResample <- as.data.frame(t(thisResample))
thisResample <- cbind(thisResample, info$loop[parm,,drop = FALSE])
+ ## for optimism bootstrap
+ if(!is.null(predictedExtra)) {
+ thisResampleExtra <- mapply(predictedExtra, extraIndex,
+ SIMPLIFY = FALSE, USE.NAMES = FALSE,
+ FUN = function(predicted, holdoutIndex) {
+ if(is.factor(y)) predicted <- outcome_conversion(predicted, lv = lev)
+ tmp <- data.frame(pred = predicted,
+ obs = y[holdoutIndex],
+ stringsAsFactors = FALSE)
+ ## Sometimes the code above does not coerce the first
+ ## column to be named "pred" so force it
+ names(tmp)[1] <- "pred"
+ if(!is.null(wts)) tmp$weights <- wts[holdoutIndex]
+ if(ctrl$classProbs && nrow(tmp) == nrow(probValues)) tmp <- cbind(tmp, probValues)
+ tmp$rowIndex <- holdoutIndex
+ ctrl$summaryFunction(tmp, lev = lev, model = method)
+ })
+
+ names(thisResampleExtra[[1L]]) <- paste0(names(thisResampleExtra[[1L]]), "Orig")
+ names(thisResampleExtra[[2L]]) <- paste0(names(thisResampleExtra[[2L]]), "Boot")
+
+ thisResampleExtra <- unlist(thisResampleExtra, recursive = FALSE)
+
+ thisResampleExtra <- cbind(as.data.frame(t(thisResampleExtra)), info$loop[parm, , drop = FALSE])
+
+ } else thisResampleExtra <- NULL
+
}
thisResample$Resample <- names(resampleIndex)[iter]
if(ctrl$verboseIter) progress(printed[parm,,drop = FALSE],
names(resampleIndex), iter, FALSE)
if(testing) print(thisResample)
- list(resamples = thisResample, pred = tmpPred)
+ list(resamples = thisResample, pred = tmpPred, resamplesExtra = thisResampleExtra)
}
resamples <- rbind.fill(result[names(result) == "resamples"])
- pred <- if(keep_pred) rbind.fill(result[names(result) == "pred"]) else NULL
- if(ctrl$method %in% c("boot632"))
+ pred <- rbind.fill(result[names(result) == "pred"])
+ resamplesExtra <- rbind.fill(result[names(result) == "resamplesExtra"])
+ if(ctrl$method %in% c("boot632", "optimism_boot", "boot_all"))
{
perfNames <- names(resamples)
perfNames <- perfNames[!(perfNames %in% c("Resample", as.character(method$parameters$parameter)))]
@@ -355,6 +429,11 @@ nominalTrainWorkflow <- function(x, y, wts, info, method, ppOpts, ctrl, lev, tes
warning("There were missing values in the apparent performance measures.")
}
resamples <- subset(resamples, Resample != "AllData")
+ if(!is.null(pred))
+ {
+ predHat <- subset(pred, Resample == "AllData")
+ pred <- subset(pred, Resample != "AllData")
+ }
}
names(resamples) <- gsub("^\\.", "", names(resamples))
@@ -369,12 +448,36 @@ nominalTrainWorkflow <- function(x, y, wts, info, method, ppOpts, ctrl, lev, tes
MeanSD,
exclude = gsub("^\\.", "", colnames(info$loop)))
- if(ctrl$method %in% c("boot632")) {
+ if(ctrl$method %in% c("boot632", "boot_all")) {
out <- merge(out, apparent)
- for(p in seq(along = perfNames)) {
- const <- 1-exp(-1)
- out[, perfNames[p]] <- (const * out[, perfNames[p]]) + ((1-const) * out[, paste(perfNames[p],"Apparent", sep = "")])
- }
+ const <- 1 - exp(-1)
+ sapply(perfNames, function(perfName) {
+ perfOut <- if(ctrl$method == "boot_all") paste0(perfName, "_632") else perfName
+ out[, perfOut] <<- (const * out[, perfName]) + ((1-const) * out[, paste(perfName, "Apparent", sep = "")])
+ NULL
+ })
+ }
+
+ if(ctrl$method %in% c("optimism_boot", "boot_all")) {
+ out <- merge(out, apparent)
+ out <- merge(out, ddply(resamplesExtra[, !grepl("Resample", colnames(resamplesExtra)), drop = FALSE],
+ colnames(info$loop),
+ function(df, exclude) {
+ colMeans(df[, setdiff(colnames(df), exclude), drop = FALSE])
+ },
+ exclude = colnames(info$loop)))
+ sapply(perfNames, function(perfName) {
+ optimism <- out[ , paste0(perfName, "Orig")] - out[ , paste0(perfName, "Boot")]
+ final_estimate <- out[ , paste0(perfName, "Apparent")] + optimism
+ ## Remove unnecessary values
+ out[ , paste0(perfName, "Orig")] <<- NULL
+ out[ , paste0(perfName, "Boot")] <<- NULL
+ perfOut <- if(ctrl$method == "boot_all") paste0(perfName, "_OptBoot") else perfName
+ ## Update estimates
+ out[ , paste0(perfName, "Optimism")] <<- optimism
+ out[ , perfOut] <<- final_estimate
+ NULL
+ })
}
list(performance = out, resamples = resamples, predictions = if(keep_pred) pred else NULL)
View

Some generated files are not rendered by default. Learn more.

Oops, something went wrong.
Oops, something went wrong.