Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

Already on GitHub? Sign in to your account

Confusion matrix normalization #355

Merged
merged 5 commits into from Mar 9, 2016
@@ -145,7 +145,7 @@ confusionMatrix.table <- function(data, positive = NULL, prevalence = NULL, ...)
overall = overall,
byClass = tableStats,
dots = list(...)),
- class = "confusionMatrix")
+ class = "confusionMatrix")
}
@@ -170,17 +170,37 @@ as.matrix.confusionMatrix <- function(x, what = "xtabs", ...)
out
}
-as.table.confusionMatrix <- function(x, ...) x$table
+as.table.confusionMatrix <- function(x, ...) x$table
confusionMatrix.train <- function(data, norm = "overall", dnn = c("Prediction", "Reference"), ...)
{
- if(data$modelType == "Regression") stop("confusion matrices are only valid for classification models")
- if(!norm %in% c("none", "overall", "average")) stop("values for norm should be 'none', 'overall', 'byClass' or 'average'")
- if(data$control$method %in% c("oob", "LOOCV", "none")) stop("cannot compute confusion matrices for leave-one-out, out-of-bag resampling or no resampling")
- if(!is.null(data$control$index))
- {
+ if (inherits(data, "train")) {
+ if(data$modelType == "Regression")
+ stop("confusion matrices are only valid for classification models")
+ if(data$control$method %in% c("oob", "LOOCV"))
+ stop("cannot compute confusion matrices for leave-one-out or out-of-bag resampling")
+ if(data$control$method == "none")
+ return(confusionMatrix(predict(data), data$trainingData$.outcome, dnn = dnn, ...))
+
+ lev <- levels(data)
+
+ ## get only best tune
+ names(data$bestTune) <- gsub("^\\.", "", names(data$bestTune))
+ resampledCM <- merge(data$bestTune, data$resampledCM)
+
+ } else {
+ if(is.null(data$resampledCM))
+      stop("resampled confusion matrices are not available")
+ if(data$control$method %in% c("oob", "LOOCV"))
+ stop("cannot compute confusion matrices for leave-one-out or out-of-bag resampling")
+
+ lev <- data$obsLevels
+ resampledCM <- data$resampledCM
+ }
+
+ if(!is.null(data$control$index)) {
resampleN <- unlist(lapply(data$control$index, length))
numResamp <- length(resampleN)
resampText <- resampName(data)
@@ -189,81 +209,57 @@ confusionMatrix.train <- function(data, norm = "overall", dnn = c("Prediction",
numResamp <- 0
}
- lev <- levels(data)
- ## get only best tune
- names(data$bestTune) <- gsub("^\\.", "", names(data$bestTune))
- resampledCM <- merge(data$bestTune, data$resampledCM)
- counts <- as.matrix(resampledCM[,grep("^cell", colnames(resampledCM))])
- ## normalize by true class?
+ counts <- as.matrix(resampledCM[ , grep("^\\.?cell", colnames(resampledCM))])
+
+ ## normalize?
+ norm <- match.arg(norm, c("none", "overall", "average"))
+
+ if(norm == "none") counts <- matrix(apply(counts, 2, sum), nrow = length(lev))
+ else counts <- matrix(apply(counts, 2, mean), nrow = length(lev))
- if(norm == "overall") counts <- t(apply(counts, 1, function(x)x/sum(x)))
- if(norm == "average") counts <- counts/numResamp
- overall <- matrix(apply(counts, 2, mean), nrow = length(lev))
- rownames(overall) <- colnames(overall) <- lev
- if(norm != "none") overall <- overall*100
- names(dimnames(overall)) <- dnn
+ if(norm == "overall") counts <- counts / sum(counts) * 100
+ ## names
+ rownames(counts) <- colnames(counts) <- lev
+ names(dimnames(counts)) <- dnn
- out <- list(table = as.table(overall),
+ ## out
+ out <- list(table = as.table(counts),
norm = norm,
B = length(data$control$index),
text = paste(resampText, "Confusion Matrix"))
- class(out) <- "confusionMatrix.train"
+ class(out) <- paste0("confusionMatrix.", class(data))
out
}
+confusionMatrix.rfe <- confusionMatrix.train
+confusionMatrix.sbf <- confusionMatrix.train
print.confusionMatrix.train <- function(x, digits = 1, ...)
{
cat(x$text, "\n")
normText <- switch(x$norm,
- none = "\n(entries are un-normalized counts)\n",
- average = "\n(entries are cell counts per resample)\n",
- overall = "\n(entries are percentages of table totals)\n",
- byClass = "\n(entries are percentages within the reference class)\n",
+ none = "\n(entries are un-normalized aggregated counts)\n",
+ average = "\n(entries are average cell counts across resamples)\n",
+                      overall = "\n(entries are percentages of average cell counts across resamples)\n",
"")
cat(normText, "\n")
- if(x$norm == "none" & x$B == 1) print(getFromNamespace("confusionMatrix.table", "caret")(x$table)) else print(round(x$table, digits))
- cat("\n")
- invisible(x)
-}
-
-confusionMatrix.rfe <- function(data, norm = "overall", dnn = c("Prediction", "Reference"), ...)
-{
- if(is.null(data$resampledCM)) stop("resampled confusion matrices are not availible")
- if(!norm %in% c("none", "overall", "average")) stop("values for norm should be 'none', 'overall', 'byClass' or 'average'")
- if(data$control$method %in% c("oob", "LOOCV")) stop("cannot compute confusion matrices for leave-one-out and out-of-bag resampling")
- if(!is.null(data$control$index))
- {
- resampleN <- unlist(lapply(data$control$index, length))
- numResamp <- length(resampleN)
- resampText <- resampName(data)
+ if(x$norm == "none" & x$B == 1) {
+ print(getFromNamespace("confusionMatrix.table", "caret")(x$table))
} else {
- resampText <- ""
- numResamp <- 0
+ print(round(x$table, digits))
+
+ out <- cbind("Accuracy (average)", ":", formatC(sum(diag(x$table) / sum(x$table))))
+
+ dimnames(out) <- list(rep("", nrow(out)), rep("", ncol(out)))
+ print(out, quote = FALSE)
+ cat("\n")
}
-
-
- resampledCM <- data$resampledCM
- counts <- as.matrix(resampledCM[,grep("^\\.cell", colnames(resampledCM))])
- ## normalize by true class?
-
- if(norm == "overall") counts <- t(apply(counts, 1, function(x)x/sum(x)))
- if(norm == "average") counts <- counts/numResamp
- overall <- matrix(apply(counts, 2, mean), nrow = length(data$obsLevels))
- rownames(overall) <- colnames(overall) <- data$obsLevels
- overall <- overall*100
- names(dimnames(overall)) <- dnn
-
-
- out <- list(table = overall,
- norm = norm,
- B = numResamp,
- text = paste(resampText, "Confusion Matrix"))
- class(out) <- "confusionMatrix.rfe"
- out
+ invisible(x)
}
+print.confusionMatrix.rfe <- print.confusionMatrix.train
+print.confusionMatrix.sbf <- print.confusionMatrix.train
resampName <- function(x, numbers = TRUE)
{
@@ -288,10 +284,10 @@ resampName <- function(x, numbers = TRUE)
loocv = "Leave-One-Out Cross-Validation",
adaptive_boot = paste("Adaptively Bootstrapped (", numResamp, " reps)", sep = ""),
adaptive_cv = paste("Adaptively Cross-Validated (", x$control$number, " fold, repeated ",
- x$control$repeats, " times)", sep = ""),
+ x$control$repeats, " times)", sep = ""),
adaptive_lgocv = paste("Adaptive Repeated Train/Test Splits Estimated (", numResamp, " reps, ",
- round(x$control$p, 2), "%)", sep = "")
- )
+ round(x$control$p, 2), "%)", sep = "")
+ )
} else {
out <- switch(tolower(x$control$method),
none = "None",
@@ -309,45 +305,6 @@ resampName <- function(x, numbers = TRUE)
out
}
-confusionMatrix.sbf <- function(data, norm = "overall", dnn = c("Prediction", "Reference"), ...)
-{
- if(is.null(data$resampledCM)) stop("resampled confusion matrices are not availible")
- if(!norm %in% c("none", "overall", "average")) stop("values for norm should be 'none', 'overall', 'byClass' or 'average'")
- if(data$control$method %in% c("oob", "LOOCV")) stop("cannot compute confusion matrices for leave-one-out and out-of-bag resampling")
- if(!is.null(data$control$index))
- {
- resampleN <- unlist(lapply(data$control$index, length))
- numResamp <- length(resampleN)
- resampText <- resampName(data)
- } else {
- resampText <- ""
- numResamp <- 0
- }
-
- resampledCM <- data$resampledCM
- counts <- as.matrix(resampledCM[,grep("^\\.cell", colnames(resampledCM))])
- ## normalize by true class?
-
- if(norm == "overall") counts <- t(apply(counts, 1, function(x)x/sum(x)))
- if(norm == "average") counts <- counts/numResamp
- overall <- matrix(apply(counts, 2, mean), nrow = length(data$obsLevels))
- rownames(overall) <- colnames(overall) <- data$obsLevels
- overall <- overall*100
- names(dimnames(overall)) <- dnn
-
-
- out <- list(table = overall,
- norm = norm,
- B = numResamp,
- text = paste(resampText, "Confusion Matrix"))
- class(out) <- "confusionMatrix.sbf"
- out
-}
-
-print.confusionMatrix.rfe <- print.confusionMatrix.train
-print.confusionMatrix.sbf <- print.confusionMatrix.train
-
-
mcc <- function(tab, pos = colnames(tab)[1])
{
@@ -17,15 +17,15 @@ Using a \code{\link{train}}, \code{\link{rfe}}, \code{\link{sbf}} object, determ
}
\arguments{
- \item{data}{an object of class \code{\link{train}}, \code{\link{rfe}}, \code{\link{sbf}} that did not use out-of-bag resampling or leave-one-out cross-validation.}
- \item{norm}{a character string indicating how the table entries should be normalized. Valid values are "none", "overall" or "average". }
- \item{dnn}{a character vector of dimnames for the table}
+  \item{data}{An object of class \code{\link{train}}, \code{\link{rfe}} or \code{\link{sbf}} that did not use out-of-bag resampling or leave-one-out cross-validation.}
+ \item{norm}{A character string indicating how the table entries should be normalized. Valid values are "none", "overall" or "average". }
+ \item{dnn}{A character vector of dimnames for the table}
\item{\dots}{not used here}
}
\details{
When \code{\link{train}} is used for tuning a model, it tracks the confusion matrix cell entries for the hold-out samples. These can be aggregated and used for diagnostic purposes. For \code{\link{train}}, the matrix is estimated for the final model tuning parameters determined by \code{\link{train}}. For \code{\link{rfe}}, the matrix is associated with the optimal number of variables.
-There are several ways to show the table entries. Using \code{norm = "none"} will show the frequencies of samples on each of the cells (across all resamples). \code{norm = "overall"} first divides the cell entries by the total number of data points in the table, then averages these percentages. \code{norm = "average"} takes the raw, aggregate cell counts across resamples and divides by the number of resamples (i.e. to yield an average count for each cell).
+There are several ways to show the table entries. Using \code{norm = "none"} will show the aggregated counts of samples on each of the cells (across all resamples). For \code{norm = "average"}, the average number of cell counts across resamples is computed (this can help evaluate how many holdout samples there were on average). The default is \code{norm = "overall"}, which is equivalent to \code{"average"} but in percentages.
}
\value{
a list of class \code{confusionMatrix.train}, \code{confusionMatrix.rfe} or \code{confusionMatrix.sbf} with elements
@@ -77,11 +77,15 @@ Using adaptive resampling when \code{method} is either \code{"adaptive_cv"}, \co
}
The option \code{search = "grid"} uses the default grid search routine. When \code{search = "random"}, a random search procedure is used (Bergstra and Bengio, 2012). See \url{http://topepo.github.io/caret/random.html} for details and an example.
+
+The \code{"boot632"} method uses the 0.632 estimator presented in Efron (1983), not to be confused with the 0.632+ estimator proposed later by the same author.
}
\author{Max Kuhn}
\references{
+Efron (1983). ``Estimating the error rate of a prediction rule: improvement on cross-validation''. Journal of the American Statistical Association, 78(382):316-331
+
Bergstra and Bengio (2012), ``Random Search for Hyper-Parameter Optimization'', Journal of Machine Learning Research, 13(Feb):281-305
Kuhn (2014), ``Futility Analysis in the Cross-Validation of Machine Learning Models'' \url{http://arxiv.org/abs/1405.6974},