-
Notifications
You must be signed in to change notification settings - Fork 1
/
crossval.R
137 lines (127 loc) · 5.41 KB
/
crossval.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#' Estimator of the Mean Squared Prediction Error using Cross-Validation.
#'
#' @family utilities
#'
#' @description Estimator of the mean squared prediction error of
#' different learners using cross-validation.
#'
#' @inheritParams ddml_plm
#' @param y The outcome variable.
#' @param X A (sparse) matrix of predictive variables.
#' @param Z Optional additional (sparse) matrix of predictive variables.
#' @param learners \code{learners} is a list of lists, each containing four
#' named elements:
#' \itemize{
#' \item{\code{fun} The base learner function. The function must be
#' such that it predicts a named input \code{y} using a named input
#' \code{X}.}
#' \item{\code{args} Optional arguments to be passed to \code{fun}.}
#' \item{\code{assign_X} An optional vector of column indices
#' corresponding to variables in \code{X} that are passed to
#' the base learner.}
#' \item{\code{assign_Z} An optional vector of column indices
#' corresponding to variables in \code{Z} that are passed to the
#' base learner.}
#' }
#' Omission of the \code{args} element results in default arguments being
#' used in \code{fun}. Omission of \code{assign_X} (and/or \code{assign_Z})
#' results in inclusion of all predictive variables in \code{X} (and/or
#' \code{Z}).
#' @param cv_folds Number of folds used for cross-validation.
#' @param cv_subsamples List of vectors with sample indices for
#' cross-validation.
#' @param progress String to print before learner and cv fold progress.
#'
#' @return \code{crossval} returns a list containing the following components:
#' \describe{
#' \item{\code{mspe}}{A vector of MSPE estimates,
#' each corresponding to a base learners (in chronological order).}
#' \item{\code{oos_resid}}{A matrix of out-of-sample prediction errors,
#' each column corresponding to a base learners (in chronological
#' order).}
#' \item{\code{cv_subsamples}}{Pass-through of \code{cv_subsamples}.
#' See above.}
#' }
#' @export
#'
#' @examples
#' # Construct variables from the included Angrist & Evans (1998) data
#' y = AE98[, "worked"]
#' X = AE98[, c("morekids", "age","agefst","black","hisp","othrace","educ")]
#'
#' # Compare ols, lasso, and ridge using 4-fold cross-validation
#' cv_res <- crossval(y, X,
#' learners = list(list(fun = ols),
#' list(fun = mdl_glmnet),
#' list(fun = mdl_glmnet,
#' args = list(alpha = 0))),
#' cv_folds = 4,
#' silent = TRUE)
#' cv_res$mspe
crossval <- function(y, X, Z = NULL,
learners,
cv_folds = 5,
cv_subsamples = NULL,
silent = FALSE,
progress = NULL) {
# Data parameters
nobs <- length(y)
nlearners <- length(learners)
# Create cv sample fold tuple
if (is.null(cv_subsamples)) {
cv_subsamples <- generate_subsamples(nobs, cv_folds)
}#IF
cv_folds <- length(cv_subsamples)
nobs <- length(unlist(cv_subsamples)) # In case subsamples are user-provided
# Compute out-of-sample errors
cv_res <- sapply(1:(cv_folds * nlearners), function(x) {
# Select model and cv-fold for this job
j <- ceiling(x / cv_folds) # jth model
i <- x - cv_folds * (ceiling(x / cv_folds) - 1) # ith CV fold
fold_x <- cv_subsamples[[i]]
# Print progress
if (!silent) {
cat(paste0("\r", progress,
"learner ", j, "/", nlearners,
", cv fold ", i, "/", cv_folds))
}#IF
# Compute model for this fold
crossval_compute(test_sample = fold_x,
learner = learners[[j]],
y, X, Z)
})#SAPPLY
# Compile residual matrix
oos_resid <- unlist(cv_res)[order(unlist(cv_subsamples))]
oos_resid <- matrix(oos_resid, nobs, nlearners)
# Compute MSPE by learner
mspe <- colMeans(oos_resid^2)
# Organize and return output
output <- list(mspe = mspe,
oos_resid = oos_resid,
cv_subsamples = cv_subsamples)
return(output)
}#CROSSVAL
# Complementary functions ======================================================
crossval_compute <- function(test_sample, learner,
y, X, Z = NULL) {
# Check whether X, Z assignment has been specified. If not, include all.
if (is.null(learner$assign_X)) learner$assign_X <- 1:ncol(X)
if (is.null(learner$assign_Z) & !is.null(Z)) learner$assign_Z <- 1:ncol(Z)
# Extract model arguments
mdl_fun <- list(what = learner$fun, args = learner$args)
assign_X <- learner$assign_X
assign_Z <- learner$assign_Z
# Compute model for this fold
# Note: this is effectively copying the data -- improvement needed.
mdl_fun$args$y <- y[-test_sample]
mdl_fun$args$X <- cbind(X[-test_sample, assign_X, drop = F],
Z[-test_sample, assign_Z, drop = F])
mdl_fit <- do.call(do.call, mdl_fun)
# Compute out of sample residuals
oos_fitted <- stats::predict(mdl_fit,
cbind(X[test_sample, assign_X, drop = F],
Z[test_sample, assign_Z, drop = F]))
oos_resid <- y[test_sample] - methods::as(oos_fitted, "matrix")
# Return residuals and cv_Z
return(oos_resid)
}#CROSSVAL_COMPUTE