/
LSCV.density.R
238 lines (224 loc) · 11.4 KB
/
LSCV.density.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
#' Cross-validation bandwidths for spatial kernel density estimates
#'
#' Isotropic fixed or global (for adaptive) bandwidth selection for standalone 2D density/intensity
#' based on either unbiased least squares cross-validation (LSCV) or likelihood (LIK) cross-validation.
#'
#' This function implements the bivariate, edge-corrected versions of fixed-bandwidth least squares cross-validation and likelihood cross-validation
#' as outlined in Sections 3.4.3 and 3.4.4 of Silverman (1986) in order to select an optimal fixed smoothing bandwidth. With \code{type = "adaptive"} it may also be used to select the global bandwidth
#' for adaptive kernel density estimates, making use of multi-scale estimation (Davies and Baddeley, 2018) via \code{\link{multiscale.density}}.
#' Note that for computational reasons, the leave-one-out procedure is not performed on the pilot density in the adaptive setting; it
#' is only performed on the final stage estimate. Current development efforts include extending this functionality, see \code{\link{SLIK.adapt}}. See also `Warning' below.
#'
#' Where \code{LSCV.density} is based on minimisation of an unbiased estimate of the mean integrated squared error (MISE) of the density, \code{LIK.density} is based on
#' maximisation of the cross-validated leave-one-out average of the log-likelihood of the density estimate with respect to \eqn{h}.
#'
#' In both functions, the argument \code{zero.action} can be used to control the level of severity in response to small bandwidths that result (due to numerical error) in at least one density value being zero or less.
#' When \code{zero.action = -1}, the function strictly forbids bandwidths that would result in one or more \emph{pixel} values of a kernel estimate of the original data (i.e. anything over the whole region) being zero or less---this is the most restrictive truncation. With \code{zero.action = 0} (default), the function
#' automatically forbids bandwidths that yield erroneous values at the leave-one-out data point locations only. With \code{zero.action = 1}, the minimum machine value (see \code{.Machine$double.xmin} at the prompt) is
#' used to replace these individual leave-one-out values. When \code{zero.action = 2}, the minimum value of the valid (greater than zero) leave-one-out values is used to replace any erroneous leave-one-out values.
#'
#'
#'
#' @aliases LIK.density
#'
#' @rdname CV
#'
#' @param pp An object of class \code{\link[spatstat.geom]{ppp}} giving the observed
#' 2D data to be smoothed.
#' @param hlim An optional vector of length 2 giving the limits of the
#' optimisation routine with respect to the bandwidth. If unspecified, the
#' function attempts to choose this automatically.
#' @param hseq An optional increasing sequence of bandwidth values at which to
#' manually evaluate the optimisation criterion. Used only in the case
#' \code{(!auto.optim && is.null(hlim))}.
#' @param resolution Spatial grid size; the optimisation will be based on a
#' [\code{resolution} \eqn{\times}{x} \code{resolution}] density estimate.
#' @param edge Logical value indicating whether to edge-correct the density
#' estimates used.
#' @param auto.optim Logical value indicating whether to automate the numerical
#' optimisation using \code{\link{optimise}}. If \code{FALSE}, the optimisation
#' criterion is evaluated over \code{hseq} (if supplied), or over a seqence of
#' values controlled by \code{hlim} and \code{seqres}.
#' @param seqres Optional resolution of an increasing sequence of bandwidth
#' values. Only used if \code{(!auto.optim && is.null(hseq))}.
#' @param parallelise Numeric argument to invoke parallel processing, giving
#' the number of CPU cores to use when \code{!auto.optim} \bold{and} \code{type = "fixed"}. Experimental. Test
#' your system first using \code{parallel::detectCores()} to identify the
#' number of cores available to you.
#' @param verbose Logical value indicating whether to provide function progress
#' commentary.
#' @param type A character string; \code{"fixed"} (default) performs classical leave-one-out
#' cross-validation for the fixed-bandwidth estimator. Alternatively, \code{"adaptive"} utilises
#' multiscale adaptive kernel estimation (Davies & Baddeley, 2018) to run the cross-validation
#' in an effort to find a suitable global bandwidth for the adaptive estimator. Note that data points are not `left out' of
#' the pilot density estimate when using this option (this capability is currently in development). See also the entry for \code{...}.
#' @param zero.action A numeric integer, either \code{-1}, \code{0} (default), \code{1} or \code{2} controlling how the function should behave in response to numerical errors at very small bandwidths, when such a bandwidth results in one or more zero or negative density values during the leave-one-out computations. See `Details'.
#' @param ... Additional arguments controlling pilot density estimation and multi-scale bandwidth-axis
#' resolution when \code{type = "adaptive"}. Relevant arguments are \code{hp}, \code{pilot.density},
#' \code{gamma.scale}, and \code{trim} from \code{\link{bivariate.density}}; and \code{dimz} from
#' \code{\link{multiscale.density}}. If \code{hp} is missing and required, the function makes a (possibly recursive)
#' call to \code{LSCV.density} to set this using fixed-bandwidth LSCV. The remaining defaults are \code{pilot.density = pp},
#' \code{gamma.scale = "geometric"}, \code{trim = 5}, and \code{dimz = resolution}.
#'
#' @return A single numeric value of the estimated bandwidth (if
#' \code{auto.optim = TRUE}). Otherwise, a \eqn{[}\code{seqres} \eqn{x} 2\eqn{]} matrix
#' giving the bandwidth sequence and corresponding CV
#' function value.
#'
#' @section Warning: Leave-one-out CV for bandwidth selection in kernel
#' density estimation is notoriously unstable in practice and has a tendency to
#' produce rather small bandwidths, particularly for spatial data. Satisfactory bandwidths are not guaranteed
#' for every application; \code{zero.action} can curb adverse numeric effects for very small bandwidths during the optimisation procedures. This method can also be computationally expensive for
#' large data sets and fine evaluation grid resolutions. The user may also need to
#' experiment with adjusting \code{hlim} to find a suitable minimum.
#'
#' @author T. M. Davies
#'
#' @seealso \code{\link{SLIK.adapt}} and functions for bandwidth selection in package
#' \code{\link{spatstat}}: \code{\link[spatstat.explore]{bw.diggle}};
#' \code{\link[spatstat.explore]{bw.ppl}}; \code{\link[spatstat.explore]{bw.scott}};
#' \code{\link[spatstat.explore]{bw.frac}}.
#'
#' @references
#'
#' Davies, T.M. and Baddeley A. (2018), Fast computation of
#' spatially adaptive kernel estimates, \emph{Statistics and Computing}, \bold{28}(4), 937-956.
#'
#' Silverman, B.W. (1986), \emph{Density Estimation for Statistics
#' and Data Analysis}, Chapman & Hall, New York.
#'
#' Wand, M.P. and Jones,
#' C.M., 1995. \emph{Kernel Smoothing}, Chapman & Hall, London.
#'
#' @examples
#'
#' data(pbc)
#' pbccas <- split(pbc)$case
#'
#' LIK.density(pbccas)
#' LSCV.density(pbccas)
#'
#' \donttest{
#' #* FIXED
#'
#' # custom limits
#' LIK.density(pbccas,hlim=c(0.01,4))
#' LSCV.density(pbccas,hlim=c(0.01,4))
#'
#' # disable edge correction
#' LIK.density(pbccas,hlim=c(0.01,4),edge=FALSE)
#' LSCV.density(pbccas,hlim=c(0.01,4),edge=FALSE)
#'
#' # obtain objective function
#' hcv <- LIK.density(pbccas,hlim=c(0.01,4),auto.optim=FALSE)
#' plot(hcv);abline(v=hcv[which.max(hcv[,2]),1],lty=2,col=2)
#'
#' #* ADAPTIVE
#' LIK.density(pbccas,type="adaptive")
#' LSCV.density(pbccas,type="adaptive")
#'
#' # change pilot bandwidth used
#' LIK.density(pbccas,type="adaptive",hp=2)
#' LSCV.density(pbccas,type="adaptive",hp=2)
#' }
#'
#' @export
LSCV.density <- function(pp,hlim=NULL,hseq=NULL,resolution=64,edge=TRUE,auto.optim=TRUE,
type=c("fixed","adaptive"),seqres=30,parallelise=NULL,
zero.action=0,verbose=TRUE,...){
if(!inherits(pp,"ppp")) stop("data object 'pp' must be of class \"ppp\"")
W <- Window(pp)
if(is.null(hlim)){
ppu <- pp
marks(ppu) <- NULL
md <- min(nndist(unique(ppu)))
hlim <- c(md,max(md*50,min(diff(W$xrange),diff(W$yrange))/6))
} else {
hlim <- checkran(hlim,"'hlim'")
}
if(!zero.action%in%((-1):2)) stop("invalid 'zero.action'")
typ <- type[1]
if(typ=="fixed"){
if(auto.optim){
if(verbose) cat("Searching for optimal h in ",prange(hlim),"...",sep="")
result <- suppressWarnings(optimise(LSCV.density.spatial.single,interval=hlim,pp=pp,res=resolution,edge=edge,za=zero.action)$minimum)
if(verbose) cat("Done.\n")
} else {
if(is.null(hseq)) hseq <- seq(hlim[1],hlim[2],length=seqres)
hn <- length(hseq)
if(is.null(parallelise)){
lscv.vec <- rep(NA,hn)
if(verbose) pb <- txtProgressBar(1,hn)
for(i in 1:hn){
lscv.vec[i] <- LSCV.density.spatial.single(hseq[i],pp,resolution,edge,za=zero.action)
if(verbose) setTxtProgressBar(pb,i)
}
if(verbose) close(pb)
} else {
ncores <- detectCores()
if(verbose) cat(paste("Evaluating criterion on",parallelise,"/",ncores,"cores..."))
if(parallelise>ncores) stop("cores requested exceeds available count")
registerDoParallel(cores=parallelise)
lscv.vec <- foreach(i=1:hn,.packages="spatstat",.combine=c) %dopar% {
return(LSCV.density.spatial.single(hseq[i],pp,resolution,edge,zero.action))
}
if(verbose) cat("Done.\n")
}
result <- cbind(hseq,lscv.vec)
dimnames(result)[[2]] <- c("h","CV")
}
} else if(typ=="adaptive"){
ellip <- list(...)
if(is.null(ellip$hp)){
if(verbose) cat("Selecting pilot bandwidth...")
hp <- LSCV.density(pp,verbose=FALSE,zero.action=zero.action)
if(verbose) cat(paste("Done.\n [ Found hp =",hp,"]\n"))
} else {
hp <- ellip$hp
}
if(is.null(ellip$pilot.density)){
pilot.density <- pp
} else {
pilot.density <- ellip$pilot.density
}
if(is.null(ellip$gamma.scale)){
gamma.scale <- "geometric"
} else {
gamma.scale <- ellip$gamma.scale
}
if(is.null(ellip$trim)){
trim <- 5
} else {
trim <- ellip$trim
}
if(is.null(ellip$dimz)){
dimz <- resolution
} else {
dimz <- ellip$dimz
}
if(verbose) cat("Computing multi-scale estimate...")
hhash <- mean(hlim)
msobject <- multiscale.density(pp,h0=hhash,hp=hp,h0fac=hlim/hhash,edge=ifelse(edge,"uniform","none"),resolution=resolution,dimz=dimz,gamma.scale=gamma.scale,trim=trim,intensity=TRUE,pilot.density=pilot.density,verbose=FALSE)
if(verbose) cat("Done.\n")
h0range <- range(as.numeric(names(msobject$z)))
if(auto.optim){
if(verbose) cat("Searching for optimal h0 in ",prange(h0range),"...",sep="")
h0opt <- suppressWarnings(optimise(ms.loo,interval=h0range,object=msobject,za=zero.action)$minimum)
if(verbose) cat("Done.\n")
return(h0opt)
} else {
if(is.null(hseq)) hseq <- seq(h0range[1],h0range[2],length=seqres)
hn <- length(hseq)
lscv.vec <- rep(NA,hn)
if(verbose) pb <- txtProgressBar(1,hn)
for(i in 1:hn){
lscv.vec[i] <- ms.loo(hseq[i],msobject,za=zero.action)
if(verbose) setTxtProgressBar(pb,i)
}
if(verbose) close(pb)
result <- cbind(hseq,lscv.vec)
dimnames(result)[[2]] <- c("h0","CV")
}
} else stop("invalid 'type'")
return(result)
}