-
Notifications
You must be signed in to change notification settings - Fork 12
/
sigmas.r
232 lines (200 loc) · 7.85 KB
/
sigmas.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
#' @include s4-unions.r dist-matrix-coerce.r accessor-generics.r
NULL
#' Sigmas Object
#'
#' Holds the information about how the \code{sigma} parameter for a \link{DiffusionMap} was obtained,
#' and in this way provides a plotting function for the \link{find_sigmas} heuristic.
#' You should not need to create a Sigmas object yourself. Provide \code{sigma} to \link{DiffusionMap} instead or use \link{find_sigmas}.
#'
#' A Sigmas object is either created by \link{find_sigmas} or by specifying the \code{sigma} parameter to \link{DiffusionMap}.
#'
#' In the second case, if the \code{sigma} parameter is just a number,
#' the resulting \code{Sigmas} object has all slots except of \code{optimal_sigma} set to \code{NULL}.
#'
#' @usage Sigmas(...)
#'
#' @param ... See \dQuote{\strong{Slots}} below
#' @param object,x \link{Sigmas} object
#'
#' @return \code{Sigmas} creates an object of the same class
#'
#' @slot log_sigmas Vector of length \eqn{m} containing the \eqn{\log_{10}} of the \eqn{\sigma}s
#' @slot dim_norms Vector of length \eqn{m-1} containing the average dimensionality \eqn{\langle p \rangle} for the respective kernel widths
#' @slot optimal_sigma Multiple local sigmas or the mean of the two global \eqn{\sigma}s around the highest \eqn{\langle p \rangle} (\code{c(optimal_idx, optimal_idx+1L)})
#' @slot optimal_idx The index of the highest \eqn{\langle p \rangle}.
#' @slot avrd_norms Vector of length \eqn{m} containing the average dimensionality for the corresponding sigma.
#'
#' @examples
#' data(guo)
#' sigs <- find_sigmas(guo, verbose = FALSE)
#' optimal_sigma(sigs)
#' print(sigs)
#'
#' @seealso \code{\link{find_sigmas}}, the function to determine a locally optimal sigma and returning this class
#'
#' @aliases Sigmas-methods
#'
#' @importFrom methods setClass
#' @rdname Sigmas-class
#' @export Sigmas
#' @exportClass Sigmas
Sigmas <- setClass('Sigmas', slots = c(
log_sigmas = 'numericOrNULL',
dim_norms = 'numericOrNULL',
optimal_sigma = 'numericOrNULL',
optimal_idx = 'integerOrNULL',
avrd_norms = 'numericOrNULL'))
#' @return \code{optimal_sigma} retrieves the numeric value of the optimal sigma or local sigmas
#' @rdname Sigmas-class
#' @export
setMethod('optimal_sigma', 'Sigmas', function(object) object@optimal_sigma)
#' @rdname Sigmas-class
#' @export
setMethod('print', 'Sigmas', function(x) {
cat(sprintf('Sigmas (%s Steps performed)\noptimal_sigma: ', length(x@log_sigmas)))
str(optimal_sigma(x))
invisible(x)
})
#' @importFrom graphics plot
#' @rdname Sigmas-class
#' @export
setMethod('show', 'Sigmas', function(object) {
plot(object)
invisible()
})
#' Calculate the average dimensionality for m different gaussian kernel widths (\eqn{\sigma}).
#'
#' The sigma with the maximum value in average dimensionality is close to the ideal one.
#' Increasing step number gets this nearer to the ideal one.
#'
#' @param data Data set with \eqn{n} observations. Can be a \link[base]{data.frame},
#' \link[base]{matrix}, \link[Biobase:class.ExpressionSet]{ExpressionSet}
#' or \link[SingleCellExperiment]{SingleCellExperiment}.
#' @param step_size Size of log-sigma steps
#' @param steps Number of steps/calculations
#' @param start Initial value to search from. (Optional. default: \eqn{\log_{10}(min(dist(data)))})
#' @param sample_rows Number of random rows to use for sigma estimation or vector of row indices/names to use.
#' In the first case, only used if actually smaller than the number of available rows (Optional. default: 500)
#' @param early_exit logical. If TRUE, return if the first local maximum is found, else keep running
#' @param ... Unused. All parameters to the right of the \code{...} have to be specified by name (e.g. \code{find_sigmas(data, verbose = FALSE)})
#' @param censor_val Value regarded as uncertain. Either a single value or one for every dimension
#' @param censor_range Uncertainity range for censoring. A length-2-vector of certainty range start and end. TODO: also allow \eqn{2\times G} matrix
#' @param missing_range Whole data range for missing value model. Has to be specified if NAs are in the data
#' @param vars Variables (columns) of the data to use. Specifying TRUE will select all columns (default: All floating point value columns)
#' @param verbose logical. If TRUE, show a progress bar and plot the output
#'
#' @return Object of class \link{Sigmas}
#'
#' @seealso \code{\link{Sigmas}}, the class returned by this; \code{\link{DiffusionMap}}, the class this is used for
#'
#' @examples
#' data(guo)
#' sigs <- find_sigmas(guo, verbose = TRUE)
#' DiffusionMap(guo, sigs)
#'
#' @importFrom methods as
#' @importFrom graphics plot
#' @importFrom utils txtProgressBar setTxtProgressBar
#' @importFrom Matrix colSums rowSums
#' @importFrom proxy dist
#' @export
find_sigmas <- function(
data,
step_size = .1, steps = 10L,
start = NULL,
sample_rows = 500L,
early_exit = FALSE,
...,
censor_val = NULL, censor_range = NULL,
missing_range = NULL,
vars = NULL,
verbose = TRUE
) {
chkDots(...)
data <- dataset_extract_doublematrix(data, vars)
if (any(is.na(data)))
data <- as.matrix(hotdeck(data, imp_var = FALSE))
stopifnot(steps >= 3L)
if (length(sample_rows) > 1L) {
data <- data[sample_rows, ]
} else if (nrow(data) > sample_rows) {
sample_idx <- sample(nrow(data), sample_rows)
data <- data[sample_idx, ]
}
n <- nrow(data)
dists <- dist(data)
min_dist <- min(dists)
if (min_dist == 0)
stop('Minimum distance in the data may not be 0')
dists <- as(dists, 'symmetricMatrix')
if (is.null(start))
start <- log10(min_dist)
#if (missing(step_size))
# step_size = min_dist / steps
if (verbose) print(c(min_dist = min_dist, start = start, step_size = step_size))
get_trans_p <-
if (test_censoring(censor_val, censor_range, data, missing_range)) {
function(sigma) censoring(data, sigma, dists, censor_val, censor_range, missing_range)
} else {
msqd <- -(dists ^ 2)
function(sigma) exp(msqd / (2 * sigma ^ 2))
}
do_step <- function(i) {
# i can be negative!
log_sigma <- start + i*step_size
trans_p <- get_trans_p(10 ^ log_sigma)
diag_d <- colSums(trans_p, na.rm = TRUE)
list(avrd_norm = (sum(log10(diag_d/n) / diag_d)) / sum(1 / diag_d),
log_sigma = log_sigma)
}
avrd_norms <- numeric(steps)
log_sigmas <- numeric(steps)
dim_norms <- numeric(steps - 1)
step_diff <- function(step) {
idxs <- c(step, step - 1)
diff(avrd_norms[idxs]) / diff(log_sigmas[idxs])
}
a0 <- do_step(0L)
avrd_norms[[1L]] <- a0$avrd_norm
log_sigmas[[1L]] <- a0$log_sigma
a1 <- do_step(1)
dir <- 1L
avrd_norms[[2L]] <- a1$avrd_norm
log_sigmas[[2L]] <- a1$log_sigma
if (step_diff(2L) < 0) {
a1 <- do_step(-1L)
dir <- -1L
avrd_norms[[2L]] <- a1$avrd_norm
log_sigmas[[2L]] <- a1$log_sigma
}
dim_norms[[1L]] <- step_diff(2L)
if (verbose) pb <- txtProgressBar(2L, steps, 1L, style = 3)
for (step in seq(2L, steps)) {
a_i = do_step(dir * (step - 1L))
avrd_norms[[step]] <- a_i$avrd_norm
log_sigmas[[step]] <- a_i$log_sigma
dif_step <- step - 1
dim_norms[[dif_step]] <- step_diff(step)
if (verbose) setTxtProgressBar(pb, step)
if (early_exit && step > 2 && dim_norms[[dif_step]] < dim_norms[[dif_step - 1L]]) {
avrd_norms <- avrd_norms[seq_len(step)]
log_sigmas <- log_sigmas[seq_len(step)]
dim_norms <- dim_norms[seq_len(dif_step)]
break
}
}
if (verbose) {
setTxtProgressBar(pb, steps)
close(pb)
}
if (early_exit && step == steps) warning('All steps were exhausted without finding a maximum. Using last encountered sigma')
optimal_idx <- which.max(dim_norms)
ret <- Sigmas(
log_sigmas = log_sigmas,
dim_norms = dim_norms,
optimal_sigma = 10 ^ mean(log_sigmas[c(optimal_idx, optimal_idx + 1L)]),
optimal_idx = optimal_idx,
avrd_norms = avrd_norms)
if (verbose) plot(ret)
ret
}