/
nn_utils.R
368 lines (329 loc) · 10.5 KB
/
nn_utils.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
#' @name nn
#' @title Nearest neighbors and distances
#' @description Obtain nearest neighbors and distances from a matrix or disto
#' handle. k nearest or fixed radius neighbors are supported
#' @param x Object of class 'disto' or a numeric matrix
#' @param k Number of nearest neighbors
#' @param r Radius for nearest neighbors
#' @param method (string or function) distance metric when x is a matrix. Passed
#' to `proxy::dist`. Ignored when x is not a matrix.
#' @param ... Additional arguments for \code{\link{dapply}} when x is 'disto'
#' object. Else additional arguments are sent to
#' \code{\link[pbmcapply]{pbmclapply}}
#' @details Exactly one among k or r has to be provided
#' @return Object of class nn. A list with these elements: \itemize{
#'
#' \item \emph{triplet}: Matrix with three columns: row, col and distance. For
#' a fixed observation(value in 'row'), all corresponding values in 'col' are
#' the indexes of the nearest neighbors. All corresponding values in
#' 'distance' are the distances to those nearest neighbors
#'
#' \item \emph{size}: Size of the distance matrix or number of rows of the
#' matrix
#'
#' \item \emph{k} or \emph{r} : Depending on the input
#'
#' }
#' @examples
#' \dontrun{
#' # create a matrix
#' set.seed(100)
#' mat <- cbind(rnorm(3e3), rpois(3e3, 1))
#'
#' # compute a distance matrix and get a disto handle
#' do <- stats::dist(mat)
#' dio <- disto(objectname = "do")
#'
#' # nearest neighbors: k nearest and fixed radius
#' nn(dio, k = 1)
#' nn(mat, k = 1) # distance method defaults to 'euclidean'
#' str(nn(mat, k = 1)) # observe the structure of the output
#'
#' nn(dio, r = 0.1)
#' nn(mat, r = 0.1)
#'
#' # nearest neighbors parallelized: k nearest and fixed radius
#' # fast computation, higher memory usage
#' nn(dio, k = 1, nproc = 2)
#' nn(mat, k = 1, mc.cores = 2)
#'
#' nn(dio, r = 0.1, nproc = 2)
#' nn(mat, r = 0.1, mc.cores = 2)
#'
#' # different distance method
#' do <- stats::dist(mat, method = "manhattan")
#'
#' nn(dio, k = 1, nproc = 2)
#' nn(mat, k = 1, method = "manhattan", mc.cores = 2)
#'
#' nn(dio, r = 0.1, nproc = 2)
#' nn(mat, r = 0.1, method = "manhattan", mc.cores = 2)
#' }
#' @export
nn <- function(x, k, r, method = "euclidean", ...){
UseMethod("nn", x)
}
#' @export
nn.disto <- function(x
, k
, r
, method = "euclidean"
, ...
){
# assertions ----
assertthat::assert_that(inherits(x, "disto"))
sz <- size(x)
mk <- missing(k)
mr <- missing(r)
assertthat::assert_that(mk || mr
, msg = "One among k or r should be missing"
)
assertthat::assert_that(!(mk && mr)
, msg = "One among k or r should be provided"
)
if(mk){
assertthat::assert_that(assertthat::is.number(r) && (r > 0))
} else {
assertthat::assert_that(assertthat::is.count(k))
}
# decide the type ----
type <- ifelse(missing(k)
, function(vec, ind) nn_r(vec, ind, r = r)
, function(vec, ind) nn_k(vec, ind, k = k)
)
# dapply over points ----
tSparse <- do.call(rbind
, dapply(x = x
, margin = 1
, fun = type
, ...
)
)
colnames(tSparse) <- c("row", "col", "distance")
# return ----
res <- list(triplet = tSparse, size = sz)
if(mk){
res[["r"]] <- r
} else {
res[["k"]] <- k
}
class(res) <- "nn"
return(res)
}
#' @export
nn.matrix <- function(x
, k
, r
, method = "euclidean"
, ...
){
# assertions ----
assertthat::assert_that(inherits(x, "matrix"))
assertthat::assert_that(typeof(x) %in% c("integer", "double"))
sz <- nrow(x)
mk <- missing(k)
mr <- missing(r)
assertthat::assert_that(mk || mr
, msg = "One among k or r should be missing"
)
assertthat::assert_that(!(mk && mr)
, msg = "One among k or r should be provided"
)
if(mk){
assertthat::assert_that(assertthat::is.number(r) && (r > 0))
} else {
assertthat::assert_that(assertthat::is.count(k))
}
assertthat::assert_that(assertthat::is.string(method) || is.function(method))
# decide the type ----
# type is a function that provides nn's
type <- ifelse(missing(k)
, function(vec, ind) nn_r(vec, ind, r = r)
, function(vec, ind) nn_k(vec, ind, k = k)
)
# function to get nn per obs ----
nnPerObs <- function(i){
proxy::dist(x, matrix(x[i, ], nrow = 1), method = method) %>%
type(i)
}
# loop over obs ----
tSparse <- do.call(rbind, pbmcapply::pbmclapply(1:sz, nnPerObs, ...))
colnames(tSparse) <- c("row", "col", "distance")
#return ----
res <- list(triplet = tSparse, size = sz)
if(mk){
res[["r"]] <- r
} else {
res[["k"]] <- k
}
class(res) <- "nn"
return(res)
}
#' @name nn_k
#' @title k Nearest neighbors
#' @description k Nearest neighbors from a vector of distances
#' @param vec Vector of distances
#' @param index dummy to facilitate dapply
#' @param k Number of nearest neighbors
nn_k <- function(vec, index, k){
notMoreThanK <- (data.table::frankv(vec, ties.method = "dense") <= (k + 1))
notMoreThanK[index] <- FALSE
cbind(index, which(notMoreThanK), vec[notMoreThanK])
}
#' @name nn_r
#' @title Fixed radius Nearest neighbors
#' @description Fixed radius Nearest neighbors from a vector of distances
#' @param vec Vector of distances
#' @param index dummy to facilitate dapply
#' @param r Radius for nearest neighbors
nn_r <- function(vec, index, r){
withinR <- (vec <= r)
withinR[index] <- FALSE
nearestIndexes <- which(withinR)
if(length(nearestIndexes) != 0){
cbind(index, nearestIndexes , vec[withinR])
} else {
matrix(nrow = 0, ncol = 3)
}
}
#' @name print.nn
#' @title Print method for class 'nn'
#' @description Print method for class 'nn'
#' @param x Object of class 'nn'
#' @param ... stub
#' @return Returns the input invisibly besides printing on the screen
#' @export
print.nn <- function(x, ...){
if("k" %in% names(x)){
message("k Nearest neighbors of class 'nn'")
} else {
message("Fixed radius nearest neighbors of class 'nn'")
}
if(is.null(x[["sizeX"]])){
message("Dimension of sparse adjacency matrix: "
, x[["size"]]
, " X "
, x[["size"]]
)
} else {
message("Dimension of sparse adjacency matrix: "
, x[["size"]]
, " X "
, x[["sizeX"]]
)
}
return(invisible(x))
}
#' @name eps_k
#' @title Distance corresponding to kth neighbor
#' @description Distance corresponding to kth neighbor
#' @param x Disto object
#' @param k A positive integer
#' @param ... Arguments to 'dapply'. Should be among: subset, nproc, progress
#' @return A vector of distances
#' @export
#'
eps_k <- function(x, k, ...){
assertthat::assert_that(inherits(x, "disto"))
assertthat::assert_that(assertthat::is.count(k))
eff <- function(vec, index) Rfast::nth(vec, k)
unlist(dapply(x, 1, eff, ...))
}
#' @name nn2
#' @title Extension of nn method for two matrices
#' @description Find k or fixed radius nearest neighbors of each observation(row) matrix y in matrix x
#' @param x Numeric matrix
#' @param y Numneric matrix
#' @param k Number of nearest neighbors
#' @param r Radius for nearest neighbors
#' @param method (string or function) Distance metric passed to `proxy::dist`
#' @param ... Additional arguments are sent to
#' \code{\link[pbmcapply]{pbmclapply}}
#' @details Exactly one among k or r has to be provided
#' @return Object of class 'nn'. A list with these elements: \itemize{
#'
#' \item \emph{triplet}: Matrix with three columns: row, col and distance. For
#' a fixed observation of matrix y (value in 'row'), all corresponding values in 'col' are
#' the indexes of the nearest neighbors in matrix x. All corresponding values in
#' 'distance' are the distances to those nearest neighbors
#'
#' \item \emph{size}: Number of rows of matrix y
#'
#' \item \emph{sizeX}: Number of rows of matrix x
#'
#' \item \emph{k} or \emph{r} : Depending on the input
#'
#' }
#' @examples
#' temp <- nn2(x = matrix(rnorm(1e4), ncol = 10)
#' , y = matrix(runif(1e3), ncol = 10)
#' , r = 2
#' )
#' temp
#' @export
nn2 <- function(x
, y
, k
, r
, method = "euclidean"
, ...
){
# assertions ----
assertthat::assert_that(inherits(x, "matrix"))
assertthat::assert_that(typeof(x) %in% c("integer", "double"))
sz <- nrow(y)
assertthat::assert_that(inherits(y, "matrix"))
assertthat::assert_that(typeof(y) %in% c("integer", "double"))
assertthat::assert_that(ncol(x) == ncol(y))
mk <- missing(k)
mr <- missing(r)
assertthat::assert_that(mk || mr
, msg = "One among k or r should be missing"
)
assertthat::assert_that(!(mk && mr)
, msg = "One among k or r should be provided"
)
if(mk){
assertthat::assert_that(assertthat::is.number(r) && (r > 0))
} else {
assertthat::assert_that(assertthat::is.count(k))
}
assertthat::assert_that(assertthat::is.string(method) || is.function(method))
# decide the type ----
# type is a function that provides nn's
type <- ifelse(
!missing(k)
, function(vec, index){
notMoreThanK <- (data.table::frankv(vec, ties.method = "dense") <= k)
cbind(index, which(notMoreThanK), vec[notMoreThanK])
}
, function(vec, index){
withinR <- (vec <= r)
nearestIndexes <- which(withinR)
if(length(nearestIndexes) != 0){
cbind(index, nearestIndexes , vec[withinR])
} else {
matrix(nrow = 0, ncol = 3)
}
}
)
# nnPerObs with y
nnPerObs <- function(i){
proxy::dist(x, matrix(y[i, ], nrow = 1), method = method) %>%
type(i)
}
# loop over obs ----
tSparse <- do.call(rbind
, pbmcapply::pbmclapply(1:sz, nnPerObs, ...)
)
colnames(tSparse) <- c("row", "col", "distance")
#return ----
res <- list(triplet = tSparse, size = sz, sizeX = nrow(x))
if(mk){
res[["r"]] <- r
} else {
res[["k"]] <- k
}
class(res) <- "nn"
return(res)
}