/
kdtools.R
427 lines (381 loc) · 13.6 KB
/
kdtools.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
#' @importFrom Rcpp evalCpp
#' @useDynLib kdtools, .registration = TRUE
NULL
#' Generate column indices
#'
#' @param x a data frame
#' @param cols a column specification
#'
#' @details \code{cols} can be a logical, numeric, or character vector
#' indicating which columns to extract. Or \code{cols} can be a formula
#' with the right-hand-side variable specifying the columns. This function
#' is used where ever a \code{cols} option is present throughout \code{kdtools}.
#'
#' @return A vector of integers
#'
#' @examples
#' colspec(mtcars, c(1, 3, 6))
#' colspec(mtcars, c("mpg", "disp", "wt"))
#' colspec(mtcars, ~mpg + disp + wt)
#' colspec(mtcars, c(TRUE, FALSE, TRUE, FALSE, FALSE, TRUE,
#' FALSE, FALSE, FALSE, FALSE, FALSE))
#'
#' @export
colspec <- function(x, cols = NULL) {
res <- switch(mode(cols),
"call" = colspec(x, labels(stats::terms(cols, data = x))),
"character" = match(cols, colnames(x)),
"numeric" = cols,
"logical" = (1:ncol(x))[cols],
"NULL" = 1:ncol(x),
stop("Invalid column specificaiton"))
if (length(res) == 0 || !all(res %in% 1:ncol(x)))
stop("Invalid column specificaiton")
return(res)
}
#' Sort multidimensional data
#' @param x a matrix or arrayvec object
#' @param parallel use multiple threads if true
#' @param inplace sort as a side-effect if true
#' @param cols integer or character vector or formula indicating columns
#' @param ... ignored
#' @details The algorithm used is a divide-and-conquer quicksort variant that
#' recursively partitions an range of tuples using the median of each
#' successive dimension. Ties are resolved by cycling over successive
#' dimensions. The result is an ordering of tuples matching their order if
#' they were inserted into a kd-tree.
#'
#' \code{kd_order} returns permutation vector that will order the rows of the
#' original matrix, exactly as \code{\link{order}}. If \code{inplace} is true,
#' then \code{kd_order} will also sort the arrayvec object as a side effect.
#' This can be more efficient when many subsequent queries are required.
#'
#' \code{kd_sort} and \code{kd_order} have been extended to work directly on R
#' native data.frame and matrix types. All vector column types are supported
#' (even lists of objects as long as equality and comparison operators are
#' defined). Additional, the user can specify a sequence of column indices
#' that will be used for sorting. These can be a subset of columns and given
#' in any order.
#'
#' \code{kd_sort} and \code{kd_order} work differently on spatial \code{sf}
#' types. If no sort columns are specified, then the spatial coordinates are sorted.
#' Otherwise, the coordinates are ignored and the specified columns are used.
#'
#' @return \tabular{ll}{\code{kd_sort} \tab the table sorted in kd-tree order
#' \cr \code{kd_order} \tab a permutation vector \cr \code{kd_is_sorted} \tab
#' a boolean \cr}
#' @examples
#' if (has_cxx17()) {
#' z <- data.frame(real = runif(10), lgl = runif(10) > 0.5,
#' int = as.integer(rpois(10, 2)), char = sample(month.name, 10),
#' stringsAsFactors = FALSE)
#' kd_sort(z)
#' x <- matrix(runif(200), 100)
#' y <- kd_sort(x)
#' kd_is_sorted(y)
#' kd_order(x)
#' plot(y, type = "o", pch = 19, col = "steelblue", asp = 1)
#' }
#' @seealso \code{\link{arrayvec}}
#' @rdname kdsort
#' @export
kd_sort <- function(x, ...) UseMethod("kd_sort")
#' @rdname kdsort
#' @export
kd_sort.arrayvec <- function(x, inplace = FALSE, parallel = TRUE, ...) {
return(kd_sort_(x, inplace = inplace, parallel = parallel))
}
#' @rdname kdsort
#' @export
kd_sort.matrix <- function(x, cols = NULL, parallel = TRUE, ...) {
return(x[kd_order(x, cols = cols, parallel = parallel),, drop = FALSE])
}
#' @rdname kdsort
#' @export
kd_sort.data.frame <- function(x, cols = NULL, parallel = TRUE, ...) {
return(x[kd_order(x, cols = cols, parallel = parallel),, drop = FALSE])
}
#' @rdname kdsort
#' @export
kd_order <- function(x, ...) UseMethod("kd_order")
#' @rdname kdsort
#' @export
kd_order.arrayvec <- function(x, inplace = FALSE, parallel = TRUE, ...) {
return(kd_order_(x, inplace = inplace, parallel = parallel))
}
#' @rdname kdsort
#' @export
kd_order.matrix <- function(x, cols = NULL, parallel = TRUE, ...) {
return(kd_order_mat(x, colspec(x, cols), parallel = parallel))
}
#' @rdname kdsort
#' @export
kd_order.data.frame <- function(x, cols = NULL, parallel = TRUE, ...) {
return(kd_order_df(x, colspec(x, cols), parallel = parallel))
}
#' @rdname kdsort
#' @export
kd_order.sf <- function(x, cols = NULL, parallel = TRUE, ...) {
if (is.null(cols)) return(kd_order(sf::st_coordinates(x), parallel = parallel))
return(kd_order(sf::st_drop_geometry(x), colspec(x, cols), parallel = parallel))
}
#' @rdname kdsort
#' @export
kd_is_sorted <- function(x, ...) UseMethod("kd_is_sorted")
#' @export
kd_is_sorted.arrayvec <- function(x, parallel = TRUE, ...) {
return(kd_is_sorted_(x, parallel))
}
#' @export
kd_is_sorted.matrix <- function(x, cols = NULL, parallel = TRUE, ...) {
return(kd_is_sorted_mat(x, colspec(x, cols), parallel))
}
#' @export
kd_is_sorted.data.frame <- function(x, cols = NULL, parallel = TRUE, ...) {
return(kd_is_sorted_df(x, colspec(x, cols), parallel))
}
#' Sort a matrix into lexicographical order
#' @param x a matrix, data frame, or arrayvec object
#' @param ... other parameters
#' @details Sorts a range of tuples into lexicographical order
#' @return the input type sorted
#' @examples
#' if (has_cxx17()) {
#' x = lex_sort(matrix(runif(200), 100))
#' plot(x, type = "o", pch = 19, col = "steelblue", asp = 1)
#' }
#' @rdname lexsort
#' @export
lex_sort <- function(x, ...) UseMethod("lex_sort")
#' @export
lex_sort.arrayvec <- function(x, inplace = FALSE, ...) {
return(lex_sort_(x, inplace = inplace))
}
#' @export
lex_sort.matrix <- function(x, cols = NULL, ...) {
i <- kd_lex_order_mat(x, colspec(x, cols))
return(x[i, ])
}
#' @export
lex_sort.data.frame <- function(x, cols = NULL, ...) {
i <- kd_lex_order_df(x, colspec(x, cols))
return(x[i, ])
}
#' Search sorted data
#' @param x an object sorted by \code{\link{kd_sort}}
#' @param v a vector specifying where to look
#' @param l lower left corner of search region
#' @param u upper right corner of search region
#' @param cols integer or character vector or formula indicating columns
#' @param ... ignored
#' @return \tabular{ll}{
#' \code{kd_lower_bound} \tab a row of values (vector) \cr
#' \code{kd_upper_bound} \tab a row of values (vector) \cr
#' \code{kd_range_query} \tab a set of rows in the same format as the sorted input \cr
#' \code{kd_rq_indices} \tab a vector of integer indices specifying rows in the input \cr
#' \code{kd_binary_search} \tab a boolean \cr}
#' @examples
#' if (has_cxx17()) {
#' x = matrix(runif(200), 100)
#' y = matrix_to_tuples(x)
#' kd_sort(y, inplace = TRUE)
#' y[kd_lower_bound(y, c(1/2, 1/2)),]
#' y[kd_upper_bound(y, c(1/2, 1/2)),]
#' kd_binary_search(y, c(1/2, 1/2))
#' kd_range_query(y, c(1/3, 1/3), c(2/3, 2/3))
#' kd_rq_indices(y, c(1/3, 1/3), c(2/3, 2/3))
#' }
#' @aliases kd_lower_bound
#' @rdname search
#' @export
kd_lower_bound <- function(x, v) UseMethod("kd_lower_bound")
#' @export
kd_lower_bound.arrayvec <- function(x, v) {
return(kd_lower_bound_(x, v))
}
#' @export
kd_lower_bound.matrix <- function(x, v) {
y <- matrix_to_tuples(x)
return(kd_lower_bound_(y, v))
}
#' @rdname search
#' @export
kd_upper_bound <- function(x, v) UseMethod("kd_upper_bound")
#' @export
kd_upper_bound.arrayvec <- function(x, v) {
return(kd_upper_bound_(x, v))
}
#' @export
kd_upper_bound.matrix <- function(x, v) {
y <- matrix_to_tuples(x)
return(kd_upper_bound_(y, v))
}
#' @rdname search
#' @export
kd_range_query <- function(x, l, u, ...) UseMethod("kd_range_query")
#' @rdname search
#' @export
kd_range_query.arrayvec <- function(x, l, u, ...) {
return(kd_range_query_(x, l, u))
}
#' @rdname search
#' @export
kd_range_query.matrix <- function(x, l, u, cols = NULL, ...) {
return(x[kd_rq_indices(x, l, u, colspec(x, cols)),, drop = FALSE])
}
#' @rdname search
#' @export
kd_range_query.data.frame <- function(x, l, u, cols = NULL, ...) {
return(x[kd_rq_indices(x, l, u, colspec(x, cols)),, drop = FALSE])
}
#' @rdname search
#' @export
kd_rq_indices <- function(x, l, u, ...) UseMethod("kd_rq_indices")
#' @rdname search
#' @export
kd_rq_indices.arrayvec <- function(x, l, u, ...) {
return(kd_rq_indices_(x, l, u))
}
#' @rdname search
#' @export
kd_rq_indices.matrix <- function(x, l, u, cols = NULL, ...) {
return(kd_rq_mat(x, colspec(x, cols), l, u))
}
#' @rdname search
#' @export
kd_rq_indices.data.frame <- function(x, l, u, cols = NULL, ...) {
return(kd_rq_df(x, colspec(x, cols), l, u))
}
#' @rdname search
#' @export
kd_binary_search <- function(x, v) UseMethod("kd_binary_search")
#' @rdname search
#' @export
kd_binary_search.arrayvec <- function(x, v) {
return(kd_binary_search_(x, v))
}
#' @rdname search
#' @export
kd_binary_search.matrix <- function(x, v) {
y <- matrix_to_tuples(x)
return(kd_binary_search_(y, v))
}
#' Find nearest neighbors
#' @param x an object sorted by \code{\link{kd_sort}}
#' @param v a vector specifying where to look
#' @param n the number of neighbors to return
#' @param cols integer or character vector or formula indicating columns
#' @param w distance weights
#' @param p exponent of p-norm (Minkowski) distance
#' @param a approximate neighbors within (1 + a)
#' @param validate if FALSE, no input validation is performed
#' @param ... ignored
#' @return \tabular{ll}{
#' \code{kd_nearest_neighbors} \tab one or more rows from the sorted input \cr
#' \code{kd_nn_indices} \tab a vector of row indices indicating the result \cr
#' \code{kd_nearest_neighbor} \tab the row index of the neighbor \cr
#' }
#'
#' @details Distance is calculated as
#' \deqn{D_{ij} = [\sum_k w_k G(x_{ik}, x_{jk}) ^ p] ^ {1 / p}}
#' where \eqn{i} and \eqn{j} are records, \eqn{k} is the \eqn{k}th field or
#' tuple element, and \eqn{w_k} is the weight in the \eqn{k}th dimension. Here,
#' \eqn{G} depends on the type. For reals, \eqn{G(a, b) = |a - b|}.
#' For logicals and integers, \eqn{G(a, b)} is one if \eqn{a = b}
#' and zero otherwise. For strings, \eqn{G(a, b)} is Levenshtein or edit distance.
#' Convert strings to factors unless edit distance makes sense for your application.
#'
#' When using the \code{cols} argument, the search key \code{v} is handled specially. If
#' the length of \code{v} is equal to the number of columns of \code{x}, then it is
#' assumed that the key is given in the same order as the columns of \code{x}. In that case,
#' the key \code{v} is mapped through the \code{cols} argument. This will possibly
#' change the order of the elements and length of the search key. The reason for this
#' is that it allows one to use a row of \code{x} as the key and it will respect the
#' \code{cols} argument. Otherwise, or if \code{validate} is \code{FALSE}, the search
#' key \code{v} is passed unchanged and must be given with the correct length and order
#' to match the \code{cols} argument. The same is true of the \code{w} parameter.
#'
#' @examples
#' if (has_cxx17()) {
#' x = matrix(runif(200), 100)
#' y = matrix_to_tuples(x)
#' kd_sort(y, inplace = TRUE)
#' y[kd_nearest_neighbor(y, c(1/2, 1/2)),]
#' kd_nearest_neighbors(y, c(1/2, 1/2), 3)
#' y[kd_nn_indices(y, c(1/2, 1/2), 5),]
#' }
#' @rdname nneighb
#' @export
kd_nearest_neighbors <- function(x, v, n, ...) UseMethod("kd_nearest_neighbors")
#' @rdname nneighb
#' @export
kd_nearest_neighbors.arrayvec <- function(x, v, n, p = 2, a = 0, ...) {
return(kd_nearest_neighbors_(x, v, n, p = p, a = a))
}
#' @rdname nneighb
#' @export
kd_nearest_neighbors.matrix <- function(x, v, n, cols = NULL, p = 2, a = 0, ...) {
return(x[kd_nn_indices(x, v, n, colspec(x, cols), p = p, a = a),, drop = FALSE])
}
#' @rdname nneighb
#' @export
kd_nearest_neighbors.data.frame <- function(x, v, n, cols = NULL, w = NULL, p = 2, a = 0, ...) {
return(x[kd_nn_indices(x, v, n, colspec(x, cols), w, p = p, a = a),, drop = FALSE])
}
#' @rdname nneighb
#' @export
kd_nn_indices <- function(x, v, n, ...) UseMethod("kd_nn_indices")
#' @param distances return distances as attribute if true
#' @rdname nneighb
#' @export
kd_nn_indices.arrayvec <- function(x, v, n, distances = FALSE, p = 2, a = 0, ...) {
if (distances)
return(as.data.frame(kd_nn_dist_(x, v, n, p, a)))
return(kd_nn_indices_(x, v, n, p, a))
}
#' @rdname nneighb
#' @export
kd_nn_indices.matrix <- function(x, v, n, cols = NULL,
distances = FALSE, p = 2, a = 0,
validate = TRUE, ...) {
cols <- colspec(x, cols)
res <- if (validate) {
if (length(v) == ncol(x)) v <- v[cols]
kd_nn_mat(x, cols, v, a, p, n)
} else {
kd_nn_mat_no_validation(x, cols, v, a, p, n)
}
if (distances) return(as.data.frame(res))
return(res[[1]])
}
#' @rdname nneighb
#' @export
kd_nn_indices.data.frame <- function(x, v, n, cols = NULL, w = NULL,
distances = FALSE, p = 2, a = 0,
validate = TRUE, ...) {
cols <- colspec(x, cols)
if (is.null(w)) w <- rep_len(1, length(cols))
res <- if (validate) {
if (length(v) == ncol(x)) v <- v[cols]
if (length(w) == ncol(x)) w <- w[cols]
kd_nn_df(x, cols, w, v, a, p, n)
} else {
kd_nn_df_no_validation(x, cols, w, v, a, p, n)
}
if (distances) return(as.data.frame(res))
return(res[[1]])
}
#' @rdname nneighb
#' @export
kd_nearest_neighbor <- function(x, v) UseMethod("kd_nearest_neighbor")
#' @rdname nneighb
#' @export
kd_nearest_neighbor.arrayvec <- function(x, v) {
return(kd_nearest_neighbor_(x, v))
}
#' @rdname nneighb
#' @export
kd_nearest_neighbor.matrix <- function(x, v) {
y <- matrix_to_tuples(x)
return(kd_nearest_neighbor_(y, v))
}