-
Notifications
You must be signed in to change notification settings - Fork 2
/
knn.R
293 lines (279 loc) · 10.8 KB
/
knn.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
#' Builds directed graph of correlations
#'
#' Builds directed graph of correlations where the nodes are the matrix
#' columns.
#'
#' This function builds a directed graph based on the edges in 'x' and their
#' ranks.
#'
#' 'x' is a data frame containing 4 columns named: 'col1', 'col2', 'val',
#' 'rank'. The third column ('val' can have a different name). The result in
#' the compatible format is returned, for example, by 'tgs_knn' function.
#'
#' 'tgs_graph' prunes some of the edges in 'x' based on the following steps:
#'
#' 1. A pair of columns i, j that appears in 'x' in 'col1', 'col2' implies the
#' edge in the graph from i to j: edge(i,j). Let the rank of i and j be
#' rank(i,j).
#'
#' 2. Calculate symmetrised rank of i and j: sym_rank(i,j) = rank(i,j) *
#' rank(j,i). If one of the ranks is missing from the previous result sym_rank
#' is set to NA.
#'
#' 3. Prune the edges: remove edge(i,j) if sym_rank(i,j) == NA OR sym_rank(i,j)
#' < knn * knn * k_expand
#'
#' 4. Prune excessive incoming edges: remove edge(i,j) if more than knn *
#' k_beta edges of type edge(node,j) exist and sym_rank(i,j) is higher than
#' sym_rank(node,j) for node != j.
#'
#' 5. Prune excessive outgoing edges: remove edge(i,j) if more than knn edges
#' of type edge(i,node) exist and sym_rank(i,j) is higher than sym_rank(i,node)
#' for node != i.
#'
#' @param x see below
#' @param knn maximal node degree
#' @param k_expand see below
#' @param k_beta see below
#' @return The graph edges are returned in a data frame, with the weight of
#' each edge. edge(i,j) receives weight 1 if its sym_rank is the lowest among
#' all edges of type edge(i,node). Formally defined: weight(i,j) = 1 -
#' (place(i,j) - 1) / knn, where place(i,j) is the location of edge(i,j) within
#' the sorted set of edges outgoing from i, i.e. edge(i,node). The sort is done
#' by sym_rank of the edges.
#' @keywords ~graph
#' @examples
#' \donttest{
#' # Note: all the available CPU cores might be used
#'
#' set.seed(seed = 1)
#' rows <- 100
#' cols <- 1000
#' vals <- sample(1:(rows * cols / 2), rows * cols, replace = TRUE)
#' m <- matrix(vals, nrow = rows, ncol = cols)
#' m[sample(1:(rows * cols), rows * cols / 1000)] <- NA
#'
#' r1 <- tgs_cor(m, pairwise.complete.obs = FALSE, spearman = TRUE)
#' r2 <- tgs_knn(r1, knn = 30, diag = FALSE)
#' r3 <- tgs_graph(r2, knn = 3, k_expand = 10)
#' }
#'
#' \dontshow{
#' options(tgs_use.blas = FALSE)
#' options(tgs_max.processes = 1)
#'
#' set.seed(seed = 1)
#' rows <- 100
#' cols <- 100
#' vals <- sample(1:(rows * cols / 2), rows * cols, replace = TRUE)
#' m <- matrix(vals, nrow = rows, ncol = cols)
#' m[sample(1:(rows * cols), rows * cols / 1000)] <- NA
#'
#' r1 <- tgs_cor(m, pairwise.complete.obs = FALSE, spearman = TRUE)
#' r2 <- tgs_knn(r1, knn = 30, diag = FALSE)
#' r3 <- tgs_graph(r2, knn = 3, k_expand = 10)
#' }
#'
#' @export tgs_graph
tgs_graph <- function(x, knn, k_expand, k_beta = 3) {
if (missing(x) || missing(knn) || missing(k_expand)) {
stop("Usage: tgs_graph(x, knn, k_expand, k_beta = 3)", call. = FALSE)
}
.Call("tgs_cor_graph", x, knn, k_expand, k_beta, new.env(parent = parent.frame()))
}
#' Clusters directed graph
#'
#' Clusters directed graph.
#'
#' The algorithm is explained in a "MetaCell: analysis of single-cell RNA-seq
#' data using K-nn graph partitions" paper, published in "Genome Biology" #20:
#' https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1812-2
#'
#' @param graph directed graph in the format returned by tgs_graph
#' @param min_cluster_size used to determine the candidates for seeding (= min
#' weight)
#' @param cooling factor that is used to gradually increase the chance of a
#' node to stay in the cluster
#' @param burn_in number of node reassignments after which cooling is applied
#' @return Data frame that maps each node to its cluster.
#' @seealso [tgs_graph()]
#' @keywords ~cluster
#' @examples
#' \donttest{
#' # Note: all the available CPU cores might be used
#'
#' set.seed(seed = 0)
#' rows <- 100
#' cols <- 1000
#' vals <- sample(1:(rows * cols / 2), rows * cols, replace = TRUE)
#' m <- matrix(vals, nrow = rows, ncol = cols)
#' m[sample(1:(rows * cols), rows * cols / 1000)] <- NA
#'
#' r1 <- tgs_cor(m, pairwise.complete.obs = FALSE, spearman = TRUE)
#' r2 <- tgs_knn(r1, knn = 30, diag = FALSE)
#' r3 <- tgs_graph(r2, knn = 3, k_expand = 10)
#' r4 <- tgs_graph_cover(r3, 5)
#' }
#'
#' \dontshow{
#' options(tgs_use.blas = FALSE)
#' options(tgs_max.processes = 1)
#'
#' set.seed(seed = 0)
#' rows <- 100
#' cols <- 100
#' vals <- sample(1:(rows * cols / 2), rows * cols, replace = TRUE)
#' m <- matrix(vals, nrow = rows, ncol = cols)
#' m[sample(1:(rows * cols), rows * cols / 1000)] <- NA
#'
#' r1 <- tgs_cor(m, pairwise.complete.obs = FALSE, spearman = TRUE)
#' r2 <- tgs_knn(r1, knn = 30, diag = FALSE)
#' r3 <- tgs_graph(r2, knn = 3, k_expand = 10)
#' r4 <- tgs_graph_cover(r3, 5)
#' }
#'
#' @export tgs_graph_cover
tgs_graph_cover <- function(graph, min_cluster_size, cooling = 1.05, burn_in = 10) {
if (missing(graph) || missing(min_cluster_size)) {
stop("Usage: tgs_graph_cover(graph, min_cluster_size, cooling = 1.05, burn_in = 10)", call. = FALSE)
}
.Call("tgs_graph2cluster", graph, min_cluster_size, cooling, burn_in, new.env(parent = parent.frame()))
}
#' Clusters directed graph multiple times with randomized sample subset
#'
#' Clusters directed graph multiple times with randomized sample subset.
#'
#' The algorithm is explained in a "MetaCell: analysis of single-cell RNA-seq
#' data using K-nn graph partitions" paper, published in "Genome Biology" #20:
#' https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1812-2
#'
#' @param graph directed graph in the format returned by tgs_graph
#' @param knn maximal number of edges used per node for each sample subset
#' @param min_cluster_size used to determine the candidates for seeding (= min
#' weight)
#' @param cooling factor that is used to gradually increase the chance of a
#' node to stay in the cluster
#' @param burn_in number of node reassignments after which cooling is applied
#' @param p_resamp fraction of total number of nodes used in each sample subset
#' @param n_resamp number iterations the clustering is run on different sample
#' subsets
#' @param method method for calculating co_cluster and co_sample; valid values:
#' "hash", "full", "edges"
#' @return If method == "hash", a list with two members. The first member is a
#' data frame with 3 columns: "node1", "node2" and "cnt". "cnt" indicates the
#' number of times "node1" and "node2" appeared in the same cluster. The second
#' member of the list is a vector of __number of nodes__ length reflecting how
#' many times each node was used in the subset.
#'
#' If method == "full", a list containing two matrices: co_cluster and
#' co_sample.
#'
#' If method == "edges", a list containing two data frames: co_cluster and
#' co_sample.
#' @seealso [tgs_graph()]
#' @keywords ~cluster
#' @examples
#' \donttest{
#' # Note: all the available CPU cores might be used
#'
#' set.seed(seed = 0)
#' rows <- 100
#' cols <- 200
#' vals <- sample(1:(rows * cols / 2), rows * cols, replace = TRUE)
#' m <- matrix(vals, nrow = rows, ncol = cols)
#'
#' r1 <- tgs_cor(m, pairwise.complete.obs = FALSE, spearman = TRUE)
#' r2 <- tgs_knn(r1, knn = 20, diag = FALSE)
#' r3 <- tgs_graph(r2, knn = 3, k_expand = 10)
#' r4 <- tgs_graph_cover_resample(r3, 10, 1)
#' }
#'
#' \dontshow{
#' set.seed(seed = 0)
#' rows <- 100
#' cols <- 200
#' vals <- sample(1:(rows * cols / 2), rows * cols, replace = TRUE)
#' m <- matrix(vals, nrow = rows, ncol = cols)
#'
#' r1 <- tgs_cor(m, pairwise.complete.obs = FALSE, spearman = TRUE)
#' r2 <- tgs_knn(r1, knn = 20, diag = FALSE)
#' r3 <- tgs_graph(r2, knn = 3, k_expand = 10)
#' r4 <- tgs_graph_cover_resample(r3, 10, 1, n_resamp = 5)
#' }
#'
#' @export tgs_graph_cover_resample
tgs_graph_cover_resample <- function(graph, knn, min_cluster_size, cooling = 1.05, burn_in = 10, p_resamp = 0.75, n_resamp = 500, method = "hash") {
if (missing(graph) || missing(knn) || missing(min_cluster_size)) {
stop("Usage: tgs_graph_cover_resample(graph, knn, min_cluster_size, cooling = 1.05, burn_in = 10, p_resamp = 0.75, n_resamp = 500)", call. = FALSE)
}
if (method == "hash") {
.Call("tgs_graph2cluster_multi_hash", graph, knn, min_cluster_size, cooling, burn_in, p_resamp, n_resamp, new.env(parent = parent.frame()))
} else if (method == "full") {
.Call("tgs_graph2cluster_multi_full", graph, knn, min_cluster_size, cooling, burn_in, p_resamp, n_resamp, new.env(parent = parent.frame()))
} else if (method == "edges") {
.Call("tgs_graph2cluster_multi_edges", graph, knn, min_cluster_size, cooling, burn_in, p_resamp, n_resamp, new.env(parent = parent.frame()))
} else {
stop("\"method\" argument must be equal to \"hash\", \"full\" or \"edges\"", call. = FALSE)
}
}
#' Returns k highest values of each column
#'
#' Returns k highest values of each column.
#'
#' 'tgs_knn' returns the highest 'knn' values of each column of 'x' (if 'x' is
#' a matrix). 'x' can be also a sparse matrix given in a data frame of 'col',
#' 'row', 'value' format.
#'
#' 'NA' and 'Inf' values are skipped as well as the values below 'threshold'.
#' If 'diag' is 'F' values of the diagonal (row == col) are skipped too.
#'
#' @param x numeric matrix or data frame (see below)
#' @param knn the number of highest values returned per column
#' @param diag if 'F' values of row 'i' and col 'j' are skipped for each i == j
#' @param threshold filter out values lower than threshold
#' @return A sparse matrix in a data frame format with 'col1', 'col2', 'val'
#' and 'rank' columns. 'col1' and 'col2' represent the column and the row
#' number of 'x'.
#' @keywords ~knn
#' @examples
#' \donttest{
#' # Note: all the available CPU cores might be used
#'
#' set.seed(seed = 1)
#' rows <- 100
#' cols <- 1000
#' vals <- sample(1:(rows * cols / 2), rows * cols, replace = TRUE)
#' m <- matrix(vals, nrow = rows, ncol = cols)
#' m[sample(1:(rows * cols), rows * cols / 1000)] <- NA
#' r <- tgs_knn(m, 3)
#' }
#'
#' \dontshow{
#' options(tgs_use.blas = FALSE)
#' options(tgs_max.processes = 1)
#'
#' set.seed(seed = 1)
#' rows <- 100
#' cols <- 100
#' vals <- sample(1:(rows * cols / 2), rows * cols, replace = TRUE)
#' m <- matrix(vals, nrow = rows, ncol = cols)
#' m[sample(1:(rows * cols), rows * cols / 1000)] <- NA
#' r <- tgs_knn(m, 3)
#' }
#'
#' @export tgs_knn
tgs_knn <- function(x, knn, diag = FALSE, threshold = 0) {
if (missing(x) || missing(knn)) {
stop("Usage: tgs_knn(x, knn, diag = FALSE, threshold = 0)", call. = FALSE)
}
if (is.integer(x)) {
tmp <- as.double(x)
attributes(tmp) <- attributes(x)
x <- tmp
} else if (is.data.frame(x) && ncol(x) >= 3 && is.integer(x[, 3])) {
tmp <- as.double(x[, 3])
attributes(tmp) <- attributes(x[, 3])
x[, 3] <- tmp
}
.Call("tgs_knn", x, knn, diag, threshold, new.env(parent = parent.frame()))
}