/
getIDER.R
executable file
·235 lines (202 loc) · 8.06 KB
/
getIDER.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
#' @title Compute IDER-based similarity
#' @description Calculate the similarity matrix based on the metrics of
#' Inter-group Differential ExpRession (IDER) with the selected batch effects
#' regressed out.
#'
#' @param seu Seurat S4 object with the column of `initial_cluster` in
#' its meta.data. Required.
#' @param group.by.var initial clusters (batch-specific groups) variable.
#' Needs to be one of the `colnames(seu@meta.data)`. Default: "initial_cluster".
#' @param batch.by.var Batch variable. Needs to be one of the
#' `colnames(seu@meta.data)`. Default: "Batch".
#' @param verbose Boolean. Print the message and progress bar. (Default: TRUE)
#' @param use.parallel Boolean. Use parallel computation, which requires
#' doParallel; no progress bar will be printed out. Run time will be 1/n.cores
#' compared to the situation when no parallelisation is used. (Default: FALSE)
#' @param n.cores Numeric. Number of cores used for parallel computing
#' (default: 1).
#' @param downsampling.size Numeric. The number of cells representing each
#' group. (Default: 40)
#' @param downsampling.include Boolean. Using `include = TRUE` to include
#' the group smaller than required size. (Default: FALSE)
#' @param downsampling.replace Boolean. Using `replace = TRUE` if the group
#' is smaller than required size and some cells will be repeatedly used.
#' (Default: FALSE)
#'
#' @return A list of four objects: a similarity matrix, a numeric vector
#' recording cells used and the data frame of combinations included.
#'
#' @seealso \code{\link{plotNetwork}} \code{\link{finalClustering}}
#'
#' @export
#' @import limma edgeR foreach utils doParallel
#' @importFrom parallel detectCores
#' @importFrom parallel stopCluster
#' @importFrom stats model.matrix cor
#' @importFrom parallel makeCluster
#' @importFrom edgeR cpm
#'
getIDEr <- function(seu,
group.by.var = "initial_cluster",
batch.by.var = "Batch",
verbose = TRUE,
use.parallel = FALSE,
n.cores = 1,
downsampling.size = 40,
downsampling.include = TRUE,
downsampling.replace = TRUE) {
if(!group.by.var %in% colnames(seu@meta.data)) {
warning("group.by.var is not in the colnames of seu@meta.data.")
return(NULL)
}
if(!batch.by.var %in% colnames(seu@meta.data)) {
warning("batch.by.var is not in the colnames of seu@meta.data.")
return(NULL)
}
tmp <- seu@meta.data[,colnames(seu@meta.data) == group.by.var]
if(batch.by.var != "Batch") { # which is the batch var
seu$Batch <- seu@meta.data[,colnames(seu@meta.data) == batch.by.var]
}
## merge seurat list
metadata <- data.frame(
label = tmp,
batch = seu$Batch, # batch
stringsAsFactors = FALSE
)
select <- downsampling( # sampling
metadata = metadata,
n.size = downsampling.size,
include = downsampling.include,
replace = downsampling.replace
)
matrix <- as.matrix(seu@assays$RNA@counts[, select])
# matrix for dist calculation
colnames(matrix) <- paste0(colnames(matrix), seq_len(ncol(matrix)))
# avoid duplication
keep <- rowSums(matrix > 0.5) > 5
dge <- edgeR::DGEList(counts = matrix[keep, , drop = FALSE])
# make a edgeR object
dge <- dge[!grepl("ERCC-", rownames(dge)), ] # remove ERCC
dge <- dge[!grepl("MT-", rownames(dge)), ] # remove mitochondria genes
dge <- dge[!grepl("mt-", rownames(dge)), ]
# remove mitochondria genes for other species
logCPM <- cpm(dge, log = TRUE, prior.count = 3) # calculate cpm
df <- data.frame(
g = metadata$label[select], ## label
b = metadata$batch[select], ## batch
stringsAsFactors = FALSE
)
df$detrate <- scale(colMeans(matrix > 0))[, 1] # gene detection rate
rownames(df) <- colnames(matrix)
rm(matrix)
gc()
N <- length(unique(df$g)) # number of groups
# get the dataframe of combinations/pairs for comparison
combinations <- data.frame(g1 = rep(unique(df$g), each = N),
g2 = rep(unique(df$g), N),
stringsAsFactors = FALSE)
combinations <- combinations[combinations$g1 != combinations$g2, ]
combinations$b1 <- df$b[match(combinations$g1, df$g)]
combinations$b2 <- df$b[match(combinations$g2, df$g)]
combinations <- combinations[combinations$b1 != combinations$b2, ]
idx <- c()
for (i in 2:nrow(combinations)) {
if (!combinations$g2[i] %in% combinations$g1[seq_len(i - 1)]) {
idx <- c(idx, i)
}
}
combinations <- combinations[c(1, idx), ]
rownames(combinations) <- seq_len(nrow(combinations))
dist_coef <- matrix(0, nrow = N, ncol = N) # distance matrix
colnames(dist_coef) <- rownames(dist_coef) <- unique(df$g)
if(use.parallel & n.cores == 1){
n.cores <- detectCores(logical = FALSE) - 1
if(verbose){
message(sprintf("Use %d cores for calculation.", n.cores))
}
}
if (use.parallel == FALSE | n.cores == 1) { # not using parallel -----
# create progress bar
if (verbose) {
message("Generating distance matrix...")
pb <- txtProgressBar(min = 0, max = nrow(combinations), style = 3)
k <- 1
}
for (i in seq_len(nrow(combinations))) {
if (verbose) {
setTxtProgressBar(pb, k) # progress bar
k <- k + 1
}
df$tmp <- "bg"
df$tmp[which(df$g == combinations$g1[i])] <- "g1"
df$tmp[which(df$g == combinations$g2[i])] <- "g2"
design <- model.matrix(~ 0 + tmp + b + detrate, data = df)
contrast_m <- limma::makeContrasts(
contrasts = c("tmpg1-tmpbg", "tmpg2-tmpbg"),
levels = design
)
idx1 <- rownames(dist_coef) == combinations$g1[i]
idx2 <- colnames(dist_coef) == combinations$g2[i]
dist_coef[idx1, idx2] <- getGroupFit(logCPM, design, contrast_m)
}
if (verbose == TRUE) {
close(pb) # close progress bar
}
} else if (use.parallel == TRUE) { # using parallel -----
# decide OS and register parallel the parallel backend
if(Sys.info()["sysname"] == "Linux") {
registerDoParallel(cores = n.cores)
} else{
if(n.cores > detectCores(logical = FALSE) - 1){
warning("too many cores assign. Setting n.cores =
detectCores(logical = FALSE) - 1")
n.cores <- detectCores(logical = FALSE) - 1
}
cl <- makeCluster(n.cores)
registerDoParallel(cl = cl)
}
n.iter <- nrow(combinations) # number of iterations
i <- NULL
j <- NULL
df_dist <- foreach(i = combinations$g1, j = combinations$g2,
df = rep(list(df), n.iter),
logCPM = rep(list(logCPM), n.iter),
.combine = "rbind", .verbose = verbose) %dopar%
{
df$tmp <- "bg"
df$tmp[df$g == i] <- "g1"
df$tmp[df$g == j] <- "g2"
design <- model.matrix(~ 0 + tmp + b + detrate,
data = df)
contrast_m <- limma::makeContrasts(
contrasts = c("tmpg1-tmpbg", "tmpg2-tmpbg"),
levels = design
)
getGroupFit(logCPM, design, contrast_m)
}
if(Sys.info()["sysname"] == "Linux") {
stopImplicitCluster()
} else {
stopCluster(cl)
}
for (i in seq_len(nrow(combinations))) {
# assign the distance values into the matrix
idx1 <- rownames(dist_coef) == combinations$g1[i]
idx2 <- colnames(dist_coef) == combinations$g2[i]
dist_coef[idx1, idx2] <- df_dist[i, 1]
}
}
return(list(dist_coef + t(dist_coef), select, combinations))
}
#' @title Calculate IDER-based similarity between two groups
#'
#' @param logCPM logCPM
#' @param design design
#' @param contrast_m contrast matrix
#' @return Numeric. The IDER-based similarity between two groups.
#' @importFrom stats cor .lm.fit
getGroupFit <- function(logCPM, design, contrast_m){
fit <- .lm.fit(design, t(logCPM))
coef <- .zeroDominantMatrixMult(t(fit$coefficients), contrast_m)
return(cor(coef[, 1], coef[, 2]))
}