/
construct.R
475 lines (438 loc) · 16.8 KB
/
construct.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
# Functions for constructions
#
# Author: Xuran Wang
#' Construct artificial bulk tissue expression from single cell data
#'
#' Artificial bulk tissue expression is generated by adding gene expressions of all cells from
#' same subjects.
#'
#' @param sce SingleCellExperiment, single cell dataset
#' @param clusters character, the colData used as clusters
#' @param samples character, the colData used as samples
#' @param select.ct vector of cell types included, default as \code{NULL}. If \code{NULL}, include all cell types in \code{x}.
#' Otherwise, only cells of selected cell types will be used for construction.
#'
#' @return Matrix of expression of constructed bulk tissue, matrix of cell number in each cluster by sample
#' @importFrom Matrix rowSums
#'
#' @export
bulk_construct = function(sce, clusters, samples, select.ct = NULL){
if(!is.null(select.ct)){
sce = sce[, sce@colData[, clusters] %in% select.ct]
}else{
select.ct = unique(sce@colData[, clusters])
}
mdf = sce@colData
mdf$index = 1:ncol(sce);
u.sample = as.character(unique(mdf[, samples]))
bulk.counts = NULL
bulk.counts = sapply(u.sample, function(x){
index = mdf$index[mdf[, samples] == x]
if(length(index) == 1){
return(counts(sce[, index]))
}else{
return(Matrix::rowSums(counts(sce[, index])))
}
})
num.real = t(sapply(u.sample, function(x){
index = mdf$index[mdf[, samples] == x]
return(summary(mdf[index, clusters]))
}))
num.real = num.real[, select.ct]
return(list(bulk.counts = bulk.counts, num.real = num.real))
}
############ Construct Design Matrix, Library Size, and Subject-level Variation of Relative abundance for MuSiC ############
## These functions are for cell type specific mean expression, cross-subject variance and mean library size for MuSiC deconvolution
#' Cross-subject Mean of Relative Abudance
#'
#' This function is for calculating the cross-subject mean of relative abundance for selected cell types.
#'
#' @param x SingleCellExperiment, single cell dataset
#' @param non.zero logical, if true, remove all gene with zero expression
#' @param markers vector or list of gene names
#' @param clusters character, the phenoData used as clusters
#' @param samples character,the phenoData used as samples
#' @param select.ct vector of cell types included, default as \code{NULL}. If \code{NULL}, include all cell types in \code{x}
#' @return gene by cell type matrix of average relative abundance
#' @importFrom Matrix rowSums
#'
#' @export
music_M.theta = function(x, non.zero, markers, clusters, samples, select.ct){
if(!is.null(select.ct)){
x = x[, x@colData[, clusters] %in% select.ct]
}
if(non.zero){ ## eliminate non expressed genes
x <- x[Matrix::rowSums(counts(x))>0, ]
}
clusters <- as.character(colData(x)[, clusters])
samples <- as.character(colData(x)[, samples])
M.theta <- sapply(unique(clusters), function(ct){
my.rowMeans(sapply(unique(samples), function(sid){
y = counts(x)[,clusters %in% ct & samples %in% sid]
if(is.null(dim(y))){
return(y/sum(y))
}else{
return(rowSums(y)/sum(y))
}
}), na.rm = TRUE)
})
if(!is.null(select.ct)){
m.ct = match(select.ct, colnames(M.theta))
M.theta = M.theta[, m.ct]
}
if (!is.null(markers)){
ids <- intersect(unlist(markers), rownames(x))
m.ids = match(ids, rownames(x))
M.theta <- M.theta[m.ids, ]
}
return(M.theta)
}
#' Subject and cell type specific relative abundance
#'
#' This function is for calculating the subject and cell type specific relative abundance for selected cell types.
#'
#' @param x SingleCellExperiment, single cell dataset
#' @param non.zero logical, default as F. If true, remove all gene with zero expression
#' @param markers vector or list of gene names
#' @param clusters character, the colData used as clusters
#' @param samples character,the colData used as samples
#' @param select.ct vector of cell types included, default as \code{NULL}. If \code{NULL}, include all cell types in \code{x}
#' @return gene*subject by cell type matrix of relative abundance
#' @importFrom Matrix rowSums
#'
#' @export
music_Theta <- function(x, non.zero = FALSE, clusters, samples, select.ct = NULL){
if(!is.null(select.ct)){
x = x[, x@colData[, clusters] %in% select.ct]
}
if(non.zero){
x <- x[rowSums(counts(x))>0, ]
}
nGenes = nrow(x);
clusters <- as.character(pData(x)[, clusters])
samples <- as.character(pData(x)[, samples])
Theta <- sapply(unique(clusters), function(ct){
sapply(unique(samples), function(sid){
y = counts(x)[,clusters %in% ct & samples %in% sid]
if(is.null(dim(y))){
return(y/sum(y))
}else{
return(rowSums(y)/sum(y))
}
})
})
n.ct = length(unique(clusters));
if(!is.null(select.ct)){
m.ct = match(select.ct, colnames(Theta))
Theta = Theta[, m.ct]
n.ct = length(select.ct)
}
return(Theta = Theta)
}
#' Cross-subject Covariance of Relative Abundance
#'
#' This function is for calculating the cross-subject covariance of relative abundance for selected cell types.
#'
#' @param x SingleCellExperiment, single cell dataset
#' @param non.zero logical, if true, remove all gene with zero expression
#' @param markers vector or list of gene names
#' @param clusters character, the ColData used as clusters
#' @param samples character,the ColData used as samples
#' @param select.ct vector of cell types included, default as \code{NULL}. If \code{NULL}, include all cell types in \code{x}
#' @return celltype^2 by gene matrix of covariance
#' @importFrom Matrix rowSums
#'
#' @export
music_Sigma.ct = function(x, non.zero, markers, clusters, samples, select.ct){
if(!is.null(select.ct)){
x = x[, x@colData[, clusters] %in% select.ct]
}
if(non.zero){ ## eliminate non expressed genes
x <- x[rowSums(counts(x))>0, ]
}
nGenes = nrow(x);
clusters <- as.character(colData(x)[, clusters])
samples <- as.character(colData(x)[, samples])
Sigma <- sapply(unique(clusters), function(ct){
sapply(unique(samples), function(sid){
y = counts(x)[,clusters %in% ct & samples %in% sid]
if(is.null(dim(y))){
return(y/sum(y))
}else{
return(rowSums(y)/sum(y))
}
})
})
n.sub = length(unique(samples));
if(!is.null(select.ct)){
m.ct = match(select.ct, colnames(Sigma))
Sigma = Sigma[, m.ct]
n.ct = length(select.ct)
}
Sigma.ct = sapply(1:nGenes, function(g){cov(Sigma[nGenes*(0:(n.sub-1)) + g, ])})
if (!is.null(markers)){
ids <- intersect(unlist(markers), rownames(x))
m.ids = match(ids, rownames(x))
Sigma.ct <- Sigma.ct[ , m.ids]
}
return(Sigma.ct = Sigma.ct)
}
#' Cross-subject Variance of Relative Abundance
#'
#' This function is for calculating the cross-subject variance of relative abundance for selected cell types.
#'
#' @param x SingleCellExperiment, single cell dataset
#' @param non.zero logical, if true, remove all gene with zero expression
#' @param markers vector or list of gene names
#' @param clusters character, the colData used as clusters
#' @param samples character,the colData used as samples
#' @param select.ct vector of cell types included, default as \code{NULL}. If \code{NULL}, include all cell types in \code{x}
#' @return gene by cell type matrix of variance
#' @importFrom Matrix rowSums
#'
#' @export
music_Sigma = function(x, non.zero, markers, clusters, samples, select.ct){
if(!is.null(select.ct)){
x = x[, x@colData[, clusters] %in% select.ct]
}
if(non.zero){ ## eliminate non expressed genes
x <- x[rowSums(counts(x))>0, ]
}
clusters <- as.character(colData(x)[, clusters])
samples <- as.character(colData(x)[, samples])
Sigma <- sapply(unique(clusters), function(ct){
apply(sapply(unique(samples), function(sid){
y = counts(x)[,clusters %in% ct & samples %in% sid]
if(is.null(dim(y))){
return(y/sum(y))
}else{
return(rowSums(y)/sum(y))
}
}), 1, var, na.rm = TRUE)
})
if(!is.null(select.ct)){
m.ct = match(select.ct, colnames(Sigma))
Sigma = Sigma[, m.ct]
}
if (!is.null(markers)){
ids <- intersect(unlist(markers), rownames(x))
m.ids = match(ids, rownames(x))
Sigma <- Sigma[m.ids, ]
}
return(Sigma = Sigma)
}
#' Cell type specific library size
#'
#' This function is for calculating the cell type specific library size for selected cell types.
#'
#' @param x SingleCellExperiment, single cell dataset
#' @param non.zero logical, if true, remove all gene with zero expression
#' @param clusters character, the colData used as clusters
#' @param samples character,the colData used as samples
#' @param select.ct vector of cell types included, default as \code{NULL}. If \code{NULL}, include all cell types in \code{x}
#' @return subject by cell type matrix of library
#' @importFrom Matrix rowSums
#'
#' @export
music_S = function(x, non.zero, clusters, samples, select.ct){
if(!is.null(select.ct)){
x = x[, x@colData[, clusters] %in% select.ct]
}
if(non.zero){ ## eliminate non expressed genes
x <- x[rowSums(counts(x))>0, ]
}
clusters <- as.character(colData(x)[, clusters])
samples <- as.character(colData(x)[, samples])
S <- sapply(unique(clusters), function(ct){
my.rowMeans(sapply(unique(samples), function(sid){
y = counts(x)[, clusters %in% ct & samples %in% sid]
if(is.null(dim(y))){
return(sum(y))
}else{
return(sum(y)/ncol(y))
}
}), na.rm = TRUE)
})
S[S == 0] = NA
M.S = colMeans(S, na.rm = TRUE)
if(!is.null(select.ct)){
m.ct = match(select.ct, colnames(S))
S = S[, m.ct]
}
return(S = S)
}
#' Cell type specific library size
#'
#' This function is for calculating the cell type specific library size for selected cell types.
#'
#' @inheritParams music_S
#' @inheritParams music_M.theta
#' @param x SingleCellExperiment, single cell dataset
#' @param non.zero logical, if true, remove all gene with zero expression
#' @param clusters character, the colData used as clusters
#' @param samples character,the colData used as samples
#' @param select.ct vector of cell types included, default as \code{NULL}. If \code{NULL}, include all cell types in \code{x}
#' @return subject by cell type matrix of library
#'
#' @export
#' @seealso
#' \code{\link{music_S}}, \code{\link{music_M.theta}},
music_Design.matrix = function(x, non.zero, markers, clusters, samples, select.ct){
S = music_S(x = x, non.zero = non.zero, clusters = clusters, samples = samples, select.ct = select.ct)
M.theta = music_M.theta(x = x, non.zero = non.zero, markers = markers, clusters = clusters, samples = samples,
select.ct = select.ct)
S[S == 0] = NA
M.S = colMeans(S, na.rm = TRUE)
D <- t(t(M.theta)*M.S)
return(D)
}
#' Prepare Design matrix and Cross-subject Variance for MuSiC Deconvolution
#'
#' This function is used for generating cell type specific cross-subject mean and variance for each gene. Cell type specific library size is also calcualted.
#'
#' @param x SingleCellExperiment, single cell dataset
#' @param non.zero logical, default as TRUE. If true, remove all gene with zero expression.
#' @param markers vector or list of gene names. Default as NULL. If NULL, then use all genes provided.
#' @param clusters character, the colData used as clusters;
#' @param samples character,the colData used as samples;
#' @param select.ct vector of cell types. Default as NULL. If NULL, then use all cell types provided.
#' @param cell_size data.frame of cell sizes. 1st column contains the names of cell types, 2nd column has the cell sizes per cell type. Default as NULL. If NULL, then estimate cell size from data.
#' @param ct.cov logical. If TRUE, use the covariance across cell types.
#' @param verbose logical, default as TRUE.
#' @return a list of
#' \itemize{
#' \item {gene by cell type matrix of Design matrix;}
#' \item {subject by celltype matrix of Library size;}
#' \item {vector of average library size for each cell type;}
#' \item {gene by celltype matrix of average relative abundance;}
#' \item {gene by celltype matrix of cross-subject variation.}
#' }
#' @importFrom Matrix rowSums
#'
#' @export
music_basis = function(x, non.zero = TRUE, markers = NULL, clusters, samples, select.ct = NULL, cell_size = NULL, ct.cov = FALSE, verbose = TRUE){
if(!is.null(select.ct)){
x = x[, x@colData[, clusters] %in% select.ct]
}
if(non.zero){ ## eliminate non expressed genes
x <- x[rowSums(counts(x))>0, ]
}
clusters <- as.character(colData(x)[, clusters])
samples <- as.character(colData(x)[, samples])
M.theta <- sapply(unique(clusters), function(ct){
my.rowMeans(sapply(unique(samples), function(sid){
y = counts(x)[,clusters %in% ct & samples %in% sid]
if(is.null(dim(y))){
return(y/sum(y))
}else{
return(rowSums(y)/sum(y))
}
}), na.rm = TRUE)
})
if(verbose){message("Creating Relative Abudance Matrix...")}
if(ct.cov){
nGenes = nrow(x);
n.ct = length(unique(clusters));
nSubs = length(unique(samples))
Theta <- sapply(unique(clusters), function(ct){
sapply(unique(samples), function(sid){
y = counts(x)[,clusters %in% ct & samples %in% sid]
if(is.null(dim(y))){
return(y/sum(y))
}else{
return( rowSums(y)/sum(y) )
}
})
})
if(!is.null(select.ct)){
m.ct = match(select.ct, colnames(Theta))
Theta = Theta[, m.ct]
}
Sigma.ct = sapply(1:nGenes, function(g){
sigma.temp = Theta[nGenes*(0:(nSubs - 1)) + g, ];
Cov.temp = cov(sigma.temp)
Cov.temp1 = cov(sigma.temp[rowSums(is.na(Theta[nGenes*(0:(nSubs - 1)) + 1, ])) == 0, ])
Cov.temp[which(colSums(is.na(sigma.temp))>0), ] = Cov.temp1[which(colSums(is.na(sigma.temp))>0), ]
Cov.temp[, which(colSums(is.na(sigma.temp))>0)] = Cov.temp1[, which(colSums(is.na(sigma.temp))>0)]
return(Cov.temp)
})
colnames(Sigma.ct) = rownames(x);
if (!is.null(markers)){
ids <- intersect(unlist(markers), rownames(x))
m.ids = match(ids, rownames(x))
Sigma.ct <- Sigma.ct[ , m.ids]
}
if(verbose){message("Creating Covariance Matrix...")}
}else{
Sigma <- sapply(unique(clusters), function(ct){
apply(sapply(unique(samples), function(sid){
y = counts(x)[,clusters %in% ct & samples %in% sid]
if(is.null(dim(y))){
return(y/sum(y))
}else{
return(rowSums(y)/sum(y))
}
}), 1, var, na.rm = TRUE)
})
if(!is.null(select.ct)){
m.ct = match(select.ct, colnames(Sigma))
Sigma = Sigma[, m.ct]
}
if (!is.null(markers)){
ids <- intersect(unlist(markers), rownames(x))
m.ids = match(ids, rownames(x))
Sigma <- Sigma[m.ids, ]
}
if(verbose){message("Creating Variance Matrix...")}
}
S <- sapply(unique(clusters), function(ct){
my.rowMeans(sapply(unique(samples), function(sid){
y = counts(x)[, clusters %in% ct & samples %in% sid]
if(is.null(dim(y))){
return(sum(y))
}else{
return(sum(y)/ncol(y))
}
}), na.rm = TRUE)
})
if(verbose){message("Creating Library Size Matrix...")}
S[S == 0] = NA
M.S = colMeans(S, na.rm = TRUE)
#S.ra = relative.ab(S, by.col = FALSE)
#S.ra[S.ra == 0] = NA
#S[S == 0] = NA
#M.S = mean(S, na.rm = TRUE)*ncol(S)*colMeans(S.ra, na.rm = TRUE)
if(!is.null(cell_size)){
if(!is.data.frame(cell_size)){
stop("cell_size paramter should be a data.frame with 1st column for cell type names and 2nd column for cell sizes")
}else if(sum(names(M.S) %in% cell_size[, 1]) != length(names(M.S))){
stop("Cell type names in cell_size must match clusters")
}else if (any(is.na(as.numeric(cell_size[, 2])))){
stop("Cell sizes should all be numeric")
}
my_ms_names <- names(M.S)
cell_size <- cell_size[my_ms_names %in% cell_size[, 1], ]
M.S <- cell_size[match(my_ms_names, cell_size[, 1]),]
M.S <- M.S[, 2]
names(M.S) <- my_ms_names
}
D <- t(t(M.theta)*M.S)
if(!is.null(select.ct)){
m.ct = match(select.ct, colnames(D))
D = D[, m.ct]
S = S[, m.ct]
M.S = M.S[m.ct]
M.theta = M.theta[, m.ct]
}
if (!is.null(markers)){
ids <- intersect(unlist(markers), rownames(x))
m.ids = match(ids, rownames(x))
D <- D[m.ids, ]
M.theta <- M.theta[m.ids, ]
}
if(ct.cov){
return(list(Disgn.mtx = D, S = S, M.S = M.S, M.theta = M.theta, Sigma.ct = Sigma.ct))
}else{
return(list(Disgn.mtx = D, S = S, M.S = M.S, M.theta = M.theta, Sigma = Sigma))
}
}