-
Notifications
You must be signed in to change notification settings - Fork 0
/
detectDiffTSS.R
464 lines (420 loc) · 17.5 KB
/
detectDiffTSS.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
#' Calculate normalization factors from CapSet object
#'
#' @rdname getNormFactors
#' @param CSobject An object of class \code{\link{CapSet}}
#' @param features A \link[GenomicRanges]{GRanges-class}.object to count the reads on.
#' @param method Method to use for normalization. Options : "TMM","RLE","upperquartile","none"
#' @param ... Additional arguments passed to \link[edgeR]{calcNormFactors}
#'
#' @return Numeric vector of calculated normalization factors.
#' @importFrom GenomicAlignments summarizeOverlaps
#' @export
#'
#' @examples
#'
#' # load a txdb object
#' library("TxDb.Dmelanogaster.UCSC.dm6.ensGene")
#' seqlevelsStyle(TxDb.Dmelanogaster.UCSC.dm6.ensGene) <- "ENSEMBL"
#'
#' # get genes (only X chromsome, for simplicity)
#' seqlevels(TxDb.Dmelanogaster.UCSC.dm6.ensGene) <- "X"
#' dm6genes <- genes(TxDb.Dmelanogaster.UCSC.dm6.ensGene)
#'
#' # get norm factors by counting reads on genes
#' cs <- exampleCSobject()
#' normfacs <- getNormFactors(cs, dm6genes, method = "RLE")
#'
setMethod("getNormFactors",
signature = "CapSet",
function(CSobject,
features,
method,
...) {
# get bam files
si <- sampleInfo(CSobject)
bam.files <- si$filtered_file
## get 5' read counts on the TSS from the bam.files
counts <- summarizeOverlaps(features = features,
reads = bam.files,
preprocess.reads = readsTo5p)
# make DGElist
y <- edgeR::DGEList(counts = assay(counts))
normfacs <- edgeR::calcNormFactors(y, method = method, ...)
return(normfacs)
}
)
#' @rdname fitDiffTSS
#' @param CSobject An object of class \code{\link{CapSet}}
#' @param TSSfile A .bed file with TSS positions to test for differential TSS analysis. If left
#' empty, the union of detected TSS present within the provided CSobject would be plotted.
#' @param groups Character vector indicating the group into which each sample within the CSobject falls.
#' the groups would be use to create a design matrix. As an example, replicates for one
#' condition could be in the same group.
#' @param method Which method to use for differential expression analysis? options are "DESeq2" or "edgeR".
#' If "DESeq2" is chosen, the library size is either estimated via DESeq2 (using "median of ratios")
#' or can be provided via the "normFactors" option below. Setting the "normalization"
#' (below) has no effect in that case.
#' @param normalization A character indicating the type of normalization to perform. Options are "windowTMM",
#' "TMM", "RLE", "upperquartile" or NULL (don't compute normalization factors).
#' If "windowTMM" is chosen, the normalization factors are calculated using the TMM
#' method on 10 kb windows of the genome. "TMM" computes TMM normalization using counts
#' from all the evaluated TSSs. If NULL, the external normalization factors can be used
#' (provided using `normFactors`).
#' @param normFactors external normalization factors (from Spike-Ins, for example).
#'
#' @param outplots Output pdf filename for plots. If provided, the plots for BCV, dispersion and
#' MDS plot is created and saved in this file.
#' @param plotRefSample Name of reference sample to plot for detection of composition bias in the
#' data. Samples could be normalized using one of the provided normalization methods to
#' control for composition bias.
#' @param ncores No. of cores/threads to use
#'
#' @return An object of class \link[edgeR]{DGEGLM-class} or
#' \link[DESeq2]{DESeqDataSet}
#'
#' @importFrom graphics abline par plot smoothScatter
#' @importFrom grDevices dev.off pdf
#' @importFrom stats model.matrix
#' @importFrom methods as
#'
#' @export
#' @examples
#' # before running this
#' # 1. Create a CapSet object
#' # 2. de-multiplex the fastqs
#' # 3. map them
#' # 4. filter duplicate reads from mapped BAM
#' # 5. detect TSS
#' # 6. fit the diffTSS model
#' \dontrun{
#' # load a previously saved CapSet object
#' cs <- exampleCSobject()
#'
#' # count reads on all TSS (union) and fit a model using replicates within groups
#' csfit <- fitDiffTSS(cs, groups = rep(c("wt","mut"), each = 2), normalization = "internal",
#' outplots = NULL, plotRefSample = "embryo1")
#' save(csfit, file = "diffTSS_fit.Rdata")
#' }
#'
setMethod("fitDiffTSS",
signature = "CapSet",
function(CSobject,
TSSfile,
groups,
method,
normalization,
normFactors,
outplots,
plotRefSample,
ncores) {
# get bam files and design
si <- sampleInfo(CSobject)
if (all(is.na(si$filtered_file))) {
warning("Filtered files not found under sampleInfo(CSobject). Using mapped files")
bam.files <- si$mapped_file
} else {
bam.files <- si$filtered_file
}
if (any(is.na(bam.files))) stop("Some or all of the bam files are not defined!")
if (sum(file.exists(bam.files)) != length(bam.files)) {
stop("One or more bam files don't exist! Check sampleInfo(CSobject) ")
}
samples <- as.character(si$samples)
designDF <- data.frame(row.names = samples, group = groups)
# Import tss locations to test
if (is.null(TSSfile)) {
if (is.null(CSobject@tss_detected)) stop("Detected TSS absent or not provided.")
merged <- CSobject@tss_detected
stopifnot(!(is.null(merged)))
mergedall <- base::Reduce(S4Vectors::union, merged)
} else {
stopifnot(file.exists(TSSfile))
mergedall <- rtracklayer::import.bed(TSSfile)
}
## get 5' read counts on the TSS from the bam.files
bp_param <- getMCparams(ncores)
tsscounts <-
GenomicAlignments::summarizeOverlaps(features = mergedall,
reads = bam.files,
singleEnd = TRUE,
preprocess.reads = ResizeReads,
BPPARAM = bp_param)
## check method
if (method == "DESeq2") {
counts <- assay(tsscounts)
colnames(counts) <- samples
dds <- DESeq2::DESeqDataSetFromMatrix(
counts,
rowRanges = SummarizedExperiment::rowRanges(tsscounts),
colData = designDF,
design = ~group)
if (is.null(normFactors)) {
message("Performing DESeq normalization")
dds <- DESeq2::estimateSizeFactors(dds)
} else {
message("Using external normalization factors")
DESeq2::sizeFactors(dds) <- normFactors
}
normalization <- "skip"
} else if (method == "edgeR") {
# make DGElist
y <- edgeR::DGEList(counts = SummarizedExperiment::assay(tsscounts))
}
## Get norm factors
if (is.null(normalization)) {
## if normalization method not provided, look for external Norm factors
normfacs <- normFactors
# norm factors should be either NULL or a numeric vector
if (!is.null(normfacs)) {
stopifnot(is.numeric(normfacs) & length(normfacs) == length(groups))
}
y$samples$norm.factors <- normfacs
} else if (normalization == "windowTMM") {
## normalization factors calculated using TMM on large Windows
# fail early if plotRefSample is not given
if (is.null(plotRefSample))
stop("Please indicate reference sample for plotting of composition bias!")
## Internal normalization for composition bias : TMM
# useful to try different bin sizes and see if the values are close to unity
# (low composition effect)
regionparam <- csaw::readParam(restrict = NULL,
pe = ifelse(isTRUE(CSobject@paired_end), "first", "none"))
binned <-
csaw::windowCounts(bam.files,
bin = TRUE,
width = 10000,
param = regionparam)
normfacs <- csaw::normOffsets(binned) # close to unity
names(normfacs) <- samples
y.bin <- csaw::asDGEList(binned)
y$samples$norm.factors <- normfacs
if (!(is.na(plotRefSample))) {
print(plotCompBias(dgelist = y.bin, samples, plotRefSample))
}
} else if (normalization %in% c("TMM", "RLE", "upperquartile", "none")) {
## normalization factors calculated using TMM on all TSSs
y <- edgeR::calcNormFactors(y, method = normalization)
if (!(is.na(plotRefSample))) {
print(plotCompBias(dgelist = y, samples, plotRefSample))
}
} else if (normalization == "skip") {
message("skipping additional normalizations")
} else stop("Unknown normalization method!")
if (method == "DESeq2") {
## proceed with DESeq2
dds <- DESeq2::estimateDispersions(dds)
fit <- DESeq2::nbinomWaldTest(dds)
y <- NA
} else if (method == "edgeR") {
# proceed with edgeR
designm <- model.matrix( ~ 0 + group, designDF)
y <- edgeR::estimateDisp(y, designm)
fit <- edgeR::glmQLFit(y, designm, robust = TRUE)
# check prior degrees of freedom (avoid Infs)
#message("Prior degrees of freedom : ")
#print(summary(fit$df.prior))
}
## make plots if asked
if (!is.null(outplots)) {
pdf(outplots)
diffQCplots(method, fit, y, designMat = designDF)
dev.off()
}
## return the fit
return(fit)
}
)
#' Make DESeq2 or edgeR QC plots
#' @importFrom ggplot2 ggplot aes aes_string geom_hline geom_vline geom_point labs theme_bw
#' scale_shape_manual
#' @param method one of "DESeq2" or "edgeR"
#' @param fit output of fitDiffTSS (if method = "edgeR")
#' @param y output of fitDiffTSS (if method = "DESeq2")
#' @param designMat design matrix (data frame)
#'
#' @return Sparsity, dispersion and PCA plot (if method = DESeq2),
#' BCV, dispersion and MDS plot (if method = "edgeR")
#'
diffQCplots <- function(method, fit, y, designMat) {
if (method == "DESeq2") {
## plot sparsity
DESeq2::plotSparsity(fit)
## plot Dispersions
DESeq2::plotDispEsts(fit)
## plot PCA
dds_rlog <- DESeq2::rlog(fit)
PCAdata <- DESeq2::plotPCA(dds_rlog, intgroup = "group", returnData=TRUE)
percentVar <- round(100 * attr(PCAdata, "percentVar"))
print(ggplot(PCAdata, aes_string("PC1", "PC2", color = "group")) +
geom_hline(aes(yintercept = 0), colour = "grey") +
geom_vline(aes(xintercept = 0), colour = "grey") +
geom_point(size = 5) +
labs(x = paste0("PC1: ", percentVar[1], "% variance"),
y = paste0("PC2: ", percentVar[2], "% variance"),
title = "PCA\n") +
theme_bw(base_size = 14) +
scale_shape_manual(values = c(0:18,33:17))
)
} else if (method == "edgeR") {
## plot BCV and fit
par(mfrow = c(1, 2))
o <- order(y$AveLogCPM)
plot(
y$AveLogCPM[o],
sqrt(y$trended.dispersion[o]),
type = "l",
lwd = 2,
ylim = c(0, 1),
xlab = expression("Ave." ~ Log[2] ~ "CPM"),
ylab = ("Biological coefficient of variation")
)
## plot dispersion
edgeR::plotQLDisp(fit)
## plot MDS (top genes/tss)
par(mfrow = c(2, 2), mar = c(5, 4, 2, 2))
for (top in c(100, 500, 1000, 5000)) {
out <- limma::plotMDS(
edgeR::cpm(y, log = TRUE),
main = top,
labels = designMat$group,
top = top
)
}
}
}
## for "windowTMM" norm : visualize Effect of TMM normalization on composition bias
plotCompBias <- function(dgelist, samples, plotRefSample) {
normfacs <- dgelist$samples$norm.factors
abundances <- edgeR::aveLogCPM(dgelist)
adjc <- edgeR::cpm(dgelist, log = TRUE)
colnames(adjc) <- samples
# plot ref sample vs all other samples
message("plotting the composition effect")
sampnumber <- ncol(adjc) - 1
cols_toplot <- grep(plotRefSample, samples, invert = TRUE)
n <- ceiling(sampnumber / 3) #roundup to make divisible by 3
par(cex.lab = 1.5, mfrow = c(n, 3))
lapply(cols_toplot, function(x) {
smoothScatter(
abundances,
adjc[, plotRefSample] - adjc[, x],
ylim = c(-6, 6),
xlab = "Average abundance",
ylab = paste0("Log-ratio (", plotRefSample, " vs ", x, ")")
)
abline(h = log2(normfacs[plotRefSample] / normfacs[x]), col = "red")
})
}
#' @rdname detectDiffTSS
#' @importFrom limma makeContrasts
#' @importFrom edgeR glmQLFTest
#' @importFrom ggplot2 ggplot aes_string geom_point geom_abline scale_color_manual labs theme_gray theme
#'
#' @export
#' @examples
#'
#' # before running this
#' # 1. Create a CapSet object
#' # 2. de-multiplex the fastqs
#' # 3. map them
#' # 4. filter duplicate reads from mapped BAM
#' # 5. detect TSS
#' # 6. fit the diff TSS model.
#'
#' \dontrun{
#' # load a previously saved DGEGLM object from step 5
#' csfit <- load("diffTSS_fit.Rdata")
#' dir <- system.file("extdata", package = "icetea")
#' # detect differentially expressed TSS between groups (return MA plot)
#' detectDiffTSS(csfit, testGroup = "mut", controlGroup = "wt",
#' tssFile = file.path(dir, "testTSS_merged.bed"), MAplot_fdr = 0.05)
#'
#' }
#'
setMethod("detectDiffTSS",
signature = "DGEGLM",
function(fit,
testGroup,
contGroup,
TSSfile,
MAplot_fdr = NA) {
# Import tss locations to test
mergedall <- rtracklayer::import.bed(TSSfile)
## Testing the differential TSS
# make contrast matrix
contr <- paste0("group", testGroup , "-group", contGroup)
contrast <-
makeContrasts(contrasts = contr, levels = fit$design)
# test
results <- glmQLFTest(fit, contrast = contrast)
top <- as.data.frame(edgeR::topTags(results, n = Inf))
# sort output by pvalue and return
difftss <- mergedall[as.numeric(rownames(top))]
difftss$score <- top$FDR
difftss$logFC <- top$logFC
difftss$logCPM <- top$logCPM
# MA plot
if (!(is.na(MAplot_fdr))) {
p <-
ggplot(top, aes_string("logCPM", "logFC", col = factor(top$FDR < MAplot_fdr))) +
geom_point(alpha = 0.5) +
geom_abline(slope = 0, intercept = 0) +
scale_color_manual(values = c("grey40", "darkred")) +
labs(col = "Differentially Expressed") +
theme_gray(base_size = 14) +
theme(legend.position = "top")
print(p)
}
return(difftss)
}
)
#' @rdname detectDiffTSS
#' @export
#'
#' @importFrom DESeq2 results
#' @importFrom SummarizedExperiment rowRanges
#' @importFrom ggplot2 ggplot aes_string geom_point geom_abline scale_color_manual labs
#' theme_gray theme scale_x_log10
#' @examples
#' \dontrun{
#' # load a previously saved DGEGLM object from step 5
#' csfit <- load("diffTSS_fit.Rdata")
#' dir <- system.file("extdata", package = "icetea")
#' # detect differentially expressed TSS between groups (return MA plot)
#' detectDiffTSS(csfit, testGroup = "mut", controlGroup = "wt", MAplot_fdr = 0.05)
#'
#' }
#'
setMethod("detectDiffTSS",
signature = "DESeqDataSet",
function(fit,
testGroup,
contGroup,
MAplot_fdr = NA) {
# Get TSS location as GRanges from dds metadata
mergedall <- rowRanges(fit)
## extract results from dds
contrastvec <- c("group", testGroup, contGroup)
ddr <- results(fit, contrast = contrastvec)
# add information to TSS GRanges
mergedall$score <- ddr$padj
mergedall$logFC <- ddr$log2FoldChange
mergedall$basemean <- ddr$baseMean
# MA plot
if (!(is.na(MAplot_fdr))) {
p <-
ggplot(as.data.frame(ddr), aes_string("baseMean", "log2FoldChange",
col = factor(ddr$padj < MAplot_fdr))) +
geom_point(alpha = 0.5) +
geom_abline(slope = 0, intercept = 0) +
scale_color_manual(values = c("grey40", "darkred")) +
labs(col = "Differentially Expressed") +
theme_gray(base_size = 14) +
theme(legend.position = "top") +
labs(x = "Mean Count", y = "log2(Fold-Change)") +
scale_x_log10()
print(p)
}
return(mergedall)
}
)