-
Notifications
You must be signed in to change notification settings - Fork 11
/
phyloseq_filter.R
342 lines (275 loc) · 12.3 KB
/
phyloseq_filter.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
#' @title Remove samples from phyloseq object that have less than n taxa
#'
#' @param physeq A phyloseq-class object
#' @param mintaxa Minimum number of taxa that should be present in a sample (default, 10)
#'
#' @return Trimmed phyloseq object (All samples will have >= N taxa)
#' @export
#'
#' @examples
#' data("esophagus")
#' esophagus
#' phyloseq_richness_filter(esophagus, mintaxa = 30)
#' phyloseq_richness_filter(esophagus, mintaxa = 100)
#'
phyloseq_richness_filter <- function(physeq, mintaxa = 10){
## Estimate number of OTUs per sample
sp <- phyloseq::estimate_richness(physeq, measures = "Observed")
samples_to_keep <- rownames(sp)[ which(sp$Observed >= mintaxa) ]
if(length(samples_to_keep) == 0){
stop("All samples will be removed.\n")
}
if(length(samples_to_keep) == phyloseq::nsamples(physeq)){
cat("All samples will be preserved\n")
res <- physeq
}
if(length(samples_to_keep) < phyloseq::nsamples(physeq)){
res <- phyloseq::prune_samples(samples = samples_to_keep, x = physeq)
}
return(res)
}
#' @title Remove taxa with small mean relative abundance.
#'
#' @param physeq A phyloseq-class object
#' @param frac The minimum cutoff for the relative OTU abundance
#' @details This function searches for taxa with small mean relative abundance and removes them. Result will be returned with original counts in the abundance table.
#' @return Phyloseq object with a subset of taxa.
#' @export
#'
#' @examples
#' data("esophagus")
#' phyloseq_filter_taxa_rel_abund(esophagus, frac = 0.01)
#' phyloseq_filter_taxa_rel_abund(esophagus, frac = 0.1)
#'
phyloseq_filter_taxa_rel_abund <- function(physeq, frac = 1e-4){
# require(phyloseq)
## Transform OTU counts to relative abundance
rel <- phyloseq::transform_sample_counts(physeq, function(x) x / sum(x) )
## Filter OTUs
rel.subs <- phyloseq::filter_taxa(rel, function(x){ mean(x) > frac }, prune = FALSE)
## if prune = TRUE
# tn <- taxa_names(rel.subs) # OTUs to preserve
# tr <- setdiff(taxa_names(physeq), tn) # OTUs to remove
## Taxa to remove
tr <- names(rel.subs)[ which(rel.subs == FALSE) ]
## If all taxa should be removed
if(length(tr) == phyloseq::ntaxa(physeq)){
stop("Error: all taxa will be removed with the specified 'frac' cutoff.\n")
}
## If there is nothing to remove
if(length(tr) == 0){
res <- physeq
cat("Warning: no taxa removed.\n")
}
## Keep taxa which satisfies the truncation threshold
if(length(tr) > 0){
res <- phyloseq::prune_taxa(taxa = rel.subs, physeq)
}
return(res)
}
#' @title Remove taxa with abundance less then a certain fraction of total abundance.
#'
#' @param physeq A phyloseq-class object
#' @param frac The minimum cutoff for the OTU abundance in the table. This number is a fraction, not a percent.
#' @details
#' If frac = 0.0001, this will retain all OTU's that have at least a 0.01\% total abundance in the OTU table.
#' If you wanted to retain OTUs with at least 1\% total abundance, you must specify, 0.01.
#'
#' @return Phyloseq object with a subset of taxa.
#' @export
#' @seealso http://qiime.org/scripts/filter_otus_from_otu_table.html
#' @examples
#' data("esophagus")
#' phyloseq_filter_taxa_tot_fraction(esophagus, frac = 0.01)
#'
phyloseq_filter_taxa_tot_fraction <- function(physeq, frac = 0.01){
# require(phyloseq)
## Estimate total abundance of OTUs
tot <- sum(phyloseq::taxa_sums(physeq))
## Remove OTUs
res <- phyloseq::filter_taxa(physeq, function(x){ ( sum(x)/tot ) > frac }, prune = TRUE)
return(res)
}
#' @title Filter low-prevalence OTUs.
#' @description This function will remove taxa (OTUs) with low prevalence, where prevalence is the fraction of total samples in which an OTU is observed.
#' @param physeq A phyloseq-class object
#' @param prev.trh Prevalence threshold (default, 0.05 = 5\% of samples)
#' @param abund.trh Abundance threshold (default, NULL)
#' @param threshold_condition Indicates type of prevalence and abundance conditions, can be "OR" (default) or "AND"
#' @param abund.type Character string indicating which type of OTU abundance to take into account for filtering ("total", "mean", or "median")
#' @details
#' Abundance threshold defines if the OTU should be preserved if its abundance is larger than threshold (e.g., >= 50 reads).
#' Parameter "threshold_condition" indicates whether OTU should be kept if it occurs in many samples AND/OR it has high abundance.
#' @return Phyloseq object with a subset of taxa.
#' @seealso \code{\link{phyloseq_prevalence_plot}}
#' @export
#'
#' @examples
#' data(GlobalPatterns)
#' GlobalPatterns # 19216 taxa
#'
#' # OTUs that are found in at least 5% of samples
#' phyloseq_filter_prevalence(GlobalPatterns, prev.trh = 0.05, abund.trh = NULL) # 15389 taxa
#'
#' # The same, but if total OTU abundance is >= 10 reads it'll be preserved too
#' phyloseq_filter_prevalence(GlobalPatterns, prev.trh = 0.05, abund.trh = 10, threshold_condition = "OR") # 15639 taxa
#'
#' # Include only taxa with more than 10 reads (on average) in at least 10% samples
#' phyloseq_filter_prevalence(GlobalPatterns, prev.trh = 0.1, abund.trh = 10, abund.type = "mean", threshold_condition = "AND") # 4250 taxa
#'
phyloseq_filter_prevalence <- function(physeq, prev.trh = 0.05, abund.trh = NULL, threshold_condition = "OR", abund.type = "total"){
## Threshold validation
if(prev.trh > 1 | prev.trh < 0){ stop("Prevalence threshold should be non-negative value in the range of [0, 1].\n") }
if(!is.null(abund.trh)){
if(abund.trh <= 0){ stop("Abundance threshold should be non-negative value larger 0.\n") }
}
# ## Check for the low-prevalence species (compute the total and average prevalences of the features in each phylum)
# prevdf_smr <- function(prevdf){
# plyr::ddply(prevdf, "Phylum", function(df1){
# data.frame(
# Average = mean(df1$Prevalence),
# Total = sum(df1$Prevalence))
# })
# }
# prevdf_smr( prevalence(physeq) )
## Check the prevalence threshold
# phyloseq_prevalence_plot(prevdf, physeq)
## Define prevalence threshold as % of total samples
## This function is located in 'phyloseq_prevalence_plot.R' file
prevalenceThreshold <- prev.trh * phyloseq::nsamples(physeq)
## Calculate prevalence (number of samples with OTU) and OTU total abundance
prevdf <- prevalence(physeq)
## Get the abundance type
if(abund.type == "total") { prevdf$AbundFilt <- prevdf$TotalAbundance }
if(abund.type == "mean") { prevdf$AbundFilt <- prevdf$MeanAbundance }
if(abund.type == "median"){ prevdf$AbundFilt <- prevdf$MedianAbundance }
## Which taxa to preserve
if(is.null(abund.trh)) { tt <- prevdf$Prevalence >= prevalenceThreshold }
if(!is.null(abund.trh)){
## Keep OTU if it either occurs in many samples OR it has high abundance
if(threshold_condition == "OR"){
tt <- (prevdf$Prevalence >= prevalenceThreshold | prevdf$AbundFilt >= abund.trh)
}
## Keep OTU if it occurs in many samples AND it has high abundance
if(threshold_condition == "AND"){
tt <- (prevdf$Prevalence >= prevalenceThreshold & prevdf$AbundFilt >= abund.trh)
}
}
## Extract names for the taxa we whant to keep
keepTaxa <- prevdf$Taxa[ tt ]
## Execute prevalence filter
res <- phyloseq::prune_taxa(taxa = keepTaxa, x = physeq)
return(res)
}
#' @title Filter rare OTUs based on minimum abundance threshold.
#' @description This function performs sample-wise OTU abundance trimming.
#' @param physeq A phyloseq-class object
#' @param minabund Abundance threshold (default, 10)
#' @param relabund Logical; perform trimming based on relative abundances (default, FALSE)
#' @param rm_zero_OTUs Logical, remove OTUs with zero total abundance
#' @details
#' OTUs can be considered as rare if they comprise fewer than X (e.g., 10) sequences within a sample.
#' This function is intented to censore OTU abundance (unsing an arbitrary threshold) on a sample-wise basis.
#'
#' Trimming can be performed based on relative abundances of OTUs within a sample (`relabund = TRUE`), but the orginal OTU count will be returned.
#' For this purpose `minabund` parameter should be provided in a range of (0,1] (e.g., use `minabund = 0.1, relabund = TRUE` to remove OTUs with relative abundance < 10% in each sample).
#'
#' @return Phyloseq object with a filtered data.
#' @export
#'
#' @examples
#' # Load data
#' data(GlobalPatterns)
#'
#' # Trim GlobalPatterns data (19216 taxa) by removing OTUs with less that 10 reads
#' GP1 <- phyloseq_filter_sample_wise_abund_trim(GlobalPatterns, minabund = 10) # 10605 taxa
#'
#' # Trim GlobalPatterns data by removing OTUs with relative abundance less than 1%
#' GP2 <- phyloseq_filter_sample_wise_abund_trim(GlobalPatterns, minabund = 0.01, relabund = TRUE) # 258 taxa
#'
#' # Compare raw and trimmed data
#' phyloseq_summary(GlobalPatterns, GP1, GP2, cols = c("GlobalPatterns", "Trimmed 10 reads", "Trimmed 1 percent"))
#'
phyloseq_filter_sample_wise_abund_trim <- function(physeq, minabund = 10, relabund = FALSE, rm_zero_OTUs = TRUE){
## Censore OTU abundance
if(relabund == FALSE){ # trim based on absolute OTU counts
res <- phyloseq::transform_sample_counts(physeq, function(OTU, ab = minabund){ ifelse(OTU <= ab, 0, OTU) })
} else { # trim based on relative abundances within sample, but return original counts
if(!minabund > 0 & minabund <= 1){
stop("Error: for relative abundance trimmin 'minabund' should be in (0,1] interval.\n")
}
## Convert data to relative abundances
res <- phyloseq_standardize_otu_abundance(physeq, method = "total")
## Remove relative abundances less than the threshold value
res <- phyloseq::transform_sample_counts(res, function(OTU, ab = minabund){ ifelse(OTU <= ab, 0, OTU) })
## Sample sums and data orientation
smps <- phyloseq::sample_sums(physeq)
if(phyloseq::taxa_are_rows(physeq) == TRUE){
mar <- 2
} else {
mar <- 1
}
## Convert back to counts by multiplying relative abundances by sample sums
phyloseq::otu_table(res) <- phyloseq::otu_table(
sweep(x = phyloseq::otu_table(res), MARGIN = mar, STATS = smps, FUN = `*`),
taxa_are_rows = phyloseq::taxa_are_rows(physeq))
}
## Remove zero-OTUs
if(rm_zero_OTUs == TRUE){
res <- phyloseq::prune_taxa(taxa_sums(res) > 0, res)
}
return(res)
}
#' @title Extract the most abundant taxa.
#' @param physeq A phyloseq-class object
#' @param perc Percentage of the most abundant taxa to retain
#' @param n Number of the most abundant taxa to retain (this argument will override perc argument)
#' @return Phyloseq object with a filtered data.
#' @export
#'
#' @examples
#'
phyloseq_filter_top_taxa <- function(physeq, perc = 10, n = NULL){
## Arguments validation
if(perc <= 0 | perc > 100){ stop("Error: percentage should be in 1-100 range.\n") }
## Get total abundances for all taxa
taxx <- sort(phyloseq::taxa_sums(physeq), decreasing = TRUE)
## Find how many taxa to preserve (if percentage is specified)
if(is.null(n)){
n <- phyloseq::ntaxa(physeq) * perc / 100
n <- floor(n)
}
## Extract names for the taxa that should be preserved
keepTaxa <- names(taxx)[1:n]
## Extract this taxa
physeq_pruned <- phyloseq::prune_taxa(keepTaxa, physeq)
return(physeq_pruned)
}
#' @title Check the range of the top-taxa filtering values to determine the optimal threshold.
#' @description This function performs taxa filtering by retaining the most abundant taxa.
#' A range of abundance percentages (5 - 95\%) will be explored.
#' @param physeq A phyloseq-class object
#' @param show_plot Logical; if TRUE, shows the plot on screen
#' @return ggplot-object.
#' @export
#'
#' @examples
#'
phyloseq_filter_top_taxa_range <- function(physeq, show_plot = TRUE){
percs <- seq(5, 95, 5)
fr <- plyr::mlply(.data = data.frame(perc = percs), .fun = function(...){ phyloseq_filter_top_taxa(physeq, ...) })
names(fr) <- percs
fr_tab <- plyr::ldply(.data = fr, .fun = function(z){
sz <- phyloseq::sample_sums(z)
res <- data.frame(Sample = names(sz), Preserved = sz)
return(res)
})
pp <- ggplot(data = fr_tab, aes(x = perc, y = Preserved, group = Sample)) + # color = Sample
geom_vline(xintercept=75, color="grey", linetype = "longdash") +
geom_line() +
geom_point() +
labs(x = "Number of most abundant taxa retained, %", y = "Percentage of total sample abundance") +
theme(legend.position = "none")
if(show_plot == TRUE){ print(pp) }
invisible(pp)
}