-
Notifications
You must be signed in to change notification settings - Fork 8
/
run_sstacks.R
265 lines (221 loc) · 9.85 KB
/
run_sstacks.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
#' @name run_sstacks
#' @title Run STACKS sstacks module
#' @description Run \href{http://catchenlab.life.illinois.edu/stacks/}{STACKS}
#' \href{http://catchenlab.life.illinois.edu/stacks/comp/sstacks.php}{sstacks}
#' module inside R!
#' Inside the \code{P} folder (where the \href{http://catchenlab.life.illinois.edu/stacks/comp/ustacks.php}{ustacks} and the \href{http://catchenlab.life.illinois.edu/stacks/comp/cstacks.php}{cstacks} files are),
#' you should have:
#' \itemize{
#' \item \strong{3 Catalog files:} the files created in
#' \href{http://catchenlab.life.illinois.edu/stacks/comp/cstacks.php}{cstacks}
#' and usually looking like this:
#' \code{batch_1.catalog.alleles.tsv.gz,
#' batch_1.catalog.snps.tsv.gz,
#' batch_1.catalog.tags.tsv.gz}
#' \item \strong{3 files for each samples:} The sample name is the prefix of
#' the files ending with:
#' \code{.alleles.tsv.gz, .snps.tsv.gz, .tags.tsv.gz}.
#' Those files are created in the
#' \href{http://catchenlab.life.illinois.edu/stacks/comp/ustacks.php}{ustacks}
#' module.
#' }
#' @param P (character) Path to the directory containing STACKS files.
#' Contrary to \href{http://catchenlab.life.illinois.edu/stacks/comp/sstacks.php}{sstacks},
#' this argument can be use with \code{sample.list} (the \code{s} in \href{http://catchenlab.life.illinois.edu/stacks/comp/sstacks.php}{sstacks})
#' Default: \code{P = "06_ustacks_2_gstacks"}.
#' @param M (character) Path to a population map file from which to take sample
#' names (this argument won't work if \code{sample.list}).
#' Advice: don't modify the default (please see details).
#' Default: \code{M = NULL}.
#' @param sample.list This is for the \code{s} option in
#' \href{http://catchenlab.life.illinois.edu/stacks/comp/sstacks.php}{sstacks}.
#' \code{s: Filename prefix from which to load sample loci}.
#' Here, you have 2 choices: 1. you leave empty and let the function use the
#' default:\code{sample.list = NULL} which will scan for the files in the
#' \code{P} folder given above. 2. you supply a character string of the
#' samples. This could come from the \code{INDIVIDUALS_REP} column of the
#' project info file, e.g. \code{sample.list = project.info$INDIVIDUALS_REP}.
#' (Please see details).
#' @param c (character, path) Path to the catalog.
#' Default: \code{c = "06_ustacks_2_gstacks"}
#' @param parallel.core (Integer) Enable parallel execution with num_threads threads.
#' Default: \code{parallel.core = parallel::detectCores() - 1}
#' @param o output path to write results.
#' Default: \code{o = "06_ustacks_2_gstacks"}
#' @param x Don't verify haplotype of matching locus.
#' Default: \code{x = FALSE}
#' @param v Print program version.
#' Default: \code{v = FALSE}
#' @param h Display this help messsage.
#' Default: \code{h = FALSE}
#' @param disable.gapped Disable gapped alignments between stacks.
#' Default: \code{disable.gapped = FALSE} (use gapped alignments).
#' @rdname run_sstacks
#' @export
#' @return \href{http://catchenlab.life.illinois.edu/stacks/comp/sstacks.php}{sstacks}
#' returns a \code{.matches.tsv.gz file for each sample}.
#' If \code{lnl_dist = TRUE}, the function will also return a
#' summary of catalog loci log-likelihood.
#' @details \strong{Computer or server problem during the sstacks ?}
#' Just launch again the same command, the function will start again, but only
#' with the unmatched samples!
#'
#' \strong{Some argument can't be use together:}
#'
#' Don't use these arguments together: \code{M} and \code{sample.list/c}.
#' You can't add samples to an existing catalog using a population map.
#' If you want to match new samples to an existing catalog, very easy,
#' just add the samples in the folder and let \code{run_sstacks} function find
#' the samples that were not matched to the catalog.
#' @examples
#' \dontrun{
#' # The simplest form of the function when using the stackr workflow:
#' run_sstacks()
#' # that's it !
#'
#'
#' Now if you have your own workflow folders, etc. Enter them like this:
#' sstacks <- run_sstacks (P = "/my/input/path", p = 32, b = 2,
#' sample.list = c("ind1", "ind2", "..."), o = "/my/output/path",
#' x = FALSE)
#' }
#' @seealso
#' \href{http://catchenlab.life.illinois.edu/stacks/comp/sstacks.php}{sstacks}
#' @references Catchen JM, Amores A, Hohenlohe PA et al. (2011)
#' Stacks: Building and Genotyping Loci De Novo From Short-Read Sequences.
#' G3, 1, 171-182.
#' @references Catchen JM, Hohenlohe PA, Bassham S, Amores A, Cresko WA (2013)
#' Stacks: an analysis tool set for population genomics.
#' Molecular Ecology, 22, 3124-3140.
# sstacks ----------------------------------------------------------------------
run_sstacks <- function(
P = "06_ustacks_2_gstacks",
M = NULL,
sample.list = NULL,
c = "06_ustacks_2_gstacks",
parallel.core = parallel::detectCores() - 1,
o = "06_ustacks_2_gstacks",
x = FALSE,
disable.gapped = FALSE,
v = FALSE,
h = FALSE
) {
cat("#######################################################################\n")
cat("######################## stackr::run_sstacks ##########################\n")
cat("#######################################################################\n")
timing <- proc.time()
res <- list() # return results in this list
# Check directory ------------------------------------------------------------
if (!dir.exists(P)) dir.create(P)
if (!dir.exists("09_log_files")) dir.create("09_log_files")
# sstacks options ------------------------------------------------------------
# p: enable parallel execution with num_threads threads.
p <- stringi::stri_join("-p ", parallel.core)
# o: output path to write results.
ustacks.folder <- o
o <- stringi::stri_join("-o ", shQuote(o))
# x: don't verify haplotype of matching locus.
if (x) {
x <- stringi::stri_join("-x ")
} else {
x <- ""
}
# v: print program version.
if (v) {
v <- stringi::stri_join("-v ")
} else {
v <- ""
}
# h: display this help messsage.
if (h) {
h <- stringi::stri_join("-h ")
} else {
h <- ""
}
# Gapped assembly options ---------------------------------------------------
if (disable.gapped) {
disable.gapped <- stringi::stri_join("--disable-gapped ")
} else {
disable.gapped <- ""
}
# Pop map --------------------------------------------------------------------
if (is.null(M)) {
c <- stringi::stri_join("-c ", c)
# s: filename prefix from which to load sample loci---------------------------
# Get the samples in the folder
samples.in.folder <- tibble::tibble(INDIVIDUALS_REP = list.files(P)) %>%
dplyr::filter(!grepl("catalog", INDIVIDUALS_REP))
# Search for those already matched
samples.matched <- samples.in.folder %>%
dplyr::filter(grepl("matches", INDIVIDUALS_REP)) %>%
dplyr::mutate(INDIVIDUALS_REP = stringi::stri_replace_all_fixed(
str = INDIVIDUALS_REP,
pattern = ".matches.tsv.gz", replacement = "",
vectorized_all = FALSE)
) %>%
dplyr::distinct(INDIVIDUALS_REP)
message(stringi::stri_join("Samples in folder already matched to the catalog: ",
length(samples.matched$INDIVIDUALS_REP)))
# Get the name of samples that need to be match to the catalog
samples.to.match <- samples.in.folder %>%
dplyr::filter(grepl("alleles", INDIVIDUALS_REP)) %>%
dplyr::mutate(INDIVIDUALS_REP = stringi::stri_replace_all_fixed(
str = INDIVIDUALS_REP,
pattern = ".alleles.tsv.gz", replacement = "",
vectorized_all = FALSE)
) %>%
dplyr::distinct(INDIVIDUALS_REP) %>%
dplyr::filter(!INDIVIDUALS_REP %in% samples.matched$INDIVIDUALS_REP)
message(stringi::stri_join("Samples in folder not matched to the catalog: ", length(samples.to.match$INDIVIDUALS_REP)))
if (is.null(sample.list)) {
s <- stringi::stri_join("-s ", shQuote(stringi::stri_join(P, "/", samples.to.match$INDIVIDUALS_REP)))
message(stringi::stri_join("Matching ", length(samples.to.match$INDIVIDUALS_REP), " sample(s) to the catalog..."))
} else {
sample.list.before <- sample.list
sample.list <- purrr::keep(.x = sample.list, .p = sample.list %in% samples.to.match$INDIVIDUALS_REP)
sample.list <- stringi::stri_join(P, "/", sample.list)
s <- stringi::stri_join("-s ", shQuote(sample.list))
samples.to.remove <- length(sample.list) - length(sample.list.before)
if (samples.to.remove != 0) {
message(stringi::stri_join("Removed ", samples.to.remove, " problematic sample(s) from the samples list to match"))
}
message(stringi::stri_join("Matching ", length(sample.list), " sample(s) to the catalog..."))
}
# remove unwanted // (better solution ? Please tell me!)
s <- stringi::stri_replace_all_fixed(
str = s,
pattern = "//", replacement = "/", vectorize_all = FALSE)
M <- ""
P <- ""
} else {
P <- stringi::stri_join("-P ", P)
M <- stringi::stri_join("-M ", M)
s <- ""
c <- ""
o <- ""
message("Automatically turning off sstacks arguments: -s, -c, -o")
}
# logs files -----------------------------------------------------------------
file.date.time <- format(Sys.time(), "%Y%m%d@%H%M")
log.file <- stringi::stri_join("09_log_files/sstacks_", file.date.time,".log")
message(stringi::stri_join("For progress, look in the log file: ", log.file))
# command --------------------------------------------------------------------
command.arguments <- c(P, M, s, c, p, o, x, disable.gapped, v, h)
# command
system2(
command = "sstacks",
args = command.arguments,
# stdout = log.file,
stderr = log.file
)
# Summary sstacks ------------------------------------------------------------
sum <- stackr::summary_sstacks(
sstacks.log = log.file,
verbose = FALSE
)
sum <- NULL
timing <- proc.time() - timing
message("\nComputation time: ", round(timing[[3]]), " sec")
cat("############################## completed ##############################\n")
return(res)
}# end run_sstacks