-
Notifications
You must be signed in to change notification settings - Fork 0
/
mapCaps.R
executable file
·157 lines (149 loc) · 5.59 KB
/
mapCaps.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#' Map the data from 5' profiling techniques
#'
#' @rdname mapCaps
#' @param CSobject An object of class \code{\link{CapSet}}
#' @param genomeIndex character. Path to the Subread index file. Should end with the basename of the index.
#' @param outdir character. Output directory path
#' @param ncores integer. Number of cores/threads to use for mapping.
#' @param logfile character. A log file to write the processing message.
#' @param externalGTF character. provide external annotation file in `GTF` format , if present
#' to increase alignment accuracy
#'
#' @return modified CapSet object with mapping information. Mapped and sorted BAM
#' files are saved in `outdir`.
#'
#' @importFrom utils capture.output
#' @importFrom methods validObject is
#' @importFrom Rsamtools sortBam indexBam countBam ScanBamParam scanBamFlag
#'
#' @examples
#' \dontrun{
#' # before mapping :
#' # 1. Create a CapSet object
#' # 2. de-multiplex the fastqs
#'
#' # load a previously saved CapSet object
#' cs <- exampleCSobject()
#'
#' # map the data (not available on windows)
#' library(Rsubread)
#' dir.create("bam")
#' buildindex(basename = "dm6", reference = "/path/to/dm6_genome.fa")
#' cs <- mapCaps(cs, genomeIndex = "dm6", outdir = "bam", nthreads = 10)
#'
#' }
#'
setMethod("mapCaps",
signature = "CapSet",
function(
CSobject,
genomeIndex,
outdir,
externalGTF,
ncores,
logfile) {
## extract info
sampleInfo <- sampleInfo(CSobject)
expMethod <- CSobject@expMethod
# test whether the CapSet object has demultiplexed fastqs
demult <- !(is.null(sampleInfo$demult_R1)) # TRUE/FALSE
if (demult) {
if (sum(vapply(sampleInfo$demult_R1, file.exists, logical(1))) != nrow(sampleInfo)) {
stop(
"One or more demultiplxed fastq files don't exist.
Please check file paths under sampleInfo(CSobject)"
)
}
}
if (demult) {
samplelist <- sampleInfo$samples
R1_list <- as.character(sampleInfo$demult_R1)
R2_list <- as.character(sampleInfo$demult_R2)
} else {
samplelist <- list("raw")
R1_list <- CSobject@fastq_R1
R2_list <- CSobject@fastq_R2
}
# get annotation if provided
if (!is.null(externalGTF)) {
useAnnot <- TRUE
} else {
useAnnot <- FALSE
}
mapstat <- mapply(function(sample, R1, R2) {
message(paste0("Mapping sample : ", sample))
# If R2 = NA, convert it to NULL such that subread uses single-end
if (is.na(R2)) R2 <- NULL
# Align using RSubread
tmpout <- file.path(outdir, paste0(sample, ".tmp.bam"))
capture.output(
Rsubread::subjunc(
index = genomeIndex,
# change here
readfile1 = R1,
readfile2 = R2,
output_file = tmpout,
annot.ext = externalGTF,
isGTF = TRUE,
# annotation (v1.28 onwards)
useAnnotation = useAnnot,
annot.inbuilt = NULL,
# neglected
GTF.featureType = "exon",
GTF.attrType = "gene_id",
chrAliases = NULL,
input_format = "gzFASTQ",
output_format = "BAM",
nsubreads = 20,
TH1 = 1,
TH2 = 1,
maxMismatches = 3,
nthreads = ncores,
indels = 5,
complexIndels = FALSE,
phredOffset = 33,
unique = TRUE,
nBestLocations = 1,
minFragLength = 10,
maxFragLength = 600,
PE_orientation = "fr",
nTrim5 = 0,
nTrim3 = 0,
readGroupID = NULL,
readGroup = NULL,
color2base = FALSE,
# subjunc-specific
reportAllJunctions = FALSE,
),
file = logfile,
append = TRUE
)
# Sort and Index
message("Sorting and Indexing")
dest <- file.path(outdir, sample)
sortBam(file = tmpout, destination = dest) # adds .bam suffix
indexBam(paste0(dest, ".bam"))
file.remove(tmpout)
# Get mapping stats
stat <- countBam(paste0(dest, ".bam"),
param = ScanBamParam(
flag = scanBamFlag(
isUnmappedQuery = FALSE,
isFirstMateRead = TRUE,
isSecondaryAlignment = FALSE
)
))[, 5:6] # "file" and "records"
stat$file <- as.character(stat$file)
stat$records <- as.integer(stat$records)
return(stat)
}, samplelist, R1_list, R2_list)
# edit sampleinfo of CapSet
si <- sampleInfo(CSobject)
maptable <- as.data.frame(t(mapstat))
si$mapped_file <- file.path(outdir, as.character(maptable$file))
si$num_mapped <- as.integer(maptable$records)
sampleInfo(CSobject) <- si
validObject(CSobject)
return(CSobject)
}
)