/
parallelSites.R
329 lines (321 loc) · 13.9 KB
/
parallelSites.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
#' @importFrom ape Ntip
#' @rdname parallelSites
#' @title Mutation across multiple phylogenetic lineages
#' @description A site may have mutated on parallel lineages. Mutation can occur
#' on the same site across the phylogenetic lineages solved by
#' \code{\link{lineagePath}}. The site will be considered mutated in parallel
#' if the mutation occurs on the non-overlap part of more than two lineages.
#' The amino acid/nucleotide before and after the mutation can be allowed
#' different on different lineages or only the exact same mutations are
#' considered.
#' @param x A \code{\link{lineagePath}} or a \code{\link{sitesMinEntropy}}
#' object.
#' @param minSNP The minimum number of mutations to be qualified as parallel on
#' at least two lineages. The default is 1.
#' @param mutMode The strategy for finding parallel site. The default \code{all}
#' is to consider any mutation regardless of the amino acid/nucleotide before
#' and after mutation; Or \code{exact} to force mutation to be the same; Or
#' \code{pre}/\code{post} to select the site having amino acid/nucleotide
#' before/after mutation.
#' @param ... The arguments in \code{\link{sitesMinEntropy}}.
#' @return A \code{parallelSites} object
#' @export
#' @examples
#' data(zikv_tree_reduced)
#' data(zikv_align_reduced)
#' tree <- addMSA(zikv_tree_reduced, alignment = zikv_align_reduced)
#' paths <- lineagePath(tree)
#' x <- sitesMinEntropy(paths)
#' parallelSites(x)
parallelSites <- function(x, ...) {
UseMethod("parallelSites")
}
#' @rdname parallelSites
#' @export
parallelSites.lineagePath <- function(x,
minSNP = NULL,
mutMode = c("all", "exact",
"pre", "post"),
...) {
minEntropy <- sitesMinEntropy.lineagePath(x, ...)
res <- parallelSites.sitesMinEntropy(x = minEntropy,
minSNP = minSNP,
mutMode = mutMode)
return(res)
}
#' @rdname parallelSites
#' @export
parallelSites.sitesMinEntropy <- function(x,
minSNP = NULL,
mutMode = c("all", "exact",
"pre", "post"),
...) {
paths <- attr(x, "paths")
align <- attr(paths, "align")
reference <- attr(paths, "msaNumbering")
unambiguous <- .unambiguousChars(paths)
# Set the 'minSNP' constrain
if (is.null(minSNP)) {
minSNP <- attr(attr(x, "paths"), "minSize")
} else if (minSNP != 1) {
tree <- as.phylo.phyMSAmatched(paths)
nTips <- Ntip(tree)
minSNP <- .checkMinEffectiveSize(
x = minSNP,
varName = "minSNP",
totalSize = nTips
)
}
# The index of 'mutName' attribute to be used for applying constrains
mutNameIndex <- switch(
match.arg(mutMode),
"all" = 2,
"exact" = 4,
"pre" = 1,
"post" = 3
)
# To collect the result by site
res <- list()
# Collect the sporadic and fixation mutation on each lineage
sporadicParallel <- list()
fixationParallel <- list()
mixedParallel <- list()
# Iterate entropy minimization result for each lineage. This part is to
# remove the duplicate mutations on the overlapped part of the lineages
for (segs in x) {
# To collect mutation on the current lineage
sporadicMut <- list()
fixationMut <- list()
# The site have mutated on at least two lineages
for (siteName in names(segs)) {
# Convert the site to index of multiple sequence alignment
site <- reference[as.integer(siteName)]
# The entropy minimization of a single site on the lineage
seg <- segs[[siteName]]
# Find all sporadic mutations by comparing AA/nucleotide of each tip
# with the fixed one
for (tips in seg) {
# The fixed AA/nucleotide of the group
fixedAA <- attr(tips, "AA")
# Skip the group if the fixed AA/nucleotide is ambiguous or gap
if (!fixedAA %in% unambiguous ||
length(attr(tips, "aaSummary")) < 2)
next
# The real AA/nucleotide of each tip named with tip name
tipsAA <- substr(x = align[tips],
start = site,
stop = site)
# The tips with AA/nucleotide different from the fixed one
mut <- lapply(
X = which(tipsAA != fixedAA & tipsAA %in% unambiguous),
FUN = function(i) {
# The tip name to return
mutTips <- names(tipsAA[i])
# The mutation info used later
mutName <- c(fixedAA, siteName, tipsAA[[i]])
mutName <- c(mutName,
paste0(mutName, collapse = ""))
# Add the tip name
attr(mutTips, "mutName") <- mutName
attr(mutTips, "fixed") <- FALSE
return(mutTips)
}
)
# Add the mutation info to sporadic mutation collection of the
# lineage
for (mutNode in names(mut)) {
mutTips <- list(mut[[mutNode]])
if (mutNode %in% names(sporadicMut)) {
sporadicMut[[mutNode]] <- c(sporadicMut[[mutNode]],
mutTips)
} else {
sporadicMut[[mutNode]] <- mutTips
}
}
}
# There has to be at least one fixation on the lineage and at least
# two of the mutation is neither gap nor ambiguous character
# Collect all fixation mutations if any for the site
if (.qualifiedFixation(seg, unambiguous)) {
# Compare the fixed AA/nucleotide between two adjacent groups
for (i in seq_along(seg)[-1]) {
prevTips <- seg[[i - 1]]
currTips <- seg[[i]]
prevAA <- attr(prevTips, "AA")
currAA <- attr(currTips, "AA")
mutNode <- attr(currTips, "node")
# The mutation info used later
mutName <- c(prevAA, siteName, currAA)
mutName <- c(mutName,
paste0(mutName, collapse = ""))
# Get the tip names
mutTips <- names(align[currTips])
# Add the tip name
attr(mutTips, "mutName") <- mutName
attr(mutTips, "fixed") <- TRUE
# Add the mutation info to fixation mutation collection
mutTips <- list(mutTips)
if (mutNode %in% names(fixationMut)) {
fixationMut[[mutNode]] <- c(fixationMut[[mutNode]],
mutTips)
} else {
fixationMut[[mutNode]] <- mutTips
}
}
}
}
mixedRef <- c(sporadicMut, fixationMut)
# Compare mutation result on each pair of lineages
for (mixedMut in mixedParallel) {
res <- .collectParallelSites(mixedRef,
mixedMut,
res,
mutNameIndex,
minSNP)
}
# Add the mutation result to the collection
mixedParallel <- c(mixedParallel, list(mixedRef))
sporadicParallel <- c(sporadicParallel, list(sporadicMut))
fixationParallel <- c(fixationParallel, list(fixationMut))
}
attr(res, "paths") <- paths
attr(res, "clustersByPath") <- attr(x, "clustersByPath")
class(res) <- "parallelSites"
return(res)
}
.collectParallelSites <- function(mutatNodes,
otherNodes,
res,
mutNameIndex,
minSNP) {
allMutatNodes <- names(mutatNodes)
allOtherNodes <- names(otherNodes)
mutatSitesFull <- .groupMutationsBySites(mutatNodes,
allMutatNodes)
otherSitesFull <- .groupMutationsBySites(otherNodes,
allOtherNodes)
# Use the ancestral node to to remove overlapped part
mutatDiff <- setdiff(allMutatNodes, allOtherNodes)
otherDiff <- setdiff(allOtherNodes, allMutatNodes)
# There has to have non-overlap part for both lineage
if (length(mutatDiff) != 0 && length(otherDiff) != 0) {
mutatSites <- .groupMutationsBySites(mutatNodes, mutatDiff)
otherSites <- .groupMutationsBySites(otherNodes, otherDiff)
candidateSites <- intersect(names(mutatSites),
names(otherSites))
# Check if the candidate sites are qualified because the uniqueness
# judged by ancestral node could be incorrect if entropy minimization
# result is off on the two lineages
for (site in candidateSites) {
# All candidate groups of tip(s) on the two lineages for the site
mutat <- mutatSites[[site]]
other <- otherSites[[site]]
# Assume all tip groups in one set has actual parallel site
mutatQualifed <- rep(TRUE, length(mutat))
# Iterate one set of groups and compare with the other group set
for (i in seq_along(mutat)) {
mTips <- mutat[[i]]
for (j in seq_along(other)) {
oTips <- other[[j]]
if (length(intersect(mTips, oTips)) != 0) {
# Disqualify the group if it has overlap with the group
# in another set
mutatQualifed[i] <- FALSE
# The group with overlap in another set is dropped
other <- other[-j]
# There is no point looking at the rest
break
}
}
}
# The site is qualified only when both sets pass the check
if (any(mutatQualifed) && length(other) != 0) {
# Apply the constrains to get the mutations that meet the
# constrain of "minSNP" and filtering "mutMode"
mutat <- mutat[which(mutatQualifed)]
# The qualified 'mutName' (indexed according to the constrain
# mode) on the two lineages
qualifedAA <- intersect(
.qualifiedMutAA(mutat, mutNameIndex, minSNP),
.qualifiedMutAA(other, mutNameIndex, minSNP)
)
# Collect all mutation names that qualify the constrain
qualifiedMut <- c(
.selectMutByAA(mutat, mutNameIndex, qualifedAA),
.selectMutByAA(other, mutNameIndex, qualifedAA)
)
if (length(qualifiedMut) > 0) {
# Only the tips with qualified mutation will be added to the
# result. The result still keeps the mutation separate of
# the two lineages
toAdd <- c(
Filter(function(mut) {
attr(mut, "mutName")[4] %in% qualifiedMut
}, mutatSitesFull[[site]]),
Filter(function(mut) {
attr(mut, "mutName")[4] %in% qualifiedMut
}, otherSitesFull[[site]])
)
# The result of a site will contain qualified tips on the
# pair of lineages
res[[site]] <- c(res[[site]], list(toAdd))
# Re-assign or assign the class
attr(res[[site]], "site") <- site
class(res[[site]]) <- "sitePara"
}
}
}
}
return(res)
}
.groupMutationsBySites <- function(nodeGrouped, nodeNames) {
nodeGrouped <- nodeGrouped[nodeNames]
res <- list()
for (node in names(nodeGrouped)) {
for (mut in nodeGrouped[[node]]) {
site <- attr(mut, "mutName")[2]
mut <- list(mut)
names(mut) <- node
res[[site]] <- c(res[[site]], mut)
}
}
return(res)
}
.qualifiedMutAA <- function(mutat, i, minSNP) {
# Fixation mutation will ignore 'minSNP' constrain
fixationMutAA <- character()
sporadicMutAA <- character()
for (mutTips in mutat) {
# Get the indexed 'mutName' for deciding parallelity
mutAA <- attr(mutTips, "mutName")[i]
if (attr(mutTips, "fixed")) {
fixationMutAA <- c(fixationMutAA, mutAA)
} else {
sporadicMutAA <- c(sporadicMutAA, mutAA)
}
}
# Summarize the number of each amino acid/nucleotide of each sporadic
# mutation
c(fixationMutAA, names(which(table(sporadicMutAA) >= minSNP)))
}
.selectMutByAA <- function(mutat, i, qualifiedAA) {
# Select the mutation name according to the result of qualification
res <- vapply(
X = mutat,
FUN = function(mut) {
mutName <- attr(mut, "mutName")
if (mutName[i] %in% qualifiedAA) {
return(mutName[4])
}
return(NA_character_)
},
FUN.VALUE = character(1)
)
res[which(!is.na(res))]
}
#' @rdname parallelSites
#' @export
parallelSites.paraFixSites <- function(x, ...) {
res <- attr(x, "allParaSites")
return(res)
}