-
Notifications
You must be signed in to change notification settings - Fork 7
/
consensus.go
220 lines (201 loc) · 7.42 KB
/
consensus.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
package sam
import (
"fmt"
"github.com/vertgenlab/gonomics/dna"
"github.com/vertgenlab/gonomics/numbers"
)
// consensusType represents the possible types of consensus variants in a Pile,
// which includes Bases, INDELs, or undefined (in the case where the Pile contains no data).
type consensusType byte
const (
Base consensusType = 0
Insertion consensusType = 1
Deletion consensusType = 2
Undefined consensusType = 3
)
// Consensus contains information on the most observed variant in a Pile struct.
type Consensus struct {
Base dna.Base
Insertion []dna.Base
Deletion int
Type consensusType
}
// String for debug.
func (c Consensus) String() string {
var typeString string
switch c.Type {
case 0:
typeString = "Base"
case 1:
typeString = "Insertion"
case 2:
typeString = "Deletion"
case 3:
typeString = "Undefined"
}
return fmt.Sprintf("Base: %s. Insertion: %s. Deletion: %v. Type: %s.", dna.BaseToString(c.Base), dna.BasesToString(c.Insertion), c.Deletion, typeString)
}
// PileConsensus returns a Consensus struct from an input Pile struct
// representing information about the consensus variant observed in that Pile.
// An insertion is called as the consensus if the count number for the max insertion if
// maxCount > insertionThreshold * (total counts to bases).
func PileConsensus(p Pile, substitutionsOnly bool, insertionThreshold float64) Consensus {
// first we check if consensus is a base
var max int = p.CountF[dna.A] + p.CountR[dna.A]
tiedConsensus := make([]Consensus, 1)
tiedConsensus[0] = Consensus{Base: dna.A, Type: Base}
max, tiedConsensus = getMaxBase(p, max, dna.C, tiedConsensus)
max, tiedConsensus = getMaxBase(p, max, dna.G, tiedConsensus)
max, tiedConsensus = getMaxBase(p, max, dna.T, tiedConsensus)
if substitutionsOnly { //legacy feature
//max, tiedConsensus = getMaxBase(p, max, dna.Gap, tiedConsensus) Dan I know that this is a legacy featuer but it's not technically substitutionsOnly, coordinates will change in the output
if max < 1 {
return Consensus{Type: Undefined}
}
return tiedConsensus[numbers.RandIntInRange(0, len(tiedConsensus))] //we don't need to check the length because this wil return tiedConsensus[0] even if length is 1.
} else {
max, tiedConsensus = getMaxDeletion(p, max, tiedConsensus)
if max < 1 {
return Consensus{Type: Undefined}
}
return getMaxInsertion(p, tiedConsensus, insertionThreshold)
}
}
func getDeletionCounts(p Pile) int {
var answer = 0
var val int
for _, val = range p.DelCountF {
answer += val
}
for _, val = range p.DelCountR {
answer += val
}
return answer
}
// helper function of PileConsensus, finds the max insertion in a Pile struct.
// InsThreshold is the proportion of Base reads for which an insertion needs to be observed to be called.
func getMaxInsertion(p Pile, tiedConsensus []Consensus, InsThreshold float64) Consensus {
var count int
var inMap bool
var seenOnPosStrand = make(map[string]int, 0)
var deletionSum = getDeletionCounts(p)
var totalBaseCounts = p.CountF[0] + p.CountF[1] + p.CountF[2] + p.CountF[3] + p.CountR[0] + p.CountR[1] + p.CountR[2] + p.CountR[3] + deletionSum
var insertionThreshold = int(InsThreshold * float64(totalBaseCounts))
var maxInsertionScore, deletionScore = 0, 0
for i := range p.InsCountF {
seenOnPosStrand[i] = 1
count = p.InsCountF[i]
if _, inMap = p.InsCountR[i]; inMap {
count += p.InsCountR[i]
}
switch tiedConsensus[0].Type {
case Base:
if count > insertionThreshold {
tiedConsensus = tiedConsensus[:1]
tiedConsensus[0].Type = Insertion
tiedConsensus[0].Insertion = dna.StringToBases(i)
maxInsertionScore = count
} //note no ties for insertions. If the insertion is equal to the insertionThreshold, it is not called
case Deletion:
deletionScore = p.DelCountF[tiedConsensus[0].Deletion] + p.DelCountR[tiedConsensus[0].Deletion]
if count > deletionScore {
tiedConsensus = tiedConsensus[:1]
tiedConsensus[0].Type = Insertion
tiedConsensus[0].Insertion = dna.StringToBases(i)
maxInsertionScore = count
} //note no ties for insertions. If the insertion is equal to the insertionThreshold, it is not called
case Insertion:
if count > maxInsertionScore {
tiedConsensus = tiedConsensus[:1]
tiedConsensus[0].Insertion = dna.StringToBases(i)
maxInsertionScore = count
} else if count == maxInsertionScore {
tiedConsensus = append(tiedConsensus, Consensus{Base: tiedConsensus[0].Base, Type: Insertion, Insertion: dna.StringToBases(i)})
}
default:
return Consensus{Type: Undefined}
}
}
//we have to check the p.InsCountR map for any insertions observed only on the minus strand.
for i := range p.InsCountR {
if _, inMap = seenOnPosStrand[i]; !inMap {
count = p.InsCountR[i] //we don't have to sum since we are guaranteed not to have seen this insertion on the positive strand.
switch tiedConsensus[0].Type {
case Base:
if count > insertionThreshold {
tiedConsensus = tiedConsensus[:1]
tiedConsensus[0].Type = Insertion
tiedConsensus[0].Insertion = dna.StringToBases(i)
maxInsertionScore = count
}
case Deletion:
deletionScore = p.DelCountF[tiedConsensus[0].Deletion] + p.DelCountR[tiedConsensus[0].Deletion]
if count > deletionScore {
tiedConsensus = tiedConsensus[:1]
tiedConsensus[0].Type = Insertion
tiedConsensus[0].Insertion = dna.StringToBases(i)
maxInsertionScore = count
}
case Insertion:
if count > maxInsertionScore {
tiedConsensus = tiedConsensus[:1]
tiedConsensus[0].Insertion = dna.StringToBases(i)
maxInsertionScore = count
} else if count == maxInsertionScore {
tiedConsensus = append(tiedConsensus, Consensus{Base: tiedConsensus[0].Base, Type: Insertion, Insertion: dna.StringToBases(i)})
}
default:
return Consensus{Type: Undefined}
}
}
}
return tiedConsensus[numbers.RandIntInRange(0, len(tiedConsensus))]
}
// helper function of PileConsensus, finds the max deletion in a Pile struct.
func getMaxDeletion(p Pile, currMax int, tiedConsensus []Consensus) (int, []Consensus) {
var count, i int
var inMap bool
var seenOnPosStrand = make(map[int]int, 0)
for i = range p.DelCountF {
seenOnPosStrand[i] = 1
count = p.DelCountF[i]
if _, inMap = p.DelCountR[i]; inMap {
count += p.DelCountR[i]
}
if count > currMax {
tiedConsensus = tiedConsensus[:1]
tiedConsensus[0] = Consensus{Deletion: i, Type: Deletion}
currMax = count
}
if count == currMax {
tiedConsensus = append(tiedConsensus, Consensus{Deletion: i, Type: Deletion})
}
}
for i = range p.DelCountR {
if _, inMap = seenOnPosStrand[i]; !inMap {
count = p.DelCountR[i]
if count > currMax {
tiedConsensus = tiedConsensus[:1]
tiedConsensus[0] = Consensus{Deletion: i, Type: Deletion}
currMax = count
}
if count == currMax {
tiedConsensus = append(tiedConsensus, Consensus{Deletion: i, Type: Deletion})
}
}
}
return currMax, tiedConsensus
}
// getMaxBase is a helper function of PileConsensus, finds the consensus base in a Pile struct.
func getMaxBase(p Pile, currMax int, testBase dna.Base, tiedConsensus []Consensus) (int, []Consensus) {
var count int = p.CountF[testBase] + p.CountR[testBase]
if count > currMax {
tiedConsensus = tiedConsensus[:1] // reset tied bases
tiedConsensus[0] = Consensus{Base: testBase, Type: Base}
return count, tiedConsensus
}
if count == currMax {
tiedConsensus = append(tiedConsensus, Consensus{Base: testBase, Type: Base})
}
return currMax, tiedConsensus
}