-
Notifications
You must be signed in to change notification settings - Fork 1
/
msa.go
276 lines (265 loc) · 8.33 KB
/
msa.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
package gfa
import (
"fmt"
"os"
"sort"
"strconv"
"github.com/biogo/biogo/alphabet"
"github.com/biogo/biogo/io/seqio/alignio"
"github.com/biogo/biogo/io/seqio/fasta"
"github.com/biogo/biogo/seq"
"github.com/biogo/biogo/seq/linear"
"github.com/biogo/biogo/seq/multi"
)
// ReadMSA will read in an MSA file and store it as a Multi (MSA)
func ReadMSA(fileName string) (*multi.Multi, error) {
// open a file
fh, err := os.Open(fileName)
if err != nil {
return nil, err
}
defer fh.Close()
// read in the MSA
r := fasta.NewReader(fh, linear.NewSeq("", nil, alphabet.DNA))
m, _ := multi.NewMulti("", nil, seq.DefaultConsensus)
msa, err := alignio.NewReader(r, m).Read()
if err != nil {
return nil, err
}
return msa, nil
}
// MSA2GFA converts an MSA to a GFA instance
func MSA2GFA(msa *multi.Multi) (*GFA, error) {
// create an empty GFA instance and then add version (1)
myGFA := NewGFA()
err := myGFA.AddVersion(1)
if err != nil {
return nil, err
}
// clean the MSA to remove consensus seq
msaCleaner(msa)
// create nodes for each unique base in every column of the alignment
msaNodes, _ := getNodes(msa)
// draw edges between nodes
if err := msaNodes.drawEdges(); err != nil {
return nil, err
}
// squash neighbouring nodes and update edges
if err := msaNodes.squashNodes(); err != nil {
return nil, err
}
// populate the GFA instance
orderedList := msaNodes.getOrderedList()
for _, nodeID := range orderedList {
node := msaNodes.nodeHolder[nodeID]
// create the segment(s)
seg, err := NewSegment([]byte(strconv.Itoa(nodeID)), []byte(node.base))
if err != nil {
return nil, err
}
seg.Add(myGFA)
// create link(s)
for outEdge := range node.outEdges {
link, err := NewLink([]byte(strconv.Itoa(nodeID)), []byte("+"), []byte(strconv.Itoa(outEdge)), []byte("+"), []byte("0M"))
if err != nil {
return nil, err
}
link.Add(myGFA)
}
}
// get paths for each MSA entry
type pathHolder struct {
seqID []byte
segments [][]byte
overlaps [][]byte
}
for _, seqID := range msaNodes.seqIDs {
tmpPath := &pathHolder{seqID: []byte(seqID)}
for _, nodeID := range orderedList {
node := msaNodes.nodeHolder[nodeID]
for _, seq := range node.parentSeqIDs {
if seq == seqID {
segment := strconv.Itoa(nodeID) + "+"
overlap := strconv.Itoa(len(node.base)) + "M"
tmpPath.segments = append(tmpPath.segments, []byte(segment))
tmpPath.overlaps = append(tmpPath.overlaps, []byte(overlap))
break
}
}
}
// add the path
path, err := NewPath(tmpPath.seqID, tmpPath.segments, tmpPath.overlaps)
if err != nil {
return nil, err
}
path.Add(myGFA)
}
return myGFA, nil
}
// msaCleaner removes the consensus entry (if present) from a MSA
func msaCleaner(msa *multi.Multi) {
for i := 0; i < msa.Rows(); i++ {
id := msa.Row(i).Name()
if id == "consensus" {
msa.Delete(i)
}
}
}
// the node type is used to store unique bases and associated IDs from each MSA column
type node struct {
parentSeqIDs []string
base string
inEdges map[int]struct{}
outEdges map[int]struct{}
}
// the msaNodes store all the nodes from a MSA
type msaNodes struct {
nodeHolder map[int]*node
seqIDs []string
}
// this function returns a sorted list of node IDs, sorted by increasing ID value
// it's used to iterate over the nodeHolder nodes in order
func (msaNodes *msaNodes) getOrderedList() []int {
list := []int{}
for node := range msaNodes.nodeHolder {
list = append(list, node)
}
sort.Ints(list)
return list
}
// getNodes is an msaNodes constructor. It moves through each column of an MSA, making a node for each unique base per column
func getNodes(msa *multi.Multi) (*msaNodes, error) {
nodeHolder := make(map[int]*node)
nodeID := 1
// get all the entry ids
ids := []string{}
for i := 0; i < msa.Rows(); i++ {
ids = append(ids, string(msa.Row(i).Name()))
}
// process each column of the MSA
for i := 0; i < msa.Len(); i++ {
columnBases := msa.Column(i, true)
// keep a record of seen bases so that identical bases can be consolidated
record := make(map[string][]string)
// process each row of the column
for rowIterator, base := range columnBases {
stringifiedBase := string(base)
if _, ok := record[stringifiedBase]; !ok {
record[stringifiedBase] = []string{ids[rowIterator]}
} else {
record[stringifiedBase] = append(record[stringifiedBase], ids[rowIterator])
}
}
// convert the record of bases and ids to a slice of nodes for each column
columnNodes := []*node{}
for base, ids := range record {
// treat all gaps as individual nodes
if base == "-" {
for _, id := range ids {
columnNodes = append(columnNodes, &node{parentSeqIDs: []string{id}, base: "", outEdges: make(map[int]struct{}), inEdges: make(map[int]struct{})})
}
} else {
columnNodes = append(columnNodes, &node{parentSeqIDs: ids, base: base, outEdges: make(map[int]struct{}), inEdges: make(map[int]struct{})})
}
}
// add the nodes for the column to the msaNodes
for _, node := range columnNodes {
nodeHolder[nodeID] = node
nodeID++
}
}
// construct the msaNodes
msaNodes := &msaNodes{nodeHolder: nodeHolder, seqIDs: ids}
return msaNodes, nil
}
// drawEdges connects neighbouring nodes derived from the same MSA entry
func (msaNodes *msaNodes) drawEdges() error {
// getFirstNode returns the nodeID for the first node in the MSA derived from a specified sequence
getFirstNode := func(seqID string) int {
for nodeID := 1; nodeID <= len(msaNodes.nodeHolder); nodeID++ {
for _, id := range msaNodes.nodeHolder[nodeID].parentSeqIDs {
if seqID == id {
return nodeID
}
}
}
return 0
}
// findNextNode returns the ID of the next node that is derived from a query MSA sequence
findNextNode := func(seqID string, startNode int) int {
for nextNode := startNode + 1; nextNode <= len(msaNodes.nodeHolder); nextNode++ {
for _, parentSeqID := range msaNodes.nodeHolder[nextNode].parentSeqIDs {
if seqID == parentSeqID {
return nextNode
}
}
}
return 0
}
// iterate over each MSA sequence, connecting edges for each one
for _, seqID := range msaNodes.seqIDs {
startNode := getFirstNode(seqID)
if startNode == 0 {
return fmt.Errorf("Node parse error: Could not identify start node for %v", seqID)
}
for {
nextNode := findNextNode(seqID, startNode)
if nextNode == 0 {
break
}
// draw edges
msaNodes.nodeHolder[startNode].outEdges[nextNode] = struct{}{}
msaNodes.nodeHolder[nextNode].inEdges[startNode] = struct{}{}
startNode = nextNode
}
}
return nil
}
// squashNodes collapses neighbouring nodes into a single node if no branches exist
func (msaNodes *msaNodes) squashNodes() error {
squashableNodes := make(map[int]int)
squashNodesSortList := []int{}
// iterate through all the nodes in order and identify squashable nodes
for nodeIterator := 1; nodeIterator <= len(msaNodes.nodeHolder); nodeIterator++ {
// don't bother squashing gaps
if msaNodes.nodeHolder[nodeIterator].base == "" {
//delete(msaNodes.nodeHolder, nodeIterator)
continue
}
// if there is only one node connected via an out edge, see if the out node has multiple in edges
if len(msaNodes.nodeHolder[nodeIterator].outEdges) == 1 {
var outNode int
for key := range msaNodes.nodeHolder[nodeIterator].outEdges {
outNode = key
}
// if the out node has only one in edge, we can squash these nodes together
if len(msaNodes.nodeHolder[outNode].inEdges) == 1 {
squashableNodes[outNode] = nodeIterator
squashNodesSortList = append(squashNodesSortList, outNode)
}
}
}
// reverse sort the identified squashable nodes and squash them
sort.Sort(sort.Reverse(sort.IntSlice(squashNodesSortList)))
for _, outNode := range squashNodesSortList {
inNode := squashableNodes[outNode]
msaNodes.nodeHolder[inNode].base += msaNodes.nodeHolder[outNode].base
delete(msaNodes.nodeHolder, outNode)
}
// remove all gaps, create new IDs and clear edges
newNodeHolder := make(map[int]*node)
nodeNamer := 1
orderedList := msaNodes.getOrderedList()
for _, i := range orderedList {
if msaNodes.nodeHolder[i].base != "" {
msaNodes.nodeHolder[i].outEdges = make(map[int]struct{})
msaNodes.nodeHolder[i].inEdges = make(map[int]struct{})
newNodeHolder[nodeNamer] = msaNodes.nodeHolder[i]
nodeNamer++
}
}
msaNodes.nodeHolder = newNodeHolder
// draw edges between the new nodes
err := msaNodes.drawEdges()
return err
}