From 1c635b99c728a60c46fe7ac663a57b0f417163fa Mon Sep 17 00:00:00 2001 From: Haibao Tang Date: Tue, 26 Jun 2018 12:02:36 -0700 Subject: [PATCH] [anchor] Add iterative merges (not working yet) --- anchor.go | 82 +++++++++++++++++++++++++++++++------------------------ graph.go | 53 ++++++++++++++++++++++++----------- 2 files changed, 83 insertions(+), 52 deletions(-) diff --git a/anchor.go b/anchor.go index 0701e97..46ee832 100644 --- a/anchor.go +++ b/anchor.go @@ -40,15 +40,49 @@ type Link struct { apos, bpos int // Positions } +// iterations controls how many merges are we doing +const iterations = 2 + // Run kicks off the merging algorithm func (r *Anchorer) Run() { + var G Graph r.ExtractInterContigLinks() - G := r.makeGraph() - G = r.makeConfidenceGraph(G) - r.generatePathAndCycle(G) + paths := r.makeInitialPath() + for i := 0; i < iterations; i++ { + log.Noticef("Starting iteration %d with %d paths", i, len(paths)) + G = r.makeGraph(paths) + G = r.makeConfidenceGraph(G) + paths = r.generatePathAndCycle(G) + fmt.Println(paths) + } + log.Notice("Success") } +// makeInitialPath starts the initial construction of Path object, with one +// contig per Path (trivial Path) +func (r *Anchorer) makeInitialPath() []Path { + // Initially make every contig a single Path object + paths := make([]Path, len(r.contigs)) + r.registry = make(Registry) + for i, contig := range r.contigs { + paths[i] = Path{ + contigs: []*Contig{contig}, + orientations: []int{1}, + } + } + + return paths +} + +// registerPaths stores the mapping between contig to node +func (r *Anchorer) registerPaths(paths []Path) { + nodes := make([]Node, 2*len(paths)) + for i, path := range paths { + path.bisect(r.registry, &nodes[2*i], &nodes[2*i+1]) + } +} + // ExtractInterContigLinks extracts links from the Bamfile func (r *Anchorer) ExtractInterContigLinks() { fh, _ := os.Open(r.Bamfile) @@ -115,7 +149,16 @@ func (r *Anchorer) ExtractInterContigLinks() { sort.Slice(contig.links, func(i, j int) bool { return contig.links[i].apos < contig.links[j].apos }) + // Sanity check - this produced inconsistent pairing + // if contig.name == "idcChr1.ctg434" { + // for _, link := range contig.links { + // if link.b.name == "idcChr1.ctg433" { + // fmt.Println(link.a.name, link.b.name, link.apos, link.bpos) + // } + // } + // } } + // Write intra-links to .dis file for contig, links := range intraLinks { links = unique(links) @@ -169,39 +212,6 @@ type Graph map[*Node]map[*Node]float64 // We iterate through 1 or 2 ranges per contig ID type Registry map[*Contig][]Range -// makeGraph makes a contig linkage graph -func (r *Anchorer) makeGraph() Graph { - // Initially make every contig a single Path object - paths := make([]Path, len(r.contigs)) - nodes := make([]Node, 2*len(r.contigs)) - r.registry = make(Registry) - for i, contig := range r.contigs { - path := Path{ - contigs: []*Contig{contig}, - orientations: []int{1}, - } - paths[i] = path - path.bisect(r.registry, &nodes[2*i], &nodes[2*i+1]) - } - - G := make(Graph) - // Go through the links for each node and compile edges - for _, contig := range r.contigs { - for _, link := range contig.links { - a, b := r.linkToNodes(link) - r.insertEdge(G, a, b) - r.insertEdge(G, b, a) - } - } - nEdges := 0 - for _, node := range G { - nEdges += len(node) - } - log.Noticef("Graph contains %d nodes and %d edges", len(G), nEdges) - // fmt.Println(G) - return G -} - // contigToNode takes as input contig and position, returns the nodeID func (r *Anchorer) contigToNode(contig *Contig, pos int) *Node { for _, rr := range r.registry[contig] { diff --git a/graph.go b/graph.go index 2d73e67..6ccadb0 100644 --- a/graph.go +++ b/graph.go @@ -10,7 +10,6 @@ package allhic import ( - "fmt" "math" "sort" ) @@ -31,8 +30,28 @@ func (r *Edge) isSister() bool { return r.weight == 0 } -// updateGraph takes input a list of paths and fuse the nodes -func (r *Anchorer) updateGraph(G Graph) Graph { +// makeGraph makes a contig linkage graph +func (r *Anchorer) makeGraph(paths []Path) Graph { + G := make(Graph) + r.registerPaths(paths) + // Go through the links for each node and compile edges + for _, contig := range r.contigs { + for _, link := range contig.links { + a, b := r.linkToNodes(link) + r.insertEdge(G, a, b) + r.insertEdge(G, b, a) + // if (link.a.name == "idcChr1.ctg433" && link.b.name == "idcChr1.ctg434") || + // (link.a.name == "idcChr1.ctg434" && link.b.name == "idcChr1.ctg433") { + // fmt.Println("***", link.a.name, link.b.name, link.apos, link.bpos, + // a, b, G[a][b], G[b][a]) + // } + } + } + nEdges := 0 + for _, node := range G { + nEdges += len(node) + } + log.Noticef("Graph contains %d nodes and %d edges", len(G), nEdges) return G } @@ -92,11 +111,11 @@ func getSecondLargest(a, b []float64) float64 { // generatePathAndCycle makes new paths by merging the unique extensions // in the graph -func (r *Anchorer) generatePathAndCycle(G Graph) { - fmt.Println(G) +func (r *Anchorer) generatePathAndCycle(G Graph) []Path { visited := map[*Node]bool{} var isCycle bool nodeToPath := make(map[*Node]*Path) + paths := []Path{} for a := range G { if _, ok := visited[a]; ok { continue @@ -105,22 +124,23 @@ func (r *Anchorer) generatePathAndCycle(G Graph) { path1, isCycle = dfs(G, a, path1, visited, true) if isCycle { path1 = breakCycle(path1) - mergePath(path1, nodeToPath) + paths = append(paths, mergePath(path1, nodeToPath)) continue } delete(visited, a) path2, _ = dfs(G, a, path2, visited, false) path1 = append(reversePath(path1), path2...) - mergePath(path1, nodeToPath) + paths = append(paths, mergePath(path1, nodeToPath)) } - log.Noticef("A total of %d nodes mapped to new path", len(nodeToPath)) + log.Noticef("A total of %d nodes mapped to %d paths", len(nodeToPath), len(paths)) + return paths } // mergePath converts a single edge path into a node path -func mergePath(path []Edge, nodeToPath map[*Node]*Path) { +func mergePath(path []Edge, nodeToPath map[*Node]*Path) Path { p := []string{} - s := &Path{} + s := Path{} for _, edge := range path { if !edge.isSister() { continue @@ -134,18 +154,19 @@ func mergePath(path []Edge, nodeToPath map[*Node]*Path) { // TODO: take orientations into account s.contigs = append(s.contigs, ep.contigs...) s.orientations = append(s.orientations, ep.orientations...) - nodeToPath[edge.a] = s - nodeToPath[edge.b] = s + nodeToPath[edge.a] = &s + nodeToPath[edge.b] = &s // Special care needed for reverse orientation - for _, contig := range edge.a.path.contigs { + for _, contig := range ep.contigs { p = append(p, tag+contig.name) } } s.Length() - fmt.Println(path) - fmt.Println(s) - fmt.Println(p) + // fmt.Println(path) + // fmt.Println(s) + // fmt.Println(p) + return s } // reversePath reverses a single edge path into its reverse direction