From 14020a4e7d5a93a8d5b1fe6ef3d821abd8c962c1 Mon Sep 17 00:00:00 2001 From: Haibao Tang Date: Fri, 29 Jun 2018 10:17:30 -0700 Subject: [PATCH] [anchor] Use int64 instead of float64 --- anchor.go | 48 +++++++++++++++++++-------------------- base.go | 2 ++ graph.go | 67 +++++++++++++++++++++++++++++++++---------------------- 3 files changed, 66 insertions(+), 51 deletions(-) diff --git a/anchor.go b/anchor.go index 9dca7b6..38db19c 100644 --- a/anchor.go +++ b/anchor.go @@ -32,31 +32,31 @@ type Anchorer struct { // Contig stores the name and length of each contig type Contig struct { name string - length int + length int64 links []*Link path *Path - start int - orientation int // 1 => +, -1 => - + start int64 + orientation int8 // 1 => +, -1 => - segments []Range } // Link contains a specific inter-contig link type Link struct { a, b *Contig // Contig ids - apos, bpos int // Positions + apos, bpos int64 // Positions } // Path is a collection of ordered contigs type Path struct { contigs []*Contig // List of contigs LNode, RNode *Node // Two nodes at each end - length int // Cumulative length of all contigs + length int64 // Cumulative length of all contigs } // Range tracks contig:start-end type Range struct { - start int - end int + start int64 + end int64 node *Node } @@ -182,7 +182,7 @@ func (r *Anchorer) ExtractInterContigLinks() { for _, ref := range refs { contig := Contig{ name: ref.Name(), - length: ref.Len(), + length: int64(ref.Len()), } r.contigs = append(r.contigs, &contig) r.nameToContig[contig.name] = &contig @@ -218,7 +218,7 @@ func (r *Anchorer) ExtractInterContigLinks() { // An inter-contig link a.links = append(a.links, &Link{ - a: a, b: b, apos: apos, bpos: bpos, + a: a, b: b, apos: int64(apos), bpos: int64(bpos), }) interTotal++ } @@ -264,7 +264,7 @@ func (r *Path) String() string { } // contigToNode takes as input contig and position, returns the nodeID -func contigToNode(contig *Contig, pos int) *Node { +func contigToNode(contig *Contig, pos int64) *Node { for _, rr := range contig.segments { // multiple 'segments' if pos >= rr.start && pos < rr.end { return rr.node @@ -284,21 +284,21 @@ func (r *Anchorer) linkToNodes(link *Link) (*Node, *Node) { // insertEdge adds just one link to the graph func (r *Anchorer) insertEdge(G Graph, a, b *Node) { if _, aok := G[a]; aok { - G[a][b] += 1.0 + G[a][b]++ } else { - G[a] = map[*Node]float64{b: 1.0} + G[a] = map[*Node]int64{b: 1} } } // findMidPoint find the center of a path for bisect -func (r *Path) findMidPoint() (int, int) { +func (r *Path) findMidPoint() (int, int64) { r.length = 0 for _, contig := range r.contigs { r.length += contig.length } - midpos := r.length / 2 - cumsize := 0 + midpos := int64(math.Ceil(float64(r.length) / 2)) + cumsize := int64(0) i := 0 var contig *Contig for i, contig = range r.contigs { @@ -367,7 +367,7 @@ func (r *Path) bisect() { // makeContigStarts returns starts of contigs within a path func (r *Anchorer) makeContigStarts() { - pos := 0 + pos := int64(0) for _, contig := range r.path.contigs { contig.start = pos pos += contig.length @@ -375,17 +375,17 @@ func (r *Anchorer) makeContigStarts() { } // findBin returns the i-th bin along the path -func findBin(contig *Contig, pos, resolution int) int { +func findBin(contig *Contig, pos, resolution int64) int { offset := pos if contig.orientation < 0 { offset = contig.length - pos } - return (contig.start + offset) / resolution + return int((contig.start + offset) / resolution) } // splitPath takes a path and look at joins that are weak // Scans the path at certain resolution r, and search radius is d -func (r *Anchorer) splitPath(res, d int) { +func (r *Anchorer) splitPath(res int64, d int) { // Look at all intra-path links, then store the counts to a // sparse matrix, indexed by i, j, C[i, j] = # of links between // i-th locus and j-th locus @@ -466,13 +466,13 @@ func inspectGaps(path *Path) { } // identifyGap prints out all the gaps that lie within the bin -func (r *Anchorer) identifyGap(breakPoints []int, res int) { - contigStart := 0 +func (r *Anchorer) identifyGap(breakPoints []int, res int64) { + contigStart := int64(0) j := 0 for i, contig := range r.path.contigs { - if contigStart >= breakPoints[j]*res { - if contigStart >= (breakPoints[j]+1)*res { - for j < len(breakPoints) && contigStart >= (breakPoints[j]+1)*res { + if contigStart >= int64(breakPoints[j])*res { + if contigStart >= int64(breakPoints[j]+1)*res { + for j < len(breakPoints) && contigStart >= int64(breakPoints[j]+1)*res { j++ } // Exhausted, terminate diff --git a/base.go b/base.go index e348e36..a4ffcb8 100644 --- a/base.go +++ b/base.go @@ -54,6 +54,8 @@ const ( EffLinkDist = 1 << 19 // BinNorm is a ratio to make the link density human readable BinNorm = 1000000.0 + // BigNorm is a big integer multiplier so we don't have to mess with float64 + BigNorm = int64(1000000000000) // EPS is that Q must be larger than this value, used in cluster.go EPS = 1e-5 // MinAvgLinkage is the minimum cutoff for merging clusters diff --git a/graph.go b/graph.go index 2230988..5afb43e 100644 --- a/graph.go +++ b/graph.go @@ -10,6 +10,7 @@ package allhic import ( + "fmt" "math" "sort" ) @@ -23,11 +24,11 @@ type Node struct { // Edge is between two nodes in a graph type Edge struct { a, b *Node - weight float64 + weight int64 } // Graph is an adjacency list -type Graph map[*Node]map[*Node]float64 +type Graph map[*Node]map[*Node]int64 // isLNode returns if a Node is an LNode (5`-end`) func (r *Node) isLNode() bool { @@ -40,8 +41,12 @@ func (r *Node) isRNode() bool { } // length returns the length of a node, which is half of the path length -func (r *Node) length() float64 { - return float64(r.path.length) / 2 +func (r *Node) length() int64 { + ans := int64(r.path.length) / 2 + if r.isRNode() { + return ans + } + return r.path.length - ans } // isReverse returns the orientation of an edge @@ -76,10 +81,10 @@ func (r *Anchorer) makeGraph(paths PathSet) Graph { } } - // Normalize against the product of two paths + // Normalize against the product of lengths of two paths for a, nb := range G { for b, score := range nb { - G[a][b] = score / a.length() / b.length() + G[a][b] = score * BigNorm / (a.length() * b.length()) } } @@ -99,10 +104,10 @@ func (r *Anchorer) makeGraph(paths PathSet) Graph { // 1 - calculate the link density as links divided by the product of two contigs // 2 - calculate the confidence as the weight divided by the second largest edge func (r *Anchorer) makeConfidenceGraph(G Graph) Graph { - twoLargest := map[*Node][]float64{} + twoLargest := map[*Node][]int64{} for a, nb := range G { - first, second := 0.0, 0.0 + first, second := int64(0), int64(0) for _, score := range nb { if score > first { first, second = score, first @@ -110,42 +115,51 @@ func (r *Anchorer) makeConfidenceGraph(G Graph) Graph { second = score } } - twoLargest[a] = []float64{first, second} + twoLargest[a] = []int64{first, second} } + // fmt.Println(G) confidenceGraph := Graph{} // Now a second pass to compute confidence for a, nb := range G { for b, weight := range nb { secondLargest := getSecondLargest(twoLargest[a], twoLargest[b]) - confidence := weight / secondLargest - if confidence > 1 { + if secondLargest == 0 { + continue + } + confidence := weight * BigNorm / secondLargest + if confidence > BigNorm { if _, ok := confidenceGraph[a]; ok { confidenceGraph[a][b] = confidence } else { - confidenceGraph[a] = map[*Node]float64{b: confidence} + confidenceGraph[a] = map[*Node]int64{b: confidence} } // There can be ties in terms of scores - // if len(confidenceGraph[a]) > 1 { - // fmt.Println(a.path, "<=>", b.path, G[a], "\n", confidenceGraph[a], - // twoLargest[a], twoLargest[b], secondLargest) - // for k, v := range confidenceGraph[a] { - // fmt.Println(k, k.path, k.sister, v) - // } - // } + if len(confidenceGraph[a]) > 1 { + // if confidence < BigNorm*101/100 { + fmt.Println(a.path, "<=>", b.path, a.isLNode(), b.isLNode(), + G[a], "\n", confidenceGraph[a], + twoLargest[a], twoLargest[b], secondLargest) + for k, v := range confidenceGraph[a] { + fmt.Println(k, k.path, k.sister, v) + } + } } } } + // fmt.Println(confidenceGraph) return confidenceGraph } // Get the second largest number without sorting // a, b are both 2-item arrays -func getSecondLargest(a, b []float64) float64 { +func getSecondLargest(a, b []int64) int64 { A := append(a, b...) - sort.Float64s(A) + sort.Slice(A, func(i, j int) bool { + return A[i] < A[j] + }) // Some edge will appear twice in this list so need to remove it - for i := 2; i >= 0; i-- { // Is precision an issue here? + for i := 2; i >= 0; i-- { if A[i] < A[3] { return A[i] } @@ -209,8 +223,9 @@ func (r *Anchorer) generatePathAndCycle(G Graph) PathSet { path2, _ = dfs(G, a, path2, visited, false) path1 = append(reversePath(path1), path2...) } - + // fmt.Println("path1", path1) path = mergePath(path1) + // fmt.Println("path from", a, path) for _, contig := range path.contigs { contig.path = path } @@ -232,8 +247,6 @@ func mergePath(path []Edge) *Path { s.contigs = append(s.contigs, ep.contigs...) } s.bisect() - // fmt.Println(path) - // fmt.Println(s) return s } @@ -251,7 +264,7 @@ func reversePath(path []Edge) []Edge { // breakCycle breaks a single edge path into two edge paths // breakage occurs at the weakest link func breakCycle(path []Edge) []Edge { - minI, minWeight := 0, math.MaxFloat64 + minI, minWeight := 0, int64(math.MaxInt64) for i, edge := range path { if edge.weight > 1 && edge.weight < minWeight { minI, minWeight = i, edge.weight @@ -277,7 +290,7 @@ func dfs(G Graph, a *Node, path []Edge, visited map[*Node]bool, visitSister bool } if nb, ok := G[a]; ok { var maxb *Node - maxWeight := 0.0 // for tie breaking + maxWeight := int64(0) // for tie breaking for b, weight := range nb { if weight > maxWeight { maxb, maxWeight = b, weight