[assess] Add MakeProbDist()
tanghaibao committed Jun 24, 2018
1 parent 4549ee3 commit 572509a
Showing 1 changed file with 134 additions and 15 deletions.
149 changes: 134 additions & 15 deletions assess.go
@@ -13,6 +13,7 @@ import (
"bufio"
"fmt"
"io"
"math"
"os"
"sort"
"strconv"
@@ -31,13 +32,15 @@ import (
// Step 3. Normalize the likelihood to get the posterior probability, under the
// implicit assumption of an equal prior probability for each contig
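//
// With a uniform prior, this normalization is just Bayes' rule (restated from
// the step above):
//   P(fwd | links) = L(links | fwd) / (L(links | fwd) + L(links | rev))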
type Assesser struct {
Bamfile string
Bedfile string
Seqid string
contigs []BedLine
contigIntraLinks []int // Contig link sizes for all intra-contig links
contigInterLinksFwd [][]int // Contig link sizes assuming same dir
contigInterLinksRev [][]int // Contig link sizes assuming other dir
Bamfile string
Bedfile string
Seqid string
linkDensity []float64
binStarts []int
contigs []BedLine
intraLinks []int // Contig link sizes for all intra-contig links
interLinksFwd [][]int // Contig link sizes assuming same dir
interLinksRev [][]int // Contig link sizes assuming other dir
}

// BedLine stores the information from each line in the bedfile
@@ -46,12 +49,15 @@ type BedLine struct {
start int
end int
name string
size int
}

// Run executes the Assesser pipeline
func (r *Assesser) Run() {
r.ReadBed()
r.ExtractContigLinks()
r.Makebins()
r.MakeProbDist()
r.ComputeLikelihood()
r.ComputePosteriorProb()
}
@@ -74,12 +80,14 @@ func (r *Assesser) ReadBed() {
continue
}
start, _ := strconv.Atoi(words[1])
start-- // Some bedfiles use 1-based starts; convert to 0-based
end, _ := strconv.Atoi(words[2])
r.contigs = append(r.contigs, BedLine{
seqid: seqid,
start: start - 1,
start: start,
end: end,
name: words[3],
size: end - start,
})
}

@@ -102,11 +110,12 @@ func (r *Assesser) ExtractContigLinks() {
defer br.Close()

// Import links into pairs of contigs
r.contigInterLinksFwd = make([][]int, len(r.contigs))
r.contigInterLinksRev = make([][]int, len(r.contigs))
r.interLinksFwd = make([][]int, len(r.contigs))
r.interLinksRev = make([][]int, len(r.contigs))
var a, b int
nIntraLinks := 0
nInterLinks := 0
nSkippedTooShort := 0
ci := 0 // Use this to index into r.contigs, the current contig under consideration
for {
rec, err := br.Read()
@@ -140,11 +149,20 @@ func (r *Assesser) ExtractContigLinks() {
fmt.Println(r.contigs[ci], a, nIntraLinks, nInterLinks)
}

// TODO: remove this
if ci > 10 {
break
}

link := abs(a - b)
if link < MinLinkDist {
nSkippedTooShort++
continue
}
// For an intra-contig link this is easy: just store the distance between the two ends
// An intra-contig link
if checkInRange(b, r.contigs[ci].start, r.contigs[ci].end) {
r.contigIntraLinks = append(r.contigIntraLinks, link)
r.intraLinks = append(r.intraLinks, link)
nIntraLinks++
continue
}
@@ -156,14 +174,115 @@ func (r *Assesser) ExtractContigLinks() {
// To check this is correct: link_start = contig_start ==> contig_end,
// and link_start = contig_end ==> contig_start
// An inter-contig link
r.contigInterLinksFwd[ci] = append(r.contigInterLinksFwd[ci], link)
r.interLinksFwd[ci] = append(r.interLinksFwd[ci], link)
// Assuming flipped orientation
link = abs(r.contigs[ci].start + r.contigs[ci].end - a - b)
r.contigInterLinksRev[ci] = append(r.contigInterLinksRev[ci], link)
r.interLinksRev[ci] = append(r.interLinksRev[ci], link)
nInterLinks++
}
log.Noticef("A total of %d intra-contig links and %d inter-contig links imported",
nIntraLinks, nInterLinks)
log.Noticef("A total of %d intra-contig links and %d inter-contig links imported (%d skipped, too short)",
nIntraLinks, nInterLinks, nSkippedTooShort)
}
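
For reference, abs and checkInRange are called above but defined elsewhere in the package. A minimal sketch of their assumed semantics:

// abs returns the absolute value of an int (assumed helper, not part of this diff)
func abs(a int) int {
	if a < 0 {
		return -a
	}
	return a
}

// checkInRange reports whether pos lies within [start, end)
// (assumed semantics for this sketch)
func checkInRange(pos, start, end int) bool {
	return start <= pos && pos < end
}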

// LinkBin takes a link distance and converts it to a binID
func (r *Assesser) LinkBin(dist int) int {
if dist < MinLinkDist {
return -1
}
distOverMin := dist / MinLinkDist
log2i := uintLog2(uint(distOverMin))
log2f := uintLog2Frac(float64(dist) / float64(int(MinLinkDist)<<log2i))
return int(16*log2i + log2f)
}
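
LinkBin divides each power-of-two range of link distances into 16 geometric sub-bins. The helpers uintLog2 and uintLog2Frac are not part of this diff; a sketch, under the assumption that uintLog2 returns floor(log2(x)) and uintLog2Frac maps a ratio in [1, 2) onto the 16 sub-bins:

// uintLog2 returns floor(log2(x)) for x >= 1 (assumed helper, not in this diff)
func uintLog2(x uint) uint {
	var l uint
	for x > 1 {
		x >>= 1
		l++
	}
	return l
}

// uintLog2Frac maps a ratio in [1, 2) to a sub-bin index in [0, 16),
// i.e. floor(16 * log2(ratio)) (assumed helper, not in this diff)
func uintLog2Frac(ratio float64) uint {
	return uint(math.Log2(ratio) * 16)
}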

// BinSize returns the size of the i-th bin
func (r *Assesser) BinSize(i int) int {
return r.binStarts[i+1] - r.binStarts[i]
}

// Makebins makes geometric bins and counts the links that fall in each bin
// This heavily borrows the method from LACHESIS
// https://github.com/shendurelab/LACHESIS/blob/master/src/LinkSizeExtracter.cc
func (r *Assesser) Makebins() {
// Step 1: make geometrically sized bins
// We fit 16 bins into each power of 2
linkRange := math.Log2(float64(MaxLinkDist) / float64(MinLinkDist))
nBins := int(math.Ceil(linkRange * 16))

maxLinkDist := math.MinInt32
for _, link := range r.intraLinks {
if link > maxLinkDist {
maxLinkDist = link
}
}

for i := 0; 16*i <= nBins; i++ {
jpower := 1.0
for j := 0; j < 16 && 16*i+j <= nBins; j++ {
binStart := MinLinkDist << uint(i)
r.binStarts = append(r.binStarts, int(float64(binStart)*jpower))
jpower *= GeometricBinSize
}
}

// Step 2: calculate assayable sequence length
// Find the length of assayable intra-contig sequence in each bin
intraContigLinkRange := math.Log2(float64(maxLinkDist) / float64(MinLinkDist))
nIntraContigBins := int(math.Ceil(intraContigLinkRange * 16))

binNorms := make([]int, nBins)
nLinks := make([]int, nBins)
for _, contig := range r.contigs {
for j := 0; j < nIntraContigBins; j++ {
z := contig.size - r.binStarts[j]
if z < 0 {
break
}
binNorms[j] += z
}
}

// Step 3: loop through all links and tabulate the counts
for _, link := range r.intraLinks {
bin := r.LinkBin(link)
if bin == -1 {
continue
}
nLinks[bin]++
}

// Step 4: normalize to calculate link density
r.linkDensity = make([]float64, nBins)
for i := 0; i < nIntraContigBins; i++ {
r.linkDensity[i] = float64(nLinks[i]) * BinNorm / float64(binNorms[i]) / float64(r.BinSize(i))
}

// Step 5: assume the distribution approximates 1/x for large x
topBin := nIntraContigBins - 1
nTopLinks := 0
nTopLinksNeeded := len(r.intraLinks) / 100
for ; nTopLinks < nTopLinksNeeded; topBin-- {
nTopLinks += nLinks[topBin]
}

avgLinkDensity := 0.0
for i := topBin; i < nIntraContigBins; i++ {
avgLinkDensity += r.linkDensity[i] * float64(r.BinSize(i))
}
avgLinkDensity /= float64(nIntraContigBins - topBin)

// Overwrite the values of the last few bins, or any bin with NA values
for i := 0; i < nBins; i++ {
if r.linkDensity[i] == 0 || i >= topBin {
r.linkDensity[i] = avgLinkDensity / float64(r.BinSize(i))
}
}
}
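
As a sanity check of the geometric binning, here is a self-contained example with hypothetical constants (MinLinkDist = 1000, GeometricBinSize = 2^(1/16); the real values are defined elsewhere in the package):

package main

import (
	"fmt"
	"math"
)

// Hypothetical constants for illustration only.
const (
	MinLinkDist      = 1000
	GeometricBinSize = 1.0442737824274138 // 2^(1/16)
)

// linkBin mirrors Assesser.LinkBin above, written against the hypothetical
// constants so the arithmetic can be checked in isolation.
func linkBin(dist int) int {
	if dist < MinLinkDist {
		return -1
	}
	log2i := int(math.Floor(math.Log2(float64(dist) / MinLinkDist)))
	log2f := int(math.Log2(float64(dist)/float64(MinLinkDist<<uint(log2i))) * 16)
	return 16*log2i + log2f
}

func main() {
	fmt.Println(linkBin(999))  // -1: below MinLinkDist
	fmt.Println(linkBin(1000)) // 0: the first bin starts at MinLinkDist
	fmt.Println(linkBin(2000)) // 16: there are 16 bins per doubling
	fmt.Println(linkBin(3000)) // 25: 16 + floor(16*log2(1.5)) = 16 + 9
}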

// MakeProbDist calculates the expected number of links in each bin, which is then
// normalized to make a probability distribution
func (r *Assesser) MakeProbDist() {

}
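
The function body is still empty in this commit. Going only by its doc comment, one plausible sketch (a guess, not the author's eventual implementation): scale each bin's link density by its bin size to get an expected link count, then normalize the counts to sum to 1.

// makeProbDistSketch (hypothetical name) shows what MakeProbDist might do,
// based only on the doc comment above; not the author's implementation.
// It returns the distribution rather than storing it, since this commit
// adds no field for the result.
func (r *Assesser) makeProbDistSketch() []float64 {
	probDist := make([]float64, len(r.linkDensity))
	total := 0.0
	for i, density := range r.linkDensity {
		// Expected number of links in bin i = link density * bin size
		probDist[i] = density * float64(r.BinSize(i))
		total += probDist[i]
	}
	for i := range probDist {
		probDist[i] /= total // Normalize into a probability distribution
	}
	return probDist
}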

// ComputeLikelihood computes the likelihood of link sizes assuming + orientation
