Skip to content

Commit

Permalink
[extract] Export RE counts to file
Browse files Browse the repository at this point in the history
  • Loading branch information
tanghaibao committed Jul 5, 2018
1 parent 92c9f7b commit 01bde3b
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 6 deletions.
15 changes: 12 additions & 3 deletions cmd/allhic.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,21 +50,30 @@ func main() {
Name: "extract",
Usage: "Extract Hi-C link size distribution",
UsageText: `
allhic extract bamfile [options]
allhic extract bamfile fastafile [options]
Extract function:
Given a bamfile, the goal of the extract step is to calculate an empirical
distribution of Hi-C link size based on intra-contig links. The Extract function
also prepares for the latter steps of ALLHIC.
`,
Flags: []cli.Flag{
cli.StringFlag{
Name: "RE",
Usage: "Restriction site pattern",
Value: "GATC",
},
},
Action: func(c *cli.Context) error {
if len(c.Args()) < 1 {
if len(c.Args()) < 2 {
cli.ShowSubcommandHelp(c)
return cli.NewExitError("Must specify distfile, clmfile and bamfile", 1)
}

bamfile := c.Args().Get(0)
p := allhic.Extracter{Bamfile: bamfile}
fastafile := c.Args().Get(1)
RE := c.String("RE")
p := allhic.Extracter{Bamfile: bamfile, Fastafile: fastafile, RE: RE}
p.Run()
return nil
},
Expand Down
37 changes: 34 additions & 3 deletions extract.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ package allhic

import (
"bufio"
"bytes"
"fmt"
"io"
"math"
Expand All @@ -19,6 +20,7 @@ import (
"strings"

"github.com/biogo/hts/bam"
"github.com/shenwei356/bio/seqio/fastx"
)

var b = [...]uint{0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000}
Expand All @@ -27,6 +29,8 @@ var s = [...]uint{1, 2, 4, 8, 16}
// Extracter processes the distribution step
type Extracter struct {
Bamfile string
Fastafile string
RE string
maxLinkDist int
nBins int
logFactorials []float64
Expand Down Expand Up @@ -114,12 +118,39 @@ func (r *Extracter) Run() {
r.ExtractInterContigLinks()
r.ExtractIntraContigLinks()
r.Makebins()
r.WriteExtracter("distribution.txt")
r.writeRE("counts_" + r.RE + ".txt")
r.WriteDistribution("distribution.txt")
r.FindEnrichmentOnContigs("enrichment.txt")
r.PreComputeLogFactorials()
r.FindDistanceBetweenContigs("distance.txt")
}

// WriteRE writes out the number of restriction fragments, one per line
func (r *Extracter) writeRE(outfile string) {
reader, _ := fastx.NewDefaultReader(r.Fastafile)
f, _ := os.Create(outfile)
w := bufio.NewWriter(f)
defer f.Close()
totalCounts := 0
totalBp := int64(0)

for {
rec, err := reader.Read()
if err == io.EOF {
break
}

count := bytes.Count(rec.Seq.Seq, []byte(r.RE))
totalCounts += count
totalBp += int64(rec.Seq.Length())
fmt.Printf("%s\t%d\r", rec.Name, count)
fmt.Fprintf(w, "%s\t%d\n", rec.Name, count)
}
w.Flush()
log.Noticef("RE counts (total: %d, avg 1 per %d bp) written to `%s`",
totalCounts, totalBp/int64(totalCounts), outfile)
}

// Makebins makes geometric bins and count links that fall in each bin
// This heavily borrows the method form LACHESIS
// https://github.com/shendurelab/LACHESIS/blob/master/src/LinkSizeExtracter.cc
Expand Down Expand Up @@ -200,8 +231,8 @@ func (r *Extracter) Makebins() {
}
}

// WriteExtracter writes the link size distribution to file
func (r *Extracter) WriteExtracter(outfile string) {
// WriteDistribution writes the link
func (r *Extracter) WriteDistribution(outfile string) {
f, _ := os.Create(outfile)
w := bufio.NewWriter(f)
defer f.Close()
Expand Down

0 comments on commit 01bde3b

Please sign in to comment.