diff --git a/cmd/allhic.go b/cmd/allhic.go index 352ede4..4539c94 100644 --- a/cmd/allhic.go +++ b/cmd/allhic.go @@ -50,21 +50,30 @@ func main() { Name: "extract", Usage: "Extract Hi-C link size distribution", UsageText: ` - allhic extract bamfile [options] + allhic extract bamfile fastafile [options] Extract function: Given a bamfile, the goal of the extract step is to calculate an empirical distribution of Hi-C link size based on intra-contig links. The Extract function also prepares for the latter steps of ALLHIC. `, + Flags: []cli.Flag{ + cli.StringFlag{ + Name: "RE", + Usage: "Restriction site pattern", + Value: "GATC", + }, + }, Action: func(c *cli.Context) error { - if len(c.Args()) < 1 { + if len(c.Args()) < 2 { cli.ShowSubcommandHelp(c) return cli.NewExitError("Must specify distfile, clmfile and bamfile", 1) } bamfile := c.Args().Get(0) - p := allhic.Extracter{Bamfile: bamfile} + fastafile := c.Args().Get(1) + RE := c.String("RE") + p := allhic.Extracter{Bamfile: bamfile, Fastafile: fastafile, RE: RE} p.Run() return nil }, diff --git a/extract.go b/extract.go index 9379beb..29a9167 100644 --- a/extract.go +++ b/extract.go @@ -11,6 +11,7 @@ package allhic import ( "bufio" + "bytes" "fmt" "io" "math" @@ -19,6 +20,7 @@ import ( "strings" "github.com/biogo/hts/bam" + "github.com/shenwei356/bio/seqio/fastx" ) var b = [...]uint{0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000} @@ -27,6 +29,8 @@ var s = [...]uint{1, 2, 4, 8, 16} // Extracter processes the distribution step type Extracter struct { Bamfile string + Fastafile string + RE string maxLinkDist int nBins int logFactorials []float64 @@ -114,12 +118,39 @@ func (r *Extracter) Run() { r.ExtractInterContigLinks() r.ExtractIntraContigLinks() r.Makebins() - r.WriteExtracter("distribution.txt") + r.writeRE("counts_" + r.RE + ".txt") + r.WriteDistribution("distribution.txt") r.FindEnrichmentOnContigs("enrichment.txt") r.PreComputeLogFactorials() r.FindDistanceBetweenContigs("distance.txt") } +// WriteRE writes out the number of restriction fragments, one per line +func (r *Extracter) writeRE(outfile string) { + reader, _ := fastx.NewDefaultReader(r.Fastafile) + f, _ := os.Create(outfile) + w := bufio.NewWriter(f) + defer f.Close() + totalCounts := 0 + totalBp := int64(0) + + for { + rec, err := reader.Read() + if err == io.EOF { + break + } + + count := bytes.Count(rec.Seq.Seq, []byte(r.RE)) + totalCounts += count + totalBp += int64(rec.Seq.Length()) + fmt.Printf("%s\t%d\r", rec.Name, count) + fmt.Fprintf(w, "%s\t%d\n", rec.Name, count) + } + w.Flush() + log.Noticef("RE counts (total: %d, avg 1 per %d bp) written to `%s`", + totalCounts, totalBp/int64(totalCounts), outfile) +} + // Makebins makes geometric bins and count links that fall in each bin // This heavily borrows the method form LACHESIS // https://github.com/shendurelab/LACHESIS/blob/master/src/LinkSizeExtracter.cc @@ -200,8 +231,8 @@ func (r *Extracter) Makebins() { } } -// WriteExtracter writes the link size distribution to file -func (r *Extracter) WriteExtracter(outfile string) { +// WriteDistribution writes the link +func (r *Extracter) WriteDistribution(outfile string) { f, _ := os.Create(outfile) w := bufio.NewWriter(f) defer f.Close()