Skip to content

Commit

Permalink
[partition] Add skipContigsWithFewREs()
Browse files Browse the repository at this point in the history
  • Loading branch information
tanghaibao committed Jul 5, 2018
1 parent 01bde3b commit e4ba144
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 11 deletions.
2 changes: 2 additions & 0 deletions base.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ const (
MinInterLinks = 1
// MinLinkDist is the minimum link distance we care about
MinLinkDist = 1 << 11
// MinREs is the minimum number of RE sites in a contig to be clustered
MinREs = 25
// MaxLinkDist is the maximum link distance we care about
MaxLinkDist = 1 << 27
// EffLinkDist is the link distance where we are willing to accept contig pairs
Expand Down
19 changes: 12 additions & 7 deletions extract.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"strings"

"github.com/biogo/hts/bam"
"github.com/shenwei356/bio/seq"
"github.com/shenwei356/bio/seqio/fastx"
)

Expand Down Expand Up @@ -134,17 +135,20 @@ func (r *Extracter) writeRE(outfile string) {
totalCounts := 0
totalBp := int64(0)

fmt.Fprintf(w, "#Contig\tRECounts\tLength\n")
seq.ValidateSeq = false

for {
rec, err := reader.Read()
if err == io.EOF {
break
}

count := bytes.Count(rec.Seq.Seq, []byte(r.RE))
length := rec.Seq.Length()
totalCounts += count
totalBp += int64(rec.Seq.Length())
fmt.Printf("%s\t%d\r", rec.Name, count)
fmt.Fprintf(w, "%s\t%d\n", rec.Name, count)
totalBp += int64(length)
fmt.Fprintf(w, "%s\t%d\t%d\n", rec.Name, count, length)
}
w.Flush()
log.Noticef("RE counts (total: %d, avg 1 per %d bp) written to `%s`",
Expand Down Expand Up @@ -250,16 +254,17 @@ func (r *Extracter) WriteDistribution(outfile string) {
// ContigInfo stores results calculated from FindEnrichmentOnContigs
type ContigInfo struct {
name string
length int
recounts int
length int64
nExpectedLinks float64
nObservedLinks int
lde float64
skip bool
}

// String() outputs the string representation of ContigInfo
func (r ContigInfo) String() string {
return fmt.Sprintf("%s\t%d\t%.1f\t%d\t%.4f",
r.name, r.length, r.nExpectedLinks, r.nObservedLinks, r.lde)
return fmt.Sprintf("%s\t%d\t%d", r.name, r.recounts, r.length)
}

// FindEnrichmentOnContigs determine the local enrichment of links on this contig.
Expand Down Expand Up @@ -287,7 +292,7 @@ func (r *Extracter) FindEnrichmentOnContigs(outfile string) {
} else if LDE > 5.0 {
LDE = 5.0
}
ci := ContigInfo{name: contig, length: L, nExpectedLinks: sumf(nExpectedLinks),
ci := ContigInfo{name: contig, length: int64(L), nExpectedLinks: sumf(nExpectedLinks),
nObservedLinks: nObservedLinks, lde: LDE}
fmt.Fprintln(w, ci)

Expand Down
53 changes: 49 additions & 4 deletions partition.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,21 +21,23 @@ type Partitioner struct {
Contigsfile string
Distfile string
K int
contigs []ContigInfo
contigs []*ContigInfo
contigToIdx map[string]int
matrix [][]float64
longestLength int
}

// Run is the main function body of partition
func (r *Partitioner) Run() {
r.ParseContigLines()
r.readRE()
r.contigToIdx = make(map[string]int)
for i, ci := range r.contigs {
r.contigToIdx[ci.name] = i
}
dists := r.ParseDist()
M := r.MakeMatrix(dists)
r.skipContigsWithFewREs()

clusters := Cluster(M, r.K)

for _, ids := range clusters {
Expand All @@ -50,6 +52,30 @@ func (r *Partitioner) Run() {
log.Notice("Success")
}

// skipContigsWithFewREs skip contigs with fewere than MinREs
// This reads in the `counts_RE.txt` file generated by extract()
func (r *Partitioner) skipContigsWithFewREs() {
log.Noticef("skipContigsWithFewREs with MinREs = %d", MinREs)
nShort := 0
shortRE := 0
shortLen := int64(0)

for _, contig := range r.contigs {
if contig.recounts < MinREs {
nShort++
shortRE += contig.recounts
shortLen += contig.length
contig.skip = true
}
}
avgRE, avgLen := 0.0, int64(0)
if nShort > 0 {
avgRE, avgLen = float64(shortRE)/float64(nShort), shortLen/int64(nShort)
}
log.Noticef("Marked %d contigs (avg %.1f RE sites, len %d) since they contain too few REs (MinREs = %d)",
nShort, avgRE, avgLen, MinREs)
}

// MakeMatrix creates an adjacency matrix containing normalized score
func (r *Partitioner) MakeMatrix(edges []ContigPair) [][]float64 {
M := Make2DSliceFloat64(len(r.contigs), len(r.contigs))
Expand Down Expand Up @@ -96,6 +122,25 @@ func FilterEdges(edges []ContigPair) []ContigPair {
return goodEdges
}

// readRE reads in a three-column tab-separated file
// #Contig REcounts Length
func (r *Partitioner) readRE() {
recs := ReadCSVLines(r.Contigsfile)
for _, rec := range recs {
name := rec[0]
recounts, _ := strconv.Atoi(rec[1])
length, _ := strconv.ParseInt(rec[2], 10, 64)
ci := &ContigInfo{
name: name,
recounts: recounts,
length: length,
}
r.contigs = append(r.contigs, ci)
}
log.Noticef("Loading contig RE lengths for normalization from `%s`",
r.Contigsfile)
}

// ParseContigLines imports the contig infor into a slice of ContigInfo
// ContigInfo stores the data struct of the contigfile
// #Contig Length Expected Observed LDE
Expand All @@ -113,8 +158,8 @@ func (r *Partitioner) ParseContigLines() {
nObservedLinks, _ := strconv.Atoi(rec[3])
lde, _ := strconv.ParseFloat(rec[4], 64)

ci := ContigInfo{
name: name, length: length,
ci := &ContigInfo{
name: name, length: int64(length),
nExpectedLinks: nExpectedLinks, nObservedLinks: nObservedLinks,
lde: lde,
}
Expand Down

0 comments on commit e4ba144

Please sign in to comment.