
Commit

[partition] Add partitionFlags
tanghaibao committed Sep 5, 2018
1 parent 4d7935d commit 8185b0a
Showing 4 changed files with 43 additions and 41 deletions.
2 changes: 1 addition & 1 deletion base.go
@@ -61,7 +61,7 @@ const (
// *** The following parameters are modeled after LACHESIS ***

// MinREs is the minimum number of RE sites in a contig to be clustered (CLUSTER_MIN_RE_SITES)
- // MinREs = 25
+ MinREs = 8

// MaxLinkDensity is the density threshold before marking a contig as 'repetitive' (CLUSTER_MAX_LINK_DENSITY)
MaxLinkDensity = 2
14 changes: 5 additions & 9 deletions cluster.go
@@ -218,12 +218,12 @@ func (r *Partitioner) setClusters(clusterID []int) {
}
r.clusters = clusters

- if !(NonInformativeRatio == 0 || NonInformativeRatio > 1) {
+ if !(r.NonInformativeRatio == 0 || r.NonInformativeRatio > 1) {
log.Errorf("NonInformativeRatio needs to either 0 or > 1")
}

// Now try to recover previously skipped contigs
- if NonInformativeRatio == 0 {
+ if r.NonInformativeRatio == 0 {
r.sortClusters()
return
}
@@ -249,14 +249,10 @@ func (r *Partitioner) setClusters(clusterID []int) {
sort.Slice(linkages, func(i, j int) bool {
return linkages[i].avgLinkage > linkages[j].avgLinkage
})
- // for _, linkage := range linkages {
- // fmt.Println(r.contigs[i].name, linkage.avgLinkage,
- // linkage.cID, r.clusters[linkage.cID])
- // }

- passRatio := linkages[0].avgLinkage >= NonInformativeRatio &&
+ passRatio := linkages[0].avgLinkage >= float64(r.NonInformativeRatio) &&
(len(linkages) == 1 || linkages[1].avgLinkage == 0 ||
- linkages[0].avgLinkage/linkages[1].avgLinkage >= NonInformativeRatio)
+ linkages[0].avgLinkage/linkages[1].avgLinkage >= float64(r.NonInformativeRatio))
if !passRatio {
nFailRatio++
continue
@@ -266,7 +262,7 @@ func (r *Partitioner) setClusters(clusterID []int) {
}

log.Noticef("setClusters summary (NonInformativeRatio = %d): nPassRatio = %d, nFailRatio = %d, nFailCluster=%d",
- NonInformativeRatio, nPassRatio, nFailRatio, nFailCluster)
+ r.NonInformativeRatio, nPassRatio, nFailRatio, nFailCluster)

// Insert the skipped contigs into clusters
for contigID, cID := range skippedClusters {
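
The pass-ratio test in the hunk above is the core of the recovery step: a skipped contig is put back into a cluster only when its strongest average linkage both clears NonInformativeRatio and dominates the runner-up by at least the same factor. A minimal standalone sketch of that rule, assuming a small linkage struct with the cID and avgLinkage fields seen in the diff and a slice already sorted by avgLinkage in descending order (illustration only, not the committed code):

package main

import "fmt"

// linkage records the average Hi-C linkage from one skipped contig to one
// candidate cluster (cID). Field names follow the diff; the real type in
// cluster.go may differ.
type linkage struct {
	cID        int
	avgLinkage float64
}

// passesRatio mirrors the passRatio expression: the best cluster must reach
// nonInformativeRatio on its own and also beat the runner-up by at least the
// same factor before the contig is recovered into it.
func passesRatio(linkages []linkage, nonInformativeRatio int) bool {
	if len(linkages) == 0 {
		return false
	}
	if linkages[0].avgLinkage < float64(nonInformativeRatio) {
		return false
	}
	// A single candidate, or a runner-up with zero linkage, passes trivially.
	if len(linkages) == 1 || linkages[1].avgLinkage == 0 {
		return true
	}
	return linkages[0].avgLinkage/linkages[1].avgLinkage >= float64(nonInformativeRatio)
}

func main() {
	linkages := []linkage{{cID: 2, avgLinkage: 6.0}, {cID: 5, avgLinkage: 1.5}}
	fmt.Println(passesRatio(linkages, 3)) // true: 6 >= 3 and 6/1.5 = 4 >= 3
}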
48 changes: 25 additions & 23 deletions cmd/allhic.go
@@ -67,6 +67,24 @@ func main() {
},
}

+ partitionFlags := []cli.Flag{
+ cli.IntFlag{
+ Name: "minREs",
+ Usage: "Minimum number of RE sites in a contig to be clustered (CLUSTER_MIN_RE_SITES in LACHESIS)",
+ Value: allhic.MinREs,
+ },
+ cli.IntFlag{
+ Name: "maxLinkDensity",
+ Usage: "Density threshold before marking a contig as repetitive (CLUSTER_MAX_LINK_DENSITY in LACHESIS)",
+ Value: allhic.MaxLinkDensity,
+ },
+ cli.IntFlag{
+ Name: "nonInformativeRatio",
+ Usage: "cutoff for recovering skipped contigs back into the clusters (CLUSTER_NONINFORMATIVE_RATIO in LACHESIS)",
+ Value: allhic.NonInformativeRatio,
+ },
+ }
+
optimizeFlags := []cli.Flag{
cli.BoolFlag{
Name: "skipGA",
@@ -160,28 +178,6 @@ of the "extract" command.
return nil
},
},
- // {
- // Name: "anchor",
- // Usage: "Anchor contigs based on an iterative merging method",
- // UsageText: `
- // allhic anchor bamfile [options]
-
- // Anchor function:
- // Given a bamfile, we anchor contigs based on an iterative merging method similar
- // to the method used in 3D-DNA. The method is based on maximum weight matching
- // of the contig linkage graph.
- // `,
- // Action: func(c *cli.Context) error {
- // if c.NArg() < 1 {
- // cli.ShowSubcommandHelp(c)
- // return cli.NewExitError("Must specify bamfile", 1)
- // }
- // bamfile := c.Args().Get(0)
- // p := allhic.Anchorer{Bamfile: bamfile}
- // p.Run()
- // return nil
- // },
- // },
{
Name: "partition",
Usage: "Separate contigs into k groups",
@@ -195,6 +191,7 @@ algorithm, there is an optimization goal here. The LACHESIS algorithm is
a hierarchical clustering algorithm using average links. The two input files
can be generated with the "extract" sub-command.
`,
+ Flags: partitionFlags,
Action: func(c *cli.Context) error {
if c.NArg() < 3 {
cli.ShowSubcommandHelp(c)
@@ -204,7 +201,12 @@
contigsfile := c.Args().Get(0)
pairsFile := c.Args().Get(1)
k, _ := strconv.Atoi(c.Args().Get(2))
- p := allhic.Partitioner{Contigsfile: contigsfile, PairsFile: pairsFile, K: k}
+ minREs := c.Int("minREs")
+ maxLinkDensity := c.Int("maxLinkDensity")
+ nonInformativeRatio := c.Int("nonInformativeRatio")
+ p := allhic.Partitioner{Contigsfile: contigsfile, PairsFile: pairsFile, K: k,
+ MinREs: minREs, MaxLinkDensity: maxLinkDensity,
+ NonInformativeRatio: nonInformativeRatio}
p.Run()
return nil
},
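
With these flags wired in, the three LACHESIS-style parameters become tunable at run time instead of compile-time constants. Based on the positional arguments and flag names in the diff, an invocation would presumably look something like `allhic partition <counts_RE.txt> <pairs.txt> <k> --minREs 10 --maxLinkDensity 3 --nonInformativeRatio 2`, where the file names, k and the flag values are placeholders and the exact flag placement depends on how the cli package parses arguments; omitting a flag falls back to the allhic.MinREs, allhic.MaxLinkDensity and allhic.NonInformativeRatio defaults.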
20 changes: 12 additions & 8 deletions partition.go
@@ -29,6 +29,10 @@ type Partitioner struct {
clusters Clusters
// Output files
OutREfiles []string
+ // Parameters
+ MinREs int
+ MaxLinkDensity int
+ NonInformativeRatio int
}

// Run is the main function body of partition
@@ -73,16 +77,16 @@ func (r *Partitioner) getRE() string {
// This reads in the `counts_RE.txt` file generated by extract()
func (r *Partitioner) skipContigsWithFewREs() {
RE := r.getRE()
- MinREs := 8
+ MinREs := r.MinREs
switch len(RE) {
case 4:
- MinREs = 32 // 32 * (4 ** 4) = 8Kb
+ MinREs = 4 * r.MinREs // 32 * (4 ** 4) = 8Kb
case 5:
- MinREs = 16 // 16 * (4 ** 5) = 16Kb
+ MinREs = 2 * r.MinREs // 16 * (4 ** 5) = 16Kb
case 6:
- MinREs = 8 // 8 * (4 ** 6) = 32Kb
+ MinREs = r.MinREs // 8 * (4 ** 6) = 32Kb
default:
- MinREs = 8 // We normally only use RE between 4 to 6 base cutter, but just in case
+ MinREs = r.MinREs // We normally only use RE between 4 to 6 base cutter, but just in case
}
log.Noticef("skipContigsWithFewREs with MinREs = %d (RE = %s)", MinREs, RE)
nShort := 0
@@ -110,7 +114,7 @@ func (r *Partitioner) skipContigsWithFewREs() {
// skipRepeats skips contigs likely from repetitive regions. Contigs are repetitive if they have more links
// compared to the average contig. This should be run after contig length normalization.
func (r *Partitioner) skipRepeats() {
log.Noticef("skipRepeats with multiplicity = %d", MaxLinkDensity)
log.Noticef("skipRepeats with multiplicity = %d", r.MaxLinkDensity)
// Find the number of Hi-C links on each contig
totalLinks := int64(0)
N := len(r.contigs)
@@ -137,7 +141,7 @@ }
}
}

- if factor >= MaxLinkDensity {
+ if factor >= float64(r.MaxLinkDensity) {
fmt.Printf("Contig #%d (%s) has %.1fx the average number of Hi-C links -> MARKED REPETITIVE\n",
i, contig.name, factor)
nRepetitive++
@@ -153,7 +157,7 @@

// Note that the contigs reported as repetitive may have already been marked as skip (e.g. skipContigsWithFewREs)
log.Noticef("Marked %d contigs (avg len %d) as repetitive (MaxLinkDensity = %d)",
- nRepetitive, avgRepetiveLength, MaxLinkDensity)
+ nRepetitive, avgRepetiveLength, r.MaxLinkDensity)
}

// makeMatrix creates an adjacency matrix containing normalized score
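
The skipContigsWithFewREs hunk replaces the hard-coded 32/16/8 cutoffs with multiples of the user-supplied MinREs, so the effective minimum scales with the restriction-site length: a k-base site occurs roughly once per 4^k bp, which is what the 8 Kb / 16 Kb / 32 Kb comments refer to. A minimal standalone sketch of that scaling under an assumed helper name (illustration only, not the committed code):

package main

import "fmt"

// effectiveMinREs scales a base MinREs cutoff by the restriction-site length,
// mirroring the switch in skipContigsWithFewREs. With the default base of 8
// this reproduces the 32/16/8 values the old code hard-coded for 4-, 5- and
// 6-base cutters.
func effectiveMinREs(baseMinREs int, reSite string) int {
	switch len(reSite) {
	case 4:
		return 4 * baseMinREs // 32 sites * ~256 bp/site ≈ 8 Kb
	case 5:
		return 2 * baseMinREs // 16 sites * ~1 Kb/site ≈ 16 Kb
	case 6:
		return baseMinREs // 8 sites * ~4 Kb/site ≈ 32 Kb
	default:
		// REs are normally 4- to 6-base cutters; fall back to the base value.
		return baseMinREs
	}
}

func main() {
	// GATC (4-cutter), GANTC (5-cutter) and AAGCTT (6-cutter) are example sites.
	for _, re := range []string{"GATC", "GANTC", "AAGCTT"} {
		fmt.Printf("RE=%s -> effective MinREs=%d\n", re, effectiveMinREs(8, re))
	}
}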
