diff --git a/base.go b/base.go index 5027081..c0c4f82 100644 --- a/base.go +++ b/base.go @@ -61,7 +61,7 @@ const ( // *** The following parameters are modeled after LACHESIS *** // MinREs is the minimum number of RE sites in a contig to be clustered (CLUSTER_MIN_RE_SITES) - // MinREs = 25 + MinREs = 8 // MaxLinkDensity is the density threshold before marking a contig as 'repetitve' (CLUSTER_MAX_LINK_DENSITY) MaxLinkDensity = 2 diff --git a/cluster.go b/cluster.go index 7baf942..d2a75f3 100644 --- a/cluster.go +++ b/cluster.go @@ -218,12 +218,12 @@ func (r *Partitioner) setClusters(clusterID []int) { } r.clusters = clusters - if !(NonInformativeRatio == 0 || NonInformativeRatio > 1) { + if !(r.NonInformativeRatio == 0 || r.NonInformativeRatio > 1) { log.Errorf("NonInformativeRatio needs to either 0 or > 1") } // Now try to recover previously skipped contigs - if NonInformativeRatio == 0 { + if r.NonInformativeRatio == 0 { r.sortClusters() return } @@ -249,14 +249,10 @@ func (r *Partitioner) setClusters(clusterID []int) { sort.Slice(linkages, func(i, j int) bool { return linkages[i].avgLinkage > linkages[j].avgLinkage }) - // for _, linkage := range linkages { - // fmt.Println(r.contigs[i].name, linkage.avgLinkage, - // linkage.cID, r.clusters[linkage.cID]) - // } - passRatio := linkages[0].avgLinkage >= NonInformativeRatio && + passRatio := linkages[0].avgLinkage >= float64(r.NonInformativeRatio) && (len(linkages) == 1 || linkages[1].avgLinkage == 0 || - linkages[0].avgLinkage/linkages[1].avgLinkage >= NonInformativeRatio) + linkages[0].avgLinkage/linkages[1].avgLinkage >= float64(r.NonInformativeRatio)) if !passRatio { nFailRatio++ continue @@ -266,7 +262,7 @@ func (r *Partitioner) setClusters(clusterID []int) { } log.Noticef("setClusters summary (NonInformativeRatio = %d): nPassRatio = %d, nFailRatio = %d, nFailCluster=%d", - NonInformativeRatio, nPassRatio, nFailRatio, nFailCluster) + r.NonInformativeRatio, nPassRatio, nFailRatio, nFailCluster) // Insert the skipped contigs into clusters for contigID, cID := range skippedClusters { diff --git a/cmd/allhic.go b/cmd/allhic.go index 4ebe364..cf5b033 100644 --- a/cmd/allhic.go +++ b/cmd/allhic.go @@ -67,6 +67,24 @@ func main() { }, } + partitionFlags := []cli.Flag{ + cli.IntFlag{ + Name: "minREs", + Usage: "Minimum number of RE sites in a contig to be clustered (CLUSTER_MIN_RE_SITES in LACHESIS)", + Value: allhic.MinREs, + }, + cli.IntFlag{ + Name: "maxLinkDensity", + Usage: "Density threshold before marking contig as repetive (CLUSTER_MAX_LINK_DENSITY in LACHESIS)", + Value: allhic.MaxLinkDensity, + }, + cli.IntFlag{ + Name: "nonInformativeRatio", + Usage: "cutoff for recovering skipped contigs back into the clusters (CLUSTER_NONINFORMATIVE_RATIO in LACHESIS)", + Value: allhic.NonInformativeRatio, + }, + } + optimizeFlags := []cli.Flag{ cli.BoolFlag{ Name: "skipGA", @@ -160,28 +178,6 @@ of the "extract" command. return nil }, }, - // { - // Name: "anchor", - // Usage: "Anchor contigs based on an iterative merging method", - // UsageText: ` - // allhic anchor bamfile [options] - - // Anchor function: - // Given a bamfile, we anchor contigs based on an iterative merging method similar - // to the method used in 3D-DNA. The method is based on maximum weight matching - // of the contig linkage graph. - // `, - // Action: func(c *cli.Context) error { - // if c.NArg() < 1 { - // cli.ShowSubcommandHelp(c) - // return cli.NewExitError("Must specify bamfile", 1) - // } - // bamfile := c.Args().Get(0) - // p := allhic.Anchorer{Bamfile: bamfile} - // p.Run() - // return nil - // }, - // }, { Name: "partition", Usage: "Separate contigs into k groups", @@ -195,6 +191,7 @@ algorithm, there is an optimization goal here. The LACHESIS algorithm is a hierarchical clustering algorithm using average links. The two input files can be generated with the "extract" sub-command. `, + Flags: partitionFlags, Action: func(c *cli.Context) error { if c.NArg() < 3 { cli.ShowSubcommandHelp(c) @@ -204,7 +201,12 @@ can be generated with the "extract" sub-command. contigsfile := c.Args().Get(0) pairsFile := c.Args().Get(1) k, _ := strconv.Atoi(c.Args().Get(2)) - p := allhic.Partitioner{Contigsfile: contigsfile, PairsFile: pairsFile, K: k} + minREs := c.Int("minREs") + maxLinkDensity := c.Int("maxLinkDensity") + nonInformativeRatio := c.Int("nonInformativeRatio") + p := allhic.Partitioner{Contigsfile: contigsfile, PairsFile: pairsFile, K: k, + MinREs: minREs, MaxLinkDensity: maxLinkDensity, + NonInformativeRatio: nonInformativeRatio} p.Run() return nil }, diff --git a/partition.go b/partition.go index 67a4889..d0e5454 100644 --- a/partition.go +++ b/partition.go @@ -29,6 +29,10 @@ type Partitioner struct { clusters Clusters // Output files OutREfiles []string + // Parameters + MinREs int + MaxLinkDensity int + NonInformativeRatio int } // Run is the main function body of partition @@ -73,16 +77,16 @@ func (r *Partitioner) getRE() string { // This reads in the `counts_RE.txt` file generated by extract() func (r *Partitioner) skipContigsWithFewREs() { RE := r.getRE() - MinREs := 8 + MinREs := r.MinREs switch len(RE) { case 4: - MinREs = 32 // 32 * (4 ** 4) = 8Kb + MinREs = 4 * r.MinREs // 32 * (4 ** 4) = 8Kb case 5: - MinREs = 16 // 16 * (4 ** 5) = 16Kb + MinREs = 2 * r.MinREs // 16 * (4 ** 5) = 16Kb case 6: - MinREs = 8 // 8 * (4 ** 6) = 32Kb + MinREs = r.MinREs // 8 * (4 ** 6) = 32Kb default: - MinREs = 8 // We normally only use RE between 4 to 6 base cutter, but just in case + MinREs = r.MinREs // We normally only use RE between 4 to 6 base cutter, but just in case } log.Noticef("skipContigsWithFewREs with MinREs = %d (RE = %s)", MinREs, RE) nShort := 0 @@ -110,7 +114,7 @@ func (r *Partitioner) skipContigsWithFewREs() { // skipRepeats skip contigs likely from repetitive regions. Contigs are repetitive if they have more links // compared to the average contig. This should be run after contig length normalization. func (r *Partitioner) skipRepeats() { - log.Noticef("skipRepeats with multiplicity = %d", MaxLinkDensity) + log.Noticef("skipRepeats with multiplicity = %d", r.MaxLinkDensity) // Find the number of Hi-C links on each contig totalLinks := int64(0) N := len(r.contigs) @@ -137,7 +141,7 @@ func (r *Partitioner) skipRepeats() { } } - if factor >= MaxLinkDensity { + if factor >= float64(r.MaxLinkDensity) { fmt.Printf("Contig #%d (%s) has %.1fx the average number of Hi-C links -> MARKED REPETITIVE\n", i, contig.name, factor) nRepetitive++ @@ -153,7 +157,7 @@ func (r *Partitioner) skipRepeats() { // Note that the contigs reported as repetitive may have already been marked as skip (e.g. skipContigsWithFewREs) log.Noticef("Marked %d contigs (avg len %d) as repetitive (MaxLinkDensity = %d)", - nRepetitive, avgRepetiveLength, MaxLinkDensity) + nRepetitive, avgRepetiveLength, r.MaxLinkDensity) } // makeMatrix creates an adjacency matrix containing normalized score