
Commit

[partition] Add partitionFlags
tanghaibao committed Sep 5, 2018
1 parent 4d7935d commit 8185b0a
Showing 4 changed files with 43 additions and 41 deletions.
2 changes: 1 addition & 1 deletion base.go
@@ -61,7 +61,7 @@ const (
// *** The following parameters are modeled after LACHESIS ***

// MinREs is the minimum number of RE sites in a contig to be clustered (CLUSTER_MIN_RE_SITES)
- // MinREs = 25
+ MinREs = 8

// MaxLinkDensity is the density threshold before marking a contig as 'repetitive' (CLUSTER_MAX_LINK_DENSITY)
MaxLinkDensity = 2
14 changes: 5 additions & 9 deletions cluster.go
@@ -218,12 +218,12 @@ func (r *Partitioner) setClusters(clusterID []int) {
}
r.clusters = clusters

- if !(NonInformativeRatio == 0 || NonInformativeRatio > 1) {
+ if !(r.NonInformativeRatio == 0 || r.NonInformativeRatio > 1) {
log.Errorf("NonInformativeRatio needs to either 0 or > 1")
}

// Now try to recover previously skipped contigs
- if NonInformativeRatio == 0 {
+ if r.NonInformativeRatio == 0 {
r.sortClusters()
return
}
@@ -249,14 +249,10 @@ func (r *Partitioner) setClusters(clusterID []int) {
sort.Slice(linkages, func(i, j int) bool {
return linkages[i].avgLinkage > linkages[j].avgLinkage
})
- // for _, linkage := range linkages {
- // fmt.Println(r.contigs[i].name, linkage.avgLinkage,
- // linkage.cID, r.clusters[linkage.cID])
- // }

- passRatio := linkages[0].avgLinkage >= NonInformativeRatio &&
+ passRatio := linkages[0].avgLinkage >= float64(r.NonInformativeRatio) &&
(len(linkages) == 1 || linkages[1].avgLinkage == 0 ||
- linkages[0].avgLinkage/linkages[1].avgLinkage >= NonInformativeRatio)
+ linkages[0].avgLinkage/linkages[1].avgLinkage >= float64(r.NonInformativeRatio))
if !passRatio {
nFailRatio++
continue
@@ -266,7 +262,7 @@ func (r *Partitioner) setClusters(clusterID []int) {
}

log.Noticef("setClusters summary (NonInformativeRatio = %d): nPassRatio = %d, nFailRatio = %d, nFailCluster=%d",
- NonInformativeRatio, nPassRatio, nFailRatio, nFailCluster)
+ r.NonInformativeRatio, nPassRatio, nFailRatio, nFailCluster)

// Insert the skipped contigs into clusters
for contigID, cID := range skippedClusters {
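
The pass-ratio test in the hunk above is the core of the recovery step: a skipped contig is put back into a cluster only when its strongest average linkage both clears NonInformativeRatio and dominates the runner-up by at least the same factor. A minimal standalone sketch of that rule, assuming a small linkage struct with the cID and avgLinkage fields seen in the diff and a slice already sorted by avgLinkage in descending order (illustration only, not the committed code):

package main

import "fmt"

// linkage records the average Hi-C linkage from one skipped contig to one
// candidate cluster (cID). Field names follow the diff; the real type in
// cluster.go may differ.
type linkage struct {
	cID        int
	avgLinkage float64
}

// passesRatio mirrors the passRatio expression: the best cluster must reach
// nonInformativeRatio on its own and also beat the runner-up by at least the
// same factor before the contig is recovered into it.
func passesRatio(linkages []linkage, nonInformativeRatio int) bool {
	if len(linkages) == 0 {
		return false
	}
	if linkages[0].avgLinkage < float64(nonInformativeRatio) {
		return false
	}
	// A single candidate, or a runner-up with zero linkage, passes trivially.
	if len(linkages) == 1 || linkages[1].avgLinkage == 0 {
		return true
	}
	return linkages[0].avgLinkage/linkages[1].avgLinkage >= float64(nonInformativeRatio)
}

func main() {
	linkages := []linkage{{cID: 2, avgLinkage: 6.0}, {cID: 5, avgLinkage: 1.5}}
	fmt.Println(passesRatio(linkages, 3)) // true: 6 >= 3 and 6/1.5 = 4 >= 3
}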
48 changes: 25 additions & 23 deletions cmd/allhic.go
@@ -67,6 +67,24 @@ func main() {
},
}

+ partitionFlags := []cli.Flag{
+ cli.IntFlag{
+ Name: "minREs",
+ Usage: "Minimum number of RE sites in a contig to be clustered (CLUSTER_MIN_RE_SITES in LACHESIS)",
+ Value: allhic.MinREs,
+ },
+ cli.IntFlag{
+ Name: "maxLinkDensity",
+ Usage: "Density threshold before marking a contig as repetitive (CLUSTER_MAX_LINK_DENSITY in LACHESIS)",
+ Value: allhic.MaxLinkDensity,
+ },
+ cli.IntFlag{
+ Name: "nonInformativeRatio",
+ Usage: "cutoff for recovering skipped contigs back into the clusters (CLUSTER_NONINFORMATIVE_RATIO in LACHESIS)",
+ Value: allhic.NonInformativeRatio,
+ },
+ }
+
optimizeFlags := []cli.Flag{
cli.BoolFlag{
Name: "skipGA",
@@ -160,28 +178,6 @@ of the "extract" command.
return nil
},
},
- // {
- // Name: "anchor",
- // Usage: "Anchor contigs based on an iterative merging method",
- // UsageText: `
- // allhic anchor bamfile [options]
-
- // Anchor function:
- // Given a bamfile, we anchor contigs based on an iterative merging method similar
- // to the method used in 3D-DNA. The method is based on maximum weight matching
- // of the contig linkage graph.
- // `,
- // Action: func(c *cli.Context) error {
- // if c.NArg() < 1 {
- // cli.ShowSubcommandHelp(c)
- // return cli.NewExitError("Must specify bamfile", 1)
- // }
- // bamfile := c.Args().Get(0)
- // p := allhic.Anchorer{Bamfile: bamfile}
- // p.Run()
- // return nil
- // },
- // },
{
Name: "partition",
Usage: "Separate contigs into k groups",
@@ -195,6 +191,7 @@ algorithm, there is an optimization goal here. The LACHESIS algorithm is
a hierarchical clustering algorithm using average links. The two input files
can be generated with the "extract" sub-command.
`,
+ Flags: partitionFlags,
Action: func(c *cli.Context) error {
if c.NArg() < 3 {
cli.ShowSubcommandHelp(c)
@@ -204,7 +201,12 @@
contigsfile := c.Args().Get(0)
pairsFile := c.Args().Get(1)
k, _ := strconv.Atoi(c.Args().Get(2))
- p := allhic.Partitioner{Contigsfile: contigsfile, PairsFile: pairsFile, K: k}
+ minREs := c.Int("minREs")
+ maxLinkDensity := c.Int("maxLinkDensity")
+ nonInformativeRatio := c.Int("nonInformativeRatio")
+ p := allhic.Partitioner{Contigsfile: contigsfile, PairsFile: pairsFile, K: k,
+ MinREs: minREs, MaxLinkDensity: maxLinkDensity,
+ NonInformativeRatio: nonInformativeRatio}
p.Run()
return nil
},
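
With these flags wired in, the three LACHESIS-style parameters become tunable at run time instead of compile-time constants. Based on the positional arguments and flag names in the diff, an invocation would presumably look something like `allhic partition <counts_RE.txt> <pairs.txt> <k> --minREs 10 --maxLinkDensity 3 --nonInformativeRatio 2`, where the file names, k and the flag values are placeholders and the exact flag placement depends on how the cli package parses arguments; omitting a flag falls back to the allhic.MinREs, allhic.MaxLinkDensity and allhic.NonInformativeRatio defaults.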
20 changes: 12 additions & 8 deletions partition.go
@@ -29,6 +29,10 @@ type Partitioner struct {
clusters Clusters
// Output files
OutREfiles []string
+ // Parameters
+ MinREs int
+ MaxLinkDensity int
+ NonInformativeRatio int
}

// Run is the main function body of partition
@@ -73,16 +77,16 @@ func (r *Partitioner) getRE() string {
// This reads in the `counts_RE.txt` file generated by extract()
func (r *Partitioner) skipContigsWithFewREs() {
RE := r.getRE()
- MinREs := 8
+ MinREs := r.MinREs
switch len(RE) {
case 4:
- MinREs = 32 // 32 * (4 ** 4) = 8Kb
+ MinREs = 4 * r.MinREs // 32 * (4 ** 4) = 8Kb
case 5:
- MinREs = 16 // 16 * (4 ** 5) = 16Kb
+ MinREs = 2 * r.MinREs // 16 * (4 ** 5) = 16Kb
case 6:
- MinREs = 8 // 8 * (4 ** 6) = 32Kb
+ MinREs = r.MinREs // 8 * (4 ** 6) = 32Kb
default:
- MinREs = 8 // We normally only use RE between 4 to 6 base cutter, but just in case
+ MinREs = r.MinREs // We normally only use RE between 4 to 6 base cutter, but just in case
}
log.Noticef("skipContigsWithFewREs with MinREs = %d (RE = %s)", MinREs, RE)
nShort := 0
@@ -110,7 +114,7 @@ func (r *Partitioner) skipContigsWithFewREs() {
// skipRepeats skips contigs likely from repetitive regions. Contigs are repetitive if they have more links
// compared to the average contig. This should be run after contig length normalization.
func (r *Partitioner) skipRepeats() {
log.Noticef("skipRepeats with multiplicity = %d", MaxLinkDensity)
log.Noticef("skipRepeats with multiplicity = %d", r.MaxLinkDensity)
// Find the number of Hi-C links on each contig
totalLinks := int64(0)
N := len(r.contigs)
@@ -137,7 +141,7 @@ }
}
}

- if factor >= MaxLinkDensity {
+ if factor >= float64(r.MaxLinkDensity) {
fmt.Printf("Contig #%d (%s) has %.1fx the average number of Hi-C links -> MARKED REPETITIVE\n",
i, contig.name, factor)
nRepetitive++
@@ -153,7 +157,7 @@

// Note that the contigs reported as repetitive may have already been marked as skip (e.g. skipContigsWithFewREs)
log.Noticef("Marked %d contigs (avg len %d) as repetitive (MaxLinkDensity = %d)",
- nRepetitive, avgRepetiveLength, MaxLinkDensity)
+ nRepetitive, avgRepetiveLength, r.MaxLinkDensity)
}

// makeMatrix creates an adjacency matrix containing normalized score
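
The skipContigsWithFewREs hunk replaces the hard-coded 32/16/8 cutoffs with multiples of the user-supplied MinREs, so the effective minimum scales with the restriction-site length: a k-base site occurs roughly once per 4^k bp, which is what the 8 Kb / 16 Kb / 32 Kb comments refer to. A minimal standalone sketch of that scaling under an assumed helper name (illustration only, not the committed code):

package main

import "fmt"

// effectiveMinREs scales a base MinREs cutoff by the restriction-site length,
// mirroring the switch in skipContigsWithFewREs. With the default base of 8
// this reproduces the 32/16/8 values the old code hard-coded for 4-, 5- and
// 6-base cutters.
func effectiveMinREs(baseMinREs int, reSite string) int {
	switch len(reSite) {
	case 4:
		return 4 * baseMinREs // 32 sites * ~256 bp/site ≈ 8 Kb
	case 5:
		return 2 * baseMinREs // 16 sites * ~1 Kb/site ≈ 16 Kb
	case 6:
		return baseMinREs // 8 sites * ~4 Kb/site ≈ 32 Kb
	default:
		// REs are normally 4- to 6-base cutters; fall back to the base value.
		return baseMinREs
	}
}

func main() {
	// GATC (4-cutter), GANTC (5-cutter) and AAGCTT (6-cutter) are example sites.
	for _, re := range []string{"GATC", "GANTC", "AAGCTT"} {
		fmt.Printf("RE=%s -> effective MinREs=%d\n", re, effectiveMinREs(8, re))
	}
}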
