diff --git a/README.md b/README.md
index f4a0a8b..5d1209e 100644
--- a/README.md
+++ b/README.md
@@ -27,10 +27,6 @@ go install github.com/tanghaibao/allhic/cmd/allhic
## Usage
-### Prune
-
-Prune bamfile to remove weak links. WIP.
-
### Extract
Extract does a fair amount of preprocessing: 1) extract inter-contig links into a more compact form, specifically into `.clm`; 2) extract intra-contig links and build a distribution; 3) count up the restriction sites to be used in normalization (similar to LACHESIS); 4) bundles the inter-contig links into pairs of contigs.
@@ -39,6 +35,20 @@ Extract does a fair amount of preprocessing: 1) extract inter-contig links into
allhic extract tests/test.bam tests/seq.fasta.gz
```
+### Prune
+
+This prune step is **optional** for typical inbreeding diploid genomes.
+However, pruning will improve the quality of assembly of polyploid genomes.
+Prune pairs file to remove allelic/cross-allelic links.
+
+```console
+allhic prune tests/Allele.ctg.table tests/test.pairs.txt
+```
+
+Please see help string of `allhic prune` on the formatting of
+`Allele.ctg.table`.
+
+
### Partition
Given a target `k`, number of partitions, the goal of the partitioning
diff --git a/cluster.go b/cluster.go
index 011a078..8e12924 100644
--- a/cluster.go
+++ b/cluster.go
@@ -329,7 +329,7 @@ func (r *Partitioner) sortClusters() {
// printClusters shows the contents of the clusters
func (r *Partitioner) printClusters() {
- clusterfile := RemoveExt(RemoveExt(r.Distfile)) + ".clusters.txt"
+ clusterfile := RemoveExt(RemoveExt(r.PairsFile)) + ".clusters.txt"
f, _ := os.Create(clusterfile)
defer f.Close()
w := bufio.NewWriter(f)
diff --git a/cmd/allhic.go b/cmd/allhic.go
index de31f8d..4ebe364 100644
--- a/cmd/allhic.go
+++ b/cmd/allhic.go
@@ -99,28 +99,6 @@ func main() {
}
app.Commands = []cli.Command{
- {
- Name: "prune",
- Usage: "Prune bamfile to remove weak links",
- UsageText: `
- allhic prune bamfile [options]
-
-Prune function:
-Given a bamfile, the goal of the pruning step is to remove all inter-allelic
-links, then it is possible to reconstruct allele-separated assemblies.
-`,
- Action: func(c *cli.Context) error {
- if c.NArg() < 1 {
- cli.ShowSubcommandHelp(c)
- return cli.NewExitError("Must specify bamfile", 1)
- }
-
- bamfile := c.Args().Get(0)
- p := allhic.Pruner{Bamfile: bamfile}
- p.Run()
- return nil
- },
- },
{
Name: "extract",
Usage: "Extract Hi-C link size distribution",
@@ -136,7 +114,7 @@ also prepares for the latter steps of ALLHIC.
Action: func(c *cli.Context) error {
if c.NArg() < 2 {
cli.ShowSubcommandHelp(c)
- return cli.NewExitError("Must specify distfile, clmfile and bamfile", 1)
+ return cli.NewExitError("Must specify pairsFile, clmfile and bamfile", 1)
}
bamfile := c.Args().Get(0)
@@ -148,6 +126,40 @@ also prepares for the latter steps of ALLHIC.
return nil
},
},
+ {
+ Name: "prune",
+ Usage: "Prune allelic, cross-allelic and weak links",
+ UsageText: `
+ allhic prune alleles.table pairs.txt [options]
+
+Prune function:
+Given contig pairing, the goal of the prune step is to remove all inter-allelic
+links, then it is possible to reconstruct allele-separated assemblies in the following
+steps. The alleles.table file contains tab-separated columns containing contigs that
+are considered "allelic". For example:
+
+Chr10 18902 tig00030660 tig00003333
+Chr10 35071 tig00038687 tig00038686 tig00065419
+
+These lines means that at certain locations in the genome (typically based on synteny
+to a closely-related genome), several contigs are considered allelic. The Hi-C links
+between these contigs are removed, in addition, other contigs linking to these contigs
+simultaneously would only consider one single-best edge. The "pairs.txt" file is the output
+of the "extract" command.
+`,
+ Action: func(c *cli.Context) error {
+ if c.NArg() < 2 {
+ cli.ShowSubcommandHelp(c)
+ return cli.NewExitError("Must specify alleles.table and pairs.txt", 1)
+ }
+
+ allelesFile := c.Args().Get(0)
+ pairsFile := c.Args().Get(1)
+ p := allhic.Pruner{AllelesFile: allelesFile, PairsFile: pairsFile}
+ p.Run()
+ return nil
+ },
+ },
// {
// Name: "anchor",
// Usage: "Anchor contigs based on an iterative merging method",
@@ -172,7 +184,7 @@ also prepares for the latter steps of ALLHIC.
// },
{
Name: "partition",
- Usage: "Separate bamfile into k groups",
+ Usage: "Separate contigs into k groups",
UsageText: `
allhic partition counts_RE.txt pairs.txt k [options]
@@ -186,13 +198,13 @@ can be generated with the "extract" sub-command.
Action: func(c *cli.Context) error {
if c.NArg() < 3 {
cli.ShowSubcommandHelp(c)
- return cli.NewExitError("Must specify distfile", 1)
+ return cli.NewExitError("Must specify countsFile, pairsFile and value k", 1)
}
contigsfile := c.Args().Get(0)
- distfile := c.Args().Get(1)
+ pairsFile := c.Args().Get(1)
k, _ := strconv.Atoi(c.Args().Get(2))
- p := allhic.Partitioner{Contigsfile: contigsfile, Distfile: distfile, K: k}
+ p := allhic.Partitioner{Contigsfile: contigsfile, PairsFile: pairsFile, K: k}
p.Run()
return nil
},
@@ -332,7 +344,7 @@ A convenience driver function. Chain the following steps sequentially.
Action: func(c *cli.Context) error {
if c.NArg() < 3 {
cli.ShowSubcommandHelp(c)
- return cli.NewExitError("Must specify distfile, clmfile and bamfile", 1)
+ return cli.NewExitError("Must specify pairsFile, clmfile and bamfile", 1)
}
bamfile := c.Args().Get(0)
@@ -354,7 +366,7 @@ A convenience driver function. Chain the following steps sequentially.
// Partition into k groups
banner(fmt.Sprintf("Partition into %d groups", k))
partitioner := allhic.Partitioner{Contigsfile: extractor.OutContigsfile,
- Distfile: extractor.OutPairsfile, K: k}
+ PairsFile: extractor.OutPairsfile, K: k}
partitioner.Run()
// Optimize the k groups separately
diff --git a/partition.go b/partition.go
index 888fd48..b1e4b31 100644
--- a/partition.go
+++ b/partition.go
@@ -20,7 +20,7 @@ import (
// Partitioner converts the bamfile into a matrix of link counts
type Partitioner struct {
Contigsfile string
- Distfile string
+ PairsFile string
K int
contigs []*ContigInfo
contigToIdx map[string]int
@@ -158,7 +158,7 @@ func (r *Partitioner) skipRepeats() {
// makeMatrix creates an adjacency matrix containing normalized score
func (r *Partitioner) makeMatrix() {
- edges := r.parseDist()
+ edges := parseDist(r.PairsFile)
N := len(r.contigs)
M := Make2DSliceInt64(N, N)
longestSquared := int64(r.longestRE) * int64(r.longestRE)
@@ -220,14 +220,14 @@ func (r *Partitioner) splitRE() {
}
}
-// parseDist imports the edges of the contig into a slice of DistLine
-// DistLine stores the data structure of the distfile
+// parseDist imports the edges of the contig into a slice of ContigPair
+// ContigPair stores the data structure of the distfile
// #X Y Contig1 Contig2 RE1 RE2 ObservedLinks ExpectedLinksIfAdjacent
// 1 44 idcChr1.ctg24 idcChr1.ctg51 6612 1793 12 121.7
// 1 70 idcChr1.ctg24 idcChr1.ctg52 6612 686 2 59.3
-func (r *Partitioner) parseDist() []ContigPair {
+func parseDist(pairsFile string) []ContigPair {
var edges []ContigPair
- recs := ReadCSVLines(r.Distfile)
+ recs := ReadCSVLines(pairsFile)
for _, rec := range recs {
ai, _ := strconv.Atoi(rec[0])
diff --git a/prune.go b/prune.go
index 4f4a394..161d228 100644
--- a/prune.go
+++ b/prune.go
@@ -9,13 +9,59 @@
package allhic
+import (
+ "bufio"
+ "fmt"
+ "io"
+ "strings"
+)
+
// Pruner processes the pruning step
type Pruner struct {
- Bamfile string
+ AllelesFile string
+ PairsFile string
}
+// AlleleGroup stores the contig names that are considered allelic
+type AlleleGroup []string
+
// Run calls the pruning steps
func (r *Pruner) Run() {
- log.Errorf("Prune function is still under development. Please use `./prune` for now.")
- log.Errorf("Usage: ./prune -i Allele.ctg.table -b bam.list -r draft.asm.fasta")
+ edges := parseDist(r.PairsFile)
+ alleleGroups := parseAllelesTable(r.AllelesFile)
+ fmt.Println(edges[0])
+ fmt.Println(alleleGroups[0])
+}
+
+// parseAllelesTable imports the contig allelic relationship
+// File is a tab-separated file that looks like the following:
+// Chr10 18902 tig00030660 tig00003333
+// Chr10 35071 tig00038687 tig00038686 tig00065419
+func parseAllelesTable(allelesFile string) []AlleleGroup {
+ log.Noticef("Parse alleles table `%s`", allelesFile)
+ fh := mustOpen(allelesFile)
+ defer fh.Close()
+
+ reader := bufio.NewReader(fh)
+
+ var data []AlleleGroup
+ for {
+ row, err := reader.ReadString('\n')
+ row = strings.TrimSpace(row)
+ if row == "" && err == io.EOF {
+ break
+ }
+ if err != nil {
+ log.Fatal(err)
+ }
+ words := strings.Split(row, "\t")
+ if len(words) <= 3 { // Must have at least 4 fields, i.e. 1 pair
+ continue
+ }
+ alleleGroup := make(AlleleGroup, len(words)-2)
+ copy(alleleGroup, words[2:])
+ data = append(data, alleleGroup)
+ }
+
+ return data
}