diff --git a/README.md b/README.md index f4a0a8b..5d1209e 100644 --- a/README.md +++ b/README.md @@ -27,10 +27,6 @@ go install github.com/tanghaibao/allhic/cmd/allhic ## Usage -### Prune - -Prune bamfile to remove weak links. WIP. - ### Extract Extract does a fair amount of preprocessing: 1) extract inter-contig links into a more compact form, specifically into `.clm`; 2) extract intra-contig links and build a distribution; 3) count up the restriction sites to be used in normalization (similar to LACHESIS); 4) bundles the inter-contig links into pairs of contigs. @@ -39,6 +35,20 @@ Extract does a fair amount of preprocessing: 1) extract inter-contig links into allhic extract tests/test.bam tests/seq.fasta.gz ``` +### Prune + +This prune step is **optional** for typical inbreeding diploid genomes. +However, pruning will improve the quality of assembly of polyploid genomes. +Prune pairs file to remove allelic/cross-allelic links. + +```console +allhic prune tests/Allele.ctg.table tests/test.pairs.txt +``` + +Please see help string of `allhic prune` on the formatting of +`Allele.ctg.table`. + + ### Partition Given a target `k`, number of partitions, the goal of the partitioning diff --git a/cluster.go b/cluster.go index 011a078..8e12924 100644 --- a/cluster.go +++ b/cluster.go @@ -329,7 +329,7 @@ func (r *Partitioner) sortClusters() { // printClusters shows the contents of the clusters func (r *Partitioner) printClusters() { - clusterfile := RemoveExt(RemoveExt(r.Distfile)) + ".clusters.txt" + clusterfile := RemoveExt(RemoveExt(r.PairsFile)) + ".clusters.txt" f, _ := os.Create(clusterfile) defer f.Close() w := bufio.NewWriter(f) diff --git a/cmd/allhic.go b/cmd/allhic.go index de31f8d..4ebe364 100644 --- a/cmd/allhic.go +++ b/cmd/allhic.go @@ -99,28 +99,6 @@ func main() { } app.Commands = []cli.Command{ - { - Name: "prune", - Usage: "Prune bamfile to remove weak links", - UsageText: ` - allhic prune bamfile [options] - -Prune function: -Given a bamfile, the goal of the pruning step is to remove all inter-allelic -links, then it is possible to reconstruct allele-separated assemblies. -`, - Action: func(c *cli.Context) error { - if c.NArg() < 1 { - cli.ShowSubcommandHelp(c) - return cli.NewExitError("Must specify bamfile", 1) - } - - bamfile := c.Args().Get(0) - p := allhic.Pruner{Bamfile: bamfile} - p.Run() - return nil - }, - }, { Name: "extract", Usage: "Extract Hi-C link size distribution", @@ -136,7 +114,7 @@ also prepares for the latter steps of ALLHIC. Action: func(c *cli.Context) error { if c.NArg() < 2 { cli.ShowSubcommandHelp(c) - return cli.NewExitError("Must specify distfile, clmfile and bamfile", 1) + return cli.NewExitError("Must specify pairsFile, clmfile and bamfile", 1) } bamfile := c.Args().Get(0) @@ -148,6 +126,40 @@ also prepares for the latter steps of ALLHIC. return nil }, }, + { + Name: "prune", + Usage: "Prune allelic, cross-allelic and weak links", + UsageText: ` + allhic prune alleles.table pairs.txt [options] + +Prune function: +Given contig pairing, the goal of the prune step is to remove all inter-allelic +links, then it is possible to reconstruct allele-separated assemblies in the following +steps. The alleles.table file contains tab-separated columns containing contigs that +are considered "allelic". For example: + +Chr10 18902 tig00030660 tig00003333 +Chr10 35071 tig00038687 tig00038686 tig00065419 + +These lines means that at certain locations in the genome (typically based on synteny +to a closely-related genome), several contigs are considered allelic. The Hi-C links +between these contigs are removed, in addition, other contigs linking to these contigs +simultaneously would only consider one single-best edge. The "pairs.txt" file is the output +of the "extract" command. +`, + Action: func(c *cli.Context) error { + if c.NArg() < 2 { + cli.ShowSubcommandHelp(c) + return cli.NewExitError("Must specify alleles.table and pairs.txt", 1) + } + + allelesFile := c.Args().Get(0) + pairsFile := c.Args().Get(1) + p := allhic.Pruner{AllelesFile: allelesFile, PairsFile: pairsFile} + p.Run() + return nil + }, + }, // { // Name: "anchor", // Usage: "Anchor contigs based on an iterative merging method", @@ -172,7 +184,7 @@ also prepares for the latter steps of ALLHIC. // }, { Name: "partition", - Usage: "Separate bamfile into k groups", + Usage: "Separate contigs into k groups", UsageText: ` allhic partition counts_RE.txt pairs.txt k [options] @@ -186,13 +198,13 @@ can be generated with the "extract" sub-command. Action: func(c *cli.Context) error { if c.NArg() < 3 { cli.ShowSubcommandHelp(c) - return cli.NewExitError("Must specify distfile", 1) + return cli.NewExitError("Must specify countsFile, pairsFile and value k", 1) } contigsfile := c.Args().Get(0) - distfile := c.Args().Get(1) + pairsFile := c.Args().Get(1) k, _ := strconv.Atoi(c.Args().Get(2)) - p := allhic.Partitioner{Contigsfile: contigsfile, Distfile: distfile, K: k} + p := allhic.Partitioner{Contigsfile: contigsfile, PairsFile: pairsFile, K: k} p.Run() return nil }, @@ -332,7 +344,7 @@ A convenience driver function. Chain the following steps sequentially. Action: func(c *cli.Context) error { if c.NArg() < 3 { cli.ShowSubcommandHelp(c) - return cli.NewExitError("Must specify distfile, clmfile and bamfile", 1) + return cli.NewExitError("Must specify pairsFile, clmfile and bamfile", 1) } bamfile := c.Args().Get(0) @@ -354,7 +366,7 @@ A convenience driver function. Chain the following steps sequentially. // Partition into k groups banner(fmt.Sprintf("Partition into %d groups", k)) partitioner := allhic.Partitioner{Contigsfile: extractor.OutContigsfile, - Distfile: extractor.OutPairsfile, K: k} + PairsFile: extractor.OutPairsfile, K: k} partitioner.Run() // Optimize the k groups separately diff --git a/partition.go b/partition.go index 888fd48..b1e4b31 100644 --- a/partition.go +++ b/partition.go @@ -20,7 +20,7 @@ import ( // Partitioner converts the bamfile into a matrix of link counts type Partitioner struct { Contigsfile string - Distfile string + PairsFile string K int contigs []*ContigInfo contigToIdx map[string]int @@ -158,7 +158,7 @@ func (r *Partitioner) skipRepeats() { // makeMatrix creates an adjacency matrix containing normalized score func (r *Partitioner) makeMatrix() { - edges := r.parseDist() + edges := parseDist(r.PairsFile) N := len(r.contigs) M := Make2DSliceInt64(N, N) longestSquared := int64(r.longestRE) * int64(r.longestRE) @@ -220,14 +220,14 @@ func (r *Partitioner) splitRE() { } } -// parseDist imports the edges of the contig into a slice of DistLine -// DistLine stores the data structure of the distfile +// parseDist imports the edges of the contig into a slice of ContigPair +// ContigPair stores the data structure of the distfile // #X Y Contig1 Contig2 RE1 RE2 ObservedLinks ExpectedLinksIfAdjacent // 1 44 idcChr1.ctg24 idcChr1.ctg51 6612 1793 12 121.7 // 1 70 idcChr1.ctg24 idcChr1.ctg52 6612 686 2 59.3 -func (r *Partitioner) parseDist() []ContigPair { +func parseDist(pairsFile string) []ContigPair { var edges []ContigPair - recs := ReadCSVLines(r.Distfile) + recs := ReadCSVLines(pairsFile) for _, rec := range recs { ai, _ := strconv.Atoi(rec[0]) diff --git a/prune.go b/prune.go index 4f4a394..161d228 100644 --- a/prune.go +++ b/prune.go @@ -9,13 +9,59 @@ package allhic +import ( + "bufio" + "fmt" + "io" + "strings" +) + // Pruner processes the pruning step type Pruner struct { - Bamfile string + AllelesFile string + PairsFile string } +// AlleleGroup stores the contig names that are considered allelic +type AlleleGroup []string + // Run calls the pruning steps func (r *Pruner) Run() { - log.Errorf("Prune function is still under development. Please use `./prune` for now.") - log.Errorf("Usage: ./prune -i Allele.ctg.table -b bam.list -r draft.asm.fasta") + edges := parseDist(r.PairsFile) + alleleGroups := parseAllelesTable(r.AllelesFile) + fmt.Println(edges[0]) + fmt.Println(alleleGroups[0]) +} + +// parseAllelesTable imports the contig allelic relationship +// File is a tab-separated file that looks like the following: +// Chr10 18902 tig00030660 tig00003333 +// Chr10 35071 tig00038687 tig00038686 tig00065419 +func parseAllelesTable(allelesFile string) []AlleleGroup { + log.Noticef("Parse alleles table `%s`", allelesFile) + fh := mustOpen(allelesFile) + defer fh.Close() + + reader := bufio.NewReader(fh) + + var data []AlleleGroup + for { + row, err := reader.ReadString('\n') + row = strings.TrimSpace(row) + if row == "" && err == io.EOF { + break + } + if err != nil { + log.Fatal(err) + } + words := strings.Split(row, "\t") + if len(words) <= 3 { // Must have at least 4 fields, i.e. 1 pair + continue + } + alleleGroup := make(AlleleGroup, len(words)-2) + copy(alleleGroup, words[2:]) + data = append(data, alleleGroup) + } + + return data }