Skip to content

Commit

Permalink
[prune] Add parseAllelesTable()
Browse files Browse the repository at this point in the history
  • Loading branch information
tanghaibao committed Sep 3, 2018
1 parent d660177 commit c42f43a
Show file tree
Hide file tree
Showing 5 changed files with 111 additions and 43 deletions.
18 changes: 14 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,6 @@ go install github.com/tanghaibao/allhic/cmd/allhic

## Usage

### <kbd>Prune</kbd>

Prune bamfile to remove weak links. WIP.

### <kbd>Extract</kbd>

Extract does a fair amount of preprocessing: 1) extract inter-contig links into a more compact form, specifically into `.clm`; 2) extract intra-contig links and build a distribution; 3) count up the restriction sites to be used in normalization (similar to LACHESIS); 4) bundles the inter-contig links into pairs of contigs.
Expand All @@ -39,6 +35,20 @@ Extract does a fair amount of preprocessing: 1) extract inter-contig links into
allhic extract tests/test.bam tests/seq.fasta.gz
```

### <kbd>Prune</kbd>

This prune step is **optional** for typical inbreeding diploid genomes.
However, pruning will improve the quality of assembly of polyploid genomes.
Prune pairs file to remove allelic/cross-allelic links.

```console
allhic prune tests/Allele.ctg.table tests/test.pairs.txt
```

Please see help string of `allhic prune` on the formatting of
`Allele.ctg.table`.


### <kbd>Partition</kbd>

Given a target `k`, number of partitions, the goal of the partitioning
Expand Down
2 changes: 1 addition & 1 deletion cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,7 @@ func (r *Partitioner) sortClusters() {

// printClusters shows the contents of the clusters
func (r *Partitioner) printClusters() {
clusterfile := RemoveExt(RemoveExt(r.Distfile)) + ".clusters.txt"
clusterfile := RemoveExt(RemoveExt(r.PairsFile)) + ".clusters.txt"
f, _ := os.Create(clusterfile)
defer f.Close()
w := bufio.NewWriter(f)
Expand Down
70 changes: 41 additions & 29 deletions cmd/allhic.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,28 +99,6 @@ func main() {
}

app.Commands = []cli.Command{
{
Name: "prune",
Usage: "Prune bamfile to remove weak links",
UsageText: `
allhic prune bamfile [options]
Prune function:
Given a bamfile, the goal of the pruning step is to remove all inter-allelic
links, then it is possible to reconstruct allele-separated assemblies.
`,
Action: func(c *cli.Context) error {
if c.NArg() < 1 {
cli.ShowSubcommandHelp(c)
return cli.NewExitError("Must specify bamfile", 1)
}

bamfile := c.Args().Get(0)
p := allhic.Pruner{Bamfile: bamfile}
p.Run()
return nil
},
},
{
Name: "extract",
Usage: "Extract Hi-C link size distribution",
Expand All @@ -136,7 +114,7 @@ also prepares for the latter steps of ALLHIC.
Action: func(c *cli.Context) error {
if c.NArg() < 2 {
cli.ShowSubcommandHelp(c)
return cli.NewExitError("Must specify distfile, clmfile and bamfile", 1)
return cli.NewExitError("Must specify pairsFile, clmfile and bamfile", 1)
}

bamfile := c.Args().Get(0)
Expand All @@ -148,6 +126,40 @@ also prepares for the latter steps of ALLHIC.
return nil
},
},
{
Name: "prune",
Usage: "Prune allelic, cross-allelic and weak links",
UsageText: `
allhic prune alleles.table pairs.txt [options]
Prune function:
Given contig pairing, the goal of the prune step is to remove all inter-allelic
links, then it is possible to reconstruct allele-separated assemblies in the following
steps. The alleles.table file contains tab-separated columns containing contigs that
are considered "allelic". For example:
Chr10 18902 tig00030660 tig00003333
Chr10 35071 tig00038687 tig00038686 tig00065419
These lines means that at certain locations in the genome (typically based on synteny
to a closely-related genome), several contigs are considered allelic. The Hi-C links
between these contigs are removed, in addition, other contigs linking to these contigs
simultaneously would only consider one single-best edge. The "pairs.txt" file is the output
of the "extract" command.
`,
Action: func(c *cli.Context) error {
if c.NArg() < 2 {
cli.ShowSubcommandHelp(c)
return cli.NewExitError("Must specify alleles.table and pairs.txt", 1)
}

allelesFile := c.Args().Get(0)
pairsFile := c.Args().Get(1)
p := allhic.Pruner{AllelesFile: allelesFile, PairsFile: pairsFile}
p.Run()
return nil
},
},
// {
// Name: "anchor",
// Usage: "Anchor contigs based on an iterative merging method",
Expand All @@ -172,7 +184,7 @@ also prepares for the latter steps of ALLHIC.
// },
{
Name: "partition",
Usage: "Separate bamfile into k groups",
Usage: "Separate contigs into k groups",
UsageText: `
allhic partition counts_RE.txt pairs.txt k [options]
Expand All @@ -186,13 +198,13 @@ can be generated with the "extract" sub-command.
Action: func(c *cli.Context) error {
if c.NArg() < 3 {
cli.ShowSubcommandHelp(c)
return cli.NewExitError("Must specify distfile", 1)
return cli.NewExitError("Must specify countsFile, pairsFile and value k", 1)
}

contigsfile := c.Args().Get(0)
distfile := c.Args().Get(1)
pairsFile := c.Args().Get(1)
k, _ := strconv.Atoi(c.Args().Get(2))
p := allhic.Partitioner{Contigsfile: contigsfile, Distfile: distfile, K: k}
p := allhic.Partitioner{Contigsfile: contigsfile, PairsFile: pairsFile, K: k}
p.Run()
return nil
},
Expand Down Expand Up @@ -332,7 +344,7 @@ A convenience driver function. Chain the following steps sequentially.
Action: func(c *cli.Context) error {
if c.NArg() < 3 {
cli.ShowSubcommandHelp(c)
return cli.NewExitError("Must specify distfile, clmfile and bamfile", 1)
return cli.NewExitError("Must specify pairsFile, clmfile and bamfile", 1)
}

bamfile := c.Args().Get(0)
Expand All @@ -354,7 +366,7 @@ A convenience driver function. Chain the following steps sequentially.
// Partition into k groups
banner(fmt.Sprintf("Partition into %d groups", k))
partitioner := allhic.Partitioner{Contigsfile: extractor.OutContigsfile,
Distfile: extractor.OutPairsfile, K: k}
PairsFile: extractor.OutPairsfile, K: k}
partitioner.Run()

// Optimize the k groups separately
Expand Down
12 changes: 6 additions & 6 deletions partition.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ import (
// Partitioner converts the bamfile into a matrix of link counts
type Partitioner struct {
Contigsfile string
Distfile string
PairsFile string
K int
contigs []*ContigInfo
contigToIdx map[string]int
Expand Down Expand Up @@ -158,7 +158,7 @@ func (r *Partitioner) skipRepeats() {

// makeMatrix creates an adjacency matrix containing normalized score
func (r *Partitioner) makeMatrix() {
edges := r.parseDist()
edges := parseDist(r.PairsFile)
N := len(r.contigs)
M := Make2DSliceInt64(N, N)
longestSquared := int64(r.longestRE) * int64(r.longestRE)
Expand Down Expand Up @@ -220,14 +220,14 @@ func (r *Partitioner) splitRE() {
}
}

// parseDist imports the edges of the contig into a slice of DistLine
// DistLine stores the data structure of the distfile
// parseDist imports the edges of the contig into a slice of ContigPair
// ContigPair stores the data structure of the distfile
// #X Y Contig1 Contig2 RE1 RE2 ObservedLinks ExpectedLinksIfAdjacent
// 1 44 idcChr1.ctg24 idcChr1.ctg51 6612 1793 12 121.7
// 1 70 idcChr1.ctg24 idcChr1.ctg52 6612 686 2 59.3
func (r *Partitioner) parseDist() []ContigPair {
func parseDist(pairsFile string) []ContigPair {
var edges []ContigPair
recs := ReadCSVLines(r.Distfile)
recs := ReadCSVLines(pairsFile)

for _, rec := range recs {
ai, _ := strconv.Atoi(rec[0])
Expand Down
52 changes: 49 additions & 3 deletions prune.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,59 @@

package allhic

import (
"bufio"
"fmt"
"io"
"strings"
)

// Pruner processes the pruning step
type Pruner struct {
Bamfile string
AllelesFile string
PairsFile string
}

// AlleleGroup stores the contig names that are considered allelic
type AlleleGroup []string

// Run calls the pruning steps
func (r *Pruner) Run() {
log.Errorf("Prune function is still under development. Please use `./prune` for now.")
log.Errorf("Usage: ./prune -i Allele.ctg.table -b bam.list -r draft.asm.fasta")
edges := parseDist(r.PairsFile)
alleleGroups := parseAllelesTable(r.AllelesFile)
fmt.Println(edges[0])
fmt.Println(alleleGroups[0])
}

// parseAllelesTable imports the contig allelic relationship
// File is a tab-separated file that looks like the following:
// Chr10 18902 tig00030660 tig00003333
// Chr10 35071 tig00038687 tig00038686 tig00065419
func parseAllelesTable(allelesFile string) []AlleleGroup {
log.Noticef("Parse alleles table `%s`", allelesFile)
fh := mustOpen(allelesFile)
defer fh.Close()

reader := bufio.NewReader(fh)

var data []AlleleGroup
for {
row, err := reader.ReadString('\n')
row = strings.TrimSpace(row)
if row == "" && err == io.EOF {
break
}
if err != nil {
log.Fatal(err)
}
words := strings.Split(row, "\t")
if len(words) <= 3 { // Must have at least 4 fields, i.e. 1 pair
continue
}
alleleGroup := make(AlleleGroup, len(words)-2)
copy(alleleGroup, words[2:])
data = append(data, alleleGroup)
}

return data
}

0 comments on commit c42f43a

Please sign in to comment.