Skip to content

Commit

Permalink
[README] Update doc on build
Browse files Browse the repository at this point in the history
  • Loading branch information
tanghaibao committed Jul 6, 2018
1 parent 0d0f3d1 commit 0abaa4f
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 48 deletions.
36 changes: 28 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
# ALLHIC: Genome scaffolding based on HiC data

_ _____ _____ ____ ____ _____ ______
/ \ |_ _| |_ _| |_ || _||_ _|.' ___ |
/ _ \ | | | | | |__| | | | / .' \_|
/ ___ \ | | _ | | _ | __ | | | | |
_/ / \ \_ _| |__/ | _| |__/ | _| | | |_ _| |_\ `.___.'\
|____| |____||________||________||____||____||_____|`.____ .'

# ALLHIC: Genome scaffolding based on HiC data

[![Travis-CI](https://travis-ci.org/tanghaibao/allhic.svg?branch=master)](https://travis-ci.org/tanghaibao/allhic)
[![GOreport](https://goreportcard.com/badge/github.com/tanghaibao/allhic)](https://goreportcard.com/report/github.com/tanghaibao/allhic)

Expand All @@ -19,7 +19,8 @@ the [releases](https://github.com/tanghaibao/allhic/releases) and make sure to
`chmod +x` the resulting binary.

If you are using [go](https://github.com/golang/go), you can build from source with:
```

```console
go get -u -t -v github.com/tanghaibao/allhic/...
go install github.com/tanghaibao/allhic/cmd/allhic
```
Expand All @@ -30,6 +31,14 @@ go install github.com/tanghaibao/allhic/cmd/allhic

Prune bamfile to remove weak links. WIP.

### Extract

Extract does a fair amount of preprocessing: 1) extract inter-contig links into a more compact form, specifically into `.clm`; 2) extract intra-contig links and build a distribution; 3) count up the restriction sites to be used in normalization (similar to LACHESIS); 4) bundle the inter-contig links into pairs of contigs.

```console
allhic extract tests/test.bam tests/test.fasta
```

### Partition

Given a target `k`, number of partitions, the goal of the partitioning
Expand All @@ -43,8 +52,8 @@ using eigen decomposition of the modularity matrix.
![networkbefore](script/graph-s.png)
![networkafter](script/graph-s.partitioned.png)

```bash
allhic partition tests/test.bam
```console
allhic partition tests/test.counts_GATC.txt tests/test.pairs.txt
```

### Optimize
Expand All @@ -57,16 +66,27 @@ Optimize uses Genetic Algorithm (GA) to search for the best scoring solution.

![ga](tests/test-movie.gif)

```bash
allhic optimize tests/test.clm
```console
allhic optimize tests/test.counts_GATC.txt tests/test.clm
```

### Build

Build genome release. WIP.

## Pipeline

The pipeline follows the 4 steps of `prune`, `extract`, `partition`, `optimize`:

```console
allhic extract T4_Chr1/{prunning.sub.bam,seq.fasta}
allhic partition T4_Chr1/{prunning.sub.counts_GATC.txt,prunning.sub.pairs.txt} 2
allhic optimize T4_Chr1/{prunning.sub.counts_GATC.txt,prunning.sub.clm}
allhic build T4_Chr1/{prunning.sub.tour,seq.fasta}
```

## WIP features

- [ ] Add restriction enzyme for better normalization of contig lengths
- [x] Add restriction enzyme for better normalization of contig lengths
- [ ] Add test suites
- [ ] Speed up `build`
32 changes: 9 additions & 23 deletions build.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ import (
"bufio"
"fmt"
"os"
"strings"

"github.com/shenwei356/bio/seqio/fai"
)
Expand Down Expand Up @@ -138,28 +137,15 @@ func (r *Builder) Build(agpfile string) {
// > name
// contig1+ contig2- contig3?
func (r *OO) ParseTour(tourfile string) {
log.Noticef("Parse tourfile `%s`", tourfile)

file, _ := os.Open(tourfile)
scanner := bufio.NewScanner(file)
var (
name string
strand byte
)
for scanner.Scan() {
words := strings.Fields(scanner.Text())
if words[0][0] == '>' {
name = words[0][1:]
continue
}
for _, tig := range words {
at, ao := tig[:len(tig)-1], tig[len(tig)-1]
if ao == '+' || ao == '-' || ao == '?' {
tig, strand = at, ao
} else {
strand = '?'
}
r.Add(name, tig, r.sizes[tig], strand)
words := parseTourFile(tourfile)
var strand byte
for _, tig := range words {
at, ao := tig[:len(tig)-1], tig[len(tig)-1]
if ao == '+' || ao == '-' || ao == '?' {
tig, strand = at, ao
} else {
strand = '?'
}
r.Add("name", tig, r.sizes[tig], strand)
}
}
15 changes: 6 additions & 9 deletions clm.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ import (
"io"
"math"
"os"
"path"
"strconv"
"strings"
"sync"
Expand All @@ -31,9 +30,8 @@ import (
// tig00030676- tig00077819+ 7 118651 91877 91877 209149 125906 146462 146462
// tig00030676- tig00077819- 7 108422 157204 157204 137924 142611 75169 75169
type CLM struct {
Name string
REfile string
Clmfile string
Idsfile string
Tigs []TigF
Tour Tour
Signs []byte
Expand Down Expand Up @@ -93,11 +91,10 @@ type Tour struct {
}

// NewCLM is the constructor for CLM
func NewCLM(Clmfile string) *CLM {
func NewCLM(Clmfile, REfile string) *CLM {
p := new(CLM)
p.Name = RemoveExt(path.Base(Clmfile))
p.REfile = REfile
p.Clmfile = Clmfile
p.Idsfile = RemoveExt(Clmfile) + ".ids"
p.tigToIdx = make(map[string]int)
p.contacts = make(map[Pair]Contact)
p.orientedContacts = make(map[OrientedPair]GArray)
Expand All @@ -115,14 +112,14 @@ func NewCLM(Clmfile string) *CLM {
// tig00035238 46779 recover
// tig00030900 119291
func (r *CLM) ParseIds() {
file, _ := os.Open(r.Idsfile)
log.Noticef("Parse idsfile `%s`", r.Idsfile)
file, _ := os.Open(r.REfile)
log.Noticef("Parse idsfile `%s`", r.REfile)
scanner := bufio.NewScanner(file)
idx := 0
for scanner.Scan() {
words := strings.Fields(scanner.Text())
tig := words[0]
size, _ := strconv.Atoi(words[1])
size, _ := strconv.Atoi(words[len(words)-1])
r.Tigs = append(r.Tigs, TigF{idx, tig, size, true})
r.tigToIdx[tig] = idx
idx++
Expand Down
16 changes: 9 additions & 7 deletions cmd/allhic.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,14 +135,14 @@ of the contig linkage graph.
Name: "partition",
Usage: "Separate bamfile into k groups",
UsageText: `
allhic partition enrichment.txt distance.txt k [options]
allhic partition counts_RE.txt pairs.txt k [options]
Partition function:
Given a target k, number of partitions, the goal of the partitioning is to
separate all the contigs into separate clusters. As with all clustering
algorithm, there is an optimization goal here. The LACHESIS algorithm is
a hierarchical clustering algorithm using average links. The distfile can be
generated with the "extract" sub-command.
a hierarchical clustering algorithm using average links. The two input files
can be generated with the "extract" sub-command.
`,
Action: func(c *cli.Context) error {
if len(c.Args()) < 3 {
Expand All @@ -162,7 +162,7 @@ generated with the "extract" sub-command.
Name: "optimize",
Usage: "Order-and-orient tigs in a group",
UsageText: `
allhic optimize clmfile [options]
allhic optimize counts_RE.txt clmfile [options]
Optimize function:
Given a set of Hi-C contacts between contigs, as specified in the
Expand Down Expand Up @@ -200,19 +200,21 @@ for these contigs.
},
},
Action: func(c *cli.Context) error {
if len(c.Args()) < 1 {
if len(c.Args()) < 2 {
cli.ShowSubcommandHelp(c)
return cli.NewExitError("Must specify clmfile", 1)
}

clmfile := c.Args().Get(0)
refile := c.Args().Get(0)
clmfile := c.Args().Get(1)
runGA := !c.Bool("skipGA")
startOver := c.Bool("startOver")
seed := c.Int64("seed")
npop := c.Int("npop")
ngen := c.Int("ngen")
mutpb := c.Float64("mutpb")
p := allhic.Optimizer{Clmfile: clmfile, RunGA: runGA, StartOver: startOver,
p := allhic.Optimizer{REfile: refile, Clmfile: clmfile,
RunGA: runGA, StartOver: startOver,
Seed: seed, NPop: npop, NGen: ngen, MutProb: mutpb}
p.Run()
return nil
Expand Down
3 changes: 2 additions & 1 deletion optimize.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
// Optimizer runs the order-and-orientation procedure, given a clmfile
type Optimizer struct {
Clmfile string
REfile string
RunGA bool
StartOver bool
Seed int64
Expand All @@ -31,7 +32,7 @@ type Optimizer struct {

// Run kicks off the Optimizer
func (r *Optimizer) Run() {
clm := NewCLM(r.Clmfile)
clm := NewCLM(r.Clmfile, r.REfile)
tourfile := RemoveExt(r.Clmfile) + ".tour"
shuffle := false

Expand Down

0 comments on commit 0abaa4f

Please sign in to comment.