From ddf683513721e26518fbed79941eb5c020e6bd94 Mon Sep 17 00:00:00 2001 From: Haibao Tang Date: Thu, 26 Jul 2018 00:43:27 -0700 Subject: [PATCH] [partition] Add 1 pseudocount to RE counts; Also lower the MinRE cutoff --- base.go | 3 ++- extract.go | 3 ++- partition.go | 22 +++++++++++++++++++++- 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/base.go b/base.go index ae7bd21..72c1e2b 100644 --- a/base.go +++ b/base.go @@ -61,7 +61,8 @@ const ( // *** The following parameters are modeled after LACHESIS *** // MinREs is the minimum number of RE sites in a contig to be clustered (CLUSTER_MIN_RE_SITES) - MinREs = 25 + // MinREs = 25 + // MaxLinkDensity is the density threshold before marking a contig as 'repetitve' (CLUSTER_MAX_LINK_DENSITY) MaxLinkDensity = 2 // NonInformativeRatio is the cutoff for recovering skipped contigs back into the clusters (CLUSTER_NONINFORMATIVE_RATIO) diff --git a/extract.go b/extract.go index 405dbd1..0b7d2e0 100644 --- a/extract.go +++ b/extract.go @@ -193,7 +193,8 @@ func (r *Extracter) readFastaAndWriteRE() { } name := string(rec.Name) - count := bytes.Count(rec.Seq.Seq, []byte(r.RE)) + // Add pseudocount of 1 to prevent division by zero + count := bytes.Count(rec.Seq.Seq, []byte(r.RE)) + 1 length := rec.Seq.Length() totalCounts += count totalBp += int64(length) diff --git a/partition.go b/partition.go index 20b9d1e..93826d8 100644 --- a/partition.go +++ b/partition.go @@ -12,7 +12,9 @@ package allhic import ( "fmt" "math" + "path" "strconv" + "strings" ) // Partitioner converts the bamfile into a matrix of link counts @@ -61,10 +63,28 @@ func (r *Partitioner) makeTrivialClusters() { r.clusters = clusters } +// getRE extracts the restriction enzyme from the file name +func (r *Partitioner) getRE() string { + s := path.Base(r.Contigsfile) + return strings.Split(RemoveExt(s), "_")[1] +} + // skipContigsWithFewREs skip contigs with fewere than MinREs // This reads in the `counts_RE.txt` file generated by extract() func (r *Partitioner) skipContigsWithFewREs() { - log.Noticef("skipContigsWithFewREs with MinREs = %d", MinREs) + RE := r.getRE() + MinREs := 25 + switch len(RE) { + case 4: + MinREs = 25 // 25 * (4 ** 4) = 6Kb + case 5: + MinREs = 8 // 8 * (4 ** 5) = 8Kb + case 6: + MinREs = 3 // 3 * (4 ** 6) = 12Kb + default: + MinREs = 2 // We normally only use RE between 4 to 6 base cutter, but just in case + } + log.Noticef("skipContigsWithFewREs with MinREs = %d (RE = %s)", MinREs, RE) nShort := 0 shortRE := 0 shortLen := 0