Merge pull request #185 from JureZmrzlikar/xlsites-barcode-merging
xlsites: Add max_barcodes parameter
tomazc committed Aug 20, 2018
2 parents 508ea02 + eef267c commit 7a6dba8
Showing 5 changed files with 45 additions and 12 deletions.
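For orientation, here is a minimal sketch of calling the updated Python API with the new parameter; the input and output file names are placeholders and are not part of this commit:

from iCount.mapping import xlsites

# max_barcodes (new in this commit) caps the number of distinct randomers at a
# single cross-link position for which merging of similar barcodes is still
# attempted; positions with more distinct barcodes skip that step.
xlsites.run(
    'reads.bam',              # placeholder input BAM
    'sites_single.bed',       # placeholder outputs
    'sites_multi.bed',
    'skipped.bam',
    group_by='start',
    quant='cDNA',
    mismatches=1,
    ratio_th=0.1,
    max_barcodes=10000,       # default value introduced by this change
)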
8 changes: 6 additions & 2 deletions iCount/analysis/rnamaps.py
@@ -391,7 +391,7 @@ def _process_read_group(xlink, chrom, strand, read, data, segmentation, metrics,


def run(bam, segmentation, out_file, strange, cross_transcript, implicit_handling='closest',
mismatches=2, mapq_th=0, holesize_th=4):
mismatches=2, mapq_th=0, holesize_th=4, max_barcodes=10000):
"""
Compute distribution of cross-links relative to genomic landmarks.
@@ -419,6 +419,10 @@ def run(bam, segmentation, out_file, strange, cross_transcript, implicit_handlin
holesize_th : int
Reads with holes smaller than holesize_th are treated as if they
had no holes.
max_barcodes : int
Skip merging of similar barcodes if the number of distinct barcodes at a
position is higher than this.
Returns
-------
@@ -460,7 +464,7 @@ def run(bam, segmentation, out_file, strange, cross_transcript, implicit_handlin

for xlink_pos, by_bc in sorted(by_pos.items()):
# pylint: disable=protected-access
iCount.mapping.xlsites._merge_similar_randomers(by_bc, mismatches)
iCount.mapping.xlsites._merge_similar_randomers(by_bc, mismatches, max_barcodes)
# by_bc is modified in place in _merge_similar_randomers

# reads is a list of reads belonging to given barcode in by_bc
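The rnamaps entry point gains the same parameter and simply forwards it to the barcode-merging helper. A hedged sketch of the call, again with placeholder file names:

from iCount.analysis import rnamaps

rnamaps.run(
    'reads.bam',              # placeholder input BAM
    'segmentation.gtf',       # placeholder segmentation file
    'rnamaps_out.tsv',        # placeholder outputs
    'strange.bam',
    'cross_transcript.tsv',
    mismatches=2,
    max_barcodes=10000,       # forwarded to _merge_similar_randomers
)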
13 changes: 10 additions & 3 deletions iCount/mapping/xlsites.py
@@ -181,7 +181,7 @@ def _update(cur_vals, to_add):
cur_vals[pos] = [p + n for p, n in zip(prev_vals, vals_to_add)]


def _merge_similar_randomers(by_bc, mismatches, ratio_th=0.1):
def _merge_similar_randomers(by_bc, mismatches, max_barcodes, ratio_th=0.1):
"""
Merge randomers at the same site that differ by at most ``mismatches``.
@@ -270,6 +270,10 @@ def _merge_similar_randomers(by_bc, mismatches, ratio_th=0.1):
order_bcs = sorted(order_bcs, reverse=True)
min_hit_count = max(1, math.floor(order_bcs[0][0] * ratio_th))

# If the number of barcodes exceeds the ``max_barcodes`` parameter, just skip the last step
if len(by_bc) > max_barcodes:
return

# Step #3: merge remaining
# For each barcode with number of reads below the threshold, identify if there
exists any similar one. If there is, join the hits from the second barcode to the
@@ -629,7 +633,7 @@ def finalize(reads_pending_fwd, reads_pending_rev, start, chrom, progress):

def run(bam, sites_single, sites_multi, skipped, group_by='start', quant='cDNA',
segmentation=None, mismatches=1, mapq_th=0, multimax=50, gap_th=4, ratio_th=0.1,
report_progress=False):
max_barcodes=10000, report_progress=False):
"""
Identify and quantify cross-linked sites.
@@ -678,6 +682,9 @@ def run(bam, sites_single, sites_multi, skipped, group_by='start', quant='cDNA',
number of reads supporting the most frequent randomer. All randomers
above this threshold are accepted as unique. Remaining are merged
with the rest, allowing for the specified number of mismatches.
max_barcodes : int
Skip merging of similar barcodes if the number of distinct barcodes at a
position is higher than this.
Returns
-------
@@ -707,7 +714,7 @@ def run(bam, sites_single, sites_multi, skipped, group_by='start', quant='cDNA',
multi_by_pos = {}
for xlink_pos, by_bc in by_pos.items():

_merge_similar_randomers(by_bc, mismatches, ratio_th=ratio_th)
_merge_similar_randomers(by_bc, mismatches, max_barcodes, ratio_th=ratio_th)

# count single mapped reads only
_update(single_by_pos, _collapse(xlink_pos, by_bc, group_by, multimax=1))
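The new guard sits between the ratio-threshold computation and the final merging pass, so only that last pass over the remaining barcodes is skipped. A sketch of exercising the private helper directly, mirroring the data of the test_max_barcodes test added below (the hit lists are invented placeholders, as in the tests):

from copy import deepcopy

from iCount.mapping import xlsites

# Three distinct randomers at one cross-link position.
by_bc = {
    'AAAAA': ['hit1', 'hit2', 'hit3', 'hit4'],
    'AAAAG': ['hit5', 'hit6'],
    'AAAAT': ['hit7'],
}

# More distinct barcodes (3) than max_barcodes (1): the merging step is skipped
# and the dictionary is left unchanged, as expected_max_barcodes_1 shows below.
capped = deepcopy(by_bc)
xlsites._merge_similar_randomers(capped, mismatches=2, max_barcodes=1, ratio_th=0.5)

# Permissive cap: merging proceeds as before this change, and the low-count
# 'AAAAT' randomer is folded into 'AAAAA' (see expected_max_barcodes_10 below).
merged = deepcopy(by_bc)
xlsites._merge_similar_randomers(merged, mismatches=2, max_barcodes=10, ratio_th=0.5)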
1 change: 1 addition & 0 deletions iCount/tests/test_cli.py
@@ -302,6 +302,7 @@ def test_xlsites(self):
'--multimax', '50',
'--gap_th', '4',
'--ratio_th', '0.1',
'--max_barcodes', '10000',
'-S', '40', # Suppress lower-than-ERROR messages.
]

3 changes: 1 addition & 2 deletions iCount/tests/test_externals.py
@@ -3,8 +3,7 @@
import warnings
import unittest

import iCount.externals.cutadapt as cutadapt
import iCount.externals.star as star
from iCount.externals import cutadapt, star

from iCount.tests.utils import make_fasta_file, make_fastq_file, get_temp_dir, \
get_temp_file_name, make_file_from_list
32 changes: 27 additions & 5 deletions iCount/tests/test_xlsites.py
@@ -92,7 +92,7 @@ def setUp(self):
warnings.simplefilter("ignore", ResourceWarning)

def test_merge(self):
# hit1, hit2, should be a 4-tuple, but for this test it is ok as is
# hit1, hit2, ... should all be a 4-tuple, but for this test it is ok as is
by_bc = {
'AAAAA': ['hit1', 'hit2'],
'AAAAG': ['hit3'],
@@ -106,11 +106,11 @@ def setUp(self):
'AAAAG': ['hit3'],
'TTTTN': ['hit42'],
}
xlsites._merge_similar_randomers(by_bc, mismatches=2, ratio_th=0.1) # 1/10
xlsites._merge_similar_randomers(by_bc, mismatches=2, max_barcodes=10000, ratio_th=0.1) # 1/10
self.assertEqual(by_bc, expected)

def test_merge_ratio_th(self):
# hit1, hit2, should be a 4-tuple, but for this test it is ok as is
# hit1, hit2, ... should all be a 4-tuple, but for this test it is ok as is
by_bc = {
'AAAAA': ['hit1', 'hit2', 'hit4', 'hit5', 'hit6'],
'GGGGG': ['hit7', 'hit8'],
@@ -128,9 +128,31 @@ def setUp(self):
'GAAAG': ['hit11'],
'CCCCG': ['hit13', 'hit12'],
}
xlsites._merge_similar_randomers(by_bc, mismatches=1, ratio_th=0.4) # 2/5
xlsites._merge_similar_randomers(by_bc, mismatches=1, max_barcodes=10000, ratio_th=0.4) # 2/5
self.assertEqual(by_bc, expected)

def test_max_barcodes(self):
# hit1, hit2, ... should all be a 4-tuple, but for this test it is ok as is
by_bc = {
'AAAAA': ['hit1', 'hit2', 'hit3', 'hit4'],
'AAAAG': ['hit5', 'hit6'],
'AAAAT': ['hit7'],
}
expected_max_barcodes_1 = {
'AAAAA': ['hit1', 'hit2', 'hit3', 'hit4'],
'AAAAG': ['hit5', 'hit6'],
'AAAAT': ['hit7'],
}
xlsites._merge_similar_randomers(by_bc, mismatches=2, max_barcodes=1, ratio_th=0.5)
self.assertEqual(by_bc, expected_max_barcodes_1)

expected_max_barcodes_10 = {
'AAAAA': ['hit1', 'hit2', 'hit3', 'hit4', 'hit7'],
'AAAAG': ['hit5', 'hit6'],
}
xlsites._merge_similar_randomers(by_bc, mismatches=2, max_barcodes=10, ratio_th=0.5)
self.assertEqual(by_bc, expected_max_barcodes_10)

@unittest.skip
def test_todo(self):
"""
Expand All @@ -145,7 +167,7 @@ def test_todo(self):
'AACGG': ['hit2', 'hit3'],
'NAAAN': ['hit4'],
}
xlsites._merge_similar_randomers(by_bc, mismatches=2)
xlsites._merge_similar_randomers(by_bc, mismatches=2, max_barcodes=10000)
expected = {
'AACGG': ['hit2', 'hit3'],
'AAAAA': ['hit1', 'hit4'],
