Merge pull request #181 from JureZmrzlikar/small-fixes
xlsites: small fixes
tomazc committed May 29, 2018
2 parents 645aa6c + 1f32399 commit 508ea02
Showing 2 changed files with 15 additions and 22 deletions.
2 changes: 1 addition & 1 deletion iCount/genomes/segment.py
@@ -1034,7 +1034,7 @@ def process_gene(gene_content):
     LOGGER.info('Segmentation stored in %s', file3.fn)
 
     LOGGER.info('Making also gene level segmentation...')
-    make_regions(segmentation, out_dir=os.path.dirname(segmentation))
+    make_regions(segmentation, out_dir=os.path.dirname(os.path.abspath(segmentation)))
     return metrics


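Aside: the segment.py change above guards against calling the function with a bare filename. os.path.dirname('segmentation.gtf') returns an empty string, so any later path handling against that result points at unintended locations; resolving through os.path.abspath first always yields a real directory. A minimal standalone sketch of the pitfall and the fix (output_dir is a hypothetical helper, not code from this commit):

import os

def output_dir(path):
    """Return the directory holding `path`, never an empty string."""
    # os.path.dirname('file.gtf') == '' for bare filenames, which breaks
    # downstream path handling; make the path absolute before splitting.
    return os.path.dirname(os.path.abspath(path))

print(repr(os.path.dirname('segmentation.gtf')))  # '' -- the pitfall
print(output_dir('segmentation.gtf'))             # e.g. '/home/user/project'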
35 changes: 14 additions & 21 deletions iCount/mapping/xlsites.py
@@ -72,13 +72,6 @@
 end position of read. By default, score is of course assigned to cross-link
 location. But for diagnostic purpuses, scores can also be assigned to middle or
 end coordinate of the read.
-
-TODO: check overlap between unique and multimap BED files, should be small,
-otherwise, we should think of a more approapriate quantification of (division
-of randomers among) unique mapped sites that overlap with multimapped reads
-
 """
 import re
 import os
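The deleted TODO described a sanity check that never made it into the code: the overlap between the single-mapped and multi-mapped output BED files should be small, otherwise the division of randomers among overlapping sites would need rethinking. A hedged sketch of how such a check could look using pybedtools (overlap_fraction is hypothetical, not part of iCount):

import pybedtools

def overlap_fraction(single_bed, multi_bed):
    """Fraction of single-mapped sites that also occur in the multi-mapped file."""
    single = pybedtools.BedTool(single_bed)
    multi = pybedtools.BedTool(multi_bed)
    # u=True reports each interval of `single` at most once if it overlaps
    # anything in `multi`; s=True requires matching strands.
    overlapping = single.intersect(multi, u=True, s=True)
    return len(overlapping) / max(len(single), 1)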
@@ -498,7 +491,7 @@ def _get_read_data(read, metrics, mapq_th, segmentation=None, gap_th=4):
             num_mapped, second_start)
 
 
-def _processs_bam_file(bam_fname, metrics, mapq_th, skipped, segmentation=None, gap_th=4):
+def _processs_bam_file(bam_fname, metrics, mapq_th, skipped, segmentation=None, gap_th=1000000):
     """
     Extract data from BAM file into chunks of genome.
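The default gap_th for _processs_bam_file jumps from 4 to 1,000,000 nt. Assuming gap_th is the minimum gap length at which a read's downstream alignment block counts as a distinct "second start" (reads with smaller gaps are treated as gapless), the new default effectively disables second-start detection unless the caller overrides it. A simplified sketch of that interpretation (second_start is illustrative, not the module's actual code):

def second_start(blocks, gap_th):
    """Given aligned blocks [(start, end), ...] in reference coordinates,
    return the start of the first block after a gap >= gap_th, else None."""
    for (_, prev_end), (next_start, _) in zip(blocks, blocks[1:]):
        if next_start - prev_end >= gap_th:
            return next_start
    return None

# A 100 nt gap triggers a second start under the old default, not the new one.
print(second_start([(10, 30), (130, 160)], gap_th=4))        # 130
print(second_start([(10, 30), (130, 160)], gap_th=1000000))  # None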
@@ -634,7 +627,7 @@ def finalize(reads_pending_fwd, reads_pending_rev, start, chrom, progress):
                 'reported in file: %s', metrics.strange_recs, skipped)
 
 
-def run(bam, sites_unique, sites_multi, skipped, group_by='start', quant='cDNA',
+def run(bam, sites_single, sites_multi, skipped, group_by='start', quant='cDNA',
         segmentation=None, mismatches=1, mapq_th=0, multimax=50, gap_th=4, ratio_th=0.1,
         report_progress=False):
     """
@@ -647,16 +640,16 @@ def run(bam, sites_unique, sites_multi, skipped, group_by='start', quant='cDNA',
     the mapq_th to 0 to include all reads. Mapq score is very useful,
     because values coming from STAR are from a very limited set: 0 (5 or
     more multiple hits), 1 (4 or 3 multiple hits), 3 (2 multiple hits),
-    255 (unique hit)
+    255 (single hit)
 
     Parameters
     ----------
     bam : str
         Input BAM file with mapped reads.
-    sites_unique : str
-        Output BED6 file to store data from uniquely mapped reads.
+    sites_single : str
+        Output BED6 file to store data from single mapped reads.
     sites_multi : str
-        Output BED6 file to store data from multi-mapped reads.
+        Output BED6 file to store data from single and multi-mapped reads.
     skipped : str
         Output BAM file to store reads that do not map as expected by segmentation and
         reference genome sequence. If read's second start does not fall on any of
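Because STAR emits only those four MAPQ values, mapq_th doubles as a coarse multimapping filter: 255 keeps single hits only, while 0 keeps everything. A minimal pysam sketch of the kind of filtering the docstring describes (count_passing is illustrative, not code from this module):

import pysam

def count_passing(bam_path, mapq_th):
    """Count reads whose MAPQ meets the threshold."""
    # STAR MAPQ: 255 = single hit, 3 = 2 hits, 1 = 3-4 hits, 0 = 5+ hits.
    with pysam.AlignmentFile(bam_path, 'rb') as bam:
        return sum(1 for read in bam if read.mapping_quality >= mapq_th)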
@@ -694,40 +687,40 @@ def run(bam, sites_unique, sites_multi, skipped, group_by='start', quant='cDNA',
     """
     iCount.log_inputs(LOGGER, level=logging.INFO)  # pylint: disable=protected-access
 
-    assert sites_unique.endswith(('.bed', '.bed.gz'))
+    assert sites_single.endswith(('.bed', '.bed.gz'))
     assert sites_multi.endswith(('.bed', '.bed.gz'))
     assert skipped.endswith(('.bam'))
     assert quant in ['cDNA', 'reads']
     assert group_by in ['start', 'middle', 'end']
 
     metrics = iCount.Metrics()
 
-    unique, multi = {}, {}
+    single, multi = {}, {}
     progress = 0
     for (chrom, strand), new_progress, by_pos in _processs_bam_file(
             bam, metrics, mapq_th, skipped, segmentation, gap_th):
         if report_progress:
             # pylint: disable=protected-access
             progress = iCount._log_progress(new_progress, progress, LOGGER)
 
-        unique_by_pos = {}
+        single_by_pos = {}
         multi_by_pos = {}
         for xlink_pos, by_bc in by_pos.items():
 
             _merge_similar_randomers(by_bc, mismatches, ratio_th=ratio_th)
 
-            # count uniquely mapped reads only
-            _update(unique_by_pos, _collapse(xlink_pos, by_bc, group_by, multimax=1))
+            # count single mapped reads only
+            _update(single_by_pos, _collapse(xlink_pos, by_bc, group_by, multimax=1))
             # count all reads mapped les than multimax times
             _update(multi_by_pos, _collapse(xlink_pos, by_bc, group_by, multimax=multimax))
 
-        unique.setdefault((chrom, strand), {}).update(unique_by_pos)
+        single.setdefault((chrom, strand), {}).update(single_by_pos)
         multi.setdefault((chrom, strand), {}).update(multi_by_pos)
 
     # Write output
     val_index = ['cDNA', 'reads'].index(quant)
-    _save_dict(unique, sites_unique, val_index=val_index)
-    LOGGER.info('Saved to BED file (uniquely mapped reads): %s', sites_unique)
+    _save_dict(single, sites_single, val_index=val_index)
+    LOGGER.info('Saved to BED file (single mapped reads): %s', sites_single)
     _save_dict(multi, sites_multi, val_index=val_index)
     LOGGER.info('Saved to BED file (multi-mapped reads): %s', sites_multi)

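The loop above builds two parallel site dictionaries from the same collapsed data: single counts only reads with one hit (multimax=1), while multi also admits reads with up to multimax hits, and quant selects whether cDNAs (distinct randomer barcodes) or raw reads are written to BED. A simplified sketch of that double counting, assuming multi-mapped reads are down-weighted by 1/num_hits (an assumption for illustration, not necessarily iCount's exact weighting):

def collapse(by_bc, multimax=1):
    """Reduce {barcode: [num_hits_per_read, ...]} at one cross-link position
    to (cDNA count, read count), keeping reads with at most multimax hits."""
    cdnas, reads = 0.0, 0.0
    for hits in by_bc.values():
        kept = [n for n in hits if n <= multimax]
        if kept:
            cdnas += 1 / min(kept)             # one (weighted) cDNA per barcode
            reads += sum(1 / n for n in kept)  # each alignment adds 1/num_hits
    return cdnas, reads

by_bc = {'AACG': [1, 1], 'TTGA': [3]}  # two single-hit reads; one 3-hit read
print(collapse(by_bc, multimax=1))     # (1.0, 2.0) -- single-mapped only
print(collapse(by_bc, multimax=50))    # (1.333..., 2.333...) -- multi included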

