Skip to content

Commit

Permalink
Add umi_matrix option.
Browse files Browse the repository at this point in the history
This writes a matrix to umi_matrix where duplicate UMI have not been
removed, which is helpful for identifying over-amplification issues.

The tagcount test files were modified to have a single duplicated
UMI to test this functionality.

This commit also removes the no_scale_evidence option from
fasttagcount, since it should never be used.
  • Loading branch information
roryk committed Dec 5, 2017
1 parent 3be2525 commit adcbd13
Show file tree
Hide file tree
Showing 7 changed files with 34 additions and 7 deletions.
1 change: 1 addition & 0 deletions HISTORY.md
Expand Up @@ -4,6 +4,7 @@
- Support gzipped cellular barcode files.
- Support 10x V2 barcoding scheme. Thanks to @tomasgomes for the fix.
- Re-enable streaming for cellular barcode filtering.
- Add umi_matrix option to fasttagcount. This outputs a non-umi-deduped matrix of counts, useful for QC.

## 0.8.0
- Fix `fasttagcount` off-by-one issue.
Expand Down
Binary file modified examples/tagcount/tagcount.bam
Binary file not shown.
Binary file modified examples/tagcount/tagcount.bam.bai
Binary file not shown.
1 change: 1 addition & 0 deletions examples/tagcount/tagcount.sam
Expand Up @@ -4,3 +4,4 @@
NB500929:118:HHL3MBGXY:2:23110:12155:17144:CELL_GATAACCATC-GTTACCGC:UMI_ACAATC:SAMPLE_GGGTTT 0 ENSDART00000163675.1 269 255 36M * 0 0 ATGGCATAAAGCTGAAAGAAATAAAGGAGGAACACG AAAAAEAEEEEEEEEEEEEEEEEEEEEEEEE/EEEE NH:i:1
NB500929:118:HHL3MBGXY:2:23110:12155:17145:CELL_GATAACCATA-GTTACCGC:UMI_ACAATA:SAMPLE_GGGTTT 0 ENSDART00000163675.1 269 255 36M * 0 0 ATGGCATAAAGCTGAAAGAAATAAAGGAGGAACACG AAAAAEAEEEEEEEEEEEEEEEEEEEEEEEE/EEEE NH:i:1
NB500929:118:HHL3MBGXY:2:23110:12255:17144:CELL_GATAACCATC-GTTACCGC:UMI_ACAAAA:SAMPLE_GGGTTT 0 ENSDART00000163675.1 269 255 36M * 0 0 ATGGCATAAAGCTGAAAGAAATAAAGGAGGAACACG AAAAAEAEEEEEEEEEEEEEEEEEEEEEEEE/EEEE NH:i:1
NB500929:118:HHL3MBGXY:2:23110:12255:17146:CELL_GATAACCATC-GTTACCGC:UMI_ACAAAA:SAMPLE_GGGTTT 0 ENSDART00000163675.1 269 255 36M * 0 0 ATGGCATAAAGCTGAAAGAAATAAAGGAGGAACACG AAAAAEAEEEEEEEEEEEEEEEEEEEEEEEE/EEEE NH:i:1
1 change: 1 addition & 0 deletions test.sh
Expand Up @@ -167,6 +167,7 @@ umis cb_filter \
umis fasttagcount \
--cb_cutoff 1 \
--cb_histogram examples/tagcount/cb-histogram.txt.gz \
--umi_matrix tests/results/test22-fasttagcount-umi-matrix.txt \
examples/tagcount/tagcount.bam \
tests/results/test22-fasttagcount-cbhistogram.txt

Expand Down
3 changes: 3 additions & 0 deletions tests/correct/test22-fasttagcount-umi-matrix.txt
@@ -0,0 +1,3 @@
gene,GATAACCATC-GTTACCGC
ENSDART00000163675.1,3
ENSDART00000163675.2,0
35 changes: 28 additions & 7 deletions umis/umis.py
Expand Up @@ -697,7 +697,6 @@ def tagcount(sam, out, genemap, output_evidence_table, positional, minevidence,
@click.option('--cb_cutoff', default=None,
help=("Number of counts to filter cellular barcodes. Set to "
"'auto' to calculate a cutoff automatically."))
@click.option('--no_scale_evidence', default=False, is_flag=True)
@click.option('--subsample', required=False, default=None, type=int)
@click.option('--parse_tags', required=False, is_flag=True,
help=('Parse BAM tags in stead of read name. In this mode '
Expand All @@ -708,8 +707,10 @@ def tagcount(sam, out, genemap, output_evidence_table, positional, minevidence,
'read gene mapping information in stead of the mapping '
'target nane. Useful if e.g. reads have been mapped to '
'genome in stead of transcriptome.'))
@click.option('--umi_matrix', required=False,
help=('Save a sparse matrix of counts without UMI deduping to this file.'))
def fasttagcount(sam, out, genemap, positional, minevidence, cb_histogram,
cb_cutoff, no_scale_evidence, subsample, parse_tags, gene_tags):
cb_cutoff, subsample, parse_tags, gene_tags, umi_matrix):
''' Count up evidence for tagged molecules, this implementation assumes the
alignment file is coordinate sorted
'''
Expand Down Expand Up @@ -799,7 +800,8 @@ def fasttagcount(sam, out, genemap, positional, minevidence, cb_histogram,
sampling_time = time.time() - start_sampling
logger.info('Sampling done - {:.3}s'.format(sampling_time))

evidence = collections.defaultdict(lambda: collections.defaultdict(int))
evidence = collections.defaultdict(lambda: collections.defaultdict(float))
bare_evidence = collections.defaultdict(float)
logger.info('Tallying evidence')
start_tally = time.time()

Expand Down Expand Up @@ -827,6 +829,10 @@ def fasttagcount(sam, out, genemap, positional, minevidence, cb_histogram,
cells = list(cb_hist.index)
targets_seen = set()

if umi_matrix:
bare_evidence_handle = open(umi_matrix, "w")
bare_evidence_handle.write(",".join(["gene"] + cells) + "\n")

with open(out, "w") as out_handle:
out_handle.write(",".join(["gene"] + cells) + "\n")
for gene, transcripts in transcript_map.items():
Expand Down Expand Up @@ -884,10 +890,8 @@ def fasttagcount(sam, out, genemap, positional, minevidence, cb_histogram,
targets_seen.add(target_name)

# Scale evidence by number of hits
if no_scale_evidence:
evidence[CB][MB] += 1.0
else:
evidence[CB][MB] += weigh_evidence(aln.tags)
evidence[CB][MB] += weigh_evidence(aln.tags)
bare_evidence[CB] += weigh_evidence(aln.tags)
kept += 1
transcripts_processed += 1
if not transcripts_processed % 1000:
Expand All @@ -900,9 +904,19 @@ def fasttagcount(sam, out, genemap, positional, minevidence, cb_histogram,
umis = [1 for _, v in evidence[cell].items() if v >= minevidence]
earray.append(str(sum(umis)))
out_handle.write(",".join([gene] + earray) + "\n")
earray = []
if umi_matrix:
for cell in cells:
earray.append(str(int(bare_evidence[cell])))
bare_evidence_handle.write(",".join([gene] + earray) + "\n")

evidence = collections.defaultdict(lambda: collections.defaultdict(int))
bare_evidence = collections.defaultdict(int)
genes_processed += 1

if umi_matrix:
bare_evidence_handle.close()

# fill dataframe with missing values, sort and output
df = pd.read_csv(out, index_col=0, header=0)
targets = pd.Series(index=set(transcript_map.keys()))
Expand All @@ -911,6 +925,13 @@ def fasttagcount(sam, out, genemap, positional, minevidence, cb_histogram,
df = df.sort_index()
df.to_csv(out)

if umi_matrix:
df = pd.read_csv(umi_matrix, index_col=0, header=0)
df = df.reindex(targets.index.values, fill_value=0)
df = df.sort_index()
df.to_csv(umi_matrix)


@click.command()
@click.argument("csv")
@click.argument("sparse")
Expand Down

0 comments on commit adcbd13

Please sign in to comment.