From adcbd1358f70dcf39375a79a0373c7492e40c45e Mon Sep 17 00:00:00 2001 From: roryk Date: Tue, 5 Dec 2017 16:11:14 -0500 Subject: [PATCH] Add umi_matrix option. This writes a matrix to umi_matrix where duplicate UMI have not been removed, which is helpful for identifying over-amplification issues. The tagcount test files were modified to have a single duplicated UMI to test this functionality. This commit also removes the no_scale_evidence option from fasttagcount, since it should never be used. --- HISTORY.md | 1 + examples/tagcount/tagcount.bam | Bin 343 -> 347 bytes examples/tagcount/tagcount.bam.bai | Bin 104 -> 104 bytes examples/tagcount/tagcount.sam | 1 + test.sh | 1 + .../test22-fasttagcount-umi-matrix.txt | 3 ++ umis/umis.py | 35 ++++++++++++++---- 7 files changed, 34 insertions(+), 7 deletions(-) create mode 100644 tests/correct/test22-fasttagcount-umi-matrix.txt diff --git a/HISTORY.md b/HISTORY.md index fbf1530..16c8e5f 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -4,6 +4,7 @@ - Support gzipped cellular barcode files. - Support 10x V2 barcoding scheme. Thanks to @tomasgomes for the fix. - Re-enable streaming for cellular barcode filtering. +- Add umi_matrix option to fasttagcount. This outputs a non-umi-deduped matrix of counts, useful for QC. ## 0.8.0 - Fix `fasttagcount` off-by-one issue. diff --git a/examples/tagcount/tagcount.bam b/examples/tagcount/tagcount.bam index 081334942dd48f7a3eb8cd54bd3a04cb8f05408b..de7ed71852d26177030a83746c986b21aa63f48c 100644 GIT binary patch delta 193 zcmV;y06zcM0^0(RhkwTa&5+LufUvPsXgxWgKTB1J1WlRVHb2eL8~xk#HUZ z7hFo<8=j`1N@F^W7Evj*YNf4pn^tga>JAmU8yv9No3hi|g*KM#W`qPqbfx)?`ZSp!a!!q_timcm8`L3umY}!t z;cTT&QP9CdAIu--clnrsJz08;UB_na28&&dFQ%+3WRm0}M}QsHwMpATpD!p-qyPzm z04Wi+kY%VUV|q=NNGY;vqpfwDR(NWgo)ozo9;rQ;s@K|umNM-Zj0I(M|b!KT%$s-JX#F2v{cCY6Ac-lmp5D delta 20 acmd1Em>|b!JyBkq-I|dB2$(0@Y6Ac-ECay+ diff --git a/examples/tagcount/tagcount.sam b/examples/tagcount/tagcount.sam index 48f009b..823587d 100644 --- a/examples/tagcount/tagcount.sam +++ b/examples/tagcount/tagcount.sam @@ -4,3 +4,4 @@ NB500929:118:HHL3MBGXY:2:23110:12155:17144:CELL_GATAACCATC-GTTACCGC:UMI_ACAATC:SAMPLE_GGGTTT 0 ENSDART00000163675.1 269 255 36M * 0 0 ATGGCATAAAGCTGAAAGAAATAAAGGAGGAACACG AAAAAEAEEEEEEEEEEEEEEEEEEEEEEEE/EEEE NH:i:1 NB500929:118:HHL3MBGXY:2:23110:12155:17145:CELL_GATAACCATA-GTTACCGC:UMI_ACAATA:SAMPLE_GGGTTT 0 ENSDART00000163675.1 269 255 36M * 0 0 ATGGCATAAAGCTGAAAGAAATAAAGGAGGAACACG AAAAAEAEEEEEEEEEEEEEEEEEEEEEEEE/EEEE NH:i:1 NB500929:118:HHL3MBGXY:2:23110:12255:17144:CELL_GATAACCATC-GTTACCGC:UMI_ACAAAA:SAMPLE_GGGTTT 0 ENSDART00000163675.1 269 255 36M * 0 0 ATGGCATAAAGCTGAAAGAAATAAAGGAGGAACACG AAAAAEAEEEEEEEEEEEEEEEEEEEEEEEE/EEEE NH:i:1 +NB500929:118:HHL3MBGXY:2:23110:12255:17146:CELL_GATAACCATC-GTTACCGC:UMI_ACAAAA:SAMPLE_GGGTTT 0 ENSDART00000163675.1 269 255 36M * 0 0 ATGGCATAAAGCTGAAAGAAATAAAGGAGGAACACG AAAAAEAEEEEEEEEEEEEEEEEEEEEEEEE/EEEE NH:i:1 diff --git a/test.sh b/test.sh index ae6d69a..6cddf45 100644 --- a/test.sh +++ b/test.sh @@ -167,6 +167,7 @@ umis cb_filter \ umis fasttagcount \ --cb_cutoff 1 \ --cb_histogram examples/tagcount/cb-histogram.txt.gz \ + --umi_matrix tests/results/test22-fasttagcount-umi-matrix.txt \ examples/tagcount/tagcount.bam \ tests/results/test22-fasttagcount-cbhistogram.txt diff --git a/tests/correct/test22-fasttagcount-umi-matrix.txt b/tests/correct/test22-fasttagcount-umi-matrix.txt new file mode 100644 index 0000000..bb5a26e --- /dev/null +++ b/tests/correct/test22-fasttagcount-umi-matrix.txt @@ -0,0 +1,3 @@ +gene,GATAACCATC-GTTACCGC +ENSDART00000163675.1,3 +ENSDART00000163675.2,0 diff --git a/umis/umis.py b/umis/umis.py index 4df3027..c0c8e78 100644 --- a/umis/umis.py +++ b/umis/umis.py @@ -697,7 +697,6 @@ def tagcount(sam, out, genemap, output_evidence_table, positional, minevidence, @click.option('--cb_cutoff', default=None, help=("Number of counts to filter cellular barcodes. Set to " "'auto' to calculate a cutoff automatically.")) -@click.option('--no_scale_evidence', default=False, is_flag=True) @click.option('--subsample', required=False, default=None, type=int) @click.option('--parse_tags', required=False, is_flag=True, help=('Parse BAM tags in stead of read name. In this mode ' @@ -708,8 +707,10 @@ def tagcount(sam, out, genemap, output_evidence_table, positional, minevidence, 'read gene mapping information in stead of the mapping ' 'target nane. Useful if e.g. reads have been mapped to ' 'genome in stead of transcriptome.')) +@click.option('--umi_matrix', required=False, + help=('Save a sparse matrix of counts without UMI deduping to this file.')) def fasttagcount(sam, out, genemap, positional, minevidence, cb_histogram, - cb_cutoff, no_scale_evidence, subsample, parse_tags, gene_tags): + cb_cutoff, subsample, parse_tags, gene_tags, umi_matrix): ''' Count up evidence for tagged molecules, this implementation assumes the alignment file is coordinate sorted ''' @@ -799,7 +800,8 @@ def fasttagcount(sam, out, genemap, positional, minevidence, cb_histogram, sampling_time = time.time() - start_sampling logger.info('Sampling done - {:.3}s'.format(sampling_time)) - evidence = collections.defaultdict(lambda: collections.defaultdict(int)) + evidence = collections.defaultdict(lambda: collections.defaultdict(float)) + bare_evidence = collections.defaultdict(float) logger.info('Tallying evidence') start_tally = time.time() @@ -827,6 +829,10 @@ def fasttagcount(sam, out, genemap, positional, minevidence, cb_histogram, cells = list(cb_hist.index) targets_seen = set() + if umi_matrix: + bare_evidence_handle = open(umi_matrix, "w") + bare_evidence_handle.write(",".join(["gene"] + cells) + "\n") + with open(out, "w") as out_handle: out_handle.write(",".join(["gene"] + cells) + "\n") for gene, transcripts in transcript_map.items(): @@ -884,10 +890,8 @@ def fasttagcount(sam, out, genemap, positional, minevidence, cb_histogram, targets_seen.add(target_name) # Scale evidence by number of hits - if no_scale_evidence: - evidence[CB][MB] += 1.0 - else: - evidence[CB][MB] += weigh_evidence(aln.tags) + evidence[CB][MB] += weigh_evidence(aln.tags) + bare_evidence[CB] += weigh_evidence(aln.tags) kept += 1 transcripts_processed += 1 if not transcripts_processed % 1000: @@ -900,9 +904,19 @@ def fasttagcount(sam, out, genemap, positional, minevidence, cb_histogram, umis = [1 for _, v in evidence[cell].items() if v >= minevidence] earray.append(str(sum(umis))) out_handle.write(",".join([gene] + earray) + "\n") + earray = [] + if umi_matrix: + for cell in cells: + earray.append(str(int(bare_evidence[cell]))) + bare_evidence_handle.write(",".join([gene] + earray) + "\n") + evidence = collections.defaultdict(lambda: collections.defaultdict(int)) + bare_evidence = collections.defaultdict(int) genes_processed += 1 + if umi_matrix: + bare_evidence_handle.close() + # fill dataframe with missing values, sort and output df = pd.read_csv(out, index_col=0, header=0) targets = pd.Series(index=set(transcript_map.keys())) @@ -911,6 +925,13 @@ def fasttagcount(sam, out, genemap, positional, minevidence, cb_histogram, df = df.sort_index() df.to_csv(out) + if umi_matrix: + df = pd.read_csv(umi_matrix, index_col=0, header=0) + df = df.reindex(targets.index.values, fill_value=0) + df = df.sort_index() + df.to_csv(umi_matrix) + + @click.command() @click.argument("csv") @click.argument("sparse")