Add umi_matrix option.

This writes a matrix to umi_matrix where duplicate UMI have not been removed, which is helpful for identifying over-amplification issues. The tagcount test files were modified to have a single duplicated UMI to test this functionality. This commit also removes the no_scale_evidence option from fasttagcount, since it should never be used.
vals · Dec 5, 2017 · adcbd13 · adcbd13
1 parent 3be2525
commit adcbd13
Show file tree

Hide file tree

Showing 7 changed files with 34 additions and 7 deletions.
diff --git a/HISTORY.md b/HISTORY.md
@@ -4,6 +4,7 @@
 - Support gzipped cellular barcode files.
 - Support 10x V2 barcoding scheme. Thanks to @tomasgomes for the fix.
 - Re-enable streaming for cellular barcode filtering.
+- Add umi_matrix option to fasttagcount. This outputs a non-umi-deduped matrix of counts, useful for QC.
 
 ## 0.8.0
 - Fix `fasttagcount` off-by-one issue.

diff --git a/examples/tagcount/tagcount.bam b/examples/tagcount/tagcount.bam
diff --git a/examples/tagcount/tagcount.bam.bai b/examples/tagcount/tagcount.bam.bai
diff --git a/examples/tagcount/tagcount.sam b/examples/tagcount/tagcount.sam
@@ -4,3 +4,4 @@
 NB500929:118:HHL3MBGXY:2:23110:12155:17144:CELL_GATAACCATC-GTTACCGC:UMI_ACAATC:SAMPLE_GGGTTT	0	ENSDART00000163675.1	269	255	36M	*	0	0	ATGGCATAAAGCTGAAAGAAATAAAGGAGGAACACG	AAAAAEAEEEEEEEEEEEEEEEEEEEEEEEE/EEEE	NH:i:1
 NB500929:118:HHL3MBGXY:2:23110:12155:17145:CELL_GATAACCATA-GTTACCGC:UMI_ACAATA:SAMPLE_GGGTTT	0	ENSDART00000163675.1	269	255	36M	*	0	0	ATGGCATAAAGCTGAAAGAAATAAAGGAGGAACACG	AAAAAEAEEEEEEEEEEEEEEEEEEEEEEEE/EEEE	NH:i:1
 NB500929:118:HHL3MBGXY:2:23110:12255:17144:CELL_GATAACCATC-GTTACCGC:UMI_ACAAAA:SAMPLE_GGGTTT	0	ENSDART00000163675.1	269	255	36M	*	0	0	ATGGCATAAAGCTGAAAGAAATAAAGGAGGAACACG	AAAAAEAEEEEEEEEEEEEEEEEEEEEEEEE/EEEE	NH:i:1
+NB500929:118:HHL3MBGXY:2:23110:12255:17146:CELL_GATAACCATC-GTTACCGC:UMI_ACAAAA:SAMPLE_GGGTTT	0	ENSDART00000163675.1	269	255	36M	*	0	0	ATGGCATAAAGCTGAAAGAAATAAAGGAGGAACACG	AAAAAEAEEEEEEEEEEEEEEEEEEEEEEEE/EEEE	NH:i:1
diff --git a/test.sh b/test.sh
@@ -167,6 +167,7 @@ umis cb_filter \
 umis fasttagcount \
      --cb_cutoff 1 \
      --cb_histogram examples/tagcount/cb-histogram.txt.gz \
+     --umi_matrix tests/results/test22-fasttagcount-umi-matrix.txt \
      examples/tagcount/tagcount.bam \
     tests/results/test22-fasttagcount-cbhistogram.txt
 

diff --git a/tests/correct/test22-fasttagcount-umi-matrix.txt b/tests/correct/test22-fasttagcount-umi-matrix.txt
@@ -0,0 +1,3 @@
+gene,GATAACCATC-GTTACCGC
+ENSDART00000163675.1,3
+ENSDART00000163675.2,0
diff --git a/umis/umis.py b/umis/umis.py
@@ -697,7 +697,6 @@ def tagcount(sam, out, genemap, output_evidence_table, positional, minevidence,
 @click.option('--cb_cutoff', default=None,
                 help=("Number of counts to filter cellular barcodes. Set to "
                       "'auto' to calculate a cutoff automatically."))
-@click.option('--no_scale_evidence', default=False, is_flag=True)
 @click.option('--subsample', required=False, default=None, type=int)
 @click.option('--parse_tags', required=False, is_flag=True,
                 help=('Parse BAM tags in stead of read name. In this mode '
@@ -708,8 +707,10 @@ def tagcount(sam, out, genemap, output_evidence_table, positional, minevidence,
                       'read gene mapping information in stead of the mapping '
                       'target nane. Useful if e.g. reads have been mapped to '
                       'genome in stead of transcriptome.'))
+@click.option('--umi_matrix', required=False, 
+              help=('Save a sparse matrix of counts without UMI deduping to this file.'))
 def fasttagcount(sam, out, genemap, positional, minevidence, cb_histogram, 
-                 cb_cutoff, no_scale_evidence, subsample, parse_tags, gene_tags):
+                 cb_cutoff, subsample, parse_tags, gene_tags, umi_matrix):
     ''' Count up evidence for tagged molecules, this implementation assumes the
     alignment file is coordinate sorted
     '''
@@ -799,7 +800,8 @@ def fasttagcount(sam, out, genemap, positional, minevidence, cb_histogram,
         sampling_time = time.time() - start_sampling
         logger.info('Sampling done - {:.3}s'.format(sampling_time))
 
-    evidence = collections.defaultdict(lambda: collections.defaultdict(int))
+    evidence = collections.defaultdict(lambda: collections.defaultdict(float))
+    bare_evidence = collections.defaultdict(float)
     logger.info('Tallying evidence')
     start_tally = time.time()
 
@@ -827,6 +829,10 @@ def fasttagcount(sam, out, genemap, positional, minevidence, cb_histogram,
     cells = list(cb_hist.index)
     targets_seen = set()
 
+    if umi_matrix:
+        bare_evidence_handle = open(umi_matrix, "w")
+        bare_evidence_handle.write(",".join(["gene"] + cells) + "\n")
+
     with open(out, "w") as out_handle:
         out_handle.write(",".join(["gene"] + cells) + "\n")
         for gene, transcripts in transcript_map.items():
@@ -884,10 +890,8 @@ def fasttagcount(sam, out, genemap, positional, minevidence, cb_histogram,
                     targets_seen.add(target_name)
 
                     # Scale evidence by number of hits
-                    if no_scale_evidence:
-                        evidence[CB][MB] += 1.0
-                    else:
-                        evidence[CB][MB] += weigh_evidence(aln.tags)
+                    evidence[CB][MB] += weigh_evidence(aln.tags)
+                    bare_evidence[CB] += weigh_evidence(aln.tags)
                     kept += 1
                 transcripts_processed += 1
                 if not transcripts_processed % 1000:
@@ -900,9 +904,19 @@ def fasttagcount(sam, out, genemap, positional, minevidence, cb_histogram,
                 umis = [1 for _, v in evidence[cell].items() if v >= minevidence]
                 earray.append(str(sum(umis)))
             out_handle.write(",".join([gene] + earray) + "\n")
+            earray = []
+            if umi_matrix:
+                for cell in cells:
+                    earray.append(str(int(bare_evidence[cell])))
+                bare_evidence_handle.write(",".join([gene] + earray) + "\n")
+
             evidence = collections.defaultdict(lambda: collections.defaultdict(int))
+            bare_evidence = collections.defaultdict(int)
             genes_processed += 1
 
+    if umi_matrix:
+        bare_evidence_handle.close()
+
     # fill dataframe with missing values, sort and output
     df = pd.read_csv(out, index_col=0, header=0)
     targets = pd.Series(index=set(transcript_map.keys()))
@@ -911,6 +925,13 @@ def fasttagcount(sam, out, genemap, positional, minevidence, cb_histogram,
     df = df.sort_index()
     df.to_csv(out)
 
+    if umi_matrix:
+        df = pd.read_csv(umi_matrix, index_col=0, header=0)
+        df = df.reindex(targets.index.values, fill_value=0)
+        df = df.sort_index()
+        df.to_csv(umi_matrix)
+
+
 @click.command()
 @click.argument("csv")
 @click.argument("sparse")