From adcbd1358f70dcf39375a79a0373c7492e40c45e Mon Sep 17 00:00:00 2001
From: roryk <rory.kirchner@gmail.com>
Date: Tue, 5 Dec 2017 16:11:14 -0500
Subject: [PATCH] Add umi_matrix option.

This writes a matrix to umi_matrix where duplicate UMI have not been
removed, which is helpful for identifying over-amplification issues.

The tagcount test files were modified to have a single duplicated
UMI to test this functionality.

This commit also removes the no_scale_evidence option from
fasttagcount, since it should never be used.
---
 HISTORY.md                                    |   1 +
 examples/tagcount/tagcount.bam                | Bin 343 -> 347 bytes
 examples/tagcount/tagcount.bam.bai            | Bin 104 -> 104 bytes
 examples/tagcount/tagcount.sam                |   1 +
 test.sh                                       |   1 +
 .../test22-fasttagcount-umi-matrix.txt        |   3 ++
 umis/umis.py                                  |  35 ++++++++++++++----
 7 files changed, 34 insertions(+), 7 deletions(-)
 create mode 100644 tests/correct/test22-fasttagcount-umi-matrix.txt
diff --git a/HISTORY.md b/HISTORY.md
index fbf1530..16c8e5f 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -4,6 +4,7 @@
 - Support gzipped cellular barcode files.
 - Support 10x V2 barcoding scheme. Thanks to @tomasgomes for the fix.
 - Re-enable streaming for cellular barcode filtering.
+- Add umi_matrix option to fasttagcount. This outputs a non-umi-deduped matrix of counts, useful for QC.
 
 ## 0.8.0
 - Fix `fasttagcount` off-by-one issue.
diff --git a/examples/tagcount/tagcount.bam b/examples/tagcount/tagcount.bam
index 081334942dd48f7a3eb8cd54bd3a04cb8f05408b..de7ed71852d26177030a83746c986b21aa63f48c 100644
GIT binary patch
delta 193
zcmV;y06zcM0^0(RhkwTa&5+Luf<O?5*Uk}ac<`vO*T7xf$~iTzvI?sNZBU2sS%Tim
zhqIMBML~xMKbSwv@A5G{c`)|~xeWEv4Z>UvPsXgxWgKTB1J1WlRVHb2eL8~xk#HUZ
z7hFo<8=j`1N@F^W7Evj*YNf4pn^tga>JAmU8yv9No3hi|g*KM#W`qPqbfx)?`ZS<E
v7Bphv2n*Q<7GrBG<@LYRmoNJGMjxlAFTVdiPFR2acl|q72Lr79lQ;n%rMFss

delta 189
zcmV;u07C!U0@nhNhkwHW&5+Luf<O?5*Uk}ac<`v;HE>p!a!!q_timcm8`L3umY}!t
z;cTT&QP9CdAIu--clnrsJz08;UB_na28&&dFQ%+3WRm0}M}QsHwMpATpD!p-qyPzm
z04Wi+kY%VUV|q=NNGY;vqpfwDR(NWgo)ozo9;rQ;s@K|umNM-Zj0I(M<Heo(Jm5Z+
rJc`4KIE=rrnp#&YZ<teGzUk8kefmFr@%#5_Ui}9?NT=C7lRN<*@c&zb

diff --git a/examples/tagcount/tagcount.bam.bai b/examples/tagcount/tagcount.bam.bai
index f49e2ee0645d32514f5587d7a8815d2a23f2c3dc..d7f1548a24f140379f556b66a3f1bdc740340383 100644
GIT binary patch
delta 20
acmd1Em>|b!KT%$s-JX#F2v{cCY6Ac-lmp5D

delta 20
acmd1Em>|b!JyBkq-I|dB2$(0@Y6Ac-ECay+

diff --git a/examples/tagcount/tagcount.sam b/examples/tagcount/tagcount.sam
index 48f009b..823587d 100644
--- a/examples/tagcount/tagcount.sam
+++ b/examples/tagcount/tagcount.sam
@@ -4,3 +4,4 @@
 NB500929:118:HHL3MBGXY:2:23110:12155:17144:CELL_GATAACCATC-GTTACCGC:UMI_ACAATC:SAMPLE_GGGTTT	0	ENSDART00000163675.1	269	255	36M	*	0	0	ATGGCATAAAGCTGAAAGAAATAAAGGAGGAACACG	AAAAAEAEEEEEEEEEEEEEEEEEEEEEEEE/EEEE	NH:i:1
 NB500929:118:HHL3MBGXY:2:23110:12155:17145:CELL_GATAACCATA-GTTACCGC:UMI_ACAATA:SAMPLE_GGGTTT	0	ENSDART00000163675.1	269	255	36M	*	0	0	ATGGCATAAAGCTGAAAGAAATAAAGGAGGAACACG	AAAAAEAEEEEEEEEEEEEEEEEEEEEEEEE/EEEE	NH:i:1
 NB500929:118:HHL3MBGXY:2:23110:12255:17144:CELL_GATAACCATC-GTTACCGC:UMI_ACAAAA:SAMPLE_GGGTTT	0	ENSDART00000163675.1	269	255	36M	*	0	0	ATGGCATAAAGCTGAAAGAAATAAAGGAGGAACACG	AAAAAEAEEEEEEEEEEEEEEEEEEEEEEEE/EEEE	NH:i:1
+NB500929:118:HHL3MBGXY:2:23110:12255:17146:CELL_GATAACCATC-GTTACCGC:UMI_ACAAAA:SAMPLE_GGGTTT	0	ENSDART00000163675.1	269	255	36M	*	0	0	ATGGCATAAAGCTGAAAGAAATAAAGGAGGAACACG	AAAAAEAEEEEEEEEEEEEEEEEEEEEEEEE/EEEE	NH:i:1
diff --git a/test.sh b/test.sh
index ae6d69a..6cddf45 100644
--- a/test.sh
+++ b/test.sh
@@ -167,6 +167,7 @@ umis cb_filter \
 umis fasttagcount \
      --cb_cutoff 1 \
      --cb_histogram examples/tagcount/cb-histogram.txt.gz \
+     --umi_matrix tests/results/test22-fasttagcount-umi-matrix.txt \
      examples/tagcount/tagcount.bam \
     tests/results/test22-fasttagcount-cbhistogram.txt
 
diff --git a/tests/correct/test22-fasttagcount-umi-matrix.txt b/tests/correct/test22-fasttagcount-umi-matrix.txt
new file mode 100644
index 0000000..bb5a26e
--- /dev/null
+++ b/tests/correct/test22-fasttagcount-umi-matrix.txt
@@ -0,0 +1,3 @@
+gene,GATAACCATC-GTTACCGC
+ENSDART00000163675.1,3
+ENSDART00000163675.2,0
diff --git a/umis/umis.py b/umis/umis.py
index 4df3027..c0c8e78 100644
--- a/umis/umis.py
+++ b/umis/umis.py
@@ -697,7 +697,6 @@ def tagcount(sam, out, genemap, output_evidence_table, positional, minevidence,
 @click.option('--cb_cutoff', default=None,
                 help=("Number of counts to filter cellular barcodes. Set to "
                       "'auto' to calculate a cutoff automatically."))
-@click.option('--no_scale_evidence', default=False, is_flag=True)
 @click.option('--subsample', required=False, default=None, type=int)
 @click.option('--parse_tags', required=False, is_flag=True,
                 help=('Parse BAM tags in stead of read name. In this mode '
@@ -708,8 +707,10 @@ def tagcount(sam, out, genemap, output_evidence_table, positional, minevidence,
                       'read gene mapping information in stead of the mapping '
                       'target nane. Useful if e.g. reads have been mapped to '
                       'genome in stead of transcriptome.'))
+@click.option('--umi_matrix', required=False, 
+              help=('Save a sparse matrix of counts without UMI deduping to this file.'))
 def fasttagcount(sam, out, genemap, positional, minevidence, cb_histogram, 
-                 cb_cutoff, no_scale_evidence, subsample, parse_tags, gene_tags):
+                 cb_cutoff, subsample, parse_tags, gene_tags, umi_matrix):
     ''' Count up evidence for tagged molecules, this implementation assumes the
     alignment file is coordinate sorted
     '''
@@ -799,7 +800,8 @@ def fasttagcount(sam, out, genemap, positional, minevidence, cb_histogram,
         sampling_time = time.time() - start_sampling
         logger.info('Sampling done - {:.3}s'.format(sampling_time))
 
-    evidence = collections.defaultdict(lambda: collections.defaultdict(int))
+    evidence = collections.defaultdict(lambda: collections.defaultdict(float))
+    bare_evidence = collections.defaultdict(float)
     logger.info('Tallying evidence')
     start_tally = time.time()
 
@@ -827,6 +829,10 @@ def fasttagcount(sam, out, genemap, positional, minevidence, cb_histogram,
     cells = list(cb_hist.index)
     targets_seen = set()
 
+    if umi_matrix:
+        bare_evidence_handle = open(umi_matrix, "w")
+        bare_evidence_handle.write(",".join(["gene"] + cells) + "\n")
+
     with open(out, "w") as out_handle:
         out_handle.write(",".join(["gene"] + cells) + "\n")
         for gene, transcripts in transcript_map.items():
@@ -884,10 +890,8 @@ def fasttagcount(sam, out, genemap, positional, minevidence, cb_histogram,
                     targets_seen.add(target_name)
 
                     # Scale evidence by number of hits
-                    if no_scale_evidence:
-                        evidence[CB][MB] += 1.0
-                    else:
-                        evidence[CB][MB] += weigh_evidence(aln.tags)
+                    evidence[CB][MB] += weigh_evidence(aln.tags)
+                    bare_evidence[CB] += weigh_evidence(aln.tags)
                     kept += 1
                 transcripts_processed += 1
                 if not transcripts_processed % 1000:
@@ -900,9 +904,19 @@ def fasttagcount(sam, out, genemap, positional, minevidence, cb_histogram,
                 umis = [1 for _, v in evidence[cell].items() if v >= minevidence]
                 earray.append(str(sum(umis)))
             out_handle.write(",".join([gene] + earray) + "\n")
+            earray = []
+            if umi_matrix:
+                for cell in cells:
+                    earray.append(str(int(bare_evidence[cell])))
+                bare_evidence_handle.write(",".join([gene] + earray) + "\n")
+
             evidence = collections.defaultdict(lambda: collections.defaultdict(int))
+            bare_evidence = collections.defaultdict(int)
             genes_processed += 1
 
+    if umi_matrix:
+        bare_evidence_handle.close()
+
     # fill dataframe with missing values, sort and output
     df = pd.read_csv(out, index_col=0, header=0)
     targets = pd.Series(index=set(transcript_map.keys()))
@@ -911,6 +925,13 @@ def fasttagcount(sam, out, genemap, positional, minevidence, cb_histogram,
     df = df.sort_index()
     df.to_csv(out)
 
+    if umi_matrix:
+        df = pd.read_csv(umi_matrix, index_col=0, header=0)
+        df = df.reindex(targets.index.values, fill_value=0)
+        df = df.sort_index()
+        df.to_csv(umi_matrix)
+
+
 @click.command()
 @click.argument("csv")
 @click.argument("sparse")