Skip to content

Commit

Permalink
Merge pull request #171 from JureZmrzlikar/demux
Browse files Browse the repository at this point in the history
demultiplex: Support barcodes with different sizes
  • Loading branch information
tomazc committed Jan 24, 2018
2 parents e3f7a62 + edd06b0 commit f3a2df6
Show file tree
Hide file tree
Showing 3 changed files with 83 additions and 14 deletions.
4 changes: 2 additions & 2 deletions iCount/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,11 @@
import traceback
import collections

from sphinx.ext.napoleon.docstring import NumpyDocstring

import iCount
from iCount import logger

from sphinx.ext.napoleon.docstring import NumpyDocstring

LOGGER = logging.getLogger(__name__)


Expand Down
30 changes: 18 additions & 12 deletions iCount/demultiplex.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,34 +141,40 @@ def _extract(reads, barcodes, mismatches=1, minimum_length=15):
FastqEntry, experiment ID, randomer
"""
all_barcodes = len(barcodes)
barcode_len = len(barcodes[0])
p2n2i = _make_p2n2i(barcodes)
max_votes = len(p2n2i)
randomer_pos = [i for i in range(barcode_len) if i not in p2n2i]
all_barcodes = len(barcodes)
# Precompute barcode length for each barcode
barcode_len = {i: len(brc) for i, brc in enumerate(barcodes)}
# Precompute max possible votes for each barcode. Equals to the number of valid nucleotides
max_votes = {i: len(brc.replace('N', '')) for i, brc in enumerate(barcodes)}
# Precompute positions of randomer nucleotides for each barcode
randomer_pos = {
i: [j for j, nuc in enumerate(brc) if nuc == 'N'] for i, brc in enumerate(barcodes)}

# Determine experiment ID and extract random barcode for each FASTQ entry.
# Experiment ID si determined by "voting": at each non-randomer position
# `pos` there is nucleotide X. Check which experiments (=barcodes) have X
# at position `pos`. Increment votes for them. Experiment with max votes is
# the winner, but only if votes equal/exceed max_votes - mismatches.
# at position `pos`. Increment votes for them.
for read in iCount.files.fastq.FastqFile(reads).read():
votes = [0] * all_barcodes
for pos, n2i in p2n2i.items():
for exp_id in n2i.get(read.seq[pos], []):
votes[exp_id] += 1
mvotes = max(votes)
if mvotes < max_votes - mismatches: # not recognized as valid barcode

# Count mismatches for each barcode. Experiment with least mismatches is the winner.
xmatches = [max_votes[i] - votes[i] for i, barcode in enumerate(barcodes)]
min_mismatches = min(xmatches)
if min_mismatches > mismatches: # not recognized as valid barcode
experiment_id = -1
randomer = ''
seq = read.seq
qual = read.qual
else: # recognized as valid barcode
experiment_id = votes.index(mvotes)
randomer = ''.join(read.seq[p] for p in randomer_pos)
experiment_id = xmatches.index(min_mismatches)
randomer = ''.join(read.seq[p] for p in randomer_pos[experiment_id])
# Keep only the remainder of the sequence / quality scores
seq = read.seq[barcode_len:]
qual = read.qual[barcode_len:]
seq = read.seq[barcode_len[experiment_id]:]
qual = read.qual[barcode_len[experiment_id]:]

if len(seq) >= minimum_length:
new_fq_entry = iCount.files.fastq.FastqEntry(read.id, seq, '+', qual)
Expand Down
63 changes: 63 additions & 0 deletions iCount/tests/test_demultiplex.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,42 @@ def test_extract_mismatch(self):
self.assertEqual(read.plus, '+')
self.assertEqual(read.qual, data[3])

def test_barcode_size_diff(self):
# One mismatch, second barcode.
barcodes = ['NAAANN', 'NNCCCNN']
adapter = 'CCCCCC'
data = [
'@header1',
'TACATT' + adapter + make_sequence(40, rnd_seed=0),
'+',
make_quality_scores(50, rnd_seed=0) + '!J',
'@header2',
'AACCCTT' + adapter + make_sequence(39, rnd_seed=0),
'+',
make_quality_scores(50, rnd_seed=0) + '!J',
]
fq_fname = get_temp_file_name(extension='fq')
fq_file = iCount.files.fastq.FastqFile(fq_fname, 'wt')
fq_file.write(iCount.files.fastq.FastqEntry(*data[:4]))
fq_file.write(iCount.files.fastq.FastqEntry(*data[4:]))
fq_file.close()

handle = demultiplex._extract(fq_fname, barcodes, mismatches=1)
read1, exp_id1, randomer1 = next(handle)
self.assertEqual(exp_id1, 0)
self.assertEqual(randomer1, 'TTT')
self.assertEqual(read1.id, data[0])
self.assertEqual(read1.seq, data[1][6:])
self.assertEqual(read1.plus, '+')
self.assertEqual(read1.qual, data[3][6:])
read2, exp_id2, randomer2 = next(handle)
self.assertEqual(exp_id2, 1)
self.assertEqual(randomer2, 'AATT')
self.assertEqual(read2.id, data[4])
self.assertEqual(read2.seq, data[5][7:])
self.assertEqual(read2.plus, '+')
self.assertEqual(read2.qual, data[7][7:])


class TestMakeP2n2i(unittest.TestCase):

Expand Down Expand Up @@ -203,6 +239,33 @@ def test_make_p2n2i(self):
}
self.assertEqual(result, expected)

def test_barcode_size_diff(self):
barcodes = [
'NATAN',
'NNNCAGN',
]

result = demultiplex._make_p2n2i(barcodes)
expected = {
1: {
'A': {0}
},
2: {
'T': {0}
},
3: {
'A': {0},
'C': {1},
},
4: {
'A': {1},
},
5: {
'G': {1},
},
}
self.assertEqual(result, expected)


if __name__ == '__main__':
unittest.main()

0 comments on commit f3a2df6

Please sign in to comment.