In [2]:
from __future__ import print_function, division

import csv
import os
import argparse

from itertools import izip
from HTSeq import FastqReader
from collections import Counter

In [18]:
%reload_ext line_profiler
import line_profiler

In [3]:
def bc_dict_seq2id(bc_index_fpath):
    """Load a barcode index file into a ``{barcode_seq: barcode_id}`` dict.

    The file is parsed as tab-delimited text. The first line is treated as
    a header and skipped. In every following non-blank row, column 0 is the
    barcode id (converted with ``int``) and column 1 is the barcode
    sequence.

    Parameters
    ----------
    bc_index_fpath : str
        Path to the tab-delimited barcode index file.

    Returns
    -------
    dict
        Maps barcode sequence (str) to barcode id (int). Empty when the
        file contains no data rows.

    Raises
    ------
    IndexError
        If a non-blank row has fewer than two columns.
    ValueError
        If column 0 of a row cannot be converted to ``int``.
    """
    import sys  # local import: only needed to pick the csv-safe open mode

    # The csv module wants a binary handle on Python 2, but a text handle
    # opened with newline='' on Python 3.
    if sys.version_info[0] >= 3:
        fin = open(bc_index_fpath, 'r', newline='')
    else:
        fin = open(bc_index_fpath, 'rb')

    with fin:
        freader = csv.reader(fin, delimiter='\t')
        # Skip the header row; the default value keeps an empty file from
        # raising StopIteration.
        next(freader, None)
        # csv yields [] for blank lines (e.g. a trailing empty line), which
        # would otherwise fail on row[1]; skip them.
        return {row[1]: int(row[0]) for row in freader if row}


In [27]:
def demultiplexing(read1_fpath, read2_fpath, outdir, bc_dict,
                   len_umi=6, len_bc=6, len_tx=35, bc_qual_min=10,
                   do_bc_rev_complement=False,
                   do_tx_rev_complement=False,
                   verbose=False):
    """
    Demultiplex paired FASTQ reads into one FASTQ file per barcode.

    Reads from ``read1_fpath`` and ``read2_fpath`` are consumed in lockstep.
    For each pair, the first ``len_umi`` bases of read 1 are taken as the
    UMI and the next ``len_bc`` bases as the cell barcode. A pair is
    skipped when read 1 is shorter than ``len_umi + len_bc`` or when the
    minimum of its ``qual`` values over that prefix is below
    ``bc_qual_min``.

    For a barcode present in ``bc_dict``, read 2 is truncated to at most
    ``len_tx`` bases, renamed to ``<first token of its name>:UMI:<umi>``
    and written to ``<outdir>/BC_<id>-<seq>.fastq``. Otherwise both reads
    are written unchanged to ``UNKNOWNBC_R1.fastq`` / ``UNKNOWNBC_R2.fastq``
    in ``outdir``.

    Parameters
    ----------
    read1_fpath : str
        FASTQ file holding the UMI + barcode reads.
    read2_fpath : str
        FASTQ file holding the paired reads that get written out.
    outdir : str
        Directory that receives the output files; it is not created here.
    bc_dict : dict
        Maps barcode sequence to barcode id.
    len_umi, len_bc : int
        Lengths of the UMI and barcode segments at the start of read 1.
    len_tx : int
        Maximum number of bases of read 2 kept for known barcodes.
    bc_qual_min : int
        Minimum accepted ``qual`` value over the UMI + barcode prefix.
    do_bc_rev_complement, do_tx_rev_complement : bool
        Accepted for interface compatibility; not used by this function.
    verbose : bool
        Print the counter before returning.

    Returns
    -------
    collections.Counter
        Counts under the keys 'total', 'qualified', 'saved', 'unknown'
        (each present only if incremented) and 'unqualified'.
    """
    fh_umibc = FastqReader(read1_fpath)
    fh_tx = FastqReader(read2_fpath)

    sample_counter = Counter()
    # Loop-invariant length of the UMI + barcode prefix of read 1.
    len_prefix = len_umi + len_bc

    out_paths = dict()
    for bc_seq, bc_id in bc_dict.items():
        out_paths[bc_seq] = os.path.join(
            outdir, 'BC_{}-{}.fastq'.format(bc_id, bc_seq))
    out_paths['UNKNOWNBC_R1'] = os.path.join(outdir, 'UNKNOWNBC_R1.fastq')
    out_paths['UNKNOWNBC_R2'] = os.path.join(outdir, 'UNKNOWNBC_R2.fastq')

    bc_fhout = dict()
    try:
        # Handles are registered one by one so that the ``finally`` clause
        # closes whatever was opened, even if a later open() fails.
        for key, fpath in out_paths.items():
            bc_fhout[key] = open(fpath, 'wb')

        for read_umibc, read_tx in izip(fh_umibc, fh_tx):
            sample_counter['total'] += 1
            if len(read_umibc) < len_prefix:
                continue

            if min(read_umibc.qual[:len_prefix]) < bc_qual_min:
                continue

            sample_counter['qualified'] += 1
            cell_bc = read_umibc.seq[len_umi:len_prefix]
            umi = read_umibc.seq[0:len_umi]

            # Only the barcode lookup decides known vs. unknown; a KeyError
            # raised further down must not be mistaken for an unknown
            # barcode.
            if cell_bc not in bc_dict:
                read_umibc.write_to_fastq_file(bc_fhout['UNKNOWNBC_R1'])
                read_tx.write_to_fastq_file(bc_fhout['UNKNOWNBC_R2'])
                sample_counter['unknown'] += 1
                continue

            if len(read_tx) > len_tx:
                read_tx = read_tx[:len_tx]

            # Keep only the first whitespace-separated token of the name
            # and append the UMI to it.
            read_tx.name = read_tx.name.split()[0] + ':UMI:{}'.format(umi)

            read_tx.write_to_fastq_file(bc_fhout[cell_bc])
            sample_counter['saved'] += 1
    finally:
        for fh in bc_fhout.values():
            fh.close()

    sample_counter['unqualified'] = (sample_counter['total'] -
                                     sample_counter['qualified'])
    if verbose:
        print(sample_counter)
    return sample_counter

# Main Run

In [28]:
# Inputs and output location for the demo run below.
# NOTE: these are absolute, site-specific paths; edit them before running
# the notebook on another machine.

# Tab-delimited barcode index passed to bc_dict_seq2id().
bc_index_fpath = '/ifs/data/yanailab/refs/barcodes/barcodes_cel-seq_umis96.tab'

# Paired FASTQ inputs passed to demultiplexing() as read1 / read2.
r1_fpath = '/ifs/home/yy1533/Lab/cel-seq-pipe/demo/data/7_S1_L001_R1_001.fastq.1M.gz'
r2_fpath = '/ifs/home/yy1533/Lab/cel-seq-pipe/demo/data/7_S1_L001_R2_001.fastq.1M.gz'

# Directory that receives the per-barcode FASTQ files.
outdir = '/ifs/home/yy1533/Lab/cel-seq-pipe/demo/bc_split_Y'

In [29]:
# Barcode lookup table (barcode sequence -> barcode id) for the demo run.
bc_dict = bc_dict_seq2id(bc_index_fpath=bc_index_fpath)

In [30]:
# Line-profile demultiplexing() with the %lprun magic while running it on the demo inputs defined above.
%lprun -f demultiplexing demultiplexing(read1_fpath=r1_fpath, read2_fpath=r2_fpath,\
                         outdir=outdir, bc_dict=bc_dict,\
                         len_umi=6, len_bc=6, len_tx=35, bc_qual_min=10,\
                         do_bc_rev_complement=False, do_tx_rev_complement=False,verbose=False)

2