In [4]:
import HTSeq
HTSeq.__version__

'0.8.0'

In [5]:
import pickle

In [6]:
sam_fpath='/ifs/home/yy1533/Lab/cel-seq-pipe/demo/celseq2/smallsam/BC-15-TCTGAG.sam'
gff_fpath='/ifs/data/yanailab/refs/danio_rerio/danRer10_87/gtf/Danio_rerio.GRCz10.87.gtf.gz'

In [7]:
htseq_fpath='/ifs/home/yy1533/Lab/cel-seq-pipe/demo/celseq2/sam2umicnt/BC-15-TCTGAG.count.bk' ## generated by default htseq-count command

# MWE for single-end sam + GFF => alignments count

<http://htseq.readthedocs.io/en/master/counting.html>

In [8]:
## Prepare GFF
fh_gff = HTSeq.GFF_Reader(gff_fpath)
features = HTSeq.GenomicArrayOfSets( "auto", stranded=True )

In [9]:
feature_atrr = 'gene_id'
feature_type = 'exon'
accept_aln_qual_min = 10
is_gapped_aligner = True ## bowtie2 is False
## Bowtie2: 1) non_gapped_aligner; 2) default report best one for multiple alignment
len_umi = 6
len_bc = 6

In [10]:
i = 0
for gff in fh_gff:
    if gff.type != feature_type:
        continue
    features[gff.iv] += gff.attr[feature_atrr]
    if i % 100000 == 0:
        print('{:,} lines'.format(i))
    i += 1

0 lines
100,000 lines
200,000 lines
300,000 lines
400,000 lines


In [11]:
with open('./test.p3', 'wb') as fout:
    pickle.dump(features, fout)

In [12]:
## sam per cell
from collections import Counter

In [10]:
counts = Counter()
fh_aln = HTSeq.SAM_Reader(sam_fpath)
for aln in fh_aln:
    if not aln.aligned:
        counts["_unmapped"] += 1
        continue
    if aln.aQual < accept_aln_qual_min:
        counts["_low_qual"] += 1
        continue
#     try: # bowtie2 report best one randomly by default
#         if aln.optional_field( "NH" ) > 1:
#             counts['_multimapped'] += 1
#             continue
#     except KeyError:
#         pass
    
    gene_ids = set()
    if is_gapped_aligner:
        for aln_part in aln.cigar:
            if aln_part.type != 'M':
                continue
            for _, gene_id in features[aln_part.ref_iv].steps():
                gene_ids |= gene_id
    else: # bowtie2 is non-gapped-aligner
        for _, gene_id in features[aln.iv].steps():
            gene_ids |= gene_id

    ## union model        
    if len(gene_ids) == 1:
        gene_id = list(gene_ids)[0]
        counts[gene_id] += 1
    elif len(gene_ids) == 0:
        counts["_no_feature"] += 1
    else:
        counts["_ambiguous"] += 1

In [11]:
i = 0
for gene_id in counts:
    print(gene_id, counts[gene_id])
    i += 1
    if i == 10: break

ENSDARG00000041450 1
_unmapped 1562
ENSDARG00000014690 115
_low_qual 2023
ENSDARG00000080337 339
ENSDARG00000036162 7
ENSDARG00000018334 4
_no_feature 2100
ENSDARG00000103791 3
ENSDARG00000032430 1


In [12]:
! grep ENSDARG00000080337 {htseq_fpath}
! grep ENSDARG00000014690 {htseq_fpath}

[01;31m[KENSDARG00000080337[m[K	339
[01;31m[KENSDARG00000014690[m[K	115


## MWE for sam+GFF => UMI count 

In [None]:
fh_aln = HTSeq.SAM_Reader(sam_fpath)

aln = next(iter(fh_aln))

aln.read.name

aln.read.seq

In [16]:
from collections import defaultdict

In [17]:
def _umi_seq(name, length=6):
    ## BC-TCTGAG_UMI-CGTTAC => 
    try:
        out = name.split('_')[1][4:4+length]
    except Exception as e:
        raise(e)
    return(out)
foox='BC-TCTGAG_UMI-CGTTAC'
foo=_umi_seq(foox, 6)
print(foo)
foox='BC-TCTGAG_UMI-AAAAAA'
print(foo)

CGTTAC
CGTTAC


In [18]:
umi_cnt = defaultdict(set)
aln_cnt = Counter()
fh_aln = HTSeq.SAM_Reader(sam_fpath)
i = 0
for aln in fh_aln:
    i += 1
    if not aln.aligned:
        aln_cnt["_unmapped"] += 1
        continue
    if aln.aQual < accept_aln_qual_min:
        aln_cnt["_low_qual"] += 1
        continue
    try: # bowtie2 report best one randomly by default
        if aln.optional_field( "NH" ) > 1:
            aln_cnt['_multimapped'] += 1
            continue
    except KeyError:
        pass
    
    gene_ids = set()
    if is_gapped_aligner:
        for aln_part in aln.cigar:
            if aln_part.type != 'M':
                continue
            for _, gene_id in features[aln_part.ref_iv].steps():
                gene_ids |= gene_id
    else: # bowtie2 is non-gapped-aligner
        for _, gene_id in features[aln.iv].steps():
            gene_ids |= gene_id
    ## union model        
    if len(gene_ids) == 1:
        gene_id = list(gene_ids)[0]
        aln_cnt[gene_id] += 1
        umi_seq = _umi_seq(aln.read.name)
        umi_cnt[gene_id].add(umi_seq)
    elif len(gene_ids) == 0:
        aln_cnt["_no_feature"] += 1
    else:
        aln_cnt["_ambiguous"] += 1        

In [19]:
umi_count = Counter({x : len(umi_cnt.get(x, set())) for x in umi_cnt})

In [20]:
i = 0
for gene_id in umi_count:
    print(gene_id, umi_count[gene_id])
    i += 1
    if i == 10: break

ENSDARG00000041450 1
ENSDARG00000014690 107
ENSDARG00000080337 285
ENSDARG00000036162 7
ENSDARG00000018334 4
ENSDARG00000103791 3
ENSDARG00000032430 1
ENSDARG00000051888 8
ENSDARG00000051975 1
ENSDARG00000024540 9


In [21]:
! grep ENSDARG00000014690 {htseq_fpath}

[01;31m[KENSDARG00000014690[m[K	115


## Test modules



In [None]:
# %%time
# pickle.dump(features, open('test.p', 'wb'))

# %%time
import pickle
features_bk = pickle.load(open('test.p', 'rb'))

In [None]:
type(features_bk) is str

In [None]:
# from celseq2.prepare_annotation_model import cook_anno_model
# features2 = cook_anno_model(gff_fpath, verbose=True)

In [20]:
import pickle
from celseq2.count_umi import count_umi
# gff_pickle='/ifs/home/yy1533/Lab/cel-seq-pipe/demo/celseq2/annotation/Danio_rerio.GRCz10.87.gtf.pickle'
gff_pickle = './test.p3'
features2 = pickle.load(open(gff_pickle, 'rb'))


In [15]:
# features2 = pickle.load(open('./test.p', 'rb'))

In [21]:
umi_count2 = count_umi(sam_fpath='/ifs/home/yy1533/Lab/cel-seq-pipe/demo/celseq2/smallsam/BC-15-TCTGAG.sam',
                       features=features2, len_umi=6, accept_aln_qual_min=10, is_gapped_aligner=False, 
                       dumpto='./test.umi.cnt')

KeyError: 'KN150593.1'

In [45]:
type(umi_count2)

collections.Counter

In [56]:
!ls -la

total 50972
drwxr-s---  4 yy1533 yy1533      350 Jun 29 00:57 .
drwxr-s---  7 yy1533 yy1533      269 Jun 28 15:34 ..
-rw-r-----  1 yy1533 yy1533     3315 Jun 29 00:26 align_bowtie2.snakemake
-rw-r-----  1 yy1533 yy1533     6138 Jun 22 14:53 align_bowtie_snakemake.ipynb
-rw-r-----  1 yy1533 yy1533     5858 Jun 22 14:53 barcode_demo.ipynb
-rw-r-----  1 yy1533 yy1533    10962 Jun 22 14:53 barcode_demo_py3.ipynb
drwxr-s---  2 yy1533 yy1533       55 Jun 28 10:48 .ipynb_checkpoints
-rw-r-----  1 yy1533 yy1533    14877 Jun 29 00:57 low_level_demo_HTSeq.ipynb
drwxr-s--- 14 yy1533 yy1533      361 Jun 29 00:46 .snakemake
-rw-r-----  1 yy1533 yy1533 22992892 Jun 28 11:16 test.p
-rw-r-----  1 yy1533 yy1533 22992892 Jun 29 00:55 test.p2
-rw-r-----  1 yy1533 yy1533    42184 Jun 29 00:57 test.umi.cnt


In [28]:
umi_count2['ENSDARG00000014690']

107

In [29]:
foo = 'xx'
type(foo) == int

False

In [57]:
! ls -l '/ifs/home/yy1533/Lab/cel-seq-pipe/demo/celseq2/annotation/Danio_rerio.GRCz10.87.gtf.pickle'

-rw-r----- 1 yy1533 yy1533 22990722 Jun 28 15:44 /ifs/home/yy1533/Lab/cel-seq-pipe/demo/celseq2/annotation/Danio_rerio.GRCz10.87.gtf.pickle
