In [1]:
# NO QTAG ERRORS ALLOWED

"""
updated 2016-01-22 for csv mice, includes filtering
"""
import numpy as np
import pandas as pd
import regex
import os
import sys
import gzip
from itertools import izip
import sqlalchemy as sqla
# import ipcluster as ipp

In [2]:
# --- Run configuration ------------------------------------------------------
# Label embedded in the names of the SQLite database, the Excel workbook and
# the statistics CSV written by this notebook.
EXPERIMENT = "2016-08-04-nates1"
# Directories walked recursively for paired FASTQ files.
INPUT_DIRECTORIES = ["../data/nate"]
# Destination directory for every output file.
OUTPUT_DIR = "../output"
# CSV holding the qtag lookup table (first column: id, second column: sequence).
QTAG_CSV = "../helpers/qtags_var.csv"

# Motifs searched (case-insensitively) in the first read of each pair; the
# captured groups are concatenated to form the gtag and the mcount.
GTAG_MOTIF = "CGA([ACTG]{3})C([ACTG]{4})AATTCGATGG"
MCOUNT_MOTIF = "C([ACTG]{3})C([ACTG]{3})C([ACTG]{3})GCGCAACGCG"
# File-name pattern for read files: group 1 is the index name, group 2 the
# read number.
# NOTE(review): this is not a raw string; the backslash escapes happen to pass
# through unchanged, but an r"..." literal would be the safer spelling.
INDEX_MOTIF = "(.+)_S\d{1,3}_L\d{3}_R(\d)_\d{3}\.fastq\.gz"


In [3]:
def load_qtags(qtag_csv):
    """Load the qtag lookup table from a CSV file.

    The first CSV column is taken as the qtag id and the single remaining
    column as the qtag sequence.

    Parameters
    ----------
    qtag_csv : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by the upper-cased sequence (index name 'seq') with one
        column, 'qid', holding the id prefixed with 'q' (e.g. 'q12').

    Raises
    ------
    Exception
        Any error raised while reading the file is re-raised after the
        diagnostic message has been printed, so the run really aborts here
        instead of failing later on a None lookup table.
    """
    try:
        # read_csv(index_col=0) replaces DataFrame.from_csv, which current
        # pandas releases no longer provide. Date parsing of the index (a
        # from_csv default) is deliberately not requested: the ids are used
        # as identifiers, never as dates.
        lookup = pd.read_csv(qtag_csv, index_col=0)
    except Exception as e:
        print("Cannot load qtag file at %s.\nAborting with Exception: %s."%(qtag_csv,e))
        raise

    lookup.index.name = 'qid'
    lookup.columns = ['seq']
    lookup['seq'] = lookup['seq'].str.upper()
    # Swap the roles: sequence becomes the index, the id becomes a column.
    lookup = lookup.reset_index().set_index('seq')
    lookup['qid'] = lookup['qid'].apply(lambda x: 'q'+str(x))
    return lookup

In [4]:
def make_rexs(gtag_motif, mcount_motif, qtags):
    """Compile the three case-insensitive patterns used to scan reads.

    Parameters
    ----------
    gtag_motif, mcount_motif : str
        Pattern sources for the gtag and mcount motifs.
    qtags : pandas.DataFrame
        Lookup table indexed by sequence with a 'qid' column; every row
        becomes one named alternative ``(?P<qid>sequence)`` of the qtag
        pattern.

    Returns
    -------
    dict
        Compiled patterns under the keys 'q' (qtags), 'g' (gtag) and
        'm' (mcount).
    """
    alternatives = []
    for sequence, row in qtags.iterrows():
        alternatives.append('(?P<%s>%s)' % (row.qid, sequence))

    ignore_case = regex.I
    return {
        'q': regex.compile("|".join(alternatives), flags=ignore_case),
        'g': regex.compile(gtag_motif, flags=ignore_case),
        'm': regex.compile(mcount_motif, flags=ignore_case),
    }

In [10]:
def init_indexes(root, rexs):
    """Discover paired read files below `root` and wrap each pair in an Index.

    File names are matched against INDEX_MOTIF; capture group 1 is the index
    name and group 2 the read number (1 or 2).

    Parameters
    ----------
    root : str
        Directory to walk recursively. If it is not a directory, no files
        are examined and an empty dict is returned.
    rexs : dict
        Compiled patterns handed through to every Index.

    Returns
    -------
    dict
        Maps index name to an Index built from ``[read-1 path, read-2 path]``;
        a read whose file was not found is left as the empty string.
    """
    indexes = {}
    if os.path.isdir(root):
        for directory, sub, files in os.walk(root):
            for f in files:
                term = regex.search(INDEX_MOTIF, f)
                if not term:
                    continue
                idx, read = term.groups()
                # Compare the captured index name: term[0] is the entire
                # matched file name and therefore never equals 'Undetermined'.
                if idx == 'Undetermined':
                    continue
                slot = int(read) - 1
                if slot not in (0, 1):
                    # Only reads 1 and 2 have a slot; anything else would
                    # index outside the pair (or silently hit slot -1).
                    continue
                indexes.setdefault(idx, ["", ""])[slot] = os.path.join(directory, f)
    # Replace each path pair by its Index object (keys are unchanged, so
    # assigning while iterating over a snapshot of the keys is safe).
    for idx in list(indexes):
        indexes[idx] = Index(idx, indexes[idx], rexs)
    return indexes

In [6]:
class Counts(object):
    """Read tallies of one index together with their DataFrame views.

    Attributes
    ----------
    idx : index name; also used as SQL table name and Excel sheet name.
    counts : dict mapping a (qtag, gtag, mcount) key to a list with one
        [gscore, mscore] entry per read that produced this key. Missing
        tags/scores are the string 'None'.
    df : per-read DataFrame, created by convert_save_df().
    qgcounts : per-(qtag, gtag) summary, created by consolidate_filter().
    """

    # Smallest ord() a quality character may have for a read to pass the
    # filter. NOTE(review): presumably Phred+33 encoding, i.e. Q30 - confirm.
    MIN_QUALITY_ORD = 63

    def __init__(self, idx, counts):
        self.idx = idx
        self.counts = counts

    @staticmethod
    def convert_generator(datadict):
        """Yield one (row number, qtag, gtag, mcount, score) tuple per read.

        `score` is the gtag and mcount quality strings concatenated, or the
        string 'None' when either of the two is missing.
        """
        i = 0
        for key in datadict:
            q, g, m = key
            for kscore in datadict[key]:
                if kscore[0] != 'None' and kscore[1] != 'None':
                    score = kscore[0] + kscore[1]
                else:
                    score = 'None'
                yield (i, q, g, m, score)
                i += 1

    @staticmethod
    def get_read_counts(df, q, g, m):
        """Count rows of `df` whose tag presence matches the three flags.

        A true flag requires the corresponding column ('qtag', 'gtag',
        'mcount') to differ from 'None'; a false flag requires it to equal
        'None'.
        """
        mask = None
        for column, wanted in zip(('qtag', 'gtag', 'mcount'), (q, g, m)):
            present = (df[column] != 'None') if wanted else (df[column] == 'None')
            mask = present if mask is None else (mask & present)
        return len(df.loc[mask])

    def convert_save_df(self):
        """Build self.df (one row per read) from self.counts; returns self."""
        # Passing the column names to the constructor (instead of assigning
        # them afterwards) keeps this working when there are no reads at all.
        self.df = pd.DataFrame(list(self.convert_generator(self.counts)),
                               columns=['index','qtag','gtag','mcount','score'])
        return self

    def filter_reads(self):
        """Add the 0/1 'passed' column, then drop reads without a qtag.

        A read passes when it has a score and every character of that score
        has ord() >= MIN_QUALITY_ORD. Returns self.
        """
        threshold = self.MIN_QUALITY_ORD

        def classify_score(score):
            if score == 'None':
                return 0
            return 1 if min(ord(ch) for ch in score) >= threshold else 0

        # Column-wise map instead of a row-wise apply: same result, and it
        # also behaves on an empty frame.
        self.df['passed'] = self.df['score'].map(classify_score)
        self.df = self.df.loc[self.df.qtag!='None']
        return self

    def export_to_db(self, engine, if_exists='replace'):
        """Write self.df to the SQL table named self.idx via `engine`."""
        self.df.to_sql(self.idx, engine, if_exists=if_exists)
        return

    def consolidate_filter(self, writer):
        """Summarise passed reads per (qtag, gtag) and write them to `writer`.

        Passed reads are first counted per (qtag, gtag, mcount); those counts
        are then aggregated per (qtag, gtag) into 'reads' (their sum) and
        'molecs' (how many distinct mcounts there are), sorted by 'molecs'
        in descending order. The result is stored in self.qgcounts and
        written to the sheet named self.idx. When no read passed,
        self.qgcounts is an empty DataFrame and nothing is written.
        Returns self.
        """
        qgm_counts = pd.pivot_table(self.df.loc[self.df['passed']>0], 
                                     index=['qtag','gtag','mcount'], 
                                     values='passed', aggfunc=sum)
        if len(qgm_counts) < 1:
            self.qgcounts = pd.DataFrame()
            return self
        else:
            qg_counts = pd.pivot_table(pd.DataFrame(qgm_counts).reset_index(), 
                                       index=['qtag','gtag'], 
                                       values='passed', aggfunc=[sum, len])
            qg_counts.rename(columns={'len':'molecs','sum':'reads'}, inplace=True)
            qg_counts.reset_index(inplace=True)
            qg_counts.sort_values(by='molecs',ascending=False, inplace=True)
            self.qgcounts = qg_counts
            qg_counts.to_excel(writer, self.idx)
            return self

    def get_stats(self):
        """Return read statistics for this index as a dict.

        The numbers are tallied from self.counts rather than self.df:
        filter_reads() removes every read without a qtag from self.df, so
        statistics taken from the frame afterwards would under-report the
        total and always report 0 for all the "no qtag" categories.
        """
        reads_by_presence = {}   # (has qtag, has gtag, has mcount) -> reads
        complete_keys = 0        # distinct keys carrying all three tags
        total_reads = 0
        for key, scores in self.counts.items():
            presence = tuple(tag != 'None' for tag in key)
            n_reads = len(scores)
            total_reads += n_reads
            reads_by_presence[presence] = reads_by_presence.get(presence, 0) + n_reads
            if all(presence):
                complete_keys += 1

        def reads(q, g, m):
            return reads_by_presence.get((q, g, m), 0)

        idxstats = {
            'total reads': total_reads,
            'mcounts with qtag, gtag and mcount': complete_keys,
            'reads with qtag, gtag and mcount': reads(True, True, True),
            'reads with only no qtag': reads(False, True, True),
            'reads with only no gtag': reads(True, False, True),
            'reads with only no mcount': reads(True, True, False),
            'reads with only mcount': reads(False, False, True),
            'reads with only barcode': reads(False, True, False),
            'reads with only qtag': reads(True, False, False),
            'reads with no qtag, barcode or mcount': reads(False, False, False)
        }

        return idxstats


In [7]:
class Index(object):
    """One sequencing index and its pair of gzipped read files.

    Attributes
    ----------
    idx : index name.
    file0, file1 : paths of the first and second read file.
    tname : idx with every non-alphanumeric character removed.
    rexs : dict of compiled patterns under the keys 'q', 'g' and 'm'.
    """

    def __init__(self, idx, reads, rexs):
        self.idx = idx
        self.file0, self.file1 = reads[:2]
        self.tname = regex.sub('[^0-9a-zA-Z]+',"",idx)
        self.rexs = rexs

    def search_read(self, chunk):
        """Extract qtag, gtag and mcount from one read pair.

        Parameters
        ----------
        chunk : sequence
            ``[(seq0, seq1), (qual0, qual1)]`` - the sequence lines and the
            quality lines of the two reads.

        Returns
        -------
        (key, scores)
            key is ``(qtag, gtag, mcount)`` and scores is
            ``[gscore, mscore]``. The qtag is the name of the matching group
            of the 'q' pattern searched in seq1; gtag and mcount are the
            concatenated groups of the 'g' and 'm' patterns searched in seq0,
            and their scores are the slices of qual0 spanned by those
            matches. Anything not found is the string 'None'.
        """
        seq0, seq1 = chunk[0]
        qs0, _qs1 = chunk[1]   # the second read's qualities are not used

        # search for motifs in reads
        q = self.rexs['q'].search(seq1)
        g = self.rexs['g'].search(seq0)
        m = self.rexs['m'].search(seq0)

        # extract sequences and the matching stretches of the quality string
        qtag = q.lastgroup if q else 'None'
        gtag, gscore = 'None', 'None'
        mcount, mscore = 'None', 'None'
        if g:
            gtag = "".join(g.groups())
            gscore = qs0[g.start():g.end()]
        if m:
            mcount = "".join(m.groups())
            mscore = qs0[m.start():m.end()]

        # construct key and scores for handoff
        return (qtag, gtag, mcount), [gscore, mscore]

    def iterreads(self, read0, read1):
        """Walk two line iterables in lockstep and tally every read pair.

        Lines are consumed in records of four: the second line of a record
        is taken as the sequence and the fourth as the quality string.
        Iteration stops when the shorter input is exhausted.

        Returns
        -------
        dict
            Maps each ``(qtag, gtag, mcount)`` key to the list of
            ``[gscore, mscore]`` entries of the reads that produced it.
        """
        counts = {}
        sequences = ()
        for n, (r0, r1) in enumerate(izip(read0, read1)):
            position = n % 4
            if position == 1:
                sequences = (r0, r1)
            elif position == 3:
                key, scores = self.search_read([sequences, (r0, r1)])
                counts.setdefault(key, []).append(scores)
        return counts

    def init_search(self, rexs):
        """Open both read files, scan them and return the resulting Counts.

        `rexs` is accepted for interface compatibility only; the patterns
        stored on the instance are the ones used.

        Raises
        ------
        Exception
            If a read file cannot be opened the diagnostic is printed and the
            error re-raised, so the failure surfaces here rather than as an
            attribute error on a None result further down the pipeline.
        """
        handles = []
        try:
            try:
                for path in (self.file0, self.file1):
                    handles.append(gzip.open(path))
            except Exception as e:
                print("Cannot open read files for %s.\nAborting with Exception: %s"%(self.idx,e))
                raise
            counts = self.iterreads(handles[0], handles[1])
        finally:
            # Always release the file handles, including a first file that
            # opened successfully before the second one failed.
            for handle in handles:
                handle.close()
        return Counts(self.idx, counts)

In [13]:
def run(db_name=None):
    """Process every index found in INPUT_DIRECTORIES.

    For each index the reads are searched, converted to a DataFrame,
    filtered, summarised into the Excel workbook
    ``filtered_<EXPERIMENT>.xlsx``, exported to the SQLite database and
    their statistics collected. Progress is written to stdout.

    Parameters
    ----------
    db_name : str, optional
        Base name (without '.db') of the SQLite file inside OUTPUT_DIR.
        Defaults to ``counts_<EXPERIMENT>``.

    Returns
    -------
    (all_counts, stats)
        Two dicts keyed by index name: the Counts object and the statistics
        dict of every processed index.

    Raises
    ------
    Exception
        Any error raised while processing an index is printed and re-raised.
    """
    def log(message):
        sys.stdout.write(message)
        sys.stdout.flush()

    all_counts = {}
    stats = {}
    qtags = load_qtags(QTAG_CSV)
    rexs = make_rexs(GTAG_MOTIF, MCOUNT_MOTIF, qtags)

    if db_name is None:
        db_url = 'sqlite:///%s/counts_%s.db'%(OUTPUT_DIR, EXPERIMENT)
    else:
        db_url = 'sqlite:///%s/%s.db'%(OUTPUT_DIR, db_name)

    engine = sqla.create_engine(db_url)
    writer = pd.ExcelWriter('%s/filtered_%s.xlsx'%(OUTPUT_DIR,EXPERIMENT))
    iterum = 1
    try:
        for directory in INPUT_DIRECTORIES:
            indexes = init_indexes(directory, rexs)
            for i in indexes:
                conn = engine.connect()
                try:
                    log('Starting index %d of %d: %s\n'%(iterum, len(indexes), i))
                    counts = indexes[i].init_search(rexs)
                    log('\t searched: %s\n'%i)
                    try:
                        counts.convert_save_df()
                        counts.filter_reads().consolidate_filter(writer)
                        counts.export_to_db(conn)
                        stats[i] = counts.get_stats()
                        log('\t complete.\n')
                    except Exception as e:
                        print(e)
                        raise
                finally:
                    # Release the connection even when this index failed.
                    conn.close()
                iterum += 1
                all_counts[i] = counts
        # close() writes the workbook to disk; unlike save() it is available
        # in both older and current pandas releases.
        writer.close()
    finally:
        engine.dispose()
    log('Job complete\n')
    return all_counts, stats


In [14]:
# Run the whole pipeline with the default database name and keep the
# per-index Counts objects and statistics for the cells below.
data_counts, data_stats = run()


Starting index 1 of 16: NH003
	 searched: NH003
	 complete.
Starting index 2 of 16: NH002
	 searched: NH002
	 complete.
Starting index 3 of 16: NH001
	 searched: NH001
	 complete.
Starting index 4 of 16: NH007
	 searched: NH007
	 complete.
Starting index 5 of 16: NH006
	 searched: NH006
	 complete.
Starting index 6 of 16: NH005
	 searched: NH005
	 complete.
Starting index 7 of 16: NH004
	 searched: NH004
	 complete.
Starting index 8 of 16: NH009
	 searched: NH009
	 complete.
Starting index 9 of 16: NH008
	 searched: NH008
	 complete.
Starting index 10 of 16: NH010
	 searched: NH010
	 complete.
Starting index 11 of 16: NH096
	 searched: NH096
	 complete.
Starting index 12 of 16: NH075
	 searched: NH075
	 complete.
Starting index 13 of 16: NH120
	 searched: NH120
	 complete.
Starting index 14 of 16: NH025
	 searched: NH025
	 complete.
Starting index 15 of 16: NH125
	 searched: NH125
	 complete.
Starting index 16 of 16: NH144
	 searched: NH144
	 complete.
Job complete


In [15]:
# data_stats maps index name -> {statistic: value}; from_dict puts the index
# names in the columns, so transpose to get one row per index before saving.
pd.DataFrame.from_dict(data_stats).T.to_csv("%s/%s_stats.csv"%(OUTPUT_DIR,EXPERIMENT))

In [11]:
# writer = pd.ExcelWriter('filtered.xlsx')
# for idx in c:
#     cidx = Counts(c[idx].idx, c[idx].counts)
#     cidx.df = c[idx].df
#     cidx.consolidate_filter(writer)
# #     c[idx].qgcounts.to_excel(writer, idx)
#     print idx
# writer.save()