In [1]:
# NO QTAG ERRORS ALLOWED

"""
updated 2016-01-22 for csv mice, includes filtering
"""
import numpy as np
import pandas as pd
import regex
import os,sys
import gzip
import sqlalchemy as sqla

In [292]:
EXPERIMENT = "2016-08-04-nates1"
INPUT_DIRECTORIES = ["../data/nate"]
OUTPUT_DIR = "../output"

QTAG_CSV = "../helpers/qtags_var.csv"

GTAG_MOTIF = "CGA(?P<gtag>[ACTG]{3})C(?P<gtag>[ACTG]{4})AATTCGATGG"
MCOUNT_MOTIF = "C(?P<mcount>[ACTG]{3})C(?P<mcount>[ACTG]{3})C(?P<mcount>[ACTG]{3})GCGCAACGCG"
FILE_MOTIF = "(?P<sample>.+)_(?P<sample_barcode>.+)_L(?P<lane>\d{3})_R(?P<read_number>\d)_(?P<set_number>\d{3}).fastq.gz"
READ_REF_DEFAULT = [('q',1),('g',0),('m',0)]

In [3]:
'''TEST (self explanatory i know but just in case)'''
test = '9615-01_S9_L001_R1_001.fastq.gz'

In [279]:
# used only to make regex motifs, but
# not nested to preserve qtag loading functionality if desired
def load_qtags(qtag_csv):
    try:
        qtagdf = pd.DataFrame.from_csv(qtag_csv).reset_index()
        qtagdf.rename(columns={'qtag_seq':'seq', 'qtag_num':'qid'}, inplace=True)
        qtagdf.qid = qtagdf.qid.apply(lambda x: "q%s"%str(x))
        qtagdf.seq = qtagdf.seq.str.upper()
        qtagdf.set_index('seq', inplace=True)
    # TO DO: CHECK FOR DUPLICATE SEQUENCES OR NAMES
    except IOError as e:
        print "Unable to load qtag file, with error:", e
        sys.exit(1)
    return qtagdf


# construct regex motif dict for read search
def make_rexs(qtag_csv):
    # load and construct qtag motif as OR list of each qtag seq (named)
    qtags = load_qtags(qtag_csv)
    qtag_phrases = qtags.apply(lambda x: '(?P<%s>%s)'%(x.qid, x.name) , axis=1)    
    qtag_motif = "|".join( qtag_phrases.values )
    # return compiled motifs for qtag, gtag (barcode), and molec counter, resp.
    return {'q':regex.compile(qtag_motif, flags=regex.I),
            'g':regex.compile(GTAG_MOTIF, flags=regex.I),
            'm':regex.compile(MCOUNT_MOTIF, flags=regex.I)}

In [7]:
# this looks gross but works for now; make pretty later
def get_file_list(root):
    fpath_temp_a = []
    fil_temp_a = []
    # construct list of files and their infodict, as tuples:
    # (i.e. <sample>_<sample_barcode>_L<lane>_R<read_number>_<set_number>)
    for direct, sub, fil in os.walk(root):
        fpaths = np.array( [ "%s/%s"%(direct,f)  for f in fil] )
        to_append = np.array([regex.search(FILE_MOTIF,f) for f in fil ])
        fil_temp_a.append( to_append )
        fpath_temp_a.append(fpaths)
        
    fil_temp_b = np.concatenate(fil_temp_a)
    fpath_temp_b = np.concatenate(fpath_temp_a)
    fil_temp_c = fil_temp_b[np.nonzero(fil_temp_b)]
    fpath_temp_c = fpath_temp_b[np.nonzero(fil_temp_b)]
    files = np.array( [(fp, fil.groupdict()) for (fp, fil) in zip(fpath_temp_c, fil_temp_c)] )
    return files

In [19]:
def init_indexes(root):
    files = get_file_list(root)
    ### FIX :  files list item fmt:  (fpath, fil.str)
    indexes = dict([(f[1]['sample'],["",""]) for f in files])
    for fpath, match in files:
        if match['sample']!='Undetermined':
            # assumes 2 reads (fwd and reverse)
            indexes[match['sample']][int(match['read_number'])-1] = fpath
    if len(indexes) == 0:
        print "Empty index list. No valid files. Please check your input directory and file naming convention."
        sys.exit(1)            
    # convert idx entry list of files to Index object
    for idx, idx_paths in indexes.items():
        indexes[idx] = Index(idx, idx_paths)
    return indexes

In [216]:
# modified opening .gz file with error/exception catching
# 15 aug 2016

# with zip(gzip.open(self.file0), gzip.open(self.file1)) as f0, f1:
def open_gz(fpath):
    try:
        f_gen = gzip.open(fpath)
        return f_gen
    except EnvironmentError as e:
        print '%s "%s". Please check your file and/or directory paths. Skipping index. [EnvironmentError Errno %d]'%(
                e.strerror, e.filename, e.errno)
    except TypeError as e:
        print "TypeError: %s. Skipping index."%e
    except BaseException as e:
        print 'Other error: %s. Skipping index.'%e
    return None


In [293]:
'''TEST'''
REXS = make_rexs(QTAG_CSV)
directory = INPUT_DIRECTORIES[0]
indexes = init_indexes(directory)


In [341]:
'''TEST'''
testi = indexes.values()[1]
testfq = gzip.open(testi.file0)

In [342]:
'''TEST'''
test_seqid = testfq.readline().strip()
test_seq = testfq.readline().strip()
test_qsid = testfq.readline().strip()
test_qs = testfq.readline().strip()
match = regex.search(REXS['g'],test_seq)
extracted = filter(lambda x: len(x[1])>0, match.capturesdict().items())


In [354]:
'''
Updated 15 August 2016 -- need to test all class methods together, 
but otherwise cleaned
'''
class Index(object):
    
    # defining read_ref as instance variable so that
    # if user uses multiple read rexs or refs, changing
    # var won't affect previously defined objects
    
    def __init__(self, idx, fpaths, read_ref=READ_REF_DEFAULT):
        self.idx = idx
        self.file0, self.file1 = fpaths
        self.read_ref = read_ref
        self.tname = regex.sub('[^0-9a-zA-Z]+',"",idx)

    # so ugly i'm cringing but should probably not change it
    # for this v1 version
    def count_reads(self):
        counts = {}
        # such that line 1 is seq, line 3 is qs
        line = 0
        entry_len = 4
        gz0, gz1 = [open_gz(self.file0), open_gz(self.file1)]
        if gz0 and gz1:
            chunk = [(),()]
            for r in zip(gz0, gz1):
                if line==1: chunk[0] = r  # sequence
                elif line==3: chunk[1] = r  # q scores
                if line+1 > entry_len:
                    key,qscores = self.motif_search(chunk[0],chunk[1])
                    counts.setdefault(key,[])
                    counts[key].append(qscores)
                    chunk = []
                    line = -1
                line += 1
                break
        return counts

    def motif_search(self, seqs, qscores):
        keys = dict([(c,None) for c in self.read_ref.keys()]) 
        qs_seqs = ""
        searches = [(comp, regex.search(REXS[comp], seqs[read])) 
                    for comp, read in self.read_ref ]
        for component, result in searches:
            if result:
                extracted = filter(lambda x: len(x[1])>0, 
                                   result.capturesdict().items())
                if len(extracted) == 1:
                    key, seq_matches = extracted[0]
                    sequence = "".join(seq_matches)
                    keys[component] = key
                    qs_seqs += qscores[result.start(), result.end()] 
                else: 
                    print "Error: non-unique sequence"
        return keys, qs_seqs


In [36]:
class Counts(object):
    def __init__(self, idx, counts):
        self.idx = idx
        self.counts = counts
    
    @staticmethod
    def convert_generator(datadict):
        i = 0
        for key in datadict:
            keyscores = datadict[key]
            q, g, m = key
            for kscore in keyscores:
                score = kscore[0]+kscore[1] if kscore[0]!='None' and kscore[1]!='None' else 'None'
                yield (i, q, g, m, score)
                i += 1
    @staticmethod
    def get_read_counts(df, q, g, m):
        qgbbool = []
        inputqgb = [q,g,m]
        tags = ['qtag','gtag','mcount']
        for i in range(len(tags)):
            b = (df[tags[i]] != 'None') if inputqgb[i] else (df[tags[i]] == 'None')
            qgbbool.append(b)
        return len(df.loc[qgbbool[0] & qgbbool[1] & qgbbool[2]])

    def convert_save_df(self):
        countsdf = pd.DataFrame(self.convert_generator(self.counts))
        countsdf.columns = ['index','qtag','gtag','mcount','score']
        self.df = countsdf
        return self
    
    def filter_reads(self):
        def classify_read(row):
            passed = 0
            minscore = np.min([ord(s) for s in row.score]) if row.score != 'None' else 0
            return 1 if minscore >= 63 else 0
        self.df['passed'] = self.df.apply(classify_read,axis=1)
        self.df = self.df.loc[self.df.qtag!='None']
        return self  
    
    def export_to_db(self, engine, if_exists='replace'):
        self.df.to_sql(self.idx, engine, if_exists=if_exists)
        return
    
    def consolidate_filter(self, writer):
        qgm_counts = pd.pivot_table(self.df.loc[self.df['passed']>0], 
                                     index=['qtag','gtag','mcount'], 
                                     values='passed', aggfunc=sum)
        if len(qgm_counts) < 1:
            self.qgcounts = pd.DataFrame()
            return self
        else:
            
            qg_counts = pd.pivot_table(pd.DataFrame(qgm_counts).reset_index(), 
                                       index=['qtag','gtag'], 
                                       values='passed', aggfunc=[sum, len])
            qg_counts.rename(columns={'len':'molecs','sum':'reads'}, inplace=True)
            qg_counts.reset_index(inplace=True)
            qg_counts.sort_values(by='molecs',ascending=False, inplace=True)
            self.qgcounts = qg_counts
            qg_counts.to_excel(writer, self.idx)
            return self
        
    def get_stats(self):
        valid = self.df.loc[(self.df.qtag!='None')&
                            (self.df.gtag!='None')&
                            (self.df.mcount!='None')]
        idxstats = {
            'total reads': len(self.df),
            'mcounts with qtag, gtag and mcount': len(valid.groupby(['qtag','gtag','mcount'])),
            'reads with qtag, gtag and mcount': len(valid),
            'reads with only no qtag': self.get_read_counts(self.df, False, True, True),
            'reads with only no gtag': self.get_read_counts(self.df, True, False, True),
            'reads with only no mcount': self.get_read_counts(self.df, True, True, False),
            'reads with only mcount': self.get_read_counts(self.df,False,False,True),
            'reads with only barcode': self.get_read_counts(self.df, False,True,False),
            'reads with only qtag': self.get_read_counts(self.df, True,False,False),
            'reads with no qtag, barcode or mcount': self.get_read_counts(self.df,False,False,False)
        }
        
        return idxstats


In [None]:
def sysprint(msg,tab_num=0):
    tabs = "".join(["\t" for t in range(tab_num)])
    sys.stdout.write("%s%s\n"%(tabs, msg))
    sys.stdout.flush()
    return
    
    

In [38]:
def run(db_name=None, quiet=False):
    all_counts = {}
    stats = {}
    qtags = load_qtags(QTAG_CSV)
    rexs = make_rexs(GTAG_MOTIF, MCOUNT_MOTIF, qtags)
    
    if db_name == None:
        db_name = 'sqlite:///%s/counts_%s.db'%(OUTPUT_DIR, EXPERIMENT)
    else: db_name = 'sqlite:///%s/%s.db'%(OUTPUT_DIR, db_name)
        
    engine = sqla.create_engine(db_name)
    writer = pd.ExcelWriter('%s/filtered_%s.xlsx'%(OUTPUT_DIR,EXPERIMENT))
    for directory in INPUT_DIRECTORIES:
        
        indexes = init_indexes(directory, rexs)
        for idx, obj in indexes.items():
            conn = engine.connect()
#             sysprint('Starting index %d of %d: %s'%(iterum, len(indexes), idx))
            index = indexes[i]
            try:
                counts_dict = index.count_reads()
                '''
                START EDITING HERE
                '''
                counts.convert_save_df()
#                 sysprint('converted to df: %s\n'%i,1)
                counts.filter_reads().consolidate_filter(writer)
#                 sysprint('filtered: %s\n'%i,1)
                counts.export_to_db(conn)
#                 sys.stdout.write('\t exported: %s\n'%i)
                stats[i] = counts.get_stats()
#                 sys.stdout.write('\tanalyzed statistics: %s\n'%i)
#                 sys.stdout.write('\t complete.\n')
            except Exception as e:
                print e
                raise
            conn.close()
            iterum+=1
            all_counts[i]=counts
    writer.save()
    engine.dispose()
    sys.stdout.write('Job complete\n')
    sys.stdout.flush()
    return all_counts, stats


In [39]:
data_counts, data_stats = run(quiet=True)


Starting index 1 of 51: 16314-08-Y
	 searched: 16314-08-Y
	 converted to df: 16314-08-Y
	 filtered: 16314-08-Y
	 exported: 16314-08-Y
	analyzed statistics: 16314-08-Y
	 complete.
Starting index 2 of 51: 16314-11-N
	 searched: 16314-11-N
	 converted to df: 16314-11-N
	 filtered: 16314-11-N
	 exported: 16314-11-N
	analyzed statistics: 16314-11-N
	 complete.
Starting index 3 of 51: 16614-02-Y
	 searched: 16614-02-Y
	 converted to df: 16614-02-Y
	 filtered: 16614-02-Y
	 exported: 16614-02-Y
	analyzed statistics: 16614-02-Y
	 complete.
Starting index 4 of 51: 16314-36-N
	 searched: 16314-36-N
	 converted to df: 16314-36-N
	 filtered: 16314-36-N
	 exported: 16314-36-N
	analyzed statistics: 16314-36-N
	 complete.
Starting index 5 of 51: 16314-12-N
	 searched: 16314-12-N
	 converted to df: 16314-12-N
	 filtered: 16314-12-N
	 exported: 16314-12-N
	analyzed statistics: 16314-12-N
	 complete.
Starting index 6 of 51: 16314-47-Y
	 searched: 16314-47-Y
	 converted to df: 16314-47-Y
	 filtered: 16314

In [40]:
pd.DataFrame.from_dict(data_stats).T.to_csv("%s/%s_stats.csv"%(OUTPUT_DIR,EXPERIMENT))