<b>16 August 2016:</b>

First:

Checkpoint for finished Index, Counts in progress
- finished writing Index and surface debugging / catching
- cleaned Counts fn construct_df (with helpers parse_qgm_key and calculate_count_minscore) 
- in progress: debugging construct_df, re-writing counting qg molecs & reads


<b>15 August 2016:</b>

- set Index to call open_gz, cleaned Index obj:
- Added read_ref option for UX flexibility
- Streamlined count_reads logic and flow, calling motif_search
- Major modifications to motif search; generalize search and feature extraction for each feature and to minimize downstream conditional statements 



In [1]:
# NO QTAG ERRORS ALLOWED
import numpy as np
import pandas as pd
import regex
import os,sys
import gzip
import sqlalchemy as sqla

### CONSTANTS defined by user

In [35]:
EXPERIMENT = "2016-08-04-nates1"
INPUT_DIRECTORIES = ["../data/nate"]
OUTPUT_DIR = "../output"

QTAG_CSV = "../helpers/qtags_var.csv"

GTAG_MOTIF = "CGA(?P<gtag>[ACTG]{3})C(?P<gtag>[ACTG]{4})AATTCGATGG"
MCOUNT_MOTIF = "C(?P<mcount>[ACTG]{3})C(?P<mcount>[ACTG]{3})C(?P<mcount>[ACTG]{3})GCGCAACGCG"
FILE_MOTIF = "(?P<sample>.+)_(?P<sample_barcode>.+)_L(?P<lane>\d{3})_R(?P<read_number>\d)_(?P<set_number>\d{3}).fastq.gz"
READ_REF_DEFAULT = {'q':1, 'g':0, 'm':0}

IF_SQLTABLE_EXISTS = 'replace'
DEFAULT_DB_NAME = "counts-%s.db"%EXPERIMENT

In [None]:
# def format_motif(user):
    

In [4]:
# used only to make regex motifs, but
# not nested to preserve qtag loading functionality if desired
def load_qtags(qtag_csv):
    try:
        qtagdf = pd.DataFrame.from_csv(qtag_csv).reset_index()
        qtagdf.rename(columns={'qtag_seq':'seq', 'qtag_num':'qid'}, inplace=True)
        qtagdf.qid = qtagdf.qid.apply(lambda x: "q%s"%str(x))
        qtagdf.seq = qtagdf.seq.str.upper()
        qtagdf.set_index('seq', inplace=True)
    # TO DO: CHECK FOR DUPLICATE SEQUENCES OR NAMES
    except IOError as e:
        print "Unable to load qtag file, with error:", e
        sys.exit(1)
    return qtagdf


# construct regex motif dict for read search
def make_rexs(qtag_csv):
    # load and construct qtag motif as OR list of each qtag seq (named)
    qtags = load_qtags(qtag_csv)
    qtag_phrases = qtags.apply(lambda x: '(?P<%s>%s)'%(x.qid, x.name) , axis=1)    
    qtag_motif = "|".join( qtag_phrases.values )
    # return compiled motifs for qtag, gtag (barcode), and molec counter, resp.
    return {'q':regex.compile(qtag_motif, flags=regex.I),
            'g':regex.compile(GTAG_MOTIF, flags=regex.I),
            'm':regex.compile(MCOUNT_MOTIF, flags=regex.I)}

In [5]:
# this looks gross but works for now; make pretty later
def get_file_list(root):
    fpath_temp_a = []
    fil_temp_a = []
    # construct list of files and their infodict, as tuples:
    # (i.e. <sample>_<sample_barcode>_L<lane>_R<read_number>_<set_number>)
    for direct, sub, fil in os.walk(root):
        fpaths = np.array( [ "%s/%s"%(direct,f)  for f in fil] )
        to_append = np.array([regex.search(FILE_MOTIF,f) for f in fil ])
        fil_temp_a.append( to_append )
        fpath_temp_a.append(fpaths)
        
    fil_temp_b = np.concatenate(fil_temp_a)
    fpath_temp_b = np.concatenate(fpath_temp_a)
    fil_temp_c = fil_temp_b[np.nonzero(fil_temp_b)]
    fpath_temp_c = fpath_temp_b[np.nonzero(fil_temp_b)]
    files = np.array( [(fp, fil.groupdict()) for (fp, fil) in zip(fpath_temp_c, fil_temp_c)] )
    return files

In [6]:
def init_indexes(root):
    files = get_file_list(root)
    ### FIX :  files list item fmt:  (fpath, fil.str)
    indexes = dict([(f[1]['sample'],["",""]) for f in files])
    for fpath, match in files:
        if match['sample']!='Undetermined':
            # assumes 2 reads (fwd and reverse)
            indexes[match['sample']][int(match['read_number'])-1] = fpath
    if len(indexes) == 0:
        print "Empty index list. No valid files. Please check your input directory and file naming convention."
        sys.exit(1)            
    # convert idx entry list of files to Index object
    for idx, idx_paths in indexes.items():
        indexes[idx] = Index(idx, idx_paths)
    return indexes

In [7]:
# modified opening .gz file with error/exception catching
# 15 aug 2016

# with zip(gzip.open(self.file0), gzip.open(self.file1)) as f0, f1:
def open_gz(fpath):
    try:
        f_gen = gzip.open(fpath)
        return f_gen
    except EnvironmentError as e:
        print '%s "%s". Please check your file and/or directory paths. Skipping index. [EnvironmentError Errno %d]'%(
                e.strerror, e.filename, e.errno)
    except TypeError as e:
        print "TypeError: %s. Skipping index."%e
    except BaseException as e:
        print 'Other error: %s. Skipping index.'%e
    return None


In [75]:
def sysprint(msg,tab_num=0):
    tabs = "".join(["\t" for t in range(tab_num)])
    sys.stdout.write("%s%s\n"%(tabs, msg))
    sys.stdout.flush()
    return

In [80]:
'''
Updated 15 August 2016 -- need to test all class methods together, 
but otherwise cleaned

- Added read_ref option for UX flexibility
- Streamlined count_reads logic and flow, calling motif_search
- Major modifications to motif search; generalize search and feature extraction for each feature and to minimize downstream conditional statements 

'''
class Index(object):
    
    # defining read_ref as instance variable so that
    # if user uses multiple read rexs or refs, changing
    # var won't affect previously defined objects
    
    def __init__(self, idx, fpaths, read_ref=READ_REF_DEFAULT):
        self.idx = idx
        self.file0, self.file1 = fpaths
        # read_ref as dict
        self.read_ref = read_ref
        self.tname = regex.sub('[^0-9a-zA-Z]+',"",idx)

    # so ugly i'm cringing but should probably not change it
    # for this v1 version
    def count_reads(self):
        counts = {}
        # such that line 1 is seq, line 3 is qs
        line = 0
        entry_len = 4
        gz0, gz1 = [open_gz(self.file0), open_gz(self.file1)]
        if gz0 and gz1:
            chunk = [(),()]
            for r0,r1 in zip(gz0, gz1):
                if line==1: chunk[0] = (r0,r1)  # sequence
                elif line==3: chunk[1] = (r0,r1)  # q scores
                if line+1 > entry_len:
                    key,qscores = self.motif_search(chunk[0],chunk[1])
                    counts.setdefault(key,[])
                    counts[key].append(qscores)
                    chunk = [(),()]
                    line = -1
                line += 1
                
        return counts

    def motif_search(self, seqs, qscores, order=['q','g','m']):
        keys = ['None' for _ in order] 
        qs_seqs = ""
        searches = [(feature, read, regex.search(REXS[feature], seqs[read])) 
                    for feature, read in self.read_ref.items() ]
        
        for feature, i in zip( order, range(len(order)) ):
            r = self.read_ref[feature]
            search = regex.search(REXS[feature], seqs[r])
            if search:
                match = search.capturesdict()
                extracted = filter(lambda x: len(match[x])>0, match)
                if len(extracted) == 1:
                    k = extracted[0]
                    keys[i] = k if feature=='q' else "".join(match[k])
                    qs_seqs += "" if feature=='q' else qscores[r][search.start():search.end()]
                else:
                    print "Error: non-unique sequence"
            
        return tuple(keys), qs_seqs


In [88]:
''' 
    NOTE ON QSCORE FORMATS (ref. fn calculate_count_minscore)
    Q-scores for Illumina 1.8+ (most recent as of Aug 2016) ranges 
    from 33 to 73 (Phred+33 system). P, the probability of erroneous base call,
    is defined as:  P(erroneous base call) = 10 ^ (Qphred / -10), i.e.
    for Illumina 1.8+, P = 10^( (QS-33)/-10 ). 

    The minimum QS cutoff is set at an error probability  
    of 10^-3 (standard for Illumina system).
'''


class Counts(object):
    def __init__(self, idx):
        self.idx = idx
    
    # generator for parsing raw df qgm keys
    # row is pd.Series
    @staticmethod
    def parse_qgm_key(row, order=['q','g','m']):
        # parse qgm key
        for feature, seq in zip(order, row[0]):
            row[feature] = seq
        return row

    # generator for calculating and counting read minscores (PF)
    # row is pd.Series
    @staticmethod
    def calculate_count_minscore(row):
        # define variables as null values
        keys = [ 'reads_total','reads_pf','molec_passed' ]
        vals = [0,0,False]
        q, g, m = row[['q','g','m']]
        valid = (q!='None') and (g!='None') and (m!='None')
        # if key is valid (i.e. non-null q, g, and m), count
        if valid:
            try:
                min_qscores = np.array([])
                # get min qscores for each read_qs in row (i.e. per read)
                for read_qs in row[1]:
                    read_min_qs = 0 if len(read_qs)==0 else np.min(
                                        [ord(s) for s in read_qs])
                    min_qscores = np.append(min_qscores,read_min_qs)            
                # calculate values for variables
                vals = [ len(min_qscores), #reads_total
                         len(min_qscores[np.where(min_qscores>=63)]), #reads_pf
                         True if max(min_qscores) > 0 else False  #molec_passed
                       ]
            except Exception as e:
                print e
                print row
                sys.exit(1)
                
        for k, v in zip(keys, vals) :
            row[k] = v
        return row
    
    # counts number of molecular counters and reads that PF per qg
    # group is pd.DataFrame
    @staticmethod
    def count_qg_molec_reads(group):
        s = pd.Series()
        molecs = group.loc[group.reads_pf>0].molec_passed
        s['molecs'], s['reads'] = np.sum(molecs), np.sum(group.reads_pf)
        return s

    '''  "construct_df" creates pd.DataFrame from Index.counts_dict, and
      1) parses qgm key after calling df;
      2) calculates min read qscore for each read; and
      3) counts reads PF and drops qscore seqs to save memory.
      4) sets qg_df property of obj
      5) returns qgm_df to export to db if desired without saving to memory
    '''
    
    def construct_qg_df(self, counts):
        qgm_df = pd.DataFrame.from_dict(counts.items())
        qgm_df = qgm_df.apply(self.parse_qgm_key, axis=1)
        qgm_df = qgm_df.apply(self.calculate_count_minscore, axis=1)
        # count and consolidate qg
        keep = ['q','g','m','reads_total','reads_pf','molec_passed']
        qgm_df = qgm_df[keep]
        counts_df = qgm_df.groupby(['q','g'], as_index=False
                    ).apply(self.count_qg_molec_reads)

        counts_df.loc[:,'passed'] = counts_df.apply(lambda x:
                    True if (x.molecs > 0) and not('None' in x[['q','g','m']])
                    else False, axis=1 )
        self.counts_df = counts_df
        return qgm_df
    

In [None]:
qgm

### TEST CELLS

In [3]:
'''TEST (self explanatory i know but just in case)'''
test = '9615-01_S9_L001_R1_001.fastq.gz'

In [81]:
'''TEST'''
REXS = make_rexs(QTAG_CSV)
directory = INPUT_DIRECTORIES[0]
indexes = init_indexes(directory)
testi = indexes.values()[1]
counts_dict = testi.count_reads()


In [89]:
'''TEST'''
testcount = Counts(testi.idx)
qgm = testcount.construct_qg_df(counts_dict)

In [91]:
testcount.counts_df.loc[testcount.counts_df.passed==True]

Unnamed: 0_level_0,Unnamed: 1_level_0,molecs,reads,passed
q,g,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
q19,GATCAAG,1,1,True
q24,CATCTTC,1,1,True
q24,CGGGTGC,1,1,True
q24,GGAGCGG,1,1,True
q25,CATCTTC,2,2,True
q25,TTCTTAG,1,1,True
q26,AGGGTGC,1,1,True
q26,ATGACGG,1,1,True
q26,CAACTTC,2,2,True
q26,CACCTTC,2,2,True


### NEED TO DEAL WITH / REWRITE

In [None]:
        
#     def get_stats(self):
#         valid = self.df.loc[(self.df.qtag!='None')&
#                             (self.df.gtag!='None')&
#                             (self.df.mcount!='None')]
#         idxstats = {
#             'total reads': len(self.df),
#             'mcounts with qtag, gtag and mcount': len(valid.groupby(['qtag','gtag','mcount'])),
#             'reads with qtag, gtag and mcount': len(valid),
#             'reads with only no qtag': self.get_read_counts(self.df, False, True, True),
#             'reads with only no gtag': self.get_read_counts(self.df, True, False, True),
#             'reads with only no mcount': self.get_read_counts(self.df, True, True, False),
#             'reads with only mcount': self.get_read_counts(self.df,False,False,True),
#             'reads with only barcode': self.get_read_counts(self.df, False,True,False),
#             'reads with only qtag': self.get_read_counts(self.df, True,False,False),
#             'reads with no qtag, barcode or mcount': self.get_read_counts(self.df,False,False,False)
#         }
        
#         return idxstats


In [38]:
def run(db_name=DEFAULT_DB_NAME, quiet=False, as_csv=True, as_xlsx=False ):
    
    
    
    
    stats = {}    
    
    # define output files
    db_name = db_name.split(".db")[0]
    db_path = 'sqlite:///%s/%s.db'%(OUTPUT_DIR, db_name)
    engine = sqla.create_engine(db_path)
    
    filtered_fpath = '%s/filtered-%s'%(OUTPUT_DIR,EXPERIMENT)
    if as_xlsx: 
        writer = pd.ExcelWriter('%s.xlsx'%filtered_fpath)
    if as_csv:
        # generate a new csv file and open
        open('%s.csv'%filtered_fpath, 'a').close()
        f = open('%s.csv'%filtered_fpath, 'a')
        header = True

    i = 1
    # iterate through directories/indexes
    for directory in INPUT_DIRECTORIES:
        indexes = init_indexes(directory, rexs)
        for idx_name, index in indexes.items():
            sysprint('\nIndex %d of %d: %s',(i,len(indexes),idx_name))
            
            conn = engine.connect()
            counts_dict = index.count_reads()
            counts = Counts(idx_name)
            qgm_df = counts.construct_qg_df(counts_dict)   
            '''
                ADD COL FOR IDX NAME
            '''
            # write to output files
            qgm_df.to_sql(idx_name, conn, if_exists=if_exists) 
#             if as_excel: counts.counts_df.to_excel(writer, idx_name)
#             if as_csv: counts.counts_df.to_csv(f, header=header)
            header = False
            
            conn.close()
            i+=1
    writer.save()
    f.close()
    engine.dispose()
    sysprint('Job complete\n')
    return stats


In [39]:
# data_counts, data_stats = run(quiet=True)
# 

Starting index 1 of 51: 16314-08-Y
	 searched: 16314-08-Y
	 converted to df: 16314-08-Y
	 filtered: 16314-08-Y
	 exported: 16314-08-Y
	analyzed statistics: 16314-08-Y
	 complete.
Starting index 2 of 51: 16314-11-N
	 searched: 16314-11-N
	 converted to df: 16314-11-N
	 filtered: 16314-11-N
	 exported: 16314-11-N
	analyzed statistics: 16314-11-N
	 complete.
Starting index 3 of 51: 16614-02-Y
	 searched: 16614-02-Y
	 converted to df: 16614-02-Y
	 filtered: 16614-02-Y
	 exported: 16614-02-Y
	analyzed statistics: 16614-02-Y
	 complete.
Starting index 4 of 51: 16314-36-N
	 searched: 16314-36-N
	 converted to df: 16314-36-N
	 filtered: 16314-36-N
	 exported: 16314-36-N
	analyzed statistics: 16314-36-N
	 complete.
Starting index 5 of 51: 16314-12-N
	 searched: 16314-12-N
	 converted to df: 16314-12-N
	 filtered: 16314-12-N
	 exported: 16314-12-N
	analyzed statistics: 16314-12-N
	 complete.
Starting index 6 of 51: 16314-47-Y
	 searched: 16314-47-Y
	 converted to df: 16314-47-Y
	 filtered: 16314

In [40]:
# pd.DataFrame.from_dict(data_stats).T.to_csv("%s/%s_stats.csv"%(OUTPUT_DIR,EXPERIMENT))

In [36]:
# old
# class Counts(object):
#     def __init__(self, idx, counts):
#         self.idx = idx
#         self.counts = counts
    
#     @staticmethod
#     def convert_generator(datadict):
#         i = 0
#         for key in datadict:
#             keyscores = datadict[key]
#             q, g, m = key
#             for kscore in keyscores:
#                 score = kscore[0]+kscore[1] if kscore[0]!='None' and kscore[1]!='None' else 'None'
#                 yield (i, q, g, m, score)
#                 i += 1
#     @staticmethod
#     def get_read_counts(df, q, g, m):
#         qgbbool = []
#         inputqgb = [q,g,m]
#         tags = ['qtag','gtag','mcount']
#         for i in range(len(tags)):
#             b = (df[tags[i]] != 'None') if inputqgb[i] else (df[tags[i]] == 'None')
#             qgbbool.append(b)
#         return len(df.loc[qgbbool[0] & qgbbool[1] & qgbbool[2]])

#     def convert_save_df(self):
#         countsdf = pd.DataFrame(self.convert_generator(self.counts))
#         countsdf.columns = ['index','qtag','gtag','mcount','score']
#         self.df = countsdf
#         return self
    
#     def filter_reads(self):
#         def classify_read(row):
#             passed = 0
#             minscore = np.min([ord(s) for s in row.score]) if row.score != 'None' else 0
#             return 1 if minscore >= 63 else 0
#         self.df['passed'] = self.df.apply(classify_read,axis=1)
#         self.df = self.df.loc[self.df.qtag!='None']
#         return self  
    
#     def export_to_db(self, engine, if_exists='replace'):
#         self.df.to_sql(self.idx, engine, if_exists=if_exists)
#         return
    
#     def consolidate_filter(self, writer):
#         qgm_counts = pd.pivot_table(self.df.loc[self.df['passed']>0], 
#                                      index=['qtag','gtag','mcount'], 
#                                      values='passed', aggfunc=sum)
#         if len(qgm_counts) < 1:
#             self.qgcounts = pd.DataFrame()
#             return self
#         else:
            
#             qg_counts = pd.pivot_table(pd.DataFrame(qgm_counts).reset_index(), 
#                                        index=['qtag','gtag'], 
#                                        values='passed', aggfunc=[sum, len])
#             qg_counts.rename(columns={'len':'molecs','sum':'reads'}, inplace=True)
#             qg_counts.reset_index(inplace=True)
#             qg_counts.sort_values(by='molecs',ascending=False, inplace=True)
#             self.qgcounts = qg_counts
#             qg_counts.to_excel(writer, self.idx)
#             return self
        
#     def get_stats(self):
#         valid = self.df.loc[(self.df.qtag!='None')&
#                             (self.df.gtag!='None')&
#                             (self.df.mcount!='None')]
#         idxstats = {
#             'total reads': len(self.df),
#             'mcounts with qtag, gtag and mcount': len(valid.groupby(['qtag','gtag','mcount'])),
#             'reads with qtag, gtag and mcount': len(valid),
#             'reads with only no qtag': self.get_read_counts(self.df, False, True, True),
#             'reads with only no gtag': self.get_read_counts(self.df, True, False, True),
#             'reads with only no mcount': self.get_read_counts(self.df, True, True, False),
#             'reads with only mcount': self.get_read_counts(self.df,False,False,True),
#             'reads with only barcode': self.get_read_counts(self.df, False,True,False),
#             'reads with only qtag': self.get_read_counts(self.df, True,False,False),
#             'reads with no qtag, barcode or mcount': self.get_read_counts(self.df,False,False,False)
#         }
        
#         return idxstats
