## NOTES
**V1**
Solved the ratio +/-1 issues, except for test 18...
<br/>
making a copy to preserve this version
<br/>
**V2**
DOUBLES ARE SENSITIVE; adjust for triples/multi to recognize low abundance barcodes in more complex samples
<br/>
making copy (v3) to preserve this version
<br/>
**V3**
MULTI WORKS (added a cutoff at the previous threshold before calculating the new threshold, post clean)
<br/>
making a copy (v4) to clean up version 
### V4 (this version)
2016-05-01:  from 2016-04-12-ratio_thresholding, renamed from threshold_new_20160412-v4.ipynb

In [53]:
import pandas as pd
import numpy as np
import regex
import os, sys


In [54]:
# Path of the filtered barcode table (CSV) that is handed to load_data.
FILTERED_FILE = '../data/filtered-allmonkeys-scrapate_ga.csv'
# run_threshold only thresholds a group whose largest `reads` value exceeds this.
MIN_READS = 1000
# Columns that together identify one sample group.
GROUP_BY = ['monkey','tube','sample_type']

In [55]:
def calculate_percent_molecs(df):
    """Add a ``percent_molecs`` column: each row's share of the summed molecs.

    The frame is changed in place -- the column is written and the index is
    replaced by 0..n-1 -- and the very same object is returned.
    """
    # One percent of the total, so that count / one_percent is a percentage.
    one_percent = float(df.molecs.values.sum()) / 100.

    def as_percent(count):
        return count / one_percent

    df.loc[:, 'percent_molecs'] = df.molecs.apply(as_percent)
    df.reset_index(drop=True, inplace=True)
    return df

In [56]:
def load_data(filtered_file):
    """Read the filtered barcode table from a CSV file and tidy it.

    Parameters
    ----------
    filtered_file : str
        Path passed to ``pandas.read_csv``. The table must contain the
        columns monkey, tube, qtag, gtag, molecs, reads and sample_type.

    Returns
    -------
    pandas.DataFrame
        Rows whose qtag is not the string 'None', restricted to the seven
        columns above, sorted by monkey and tube (ascending) then sample_type
        and molecs (descending), with a fresh 0..n-1 index. ``monkey`` is
        converted to int; ``tube`` is converted to int unless its text starts
        with 'n', in which case the value is left untouched.
    """
    data = pd.read_csv(filtered_file)
    data.reset_index(inplace=True, drop=True)

    # Drop rows that have no qtag.
    data = data.loc[data.qtag != 'None']
    # NOTE: the percent_molecs column computed here is discarded by the column
    # selection below; the call is kept so the row order fed to the sort is
    # unchanged.
    data = data.groupby(['monkey', 'tube', 'sample_type']).apply(calculate_percent_molecs)
    # The grouped result can carry the group keys as index levels in addition
    # to the columns, which makes a later sort by those names ambiguous on
    # newer pandas; flatten the index right away.
    data = data.reset_index(drop=True)

    # Formatting: fixed column order, sorted rows, clean index.
    columns = ['monkey', 'tube', 'qtag', 'gtag', 'molecs', 'reads', 'sample_type']
    data = data[columns]
    data = data.sort_values(by=['monkey', 'tube', 'sample_type', 'molecs'],
                            ascending=[True, True, False, False])
    data = data.reset_index(drop=True)
    data['monkey'] = data.monkey.apply(int)
    # str() guards against tube having been parsed as a numeric column, where
    # slicing the raw value would raise TypeError.
    data['tube'] = data.tube.apply(lambda x: x if str(x)[:1] == 'n' else int(x))
    return data


In [57]:
def calculate_threshold_simple(y):
    """Return the index of the smallest second difference of ``y``, plus one.

    ``y`` is padded with a copy of its first value in front and a 0 at the
    end before the differences are taken.
    """
    padded = np.concatenate(([y[0]], y, [0]))
    slopes = np.diff(padded)
    curvature = np.diff(slopes)
    return curvature.argmin() + 1

In [58]:
# STEP 2: CALCULATE THRESHOLD via. CONCAVITY
def calculate_threshold_few_complex(y):
    """Locate the sharpest drop in relative change along ``y``.

    ``y`` is padded with a copy of its first value in front and a 1 at the
    end. For every adjacent pair the relative change
    ``(next - current) / current`` is computed, then the difference between
    consecutive relative changes. The function returns the index of the
    smallest such difference, plus one.

    Parameters
    ----------
    y : sequence of numbers
        Must contain at least one value (``y[0]`` is read).

    Returns
    -------
    int
        ``argmin`` of the second-stage differences, plus one.
    """
    # Work in float so the division below is a true division for integer
    # input as well.
    padded = np.concatenate([[y[0]], y, [1]]).astype(float)
    relative_change = (padded[1:] - padded[:-1]) / padded[:-1]
    second_d = np.diff(relative_change)
    return np.argmin(second_d) + 1

In [59]:
# STEP 3: CLEAN BY ELIMINATING ONE-OFFS
def eliminate_oneoffs(group, threshold, thr_i, pad=True):
    """Remove rows whose gtag fuzzily matches the gtag of an earlier major row.

    A "major" row is one whose ``percent_molecs`` exceeds ``threshold``. For
    each major row that has not itself been eliminated, every not-yet
    eliminated row positioned after it is eliminated when the major gtag,
    allowing at most one substitution (``regex`` fuzzy syntax ``{s<=1}``),
    is found in that row's gtag.

    Parameters
    ----------
    group : pandas.DataFrame
        Needs the columns gtag, qtag, molecs and percent_molecs and a unique
        index. A boolean ``delete`` column is written to it in place.
        Presumably sorted by descending abundance so that the major rows form
        a leading run -- the positional slice below relies on that; verify
        against the caller.
    threshold : float
        Rows with ``percent_molecs`` strictly above this act as major rows.
    thr_i : int
        Accepted for call compatibility; not used.
    pad : bool
        When true and at least one row was eliminated, the eliminated row
        with the most molecs is kept in the output.

    Returns
    -------
    pandas.DataFrame
        The surviving rows (plus the pad row, if any), ``delete`` all False.
    """
    group['delete'] = False

    # TODO: add capability to check other parameters, i.e. qtag
    major_labels = group.index[group.percent_molecs > threshold]
    for position, major_label in enumerate(major_labels):
        # Read the live flag: a row eliminated earlier in this loop must not
        # go on to eliminate others. (Reading it from a frame filtered before
        # the loop would always see the initial False.)
        if group.loc[major_label, 'delete']:
            continue
        pattern = regex.compile("(%s){s<=1}" % group.loc[major_label, 'gtag'])
        later_rows = group.iloc[position + 1:]
        candidates = later_rows.loc[~later_rows['delete']]
        for minor_label, minor_gtag in candidates['gtag'].items():
            if pattern.search(minor_gtag):
                group.loc[minor_label, 'delete'] = True

    eliminated = group['delete']
    keep = ~eliminated
    if pad and eliminated.any():
        # Keep the most abundant eliminated row as padding.
        top_eliminated = group.loc[eliminated, 'molecs'].idxmax()
        keep = keep | (group.index == top_eliminated)

    # Explicit copy so resetting the flag does not write into a slice of group.
    output = group.loc[keep].copy()
    output['delete'] = False
    return output
            

In [60]:
# THRESHOLD A SAMPLE/INDEX/DF GROUP
def threshold_group(group, reps_remaining, threshold, threshold_i):
    """Iteratively clean one sample group and find its abundance threshold.

    Each pass recomputes ``percent_molecs``, removes one-off barcodes, applies
    the previous threshold (when positive) and derives a new threshold. The
    function recurses with the cleaned rows until the new threshold equals
    the previous one or ``reps_remaining`` runs out.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single sample; ``percent_molecs`` is (re)written in place.
    reps_remaining : int
        Remaining number of passes; at 0 or below the group is given up.
    threshold : float
        Threshold found by the previous pass (callers start with -1).
    threshold_i : int
        Threshold position from the previous pass; only forwarded to
        ``eliminate_oneoffs``.

    Returns
    -------
    tuple or None
        ``(passed, threshold, thr_i)`` once a steady state is reached, or
        ``None`` when the repetitions are exhausted.
    """
    group = calculate_percent_molecs(group).reset_index(drop=True)
    cleaned = eliminate_oneoffs(group, threshold, threshold_i)

    if threshold > 0:
        cleaned = cleaned.loc[cleaned.percent_molecs >= threshold]

    initial_thr = calculate_threshold_few_complex(cleaned.molecs.values) + 1
    thr_i = min(initial_thr, len(cleaned) - 1)
    # NOTE(review): thr_i is bounded by len(cleaned) but is used as a position
    # in `group` here and below -- confirm this is intended.
    new_threshold = group.percent_molecs.values[thr_i]

    # Max out reps.
    if reps_remaining <= 0:
        print('Maxed out reps. Skipping group.')
        return None

    # Steady state: the threshold did not move, so finalise.
    if new_threshold == threshold:
        prepass = cleaned.loc[cleaned.molecs >= group.molecs.values[thr_i]]
        prepass = prepass.reset_index(drop=True)
        prepass2 = eliminate_oneoffs(prepass, -1, len(prepass), pad=False)
        prepass2 = prepass2.reset_index(drop=True)

        thr_i = min(calculate_threshold_few_complex(prepass2.molecs.values) + 1,
                    len(prepass2) - 1)
        threshold = prepass2.percent_molecs.values[thr_i]
        # Copy so the in-place percent recalculation does not write to a slice.
        passed = prepass2.loc[prepass2.percent_molecs >= threshold].copy()
        passed = calculate_percent_molecs(passed)
        return passed, threshold, thr_i

    # Otherwise clean and re-threshold recursively.
    return threshold_group(cleaned, reps_remaining - 1, new_threshold, thr_i)

In [61]:

# RUN THRESHOLDING FOR ALL SAMPLES IN DICT
def run_threshold(d):
    """Threshold every sample group and collect the results.

    Parameters
    ----------
    d : dict or pandas.DataFrame
        Either a mapping of group key -> DataFrame, or one DataFrame that is
        split by the ``GROUP_BY`` columns.

    Returns
    -------
    tuple
        ``(passeddf, tvals, tis)``. ``tvals`` and ``tis`` map each thresholded
        group key to its threshold value and threshold position. ``passeddf``
        is the concatenation of all passing rows, sorted and without the
        ``delete`` column; if assembling it fails (for instance because no
        group passed) the exception is printed and the raw dict of per-group
        frames is returned instead.

    A progress line per group is written to stdout. Groups with no usable
    rows, with a maximum ``reads`` not above ``MIN_READS``, or for which
    ``threshold_group`` returns nothing are reported as skipped.
    """
    tvals = {}
    tis = {}
    passed = {}

    if isinstance(d, dict):
        datadict = d
    else:
        datadict = dict((key, frame) for key, frame in d.groupby(GROUP_BY))

    total = len(datadict)
    for counter, idx in enumerate(datadict, 1):
        msg = "Index %d of %d (%s): " % (counter, total, idx)
        group = datadict[idx]
        group = group.loc[(group.qtag != 'None') & (group.gtag != 'None')]

        status = "Skipped.\n"
        if len(group) > 0 and max(group.reads.values) > MIN_READS:
            result = threshold_group(group, 20, -1, len(group) + 2)
            if result:
                passed[idx], tvals[idx], tis[idx] = result
                status = "Thresholded.\n"
        sys.stdout.write(msg + status)
        sys.stdout.flush()

    try:
        passeddf = pd.concat(list(passed.values()))
        passeddf['monkey'] = passeddf.monkey.apply(int)
        # Same rule as load_data: tubes whose text starts with 'n' are kept
        # as they are instead of aborting the whole assembly with ValueError.
        passeddf['tube'] = passeddf.tube.apply(
            lambda x: x if str(x)[:1] == 'n' else int(x))
        # The ascending list matches the three GROUP_BY columns plus
        # percent_molecs.
        passeddf = passeddf.sort_values(by=GROUP_BY + ['percent_molecs'],
                                        ascending=[True, True, False, False])
        passeddf = passeddf.reset_index(drop=True)
        passeddf = passeddf.drop(['delete'], axis=1)
    except Exception as e:
        # Deliberate best effort: report the problem and hand back the
        # per-group dict.
        print('EXCEPTION %s' % (e,))
        passeddf = passed
    return passeddf, tvals, tis

In [62]:
# Read and tidy the filtered barcode table.
data = load_data(FILTERED_FILE)

In [63]:
# Threshold every sample group; one progress line per group is written to stdout.
passed, threshold_values, threshold_i = run_threshold(data)

Index 1 of 155 ((18014, 12, 'scrapate')): Thresholded.
Index 2 of 155 ((17513, 11, 'scrapate')): Thresholded.
Index 3 of 155 ((18014, 24, 'scrapate')): Thresholded.
Index 4 of 155 ((17814, 10, 'scrapate')): Thresholded.
Index 5 of 155 ((18114, 11, 'scrapate')): Thresholded.
Index 6 of 155 ((17914, 8, 'scrapate')): Thresholded.
Index 7 of 155 ((18014, 9, 'scrapate')): Thresholded.
Index 8 of 155 ((15614, 3, 'scrapate')): Thresholded.
Index 9 of 155 ((18014, 1, 'ga')): Thresholded.
Index 10 of 155 ((18014, 21, 'scrapate')): Thresholded.
Index 11 of 155 ((18014, 6, 'scrapate')): Thresholded.
Index 12 of 155 ((18114, 1, 'scrapate')): Thresholded.
Index 13 of 155 ((15614, 10, 'scrapate')): Thresholded.
Index 14 of 155 ((18114, 18, 'scrapate')): Thresholded.
Index 15 of 155 ((18014, 18, 'scrapate')): Thresholded.
Index 16 of 155 ((15614, 9, 'scrapate')): Thresholded.
Index 17 of 155 ((15614, 22, 'scrapate')): Skipped.
Index 18 of 155 ((17814, 24, 'scrapate')): Thresholded.
Index 19 of 155 ((

In [64]:
# Number of passing barcodes per sample group, as a flat table with a
# 'barcodes' column.
barcode_counts = passed.groupby(GROUP_BY).agg(len)['gtag']
barcode_counts.name = 'barcodes'
counts = pd.DataFrame(barcode_counts).reset_index()

In [65]:
# Mark every row that survived thresholding.
passed.loc[:,'passed_threshold'] = True

In [66]:
# Outer-join the full table with the passing rows; keep the full table's
# molecs/reads and turn a missing flag into False.
merged = pd.merge(data, passed, how='outer',
                  on=['monkey', 'tube', 'sample_type', 'qtag', 'gtag'])
merged = merged.drop(['molecs_y', 'reads_y'], axis=1)
merged = merged.rename(columns={'molecs_x': 'molecs', 'reads_x': 'reads'})
merged['passed_threshold'] = merged['passed_threshold'].fillna(False)

In [None]:
# Write the merged table (all barcodes plus the passed_threshold flag) to disk.
merged.to_csv('../output/thresholded-allmonkey-scrapates_ga-barcodes.csv')

In [67]:
# Write the per-group barcode counts to disk.
pd.DataFrame(counts).to_csv('../output/thresholded-allmonkey-scrapates_ga-counts.csv')


In [68]:
# Write the same merged table again under the '-merged' file name.
merged.to_csv('../output/thresholded-allmonkey-scrapates_ga-barcodes-merged.csv')