In [1]:
import pandas as pd
import numpy as np
import regex
import os, sys

In [2]:
"""User inputs"""

"""Experiment name to name output files"""
EXPERIMENT = "sample"

"""Directory path to input data"""
FILTERED_FILEPATH = "../output/filtered-sample.csv"

"""Directory path to save output"""
OUTPUT_DIRECTORY = "../output"
# s
"""Minimum number of reads as a baseline

Used to simplify data processing. 
"""
MIN_READS = 100

# """Groupby for multiple index defining columns"""
# GROUPBY = ['idx']

In [3]:
MOLECS = 'mcountsPF'
QTAG = 'qtag'
BARCODE = 'barcode'
INDEX = 'idx'
READS = 'readsPF'
PER_MOLECS = 'percent_%s'%MOLECS

In [4]:
"""Updates percent molec counters for index"""

def calculate_percent_molecs(df):
    total = float(df[MOLECS].values.sum()) / 100.
    df[PER_MOLECS] = df[MOLECS].apply(lambda x: x/total)
    df = df.sort_values(by=MOLECS, ascending=False)
    return df

In [5]:
"""Loads filtered lib-ID-barcode data csv to dict of indexes"""

def load_data(filtered_file):
    columns = [INDEX, QTAG, BARCODE, MOLECS, READS]
    
    # loads excel file (all tabs)
    csv = pd.read_csv(filtered_file)
    # filter out null barcodes just in case (if custom user input)
    csv = csv.loc[(csv[QTAG]!='None') & (csv[BARCODE]!='None')]
    # format
    csv = csv[columns]
    csv[INDEX] = csv[INDEX].apply(lambda x: str(x))
    # get percent molecs per index, store as output dict entry 
    data = {}
    for idx, group in csv.groupby(INDEX):
        group = calculate_percent_molecs(group.copy())
        data[idx] = group
    return data


In [6]:
data = load_data(FILTERED_FILEPATH)

In [12]:
test = np.array([1,2,3])

type(test) in [np.ndarray, list]

True

In [15]:
ddict = {'a':test, 'b':np.array([1,2])}
if [type(v) == np.ndarray for v in ddict.values()]: print 'hi'

hi


In [21]:
test2 = [[1,2],[3,4]]
[ [type(k),type(v)]==[str,np.array] for (k,v) in ddict.items() ]

[False, False]

In [38]:
cond_item_types = lambda arr: [ [type(i[0]),type(i[1])] == [str,pd.DataFrame] 
                           for i in arr]


In [37]:
type(data.items()[0][1]) == pd.DataFrame

True

In [42]:
cond_item_types(data.items())

[True, True, True, True]

In [None]:

# RUN THRESHOLDING FOR ALL SAMPLES IN DICT
def run_threshold(in_data):
    tvals = {}
    tis = {}
    passed = {}
    counter = 1
    arr = in_data.items() if type(in_data) == dict else in_data
    for idx, df in arr:
        
        msg = "Index %d of %d (%s): "%(counter,len(datadict),idx)
        df = df.loc[df.molecs>50]
        if len(df) > 0 and max(group[MOLECS].values) > MIN_READS:
            result = threshold_group(group, 20, -1, len(group)+2)
            if result:
                passed[idx], tvals[idx], tis[idx] = result
                msg += "Thresholded.\n" 
            else: msg += "Skipped.\n"
        else: msg+= "Skipped.\n"
        sys.stdout.write(msg)
        sys.stdout.flush()
        counter += 1

    try:
        passeddf = pd.concat(passed.values())
        passeddf.loc[:,'monkey'] = passeddf.monkey.apply(lambda x: int(x))
#         passeddf.loc[:,'tube'] = passeddf.tube.apply(lambda x: int(x))
        passeddf.sort_values(by=GROUP_BY+['percent_molecs'],ascending=[True,True,False,False],inplace=True)
        passeddf.reset_index(inplace=True, drop=True)
        passeddf.drop(['delete'], axis=1, inplace=True)
        
    except Exception, e:
        print 'EXCEPTION', e
        passeddf = passed
    return passeddf, tvals, tis

In [None]:
# """TEST!!!!!!! WITH CHECKS"""

# # RUN THRESHOLDING FOR ALL SAMPLES IN DICT
# def run_threshold(in_data):
    
    
#     c_listlike = type(in_data) in [list, np.ndarray]
#     c_dict = type(in_data) == dict
    
#     # ensure there are correct number of entries in each item of list 
#     c_listitems = [len(item)==2 for item in in_data]
    
#     # ensure item types are valid (first val or dict key is str for idx, 
#     # second val or dict val is pd.DataFrame)
#     if type(in_data) in [dict, list, np.ndarray]: 
#         # if dict, change to list
#         in_data = in_data.items() if type(in_data) == dict else in_data
#         # check valid types
#         c_itemtypes = [[type(i), type(d)] == [str,pd.DataFrame] for [i,d] in in_data]
#         if sum(c_itemtypes) != len(in_data):
#             print "Invalid data input: not item types are valid. Please use \
#             format [idx(str), d(pd.DataFrame)]"
#             sys.exit(1)
#     # if dataframe, group into indexes
#     elif type(in_data) == pd.DataFrame: 
#         groups = input_data.groupby('idx').groups
#         idx_dfs = [idx, groups.get_group(idx) for idx in groups.keys()]
    
#     else:
#         print "No valid data input provided. Aborting. "
#         sys.exit(1)
    
    
        
    
#     tvals = {}
#     tis = {}
#     passed = {}
#     counter = 1
    
#     datadict = d if type(d)==type(dict()) else dict(((i,g) for i,g in d.groupby(GROUP_BY)))
#     for idx in datadict:
#         msg = "Index %d of %d (%s): "%(counter,len(datadict),idx)
#         group = datadict[idx]
        
#         """APPLY MIN READS"""
#         group = group.loc[(group.molecs>50)]
#         if len(group) > 0 and max(group.reads.values) > MIN_READS:
            
#             """"ACTION"""
#             result = threshold_group(group, 20, -1, len(group)+2)
            
#             if result:
#                 passed[idx], tvals[idx], tis[idx] = result
                

#         counter += 1

#     try:
#         passeddf = pd.concat(passed.values())
#         passeddf.loc[:,'monkey'] = passeddf.monkey.apply(lambda x: int(x))
# #         passeddf.loc[:,'tube'] = passeddf.tube.apply(lambda x: int(x))
#         passeddf.sort_values(by=GROUP_BY+['percent_molecs'],ascending=[True,True,False,False],inplace=True)
#         passeddf.reset_index(inplace=True, drop=True)
#         passeddf.drop(['delete'], axis=1, inplace=True)
        
#     except Exception, e:
#         print 'EXCEPTION', e
#         passeddf = passed
#     return passeddf, tvals, tis

In [83]:
# STEP 2: CALCULATE THRESHOLD via. CONCAVITY
def calculate_threshold_few_complex(y):
    
    def rolling_window(arr):
        shape = arr.shape[:-1] + (arr.shape[-1]-1, 2)
        strides = arr.strides + (arr.strides[-1],)
        windows = np.lib.stride_tricks.as_strided(arr, 
                              shape=shape, strides=strides)
        return windows
    
    def first_d_gen(windows):
        for w in windows:
            yield float(w[1]-w[0])/w[0]
    def second_d_gen(windows):
        for w in windows:
            yield w[1]-w[0]         
    def derive(arr):
        windows = rolling_window(y)
        der = np.fromiter(derivative_gen(windows), 
                          np.float, count=len(windows))
        return der
    
    yarray = np.concatenate([ [y[0]], y, [1] ])
    windows_y = rolling_window(yarray)
    first_d = np.fromiter(first_d_gen(windows_y), np.float, count=len(windows_y))
    windows_1 = rolling_window(first_d)
    second_d = np.fromiter(second_d_gen(windows_1), np.float, count=len(windows_1))
    
    return np.argmin(second_d)+1

In [7]:
# STEP 3: CLEAN BY ELIMINATING ONE-OFFS
def eliminate_oneoffs(group, threshold, thr_i, pad=True):
    
    group.loc[:,'delete'] = group.molecs.apply(lambda _: False)
    counter = 0
    # add capability to check other parameters, i.e. qtag
    for majorI, marow in group.loc[group.percent_molecs > threshold].iterrows():
        majorG, majorQ, majorDel = marow[['gtag','qtag','delete']]
        if majorDel == False:
            motif = "(%s){s<=1}"%majorG
            subgroup = group[counter+1:].loc[(group.delete==False)]
            for minorI, mirow in subgroup.iterrows():
                minorG = mirow['gtag']
                query = regex.search(motif,minorG)
                if query:
                    group.loc[minorI,'delete'] = True
        counter+=1
        
    output = group.loc[(group.delete==False)]
    if len(group.loc[group.delete==True])> 0 and pad==True:
        maxdelete = group.loc[group.delete==True].molecs.idxmax()
        output = group.loc[(group.delete==False) | (group.index==maxdelete)]
        output.loc[:,'delete'] = False    
    return output
            

In [82]:
# THRESHOLD A SAMPLE/INDEX/DF GROUP
def threshold_group(group, reps_remaining, threshold, threshold_i):
    
    group = calculate_percent_molecs(group).reset_index(drop=True)
    cleaned = eliminate_oneoffs(group,threshold, threshold_i)
    
    if threshold > 0:
        cleaned = cleaned.loc[cleaned.percent_molecs >= threshold]
    
    
    initial_thr = calculate_threshold_few_complex(cleaned.molecs.values)+1
    thr_i = min(initial_thr, len(cleaned)-1)
    new_threshold = group.percent_molecs.values[thr_i]

    # max out reps
    if reps_remaining <= 0:
        print 'Maxed out reps. Skipping group.'
        return 

    # if reached steady state
    elif new_threshold == threshold:
        prepass = cleaned.loc[cleaned.molecs >= group.molecs.values[thr_i]]
        prepass.reset_index(drop=True, inplace=True)
        prepass2 = eliminate_oneoffs(prepass, -1, len(prepass), pad=False)
        prepass.reset_index(drop=True, inplace=True)

        thr_i = min(calculate_threshold_few_complex(prepass2.molecs.values)+1, len(prepass2)-1)
        threshold = prepass2.percent_molecs.values[thr_i]
        passed = prepass2.loc[prepass2.percent_molecs >= threshold]
        passed = calculate_percent_molecs(passed)
        return passed, threshold, thr_i
    
    # recursively clean and re-threshold
    else:
        return threshold_group(cleaned, reps_remaining-1, new_threshold, thr_i)

In [99]:

# RUN THRESHOLDING FOR ALL SAMPLES IN DICT
def run_threshold(d):
    tvals = {}
    tis = {}
    passed = {}
    counter = 1
    
    datadict = d if type(d)==type(dict()) else dict(((i,g) for i,g in d.groupby(GROUP_BY)))
    for idx in datadict:
        msg = "Index %d of %d (%s): "%(counter,len(datadict),idx)
        group = datadict[idx]
        group = group.loc[(group.qtag!='None') & (group.gtag!='None') & (group.molecs>50)]
        if len(group) > 0 and max(group.reads.values) > MIN_READS:
            result = threshold_group(group, 20, -1, len(group)+2)
            if result:
                passed[idx], tvals[idx], tis[idx] = result
                msg += "Thresholded.\n" 
            else: msg += "Skipped.\n"
        else: msg+= "Skipped.\n"
        sys.stdout.write(msg)
        sys.stdout.flush()
        counter += 1

    try:
        passeddf = pd.concat(passed.values())
        passeddf.loc[:,'monkey'] = passeddf.monkey.apply(lambda x: int(x))
#         passeddf.loc[:,'tube'] = passeddf.tube.apply(lambda x: int(x))
        passeddf.sort_values(by=GROUP_BY+['percent_molecs'],ascending=[True,True,False,False],inplace=True)
        passeddf.reset_index(inplace=True, drop=True)
        passeddf.drop(['delete'], axis=1, inplace=True)
        
    except Exception, e:
        print 'EXCEPTION', e
        passeddf = passed
    return passeddf, tvals, tis

In [100]:
data = load_data(FILTERED_FILE)

In [101]:
passed, threshold_values, threshold_i = run_threshold(data)

Index 1 of 53 ((9815, 9, 720.0)): Thresholded.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Index 2 of 53 ((9615, 18, 727.0)): Thresholded.
Index 3 of 53 ((9815, 13, 720.0)): Thresholded.
Index 4 of 53 ((9815, 40, 720.0)): Skipped.
Index 5 of 53 ((9615, 24, 727.0)): Thresholded.
Index 6 of 53 ((9615, 13, 727.0)): Skipped.
Index 7 of 53 ((9615, 6, 727.0)): Skipped.
Index 8 of 53 ((9815, 25, 720.0)): Thresholded.
Index 9 of 53 ((9615, 11, 727.0)): Skipped.
Index 10 of 53 ((9615, 12, 727.0)): Skipped.
Index 11 of 53 ((9815, 19, 720.0)): Thresholded.
Index 12 of 53 ((9815, 33, 720.0)): Thresholded.
Index 13 of 53 ((9815, 30, 720.0)): Thresholded.
Index 14 of 53 ((9815, 28, 720.0)): Thresholded.
Index 15 of 53 ((9815, 5, 720.0)): Thresholded.
Index 16 of 53 ((9815, 23, 727.0)): Thresholded.
Index 17 of 53 ((9615, 22, 727.0)): Skipped.
Index 18 of 53 ((9815, 34, 727.0)): Skipped.
Index 19 of 53 ((9615, 27, 727.0)): Skipped.
Index 20 of 53 ((9815, 36, 720.0)): Thresholded.
Index 21 of 53 ((17914, 'p1', 727.0)): Skipped.
Index 22 of 53 ((9615, 1, 727.0)): Skipped.
Index 23 of 53 ((96

In [102]:
counts = passed.groupby(GROUP_BY).agg(len)['gtag']
counts.name='barcodes'
counts = pd.DataFrame(counts)
counts.reset_index(inplace=True)

In [103]:
passed.loc[:,'passed_threshold'] = True

In [107]:
merged = pd.merge(data,passed,on=['monkey','tube','sample_type','qtag','gtag'], how='outer')
merged.drop(['molecs_y','reads_y','percent_molecs'],axis=1, inplace=True)
merged.rename(columns={'molecs_x':'molecs','reads_x':'reads'},inplace=True)
merged.passed_threshold = merged.passed_threshold.fillna(False)

In [109]:
merged.to_csv('../output/thresholded-%s-merged.csv'%EXP_NAME)

In [110]:
pd.DataFrame(counts).to_csv('../output/thresholded-%s-counts.csv'%EXP_NAME)


In [111]:
passed.to_csv('../output/thresholded-%s-passed.csv'%EXP_NAME)

In [11]:
# def calculate_threshold_simple(y):
#     yarray = np.concatenate([ [y[0]], y, [0] ])
#     return np.argmin(np.diff(yarray,n=2))+1