In [1]:
import pandas as pd
import numpy as np
import regex
import os, sys

In [2]:
"""User inputs"""

"""Experiment name to name output files"""
EXPERIMENT = "sample"

"""Directory path to input data"""
FILTERED_FILEPATH = "../output/filtered-sample.csv"

"""Directory path to save output"""
OUTPUT_DIRECTORY = "../output"
# s
"""Minimum number of reads as a baseline

Used to simplify data processing. 
"""
MIN_READS = 100

# """Groupby for multiple index defining columns"""
# GROUPBY = ['idx']

In [3]:
MOLECS = 'mcountsPF'
QTAG = 'qtag'
BARCODE = 'barcode'
INDEX = 'idx'
READS = 'readsPF'
PER_MOLECS = 'percent_%s'%MOLECS

In [4]:
"""Updates percent molec counters for index"""

def calculate_percent_molecs(df):
    total = float(df[MOLECS].values.sum()) / 100.
    df[PER_MOLECS] = df[MOLECS].apply(lambda x: x/total)
    df = df.sort_values(by=MOLECS, ascending=False)
    return df

In [5]:
"""Loads filtered lib-ID-barcode data csv to dict of indexes"""

def load_data(filtered_file):
    columns = [INDEX, QTAG, BARCODE, MOLECS, READS]
    
    # loads excel file (all tabs)
    csv = pd.read_csv(filtered_file)
    # filter out null barcodes just in case (if custom user input)
    csv = csv.loc[(csv[QTAG]!='None') & (csv[BARCODE]!='None')]
    # format
    csv = csv[columns]
    csv[INDEX] = csv[INDEX].apply(lambda x: str(x))
    # get percent molecs per index, store as output dict entry 
    data = {}
    for idx, group in csv.groupby(INDEX):
        group = calculate_percent_molecs(group.copy())
        data[idx] = group
    return data


In [136]:
def check_data(d):
    # wrangle data to list of list-like pairs, as "[idx, df]"
    data_arr = []
    if type(d) in [np.array, list] :
        data_arr = d
    elif type(d) == dict:
        data_arr = d.items()
    elif type(d) == pd.DataFrame :
        data_arr = [(s,df) for s,df in d.groupby(sample)]
    else:
        print "Input data is not in correct format. Please provide list-like, dict, or pd.DataFrame"
    
    # check input has correct values
    try:
        for a in data_arr:
            assert len(a) == 2
            assert type(a[0]) == str
            assert type(a[1]) == pd.DataFrame
            assert INDEX in a[1].columns
            assert MOLECS in a[1].columns
            assert READS in a[1].columns      
    except IndexError as e:
        print a
        print "Item number of values is not 2.\n"%e.message
    except ValueError as e:
        print a
        print "Index name could not be converted to float: %s\n%s"%(
            type(item[i]), e.message)
    except AssertionError as e:
        print a
        print "Assertion failed: %s\n"%e.message
    return data_arr

In [131]:
"""Thresholds barcodes of a given index

    group(pd.DataFrame): df containing library-ID-barcodes, 
        mcountsPF and percent_mcountsPF
    reps_remaining(int): reps remaining from max number 
        input from user 
    thresh_val(float or int): initial threshold value (percent_mcountPF)
        provided from previous recursion or user input
    thresh_i(int): initial position of threshold value in 
        percent_mcountsPF list, ranging [0,len(group))
        
    Returns: 
        None, if thresholding fails;
        passed(pd.DataFrame), if thresholding successful; or
        self, otherwise, with updated threshold values and 
            group df.
"""

def threshold(group, reps_remaining, thresh_val, thresh_i):
    # max out reps
    if reps_remaining <= 0:
        print 'Maxed out reps. Skipping group.'
    # no barcodes passed threshold 
    elif len(group) == 0:
        print "No barcodes passed threshold. Skipping index."
    else:
        # calculate new threshold 
        calc_threshold_i = calculate_threshold(group.mcountsPF.values)+1
        new_thresh_i = min(calc_threshold_i, len(group)-1)
        new_thresh_val = group.percent_mcountsPF.values[new_thresh_i]
        # if reached steady state
        if new_thresh_val == thresh_val:
            # get rid of any "padding" barcodes (see eliminate_oneoffs fn)
            passed = group.loc[group.is_padding == False]
            # update percent molecs
            passed = calculate_percent_molecs(passed).reindex(range(len(passed)))
            sys.stdout.write('Thresholded.\n')
            return passed
        # recursively clean and re-threshold
        else:
            # clean up group by eliminating one-offs
            group = calculate_percent_molecs(group)
            cleaned = eliminate_oneoffs(group,new_thresh_val)
            # recurse with cleaned df and new threshold values
            return threshold(cleaned, reps_remaining-1, new_thresh_val, new_thresh_i)
        
    # if thresholding failed, return None
    sys.stdout.write('Skipped.\n')
    return pd.DataFrame()

In [110]:
# STEP 2: CALCULATE THRESHOLD via. CONCAVITY
"""Calculates threshold of series with modified concavity approach

    y(np.array or list): list or list-like object of 
        values as floats or ints
    
    Returns index of inflection point in array, 
        i.e. threshold position.
"""

def calculate_threshold(y):
    
    """Constructs list of overlapping subarray ranges of size 2"""
    def rolling_window(arr):
        shape = arr.shape[:-1] + (arr.shape[-1]-1, 2)
        strides = arr.strides + (arr.strides[-1],)
        windows = np.lib.stride_tricks.as_strided(arr, 
                              shape=shape, strides=strides)
        return windows
    """Generates first derivative of windows as relative difference"""
    def first_d_gen(windows):
        for w in windows:
            yield float(w[1]-w[0])/w[0]
    """Gemerates second derivative of windows"""
    def second_d_gen(windows):
        for w in windows:
            yield w[1]-w[0]         
    
    # left and right padding to cover all array vals in derivations
    yarray = np.concatenate([ [y[0]], y, [1] ])
    # calculates first derivative
    first_windows = rolling_window(yarray)
    first_derivs = np.fromiter(first_d_gen(first_windows), np.float
                               , count=len(first_windows))
    # calculates second derivative
    second_windows = rolling_window(first_derivs)
    second_derivs = np.fromiter(second_d_gen(second_windows), np.float
                                , count=len(second_windows))
    # gets index or position value of inflection point
    thresh_i = np.argmin(second_derivs)+1
    return thresh_i

In [98]:
# STEP 3: CLEAN BY ELIMINATING ONE-OFFS
def eliminate_oneoffs(group, thresh_val, pad=True):
    
    group.loc[:,'delete'] = group.mcountsPF.apply(lambda _: False)
    group.loc[:,'is_padding'] = False
    counter = 0
    # add capability to check other parameters, i.e. qtag
    for majorI, marow in group.loc[group.percent_mcountsPF >= thresh_val].iterrows():
        majorG, majorQ, majorDel = marow[['barcode','qtag','delete']]
        if majorDel == False:
            motif = "(%s){s<=1}"%majorG
            subgroup = group[counter+1:].loc[(group.delete==False)]
            for minorI, mirow in subgroup.iterrows():
                minorG = mirow['barcode']
                query = regex.search(motif,minorG)
                if query:
                    group.loc[minorI,'delete'] = True
        counter+=1
        
    output = group.loc[(group.delete==False)]
    deletes_mcountsPF = group.loc[group.delete==True].mcountsPF
    if len(deletes_mcountsPF)> 0 and pad==True:
        max_deletes_i = deletes_mcountsPF.idxmax()
        output = group.loc[(group.delete==False) | (group.index==max_deletes_i)]
        output.loc[:,'delete'] = False 
        output.loc[max_deletes_i, 'is_padding'] = True

    return output
            

In [128]:
# ORIGINAL / INITIAL
# RUN THRESHOLDING FOR ALL SAMPLES IN DICT
def run_threshold(d, sample=[INDEX]):

    passed = []
    counter = 1
    
    data_arr = check_data(d)
    for idx, group in data_arr:
        msg = "Index %d of %d (%s): "%(counter,len(data_arr),idx)
        group = group.loc[(group.qtag!='None') & (group.barcode!='None') 
                          & (group.mcountsPF>50)
                          & (group.readsPF>MIN_READS)]
        result = threshold(group, 20, -1, len(group)+2)
        passed.append(result)
        sys.stdout.flush()
        counter += 1
    if len(passed) > 0:
        passeddf = pd.concat(passed)
        passeddf.sort_values(by=[SAMPLE+[PER_MOLECS]]
                             ,ascending=[True,False],inplace=True)
        passeddf.drop(['delete','is_padding'], axis=1, inplace=True)
        return passeddf
    else:
        print "No indexes were successfully thresholded."
    return 

In [100]:
data = load_data(FILTERED_FILEPATH)

In [129]:
passed = run_threshold(data)

checked idx NH025
checked idx NH120
checked idx NH001
checked idx NH005


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Thresholded.
Thresholded.
Thresholded.
Thresholded.
Thresholded.
Thresholded.
Thresholded.
Thresholded.
Thresholded.
Thresholded.


In [None]:
passed.loc[:,'passed_threshold'] = True
passed.to_csv('../output/thresholded-%s-passed.csv'%EXP_NAME)

In [102]:
counts = passed.groupby(GROUP_BY).agg(len)['gtag']
counts.name='barcodes'
counts = pd.DataFrame(counts)
counts.reset_index(inplace=True)
counts.to_csv('../output/thresholded-%s-counts.csv'%EXP_NAME)

In [107]:
merged = pd.merge(data,passed,on=['idx','qtag','barcode'], how='outer')
#merged.drop(['molecs_y','reads_y','percent_molecs'],axis=1, inplace=True)
#merged.rename(columns={'molecs_x':'molecs','reads_x':'reads'},inplace=True)
merged.passed_threshold = merged.passed_threshold.fillna(False)
merged.to_csv('../output/thresholded-%s-merged.csv'%EXP_NAME)