# Consolidate manual annotations
consolidate manual annotations of threshold data to generate a final list of thresholded barcodes for plotting
<br/>
**V1**: updated 2016-05-03 

In [1]:
import pandas as pd
import numpy as np

In [2]:
HIGH_CYCLE_FILE = '../output/thresholded-2016-05-02-Ling_redo-barcodes-merged-manual_edit.csv'
OUTPUT_FILE = '../output/thresholded-2016-05-02-Ling_redo-barcodes-final.csv'
SAMPLES_FILE_LING = '../helpers/samples-ling-2016_05_03.csv'
SAMPLES_FILE_FLYNN = '../helpers/samples-scrapates-all.csv'

In [3]:
samples = pd.read_csv(SAMPLES_FILE_LING, sep="\t")


In [4]:
data = pd.read_csv(HIGH_CYCLE_FILE, sep='\t')

In [10]:
'''
helper functions for high-cycle data
'''

def load_highcycle_data(fname):
    data = pd.read_csv(fname, sep='\t')
    data = data.loc[(data.manual==True)]

    drop_columns = ["Unnamed: 0", "idx", "percent_molecs", 'passed_threshold']
    data.drop(drop_columns, axis=1, inplace=True)
    data.reset_index(inplace=True,drop=True)

    data.loc[:,'prep_type'] = "high cycle"
    data.loc[:,'sample_type'] = data.monkey.apply(lambda x: 'BAL-GA' if x=='BALGA' else 'scrapate')
    data.tube = data.tube.astype(int)

    group_columns = ['monkey','tube','molecs']
    totals = data[group_columns].groupby(group_columns[:-1]).agg(sum)
    totals.loc[:,'molecs'] = totals.molecs.apply(lambda x: float(x)/100.)
    data.loc[:,'percent_molecs'] = data.apply(lambda x: x.molecs / totals.loc[(x.monkey, x.tube),'molecs'], axis=1)
    
    return data

def merge_highcycle_sample_info(data):
    scrapate_data = data.loc[data.sample_type=='scrapate']
    scrapate_data.monkey = scrapate_data.monkey.astype(int)
    merged_scrapates = pd.merge(scrapate_data,
                        samples.loc[samples.sample_type=='scrapate'],
                        how='outer',on=['monkey','tube','sample_type'])
    balga_data = data.loc[data.sample_type=='BAL-GA']
    
    merged_balga = pd.merge(balga_data,
                           samples.loc[samples.sample_type!='scrapate'],
                           how='outer',on='tube', suffixes=['_d',""])
    drop_columns = ['monkey_d','sample_type_d']
    merged_balga.drop(drop_columns, axis=1, inplace=True)
    
    return [merged_scrapates, merged_balga]
def merge_highcycle_dfs(dfs):
    merged = pd.concat(dfs)
    merged.drop('manual',axis=1,inplace=True)
    columns = ['monkey','sample_type','tissue', 'tissue_type',
               'qtag','gtag', 'molecs', 'percent_molecs', 'reads', 
               'prep_type', 'tube', 'count', 'dilution' ]
    merged = merged[columns]
    merged.sort_values(by=['monkey','sample_type','tube','molecs'],
                       ascending=[True,False,True,False], inplace=True)
    return merged

In [14]:
'''
RUN function for high-cycle data
'''

def process_highcycle_data(fname):
    data = load_highcycle_data(fname)
    return data
    merge_dfs = merge_highcycle_sample_info(data)
    merged = merge_highcycle_dfs(merge_dfs)
#     return merged

In [15]:
data = process_highcycle_data(HIGH_CYCLE_FILE)

In [17]:
data

Unnamed: 0,monkey,tube,qtag,gtag,molecs,reads,manual,prep_type,sample_type,percent_molecs
0,16314,10,q24,CATCTCT,16077,16917,True,high cycle,scrapate,59.825848
1,16314,10,q27,GCGGTAG,10796,11177,True,high cycle,scrapate,40.174152
2,16314,2,q26,CGCTTTG,17969,19037,True,high cycle,scrapate,100.0
3,16314,22,q24,GTTAATA,1586,1596,True,high cycle,scrapate,100.0
4,16314,26,q19,TTCGTCT,18920,20049,True,high cycle,scrapate,95.147096
5,16314,26,q19,TAGGGTC,965,970,True,high cycle,scrapate,4.852904
6,16314,27,q23,CGGCGCT,33237,36834,True,high cycle,scrapate,97.859498
7,16314,27,q23,GGGGGGG,727,731,True,high cycle,scrapate,2.140502
8,16314,3,q19,TTCGTCT,18664,19685,True,high cycle,scrapate,100.0
9,16314,30,q23,CGGCGCT,28475,30964,True,high cycle,scrapate,100.0


In [12]:
highcycle = process_highcycle_data(HIGH_CYCLE_FILE)

In [8]:
# highcycle.to_csv(OUTPUT_FILE)

In [13]:
highcycle

Unnamed: 0,monkey,sample_type,tissue,tissue_type,qtag,gtag,molecs,percent_molecs,reads,prep_type,tube,count,dilution
67,16314.0,scrapate,LUL Gran 8,gran,,,,,,,1.0,39.0,Neat
2,16314.0,scrapate,LUL Gran 5,gran,q26,CGCTTTG,17969.0,100.000000,19037.0,high cycle,2.0,20.0,Neat
8,16314.0,scrapate,LUL Gran 4,gran,q19,TTCGTCT,18664.0,100.000000,19685.0,high cycle,3.0,40.0,Neat
18,16314.0,scrapate,LUL Gran B,gran,q24,TATCGTC,18732.0,54.717532,19801.0,high cycle,4.0,2.0,Neat
19,16314.0,scrapate,LUL Gran B,gran,q27,GCGGTAG,10864.0,31.734533,11239.0,high cycle,4.0,2.0,Neat
20,16314.0,scrapate,LUL Gran B,gran,q23,CGGCGCT,4638.0,13.547935,4708.0,high cycle,4.0,2.0,Neat
68,16314.0,scrapate,LUL Gran 3,gran,,,,,,,5.0,13.0,Neat
69,16314.0,scrapate,RLL Gran G,gran,,,,,,,6.0,98.0,Neat
27,16314.0,scrapate,RUL Gran 1,gran,q24,CCTGCGT,48095.0,88.816458,55582.0,high cycle,7.0,1.0,Neat
28,16314.0,scrapate,RUL Gran 1,gran,q23,CGGCGCT,5568.0,10.282359,5661.0,high cycle,7.0,1.0,Neat
