## Updated as of 2016 July 26
#### FUNCTIONAL!

### REMOVED from Master Barcode_XYZ_7.14.16-abbr3.xlsx

i.e. the skeptical ones:

  * 17814_ACC0A ( pet-neg, cfu-neg )
  * 17814_ACC0E ( pet-neg, cfu-neg )
  
  
  * 17914_RLL24A ( pet-neg, cfu-neg )
  * 17914_RLL0A ( pet-neg, cfu-neg )
  * 17914_RLL0B ( pet-neg, cfu-neg )
  * 17914_RLL0C ( pet-neg, cfu-neg )
  * 17914_RLL0D ( pet-neg, cfu-neg )
  * 17914_RLL0E ( pet-neg, cfu-neg )
  * 17914_RLL0F ( pet-neg, cfu-neg )


  * 18014_RLL06A ( pet-neg, cfu-neg )


  * 18114_ACCD ( wasn't sent )
  * 18114_ACCE ( wasn't sent )
  * (include RLL6)

### CHANGES DONE
  * get rid of all the lesions not seen on PET and not culturable (i.e. remove all lesions which (1) have a letter designation / are pet-neg, and (2) have no barcode)
  * keep lesions that have no barcode and were not seen on scan, but have CFU and were not lost in the pipeline
  * 18114 ACC D and ACC E - not sent, don't include
  * don't include ones lost in the pipeline


### PLOT CHANGES
  * change masking to "not present"


In [106]:
import numpy as np
import pandas as pd
import os

In [107]:
# Source workbook; the driver loop parses every worksheet listed in `sheetnames`.
FILE = '../data/Master Barcode_XYZ_7.14.16-abbr4.xlsx'
xlsx = pd.ExcelFile(FILE)
sheetnames = xlsx.sheet_names
# Output path pieces; they are concatenated with "" to form the folder the CSVs go under.
OUTPUT_FOLDER = ["../data/wrangled/",'abbr4/']

In [108]:
# Custom exclusions applied at the end of prep_data: the key is the sheet-name
# prefix (text before the first "_"), the value lists the lesion names to drop
# (as they read after prep_data has stripped the lesion prefix).
EXCLUDE = {'18114':['ACCD','ACCE']}

In [109]:
def prep_data(sheet, monkey=None):
    """Parse one worksheet of ``xlsx`` and keep only the rows used downstream.

    sheet  -- name of the worksheet to parse from the module-level ``xlsx``.
    monkey -- key used to look up custom exclusions in ``EXCLUDE``. When
              omitted it is taken from ``sheet`` (text before the first "_").

    Returns a DataFrame with the bookkeeping columns removed, ``lesion``
    reduced to its second underscore-separated field, and missing
    ``lesion_locus`` values filled from ``lesion_tissue_type``.
    """
    if monkey is None:
        # Derive the key from the sheet name so the function does not depend
        # on a global that only exists while the driver loop is running.
        monkey = str(sheet.split("_")[0])

    def clean_up_initial_data(row):
        # "<prefix>_<name>" -> "<name>"
        row['lesion'] = row.lesion.split("_")[1]
        # fall back to the tissue type when no locus was recorded
        row['lesion_locus'] = row.lesion_tissue_type if pd.isnull(row.lesion_locus) else row.lesion_locus.strip()
        return row

    data = xlsx.parse(sheet)
    # don't include ones lost or not recovered or sterile or cfu only (only have active ones)
    keep = (
        (data.nx_neg == 0)        # not recovered in nx
        & (data.lost_pp == 0)     # not lost in pipeline (or skeptical)
        & (data.secondary == 0)   # not secondary lesion
        & (data.sterile == 0)     # not sterile
        & ~(data.z.isnull() & (data.cfu == 0))  # drop rows with no z whose cfu is 0
    )

    # clean up the df; drop() returns a new frame, so nothing is mutated
    # in place on a slice of the parsed sheet
    data = data.loc[keep].drop(
        ['comments', 'nx_neg', 'lost_pp', 'secondary', 'sterile'], axis=1)
    data = data.apply(clean_up_initial_data, axis=1)
    data = data.reset_index(drop=True)

    # if custom exclusions
    if monkey in EXCLUDE:
        data = data.loc[~data.lesion.isin(EXCLUDE[monkey])]
    return data

In [110]:
# def construct_barcode_refs(data):
#     def parse_barcode_tuples(row):
#         les, zs = row.item()
#         keys = ['lesion_list', 'max_z','num_lesions','barcode_dissem_type']
#         lesion_list = lesion_list
#         grans = 
#         gran_pos = 
        
#         values = [les, max(zs),len(les), 'disseminated' if len(les)>1 else 'contained']
#         return pd.Series(values,keys)

#     # construct table
#     barcode_counts = pd.DataFrame(data.groupby('barcode').apply(lambda x: (x.lesion.values, x.z)))
#     barcode_refs = barcode_counts.apply(parse_barcode_tuples, axis=1).sort_values(
#         by=['num_lesions','max_z'], ascending=False)
#     # assign barcode id based on sort
#     barcode_refs['barcode_id'] = range(1,len(barcode_refs)+1)
#     # assign barcode prop to data
#     for prop in ['barcode_dissem_type','barcode_id']:
#         data.loc[:,prop] = data.barcode.apply(lambda x: barcode_refs.loc[x,prop])
#     return data, barcode_refs

In [111]:
def construct_barcode_refs(data):
    """Build a per-barcode summary table and copy its properties onto ``data``.

    For every barcode the summary holds the lesions it occurs in, whether any
    of them has tissue type 'Gran', the largest z among those 'Gran' rows
    (-inf when there are none), the row count, and 'disseminated' /
    'contained' depending on whether the barcode has more than one row.

    Barcodes are ranked by (gran_pos, barcode_dissem_type, max_lesion_z), all
    descending, and numbered from 1 in that order as ``barcode_id``.

    Returns ``(data, barcode_refs)``; ``data`` is modified in place and gains
    the columns barcode_dissem_type, barcode_id and max_lesion_z.
    """

    def summarize_barcode(rows):
        gran_rows = rows.loc[rows.lesion_tissue_type == 'Gran']
        has_gran = len(gran_rows) > 0
        row_count = len(rows)
        summary = {
            'lesion_list': rows.lesion.values,
            'gran_pos': has_gran,
            'max_lesion_z': max(gran_rows.z.values) if has_gran else -float('inf'),
            'num_lesions': row_count,
            'barcode_dissem_type': 'disseminated' if row_count > 1 else 'contained',
        }
        return pd.Series(summary.values(), summary.keys())

    # one summary row per barcode
    per_barcode = data.groupby('barcode').apply(summarize_barcode)
    barcode_refs = pd.DataFrame(per_barcode)
    barcode_refs = barcode_refs.sort_values(
        by=['gran_pos', 'barcode_dissem_type', 'max_lesion_z'],
        ascending=False,
    )

    # ids follow the sorted order, starting at 1
    barcode_refs['barcode_id'] = range(1, len(barcode_refs) + 1)

    # copy the per-barcode properties onto every row of data
    for prop in ['barcode_dissem_type', 'barcode_id', 'max_lesion_z']:
        data.loc[:, prop] = data.barcode.apply(
            lambda code: barcode_refs.loc[code, prop])
    return data, barcode_refs

In [157]:
def construct_lesion_refs(data, barcode_refs):
    """Build a per-lesion summary table and copy its properties onto ``data``.

    For every lesion the summary takes tissue type, locus and z from the
    lesion's first row, records the smallest ``barcode_id`` found in the
    lesion, and labels the lesion 'disseminating' when the barcode carrying
    that id is 'disseminated' in ``barcode_refs`` (otherwise 'contained').

    Lesions are ordered by tissue type, locus, z (descending) and
    min_barcode_id, and numbered from 1 in that order as ``lesion_id``.

    Returns ``(data, lesion_refs)``; ``data`` is modified in place: it gains
    lesion_dissem_type and lesion_id, and its z column is overwritten with the
    per-lesion value.
    """
    info_fields = ['lesion_tissue_type', 'lesion_locus', 'z',
                   'min_barcode_id', 'lesion_dissem_type']

    def summarize_lesion(rows):
        first_row = rows[:1][['lesion_tissue_type', 'lesion_locus', 'z']].values[0]
        tissue, locus, z_value = first_row

        barcode_ids = rows.barcode_id.values
        lowest_id = min(barcode_ids)
        # the barcode that owns the lowest id within this lesion
        lead_barcode = rows.barcode.values[barcode_ids == lowest_id][0]

        if barcode_refs.loc[lead_barcode, 'barcode_dissem_type'] == 'disseminated':
            dissem = 'disseminating'
        else:
            dissem = 'contained'
        return pd.Series([tissue, locus, z_value, lowest_id, dissem], info_fields)

    lesion_refs = data.groupby('lesion').apply(summarize_lesion)
    lesion_refs = lesion_refs.reset_index().set_index('lesion')
    lesion_refs = lesion_refs.sort_values(
        by=['lesion_tissue_type', 'lesion_locus', 'z', 'min_barcode_id'],
        ascending=[True, True, False, True],
    )

    # ids follow the sorted order, starting at 1
    lesion_refs['lesion_id'] = range(1, len(lesion_refs) + 1)

    # copy the per-lesion properties onto every row of data
    for prop in ['lesion_dissem_type', 'lesion_id', 'z']:
        data.loc[:, prop] = data.lesion.apply(
            lambda name: lesion_refs.loc[name, prop])
    return data, lesion_refs

In [158]:
# Wrangle every worksheet; results are keyed by the sheet-name prefix and hold
# the tuple (wrangled rows, lesion table, barcode table).
all_data = {}
for sheet in sheetnames:
    monkey = str(sheet.split("_")[0])
    data = prep_data(sheet)
    # rows without a z get -inf for x, y and z so they compare below any real value
    data.loc[data.z.isnull(),['x','y','z']] = -float('inf')
    data, barcodes = construct_barcode_refs(data)
    data, lesions = construct_lesion_refs(data, barcodes)
    # cherry on top: sort_values returns a new frame, so the result has to be
    # assigned back for the ordering to take effect
    data = data.sort_values(by=['lesion_id','z','barcode_id'],ascending=[True,False,True])
    data['present'] = 0
    all_data[monkey] = (data, lesions, barcodes)


In [160]:
# Create the output folder one path piece at a time (os.mkdir only makes the
# last component, so each prefix has to exist before the next one is made).
folder = ""
for piece in OUTPUT_FOLDER:
    folder = folder + piece
    if not os.path.isdir(folder):
        os.mkdir(folder)

output_folder = "".join(OUTPUT_FOLDER)
dfnames = ['wrangled','lesions','barcodes']

# One CSV per (table kind, monkey): <output_folder><kind>/<kind>-<monkey>.csv
for monkey, dfs in all_data.items():
    for df, name in zip(dfs, dfnames):
        path = "%s%s"%(output_folder,name)
        if not os.path.isdir(path):
            os.mkdir(path)
        df.to_csv("%s/%s-%s.csv"%(path,name,monkey))