## Updated as of 2016 July 26
#### FUNCTIONAL!

### REMOVED from Master Barcode_XYZ_7.14.16-abbr3.xlsx

i.e. skepticals

  * 17814_ACC0A ( pet-neg, cfu-neg )
  * 17814_ACC0E ( pet-neg, cfu-neg )
  
  
  * 17914_RLL24A ( pet-neg, cfu-neg )
  * 17914_RLL0A ( pet-neg, cfu-neg )
  * 17914_RLL0B ( pet-neg, cfu-neg )
  * 17914_RLL0C ( pet-neg, cfu-neg )
  * 17914_RLL0D ( pet-neg, cfu-neg )
  * 17914_RLL0E ( pet-neg, cfu-neg )
  * 17914_RLL0F ( pet-neg, cfu-neg )


  * 18014_RLL06A ( pet-neg, cfu-neg )


  * 18114_ACCD ( wasn't sent )
  * 18114_ACCE ( wasn't sent )
  * (include RLL6)

### CHANGES TO BE MADE
  * remove all lesions which: 1) have letter designation , and 2) no barcode
  * get rid of all the lesions not seen on PET and not culturable.
  * keep lesions that have no barcode and not seen on scan, but have CFU & not lost in pipeline 
  * 18114 ACC D and ACC E - not sent, don't include
  * don't include ones lost in the pipeline


### PLOT CHANGES
  * differentiate no barcode but CFU+ from steriles (i.e. replace masking with former)


In [1]:

import numpy as np
import pandas as pd
import os

In [8]:
FILE = '../data/Master Barcode_XYZ_7.14.16-abbr3.xlsx'
xlsx = pd.ExcelFile(FILE)
sheetnames = xlsx.sheet_names
OUTPUT_FOLDER = "../data/wrangled/"

In [3]:
EXCLUDE = {'18114':['ACCD','ACCE']}

In [4]:
def prep_data(data, monkey):
    def clean_up_initial_data(row):
        row['lesion'] = row.lesion.split("_")[1]
        row['barcode'] = row.barcode.strip() if pd.notnull(row.barcode) else 'x'
        row['lesion_locus'] = row.lesion_tissue_type if pd.isnull(row.lesion_locus) else row.lesion_locus.strip()
        row['cfuonly'] = 1 if ((row['barcode'] == 'x') and (row['cfu']>0)) else 0
        
        return row

    # don't include ones lost or not recovered
    data = data.loc[(data.nx_neg==0) # not recovered in nx
                    & (data.lost_pp==0) # not lost in pipeline (or skeptical)
                    & (data.secondary==0) # not secondary lesion
                   ]
    # if custom exclusions
    
    data.drop(['comments','nx_neg', 'lost_pp', 'secondary'], axis=1, inplace=True)
    # clean up data
    data = data.apply(clean_up_initial_data, axis=1)
    data['present'] = 0
    data.reset_index(inplace=True,drop=True)
    if monkey in EXCLUDE: data = data.loc[data.lesion.isin(EXCLUDE[monkey])==False]

    return data

In [5]:
# construct lesions table sorted by z, and assign lesion_id, assign to data
def construct_lesions_table(data):
    def apply_num_barcodes(group):
        series = group[:1]
        series['num_barcodes'] = len(group.loc[group.barcode!='x'])
        return series
    lesions = data.groupby('lesion', as_index=False).apply(apply_num_barcodes).reset_index(drop=True)
    lesions.sort_values(by=['lesion_tissue_type','lesion_locus','z'],ascending=False,inplace=True) 
    lesions['lesion_id'] = range(len(lesions))
    lesions.set_index('lesion',inplace=True)
    data.loc[:,'lesion_id'] = data.lesion.apply(lambda x: lesions.loc[x,'lesion_id'])
    return data, lesions

In [15]:
# construct barcodes table and assign barcode_id based on lesion_id, assign to data
def construct_barcodes_table(data, lesions):
    def apply_lesion_info_to_barcodes(group):
        barcode = group.name
        keys = [ 'min_lesion_id','num_lesions', 'dissem_type','grans_present', 'lesions' ]
        num_lesions = len(group)
        gran_lesions_ids = group.loc[group.lesion_tissue_type=='Gran']
        min_lesion_id = min(gran_lesions_ids.lesion_id) if (barcode!='x') and (len(gran_lesions_ids)>0) else len(lesions)
        grans_present = True if (len(gran_lesions_ids)>0) and barcode!='x' else False
       
        dissem_type = 'reclassify' if barcode=='x' else 'disseminated' if len(group)>1 else 'contained'
        group_lesions = group.lesion.values
        return pd.Series( [min_lesion_id, num_lesions, dissem_type, grans_present, group_lesions], keys )
    
    def apply_barcode_info_to_data(row):
        barcode_info = barcodes.loc[row.barcode,:]
        row['barcode_id'] = barcode_info.barcode_id
        row['dissem_type'] = barcode_info.dissem_type
        return row

    barcodes = data.groupby(['barcode']).apply(apply_lesion_info_to_barcodes).reset_index()
    barcodes.sort_values(by=['dissem_type','grans_present','min_lesion_id','num_lesions','barcode'],
                         ascending=[False,False,True,False,True], inplace=True)
    barcodes.set_index(['barcode'], inplace=True)
    
    barcodes['barcode_id'] = range(len(barcodes))
    data = data.apply(apply_barcode_info_to_data, axis=1)
    
    data.loc[:,'dissem_type'] = data.apply(lambda x: x.dissem_type if x.dissem_type != 'reclassify' 
                                           else 'sterile' if x.cfu==0 else 'cfuonly', axis=1)
    return data, barcodes


In [16]:
all_data = {}
if os.path.isdir(OUTPUT_FOLDER) == False: os.mkdir(OUTPUT_FOLDER)
for sheet in sheetnames:
    monkey = str(sheet.split("_")[0])
    
    data = xlsx.parse(sheet)
    data = prep_data(data, monkey)
    data, lesions = construct_lesions_table(data)
    data, barcodes = construct_barcodes_table(data, lesions)
    
    data.loc[data.sterile==1,'present'] = 2
    data.loc[data.cfuonly==1,'present'] = 3
    all_data[monkey] = (data, lesions, barcodes)
    data.to_csv('../data/wrangled/wrangled-%s.csv'%monkey)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
