# Purpose:

2015-02-16 (Monday)

Generate code to import and summarize collection field records in pandas while the real DB is being finished.

# Implementation:

## Imports:

In [1]:
# imports
import csv
import itertools

import pandas as pd
import numpy as np

import munch

from tribool import Tribool

In [2]:
# for plotting
import ggplot as g


## File paths:

In [3]:
# define paths to files
# All collection-data files live under this one directory.
collection_data_dir = "/home/gus/Documents/YalePostDoc/project_stuff/g_f_fucipes_uganda/collection_data"

# Excel workbooks that are passed to load_xl_sheets() further down.
spring_summer = collection_data_dir + "/2014_spring_summer_from_rob.xlsx"
fall = collection_data_dir + "/2014_fall_for_pandas.xlsx"

# Path for an HDF5 file (not used by any of the cells shown in this notebook).
h5_out_path = collection_data_dir + "/hdf5/2014_collection_records.h5"

In [4]:
# CSV read by get_village_id_map(): column 0 of each row is the village code,
# column 1 is the long-form village name that gets mapped to that code.
village_id_map_path = "/home/gus/Dropbox/uganda_data/data_repos/field_data/locations/names/uganda_village_id_map.csv"

----
# Helper functions

In [5]:
def get_village_id_map(village_id_map_path):
    """
    Generates and returns a `dict` mapping the long-form village names to the letter codes.
    Letter codes map back to themselves to ensure a one way mapping.
    Enforces both be all UPPERcase to allow case insensitivity as long as
    the map is used like: `map[text.upper()]`.

    Each CSV row is read as `(letter code, long-form name)`.  Completely blank
    rows (e.g. a trailing empty line) are skipped.  The file is decoded as
    UTF-8 so non-ASCII village names do not raise during the unicode conversion.

    :param village_id_map_path: path to the two-column, comma-delimited CSV file.
    :return: `dict` with unicode keys and values
    :raises IndexError: if a non-blank row has fewer than two columns.
    """
    import io
    import sys

    village_id_map = {}

    if sys.version_info[0] < 3:
        # Python 2's csv module only handles byte strings, so read bytes and
        # decode each cell afterwards.
        csv_file = open(village_id_map_path, 'rb')
        to_text = lambda cell: cell.decode('utf-8')
    else:
        # Python 3's csv module wants text opened with newline=''.
        csv_file = io.open(village_id_map_path, 'r', newline='', encoding='utf-8')
        to_text = lambda cell: cell

    with csv_file:
        for row in csv.reader(csv_file, delimiter=','):
            # csv.reader yields [] for an empty line; there is nothing to map.
            if not row:
                continue

            # Decode before upper-casing so the case folding works on text, not bytes.
            code = to_text(row[0]).upper()
            long_name = to_text(row[1]).upper()

            village_id_map[code] = code
            village_id_map[long_name] = code

    return village_id_map

In [6]:
def load_xl_sheets(xl_path):
    """
    Reads the dissection sheets of an Excel workbook into dataframes.

    Only sheets whose name starts with "DISSECT" (compared in upper case) are
    parsed; every other sheet is ignored.

    :param xl_path: path to the Excel workbook.
    :return: `munch.Munch` mapping each selected sheet name to its dataframe.
    """
    # Options shared by every sheet: first row is the header and the literal
    # text 'NA' is read as a missing value.
    parse_options = dict(
        header=0,
        skiprows=None, skip_footer=0,
        index_col=None, parse_cols=None,
        parse_dates=False, date_parser=None,
        na_values=['NA'],
        thousands=None, chunksize=None,
        convert_float=False,
        has_index_names=False, converters=None,
    )

    sheet_frames = munch.Munch()
    workbook = pd.ExcelFile(xl_path)

    for sheet_name in workbook.sheet_names:
        if not sheet_name.upper().startswith("DISSECT"):
            continue
        sheet_frames[sheet_name] = workbook.parse(sheetname=sheet_name, **parse_options)

    return sheet_frames

In [7]:
def recode_villages(df):
    """
    Replaces `df.Village` in place with the village letter codes.

    Each value is upper-cased and looked up in the module-level
    `village_id_map`; a value that is not in the map raises `KeyError`.

    :param df: dataframe with a `Village` column of strings.
    :return: None
    """
    def to_code(village_name):
        return village_id_map[village_name.upper()]

    df.Village = df.Village.apply(to_code)

In [100]:
def recode_dead(df):
    """
    Recodes `df.Dead` in place.

    True means DEAD, False means LIVE or NOT-DEAD, None means unknown.
    Missing values (NaN/None) are passed through unchanged.

    Accepted spellings (by prefix): 'L...' / '0...' -> False,
    'D...' / '1...' -> True, 'UN...' -> None.  'TRUE...' / 'FALSE...' are also
    accepted so that re-running this function on an already recoded column
    works instead of raising.

    :param df: dataframe with a `Dead` column.
    :return: None
    :raises ValueError: if a value matches none of the accepted spellings.
    """
    def recode_func(x):
        # this is treated as an unknown case
        if pd.isnull(x):
            return x

        text = unicode(x)
        shouted = text.upper()

        # deal with Live type cases
        if shouted.startswith('L') or text.startswith('0'):
            return False

        # deal with Dead type cases
        if shouted.startswith('D') or text.startswith('1'):
            return True

        # deal with values that were already recoded to booleans
        if shouted.startswith('TRUE'):
            return True

        if shouted.startswith('FALSE'):
            return False

        # deal with unknown type cases
        if shouted.startswith('UN'):
            return None

        msg = "The value {x} was not expected and this function must be corrected to continue.".format(x=text)
        raise ValueError(msg)

    df.Dead = df.Dead.apply(recode_func)

##########################################

def recode_teneral(df):
    """
    Recodes `df.Teneral` in place.

    True means teneral, False means NOT-teneral, None means unknown.
    Missing values (NaN/None) are passed through unchanged.

    Accepted spellings (by prefix): 'N...' / '0...' -> False,
    'T...' / '1...' -> True, 'UN...' -> None.  'FALSE...' is also accepted so
    that re-running this function on an already recoded column works
    ('TRUE...' is already covered by the 'T' prefix).

    :param df: dataframe with a `Teneral` column.
    :return: None
    :raises ValueError: if a value matches none of the accepted spellings.
    """
    def recode_func(x):
        # this is treated as an unknown case
        if pd.isnull(x):
            return x

        text = unicode(x)
        shouted = text.upper()

        # deal with NOT-teneral type cases
        if shouted.startswith('N') or text.startswith('0'):
            return False

        # deal with a value that was already recoded to a boolean
        if shouted.startswith('FALSE'):
            return False

        # deal with Teneral type cases
        if shouted.startswith('T') or text.startswith('1'):
            return True

        # Deal with unknown type cases: unknown is None, never the raw text,
        # so the column only ever holds True / False / missing.
        if shouted.startswith('UN'):
            return None

        msg = "The value {x} was not expected and this function must be corrected to continue.".format(x=text)
        raise ValueError(msg)

    df.Teneral = df.Teneral.apply(recode_func)

##########################################

def recode_positives(df):
    """
    Recodes the `prob`, `midgut` and `sal_gland` columns of `df` in place.

    True means positive, False means negative, None means unknown.
    Missing values (NaN/None) are passed through unchanged.

    :param df: dataframe with `prob`, `midgut` and `sal_gland` columns.
    :return: None
    :raises ValueError: if a value matches none of the known prefixes.
    """
    # Prefixes are compared against the upper-cased text of each value.
    unknown_prefixes = ('UN', 'DEAD')
    positive_prefixes = ('1', 'TRUE', 'P', 'Y')
    negative_prefixes = ('NO', 'FALSE', '0')

    def recode_func(x):
        # a missing value is already "unknown"; hand it back untouched
        if pd.isnull(x):
            return x

        shouted = unicode(x).upper()

        if shouted.startswith(unknown_prefixes):
            return None

        if shouted.startswith(positive_prefixes):
            return True

        if shouted.startswith(negative_prefixes):
            return False

        msg = "The value {x} was not expected and this function must be corrected to continue.".format(x=x)
        raise ValueError(msg)

    # Attribute access is used on purpose so the columns are read and written
    # exactly as `df.prob`, `df.midgut` and `df.sal_gland`.
    for column in ('prob', 'midgut', 'sal_gland'):
        recoded = getattr(df, column).apply(recode_func)
        setattr(df, column, recoded)

##########################################

def recode_species(df):
    """
    Normalizes `df.Species` in place: every '.' is removed and the result is
    capitalized (first character upper case, the rest lower case).

    :param df: dataframe with a `Species` column of strings.
    :return: None
    """
    def recode_func(x):
        without_dots = x.replace('.', '')
        return without_dots.capitalize()

    df.Species = df.Species.apply(recode_func)

##########################################

def recode_sex(df):
    """
    Upper-cases every value of `df.Sex` in place.

    :param df: dataframe with a `Sex` column of strings.
    :return: None
    """
    def recode_func(x):
        return x.upper()

    df.Sex = df.Sex.apply(recode_func)

In [101]:
def aggregate_column_from_df_list(df_list, col_name):
    """
    Collects the values of one column from several dataframes into a flat list.

    :param df_list: iterable of dataframes that each have a `col_name` column.
    :param col_name: name of the column to collect.
    :return: `list` with the column values of every dataframe, in `df_list` order.
    """
    return [value for frame in df_list for value in frame[col_name]]

### Functions that add new columns

In [102]:
def add_infection_state_col(df):
    """
    Adds an `infection_state` column to `df` in place.

    A row is True when any of its `prob`, `midgut` or `sal_gland` values is
    truthy (missing values are skipped), otherwise False.

    :param df: dataframe with `prob`, `midgut` and `sal_gland` columns.
    :return: None
    """
    tissue_columns = ['prob', 'midgut', 'sal_gland']
    any_positive = df[tissue_columns].any(skipna=True, axis=1)
    df['infection_state'] = any_positive
    
        

----
# Helpful constants

In [103]:
# Module-level lookup table; recode_villages() reads this global by name.
village_id_map = get_village_id_map(village_id_map_path)

In [104]:
# Load every sheet whose name starts with "DISSECT" (case-insensitive) from each workbook.
spring_summer_dfs = load_xl_sheets(spring_summer)
fall_dfs = load_xl_sheets(fall)

In [105]:
# Peek at the first two rows of one fall sheet before any recoding has been applied.
# NOTE: indexing `.values()` directly relies on Python 2, where it returns a list.
fall_dfs.values()[0].head(2)

Unnamed: 0,Village,Trap_No,Date,Species,Sex,Teneral,Dead,Fly_Number,Hunger_stage,prob,midgut,sal_gland,Wing_fray,Kept_in,Comment
0,Kitgum town council,1,2014-10-08,Gff,M,T,,1,,0,0,0,,Ethanol,
1,Kitgum town council,1,2014-10-08,Gff,M,NT,,2,3.0,0,0,0,2.0,Ethanol,


In [106]:
# pdb

In [107]:
# list all dfs and run our recoders so far
df_all = list(spring_summer_dfs.values()) + list(fall_dfs.values())

# Cleaning steps, applied to each frame in this order; every step edits the frame in place.
recoding_steps = (
    recode_sex,
    recode_species,
    recode_villages,
    recode_positives,
    recode_dead,
    recode_teneral,
    add_infection_state_col,
)

# The loop variable keeps the name `df` because the following cell shows the
# last processed frame with `df.head()`.
for df in df_all:
    for step in recoding_steps:
        step(df)

In [108]:
# `df` is the loop variable left over from the recoding loop, i.e. the last frame in `df_all`.
df.head()

Unnamed: 0,Village,Trap_No,Date,Species,Sex,Teneral,Dead,Fly_Number,Hunger_stage,prob,midgut,sal_gland,Wing_fray,Kept_in,Comment,infection_state
0,LAG,13,2014-10-12,Gff,F,False,,1,3,False,False,False,2,ethanol,,False
1,LAG,13,2014-10-12,Gff,F,False,,2,2,False,False,False,2,ethanol,,False
2,LAG,13,2014-10-12,Gff,F,False,,3,2,False,False,False,3,ethanol,,False
3,LAG,12,2014-10-12,Gff,M,False,,4,3,False,False,False,2,ethanol,,False
4,LAG,12,2014-10-12,Gff,F,False,,5,1,False,False,False,3,ethanol,,False


In [109]:
# Spot-check the third frame in `df_all` after recoding.
df_all[2].head()

Unnamed: 0,Village,Trap_No,Date,Species,Sex,Teneral,Dead,Fly_Number,Hunger_stage,Wing_fray,prob,midgut,sal_gland,Kept_in,Comment,infection_state
0,GOR,2,2014/07/27,Gff,F,False,False,GOR-14 001,2,2,False,False,False,EtOH,,False
1,GOR,2,2014/07/27,Gff,F,False,False,GOR-14 002,2,2,False,False,False,EtOH,,False
2,GOR,2,2014/07/27,Gff,M,False,False,GOR-14 003,3,2,False,False,False,EtOH,,False
3,GOR,2,2014/07/27,Gff,F,False,False,GOR-14 004,3,2,False,False,False,EtOH,,False
4,GOR,2,2014/07/27,Gff,F,False,False,GOR-14 005,2,2,False,False,False,EtOH,,False


In [110]:
# Number of rows in the third frame where prob, midgut or sal_gland is True (element-wise OR, then sum).
(df_all[2].prob | df_all[2].midgut | df_all[2].sal_gland).sum()

0

In [111]:
# Pool the `prob` column of every frame into one flat list ...
prob_all = aggregate_column_from_df_list(df_all, 'prob')

# ... and count the entries equal to True.
prob_all.count(True)

0

In [112]:
# Pool the `midgut` column of every frame into one flat list ...
midgut_all = aggregate_column_from_df_list(df_all, 'midgut')

# ... and count the entries equal to True.
midgut_all.count(True)

51

In [113]:
# Pool the `sal_gland` column of every frame into one flat list ...
sal_gland_all = aggregate_column_from_df_list(df_all, 'sal_gland')

# ... and count the entries equal to True.
sal_gland_all.count(True)

1

In [114]:
# Pool the derived `infection_state` column of every frame into one flat list ...
infection_state_all = aggregate_column_from_df_list(df_all, 'infection_state')

# ... and count the entries equal to True.
infection_state_all.count(True)

51

---
# Combine the dataframes into a single one to rule them all

In [115]:
# Stack all recoded frames row-wise into one dataframe.
# NOTE(review): pd.concat keeps each frame's own row labels by default, so the
# labels in `df_big` are presumably repeated across sheets; pass
# ignore_index=True if unique labels are ever needed.
df_big = pd.concat(df_all)

In [116]:
# Total number of rows in the combined dataframe.
len(df_big)

5335

In [117]:
# First rows of the combined dataframe.
df_big.head()

Unnamed: 0,Comment,Date,Dead,Fly_Number,Hunger_stage,Kept_in,Sex,Species,Teneral,Trap_No,Village,Wing_fray,infection_state,midgut,prob,sal_gland
0,,2014/07/22,False,UWA-14 001,2,EtOH,F,Gff,False,1,UWA,2,False,False,False,False
1,,2014/07/22,False,UWA-14 002,3,EtOH,F,Gff,False,1,UWA,2,False,False,False,False
2,,2014/07/22,False,UWA-14 003,3,EtOH,M,Gff,False,1,UWA,2,False,False,False,False
3,,2014/07/22,False,UWA-14 004,3,EtOH,M,Gff,False,1,UWA,2,False,False,False,False
4,,2014/07/22,False,UWA-14 005,3,EtOH,M,Gff,False,1,UWA,2,False,False,False,False


---
# Pivot this table

In [121]:
# Count the rows whose Species is "Gff" for every (Village, Sex, Hunger_stage)
# combination, with one output column per infection_state value.
# `aggfunc=[len]` counts rows; `fill_value=0` fills combinations that have no rows.
# NOTE(review): `values=['Count']` names a column that is not among the columns
# shown by `df_big.head()` in this notebook, and the execution counts jump from
# In [117] to In [121].  Confirm whether a deleted cell created `Count` or
# whether the argument should be dropped; newer pandas versions may reject a
# `values` column that does not exist.
table = pd.pivot_table(df_big.query('Species == ["Gff"]'),index=['Village','Sex','Hunger_stage'],
                      values=['Count'],
                      columns=['infection_state'],
                      fill_value=0,
                      aggfunc=[len])
table

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,len,len
Unnamed: 0_level_1,Unnamed: 1_level_1,infection_state,False,True
Village,Sex,Hunger_stage,Unnamed: 3_level_2,Unnamed: 4_level_2
ABO,M,3.0,2,0
ACA,F,1.0,20,1
ACA,F,2.0,147,8
ACA,F,3.0,128,1
ACA,M,2.0,15,1
ACA,M,3.0,51,1
AG,F,1.0,1,0
AG,F,2.0,5,0
AG,F,3.0,9,0
AIN,F,1.0,4,0


In [94]:
# NOTE(review): the output stored under this cell carries execution count In [94],
# which is earlier than the pivot cell (In [121]), and shows a
# Village/Date/Species by Sex layout.  It appears to come from an earlier
# definition of `table`; re-run the notebook top to bottom to refresh it.
table

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,len,len,len
Unnamed: 0_level_1,Unnamed: 1_level_1,Sex,F,M,m
Village,Date,Species,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
ABO,21/05/14,Gff,0,5,0
ABO,22/05/14,Gff,2,2,0
ACA,18/5/2014,Gff,11,9,0
ACA,20/05/14,Gff,70,75,0
ACA,2014/07/16,Gff,57,43,0
ACA,2014/07/17,Gff,28,16,0
ACA,2014/07/18,Gff,100,75,0
ACA,2014/07/19,Gff,49,25,0
ACA,2014/07/20,Gff,100,55,0
ACA,21/05/14,Gff,43,8,0
