# Purpose:

2015-02-16 (Monday)

Generate code to import and summarize collection field records in pandas while the real DB is being finished.

# Implementation:

## Imports:

In [31]:
# imports
import csv
import itertools

import pandas as pd
import numpy as np

import munch

In [2]:
# for plotting
import ggplot as g


## File paths:

In [3]:
# define paths to files
# NOTE(review): these are absolute paths on one machine; they must be adjusted
# before the notebook can run anywhere else.

# Excel workbooks that are handed to `load_xl_sheets` further down.
spring_summer = "/home/gus/Documents/YalePostDoc/project_stuff/g_f_fucipes_uganda/collection_data/2014_spring_summer_from_rob.xlsx"
fall =          "/home/gus/Documents/YalePostDoc/project_stuff/g_f_fucipes_uganda/collection_data/2014_fall_for_pandas.xlsx"


# HDF5 output location; not referenced by any of the cells shown here.
h5_out_path = "/home/gus/Documents/YalePostDoc/project_stuff/g_f_fucipes_uganda/collection_data/hdf5/2014_collection_records.h5"

In [4]:
# CSV file read by `get_village_id_map` to build the village name -> code lookup.
village_id_map_path = "/home/gus/Dropbox/uganda_data/data_repos/field_data/locations/names/uganda_village_id_map.csv"

----
# Helper functions

In [48]:
def get_village_id_map(village_id_map_path):
    """
    Generates and returns a `dict` mapping the long-form village names to the letter codes.
    Letter codes map back to themselves to ensure a one way mapping.
    Enforces both be all UPPERcase to allow case insensitivity as long as
    the map is used like: `map[text.upper()]`.

    Each CSV row is read as ``<letter code>,<long-form name>``.  Rows that are
    completely blank (no fields, or only empty/whitespace fields) are skipped.

    :param village_id_map_path: path to the comma-delimited village id CSV file.
    :return: `dict` of UPPERcase text -> UPPERcase letter code.
    :raises IndexError: if a non-blank row has fewer than two fields.
    """

    # Python 2's csv module wants a binary-mode file and text is `unicode`;
    # Python 3's wants a text-mode file opened with newline='' and text is `str`.
    try:
        text_type, mode, open_extras = unicode, 'rb', {}
    except NameError:
        text_type, mode, open_extras = str, 'r', {'newline': ''}

    village_id_map = {}

    with open(village_id_map_path, mode, **open_extras) as csv_file:
        for row in csv.reader(csv_file, delimiter=','):
            # csv.reader yields [] for an empty line (e.g. a trailing newline);
            # indexing such a row would raise IndexError, so skip it.
            if not any(field.strip() for field in row):
                continue

            code = text_type(row[0].upper())
            village_id_map[code] = code                               # code -> itself
            village_id_map[text_type(row[1].upper())] = code          # long name -> code

    return village_id_map

In [6]:
def load_xl_sheets(xl_path):
    """
    Parse the dissection worksheets of an Excel workbook into DataFrames.

    Only sheets whose name starts with "DISSECT" (compared in UPPERcase) are
    parsed; every other sheet is ignored.

    :param xl_path: path to the Excel workbook.
    :return: `munch.Munch` keyed by sheet name, one parsed frame per kept sheet.
    """
    # Keyword options passed unchanged to every `parse` call.
    parse_options = dict(
        header=0,
        skiprows=None,
        skip_footer=0,
        index_col=None,
        parse_cols=None,
        parse_dates=False,
        date_parser=None,
        na_values=['NA'],
        thousands=None,
        chunksize=None,
        convert_float=False,
        has_index_names=False,
        converters=None,
    )

    sheets = munch.Munch()
    workbook = pd.ExcelFile(xl_path)

    for name in workbook.sheet_names:
        if not name.upper().startswith("DISSECT"):
            continue
        sheets[name] = workbook.parse(sheetname=name, **parse_options)

    return sheets

In [125]:
def recode_villages(df, id_map=None):
    """
    Replace every entry of the `Village` column with its village letter code, in place.

    Each value is UPPERcased and looked up in the village id map, so both
    long-form names and letter codes (in any case) end up as letter codes.

    :param df: `DataFrame` with a `Village` column of strings.
    :param id_map: optional `dict` of UPPERcase name/code -> letter code.
        Defaults to the notebook-level `village_id_map`.
    :return: None; `df` is modified in place.
    :raises KeyError: if an UPPERcased value is not a key of the map.
    :raises AttributeError: if a value has no `.upper()` (e.g. a missing value).
    """
    if id_map is None:
        # Fall back to the notebook-level lookup, resolved at call time.
        id_map = village_id_map

    df['Village'] = df['Village'].apply(lambda name: id_map[name.upper()])

In [299]:
def recode_dead(df):
    """
    Recode the `Dead` column of `df` in place.

    Missing entries are first filled with 0.  Text starting with 'L' (any case)
    becomes False (live / not dead); text starting with 'D' becomes True (dead).
    Values without string methods (numbers, booleans, the filled-in 0) are
    passed through unchanged.

    :param df: `DataFrame` with a `Dead` column.
    :return: None; `df` is modified in place.
    :raises ValueError: for text that starts with neither 'L' nor 'D'.
    """
    def _dead_flag(value):
        try:
            label = value.upper()
            is_live = label.startswith('L')
            is_dead = label.startswith('D')
        except AttributeError:
            # not text: leave the value exactly as it is
            return value

        if is_live:
            return False
        if is_dead:
            return True

        raise ValueError(
            "The value {x} was not expected and this function must be corrected to continue.".format(x=value)
        )

    filled = df.Dead.fillna(0)
    df.Dead = filled.apply(_dead_flag)

In [298]:
def recode_teneral(df):
    """
    Recode the `Teneral` column of `df` in place.

    Missing entries are first filled with 0.  Text starting with 'NT' (any
    case) becomes False (not teneral); text starting with 'T' becomes True
    (teneral).  Values without string methods (numbers, booleans, the
    filled-in 0) are passed through unchanged.

    :param df: `DataFrame` with a `Teneral` column.
    :return: None; `df` is modified in place.
    :raises ValueError: for text that starts with neither 'NT' nor 'T'.
    """
    def _teneral_flag(value):
        try:
            label = value.upper()
            not_teneral = label.startswith('NT')
            teneral = label.startswith('T')
        except AttributeError:
            # not text: leave the value exactly as it is
            return value

        if not_teneral:
            return False
        if teneral:
            return True

        raise ValueError(
            "The value {x} was not expected and this function must be corrected to continue.".format(x=value)
        )

    filled = df.Teneral.fillna(0)
    df.Teneral = filled.apply(_teneral_flag)

In [259]:
def recode_positives(df):
    """
    Recode the `prob`, `midgut` and `sal_gland` columns of `df` to booleans, in place.

    Missing entries are first filled with 0 and therefore become False.
    A value becomes False when its text form (UPPERcased) starts with
    'UN', 'NO', 'FALSE' or 'DEAD', or when it equals 0.
    A value becomes True when it equals 1, or when its text form starts with
    'TRUE', 'P' or 'Y'.

    Columns are processed in the order prob, midgut, sal_gland; if one raises,
    the columns recoded before it stay recoded.

    :param df: `DataFrame` holding the three columns.
    :return: None; `df` is modified in place.
    :raises ValueError: for any value matching none of the rules above.
    """
    # `unicode` only exists on Python 2; `str` is the text type on Python 3.
    try:
        text_type = unicode
    except NameError:
        text_type = str

    false_prefixes = ('UN', 'NO', 'FALSE', 'DEAD')
    true_prefixes = ('TRUE', 'P', 'Y')

    def am_i_true_or_false(x):
        # UPPERcase the text form once instead of once per check
        label = text_type(x).upper()

        # deal with false type cases
        if label.startswith(false_prefixes) or x == 0:
            return False

        # deal with true type cases
        if x == 1 or label.startswith(true_prefixes):
            return True

        msg = "The value {x} was not expected and this function must be corrected to continue.".format(x=x)
        raise ValueError(msg)

    # same recode for each of the three columns
    for column in ('prob', 'midgut', 'sal_gland'):
        df[column] = df[column].fillna(0).apply(am_i_true_or_false)
    

In [44]:
def aggregate_column_from_df_list(df_list, col_name):
    """
    Concatenate one column from several frames into a single flat list.

    :param df_list: iterable of frames (anything indexable by `col_name`).
    :param col_name: key of the column to collect from every frame.
    :return: `list` of the column's values, in frame order then row order.
    """
    return [value for frame in df_list for value in frame[col_name]]

### Functions that add new columns

In [281]:
def add_infection_state_col(df):
    """
    Add an `infection_state` column to `df`, in place.

    The new column is the element-wise OR of the `prob`, `midgut` and
    `sal_gland` columns, which are expected to already hold booleans
    (e.g. after `recode_positives`).

    :param df: `DataFrame` holding the three columns.
    :return: None; `df` is modified in place.
    """
    # A *new* column must be created with bracket assignment: attribute-style
    # assignment only sets a plain Python attribute on the frame, so the column
    # would never show up in `df.columns` or `df['infection_state']`.
    df['infection_state'] = df.prob | df.midgut | df.sal_gland
    
        

----
# Helpful constants

In [96]:
# Notebook-level village lookup; `recode_villages` reads this global by name.
village_id_map = get_village_id_map(village_id_map_path)

In [271]:
# Load the "DISSECT..." sheets of each workbook, keyed by sheet name.
spring_summer_dfs = load_xl_sheets(spring_summer)
fall_dfs = load_xl_sheets(fall)

In [272]:
# Peek at two rows of whichever fall sheet `.values()` lists first
# (indexing `.values()` relies on it returning a list, i.e. Python 2).
fall_dfs.values()[0].head(2)

Unnamed: 0,Village,Trap_No,Date,Species,Sex,Teneral,Dead,Fly_Number,Hunger_stage,prob,midgut,sal_gland,Wing_fray,Kept_in,Comment
0,Kitgum town council,1,2014-10-08,Gff,M,T,,1,,0,0,0,,Ethanol,
1,Kitgum town council,1,2014-10-08,Gff,M,NT,,2,3.0,0,0,0,2.0,Ethanol,


In [273]:
# Try the village recoder on a single fall sheet; `df` is recoded in place.
df = fall_dfs.values()[0]
recode_villages(df)

In [303]:
# Scratch check: `sum` cannot add a str to the booleans (TypeError, as recorded below).
sum(['toy',True,True])

TypeError: unsupported operand type(s) for +: 'int' and 'str'

In [275]:
# Distinct values in `prob` before `recode_positives` is applied.
df.prob.unique()

array([  0.,  nan])

In [276]:
# Recode the three positives columns in place, then list the distinct `prob` values again.
recode_positives(df)
df.prob.unique()

array([False], dtype=object)

In [277]:
# for df in df_all:
#     print set(df.columns)

In [286]:
# Scratch check: element-wise `+` of the three recoded columns (recorded result is a bool Series).
(df.prob + df.midgut + df.sal_gland)

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
...
86     False
87     False
88     False
89     False
90     False
91     False
92     False
93     False
94     False
95     False
96     False
97     False
98     False
99     False
100    False
Length: 101, dtype: bool

In [300]:
# list all dfs and run our recoders so far
# (`.values() + .values()` concatenates two lists, i.e. relies on Python 2 dict semantics)
df_all = spring_summer_dfs.values() + fall_dfs.values()

# NOTE(review): the recorded run of this cell stopped with a ValueError on the
# value 'Unk.'.  `recode_positives` accepts text starting with 'UN', so the
# error presumably came from `recode_dead` or `recode_teneral` -- confirm and
# decide how 'Unk.' should be recoded.  Frames after the failing one were not
# processed in that run.
for df in df_all:
    recode_villages(df)
    recode_positives(df)
    recode_dead(df)
    recode_teneral(df)
    add_infection_state_col(df)

ValueError: The value Unk. was not expected and this function must be corrected to continue.

In [294]:
# First rows of the third frame in `df_all`.
df_all[2].head()

Unnamed: 0,Village,Trap_No,Date,Species,Sex,Teneral,Dead,Fly_Number,Hunger_stage,Wing_fray,prob,midgut,sal_gland,Kept_in,Comment
0,GOR,2,2014/07/27,G.ff,F,NT,L,GOR-14 001,2,2,False,False,False,EtOH,
1,GOR,2,2014/07/27,G.ff,F,NT,L,GOR-14 002,2,2,False,False,False,EtOH,
2,GOR,2,2014/07/27,G.ff,M,NT,L,GOR-14 003,3,2,False,False,False,EtOH,
3,GOR,2,2014/07/27,G.ff,F,NT,L,GOR-14 004,3,2,False,False,False,EtOH,
4,GOR,2,2014/07/27,G.ff,F,NT,L,GOR-14 005,2,2,False,False,False,EtOH,


In [289]:
# Pool the `prob` column of every frame; summing the recoded flags counts the positives.
prob_all = aggregate_column_from_df_list(df_all, 'prob')

sum(prob_all)

0

In [290]:
# Pool the `midgut` column of every frame; summing the recoded flags counts the positives.
midgut_all = aggregate_column_from_df_list(df_all, 'midgut')

sum(midgut_all)

51

In [291]:
# Pool the `sal_gland` column of every frame; summing the recoded flags counts the positives.
sal_gland_all = aggregate_column_from_df_list(df_all, 'sal_gland')

sum(sal_gland_all)

1

In [292]:
# Pool the `infection_state` column of every frame and count the infected flies.
# NOTE(review): the recorded run raised KeyError: 'infection_state', i.e. at
# least one frame had no such column when this cell was executed.
infection_state_all = aggregate_column_from_df_list(df_all, 'infection_state')

sum(infection_state_all)

KeyError: 'infection_state'

---
# Something new