# Purpose:

2015-03-18 (Wednesday)

Query the database.

# Table of Searches:

[How many positives did we find after Gisella left in July?](#How-many-positives-did-we-find-after-Gisella-left-in-July?)

[Get next set of fly IDs for Robert's MicroSat work.](#Get-next-set-of-fly-IDs-for-Robert's-MicroSat-work.)

## Imports:

In [1]:
# imports
import csv
import itertools
import os
import re

import pandas as pd
import numpy as np
import datetime as dt

import scipy

import munch


In [2]:
# for plotting
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import ggplot as gp


import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 60)


import ggplot as g


## File paths:

In [3]:
# Locations of the 2014 collection workbooks and of the serialized record files.
# All of them live under one collection-data directory, so the shared prefix is
# spelled out once and the individual paths are derived from it.
_collection_dir = "/home/gus/Documents/YalePostDoc/project_stuff/g_f_fucipes_uganda/collection_data"
_records_stem = _collection_dir + "/hdf5/2014_collection_records"

# input workbooks
spring_summer = _collection_dir + "/2014_spring_summer_from_rob.xlsx"
fall = _collection_dir + "/2014_fall_for_pandas.xlsx"
december = _collection_dir + "/DEC_2014_survey_for_pandas.xlsx"

# serialized outputs (same stem, one file per format)
h5_out_path = _records_stem + ".h5"
json_out_path = _records_stem + ".json"
pickle_out_path = _records_stem + ".pkl"

In [4]:
# CSV file that get_village_id_map() reads to build the village-name -> letter-code lookup.
village_id_map_path = "/home/gus/Dropbox/uganda_data/data_repos/field_data/locations/names/uganda_village_id_map.csv"

----
# Helper functions

In [5]:
def date_is_between(test_date, start, end):
    """
    Report whether `test_date` lies in the closed interval [`start`, `end`].

    Values that cannot be ordered against the bounds (for example a string or
    a float NaN sitting in a date column) count as "not in range" instead of
    raising.

    :param test_date: value to test; normally a `datetime.datetime`.
    :param start: inclusive lower bound.
    :param end: inclusive upper bound.
    :return: `bool`
    :raises TypeError: any TypeError that is not a failed ordering comparison.
    """
    try:
        return start <= test_date <= end
    except TypeError as exc:
        # `exc.message` is deprecated since Python 2.6 and gone in Python 3;
        # str(exc) yields the same text on both.
        reason = str(exc)

        # Python 2 words the failure "can't compare datetime.datetime to <type>";
        # Python 3 words it "'<=' not supported between instances of ...".
        if "can't compare datetime" in reason or "not supported between instances" in reason:
            return False
        raise

In [6]:
def get_village_id_map(village_id_map_path):
    """
    Build the lookup that turns a village's long-form name into its letter code.

    Each CSV row contributes two entries, both pointing at the upper-cased
    first field: one keyed by that first field itself (so a letter code maps to
    itself) and one keyed by the upper-cased second field.  Because every key
    is upper case, callers get case-insensitive lookups by using
    `map[text.upper()]`.

    :param village_id_map_path: path to the comma-delimited CSV file.
    :return: `dict`
    """
    lookup = {}

    with open(village_id_map_path, 'rb') as handle:
        for row in csv.reader(handle, delimiter=','):
            code = unicode(row[0].upper())
            long_name = unicode(row[1].upper())

            lookup[code] = code
            lookup[long_name] = code

    return lookup

In [7]:
def load_xl_sheets(xl_path):
    """
    Parse the dissection worksheets of one Excel workbook.

    Only sheets whose name starts with "DISSECT" (case-insensitive) are read.
    Every resulting DataFrame gets two extra columns: `workbook` (the base
    name of `xl_path`) and `worksheet` (the sheet's name).

    :param xl_path: path to the Excel workbook.
    :return: `munch.Munch` keyed by sheet name, one DataFrame per parsed sheet.
    """
    workbook = pd.ExcelFile(xl_path)
    workbook_name = os.path.basename(xl_path)

    # identical parse settings for every sheet
    parse_options = dict(
        header=0,
        skiprows=None, skip_footer=0,
        index_col=None, parse_cols=None,
        parse_dates=False, date_parser=None,
        na_values=['NA'],
        thousands=None, chunksize=None,
        convert_float=False,
        has_index_names=False, converters=None,
    )

    frames = munch.Munch()

    for sheet_name in workbook.sheet_names:
        if not sheet_name.upper().startswith("DISSECT"):
            continue

        frame = workbook.parse(sheetname=sheet_name, **parse_options)

        # remember where each row came from
        frame['workbook'] = workbook_name
        frame['worksheet'] = sheet_name

        frames[sheet_name] = frame

    return frames

In [8]:
def recode_villages(df):
    """
    Overwrite `df.Village` with the value that the module-level
    `village_id_map` holds for each entry's upper-cased text.

    Modifies `df` in place.

    :param df: DataFrame with a `Village` column of strings.
    :return: None
    """
    def to_code(village):
        return village_id_map[village.upper()]

    df.Village = df.Village.apply(to_code)

In [79]:
def recode_dead(df):
    """
    Recode `df.Dead` in place to True (dead), False (live) or None (unknown).

    Null entries are left as they are.  Otherwise the text form of the value
    decides: a leading "L" (any case) or "0" gives False, a leading "D"
    (any case) or "1" gives True, a leading "UN" (any case) gives None.

    :param df: DataFrame with a `Dead` column.
    :return: None
    :raises ValueError: for a value matching none of the prefixes above.
    """
    def recode_func(x):
        # missing stays missing
        if pd.isnull(x):
            return x

        text = unicode(x)

        try:
            shouted = text.upper()

            # live
            if shouted.startswith('L') or text.startswith('0'):
                return False

            # dead
            if shouted.startswith('D') or text.startswith('1'):
                return True

            # unknown
            if shouted.startswith('UN'):
                return None
        except AttributeError:
            return text

        msg = "The value {x} was not expected and this function must be corrected to continue.".format(x=text)
        raise ValueError(msg)

    df.Dead = df.Dead.apply(recode_func)

##########################################

def recode_teneral(df):
    """
    Recode `df.Teneral` in place to True (teneral), False (not teneral) or
    None (unknown).

    Null entries are left as they are.  Otherwise the text form of the value
    decides: a leading "N" (any case) or "0" gives False, a leading "T"
    (any case) or "1" gives True, a leading "UN" (any case) gives None.

    Unknown values used to be handed back as their raw text, which
    contradicted the "None means unknown" convention stated here and followed
    by `recode_dead`; they are now returned as None.

    :param df: DataFrame with a `Teneral` column.
    :return: None
    :raises ValueError: for a value matching none of the prefixes above.
    """
    def recode_func(x):
        # this is treated as an unknown case
        if pd.isnull(x):
            return x

        x = unicode(x)
        upper = x.upper()

        # True means teneral
        # False means NOT-teneral
        # None means unknown

        # deal with NOT-teneral type cases
        if upper.startswith('N') or x.startswith('0'):
            return False

        # deal with Teneral type cases
        if upper.startswith('T') or x.startswith('1'):
            return True

        # deal with unknown type cases
        if upper.startswith('UN'):
            return None

        msg = "The value {x} was not expected and this function must be corrected to continue.".format(x=x)
        raise ValueError(msg)

    df.Teneral = df.Teneral.apply(recode_func)

##########################################

def recode_positives(df):
    """
    Recode the `prob`, `midgut` and `sal_gland` columns of `df` in place to
    True (positive), False (negative) or None (unknown).

    Null entries are left as they are.  Otherwise the text form of the value
    decides, ignoring case for the letter prefixes:
      * "UN..." or "DEAD..."                  -> None
      * "1...", "TRUE...", "P..." or "Y..."   -> True
      * "NO...", "FALSE..." or "0..."         -> False

    :param df: DataFrame holding the three columns.
    :return: None
    :raises ValueError: for a value matching none of the prefixes above.
    """
    def recode_func(x):
        # missing stays missing
        if pd.isnull(x):
            return x

        raw = unicode(x)
        shouted = raw.upper()

        # unknown
        if shouted.startswith(('UN', 'DEAD')):
            return None

        # positive
        if raw.startswith('1') or shouted.startswith(('TRUE', 'P', 'Y')):
            return True

        # negative
        if shouted.startswith(('NO', 'FALSE')) or raw.startswith('0'):
            return False

        msg = "The value {x} was not expected and this function must be corrected to continue.".format(x=x)
        raise ValueError(msg)

    # same recoding for each tissue column, in the original order
    for column in ('prob', 'midgut', 'sal_gland'):
        setattr(df, column, getattr(df, column).apply(recode_func))

##########################################

def recode_species(df):
    """
    Normalise `df.Species` in place: drop every "." from each entry and
    capitalise the result (first character upper case, the rest lower case).

    :param df: DataFrame with a `Species` column of strings.
    :return: None
    """
    def strip_dots_and_capitalize(name):
        return name.replace('.', '').capitalize()

    df.Species = df.Species.apply(strip_dots_and_capitalize)

##########################################

def recode_sex(df):
    """
    Upper-case every entry of `df.Sex`, in place.

    :param df: DataFrame with a `Sex` column of strings.
    :return: None
    """
    def to_upper(value):
        return value.upper()

    df.Sex = df.Sex.apply(to_upper)
    
##########################################

# Separators accepted between the parts of a date: ".", "/" or "-".
# Raw string: "\." in a plain string literal is an invalid escape sequence that
# newer Python versions warn about; the compiled pattern itself is unchanged.
date_delim = re.compile(r'[\./-]')

def cast_unicode_as_date(x):
    """
    Turn year-first date text such as u"2014-07-22" into a `datetime.datetime`.

    The value is returned untouched when it is not a `unicode` string, when it
    does not split into exactly three parts on ".", "/" or "-", or when its
    first part is not four characters long.

    :param x: any value taken from a date column.
    :return: `datetime.datetime` for recognised text, otherwise `x` itself.
    :raises ValueError: if the three parts are not integers or do not form a
        valid calendar date.
    """
    if not isinstance(x, unicode):
        return x

    parts = date_delim.split(x)

    # only "YYYY<sep>M<sep>D" shaped text is converted
    if len(parts) != 3 or len(parts[0]) != 4:
        return x

    year, month, day = (int(part) for part in parts)
    return dt.datetime(year, month, day)

def recode_date(df):
    """
    Run `cast_unicode_as_date` over every entry of `df.Date` and store the
    result back in that column, in place.

    :param df: DataFrame with a `Date` column.
    :return: None
    """
    df.Date = df.Date.apply(cast_unicode_as_date)

##########################################

# Any non-word or whitespace character separates the parts of a fly label.
# Raw string so the backslash escapes reach the regex engine untouched.
fly_no_delim = re.compile(r'[\W\s]', re.UNICODE)

def split_number(x):
    """
    Extract the running number from a fly label.

    * A finite float is truncated to `int` directly; going through its text
      form would leave a trailing ".0" after the number we are interested in.
    * A NaN or infinite float is returned unchanged.  This is tested up front
      instead of being inferred from the text of the exception that `int()`
      raises.
    * Anything else is converted to text and split on non-word characters;
      the last piece is returned as `int` (u"UWA-14 001" -> 1).  If that piece
      is not an integer, the text form of the value is returned.

    :param x: a value from the `Fly_Number` column.
    :return: `int` when a number could be extracted, otherwise the value
        (non-finite float) or its text form.
    """
    if isinstance(x, float):
        if not np.isfinite(x):
            return x
        return int(x)

    x = unicode(x)
    parts = fly_no_delim.split(x)

    try:
        return int(parts[-1])
    except ValueError:
        return x


def recode_fly_number(df):
    """
    Run `split_number` over every entry of `df.Fly_Number` and store the
    result back in that column, in place.

    :param df: DataFrame with a `Fly_Number` column.
    :return: None
    """
    df.Fly_Number = df.Fly_Number.apply(split_number)

In [80]:
def aggregate_column_from_df_list(df_list, col_name):
    """
    Concatenate one column's values across several frames.

    :param df_list: iterable of objects indexable by `col_name`
        (e.g. DataFrames).
    :param col_name: name of the column to collect.
    :return: flat `list` holding the values of `col_name` from each item of
        `df_list`, in order.
    """
    return [value for frame in df_list for value in list(frame[col_name])]

### Functions that add new columns

In [81]:
def add_infection_state_col(df):
    """
    Add an `infection_state` column to `df`, in place: the row-wise `any()`
    over the `prob`, `midgut` and `sal_gland` columns, skipping missing values.

    :param df: DataFrame holding the three columns.
    :return: None
    """
    tissue_results = df[['prob', 'midgut', 'sal_gland']]
    df['infection_state'] = tissue_results.any(axis=1, skipna=True)

## Recode Fly_Number

In [82]:
# pdb
# import ipdb

In [83]:
# Load the pickled 2014 collection records twice: `df` is recoded in place by
# the call below, `df2` is a second copy that this cell does not modify.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
df = pd.read_pickle(pickle_out_path)
df2 = pd.read_pickle(pickle_out_path)
# The recorded run of this call failed with
# "ValueError: cannot convert float NaN to integer" (see the output below).
recode_fly_number(df)

ValueError: cannot convert float NaN to integer

In [69]:
# Rows of the second copy (`df2`) whose Village is "NGO".
NGO = df2.query("Village == 'NGO'")

In [78]:
split_number(NGO.Fly_Number.iloc[0])

0

In [15]:
df_old_flno = pd.read_pickle(pickle_out_path)

In [16]:
df_old_flno.head()

Unnamed: 0,Comment,Date,Dead,Fly_Number,Hunger_stage,Kept_in,Sex,Species,Teneral,Trap_No,Village,Wing_fray,infection_state,midgut,prob,sal_gland,workbook,worksheet
0,,2014-07-22,False,UWA-14 001,2,EtOH,F,Gff,False,1,UWA,2,False,False,False,False,2014_spring_summer_from_rob.xlsx,Dissection Data-Nwoya
1,,2014-07-22,False,UWA-14 002,3,EtOH,F,Gff,False,1,UWA,2,False,False,False,False,2014_spring_summer_from_rob.xlsx,Dissection Data-Nwoya
2,,2014-07-22,False,UWA-14 003,3,EtOH,M,Gff,False,1,UWA,2,False,False,False,False,2014_spring_summer_from_rob.xlsx,Dissection Data-Nwoya
3,,2014-07-22,False,UWA-14 004,3,EtOH,M,Gff,False,1,UWA,2,False,False,False,False,2014_spring_summer_from_rob.xlsx,Dissection Data-Nwoya
4,,2014-07-22,False,UWA-14 005,3,EtOH,M,Gff,False,1,UWA,2,False,False,False,False,2014_spring_summer_from_rob.xlsx,Dissection Data-Nwoya


----
# Helpful constants

In [17]:
# Module-level village lookup; recode_villages() reads this name directly.
village_id_map = get_village_id_map(village_id_map_path)

In [18]:
# pdb

In [19]:
# # Example pivot_table
# tableXXXX = pd.pivot_table(df.query("XXXX == XXXX"),index=['XXXX'],
#                       values=['XXXX'],
#                       columns=['XXXX'],
#                       fill_value=0,
#                       aggfunc=[len])
# tableXXXX.head()

---
# Query the data

### How many positives did we find after Gisella left in July?

In [20]:
# Flag the records dated from 2014-07-11 through 2014-08-11 (both inclusive),
# then keep only those rows.
date_mask = df.Date.apply(
    lambda collected_on: date_is_between(
        collected_on,
        dt.datetime(2014, 7, 11),
        dt.datetime(2014, 8, 11),
    )
)
post_gisella_july = df[date_mask]

In [21]:
# Rows of the date window whose infection_state is True.
post_gisella_july_positive = post_gisella_july.query('infection_state == True')

In [22]:
# Rows of the date window whose infection_state is False.
post_gisella_july_negative = post_gisella_july.query('infection_state == False')

In [23]:
len(post_gisella_july_positive)

30

In [24]:
len(post_gisella_july_negative)

2376

In [25]:
# post_gisella_july_positive.to_csv('/home/gus/Documents/YalePostDoc/project_stuff/g_f_fucipes_uganda/seraps_stuff/positives_from_gus_gisella_time.tsv', sep='\t')

In [26]:
# post_gisella_july_negative.to_csv('/home/gus/Documents/YalePostDoc/project_stuff/g_f_fucipes_uganda/seraps_stuff/negatives_from_gus_gisella_time.tsv', sep='\t')

### Get next set of fly IDs for Robert's MicroSat work.

In [40]:
def get_random_rows(df, sample_size, random_state=None):
    """
    Draw `sample_size` distinct rows of `df` at random (no replacement).

    Rows are chosen by *position*.  Index labels used to be passed to `.iloc`,
    which only works when the index happens to be exactly 0..n-1; with any
    other index (for example duplicated labels left over from concatenating
    several worksheets) that selects the wrong rows, repeats rows, or raises
    IndexError.

    :param df: DataFrame to sample from.
    :param sample_size: number of rows to draw; must not exceed `len(df)`.
    :param random_state: optional seed for a reproducible draw.  When None
        (the default) numpy's global random state is used, as before.
    :return: DataFrame holding the sampled rows, in the order drawn.
    :raises ValueError: if `sample_size` is larger than `len(df)`.
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    positions = rng.choice(len(df), sample_size, replace=False)
    return df.iloc[positions]

In [46]:
get_random_rows(df, 10)

Unnamed: 0,Comment,Date,Dead,Fly_Number,Hunger_stage,Kept_in,Sex,Species,Teneral,Trap_No,Village,Wing_fray,infection_state,midgut,prob,sal_gland,workbook,worksheet
24,,2014-07-22,False,25,,EtOH,F,Gff,True,1,UWA,,False,,,,2014_spring_summer_from_rob.xlsx,Dissection Data-Nwoya
13,,2014-03-23,False,14,2,EtOH,F,Gff,False,7,MWA,2,False,False,False,False,2014_spring_summer_from_rob.xlsx,Dissection Data-Kole
370,,2014-07-23,False,370,3,EtOH,M,Gff,False,8,UWA,2,False,False,False,False,2014_spring_summer_from_rob.xlsx,Dissection Data-Nwoya
532,,2014-07-23,,532,Unk.,EtOH,M,Gpd,False,12,UWA,Unk.,False,,,,2014_spring_summer_from_rob.xlsx,Dissection Data-Nwoya
77,,2014-07-22,True,77,,EtOH,F,Gff,False,9,UWA,,False,,,,2014_spring_summer_from_rob.xlsx,Dissection Data-Nwoya
36,,2014-07-22,False,37,3,EtOH,M,Gff,False,5,UWA,2,False,False,False,False,2014_spring_summer_from_rob.xlsx,Dissection Data-Nwoya
534,,2014-07-23,,534,Unk.,EtOH,M,Gpd,False,12,UWA,Unk.,False,,,,2014_spring_summer_from_rob.xlsx,Dissection Data-Nwoya
660,,2014-07-24,True,660,,EtOH,F,Gff,False,2,UWA,,False,,,,2014_spring_summer_from_rob.xlsx,Dissection Data-Nwoya
306,,2014-07-23,True,306,,EtOH,M,Gff,False,2,UWA,,False,,,,2014_spring_summer_from_rob.xlsx,Dissection Data-Nwoya
96,,2014-07-22,False,96,,EtOH,F,Gff,True,9,UWA,,False,,,,2014_spring_summer_from_rob.xlsx,Dissection Data-Nwoya


In [56]:
# One subset per (village, sex) combination for the villages NGO, CHU and TUM.
ngo_m = df.query('Sex == "M" and Village == "NGO"')
ngo_f = df.query('Sex == "F" and Village == "NGO"')

chu_m = df.query('Sex == "M" and Village == "CHU"')
chu_f = df.query('Sex == "F" and Village == "CHU"')

tum_m = df.query('Sex == "M" and Village == "TUM"')
tum_f = df.query('Sex == "F" and Village == "TUM"')

In [63]:
# Only the last expression of a cell is displayed, so the result of this first
# query is computed and then discarded.
df.query('Village == "NGO"')
# Displayed: the Fly_Number column of the CHU female subset.
chu_f.Fly_Number

2     0
3     0
4     0
5     0
7     0
8     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
...
90     0
92     0
93     0
96     0
99     0
100    0
107    0
108    0
109    0
113    0
114    0
115    0
116    0
121    0
122    0
Name: Fly_Number, Length: 69, dtype: object