# Purpose:

- Michelle wants extra info for the flies that she is testing for spiroplasma.
    - sex
    - tryp status
    - Location of the `AJ` samples

## Imports:

In [3]:
# imports
import datetime as dt

import csv
import itertools
import os
import re

import pandas as pd
pd.options.display.max_rows = 1000
import numpy as np
import datetime as dt

import scipy

import munch


## File paths:

In [4]:
# Paths to the 2014 collection-data Excel workbooks; each variable is named after the
# period its file name refers to (spring/summer, fall, December).
# NOTE(review): these are absolute, machine-specific paths -- they have to be edited
# before the notebook can run on another machine.
spring_summer = "/home/gus/Documents/YalePostDoc/project_stuff/g_f_fucipes_uganda/collection_data/2014_spring_summer_from_rob.xlsx"
fall =          "/home/gus/Documents/YalePostDoc/project_stuff/g_f_fucipes_uganda/collection_data/2014_fall_for_pandas.xlsx"
december = "/home/gus/Documents/YalePostDoc/project_stuff/g_f_fucipes_uganda/collection_data/DEC_2014_survey_for_pandas.xlsx"



In [5]:
# CSV file pairing village letter codes (first column) with long-form village names
# (second column); it is read by get_village_id_map().
# NOTE(review): absolute, machine-specific path.
village_id_map_path = "/home/gus/Dropbox/uganda_data/data_repos/field_data/locations/names/uganda_village_id_map.csv"

In [6]:
# Print the raw mapping file for reference (shell command; requires `cat`).
!cat $village_id_map_path

ABO,Abok,
ACA,Acankoma,
AG,Agoba B,
AIN,Aina,
AKA,Akayo-debe,
AKA,Akayodebe,
AKY,AkayoIdebe,
ALE,Alege,
ALI,Alim,
ALI,Aliodri,
AMI,Aminikawach,
APU,Aputu-Lwaa,
ASW,Aswa Bridge,
ATM,Atanga Mission,
BOL,Bola,
CE,Cefo,Moyo
CHU,Chua,
DUK,Duku,
GAN,Gangu,
GOR,Gorodona,
JIA,Jiako,
KIL,Kilak,
KTC,Kitgum town council,
LAG,Lagwel,
LAK,Lakwala,
LEA,Lea,
LIB,Liba,
MOP,Moyipi,
MWA,Mwanya,
NGO,Ngomoromo,
OCA,Ocala,
OCL,Ocol,
OCU,Oculoi,
OD,Odworo,
OGU,Oguk,
OKS,Okidi South,
OLE,Olepo,
OLO,Olobo,
OLW,Olwi,
OMI,Omido,
OPU,Opuyu,
ORB,Orubakulemi,
ORI,Ori,
ORV,Orivu,
OSG,Osugo East,
OSG,Osugo West,
PAW,Pawor,
TEO,Te-Okot,
TUM,Tumangu,
UWA,Uganda Wildlife Authority,
WEN,Wende,
,,
,,
ESI,Esia,Adjumani
OYA,Oringya,Adjumani
PAG,Pagirinya,Adjumani
MAD,Madilu,Adjumani
,,
,,
LPI,Laropi,Moyo
BLA,Belameling,Moyo


----
# Helper functions

In [7]:
def date_is_between(test_date, start, end):
    """
    Report whether `test_date` lies in the closed interval [`start`, `end`].

    :param test_date: the value to test.
    :param start: lower bound (inclusive).
    :param end: upper bound (inclusive).
    :return: the result of `start <= test_date <= end`, or `False` when that comparison
             fails with a "can't compare datetime" `TypeError`.
    :raises TypeError: for any other comparison failure.
    """
    try:
        return start <= test_date <= end
    except TypeError as exc:
        # `exc.message` was deprecated in Python 2.6 and does not exist on Python 3,
        # where reading it would replace the TypeError with an AttributeError.
        # `str(exc)` gives the same text on both versions.
        if "can't compare datetime" in str(exc):
            return False
        raise

In [8]:
def get_village_id_map(village_id_map_path):
    """
    Generates and returns a `dict` mapping the long-form village names to the letter codes.
    Letter codes map back to themselves to ensure a one way mapping.
    Enforces both be all UPPERcase to allow case insensitivity as long as
    the map is used like: `map[text.upper()]`.

    Only the first two fields of each CSV row are used (code, then name); any further
    fields are ignored.  Rows with fewer than two fields, or with an empty code, are
    skipped so that filler rows such as `,,` do not create an empty-string entry.
    An empty name is skipped as well, while its code is still registered.

    :param village_id_map_path: path to the CSV file of `code,name[,...]` rows.
    :return: `dict`
    """
    import sys  # local import: only needed to choose the csv-compatible file mode

    text_type = type(u"")  # `unicode` on Python 2, `str` on Python 3

    village_id_map = {}

    # The csv module expects a binary file on Python 2 but a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] < 3:
        csv_file = open(village_id_map_path, 'rb')
    else:
        csv_file = open(village_id_map_path, 'r', newline='')

    with csv_file:
        for row in csv.reader(csv_file, delimiter=','):
            if len(row) < 2:
                continue

            code = text_type(row[0].upper())
            name = text_type(row[1].upper())

            if not code.strip():
                continue

            village_id_map[code] = code
            if name.strip():
                village_id_map[name] = code

    return village_id_map

In [9]:
def load_xl_sheets(xl_path):
    """
    Load every worksheet whose name starts with "DISSECT" (case-insensitive) from an
    Excel workbook.

    Each loaded frame gets two extra columns: `workbook` (base name of `xl_path`) and
    `worksheet` (the sheet name).

    :param xl_path: path to the Excel workbook.
    :return: `munch.Munch` mapping sheet name -> `DataFrame`.
    """
    # NOTE(review): the keyword names below are passed to `ExcelFile.parse` exactly as in
    # the original call; confirm they match the installed pandas version.
    parse_options = dict(
        header=0,
        skiprows=None, skip_footer=0,
        index_col=None, parse_cols=None,
        parse_dates=False, date_parser=None,
        na_values=['NA'],
        thousands=None, chunksize=None,
        convert_float=False,
        has_index_names=False, converters=None,
    )

    dissection_sheets = munch.Munch()
    workbook = pd.ExcelFile(xl_path)
    source_name = os.path.basename(xl_path)

    for sheet_name in workbook.sheet_names:
        if not sheet_name.upper().startswith("DISSECT"):
            continue

        frame = workbook.parse(sheetname=sheet_name, **parse_options)

        # Record provenance so rows can be traced back to their sheet.
        frame['workbook'] = source_name
        frame['worksheet'] = sheet_name

        dissection_sheets[sheet_name] = frame

    return dissection_sheets

In [10]:
def recode_villages(df, id_map=None):
    """
    Replace every entry of `df.Village` with its village letter code, in place.

    Entries are upper-cased before lookup, so the map's keys must be upper-case.

    :param df: `DataFrame` with a `Village` column of strings.
    :param id_map: optional `dict` of UPPERcase village name/code -> code.  When omitted,
                   the module-level `village_id_map` is used, which must already be defined
                   at call time.
    :return: `None`; `df` is modified in place.
    :raises KeyError: if an entry is not a key of the map.
    :raises AttributeError: if an entry is not a string (has no `.upper()`).
    """
    if id_map is None:
        # Resolved at call time so the global only has to exist before the first call.
        id_map = village_id_map

    df.Village = df.Village.apply(lambda name: id_map[name.upper()])

In [11]:
def recode_dead(df):
    """
    Recode `df.Dead` in place to `True` (dead), `False` (live) or `None` (unknown).

    Each non-null entry is converted to text and classified by its prefix:
    'L' (any case) or '0' -> `False`; 'D' (any case) or '1' -> `True`;
    'UN' (any case) -> `None`.  Null entries are left as they are.

    :param df: `DataFrame` with a `Dead` column.
    :return: `None`; `df` is modified in place.
    :raises ValueError: if an entry matches none of the expected prefixes.
    """
    text_type = type(u"")  # `unicode` on Python 2, `str` on Python 3

    def recode_func(x):
        # this is treated as an unknown case
        if pd.isnull(x):
            return x

        x = text_type(x)
        upper = x.upper()

        # True means DEAD
        # False means LIVE or NOT-DEAD
        # None means unknown

        # deal with Live type cases
        if upper.startswith('L') or x.startswith('0'):
            return False

        # deal with Dead type cases
        if upper.startswith('D') or x.startswith('1'):
            return True

        # deal with unknown type cases
        if upper.startswith('UN'):
            return None

        msg = "The value {x} was not expected and this function must be corrected to continue.".format(x=x)
        raise ValueError(msg)

    df.Dead = df.Dead.apply(recode_func)

##########################################

def recode_teneral(df):
    """
    Recode `df.Teneral` in place to `True` (teneral), `False` (not teneral) or `None` (unknown).

    Each non-null entry is converted to text and classified by its prefix:
    'N' (any case) or '0' -> `False`; 'T' (any case) or '1' -> `True`;
    'UN' (any case) -> `None`.  Null entries are left as they are.

    :param df: `DataFrame` with a `Teneral` column.
    :return: `None`; `df` is modified in place.
    :raises ValueError: if an entry matches none of the expected prefixes.
    """
    text_type = type(u"")  # `unicode` on Python 2, `str` on Python 3

    def recode_func(x):
        # this is treated as an unknown case
        if pd.isnull(x):
            return x

        x = text_type(x)
        upper = x.upper()

        # True means teneral
        # False means NOT-teneral
        # None means unknown

        # deal with NOT-teneral type cases
        if upper.startswith('N') or x.startswith('0'):
            return False

        # deal with Teneral type cases
        if upper.startswith('T') or x.startswith('1'):
            return True

        # Deal with unknown type cases: return None (not the raw label) so the
        # column follows the True/False/None coding described above.
        if upper.startswith('UN'):
            return None

        msg = "The value {x} was not expected and this function must be corrected to continue.".format(x=x)
        raise ValueError(msg)

    df.Teneral = df.Teneral.apply(recode_func)

##########################################

def recode_positives(df):
    """
    Recode the `prob`, `midgut` and `sal_gland` columns of `df` in place to
    `True` (positive), `False` (negative) or `None` (unknown).

    Each non-null entry is converted to text, upper-cased and classified by its prefix:
    'UN' or 'DEAD' -> `None`; '1', 'TRUE', 'P' or 'Y' -> `True`;
    'NO', 'FALSE' or '0' -> `False`.  Null entries are left as they are.

    The columns are processed in the order listed, so a failure on a later column
    leaves the earlier ones already recoded.

    :param df: `DataFrame` with `prob`, `midgut` and `sal_gland` columns.
    :return: `None`; `df` is modified in place.
    :raises ValueError: if an entry matches none of the expected prefixes.
    :raises KeyError: if one of the three columns is missing.
    """
    text_type = type(u"")  # `unicode` on Python 2, `str` on Python 3

    def recode_func(x):
        # this is treated as an unknown case
        if pd.isnull(x):
            return x

        y = text_type(x).upper()

        # deal with Unknown type cases
        if y.startswith(('UN', 'DEAD')):
            return None

        # deal with Positive type cases
        if y.startswith(('1', 'TRUE', 'P', 'Y')):
            return True

        # deal with Negative type cases
        if y.startswith(('NO', 'FALSE', '0')):
            return False

        msg = "The value {x} was not expected and this function must be corrected to continue.".format(x=x)
        raise ValueError(msg)

    for column in ('prob', 'midgut', 'sal_gland'):
        df[column] = df[column].apply(recode_func)

##########################################

def recode_species(df):
    """
    Normalize `df.Species` in place: remove every '.' from each entry, then
    capitalize it (first character upper-case, the rest lower-case).

    :param df: `DataFrame` with a `Species` column of strings.
    :return: `None`; `df` is modified in place.
    """
    def strip_dots_and_capitalize(label):
        pieces = label.split('.')
        return ''.join(pieces).capitalize()

    df.Species = df.Species.apply(strip_dots_and_capitalize)

##########################################

def recode_sex(df):
    """
    Upper-case every entry of `df.Sex`, in place.

    :param df: `DataFrame` with a `Sex` column of strings.
    :return: `None`; `df` is modified in place.
    """
    def to_upper(label):
        return label.upper()

    df.Sex = df.Sex.apply(to_upper)
    
##########################################

# Accepted separators between date parts: '.', '/' or '-'.
# Raw string: same pattern as before, without relying on an invalid string escape.
date_delim = re.compile(r'[\./-]')

def cast_unicode_as_date(x):
    """
    Convert a year-first date string such as `2014-07-22`, `2014/7/22` or `2014.07.22`
    into a `datetime.datetime`.

    Anything that is not a text string, does not split into exactly three parts, or
    whose first part is not four characters long is returned unchanged.

    :param x: the value to convert.
    :return: `datetime.datetime` for a recognized date string, otherwise `x` itself.
    :raises ValueError: if the three parts are not integers or do not form a valid date.
    """
    # `type(u"")` is `unicode` on Python 2 and `str` on Python 3.
    if not isinstance(x, type(u"")):
        return x

    parts = date_delim.split(x)

    if len(parts) != 3:
        return x

    # Only year-first dates are handled; the year must be written with four digits.
    if len(parts[0]) != 4:
        return x

    year, month, day = (int(part) for part in parts)
    return dt.datetime(year, month, day)

def recode_date(df):
    """
    Run `cast_unicode_as_date` over every entry of `df.Date`, in place.

    :param df: `DataFrame` with a `Date` column.
    :return: `None`; `df` is modified in place.
    """
    df.Date = df.Date.apply(cast_unicode_as_date)

##########################################

# Fly identifiers are split on any non-word character or whitespace.
# Raw string: same pattern as before, without relying on invalid string escapes.
fly_no_delim = re.compile(r'[\W\s]', re.UNICODE)

def split_number(x):
    """
    Extract the trailing integer from a fly identifier.

    * A float is cast straight to `int`; NaN is returned unchanged.
    * Anything else is converted to text and split on `fly_no_delim`; the last piece is
      returned as an `int` when it parses as one (e.g. `AJ-12` -> `12`).
    * When the last piece is not an integer, the text form of `x` is returned.

    :param x: the raw fly-number value.
    :return: `int` when a number could be extracted; otherwise NaN (for a NaN input)
             or the text form of `x`.
    """
    if isinstance(x, float):
        # Cast floats directly: converting them to text first would put a '.'
        # AFTER the number we are interested in.
        if x != x:  # NaN is the only float that is not equal to itself
            return x
        return int(x)

    # `type(u"")` is `unicode` on Python 2 and `str` on Python 3.
    x = type(u"")(x)
    parts = fly_no_delim.split(x)

    try:
        return int(parts[-1])
    except ValueError:
        return x


def recode_fly_number(df):
    """
    Run `split_number` over every entry of `df.Fly_Number`, in place.

    :param df: `DataFrame` with a `Fly_Number` column.
    :return: `None`; `df` is modified in place.
    """
    df.Fly_Number = df.Fly_Number.apply(split_number)

In [12]:
def aggregate_column_from_df_list(df_list, col_name):
    """
    Concatenate the values of one column across several data frames.

    :param df_list: iterable of `DataFrame`s that each contain `col_name`.
    :param col_name: name of the column to collect.
    :return: `list` of the column's values, in the order of `df_list`.
    """
    return [value for frame in df_list for value in list(frame[col_name])]

### Functions that add new columns

In [13]:
def add_infection_state_col(df):
    """
    Add an `infection_state` column to `df`, in place: the row-wise `any()` over the
    `prob`, `midgut` and `sal_gland` columns, skipping missing values.

    :param df: `DataFrame` with `prob`, `midgut` and `sal_gland` columns.
    :return: `None`; `df` is modified in place.
    """
    tissue_columns = ['prob', 'midgut', 'sal_gland']
    tissue_results = df[tissue_columns]
    df['infection_state'] = tissue_results.any(axis=1, skipna=True)

## Recode Fly_Number

In [14]:
# pdb
# import ipdb

In [13]:
# Load the pickled frame and discard the index that `reset_index` turns into an 'index' column.
# NOTE(review): `pickle_out_path` is not defined in the visible part of this notebook --
# confirm an earlier cell sets it.  Only unpickle files from a trusted source.
dfp = (
    pd.read_pickle(pickle_out_path)
    .reset_index(level=0)
    .drop(['index'], axis=1)
)

# Normalize fly numbers (in place) before restricting the rows to species 'Gff'.
recode_fly_number(dfp)
dfp = dfp.query("Species in ['Gff']")
dfp

Unnamed: 0,Comment,Date,Dead,Fly_Number,Hunger_stage,Kept_in,Sex,Species,Teneral,Trap_No,Village,Wing_fray,infection_state,midgut,prob,sal_gland,workbook,worksheet
0,,2014-07-22,False,1,2,EtOH,F,Gff,False,1,UWA,2,False,False,False,False,2014_spring_summer_from_rob.xlsx,Dissection Data-Nwoya
1,,2014-07-22,False,2,3,EtOH,F,Gff,False,1,UWA,2,False,False,False,False,2014_spring_summer_from_rob.xlsx,Dissection Data-Nwoya
2,,2014-07-22,False,3,3,EtOH,M,Gff,False,1,UWA,2,False,False,False,False,2014_spring_summer_from_rob.xlsx,Dissection Data-Nwoya
3,,2014-07-22,False,4,3,EtOH,M,Gff,False,1,UWA,2,False,False,False,False,2014_spring_summer_from_rob.xlsx,Dissection Data-Nwoya
4,,2014-07-22,False,5,3,EtOH,M,Gff,False,1,UWA,2,False,False,False,False,2014_spring_summer_from_rob.xlsx,Dissection Data-Nwoya
5,,2014-07-22,False,6,2,EtOH,M,Gff,False,1,UWA,2,False,False,False,False,2014_spring_summer_from_rob.xlsx,Dissection Data-Nwoya
6,,2014-07-22,False,7,3,EtOH,F,Gff,False,1,UWA,2,False,False,False,False,2014_spring_summer_from_rob.xlsx,Dissection Data-Nwoya
7,,2014-07-22,False,8,3,EtOH,F,Gff,False,1,UWA,2,False,False,False,False,2014_spring_summer_from_rob.xlsx,Dissection Data-Nwoya
8,,2014-07-22,False,9,3,EtOH,F,Gff,False,1,UWA,3,False,False,False,False,2014_spring_summer_from_rob.xlsx,Dissection Data-Nwoya
9,,2014-07-22,False,10,2,EtOH,F,Gff,False,1,UWA,2,False,False,False,False,2014_spring_summer_from_rob.xlsx,Dissection Data-Nwoya


----
# Helpful constants

In [14]:
# Build the village name/code -> code lookup once; recode_villages() reads this
# module-level name, so this cell has to run before recode_villages() is called.
village_id_map = get_village_id_map(village_id_map_path)