# Purpose:

2015-03-24 (Tuesday)

Use newly discovered database excel files to reason about where to look for infected flies in our collection.

# Implementation:

## Imports:

In [15]:
# imports
import sys
import os
import re
import pandas as pd
import numpy as np


from shapely.geometry import Polygon, Point
from geopy.distance import great_circle
from geopandas import GeoDataFrame
from descartes import PolygonPatch


import seaborn as sns

In [16]:
%matplotlib inline

## File paths:

In [17]:
# define paths to files
source_dir = '/home/gus/Dropbox/uganda_data/tsetse_database/districts'

traps_gps_geojson = "/home/gus/Dropbox/uganda_data/data_repos/field_data/locations/gps/traps/uganda_villages_gps_beadell.geojson"
village_gps_geojson = "/home/gus/Dropbox/uganda_data/data_repos/field_data/locations/gps/villages/uganda_villages_gps_beadell.geojson"

## Collect the files and create master dataframe

In [18]:
workbooks = []

In [19]:
for name in os.listdir(source_dir):
#     print os.path.splitext(name)
    if os.path.splitext(name)[-1].startswith('.xls'):
        workbooks.append(name)

In [20]:
workbooks[:5]

['Wakiso.xlsx',
 'Masindi.xlsx',
 'Kamwenge.xlsx',
 'Busia_yoosook.xls',
 'Apac.xlsx']

In [21]:
colsite_sheets = []

In [22]:
def load_xl_sheets(xl_path):
    dfs = {}
    
    xls = pd.ExcelFile(xl_path)
    
    for sheet in xls.sheet_names:
        if sheet.upper().startswith("COLSITE"):
            dfs[sheet] = xls.parse(sheetname=sheet, 
                                header=0, 
                                skiprows=None, skip_footer=0, 
                                index_col=None, parse_cols=None, 
                                parse_dates=False, date_parser=None, 
                                na_values=['NA'], 
                                thousands=None, chunksize=None, 
                                convert_float=False, 
                                has_index_names=False, converters=None)
    return dfs

In [23]:
for wkbk in workbooks:
    dfs = load_xl_sheets(os.path.join(source_dir, wkbk))
    
    for name, df in dfs.items():
        dfs[name]['source_file'] = wkbk
        colsite_sheets.append(dfs[name])
        
    
    

In [38]:
def save_geojson(gdf,path):
    with open(path, 'w') as geo_out:
        geo_out.write(gdf.to_json())

In [24]:
# pdb

In [25]:
table = pd.concat(colsite_sheets)

## Pull out data for uganda and make the maps

In [26]:
uganda = table.query('country == "Uganda"')

In [27]:
letter_code = re.compile(u'^\D+', re.UNICODE)

def recode_id(df):

    recode_func = lambda x: letter_code.findall(x)[0]

    new_id = df.id.apply(recode_func)
    df.id = new_id

In [28]:
s = u'BB22'
letter_code.findall(s)

[u'BB']

In [29]:
ug_id_nnul = uganda[uganda.id.notnull()]
recode_id(ug_id_nnul)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [30]:
ug_id_nnul.head()

Unnamed: 0,id,name,gaz_id,latitude,longitude,country,district,province,category,submitter,submission_date,curator,curation_date,public_notes,internal_notes,source_file
0,AP?,Apala,,2.38278,33.045,Uganda,Lira,,,,,,,,,Wakiso.xlsx
1,AP?,Apac,,1.9756,32.5386,Uganda,Apac,,,,,,,,,Wakiso.xlsx
2,BB,Bulanga B,,1.068104,33.777342,Uganda,Budaka,20.0,,,,,,,,Wakiso.xlsx
3,BB,Bulanga B Trap 20,,1.0728,33.77953,Uganda,Budaka,20.0,,,,,,,,Wakiso.xlsx
4,BB,Bulanga B Trap 21,,1.0725,33.77938,Uganda,Budaka,21.0,,,,,,,,Wakiso.xlsx


# Collect all trap GPS, save it

In [32]:
def lat_lon_to_geometry(df,lat='Latitude',lon='Longitude'):
    df["geometry"] = map(Point, df[lon], df[lat])

In [33]:
traps_gps = pd.DataFrame()
villages_gps = pd.DataFrame()

In [36]:
traps_gps['Location'] = ug_id_nnul['id']
traps_gps['Latitude'] = ug_id_nnul['latitude']
traps_gps['Longitude'] = ug_id_nnul['longitude']

lat_lon_to_geometry(traps_gps,lat='Latitude',lon='Longitude')
traps_gps = GeoDataFrame(traps_gps, geometry='geometry')

In [37]:
traps_gps.head()

Unnamed: 0,Location,Latitude,Longitude,geometry
0,AP?,2.38278,33.045,POINT (33.045 2.38278)
1,AP?,1.9756,32.5386,POINT (32.5386 1.9756)
2,BB,1.068104,33.777342,POINT (33.777342 1.068104)
3,BB,1.0728,33.77953,POINT (33.77953 1.0728)
4,BB,1.0725,33.77938,POINT (33.77938 1.0725)


In [39]:
# save geojson
save_geojson(traps_gps,traps_gps_geojson)

In [40]:
print traps_gps_geojson

/home/gus/Dropbox/uganda_data/data_repos/field_data/locations/gps/traps/uganda_villages_gps_beadell.geojson


# Generate Village GPS and save it

In [48]:
village_gps = pd.pivot_table(traps_gps,index=['Location'],
                      values=['Longitude','Latitude'],
                      fill_value=0,
                      aggfunc=[np.median])
village_gps.head()
village_gps = village_gps['median']
village_gps = village_gps.reset_index()
village_gps.head()

lat_lon_to_geometry(village_gps)
village_gps = GeoDataFrame(village_gps, geometry='geometry')
village_gps.head()

Unnamed: 0,Location,Latitude,Longitude,geometry
0,AP?,2.17919,32.7918,POINT (32.7918 2.17919)
1,BB,1.068104,33.77849,POINT (33.77849 1.068104)
2,BD,0.260897,33.929442,POINT (33.92944166666666 0.2608966666666667)
3,BG,1.619185,33.289898,POINT (33.28989846153846 1.619185)
4,BK,1.011333,33.861239,POINT (33.86123916666666 1.011333333333333)


In [49]:
len(village_gps)

37

In [51]:
# save geojson
save_geojson(village_gps,village_gps_geojson)
print village_gps_geojson

/home/gus/Dropbox/uganda_data/data_repos/field_data/locations/gps/villages/uganda_villages_gps_beadell.geojson
