In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import os
import re
import pprint

import folium
from folium import plugins

In [31]:
# Load the prepared data frame and the model column list.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
# Context managers make sure both file handles are closed after reading.
with open('pickles/df_data_upd.pkl','rb') as fh:
    df_data = pickle.load(fh)
with open('pickles/X_col_log.pkl','rb') as fh:
    X_col = pickle.load(fh)

In [3]:
# Preview the first rows of the columns named in X_col, skipping its first entry.
df_data[X_col[1:]].head()

Unnamed: 0,month,latitude,longitude,msqto_log,tmax,tmin,tavg,delta,dewpt,wetbulb,...,winddirection,avgspeed,spray_targeted_prev,spray_targeted,species_culex_pipiens,species_culex_pipiens_restuans,species_culex_restuans,species_culex_salinarius,species_culex_tarsalis,species_culex_territans
0,6.0,41.991429,-87.747113,2.397986,86,65,76.0,6.0,67.0,69.0,...,18.0,11.6,0,False,0,1,0,0,0,0
1,8.0,41.678618,-87.559308,2.944492,84,72,78.0,5.0,71.0,73.0,...,29.0,6.0,0,False,0,0,0,0,0,1
2,9.0,41.948167,-87.730698,2.833272,84,58,71.0,7.0,55.0,61.0,...,33.0,2.1,0,False,0,1,0,0,0,0
3,8.0,41.702724,-87.536497,0.001,80,71,76.0,4.0,67.0,70.0,...,7.0,5.8,0,False,0,0,0,1,0,0
4,9.0,41.964242,-87.757639,1.098946,84,69,77.0,14.0,68.0,70.0,...,16.0,10.6,1,False,0,1,0,0,0,0


Expected inputs from user:
* month, expected mosquito count (low, avg, high), species present, whether the area was sprayed

Assumptions/approach:
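* "avg" mosquito count option: monthly average of daily mosquito counts
* "low" mosquito count option: 0.1 quantile of mosquito counts per month
* "high" mosquito count option: 0.9 quantile of mosquito counts per month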
* assume uniform distribution amongst species types
* assume uniform distribution for all locations
* assume uniform distribution over days per month
* assume sprayed in both current and prev month if user selects to spray

In [97]:
# Turn the log-scale feature back into a whole-number count; int() truncates the fractional part.
# presumably msqto_log was built as log(0.001 + count), so truncation removes the offset — confirm
df_data['msqto_cnt'] = df_data['msqto_log'].map(lambda log_cnt: int(np.e**log_cnt))

In [98]:
# Mean mosquito count of each month, broadcast back onto every row of that month.
df_data['msqto_avg'] = df_data.groupby('month')['msqto_cnt'].transform('mean')

In [102]:
# 0.1 quantile of the mosquito count within each month, broadcast onto every row of that month.
df_data['msqto_low'] = df_data.groupby('month')['msqto_cnt'].transform(lambda cnt: cnt.quantile(0.1))

In [100]:
# 0.9 quantile of the mosquito count within each month, broadcast onto every row of that month.
df_data['msqto_high'] = df_data.groupby('month')['msqto_cnt'].transform(lambda cnt: cnt.quantile(0.9))

In [103]:
# Preview the first rows, including the msqto_cnt / msqto_avg / msqto_low / msqto_high columns.
df_data.head()

Unnamed: 0,date,virus_present,year,month,species,latitude,longitude,mosquito_cnt,tmax,tmin,...,species_culex_tarsalis,species_culex_territans,msqto_log,heat_log,sunset_log,precip_log,msqto_cnt,msqto_avg,msqto_low,msqto_high
0,2009-06-19,0,2009.0,6.0,CULEX PIPIENS/RESTUANS,41.991429,-87.747113,11,86,65,...,0,0,2.397986,-6.907755,-4.036076,1.379018,11,9.622289,1.0,22.0
1,2007-08-09,0,2007.0,8.0,CULEX TERRITANS,41.678618,-87.559308,19,84,72,...,0,1,2.944492,-6.907755,-0.724982,-2.97593,19,12.127636,1.0,35.0
2,2009-09-14,0,2009.0,9.0,CULEX PIPIENS/RESTUANS,41.948167,-87.730698,17,84,58,...,0,0,2.833272,-6.907755,0.3607,-6.907755,17,7.887006,1.0,19.0
3,2007-08-15,0,2007.0,8.0,CULEX SALINARIUS,41.702724,-87.536497,1,80,71,...,0,0,0.001,-6.907755,-0.481806,-1.465338,1,12.127636,1.0,35.0
4,2013-09-19,0,2013.0,9.0,CULEX PIPIENS/RESTUANS,41.964242,-87.757639,3,84,69,...,0,0,1.098946,-6.907755,0.470628,-0.173164,3,7.887006,1.0,19.0


In [113]:
# Summary frame: one row per (month, latitude, longitude) with the mean of each remaining feature.
# msqto_avg/low/high only vary by month, so putting them in the group key carries them
# through unchanged instead of averaging them.
# Columns the user supplies at prediction time (mosquito count, spray flags, species) are dropped.
df_summary = (
    df_data[X_col[1:]+['msqto_avg','msqto_low','msqto_high']]
    .groupby(
        ['month','latitude','longitude','msqto_avg','msqto_low','msqto_high'],
        as_index=False,
    )
    .mean()
    .drop(columns=[
        'msqto_log',
        'spray_targeted_prev',
        'spray_targeted',
        'species_culex_pipiens',
        'species_culex_pipiens_restuans',
        'species_culex_restuans',
        'species_culex_salinarius',
        'species_culex_tarsalis',
        'species_culex_territans',
    ])
)
df_summary.head()

Unnamed: 0,month,latitude,longitude,msqto_avg,msqto_low,msqto_high,tmax,tmin,tavg,delta,...,heat_log,cool,sunrise,sunset_log,precip_log,pressure,sealvlpressure,windspeed,winddirection,avgspeed
0,5.0,41.659112,-87.538693,2.802469,1.0,5.0,62.0,54.0,58.0,-5.0,...,1.946053,0.0,4.35,-1.382302,-6.907755,29.06,29.76,6.8,30.0,8.1
1,5.0,41.662014,-87.724608,2.802469,1.0,5.0,62.0,54.0,58.0,-5.0,...,1.946053,0.0,4.35,-1.382302,-6.907755,29.06,29.76,6.8,30.0,8.1
2,5.0,41.673408,-87.599862,2.802469,1.0,5.0,62.0,54.0,58.0,-5.0,...,1.946053,0.0,4.35,-1.382302,-6.907755,29.06,29.76,6.8,30.0,8.1
3,5.0,41.678618,-87.559308,2.802469,1.0,5.0,62.0,54.0,58.0,-5.0,...,1.946053,0.0,4.35,-1.382302,-6.907755,29.06,29.76,6.8,30.0,8.1
4,5.0,41.680946,-87.535198,2.802469,1.0,5.0,62.0,54.0,58.0,-5.0,...,1.946053,0.0,4.35,-1.382302,-6.907755,29.06,29.76,6.8,30.0,8.1


In [114]:
# Persist the summary frame; the context manager flushes and closes the file
# so the pickle is complete on disk before anything reads it back.
with open('pickles/df_summary.pkl','wb') as fh:
    pickle.dump(df_summary,fh)

In [118]:
# List the columns selected by X_col[1:].
df_data[X_col[1:]].columns

Index(['month', 'latitude', 'longitude', 'msqto_log', 'tmax', 'tmin', 'tavg',
       'delta', 'dewpt', 'wetbulb', 'heat_log', 'cool', 'sunrise',
       'sunset_log', 'precip_log', 'pressure', 'sealvlpressure', 'windspeed',
       'winddirection', 'avgspeed', 'spray_targeted_prev', 'spray_targeted',
       'species_culex_pipiens', 'species_culex_pipiens_restuans',
       'species_culex_restuans', 'species_culex_salinarius',
       'species_culex_tarsalis', 'species_culex_territans'],
      dtype='object')

In [119]:
# List the columns of the summary frame.
df_summary.columns

Index(['month', 'latitude', 'longitude', 'msqto_avg', 'msqto_low',
       'msqto_high', 'tmax', 'tmin', 'tavg', 'delta', 'dewpt', 'wetbulb',
       'heat_log', 'cool', 'sunrise', 'sunset_log', 'precip_log', 'pressure',
       'sealvlpressure', 'windspeed', 'winddirection', 'avgspeed'],
      dtype='object')

In [67]:
# Count the records for each species label.
df_data['species'].value_counts()

CULEX PIPIENS/RESTUANS    3826
CULEX RESTUANS            2393
CULEX PIPIENS             1950
CULEX TERRITANS            216
CULEX SALINARIUS            83
CULEX TARSALIS               6
CULEX ERRATICUS              1
Name: species, dtype: int64

In [76]:
# messing around with the idea of using % per species...next time
# Map each species label to a column-safe name (spaces and slashes -> underscores, lower case).
species_cols = {
    label: re.sub(r'[ /]','_',label).lower()
    for label in df_data['species'].value_counts().index
}

# One column per species: the record's mosquito count when it is that species, otherwise 0.
for label, col in species_cols.items():
    df_data[col] = df_data['species'].eq(label) * df_data['msqto_cnt']

# NOTE(review): the group key includes the per-species count column itself, so rows are
# grouped by count value rather than by species — confirm this is the intended total.
for col in species_cols.values():
    df_data['sum_'+col] = (
        df_data.groupby(['month','year',col])['msqto_cnt']
        .transform(lambda counts: counts.sum())
    )

In [124]:
# Scratch array: one row per summary row, six columns, only the third column set to 1.
np_test = np.tile([0., 0., 1., 0., 0., 0.], (len(df_summary), 1))

In [125]:
# Display the scratch array.
np_test

array([[0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       ...,
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.]])

In [127]:
# Scratch check: enumerating a dict yields (position, key) pairs in insertion order.
test_dict = {'pipiens':1,'territans':0}
for position,(name,flag) in enumerate(test_dict.items()):
    print(position,name,flag)

0 pipiens 1
1 territans 0


In [2]:
import api
from api import *

In [40]:
# Score one scenario and draw it on a map.
# gen_data / gen_map are not defined in this notebook; presumably they come from
# `from api import *` — confirm.
# NOTE(review): the positional arguments presumably follow gen_data2's order
# (month, msqto_qnt, spray, then the seven species flags) — verify against api.gen_data.
df_output = gen_data(5,0.5,1,1,1,1,1,1,1,1)

# alternative three-colour palette, kept for reference
#colors = ['#B7A2DB','#9B5A8A','#55286F']
colors = ['#9EFFB6','#47B2A9','#417271']
gen_map(df_output,colors)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  df_new = pd.concat((df_new,df_loop),axis=0)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  df_new = pd.concat((df_new,df_loop),axis=0)


TypeError: __init__() got an unexpected keyword argument 'fill'

In [647]:
# Show what identify_bins returns for this output and colour list
# (helper is not defined in this notebook; presumably from the api module — confirm).
identify_bins(df_output,colors)

array([0.17785714, 0.1897619 , 0.25035714])

In [668]:
# more circles because more traps checked in June on average than in May

# Second scenario with different arguments; gen_data / gen_map presumably come from
# `from api import *` — confirm.
# NOTE(review): argument order presumably matches gen_data2
# (month, msqto_qnt, spray, then the seven species flags) — verify against api.gen_data.
df_output = gen_data(6,0.1,0,0,0,0,1,0,0,0)

# alternative three-colour palette, kept for reference
#colors = ['#B7A2DB','#9B5A8A','#55286F']
colors = ['#9EFFB6','#47B2A9','#417271']
gen_map(df_output,colors)

In [662]:
# Show what identify_bins returns for the second scenario
# (helper is not defined in this notebook; presumably from the api module — confirm).
identify_bins(df_output,colors)

array([0.05, 0.1 , 0.21])

In [3]:
import itertools
# Every on/off combination of seven flags: 2**7 rows of 0/1, starting with the all-zero row.
np_species = np.array(list(itertools.product((0,1), repeat=7)))

In [16]:
# All combinations except the first row (the all-zero combination).
np_species[1:]

array([[0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 1, 1],
       [0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 1, 0, 1],
       [0, 0, 0, 0, 1, 1, 0],
       [0, 0, 0, 0, 1, 1, 1],
       [0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 1],
       [0, 0, 0, 1, 0, 1, 0],
       [0, 0, 0, 1, 0, 1, 1],
       [0, 0, 0, 1, 1, 0, 0],
       [0, 0, 0, 1, 1, 0, 1],
       [0, 0, 0, 1, 1, 1, 0],
       [0, 0, 0, 1, 1, 1, 1],
       [0, 0, 1, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 1],
       [0, 0, 1, 0, 0, 1, 0],
       [0, 0, 1, 0, 0, 1, 1],
       [0, 0, 1, 0, 1, 0, 0],
       [0, 0, 1, 0, 1, 0, 1],
       [0, 0, 1, 0, 1, 1, 0],
       [0, 0, 1, 0, 1, 1, 1],
       [0, 0, 1, 1, 0, 0, 0],
       [0, 0, 1, 1, 0, 0, 1],
       [0, 0, 1, 1, 0, 1, 0],
       [0, 0, 1, 1, 0, 1, 1],
       [0, 0, 1, 1, 1, 0, 0],
       [0, 0, 1, 1, 1, 0, 1],
       [0, 0, 1, 1, 1, 1, 0],
       [0, 0, 1, 1, 1, 1, 1],
       [0, 1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 1],
       [0,

In [18]:
# First combination after the all-zero row has been skipped.
np_species[1:][0]

array([0, 0, 0, 0, 0, 0, 1])

In [35]:
import pickle
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pprint
import folium
from folium import plugins

# Load the artifacts gen_data2 reads as module-level names.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
# Each file is opened in a context manager so its handle is closed after reading.
with open('pickles/randomforest_10.pkl','rb') as fh:
    model = pickle.load(fh)
with open('pickles/ssX.pkl','rb') as fh:
    ssX = pickle.load(fh)
with open('pickles/df_summary.pkl','rb') as fh:
    df = pickle.load(fh)
with open('pickles/X_col_log.pkl','rb') as fh:
    # the first entry of the stored column list is skipped
    col_order = pickle.load(fh)[1:]

def gen_data2(month,msqto_qnt,spray,erraticus,pipiens,pipiens_restuans,restuans,salinarius,tarsalis,territans):
    """Build the model input for one user scenario and score it per location.

    Reads the module-level ``df`` (summary frame), ``ssX`` (object providing
    ``transform``), ``model`` (object providing ``predict_proba``),
    ``col_order`` (feature column order) and ``func`` (species labeller).

    Parameters
    ----------
    month
        Value matched against ``df['month']`` to select the rows to score.
    msqto_qnt
        ``0.5`` uses ``msqto_avg`` as the mosquito count, ``0.1`` uses
        ``msqto_low``, any other value uses ``msqto_high``.
    spray
        ``1`` flags every row as sprayed in both ``spray_targeted`` and
        ``spray_targeted_prev``; any other value flags neither.
    erraticus
        ``1`` adds a copy of the rows with all six species indicators at 0.
    pipiens, pipiens_restuans, restuans, salinarius, tarsalis, territans
        ``1`` adds a copy of the rows with that species indicator set to 1.

    Returns
    -------
    pandas.DataFrame
        Columns ``spray_targeted``, ``species``, ``month``, ``latitude``,
        ``longitude``, ``virus_present`` (mean predicted probability) and
        ``percentile`` (percentage rank of ``virus_present`` within the
        returned frame). When no species flag equals 1 an empty frame with
        these columns is returned instead of failing on the missing species
        columns.
    """
    df_input = df[df['month']==month].reset_index(drop=True).copy()

    # hardcoded for use in web app as options available for user input
    if msqto_qnt == 0.5:
        df_input['msqto_cnt'] = df_input['msqto_avg']
    elif msqto_qnt == 0.1:
        df_input['msqto_cnt'] = df_input['msqto_low']
    else:
        df_input['msqto_cnt'] = df_input['msqto_high']

    # small offset keeps the log finite when the count is 0
    df_input['msqto_log'] = np.log(0.001+df_input['msqto_cnt'])

    # the same flag is used for the current and the previous spray column
    spray_flag = 1.0 if spray == 1 else 0.0
    df_input['spray_targeted'] = spray_flag
    df_input['spray_targeted_prev'] = spray_flag

    species_dict = {'pipiens':pipiens,'pipiens_restuans':pipiens_restuans,'restuans':restuans,'salinarius':salinarius,'tarsalis':tarsalis,'territans':territans}
    species_cols = ['species_culex_'+x for x in species_dict]
    n_rows = df_input.shape[0]

    # One copy of the input rows per selected species, each with a one-hot indicator.
    # Frames are collected and concatenated once, so every frame has identical columns.
    frames = []
    for idx,selected in enumerate(species_dict.values()):
        if selected != 1:
            continue
        np_species = np.zeros((n_rows,len(species_cols)))
        np_species[:,idx] = 1
        frames.append(df_input.join(pd.DataFrame(np_species,columns=species_cols)))

    # erraticus has no indicator column of its own: it is the all-zero row
    if erraticus == 1:
        np_species = np.zeros((n_rows,len(species_cols)))
        frames.append(df_input.join(pd.DataFrame(np_species,columns=species_cols)))

    key_cols = ['spray_targeted','species','month','latitude','longitude']
    if not frames:
        # nothing selected: there are no rows to score
        return pd.DataFrame(columns=key_cols+['virus_present','percentile'])

    # The result must be assigned back: the previous chained call with
    # inplace=True acted on a temporary and left df_new unchanged.
    df_new = pd.concat(frames,axis=0).drop_duplicates().reset_index(drop=True)

    X_scaled = ssX.transform(df_new[col_order])

    # second probability column; assumes model.classes_ is [0, 1] so this is P(virus present) — confirm
    df_new['virus_present'] = model.predict_proba(X_scaled)[:,1]

    df_new['species'] = df_new.apply(func, axis=1)

    df_output = (
        df_new[key_cols+['virus_present']]
        .groupby(key_cols)['virus_present']
        .mean()
        .reset_index()
    )
    df_output['percentile'] = df_output['virus_present'].rank(pct=True)
    return df_output

def func(row):
    """Return the species label for a row of species indicator columns.

    The indicator columns are checked in a fixed order and the label of the
    first one equal to 1 is returned. A row with no indicator set is
    labelled 'culex erraticus'.

    Parameters
    ----------
    row
        Mapping-like object (e.g. a DataFrame row) indexable by the
        ``species_culex_*`` column names.

    Returns
    -------
    str
        Lower-case species label with words separated by spaces.
    """
    # Checked in this order; the territans label now uses a space like the
    # other labels (it was previously 'culex_territans').
    labels = (
        ('species_culex_pipiens', 'culex pipiens'),
        ('species_culex_pipiens_restuans', 'culex pipiens/restuans'),
        ('species_culex_restuans', 'culex restuans'),
        ('species_culex_salinarius', 'culex salinarius'),
        ('species_culex_tarsalis', 'culex tarsalis'),
        ('species_culex_territans', 'culex territans'),
    )
    for column, label in labels:
        if row[column] == 1:
            return label
    return 'culex erraticus'


In [32]:
# List all columns of df_data.
df_data.columns

Index(['date', 'virus_present', 'year', 'month', 'species', 'latitude',
       'longitude', 'mosquito_cnt', 'tmax', 'tmin', 'tavg', 'delta', 'dewpt',
       'wetbulb', 'heat', 'cool', 'sunrise', 'sunset', 'codesum', 'snowdepth',
       'snowfall', 'precip', 'pressure', 'sealvlpressure', 'windspeed',
       'winddirection', 'avgspeed', 'spray_dist', 'last_spray_dist',
       'spray_targeted_prev', 'spray_targeted', 'species_culex_pipiens',
       'species_culex_pipiens_restuans', 'species_culex_restuans',
       'species_culex_salinarius', 'species_culex_tarsalis',
       'species_culex_territans', 'msqto_log', 'heat_log', 'sunset_log',
       'precip_log'],
      dtype='object')

In [36]:
# Score every scenario: months 5-10, the three mosquito-count levels, spray off/on,
# and every species combination except the all-zero row of np_species.
# Frames are collected in a list and concatenated once; growing a DataFrame with
# pd.concat on each iteration re-copies all earlier rows every time.
# Columns keep the order returned by gen_data2. Concatenating onto an empty seed
# frame is what produced the column-sorting warning.
# NOTE(review): msqto_qnt and the species combination are not columns of the result,
# so rows from different scenarios cannot be told apart afterwards — confirm intended.
scenario_frames = []
for month in range(5,11):
    for msqto_qnt in [0.1,0.5,0.9]:
        for spray in [0,1]:
            for species in np_species[1:]:
                scenario_frames.append(gen_data2(month,msqto_qnt,spray,*species))

df_output_all = pd.concat(scenario_frames,axis=0)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.




In [37]:
# Report (rows, columns) of the combined output.
df_output_all.shape

(1811712, 7)

In [38]:
# Persist the combined output; the context manager flushes and closes the file
# so the pickle is complete on disk.
with open('pickles/df_output_all.pkl','wb') as fh:
    pickle.dump(df_output_all,fh)

In [39]:
# Export the combined output as comma-separated text without the index column.
df_output_all.to_csv('Data/output_all.csv', index=False)