In [1]:
import pandas as pd
import numpy as np
from sklearn import ensemble, preprocessing
import datetime as dt
from sklearn import model_selection
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

# Load the raw data sets from the local `asset/` directory
train = pd.read_csv('asset/train.csv')
test = pd.read_csv('asset/test.csv')
sample = pd.read_csv('asset/sampleSubmission.csv')
weather = pd.read_csv('asset/weather.csv')
spray = pd.read_csv('asset/spray.csv')

# Target labels: the WnvPresent column of the training data
labels = train.WnvPresent.values

# Create one 0/1 indicator column per code found in CodeSum.
# Each CodeSum string is first split on whitespace into a list of codes.
weather.CodeSum = weather.CodeSum.apply(str.split)
for (i, list_) in enumerate(weather.CodeSum):
    for item in list_:
        # First time this code is seen: add its column, all zeros
        if item not in weather.columns:
            weather[item] = 0
        # `.at` replaces DataFrame.set_value, which was deprecated and then
        # removed from pandas. `i` is the enumerate position used as a row
        # label, which matches the default 0..n-1 index that read_csv produces.
        weather.at[i, item] = 1


# CodeSum has been expanded into indicator columns, so drop it
weather = weather.drop('CodeSum', axis=1)
# Split the rows of station 1 and station 2 and join them side by side on
# Date (columns present in both halves get pandas' default _x / _y suffixes)
weather_stn1 = weather[weather['Station']==1]
weather_stn2 = weather[weather['Station']==2]
weather_stn1 = weather_stn1.drop('Station', axis=1)
weather_stn2 = weather_stn2.drop('Station', axis=1)
weather = weather_stn1.merge(weather_stn2, on='Date')

# Parse every Date column into datetime64 so dates can be compared later
train.Date = pd.to_datetime(train.Date)
test.Date = pd.to_datetime(test.Date)
weather.Date = pd.to_datetime(weather.Date)
spray.Date = pd.to_datetime(spray.Date)

In [301]:
# Column-wise minimum and maximum of the spray records for each spray date.
# NOTE(review): `max_spary` looks like a typo for `max_spray`; the name is kept
# unchanged here because later cells may refer to it.
min_spray = spray.groupby(by='Date').agg('min')
max_spary = spray.groupby(by='Date').agg('max')

In [302]:
# Summary statistics (as produced by describe()) of the spray data, grouped by spray date
spray_desc = spray.groupby(by='Date').describe()

In [297]:
# Display the 'min' statistic of Latitude for every spray date
spray_desc['Latitude']['min']

Date
2011-08-29    42.389460
2011-09-07    41.968435
2013-07-17    41.714098
2013-07-25    41.939308
2013-08-08    41.917227
2013-08-15    41.887825
2013-08-16    41.911375
2013-08-22    41.713925
2013-08-29    41.758812
2013-09-05    41.976773
Name: min, dtype: float64

In [286]:
# lat = pd.DataFrame(spray_desc.Latitude['max']).join(spray_desc.Latitude['min'])

In [287]:
# lon = pd.DataFrame(spray_desc.Longitude['max']).join(spray_desc.Longitude['min'])

In [289]:
# plt.subplot()
# for c in lat.index:
#     spray[spray.Date == c].plot(x='Longitude', y='Latitude',style='.')
#     plt.autoscale()

In [290]:
# # Iterating over unique dates that sprays took place
# # Sprays took place over 10 days as trucks drove around chicago
# for date in spray.Date.unique():
#     spray_temp = spray[spray.Date == date]
#     spray_temp.index = range(0, len(spray_temp))
    
#     # I am creating a column for every unique date and initializing its rows to 0
#     # I will set these values to 1 when I find a trap that was sprayed
#     train['spray_'+str(date)] = 0
    
#     # Iterating over each row of our training data to determine if a trap is in the location
#     # of a spray. I am also checking to see if the spray was in the past
#     for r in range(0,len(train)):
#         if train.get_value(r,'Date') > date and train.get_value(r,'Date') < date + pd.Timedelta(weeks=2) :
            
#             # I am casting the lat and long to ints, and multiplying by 100 to truncate precision
#             # In other words, I'm taking pin points and making them into squares
#             cur_lat = int(train.get_value(r, 'Latitude') * 100)
#             cur_long = int(train.get_value(r, 'Longitude') * 100)
#             for i in range(0, len(spray_temp)):
                
#                 spray_lat = int(spray_temp.get_value(i,'Latitude')*100)
#                 spray_long = int(spray_temp.get_value(i,'Longitude')*100)
                
#                 # I am now checking if something is in the square +/- some threshold
#                 if (cur_lat < spray_lat + 10 and cur_lat > spray_lat - 10) and \
#                 (cur_long < spray_long + 10 and cur_long > spray_long - 10):
#                     train.set_value(r,'spray_'+str(date), 1)
#                     break

In [2]:
def _to_grid_cell(values):
    """Scale coordinates by 100 and truncate toward zero (same as ``int(v * 100)``)."""
    return np.trunc(np.asarray(values, dtype=float) * 100).astype(np.int64)


def has_been_sprayed(data_df, spray_df, time_period=2):
    """Add one 0/1 column per spray date to ``data_df``, flagging sprayed rows.

    For every distinct date in ``spray_df.Date`` a column named
    ``'spray_' + str(date) + '_' + str(time_period)`` is added to ``data_df``
    (``date`` is rendered as a nanosecond ``numpy.datetime64``, e.g.
    ``spray_2013-07-17T00:00:00.000000000_2``).  A row gets the value 1 when

    * its ``Date`` is strictly after the spray date and strictly before the
      spray date plus ``time_period`` weeks, and
    * at least one spray point of that date is closer than 10 grid cells in
      both latitude and longitude, where a grid cell is the coordinate
      multiplied by 100 and truncated to an integer.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Frame with ``Date`` (datetime), ``Latitude`` and ``Longitude`` columns.
        It is modified in place; any index is accepted.
    spray_df : pandas.DataFrame
        Frame with ``Date`` (datetime), ``Latitude`` and ``Longitude`` columns.
    time_period : int or float, default 2
        Length of the look-back window in weeks.  Previously this value was
        only used in the column name and the window was always two weeks.

    Returns
    -------
    None
    """
    window = pd.Timedelta(weeks=time_period).to_timedelta64()

    # Pull everything out as plain numpy arrays once; the per-date work below
    # is then positional and does not depend on the frames' index labels.
    data_dates = data_df['Date'].values.astype('datetime64[ns]')
    data_lat = _to_grid_cell(data_df['Latitude'].values)
    data_long = _to_grid_cell(data_df['Longitude'].values)

    spray_dates = spray_df['Date'].values.astype('datetime64[ns]')
    spray_lat = _to_grid_cell(spray_df['Latitude'].values)
    spray_long = _to_grid_cell(spray_df['Longitude'].values)

    # Distinct spray dates in order of first appearance.  Forcing a
    # nanosecond numpy dtype keeps str(date) -- and therefore the column
    # names -- stable across pandas versions.
    unique_dates = spray_df['Date'].drop_duplicates().values.astype('datetime64[ns]')

    for date in unique_dates:
        col_name = 'spray_' + str(date) + "_" + str(time_period)
        sprayed = np.zeros(len(data_df), dtype=np.int64)

        # Only rows observed after the spray and inside the window can match
        in_window = (data_dates > date) & (data_dates < date + window)
        if in_window.any():
            on_date = spray_dates == date
            # Many spray points share a grid cell; de-duplicating them keeps
            # the pairwise comparison below small.
            cells = np.unique(
                np.column_stack((spray_lat[on_date], spray_long[on_date])),
                axis=0,
            )
            # Pairwise (row x spray cell) distance test: strictly less than
            # 10 cells on both axes.
            near_lat = np.abs(data_lat[in_window][:, None] - cells[:, 0][None, :]) < 10
            near_long = np.abs(data_long[in_window][:, None] - cells[:, 1][None, :]) < 10
            sprayed[in_window] = (near_lat & near_long).any(axis=1)

        data_df[col_name] = sprayed

In [3]:
# Add the per-spray-date indicator columns to the training data (modifies `train` in place)
has_been_sprayed(data_df=train, spray_df=spray)

In [4]:
# Column totals as a quick sanity check of the new spray_* indicator columns.
# numeric_only=True is required on current pandas, where summing a frame that
# also holds datetime columns raises instead of silently skipping them.
train.sum(numeric_only=True)

Block                                    374936.000000
Latitude                                 439583.009378
Longitude                               -921375.233272
AddressAccuracy                           82152.000000
NumMosquitos                             135039.000000
WnvPresent                                  551.000000
spray_2011-08-29T00:00:00.000000000_2         0.000000
spray_2011-09-07T00:00:00.000000000_2        91.000000
spray_2013-07-17T00:00:00.000000000_2       276.000000
spray_2013-07-25T00:00:00.000000000_2        81.000000
spray_2013-08-08T00:00:00.000000000_2        78.000000
spray_2013-08-15T00:00:00.000000000_2       100.000000
spray_2013-08-16T00:00:00.000000000_2       145.000000
spray_2013-08-22T00:00:00.000000000_2        94.000000
spray_2013-08-29T00:00:00.000000000_2        99.000000
spray_2013-09-05T00:00:00.000000000_2       106.000000
dtype: float64