In [1]:
import pandas as pd
import numpy as np

# Render up to 30 columns when a DataFrame is displayed.
pd.options.display.max_columns = 30

# Load the training observations. `Date` is kept as the raw string column on
# the default integer index (no datetime parsing / re-indexing is applied here).
# NOTE(review): the rendered output shows an 'Unnamed: 0' column, which suggests
# the CSV carries a previously saved index; `index_col=0` would drop it --
# confirm nothing downstream relies on that column before changing.
df = pd.read_csv(filepath_or_buffer='data/train.csv')
df.head(n=5)

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0
2,2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,1,0
3,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX PIPIENS/RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,1,0
4,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,4,0


In [2]:
# Load the spray records. `Date` is kept as the raw string column on the
# default integer index (no datetime parsing / re-indexing is applied here).
spray = pd.read_csv(filepath_or_buffer='data/spray.csv')
spray.head(n=5)

Unnamed: 0,Date,Time,Latitude,Longitude
0,2011-08-29,6:56:58 PM,42.391623,-88.089163
1,2011-08-29,6:57:08 PM,42.391348,-88.089163
2,2011-08-29,6:57:18 PM,42.391022,-88.089157
3,2011-08-29,6:57:28 PM,42.390637,-88.089158
4,2011-08-29,6:57:38 PM,42.39041,-88.088858


The function below calculates the great-circle distance in kilometers between two points on Earth, given their latitude and longitude in degrees, using the Haversine formula.

In [3]:
from math import sin, cos, sqrt, asin, radians
def distance(lat1,lon1,lat2,lon2):
    """Return the great-circle distance between two points, in kilometers.

    Uses the Haversine formula with an Earth radius of 6373 km.

    Parameters
    ----------
    lat1, lon1 : latitude and longitude of the first point, in degrees.
    lat2, lon2 : latitude and longitude of the second point, in degrees.

    Returns
    -------
    float
        Distance along the sphere's surface, in kilometers.
    """
    # radius of the Earth in km
    earth_radius_km = 6373

    # the formula works on angles expressed in radians
    phi1, lambda1 = radians(lat1), radians(lon1)
    phi2, lambda2 = radians(lat2), radians(lon2)

    # half of the latitude / longitude differences
    half_dphi = (phi2 - phi1)/2
    half_dlambda = (lambda2 - lambda1)/2

    # haversine of the central angle between the two points
    hav = sin(half_dphi)**2 + cos(phi1)*cos(phi2)*sin(half_dlambda)**2

    # central angle (2*asin(sqrt(hav))) scaled by the radius
    return 2*asin(sqrt(hav))*earth_radius_km

In [4]:
# distinct dates on which spraying was recorded
spray_dates = pd.unique(spray['Date'])
spray_dates

array(['2011-08-29', '2011-09-07', '2013-07-17', '2013-07-25',
       '2013-08-08', '2013-08-15', '2013-08-16', '2013-08-22',
       '2013-08-29', '2013-09-05'], dtype=object)

The loop below calculates the distance between each spray point and each observation. For each of two distance thresholds (0.5 km and 1 km), it adds one column per spray date to the dataframe, named after that date, and sets it to 1 when the distance is within the threshold and to 0 otherwise. The resulting dataframe for each threshold is saved to its own file.

In [None]:
# choosing to consider 0.5 and 1 km distance
for dist in [0.5,1]:

    # Work on a real copy of the train df. Plain assignment (`spray_train = df`)
    # only aliases it, so the spray columns would be written into `df` itself
    # and carried over from one threshold to the next.
    spray_train = df.copy()

    # Many observations share the same coordinates, so every distinct
    # (Latitude, Longitude) pair is evaluated once and the result reused
    # for all rows at that location.
    row_locations = list(zip(spray_train['Latitude'], spray_train['Longitude']))
    unique_locations = list(dict.fromkeys(row_locations))

    # for each date in the spray dataset
    for date in spray_dates:

        # spray points recorded on the date we are considering in the loop
        # (exact duplicate points cannot change the result, so drop them)
        current_df = spray.loc[spray['Date'] == date, ['Latitude', 'Longitude']].drop_duplicates()
        spray_points = list(zip(current_df['Latitude'], current_df['Longitude']))

        # generate a name for the column to add to spray_train
        column_name = date + '_spray'

        # Flag a location with 1 if ANY spray point of this date lies within
        # `dist` km of it, otherwise 0. Writing the flag inside the loop over
        # spray points would overwrite it on every iteration, leaving only
        # the result for the last spray point of the date.
        sprayed = {
            location: int(any(
                distance(location[0], location[1], lat2, lon2) <= dist
                for lat2, lon2 in spray_points
            ))
            for location in unique_locations
        }

        # one whole-column assignment instead of one .loc write per row pair;
        # the flags are stored as integers (0 / 1)
        spray_train[column_name] = [sprayed[location] for location in row_locations]

    # generate name for the output file (kept without a '.csv' suffix so
    # existing references to this path keep working)
    csv_name = 'train_spray_'+str(dist)+'km'

    # save to csv
    spray_train.to_csv(csv_name)