In [37]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [38]:
# Trap observations: the labelled training set and the unlabelled test set.
train = pd.read_csv(
    '../assets/train.csv',
)
test = pd.read_csv(
    '../assets/test.csv',
)

# Weather table, indexed by its parsed 'Date' column.
wx = pd.read_csv(
    '../datasets/cleaned_roll_wx.csv',
    parse_dates=True,
    index_col='Date',
)

In [39]:
# Peek at the first two training rows.
train.head(2)

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0


In [40]:
# Peek at the first two test rows.
test.head(2)

Unnamed: 0,Id,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy
0,1,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
1,2,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9


In [41]:
# (rows, columns) of the training set.
train.shape

(10506, 12)

In [42]:
# Number of missing values in each training column.
train.isnull().sum()

Date                      0
Address                   0
Species                   0
Block                     0
Street                    0
Trap                      0
AddressNumberAndStreet    0
Latitude                  0
Longitude                 0
AddressAccuracy           0
NumMosquitos              0
WnvPresent                0
dtype: int64

In [43]:
# Number of missing values in each test column.
test.isnull().sum()

Id                        0
Date                      0
Address                   0
Species                   0
Block                     0
Street                    0
Trap                      0
AddressNumberAndStreet    0
Latitude                  0
Longitude                 0
AddressAccuracy           0
dtype: int64

In [44]:
# Column dtypes and non-null counts for the training set.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10506 entries, 0 to 10505
Data columns (total 12 columns):
Date                      10506 non-null object
Address                   10506 non-null object
Species                   10506 non-null object
Block                     10506 non-null int64
Street                    10506 non-null object
Trap                      10506 non-null object
AddressNumberAndStreet    10506 non-null object
Latitude                  10506 non-null float64
Longitude                 10506 non-null float64
AddressAccuracy           10506 non-null int64
NumMosquitos              10506 non-null int64
WnvPresent                10506 non-null int64
dtypes: float64(2), int64(4), object(6)
memory usage: 985.1+ KB


In [45]:
# Column dtypes and non-null counts for the test set.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116293 entries, 0 to 116292
Data columns (total 11 columns):
Id                        116293 non-null int64
Date                      116293 non-null object
Address                   116293 non-null object
Species                   116293 non-null object
Block                     116293 non-null int64
Street                    116293 non-null object
Trap                      116293 non-null object
AddressNumberAndStreet    116293 non-null object
Latitude                  116293 non-null float64
Longitude                 116293 non-null float64
AddressAccuracy           116293 non-null int64
dtypes: float64(2), int64(3), object(6)
memory usage: 9.8+ MB


Features with wrong dtypes:
1. Date

We will convert the `Date` feature from `object` (string) to a datetime type.

In [46]:
# Parse the string dates into pandas datetimes, train first and then test.
for _frame in (train, test):
    _frame['Date'] = pd.to_datetime(_frame['Date'])

Add a `Station` column to the train and test datasets, holding the number (1 or 2) of the closer of the two stations to each trap, so that both can be merged with the weather data later on.

In [47]:
def distance_on_unit_sphere(lat1, long1, lat2, long2):
    """Return the great-circle distance between two points on a unit sphere.

    The central angle is computed with the spherical law of cosines using
    colatitude (90 - latitude) and longitude.

    Parameters
    ----------
    lat1, long1 : float
        Latitude and longitude of the first point, in degrees.
    lat2, long2 : float
        Latitude and longitude of the second point, in degrees.

    Returns
    -------
    float
        Arc length on the unit sphere, i.e. the central angle in radians
        (between 0 and pi). Multiply by a sphere's radius to get a distance
        on that sphere.
    """
    degrees_to_radians = math.pi / 180.0

    # Colatitudes: angle measured down from the north pole, in radians.
    phi1 = (90.0 - lat1) * degrees_to_radians
    phi2 = (90.0 - lat2) * degrees_to_radians

    # Longitudes in radians.
    theta1 = long1 * degrees_to_radians
    theta2 = long2 * degrees_to_radians

    cos_arc = (math.sin(phi1) * math.sin(phi2) * math.cos(theta1 - theta2)
               + math.cos(phi1) * math.cos(phi2))

    # Floating-point rounding can push the cosine a hair outside [-1, 1]
    # (typically for identical or antipodal points), which would make
    # math.acos raise ValueError. Clamp it back into the valid domain.
    cos_arc = max(-1.0, min(1.0, cos_arc))

    return math.acos(cos_arc)

# Latitude/longitude (degrees) of the two stations compared by closest_station.
station_1_lat, station_1_lon = 41.995, -87.933
station_2_lat, station_2_lon = 41.786, -87.752

def closest_station(lat, lon):
    """Return 1 or 2: the number of the station nearer to (lat, lon).

    Station 1 is chosen only when it is strictly closer, so a tie yields 2.
    """
    to_station_1 = distance_on_unit_sphere(lat, lon, station_1_lat, station_1_lon)
    to_station_2 = distance_on_unit_sphere(lat, lon, station_2_lat, station_2_lon)
    return 1 if to_station_1 < to_station_2 else 2

In [48]:
# Tag every training row with the number of its nearer station.
train['Station'] = list(map(closest_station, train.Latitude, train.Longitude))

In [49]:
# Tag every test row with the number of its nearer station.
test['Station'] = list(map(closest_station, test.Latitude, test.Longitude))

Sum the number of mosquitoes recorded for the same species in the same trap on the same day.

If any of the groups of the same species in a trap is positive, we will assume that all the other mosquitoes of that species in the same trap are positive as well.

In [51]:
# Collapse rows that agree on every descriptive field, summing the remaining
# columns; sort=False keeps the groups in first-seen order.
train = train.groupby(
    by=[
        'Date', 'Address', 'Species', 'Block', 'Street', 'Trap',
        'AddressNumberAndStreet', 'Latitude', 'Longitude',
        'AddressAccuracy', 'Station',
    ],
    sort=False,
).sum()

In [53]:
# Tally the WnvPresent values after the group-wise sum.
train['WnvPresent'].value_counts()

0     8018
1      409
2       31
3        9
4        2
7        1
6        1
5        1
10       1
9        1
8        1
Name: WnvPresent, dtype: int64

In [54]:
# Turn the summed counts back into a 0/1 flag: any value of 1 or more becomes 1.
train['WnvPresent'] = (train['WnvPresent'] >= 1).astype('int64')

In [55]:
# Sanity check: after the mapping above, WnvPresent should contain only 0 and 1.

train['WnvPresent'].value_counts()

0    8018
1     457
Name: WnvPresent, dtype: int64

In [56]:
# Move the group keys out of the index and back into ordinary columns.
train = train.reset_index()

In [57]:
# Peek at the first two rows of the aggregated training set.
train.head(2)

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,Station,NumMosquitos,WnvPresent
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,1,0
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,1,0


In [58]:
# Peek at the first two test rows, which now carry the Station column.
test.head(2)

Unnamed: 0,Id,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,Station
0,1,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1
1,2,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1


In [59]:
# Write both frames to disk without the row index, train first and then test.
for _frame, _path in (
    (train, '../datasets/cleaned_agg_train.csv'),
    (test, '../datasets/cleaned_agg_test.csv'),
):
    _frame.to_csv(_path, index=False)

In [60]:
# NOTE(review): `wx` is already loaded with these same arguments in the first
# data-loading cell; this second read looks redundant unless the file changed.
wx = pd.read_csv(
    '../datasets/cleaned_roll_wx.csv',
    parse_dates=True,
    index_col='Date',
)

In [61]:
# Show up to 100 columns when displaying wide frames. The full option name is
# used because pandas resolves abbreviated names by pattern matching, which
# can become ambiguous if a similarly named option is added later.
pd.set_option('display.max_columns', 100)

In [62]:
# Index both frames by Date so it can act as a join key against `wx`.
train = train.set_index('Date')
test = test.set_index('Date')

In [63]:
# Left join: keep every training row and attach the weather values that share
# its Date and Station.
train_wx = train.merge(
    wx,
    how='left',
    on=['Date', 'Station'],
)

In [64]:
# Left join: keep every test row and attach the weather values that share
# its Date and Station.
test_wx = test.merge(
    wx,
    how='left',
    on=['Date', 'Station'],
)

In [65]:
# Display the merged training frame (the saved output elides the middle rows).
train_wx

Unnamed: 0_level_0,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,Station,NumMosquitos,WnvPresent,Sunrise,Sunset,CodeSum,ResultDir,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,AvgSpeed
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9,1,1,0,0421,1917,BR HZ,18,75.433333,51.600000,63.733333,5.166667,45.266667,54.033333,3.866667,2.600000,0.0,0.059667,29.361333,30.085000,8.646667,10.363333
2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9,1,1,0,0421,1917,BR HZ,18,75.433333,51.600000,63.733333,5.166667,45.266667,54.033333,3.866667,2.600000,0.0,0.059667,29.361333,30.085000,8.646667,10.363333
2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,1,1,0,0421,1917,BR HZ,18,75.433333,51.600000,63.733333,5.166667,45.266667,54.033333,3.866667,2.600000,0.0,0.059667,29.361333,30.085000,8.646667,10.363333
2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX PIPIENS/RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,1,1,0,0421,1917,BR HZ,18,75.433333,51.600000,63.733333,5.166667,45.266667,54.033333,3.866667,2.600000,0.0,0.059667,29.361333,30.085000,8.646667,10.363333
2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,1,4,0,0421,1917,BR HZ,18,75.433333,51.600000,63.733333,5.166667,45.266667,54.033333,3.866667,2.600000,0.0,0.059667,29.361333,30.085000,8.646667,10.363333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2013-09-26,"5100 West 72nd Street, Chicago, IL 60638, USA",CULEX PIPIENS/RESTUANS,51,W 72ND ST,T035,"5100 W 72ND ST, Chicago, IL",41.763733,-87.742302,8,2,6,1,-,-,,9,78.966667,61.733333,70.233333,4.066667,56.400000,61.900000,1.500000,6.733333,0.0,0.076000,29.344333,29.990333,6.690000,8.233333
2013-09-26,"5800 North Ridge Avenue, Chicago, IL 60660, USA",CULEX PIPIENS/RESTUANS,58,N RIDGE AVE,T231,"5800 N RIDGE AVE, Chicago, IL",41.987280,-87.666066,8,1,5,0,0543,1742,BR,8,78.366667,59.633333,69.300000,4.066667,57.600000,62.300000,1.966667,6.266667,0.0,0.087333,29.278000,29.994333,6.726667,8.116667
2013-09-26,"1700 North Ashland Avenue, Chicago, IL 60622, USA",CULEX PIPIENS/RESTUANS,17,N ASHLAND AVE,T232,"1700 N ASHLAND AVE, Chicago, IL",41.912563,-87.668055,9,2,1,0,-,-,,9,78.966667,61.733333,70.233333,4.066667,56.400000,61.900000,1.500000,6.733333,0.0,0.076000,29.344333,29.990333,6.690000,8.233333
2013-09-26,"7100 North Harlem Avenue, Chicago, IL 60631, USA",CULEX PIPIENS/RESTUANS,71,N HARLEM AVE,T233,"7100 N HARLEM AVE, Chicago, IL",42.009876,-87.807277,9,1,5,0,0543,1742,BR,8,78.366667,59.633333,69.300000,4.066667,57.600000,62.300000,1.966667,6.266667,0.0,0.087333,29.278000,29.994333,6.726667,8.116667


In [66]:
# Display the merged test frame (the saved output elides the middle rows).
test_wx

Unnamed: 0_level_0,Id,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,Station,Sunrise,Sunset,CodeSum,ResultDir,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,AvgSpeed
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
2008-06-11,1,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9,1,0416,1926,,18,71.800000,50.8,61.566667,-1.033333,48.900000,55.200000,5.966667,2.533333,0.0,0.125333,29.156,29.875667,8.513333,10.330000
2008-06-11,2,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9,1,0416,1926,,18,71.800000,50.8,61.566667,-1.033333,48.900000,55.200000,5.966667,2.533333,0.0,0.125333,29.156,29.875667,8.513333,10.330000
2008-06-11,3,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9,1,0416,1926,,18,71.800000,50.8,61.566667,-1.033333,48.900000,55.200000,5.966667,2.533333,0.0,0.125333,29.156,29.875667,8.513333,10.330000
2008-06-11,4,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX SALINARIUS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9,1,0416,1926,,18,71.800000,50.8,61.566667,-1.033333,48.900000,55.200000,5.966667,2.533333,0.0,0.125333,29.156,29.875667,8.513333,10.330000
2008-06-11,5,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX TERRITANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9,1,0416,1926,,18,71.800000,50.8,61.566667,-1.033333,48.900000,55.200000,5.966667,2.533333,0.0,0.125333,29.156,29.875667,8.513333,10.330000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2014-10-02,116289,"2100 North Cannon Drive, Chicago, IL 60614, USA",CULEX SALINARIUS,21,N CANNON DR,T054C,"2100 N CANNON DR, Chicago, IL",41.925652,-87.633590,8,2,-,-,TSRA RA BR,17,74.433333,56.2,65.566667,0.200000,53.766667,58.733333,2.933333,3.500000,0.0,0.107333,29.419,30.074000,5.816667,7.876667
2014-10-02,116290,"2100 North Cannon Drive, Chicago, IL 60614, USA",CULEX TERRITANS,21,N CANNON DR,T054C,"2100 N CANNON DR, Chicago, IL",41.925652,-87.633590,8,2,-,-,TSRA RA BR,17,74.433333,56.2,65.566667,0.200000,53.766667,58.733333,2.933333,3.500000,0.0,0.107333,29.419,30.074000,5.816667,7.876667
2014-10-02,116291,"2100 North Cannon Drive, Chicago, IL 60614, USA",CULEX TARSALIS,21,N CANNON DR,T054C,"2100 N CANNON DR, Chicago, IL",41.925652,-87.633590,8,2,-,-,TSRA RA BR,17,74.433333,56.2,65.566667,0.200000,53.766667,58.733333,2.933333,3.500000,0.0,0.107333,29.419,30.074000,5.816667,7.876667
2014-10-02,116292,"2100 North Cannon Drive, Chicago, IL 60614, USA",UNSPECIFIED CULEX,21,N CANNON DR,T054C,"2100 N CANNON DR, Chicago, IL",41.925652,-87.633590,8,2,-,-,TSRA RA BR,17,74.433333,56.2,65.566667,0.200000,53.766667,58.733333,2.933333,3.500000,0.0,0.107333,29.419,30.074000,5.816667,7.876667


In [67]:
# Write both merged frames to disk, labelling the index column 'Date'.
for _frame, _path in (
    (train_wx, '../datasets/cleaned_agg_train_wx.csv'),
    (test_wx, '../datasets/cleaned_agg_test_wx.csv'),
):
    _frame.to_csv(_path, index_label='Date')