In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.neighbors import KNeighborsClassifier


pd.set_option('display.max_columns', 50)

df = pd.read_csv('../data/train.csv')
df.head()



Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0
2,2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,1,0
3,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX PIPIENS/RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,1,0
4,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,4,0


In [2]:
df.shape

(10506, 12)

In [3]:
test = pd.read_csv('../data/test.csv')
test.head()

Unnamed: 0,Id,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy
0,1,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
1,2,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
2,3,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
3,4,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX SALINARIUS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
4,5,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX TERRITANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9


In [4]:
test.shape

(116293, 11)

In [5]:
# counting the number of repetitions per each date/trap.species combination
df.groupby(['Date', 'Trap', 'Species']).count()['Latitude'].to_dict()

{('2007-05-29', 'T002', 'CULEX PIPIENS/RESTUANS'): 1,
 ('2007-05-29', 'T002', 'CULEX RESTUANS'): 1,
 ('2007-05-29', 'T007', 'CULEX RESTUANS'): 1,
 ('2007-05-29', 'T015', 'CULEX PIPIENS/RESTUANS'): 1,
 ('2007-05-29', 'T015', 'CULEX RESTUANS'): 1,
 ('2007-05-29', 'T045', 'CULEX RESTUANS'): 1,
 ('2007-05-29', 'T046', 'CULEX RESTUANS'): 1,
 ('2007-05-29', 'T048', 'CULEX PIPIENS/RESTUANS'): 1,
 ('2007-05-29', 'T048', 'CULEX RESTUANS'): 1,
 ('2007-05-29', 'T049', 'CULEX RESTUANS'): 1,
 ('2007-05-29', 'T050', 'CULEX PIPIENS/RESTUANS'): 1,
 ('2007-05-29', 'T054', 'CULEX PIPIENS/RESTUANS'): 1,
 ('2007-05-29', 'T054', 'CULEX RESTUANS'): 1,
 ('2007-05-29', 'T086', 'CULEX PIPIENS/RESTUANS'): 1,
 ('2007-05-29', 'T086', 'CULEX RESTUANS'): 1,
 ('2007-05-29', 'T091', 'CULEX RESTUANS'): 1,
 ('2007-05-29', 'T094', 'CULEX RESTUANS'): 1,
 ('2007-05-29', 'T096', 'CULEX PIPIENS'): 1,
 ('2007-05-29', 'T096', 'CULEX RESTUANS'): 1,
 ('2007-05-29', 'T129', 'CULEX PIPIENS/RESTUANS'): 1,
 ('2007-05-29', 'T129', '

There are duplicated rows in both train and test dataset because the number of mosquitos cannot exceed 50. I defined a function to count the repeated rows so it can be applied to both train and test dataset.

In [6]:
# Calculates number of duplicated rows by Date, Trap, Species
def DuplicatedRows(df): 
    grouped = df.groupby(['Date', 'Trap', 'Species'])
    num = grouped.count()['Latitude'].to_dict()
    df['Num_Duplicates'] = 1000
    for idx in df.index:
        d = df.loc[idx, 'Date']
        t = df.loc[idx, 'Trap']
        s = df.loc[idx, 'Species']
        df.loc[idx, 'Num_Duplicates'] = num[(d, t, s)]
    return df

In [7]:
DuplicatedRows(df)

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent,Num_Duplicates
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9,1,0,1
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9,1,0,1
2,2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,1,0,1
3,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX PIPIENS/RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,1,0,1
4,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,4,0,1
5,2007-05-29,"1500 West Webster Avenue, Chicago, IL 60614, USA",CULEX RESTUANS,15,W WEBSTER AVE,T045,"1500 W WEBSTER AVE, Chicago, IL",41.921600,-87.666455,8,2,0,1
6,2007-05-29,"2500 West Grand Avenue, Chicago, IL 60654, USA",CULEX RESTUANS,25,W GRAND AVE,T046,"2500 W GRAND AVE, Chicago, IL",41.891118,-87.654491,8,1,0,1
7,2007-05-29,"1100 Roosevelt Road, Chicago, IL 60608, USA",CULEX PIPIENS/RESTUANS,11,W ROOSEVELT,T048,"1100 W ROOSEVELT, Chicago, IL",41.867108,-87.654224,8,1,0,1
8,2007-05-29,"1100 Roosevelt Road, Chicago, IL 60608, USA",CULEX RESTUANS,11,W ROOSEVELT,T048,"1100 W ROOSEVELT, Chicago, IL",41.867108,-87.654224,8,2,0,1
9,2007-05-29,"1100 West Chicago Avenue, Chicago, IL 60642, USA",CULEX RESTUANS,11,W CHICAGO,T049,"1100 W CHICAGO, Chicago, IL",41.896282,-87.655232,8,1,0,1


In [8]:
DuplicatedRows(test)

Unnamed: 0,Id,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,Num_Duplicates
0,1,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9,1
1,2,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9,1
2,3,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9,1
3,4,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX SALINARIUS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9,1
4,5,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX TERRITANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9,1
5,6,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX TARSALIS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9,1
6,7,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",UNSPECIFIED CULEX,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9,1
7,8,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX ERRATICUS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9,1
8,9,2008-06-11,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX PIPIENS/RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,1
9,10,2008-06-11,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,1


In [9]:
df.groupby(by=['Date', 'Trap', 'Species'], as_index=False)['WnvPresent'].max()

Unnamed: 0,Date,Trap,Species,WnvPresent
0,2007-05-29,T002,CULEX PIPIENS/RESTUANS,0
1,2007-05-29,T002,CULEX RESTUANS,0
2,2007-05-29,T007,CULEX RESTUANS,0
3,2007-05-29,T015,CULEX PIPIENS/RESTUANS,0
4,2007-05-29,T015,CULEX RESTUANS,0
5,2007-05-29,T045,CULEX RESTUANS,0
6,2007-05-29,T046,CULEX RESTUANS,0
7,2007-05-29,T048,CULEX PIPIENS/RESTUANS,0
8,2007-05-29,T048,CULEX RESTUANS,0
9,2007-05-29,T049,CULEX RESTUANS,0


In [10]:
grouped = df.groupby(by=['Date', 'Trap', 'Species'], as_index=False)['WnvPresent'].max() 
df.drop('WnvPresent', axis=1, inplace=True)
grouped.columns = ['Date', 'Trap', 'Species', 'WnvPresent']
df = df.merge(grouped, on=['Date', 'Trap', 'Species'], how="left") 
df.head()

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,Num_Duplicates,WnvPresent
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,1,0
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,1,0
2,2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,1,1,0
3,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX PIPIENS/RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,1,1,0
4,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,4,1,0


Adding columns with Day, Year and Month of the observation to both train and test dataset.

In [11]:
df['Year'] = pd.DatetimeIndex(df['Date']).year
df['Month'] = pd.DatetimeIndex(df['Date']).month
df['Day'] = pd.DatetimeIndex(df['Date']).day
df.head()

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,Num_Duplicates,WnvPresent,Year,Month,Day
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,1,0,2007,5,29
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,1,0,2007,5,29
2,2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,1,1,0,2007,5,29
3,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX PIPIENS/RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,1,1,0,2007,5,29
4,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,4,1,0,2007,5,29


In [12]:
test['Year'] = pd.DatetimeIndex(test['Date']).year
test['Month'] = pd.DatetimeIndex(test['Date']).month
test['Day'] = pd.DatetimeIndex(test['Date']).day
test.head()

Unnamed: 0,Id,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,Num_Duplicates,Year,Month,Day
0,1,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,2008,6,11
1,2,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,2008,6,11
2,3,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,2008,6,11
3,4,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX SALINARIUS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,2008,6,11
4,5,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX TERRITANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,2008,6,11


Creating dummy columns for the Species for both train and test dataset.

In [13]:
dummies=pd.get_dummies(df['Species'])
df = pd.concat([df, dummies], axis=1)
df.head()

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,Num_Duplicates,WnvPresent,Year,Month,Day,CULEX ERRATICUS,CULEX PIPIENS,CULEX PIPIENS/RESTUANS,CULEX RESTUANS,CULEX SALINARIUS,CULEX TARSALIS,CULEX TERRITANS
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,1,0,2007,5,29,0,0,1,0,0,0,0
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,1,0,2007,5,29,0,0,0,1,0,0,0
2,2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,1,1,0,2007,5,29,0,0,0,1,0,0,0
3,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX PIPIENS/RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,1,1,0,2007,5,29,0,0,1,0,0,0,0
4,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,4,1,0,2007,5,29,0,0,0,1,0,0,0


In [14]:
dummies=pd.get_dummies(test['Species'])
test = pd.concat([test, dummies], axis=1)
test.head()

Unnamed: 0,Id,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,Num_Duplicates,Year,Month,Day,CULEX ERRATICUS,CULEX PIPIENS,CULEX PIPIENS/RESTUANS,CULEX RESTUANS,CULEX SALINARIUS,CULEX TARSALIS,CULEX TERRITANS,UNSPECIFIED CULEX
0,1,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,2008,6,11,0,0,1,0,0,0,0,0
1,2,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,2008,6,11,0,0,0,1,0,0,0,0
2,3,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,2008,6,11,0,1,0,0,0,0,0,0
3,4,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX SALINARIUS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,2008,6,11,0,0,0,0,1,0,0,0
4,5,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX TERRITANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,2008,6,11,0,0,0,0,0,0,1,0


Defining a function to calculate Haversine distance between two points.

In [15]:
def distance(lat1,lon1,lat2,lon2):
    
    # transforming longitude and latitude to radians
    lat1 = np.radians(lat1)
    lon1 = np.radians(lon1)
    lat2 = np.radians(lat2)
    lon2 = np.radians(lon2)
    
    # latitude and longitude difference
    dlat = lat2-lat1
    dlon = lon2-lon1
    
    # radius of the Earth in km
    R = 6373 
    
    # multipliers 
    a = np.power(np.sin(dlat), 2) + np.multiply(np.cos(lat1), np.multiply(np.cos(lat2), np.power(np.sin(dlon/2), 2)))
    c = 2*np.arcsin(np.sqrt(a))
    return R * c

Appending to each observation (in both train and test dataset), the number of the closest station.

In [16]:
df['lat1'] = 41.995   # latitude of 1st station
df['lat2'] = 41.786   # latitude of 2nd station
df['lon1'] = -87.933  # longitude of 1st station
df['lon2'] = -87.752  # longitude of 2nd station

# calculate distance between each observation and the two stations
df['dist1'] = distance(df['Latitude'], df['Longitude'], df['lat1'], df['lon1']) 
df['dist2'] = distance(df['Latitude'], df['Longitude'], df['lat2'], df['lon2'])

# is first station the closest
st_1_closest = np.less_equal(df['dist1'], df['dist2'])

# vector of 1s (station 1 closest by default)
st = np.ones(df.shape[0])

# change values of the vector to 2 where station 2 is closer
st[st_1_closest == False] = 2

# adding column with number of the closest station
df['Station'] = st

# dropping columns I don't need anymore
df.drop(columns = ['dist1', 'dist2', 'lat1', 'lat2', 'lon1', 'lon2' ], inplace=True)

In [17]:
test['lat1'] = 41.995   # latitude of 1st station
test['lat2'] = 41.786   # latitude of 2nd station
test['lon1'] = -87.933  # longitude of 1st station
test['lon2'] = -87.752  # longitude of 2nd station

# calculate distance between each observation and the two stations
test['dist1'] = distance(test['Latitude'], test['Longitude'], test['lat1'], test['lon1']) 
test['dist2'] = distance(test['Latitude'], test['Longitude'], test['lat2'], test['lon2'])

# is first station the closest
st_1_closest = np.less_equal(test['dist1'], test['dist2'])

# vector of 1s (station 1 closest by default)
st = np.ones(test.shape[0])

# change values of the vector to 2 where station 2 is closer
st[st_1_closest == False] = 2

# adding column with number of the closest station
test['Station'] = st

# dropping columns I don't need anymore
test.drop(columns = ['dist1', 'dist2', 'lat1', 'lat2', 'lon1', 'lon2'], inplace=True)

In [18]:
test.head()

Unnamed: 0,Id,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,Num_Duplicates,Year,Month,Day,CULEX ERRATICUS,CULEX PIPIENS,CULEX PIPIENS/RESTUANS,CULEX RESTUANS,CULEX SALINARIUS,CULEX TARSALIS,CULEX TERRITANS,UNSPECIFIED CULEX,Station
0,1,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,2008,6,11,0,0,1,0,0,0,0,0,1.0
1,2,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,2008,6,11,0,0,0,1,0,0,0,0,1.0
2,3,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,2008,6,11,0,1,0,0,0,0,0,0,1.0
3,4,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX SALINARIUS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,2008,6,11,0,0,0,0,1,0,0,0,1.0
4,5,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX TERRITANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,2008,6,11,0,0,0,0,0,0,1,0,1.0


In [19]:
df.head()

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,Num_Duplicates,WnvPresent,Year,Month,Day,CULEX ERRATICUS,CULEX PIPIENS,CULEX PIPIENS/RESTUANS,CULEX RESTUANS,CULEX SALINARIUS,CULEX TARSALIS,CULEX TERRITANS,Station
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,1,0,2007,5,29,0,0,1,0,0,0,0,1.0
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,1,0,2007,5,29,0,0,0,1,0,0,0,1.0
2,2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,1,1,0,2007,5,29,0,0,0,1,0,0,0,1.0
3,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX PIPIENS/RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,1,1,0,2007,5,29,0,0,1,0,0,0,0,1.0
4,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,4,1,0,2007,5,29,0,0,0,1,0,0,0,1.0


Reading and cleaning the weather dataset.

In [20]:
weather = pd.read_csv('../data/weather.csv')

# replace "Trace" with 0.001, replace M and missing with NaN
weather.replace(['  T','M','-'], [0.001, np.nan, np.nan], inplace=True) 

# replace missing WetBulb of 1st station with the value of 2nd station
weather['WetBulb'].fillna(method='bfill', inplace=True)

# fill the rest of the null values
weather.fillna(method='pad', inplace=True)
weather.head()

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,Sunrise,Sunset,CodeSum,Depth,Water1,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
0,1,2007-05-01,83,50,67,14,51,56,0,2,448,1849,,0,,0.0,0.0,29.1,29.82,1.7,27,9.2
1,2,2007-05-01,84,52,68,14,51,57,0,3,448,1849,,0,,0.0,0.0,29.18,29.82,2.7,25,9.6
2,1,2007-05-02,59,42,51,-3,42,47,14,0,447,1850,BR,0,,0.0,0.0,29.38,30.09,13.0,4,13.4
3,2,2007-05-02,60,43,52,-3,42,47,13,0,447,1850,BR HZ,0,,0.0,0.0,29.44,30.08,13.3,2,13.4
4,1,2007-05-03,66,46,56,2,40,48,9,0,446,1851,,0,,0.0,0.0,29.39,30.12,11.7,7,11.9


Merging train and test dataset with the weather data, based on Date and Station.

In [21]:
df = df.merge(weather, on=['Date', 'Station'], how="left",  left_index=True)
df.head()

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,Num_Duplicates,WnvPresent,Year,Month,Day,CULEX ERRATICUS,CULEX PIPIENS,CULEX PIPIENS/RESTUANS,CULEX RESTUANS,CULEX SALINARIUS,CULEX TARSALIS,CULEX TERRITANS,Station,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,Sunrise,Sunset,CodeSum,Depth,Water1,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
56,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,1,0,2007,5,29,0,0,1,0,0,0,0,1.0,88,60,74,10,58,65,0,9,421,1917,BR HZ,0,,0.0,0.0,29.39,30.11,5.8,18,6.5
56,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,1,0,2007,5,29,0,0,0,1,0,0,0,1.0,88,60,74,10,58,65,0,9,421,1917,BR HZ,0,,0.0,0.0,29.39,30.11,5.8,18,6.5
56,2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,1,1,0,2007,5,29,0,0,0,1,0,0,0,1.0,88,60,74,10,58,65,0,9,421,1917,BR HZ,0,,0.0,0.0,29.39,30.11,5.8,18,6.5
56,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX PIPIENS/RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,1,1,0,2007,5,29,0,0,1,0,0,0,0,1.0,88,60,74,10,58,65,0,9,421,1917,BR HZ,0,,0.0,0.0,29.39,30.11,5.8,18,6.5
56,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,4,1,0,2007,5,29,0,0,0,1,0,0,0,1.0,88,60,74,10,58,65,0,9,421,1917,BR HZ,0,,0.0,0.0,29.39,30.11,5.8,18,6.5


In [22]:
test = test.merge(weather, on=['Date', 'Station'], how="left",  left_index=True)
test.head()

Unnamed: 0,Id,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,Num_Duplicates,Year,Month,Day,CULEX ERRATICUS,CULEX PIPIENS,CULEX PIPIENS/RESTUANS,CULEX RESTUANS,CULEX SALINARIUS,CULEX TARSALIS,CULEX TERRITANS,UNSPECIFIED CULEX,Station,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,Sunrise,Sunset,CodeSum,Depth,Water1,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
450,1,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,2008,6,11,0,0,1,0,0,0,0,0,1.0,86,61,74,7,56,64,0,9,416,1926,,0,,0.0,0.0,29.28,29.99,8.9,18,10.0
450,2,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,2008,6,11,0,0,0,1,0,0,0,0,1.0,86,61,74,7,56,64,0,9,416,1926,,0,,0.0,0.0,29.28,29.99,8.9,18,10.0
450,3,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,2008,6,11,0,1,0,0,0,0,0,0,1.0,86,61,74,7,56,64,0,9,416,1926,,0,,0.0,0.0,29.28,29.99,8.9,18,10.0
450,4,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX SALINARIUS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,2008,6,11,0,0,0,0,1,0,0,0,1.0,86,61,74,7,56,64,0,9,416,1926,,0,,0.0,0.0,29.28,29.99,8.9,18,10.0
450,5,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX TERRITANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,2008,6,11,0,0,0,0,0,0,1,0,1.0,86,61,74,7,56,64,0,9,416,1926,,0,,0.0,0.0,29.28,29.99,8.9,18,10.0


Drop unnecessary columns from test and train datasets, then save to .csv files.

In [23]:
df.drop(['Species','AddressNumberAndStreet','Address','Block','Street','Trap','AddressAccuracy','Station',
        'Latitude','Longitude','Heat', 'CodeSum', 'Depth', 'Water1', 'SnowFall', 'StnPressure',
        'SeaLevel', 'AvgSpeed','NumMosquitos'], axis = 1, inplace = True)
df['Date'] = pd.to_datetime(df['Date'])
df.set_index('Date', inplace = True)
df.head()

Unnamed: 0_level_0,Num_Duplicates,WnvPresent,Year,Month,Day,CULEX ERRATICUS,CULEX PIPIENS,CULEX PIPIENS/RESTUANS,CULEX RESTUANS,CULEX SALINARIUS,CULEX TARSALIS,CULEX TERRITANS,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Cool,Sunrise,Sunset,PrecipTotal,ResultSpeed,ResultDir
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
2007-05-29,1,0,2007,5,29,0,0,1,0,0,0,0,88,60,74,10,58,65,9,421,1917,0.0,5.8,18
2007-05-29,1,0,2007,5,29,0,0,0,1,0,0,0,88,60,74,10,58,65,9,421,1917,0.0,5.8,18
2007-05-29,1,0,2007,5,29,0,0,0,1,0,0,0,88,60,74,10,58,65,9,421,1917,0.0,5.8,18
2007-05-29,1,0,2007,5,29,0,0,1,0,0,0,0,88,60,74,10,58,65,9,421,1917,0.0,5.8,18
2007-05-29,1,0,2007,5,29,0,0,0,1,0,0,0,88,60,74,10,58,65,9,421,1917,0.0,5.8,18


In [25]:
df.dtypes

Num_Duplicates              int64
WnvPresent                  int64
Year                        int64
Month                       int64
Day                         int64
CULEX ERRATICUS             uint8
CULEX PIPIENS               uint8
CULEX PIPIENS/RESTUANS      uint8
CULEX RESTUANS              uint8
CULEX SALINARIUS            uint8
CULEX TARSALIS              uint8
CULEX TERRITANS             uint8
Tmax                        int64
Tmin                        int64
Tavg                       object
Depart                     object
DewPoint                    int64
WetBulb                    object
Cool                       object
Sunrise                    object
Sunset                     object
PrecipTotal                object
ResultSpeed               float64
ResultDir                   int64
dtype: object

In [26]:
test.drop(['Id','Species','AddressNumberAndStreet','Address','Block','Street','Trap','AddressAccuracy','Station',
        'Latitude','Longitude','Heat', 'CodeSum', 'Depth', 'Water1', 'SnowFall', 'StnPressure',
        'SeaLevel', 'AvgSpeed'], axis = 1, inplace = True)
test['Date'] = pd.to_datetime(test['Date'])
test.set_index('Date', inplace = True)
test.head()

Unnamed: 0_level_0,Num_Duplicates,Year,Month,Day,CULEX ERRATICUS,CULEX PIPIENS,CULEX PIPIENS/RESTUANS,CULEX RESTUANS,CULEX SALINARIUS,CULEX TARSALIS,CULEX TERRITANS,UNSPECIFIED CULEX,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Cool,Sunrise,Sunset,PrecipTotal,ResultSpeed,ResultDir
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
2008-06-11,1,2008,6,11,0,0,1,0,0,0,0,0,86,61,74,7,56,64,9,416,1926,0.0,8.9,18
2008-06-11,1,2008,6,11,0,0,0,1,0,0,0,0,86,61,74,7,56,64,9,416,1926,0.0,8.9,18
2008-06-11,1,2008,6,11,0,1,0,0,0,0,0,0,86,61,74,7,56,64,9,416,1926,0.0,8.9,18
2008-06-11,1,2008,6,11,0,0,0,0,1,0,0,0,86,61,74,7,56,64,9,416,1926,0.0,8.9,18
2008-06-11,1,2008,6,11,0,0,0,0,0,0,1,0,86,61,74,7,56,64,9,416,1926,0.0,8.9,18
