In [18]:
import pandas as pd
import numpy as np

In [19]:
train_data_station1   = pd.read_csv('./train_station1.csv')
train_data_station2   = pd.read_csv('./train_Station2.csv')

test_data_station1   = pd.read_csv('./test_station1.csv')
test_data_station2   = pd.read_csv('./test_Station2.csv')

weather_data = pd.read_csv('./assets/weather.csv')

In [20]:
weather_data.shape

(2944, 22)

In [21]:
#maybe impute missing M instead?
weather_data.replace('M', 0, inplace=True)
weather_data.replace('T', 0, inplace=True)
weather_data.replace(' T', 0, inplace=True)
weather_data.replace('  T', 0, inplace=True)

In [22]:
#these have almost no data
#sunset/sunrise - not sure how these can contribute

weather_data.drop(columns=['Water1', 'SnowFall', 'Depth', 'Sunrise','Sunset'], axis=1, inplace=True)

In [23]:
#changing dtypes for some variables that are viewed as strings

weather_data['SeaLevel'] = weather_data['SeaLevel'].astype(float)
weather_data['AvgSpeed'] = weather_data['AvgSpeed'].astype(float)
weather_data['PrecipTotal'] = weather_data['PrecipTotal'].astype(float)
weather_data['StnPressure'] = weather_data['StnPressure'].astype(float)
weather_data['WetBulb'] = weather_data['WetBulb'].astype(int)
weather_data['Depart'] = weather_data['Depart'].astype(int)

In [24]:
#converting some more variables from strings to integers. For some reason the above approach
#did not work for these.

new =[]
for i in weather_data['Tavg']:
    new.append(int(i))
weather_data['Tavg'] = new

new =[]
for i in weather_data['Heat']:
    new.append(int(i))
weather_data['Heat'] = new

new =[]
for i in weather_data['Cool']:
    new.append(int(i))
weather_data['Cool'] = new

In [25]:
#see if changing precip into fewer categories would make a difference

new_precip = []
for i in weather_data['PrecipTotal']:
    if i == 0:
        new_precip.append('No_Precip')
    if i>0 and i<=1:
        new_precip.append('Trace')
    if i>1:
        new_precip.append('Precip')
weather_data['PrecipCat'] = new_precip

In [26]:
# Alternative handling of these weather patterns:
# (patterns present vs not) EXCEPT all the missing values count as 0

patterns = ['SH', 'DZ', 'RA', 'TS', 'BR']
string = 'SH'  
count = []
for i in weather_data['CodeSum']:
    if any(x in i for x in patterns):
        count.append(0)
    else:
        count.append(1)
len(count)
weather_data['NewCodeSum'] = count
weather_data['NewCodeSum'].value_counts()

1    1651
0    1293
Name: NewCodeSum, dtype: int64

In [27]:
# we only care about Shower(SH), drizzle(DZ), rain(RA)
# Thunderstorm(TS), Mist(BR) = Each of these will reduce the number of mosquitoes

#I THINK THIS MAY BE ONLY PICKING OBSERVATIONS WHERE WE SEE ONE OF THESE, NOT COMBINED. ASK BRIAN IF THAT WAS
#THE INTENT (if we have SH DZ - we don't want that?)


patterns = ['SH', 'DZ', 'RA', 'TS', 'BR']

# iterate through all the CodeSum to find the columns with the patterns list
weather_data['CodeSum'] = weather_data['CodeSum'].apply(lambda x: [t for t in x.split('0') 
                                                                   if t in patterns])
# converts everything from the lists that were created in the previous code into strings
weather_data['CodeSum'] = weather_data['CodeSum'].apply(lambda x: 
                                                x if not isinstance(x, list) else x[0] if len(x) else 'None')

weather_data.CodeSum.value_counts()

None    2520
RA       296
BR       110
TS        10
DZ         8
Name: CodeSum, dtype: int64

In [28]:
weather_data.dtypes

Station          int64
Date            object
Tmax             int64
Tmin             int64
Tavg             int64
Depart           int64
DewPoint         int64
WetBulb          int64
Heat             int64
Cool             int64
CodeSum         object
PrecipTotal    float64
StnPressure    float64
SeaLevel       float64
ResultSpeed    float64
ResultDir        int64
AvgSpeed       float64
PrecipCat       object
NewCodeSum       int64
dtype: object

In [29]:
weather_data.head()

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,CodeSum,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed,PrecipCat,NewCodeSum
0,1,2007-05-01,83,50,67,14,51,56,0,2,,0.0,29.1,29.82,1.7,27,9.2,No_Precip,1
1,2,2007-05-01,84,52,68,0,51,57,0,3,,0.0,29.18,29.82,2.7,25,9.6,No_Precip,1
2,1,2007-05-02,59,42,51,-3,42,47,14,0,BR,0.0,29.38,30.09,13.0,4,13.4,No_Precip,0
3,2,2007-05-02,60,43,52,0,42,47,13,0,,0.0,29.44,30.08,13.3,2,13.4,No_Precip,0
4,1,2007-05-03,66,46,56,2,40,48,9,0,,0.0,29.39,30.12,11.7,7,11.9,No_Precip,1


In [30]:
#Heating degree days are summations of negative differences between the mean daily temperature
#and the 65°F base; cooling degree days are summations of positive differences from the same base.
#For example, cooling degree days for a station with daily mean temperatures during a seven-day 
#period of 67,65,70,74,78,65 and 68, are 2,0,5,9,13,0,and 3, for a total for the week of 32 cooling degree days.

#SO TAVG IS ACTUALLY SAME AS HEAT/COOL? Drop those.

new_cool = []
for i in weather_data['Cool']:
    new_cool.append(65+i)
len(new_cool)

2944

In [31]:
weather_data.drop(columns=['Heat','Cool'], axis=1, inplace=True)

In [32]:
#split weather by stations

station1 = weather_data[weather_data['Station']==1]
print(station1.shape)
station2 = weather_data[weather_data['Station']==2]
print(station2.shape)

(1472, 17)
(1472, 17)


In [33]:
station1.head(1)

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,CodeSum,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed,PrecipCat,NewCodeSum
0,1,2007-05-01,83,50,67,14,51,56,,0.0,29.1,29.82,1.7,27,9.2,No_Precip,1


In [34]:
test_data_station1.head(50)

Unnamed: 0,Id,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,Station
0,1,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1
1,2,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1
2,3,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1
3,4,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX SALINARIUS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1
4,5,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX TERRITANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1
5,6,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX TARSALIS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1
6,7,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",UNSPECIFIED CULEX,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1
7,8,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX ERRATICUS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1
8,9,2008-06-11,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX PIPIENS/RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,1
9,10,2008-06-11,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,1


In [35]:
#apply weather data from the stations closest to the data/test sets, merge on 'Date'

train_data_station1 = train_data_station1.merge(station1,on="Date")
train_data_station2 = train_data_station2.merge(station2,on="Date")
test_data_station1 = test_data_station1.merge(station1,on="Date")
test_data_station2 = test_data_station2.merge(station2,on="Date")

In [36]:
#combining test and train sets that belong to station 1 and 2

train_data = pd.concat([train_data_station1, train_data_station2])
test_data = pd.concat([test_data_station1, test_data_station2])

In [37]:
train_data.drop(columns=['Station_x','Station_y'], axis=1, inplace=True)
test_data.drop(columns=['Station_x','Station_y'], axis=1, inplace=True)

In [38]:
#confirming that the number of rows/observations is still the same
train_data.shape, test_data.shape

((10506, 27), (116293, 26))

In [49]:
#changing the order back to the original order
test_data = test_data.sort_values(by=['Id'])

In [50]:
test_data.tail(5)

Unnamed: 0,Id,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,...,WetBulb,CodeSum,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed,PrecipCat,NewCodeSum
80887,116289,2014-10-02,"2100 North Cannon Drive, Chicago, IL 60614, USA",CULEX SALINARIUS,21,N CANNON DR,T054C,"2100 N CANNON DR, Chicago, IL",41.925652,-87.63359,...,65,,0.72,29.1,29.78,7.2,17,7.9,Trace,0
80888,116290,2014-10-02,"2100 North Cannon Drive, Chicago, IL 60614, USA",CULEX TERRITANS,21,N CANNON DR,T054C,"2100 N CANNON DR, Chicago, IL",41.925652,-87.63359,...,65,,0.72,29.1,29.78,7.2,17,7.9,Trace,0
80889,116291,2014-10-02,"2100 North Cannon Drive, Chicago, IL 60614, USA",CULEX TARSALIS,21,N CANNON DR,T054C,"2100 N CANNON DR, Chicago, IL",41.925652,-87.63359,...,65,,0.72,29.1,29.78,7.2,17,7.9,Trace,0
80890,116292,2014-10-02,"2100 North Cannon Drive, Chicago, IL 60614, USA",UNSPECIFIED CULEX,21,N CANNON DR,T054C,"2100 N CANNON DR, Chicago, IL",41.925652,-87.63359,...,65,,0.72,29.1,29.78,7.2,17,7.9,Trace,0
80891,116293,2014-10-02,"2100 North Cannon Drive, Chicago, IL 60614, USA",CULEX ERRATICUS,21,N CANNON DR,T054C,"2100 N CANNON DR, Chicago, IL",41.925652,-87.63359,...,65,,0.72,29.1,29.78,7.2,17,7.9,Trace,0


In [40]:
train_data.to_csv('train_final.csv', index = False)
test_data.to_csv('test_final.csv', index = False)